diff cpack/dml/lib/crawler.pl @ 0:718306e29690 (tip)

summary:  committing public release
author:   Daniel Wolff
date:     Tue, 09 Feb 2016 21:05:06 +0100
parents:  (none)
children: (none)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpack/dml/lib/crawler.pl	Tue Feb 09 21:05:06 2016 +0100
@@ -0,0 +1,223 @@

/* Part of DML (Digital Music Laboratory)
   Copyright 2014-2015 Samer Abdallah, University of London

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License
   as published by the Free Software Foundation; either version 2
   of the License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public
   License along with this library; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/

:- module(crawler,
    [ crawl_loaded/2
    , crawl_ui//2
    , crawl/2
    , crawl/1
    , crawl_reload_graph/1
    ]).

:- multifile authority/3.
:- multifile source/3.

:- use_module(library(settings)).
:- use_module(library(http/http_dispatch)).
:- use_module(library(http/http_parameters)).
:- use_module(library(http/http_wrapper)).
:- use_module(library(http/html_write)).
:- use_module(library(semweb/rdf_db)).
:- use_module(library(prov_schema)).
:- use_module(library(httpfiles)).
:- use_module(library(htmlutils), [element//2]).
:- use_module(library(dcg_core)).
:- use_module(library(insist)).
:- use_module(components(basics)).
:- use_module(components(messages)).
:- use_module(user(user_db)).

:- setting(enable_auto_crawl, boolean, true,
           "Enables automatic semantic web searches for recognised URIs").

:- http_handler(root(crawl), crawl_handler, []).

no_cache -->
    html_post(head, meta(['http-equiv'='Cache-Control',
                          content='no-cache, no-store, must-revalidate, max-age=0'], [])),
    html_post(head, meta(['http-equiv'='Pragma', content='no-cache'], [])),
    html_post(head, meta(['http-equiv'='Expires', content=0], [])).

refresh(URL, Delay) -->
    no_cache,
    html_post(head, meta(['http-equiv'=refresh, content="~d;~w"-[Delay,URL]], [])).

%% crawl_handler(+Request) is det.
%  Web service to query a given authority about a given URI.
crawl_handler(Request) :-
    authorized(write(default, load(lod))),
    http_parameters(Request,
        [ uri(URI, [optional(false), description("URI to search for")])
        , src(Src, [optional(false), description("Source ID")])
        , return_to(Return, [optional(true), description('URI to return to')])
        , return_after(Delay, [default(2)])
        , messages(Msgs, [boolean, default(true)])
        ]),
    debug(crawler, "Got request to consult ~w on ~w", [Src,URI]),
    return_options(Return, Delay, Options),
    (   Msgs=true
    ->  call_showing_messages(insist(crawl(URI,Src)), Options)
    ;   call_without_messages(insist(crawl(URI,Src)), Options)
    ).

call_without_messages(Goal, Options) :-
    catch( (Goal, Msg='Success'), Ex,
           (Msg='Error', print_message(error,Ex)) ),
    (   option(return_to(Return), Options)
    ->  option(return_after(Delay), Options, 2),
        reply_html_page(cliopatria(default), [],
                        [ h2(Msg), \refresh(Return,Delay) ], [unstable])
    ;   reply_html_page(cliopatria(default), [], [ h2(Msg) ], [unstable])
    ).

return_options(Return, _, []) :- var(Return), !.
return_options(Return, Delay, [ return_to(Return), return_after(Delay) ]).
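% The multifile hooks authority/3 and source/3 (declared above) are supplied
% by other modules. A minimal sketch of what such declarations might look
% like follows; the source ID dbtune and handler dbtune_handler are
% hypothetical, purely for illustration:
%
%   :- multifile crawler:authority/3, crawler:source/3.
%
%   % Claim authority over URIs with this prefix; auto(true) lets
%   % crawl_ui//2 consult the source automatically.
%   crawler:authority(begins('http://dbtune.org/'), dbtune, [auto(true)]).
%
%   % Map the source ID to its handler goal; messages(false) makes the
%   % consult form skip the message-showing reply page.
%   crawler:source(dbtune, dbtune_handler, [messages(false)]).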
%% crawl_ui(+URI:resource, +NT:natural)// is det.
%  Component to allow the user to trigger a crawl on a given URI, depending
%  on NT, the number of triples which currently have it as subject.
crawl_ui(URI, NT) -->
    (   {setof(S-Auto, uri_authority(URI,S,Auto), Sources)}
    ->  (   {setof(S-A, (member(S-A,Sources), \+crawl_loaded(URI,S)), Untapped)}
            % !!! should check permissions here
        ->  { http_current_request(Request),
              memberchk(request_uri(Here), Request)
            },
            {debug(crawler, 'Untapped sources: ~q', [Untapped])},
            no_cache,
            (   {setting(enable_auto_crawl, true)},
                {setof(S, member(S-true, Untapped), AutoSources)}
            ->  html([ 'The following sources have been consulted automatically: ',
                       \seqmap_with_sep(html(', '), element(code), AutoSources),
                       '.', br([])
                     ]),
                { debug(crawler, 'consulting in parallel: ~w...', [AutoSources]),
                  concurrent_maplist(consult_source(URI), AutoSources, Statuses),
                  debug(crawler, 'finished consulting on ~w.', [URI])
                },
                (   {member(ok, Statuses)}
                ->  refresh(Here, 0),
                    html([b('At least one consultation succeeded; refreshing automatically.'), br([])])
                ;   html([b('All consultations failed.'), br([])])
                )
            ;   html(p([ 'Click to consult one of the following sources for more information.'
                       , br([]), \seqmap(consult_form(URI,Here), Untapped)
                       ]))
            )
        ;   []
        )
    ;   {NT>0} -> []
    ;   html(p('No triples and no authorities known for this URI.'))
    ).

consult_source(URI, Source, Status) :-
    catch( (crawl(URI,Source), Status=ok), Ex,
           (print_message(error,Ex), Status=error(Ex)) ).

consult_form(URI, Here, Source-_) -->
    {http_link_to_id(crawl_handler, [], FetchURL)},
    {source_name(Source, Name)},
    {source(Source, _, Opts), option(messages(Msgs), Opts, true)},
    html(form([ style="display:inline-block", action(FetchURL) ],
        [ \hidden(uri, URI),
          \hidden(src, Source),
          \hidden(return_to, Here),
          \hidden(messages, Msgs),
          input([ type(submit), value(Name) ])
        ])).

%% crawl_loaded(+URI:resource, +Source:atom) is semidet.
%  True when URI has already been crawled and added to the RDF database.
crawl_loaded(URI, Source) :-
    source_uri_graph(Source, URI, Graph),
    debug(crawler, 'Checking if loaded: ~w in graph ~w from ~w', [URI,Graph,Source]),
    rdf(Graph, prov:wasDerivedFrom, URI, Graph).

%% crawl_reload_graph(+Graph:atom) is det.
%  Attempts to delete the named graph and reload it from all the URIs
%  that it was derived from.
crawl_reload_graph(Graph) :-
    findall(URI, rdf(Graph, prov:wasDerivedFrom, URI, Graph), URIs),
    length(URIs, NURIs),
    print_message(information, crawl_reload(Graph,NURIs)),
    rdf_transaction((
        rdf_unload_graph(Graph),
        maplist(reload_into(Graph), URIs)
    )).

reload_into(Graph, URI) :-
    uri_authority(URI, Source, _),
    source_uri_graph(Source, URI, Graph),
    load_into(Graph, Source, URI).

%% crawl(+URI:resource) is det.
%  Looks for information about URI on all authorities claiming authority
%  on it. Queries are made in parallel using concurrent_maplist/3.
crawl(URI) :-
    findall(S, uri_authority(URI,S,_), Sources),
    concurrent_maplist(consult_source(URI), Sources, _).

%% crawl(+URI:resource, +Auth:atom) is det.
%  Looks for information about URI on the specified authority.
crawl(URI, Source) :-
    debug(crawler, "Consulting source ~w on ~w...", [Source,URI]),
    source_uri_graph(Source, URI, Graph),
    rdf_transaction(load_into(Graph,Source,URI)).
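% A usage sketch from the Prolog toplevel, assuming the hypothetical dbtune
% source above is registered. crawl/2 runs load_into/3 inside a single RDF
% transaction, so a failed load leaves the store unchanged; crawl/1 consults
% every matching authority in parallel via concurrent_maplist/3.
%
%   ?- crawl('http://dbtune.org/artist/42', dbtune).
%   ?- crawl('http://dbtune.org/artist/42').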
load_into(Graph, Source, URI) :-
    source(Source, Handler, _),
    (   call(Handler, crawl(URI,Graph))
    ->  print_message(information, crawler_crawled(Source,URI,Graph))
    ;   setof(T, call(Handler, describe(URI,T)), Triples),
        length(Triples, NumTriples),
        forall(member(rdf(S,P,O), Triples), rdf_assert(S,P,O,Graph))
    ->  print_message(information, crawler_described(Source,URI,Graph,NumTriples))
    ;   print_message(information, crawler_failed(Source,URI))
    ),
    debug(crawler, 'Asserting successful load of ~w into graph ~w from ~w', [URI,Graph,Source]),
    rdf_assert(Graph, prov:wasDerivedFrom, URI, Graph).

%% uri_authority(+URI:resource, -Auth:atom, -Auto:boolean) is nondet.
%  True when Auth is declared as an authority on URI. If Auto=true,
%  then this authority should be queried automatically when URI is the
%  subject of a 'list_resource' page.
uri_authority(URI, Src, Auto) :-
    authority(Cond, Src, Opts),
    once(matches(Cond, URI)),
    option(auto(Auto), Opts, false).

matches(begins(Prefix), URI) :- sub_atom(URI, 0, _, _, Prefix).
matches(\+Cond, URI)  :- \+matches(Cond, URI).
matches((C1,C2), URI) :- matches(C1, URI), matches(C2, URI).
matches((C1;C2), URI) :- matches(C1, URI); matches(C2, URI).

%% source_uri_graph(+Auth:atom, +URI:resource, -Graph:atom) is det.
%  Derive the RDF graph name for a given URI under a given authority.
source_uri_graph(Source, URI, Graph) :-
    source(Source, Handler, _),
    call(Handler, uri_graph(URI,Graph)).

source_name(Source, Name) :- source(Source, Handler, _), call(Handler, name(Name)), !.
source_name(Source, Source).

prolog:message(crawler_crawled(Src,URI,Graph)) -->
    ["Triples from ~w about ~w loaded into ~w via crawl method"-[Src,URI,Graph]].
prolog:message(crawler_described(Src,URI,Graph,N)) -->
    ["Triples from ~w about ~w loaded into ~w via describe method: ~d."-[Src,URI,Graph,N]].
prolog:message(crawler_failed(Src,URI)) -->
    ["Failed to load anything from ~w about ~w."-[Src,URI]].
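% From the calls above, a handler registered via source/3 is any goal H for
% which call(H, Op) services the operations name/1 and uri_graph/2, plus
% either crawl/2 (assert triples itself) or describe/2 (enumerate rdf(S,P,O)
% terms for load_into/3 to assert). A minimal hypothetical handler,
% continuing the illustrative dbtune example:
%
%   dbtune_handler(name('DBTune')).
%   dbtune_handler(uri_graph(_URI, dbtune_graph)).   % one graph per source
%   dbtune_handler(describe(URI, rdf(URI, P, O))) :-
%       % placeholder triple standing in for a real endpoint query
%       member(P-O, ['http://example.org/seen'-literal(true)]).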