Daniel@0: /* Part of DML (Digital Music Laboratory) Daniel@0: Copyright 2014-2015 Samer Abdallah, University of London Daniel@0: Daniel@0: This program is free software; you can redistribute it and/or Daniel@0: modify it under the terms of the GNU General Public License Daniel@0: as published by the Free Software Foundation; either version 2 Daniel@0: of the License, or (at your option) any later version. Daniel@0: Daniel@0: This program is distributed in the hope that it will be useful, Daniel@0: but WITHOUT ANY WARRANTY; without even the implied warranty of Daniel@0: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Daniel@0: GNU General Public License for more details. Daniel@0: Daniel@0: You should have received a copy of the GNU General Public Daniel@0: License along with this library; if not, write to the Free Software Daniel@0: Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Daniel@0: */ Daniel@0: Daniel@0: :- module(crawler, Daniel@0: [ crawl_loaded/2 Daniel@0: , crawl_ui//2 Daniel@0: , crawl/2 Daniel@0: , crawl/1 Daniel@0: , crawl_reload_graph/1 Daniel@0: ]). Daniel@0: Daniel@0: :- multifile authority/3. Daniel@0: :- multifile source/3. Daniel@0: Daniel@0: :- use_module(library(settings)). Daniel@0: :- use_module(library(http/http_dispatch)). Daniel@0: :- use_module(library(http/http_parameters)). Daniel@0: :- use_module(library(http/http_wrapper)). Daniel@0: :- use_module(library(http/html_write)). Daniel@0: :- use_module(library(semweb/rdf_db)). Daniel@0: :- use_module(library(prov_schema)). Daniel@0: :- use_module(library(httpfiles)). Daniel@0: :- use_module(library(htmlutils), [element//2]). Daniel@0: :- use_module(library(dcg_core)). Daniel@0: :- use_module(library(insist)). Daniel@0: :- use_module(components(basics)). Daniel@0: :- use_module(components(messages)). Daniel@0: :- use_module(user(user_db)). Daniel@0: Daniel@0: :- setting(enable_auto_crawl, boolean, true, "Enables automatic semantic web searches for recognised URIs"). Daniel@0: Daniel@0: :- http_handler(root(crawl), crawl_handler, []). Daniel@0: Daniel@0: no_cache --> Daniel@0: html_post(head,meta(['http-equiv'='Cache-Control', content='no-cache, no-store, must-revalidate, max-age=0'],[])), Daniel@0: html_post(head,meta(['http-equiv'='Pragma', content='no-cache'],[])), Daniel@0: html_post(head,meta(['http-equiv'='Expires', content=0],[])). Daniel@0: Daniel@0: refresh(URL,Delay) --> Daniel@0: no_cache, Daniel@0: html_post(head,meta(['http-equiv'=refresh, content="~d;~w"-[Delay,URL]],[])). Daniel@0: Daniel@0: %% crawl_handler(+Request) is det. Daniel@0: % web service to query given authority about given URI Daniel@0: crawl_handler(Request) :- Daniel@0: authorized(write(default, load(lod))), Daniel@0: http_parameters(Request, Daniel@0: [ uri(URI, [optional(false), description("URI to search for")]) Daniel@0: , src(Src, [optional(false), description("Source ID")]) Daniel@0: , return_to(Return, [ optional(true), description('URI to return to') ]) Daniel@0: , return_after(Delay, [ default(2) ]) Daniel@0: , messages(Msgs, [boolean, default(true)]) Daniel@0: ]), Daniel@0: debug(crawler,"Got request to consult ~w on ~w",[Src,URI]), Daniel@0: return_options(Return, Delay, Options), Daniel@0: ( Msgs=true Daniel@0: -> call_showing_messages(insist(crawl(URI,Src)),Options) Daniel@0: ; call_without_messages(insist(crawl(URI,Src)),Options) Daniel@0: ). Daniel@0: Daniel@0: call_without_messages(Goal,Options) :- Daniel@0: catch( (Goal, Msg='Success'), Ex, (Msg='Error', print_message(error,Ex))), Daniel@0: ( option(return_to(Return),Options) Daniel@0: -> option(return_after(Delay),Options,2), Daniel@0: reply_html_page(cliopatria(default),[], [ h2(Msg), \refresh(Return,Delay) ], [unstable]) Daniel@0: ; reply_html_page(cliopatria(default),[], [ h2(Msg) ], [unstable]) Daniel@0: ). Daniel@0: Daniel@0: return_options(Return, _, []) :- var(Return), !. Daniel@0: return_options(Return, Delay, [ return_to(Return), return_after(Delay) ]). Daniel@0: Daniel@0: Daniel@0: %% crawl_ui(+URI:resource,+NT:natural) is det. Daniel@0: % Component to allow user to trigger a crawl on a given URI, depending on Daniel@0: % NT, the number of triples which currently have it has subject. Daniel@0: crawl_ui(URI,NT) --> Daniel@0: ( {setof(S-Auto, uri_authority(URI,S,Auto),Sources)} Daniel@0: -> ( {setof(S-A, (member(S-A,Sources), \+crawl_loaded(URI,S)), Untapped)} Daniel@0: % !!! should check permissions here Daniel@0: -> { http_current_request(Request), Daniel@0: memberchk(request_uri(Here), Request) Daniel@0: }, Daniel@0: {debug(crawler,'Untapped sources: ~q',[Untapped])}, Daniel@0: no_cache, Daniel@0: ( {setting(enable_auto_crawl,true)}, Daniel@0: {setof(S, member(S-true,Untapped), AutoSources)} Daniel@0: -> html([ 'The following sources have been consulted automatically: ', Daniel@0: \seqmap_with_sep(html(', '),element(code),AutoSources), '.', br([]) Daniel@0: ]), Daniel@0: { debug(crawler,'consulting in parallel: ~w...',[AutoSources]), Daniel@0: concurrent_maplist(consult_source(URI),AutoSources,Statuses), Daniel@0: debug(crawler,'finished consulting on ~w.',[URI]) Daniel@0: }, Daniel@0: ( {member(ok,Statuses)} Daniel@0: -> refresh(Here,0), Daniel@0: html([b('At least one consultation succeeded; refreshing automatically.'),br([])]) Daniel@0: ; html([b('All consultations failed.'),br([])]) Daniel@0: ) Daniel@0: ; html(p([ 'Click to consult one of the following sources for more information.' Daniel@0: , br([]), \seqmap(consult_form(URI,Here),Untapped) Daniel@0: ])) Daniel@0: ) Daniel@0: ; [] Daniel@0: ) Daniel@0: ; {NT>0} -> [] Daniel@0: ; html(p('No triples and no authorities known for this URI.')) Daniel@0: ). Daniel@0: Daniel@0: consult_source(URI,Source,Status) :- Daniel@0: catch( (crawl(URI,Source), Status=ok), Ex, Daniel@0: (print_message(error,Ex), Status=error(Ex))). Daniel@0: Daniel@0: consult_form(URI,Here,Source-_) --> Daniel@0: {http_link_to_id(crawl_handler, [], FetchURL)}, Daniel@0: {source_name(Source,Name)}, Daniel@0: {source(Source,_,Opts), option(messages(Msgs),Opts,true)}, Daniel@0: html(form([style="display:inline-block", action(FetchURL)], Daniel@0: [ \hidden(uri, URI), Daniel@0: \hidden(src, Source), Daniel@0: \hidden(return_to, Here), Daniel@0: \hidden(messages, Msgs), Daniel@0: input([ type(submit), value(Name) ]) Daniel@0: ])). Daniel@0: Daniel@0: %% crawl_loaded(+URI:resource,+Source:atom) is semidet. Daniel@0: % True when URI has already been crawled and added to the RDF database. Daniel@0: crawl_loaded(URI,Source) :- Daniel@0: source_uri_graph(Source,URI,Graph), Daniel@0: debug(crawler,'Checking if loaded: ~w in graph ~w from ~w',[URI,Graph,Source]), Daniel@0: rdf(Graph,prov:wasDerivedFrom,URI,Graph). Daniel@0: Daniel@0: Daniel@0: %% crawl_reload_graph(+Graph:atom) is det. Daniel@0: % Attempts to delete the named graph and reload it from all the URIs Daniel@0: % that it was derived from. Daniel@0: crawl_reload_graph(Graph) :- Daniel@0: findall(URI,rdf(Graph,prov:wasDerivedFrom,URI,Graph),URIs), Daniel@0: length(URIs,NURIs), Daniel@0: print_message(information,crawl_reload(Graph,NURIs)), Daniel@0: rdf_transaction(( Daniel@0: rdf_unload_graph(Graph), Daniel@0: maplist(reload_into(Graph),URIs) Daniel@0: )). Daniel@0: Daniel@0: reload_into(Graph,URI) :- Daniel@0: uri_authority(URI,Source,_), Daniel@0: source_uri_graph(Source,URI,Graph), Daniel@0: load_into(Graph,Source,URI). Daniel@0: Daniel@0: %% crawl(+URI:resource) is det. Daniel@0: % Looks for information about URI on all authorities claiming authority on it. Daniel@0: % Queries are made in parallel using concurrent_maplist/2. Daniel@0: crawl(URI) :- Daniel@0: findall(S,uri_authority(URI,S,_),Sources), Daniel@0: concurrent_maplist(consult_source(URI),Sources,_). Daniel@0: Daniel@0: Daniel@0: %% crawl(+URI:resource,+Auth:atom) is det. Daniel@0: % Looks for information about URI on specified authority. Daniel@0: crawl(URI,Source) :- Daniel@0: debug(crawler,"Consulting source ~w on ~w...",[Source,URI]), Daniel@0: source_uri_graph(Source, URI, Graph), Daniel@0: rdf_transaction(load_into(Graph,Source,URI)). Daniel@0: Daniel@0: load_into(Graph,Source,URI) :- Daniel@0: source(Source, Handler, _), Daniel@0: ( call(Handler,crawl(URI,Graph)) Daniel@0: -> print_message(information,crawler_crawled(Source,URI,Graph)) Daniel@0: ; setof(T, call(Handler,describe(URI,T)), Triples), Daniel@0: length(Triples, NumTriples), Daniel@0: forall(member(rdf(S,P,O),Triples), rdf_assert(S,P,O,Graph)) Daniel@0: -> print_message(information,crawler_described(Source,URI,Graph,NumTriples)) Daniel@0: ; print_message(information,crawler_failed(Source,URI)) Daniel@0: ), Daniel@0: debug(crawler,'Asserting successful load of ~w into graph ~w from ~w',[URI,Graph,Source]), Daniel@0: rdf_assert(Graph, prov:wasDerivedFrom, URI, Graph). Daniel@0: Daniel@0: %% uri_authority(+URI:resource, -Auth:atom, -Auto:boolean) is nondet. Daniel@0: % True when Auth is declared as an authority on URI. If Auto=true, Daniel@0: % then, this authority should be queried automatically when URI is the Daniel@0: % subject of a 'list_resource' page. Daniel@0: uri_authority(URI,Src,Auto) :- Daniel@0: authority(Cond,Src,Opts), Daniel@0: once(matches(Cond,URI)), Daniel@0: option(auto(Auto),Opts,false). Daniel@0: Daniel@0: matches(begins(Prefix),URI) :- sub_atom(URI,0,_,_,Prefix). Daniel@0: matches(\+Cond,URI) :- \+matches(Cond,URI). Daniel@0: matches((C1,C2),URI) :- matches(C1,URI), matches(C2,URI). Daniel@0: matches((C1;C2),URI) :- matches(C1,URI); matches(C2,URI). Daniel@0: Daniel@0: Daniel@0: %% source_uri_graph(+Auth:atom,+URI:resource,-Graph:atom) is det. Daniel@0: % Derive the RDF graph name for a given authority. Daniel@0: source_uri_graph(Source,URI,Graph) :- Daniel@0: source(Source,Handler,_), Daniel@0: call(Handler,uri_graph(URI,Graph)). Daniel@0: Daniel@0: source_name(Source,Name) :- source(Source,Handler,_), call(Handler,name(Name)), !. Daniel@0: source_name(Source,Source). Daniel@0: Daniel@0: prolog:message(crawler_crawled(Src,URI,Graph)) --> Daniel@0: ["Triples from ~w about ~w loaded into ~w via crawl method"-[Src,URI,Graph]]. Daniel@0: prolog:message(crawler_described(Src,URI,Graph,N)) --> Daniel@0: ["Triples from ~w about ~w loaded into ~w via describe method: ~d."-[Src,URI,Graph,N]]. Daniel@0: prolog:message(crawler_failed(Src,URI)) --> Daniel@0: ["Failed to load anything from ~w about ~w."-[Src,URI]]. Daniel@0: