Mercurial > hg > dml-open-cliopatria
view cpack/dml/lib/crawler.pl @ 0:718306e29690 tip
commiting public release
author | Daniel Wolff |
---|---|
date | Tue, 09 Feb 2016 21:05:06 +0100 |
parents | |
children |
line wrap: on
line source
/* Part of DML (Digital Music Laboratory) Copyright 2014-2015 Samer Abdallah, University of London This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ :- module(crawler, [ crawl_loaded/2 , crawl_ui//2 , crawl/2 , crawl/1 , crawl_reload_graph/1 ]). :- multifile authority/3. :- multifile source/3. :- use_module(library(settings)). :- use_module(library(http/http_dispatch)). :- use_module(library(http/http_parameters)). :- use_module(library(http/http_wrapper)). :- use_module(library(http/html_write)). :- use_module(library(semweb/rdf_db)). :- use_module(library(prov_schema)). :- use_module(library(httpfiles)). :- use_module(library(htmlutils), [element//2]). :- use_module(library(dcg_core)). :- use_module(library(insist)). :- use_module(components(basics)). :- use_module(components(messages)). :- use_module(user(user_db)). :- setting(enable_auto_crawl, boolean, true, "Enables automatic semantic web searches for recognised URIs"). :- http_handler(root(crawl), crawl_handler, []). no_cache --> html_post(head,meta(['http-equiv'='Cache-Control', content='no-cache, no-store, must-revalidate, max-age=0'],[])), html_post(head,meta(['http-equiv'='Pragma', content='no-cache'],[])), html_post(head,meta(['http-equiv'='Expires', content=0],[])). refresh(URL,Delay) --> no_cache, html_post(head,meta(['http-equiv'=refresh, content="~d;~w"-[Delay,URL]],[])). %% crawl_handler(+Request) is det. % web service to query given authority about given URI crawl_handler(Request) :- authorized(write(default, load(lod))), http_parameters(Request, [ uri(URI, [optional(false), description("URI to search for")]) , src(Src, [optional(false), description("Source ID")]) , return_to(Return, [ optional(true), description('URI to return to') ]) , return_after(Delay, [ default(2) ]) , messages(Msgs, [boolean, default(true)]) ]), debug(crawler,"Got request to consult ~w on ~w",[Src,URI]), return_options(Return, Delay, Options), ( Msgs=true -> call_showing_messages(insist(crawl(URI,Src)),Options) ; call_without_messages(insist(crawl(URI,Src)),Options) ). call_without_messages(Goal,Options) :- catch( (Goal, Msg='Success'), Ex, (Msg='Error', print_message(error,Ex))), ( option(return_to(Return),Options) -> option(return_after(Delay),Options,2), reply_html_page(cliopatria(default),[], [ h2(Msg), \refresh(Return,Delay) ], [unstable]) ; reply_html_page(cliopatria(default),[], [ h2(Msg) ], [unstable]) ). return_options(Return, _, []) :- var(Return), !. return_options(Return, Delay, [ return_to(Return), return_after(Delay) ]). %% crawl_ui(+URI:resource,+NT:natural) is det. % Component to allow user to trigger a crawl on a given URI, depending on % NT, the number of triples which currently have it has subject. crawl_ui(URI,NT) --> ( {setof(S-Auto, uri_authority(URI,S,Auto),Sources)} -> ( {setof(S-A, (member(S-A,Sources), \+crawl_loaded(URI,S)), Untapped)} % !!! should check permissions here -> { http_current_request(Request), memberchk(request_uri(Here), Request) }, {debug(crawler,'Untapped sources: ~q',[Untapped])}, no_cache, ( {setting(enable_auto_crawl,true)}, {setof(S, member(S-true,Untapped), AutoSources)} -> html([ 'The following sources have been consulted automatically: ', \seqmap_with_sep(html(', '),element(code),AutoSources), '.', br([]) ]), { debug(crawler,'consulting in parallel: ~w...',[AutoSources]), concurrent_maplist(consult_source(URI),AutoSources,Statuses), debug(crawler,'finished consulting on ~w.',[URI]) }, ( {member(ok,Statuses)} -> refresh(Here,0), html([b('At least one consultation succeeded; refreshing automatically.'),br([])]) ; html([b('All consultations failed.'),br([])]) ) ; html(p([ 'Click to consult one of the following sources for more information.' , br([]), \seqmap(consult_form(URI,Here),Untapped) ])) ) ; [] ) ; {NT>0} -> [] ; html(p('No triples and no authorities known for this URI.')) ). consult_source(URI,Source,Status) :- catch( (crawl(URI,Source), Status=ok), Ex, (print_message(error,Ex), Status=error(Ex))). consult_form(URI,Here,Source-_) --> {http_link_to_id(crawl_handler, [], FetchURL)}, {source_name(Source,Name)}, {source(Source,_,Opts), option(messages(Msgs),Opts,true)}, html(form([style="display:inline-block", action(FetchURL)], [ \hidden(uri, URI), \hidden(src, Source), \hidden(return_to, Here), \hidden(messages, Msgs), input([ type(submit), value(Name) ]) ])). %% crawl_loaded(+URI:resource,+Source:atom) is semidet. % True when URI has already been crawled and added to the RDF database. crawl_loaded(URI,Source) :- source_uri_graph(Source,URI,Graph), debug(crawler,'Checking if loaded: ~w in graph ~w from ~w',[URI,Graph,Source]), rdf(Graph,prov:wasDerivedFrom,URI,Graph). %% crawl_reload_graph(+Graph:atom) is det. % Attempts to delete the named graph and reload it from all the URIs % that it was derived from. crawl_reload_graph(Graph) :- findall(URI,rdf(Graph,prov:wasDerivedFrom,URI,Graph),URIs), length(URIs,NURIs), print_message(information,crawl_reload(Graph,NURIs)), rdf_transaction(( rdf_unload_graph(Graph), maplist(reload_into(Graph),URIs) )). reload_into(Graph,URI) :- uri_authority(URI,Source,_), source_uri_graph(Source,URI,Graph), load_into(Graph,Source,URI). %% crawl(+URI:resource) is det. % Looks for information about URI on all authorities claiming authority on it. % Queries are made in parallel using concurrent_maplist/2. crawl(URI) :- findall(S,uri_authority(URI,S,_),Sources), concurrent_maplist(consult_source(URI),Sources,_). %% crawl(+URI:resource,+Auth:atom) is det. % Looks for information about URI on specified authority. crawl(URI,Source) :- debug(crawler,"Consulting source ~w on ~w...",[Source,URI]), source_uri_graph(Source, URI, Graph), rdf_transaction(load_into(Graph,Source,URI)). load_into(Graph,Source,URI) :- source(Source, Handler, _), ( call(Handler,crawl(URI,Graph)) -> print_message(information,crawler_crawled(Source,URI,Graph)) ; setof(T, call(Handler,describe(URI,T)), Triples), length(Triples, NumTriples), forall(member(rdf(S,P,O),Triples), rdf_assert(S,P,O,Graph)) -> print_message(information,crawler_described(Source,URI,Graph,NumTriples)) ; print_message(information,crawler_failed(Source,URI)) ), debug(crawler,'Asserting successful load of ~w into graph ~w from ~w',[URI,Graph,Source]), rdf_assert(Graph, prov:wasDerivedFrom, URI, Graph). %% uri_authority(+URI:resource, -Auth:atom, -Auto:boolean) is nondet. % True when Auth is declared as an authority on URI. If Auto=true, % then, this authority should be queried automatically when URI is the % subject of a 'list_resource' page. uri_authority(URI,Src,Auto) :- authority(Cond,Src,Opts), once(matches(Cond,URI)), option(auto(Auto),Opts,false). matches(begins(Prefix),URI) :- sub_atom(URI,0,_,_,Prefix). matches(\+Cond,URI) :- \+matches(Cond,URI). matches((C1,C2),URI) :- matches(C1,URI), matches(C2,URI). matches((C1;C2),URI) :- matches(C1,URI); matches(C2,URI). %% source_uri_graph(+Auth:atom,+URI:resource,-Graph:atom) is det. % Derive the RDF graph name for a given authority. source_uri_graph(Source,URI,Graph) :- source(Source,Handler,_), call(Handler,uri_graph(URI,Graph)). source_name(Source,Name) :- source(Source,Handler,_), call(Handler,name(Name)), !. source_name(Source,Source). prolog:message(crawler_crawled(Src,URI,Graph)) --> ["Triples from ~w about ~w loaded into ~w via crawl method"-[Src,URI,Graph]]. prolog:message(crawler_described(Src,URI,Graph,N)) --> ["Triples from ~w about ~w loaded into ~w via describe method: ~d."-[Src,URI,Graph,N]]. prolog:message(crawler_failed(Src,URI)) --> ["Failed to load anything from ~w about ~w."-[Src,URI]].