diff cpack/dml/lib/crawler.pl @ 0:718306e29690 tip

committing public release
author Daniel Wolff
date Tue, 09 Feb 2016 21:05:06 +0100
parents
children
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpack/dml/lib/crawler.pl	Tue Feb 09 21:05:06 2016 +0100
@@ -0,0 +1,223 @@
+/* Part of DML (Digital Music Laboratory)
+	Copyright 2014-2015 Samer Abdallah, University of London
+	 
+	This program is free software; you can redistribute it and/or
+	modify it under the terms of the GNU General Public License
+	as published by the Free Software Foundation; either version 2
+	of the License, or (at your option) any later version.
+
+	This program is distributed in the hope that it will be useful,
+	but WITHOUT ANY WARRANTY; without even the implied warranty of
+	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+	GNU General Public License for more details.
+
+	You should have received a copy of the GNU General Public
+	License along with this library; if not, write to the Free Software
+	Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+
+:- module(crawler, 
+      [ crawl_loaded/2
+      , crawl_ui//2
+      , crawl/2
+      , crawl/1
+      , crawl_reload_graph/1
+      ]).
+
+:- multifile authority/3.
+:- multifile source/3.
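+
+% The two hooks above are multifile so that plugin modules can declare
+% sources. A sketch of their expected shapes, inferred from how they are
+% called in this file (the 'dbpedia' source id, handler name and prefix
+% below are illustrative only):
+%
+%    % authority(+Cond, -Src, -Opts): source Src claims authority over
+%    % URIs matching Cond (see matches/2 below); auto(true) in Opts
+%    % enables automatic crawling.
+%    authority(begins('http://dbpedia.org/resource/'), dbpedia, [auto(true)]).
+%
+%    % source(+Src, -Handler, -Opts): Handler is a goal called as
+%    % call(Handler, crawl(URI,Graph)), call(Handler, describe(URI,Triple)),
+%    % call(Handler, uri_graph(URI,Graph)) and call(Handler, name(Name));
+%    % Opts may contain messages(false).
+%    source(dbpedia, dbpedia_handler, [messages(false)]).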
+
+:- use_module(library(settings)).
+:- use_module(library(http/http_dispatch)).
+:- use_module(library(http/http_parameters)).
+:- use_module(library(http/http_wrapper)).
+:- use_module(library(http/html_write)).
+:- use_module(library(semweb/rdf_db)).
+:- use_module(library(prov_schema)).
+:- use_module(library(httpfiles)).
+:- use_module(library(htmlutils), [element//2]).
+:- use_module(library(dcg_core)).
+:- use_module(library(insist)).
+:- use_module(components(basics)).
+:- use_module(components(messages)).
+:- use_module(user(user_db)).
+
+:- setting(enable_auto_crawl, boolean, true, "Enables automatic semantic web searches for recognised URIs").
+
+:- http_handler(root(crawl), crawl_handler, []).
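+
+% A hypothetical request to the handler above (the parameter names match
+% the http_parameters/2 declaration in crawl_handler/1; the URI and source
+% id are made up, and the /crawl path assumes the server root is /):
+%
+%   GET /crawl?uri=http://example.org/artist/42&src=dbpedia&messages=false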
+
+no_cache -->
+   html_post(head,meta(['http-equiv'='Cache-Control', content='no-cache, no-store, must-revalidate, max-age=0'],[])),
+   html_post(head,meta(['http-equiv'='Pragma', content='no-cache'],[])),
+   html_post(head,meta(['http-equiv'='Expires', content=0],[])).
+
+refresh(URL,Delay) -->
+   no_cache,
+   html_post(head,meta(['http-equiv'=refresh, content="~d;~w"-[Delay,URL]],[])).
+
+%% crawl_handler(+Request) is det.
+%  Web service to query a given source (authority) about a given URI.
+crawl_handler(Request) :-
+   authorized(write(default, load(lod))),
+   http_parameters(Request,
+      [  uri(URI, [optional(false), description("URI to search for")])
+      ,  src(Src, [optional(false), description("Source ID")])
+      ,  return_to(Return, [optional(true), description("URI to return to")])
+      ,  return_after(Delay, [default(2)])
+      ,  messages(Msgs, [boolean, default(true)])
+      ]),
+   debug(crawler,"Got request to consult ~w on ~w",[Src,URI]),
+   return_options(Return, Delay, Options),
+   (  Msgs=true
+   -> call_showing_messages(insist(crawl(URI,Src)),Options)
+   ;  call_without_messages(insist(crawl(URI,Src)),Options)
+   ).
+
+call_without_messages(Goal,Options) :-
+   catch( (Goal, Msg='Success'), Ex, (Msg='Error', print_message(error,Ex))),
+   (  option(return_to(Return),Options)
+   -> option(return_after(Delay),Options,2),
+      reply_html_page(cliopatria(default),[], [ h2(Msg), \refresh(Return,Delay) ], [unstable])
+   ;  reply_html_page(cliopatria(default),[], [ h2(Msg) ], [unstable])
+   ).
+
+return_options(Return, _, []) :- var(Return), !.
+return_options(Return, Delay, [ return_to(Return), return_after(Delay) ]).
+
+
+%% crawl_ui(+URI:resource,+NT:natural) is det.
+%  Component that allows the user to trigger a crawl on a given URI,
+%  depending on NT, the number of triples that currently have it as subject.
+crawl_ui(URI,NT) -->
+   (  {setof(S-Auto, uri_authority(URI,S,Auto),Sources)}
+   -> (  {setof(S-A, (member(S-A,Sources), \+crawl_loaded(URI,S)), Untapped)}
+         % !!! should check permissions here
+      -> { http_current_request(Request), 
+           memberchk(request_uri(Here), Request) 
+         },
+         {debug(crawler,'Untapped sources: ~q',[Untapped])},
+         no_cache,
+         (  {setting(enable_auto_crawl,true)},
+            {setof(S, member(S-true,Untapped), AutoSources)}
+         -> html([ 'The following sources have been consulted automatically: ',
+                   \seqmap_with_sep(html(', '),element(code),AutoSources), '.', br([])
+                 ]),
+            {  debug(crawler,'consulting in parallel: ~w...',[AutoSources]),
+               concurrent_maplist(consult_source(URI),AutoSources,Statuses),
+               debug(crawler,'finished consulting on ~w.',[URI])
+            },
+            (  {member(ok,Statuses)}
+            -> refresh(Here,0),
+               html([b('At least one consultation succeeded; refreshing automatically.'),br([])])
+            ;  html([b('All consultations failed.'),br([])])
+            )
+         ;  html(p([ 'Click to consult one of the following sources for more information.'
+                   , br([]), \seqmap(consult_form(URI,Here),Untapped)
+                   ]))
+         )
+      ; []
+      )
+   ;  {NT>0} -> []
+   ;  html(p('No triples and no authorities known for this URI.'))
+   ).
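+
+% crawl_ui//2 is an HTML component; a page listing a resource might embed
+% it as follows (a sketch; the enclosing html//1 context and the variable
+% bindings are assumed):
+%
+%   ..., \crawl_ui(URI, NumTriples), ...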
+
+consult_source(URI,Source,Status) :-
+   catch( (crawl(URI,Source),       Status=ok), Ex,
+          (print_message(error,Ex), Status=error(Ex))).
+
+consult_form(URI,Here,Source-_) -->
+   {http_link_to_id(crawl_handler, [], FetchURL)},
+   {source_name(Source,Name)},
+   {source(Source,_,Opts), option(messages(Msgs),Opts,true)},
+   html(form([style="display:inline-block", action(FetchURL)],
+      [ \hidden(uri, URI),
+        \hidden(src, Source),
+        \hidden(return_to, Here),
+        \hidden(messages, Msgs),
+        input([ type(submit), value(Name) ])
+      ])).
+
+%% crawl_loaded(+URI:resource,+Source:atom) is semidet.
+%  True when URI has already been crawled via Source and the results
+%  added to the RDF database.
+crawl_loaded(URI,Source) :-
+   source_uri_graph(Source,URI,Graph),
+   debug(crawler,'Checking if loaded: ~w in graph ~w from ~w',[URI,Graph,Source]),
+   rdf(Graph,prov:wasDerivedFrom,URI,Graph).
+
+%% crawl_reload_graph(+Graph:atom) is det.
+%  Attempts to delete the named graph and reload it from all the URIs
+%  that it was derived from.
+crawl_reload_graph(Graph) :-
+   findall(URI,rdf(Graph,prov:wasDerivedFrom,URI,Graph),URIs),
+   length(URIs,NURIs),
+   print_message(information,crawl_reload(Graph,NURIs)),
+   rdf_transaction((
+      rdf_unload_graph(Graph),
+      maplist(reload_into(Graph),URIs)
+   )).
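+
+% For example, to rebuild a named graph from its recorded provenance
+% links (the graph name is illustrative):
+%
+%   ?- crawl_reload_graph('http://dml.org/graph/dbpedia').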
+
+reload_into(Graph,URI) :-
+   uri_authority(URI,Source,_),
+   source_uri_graph(Source,URI,Graph),
+   load_into(Graph,Source,URI).
+
+%% crawl(+URI:resource) is det.
+%  Looks for information about URI on all sources claiming authority
+%  over it. Queries are made in parallel using concurrent_maplist/3.
+crawl(URI) :-
+   findall(S,uri_authority(URI,S,_),Sources),
+   concurrent_maplist(consult_source(URI),Sources,_).
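+
+% For example, to crawl all declared sources for a (made-up) URI:
+%
+%   ?- crawl('http://example.org/resource/xyz').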
+
+%% crawl(+URI:resource,+Source:atom) is det.
+%  Looks for information about URI on the specified source.
+crawl(URI,Source) :-
+   debug(crawler,"Consulting source ~w on ~w...",[Source,URI]),
+   source_uri_graph(Source, URI, Graph),
+   rdf_transaction(load_into(Graph,Source,URI)).
+
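+%% load_into(+Graph:atom,+Source:atom,+URI:resource) is det.
+%  Loads data about URI from Source into Graph: first tries the source's
+%  crawl method, then falls back to collecting the triples from its
+%  describe method, and finally records provenance for the graph with a
+%  prov:wasDerivedFrom link.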
+load_into(Graph,Source,URI) :-
+   source(Source, Handler, _),
+   (  call(Handler,crawl(URI,Graph)) 
+   -> print_message(information,crawler_crawled(Source,URI,Graph))
+   ;  setof(T, call(Handler,describe(URI,T)), Triples),
+      length(Triples, NumTriples),
+      forall(member(rdf(S,P,O),Triples), rdf_assert(S,P,O,Graph))
+   -> print_message(information,crawler_described(Source,URI,Graph,NumTriples))
+   ;  print_message(information,crawler_failed(Source,URI))
+   ),
+   debug(crawler,'Asserting successful load of ~w into graph ~w from ~w',[URI,Graph,Source]),
+   rdf_assert(Graph, prov:wasDerivedFrom, URI, Graph).
+
+%% uri_authority(+URI:resource, -Src:atom, -Auto:boolean) is nondet.
+%  True when Src is declared as an authority on URI. If Auto=true,
+%  then this authority should be queried automatically when URI is the
+%  subject of a 'list_resource' page.
+uri_authority(URI,Src,Auto) :-
+   authority(Cond,Src,Opts),
+   once(matches(Cond,URI)),
+   option(auto(Auto),Opts,false).
+
+matches(begins(Prefix),URI) :- sub_atom(URI,0,_,_,Prefix).
+matches(\+Cond,URI) :- \+matches(Cond,URI).
+matches((C1,C2),URI) :- matches(C1,URI), matches(C2,URI).
+matches((C1;C2),URI) :- matches(C1,URI); matches(C2,URI).
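+
+% The condition language supports prefix tests, negation, conjunction and
+% disjunction. For instance (the prefixes and URI are illustrative):
+%
+%   ?- matches((begins('http://dbpedia.org/'),
+%               \+begins('http://dbpedia.org/ontology/')),
+%              'http://dbpedia.org/resource/Miles_Davis').
+%   true.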
+
+
+%% source_uri_graph(+Source:atom,+URI:resource,-Graph:atom) is det.
+%  Derives the RDF graph name for a given source and URI.
+source_uri_graph(Source,URI,Graph) :-
+   source(Source,Handler,_),
+   call(Handler,uri_graph(URI,Graph)).
+
+source_name(Source,Name) :- source(Source,Handler,_), call(Handler,name(Name)), !.
+source_name(Source,Source).
+
+prolog:message(crawler_crawled(Src,URI,Graph)) -->
+   ["Triples from ~w about ~w loaded into ~w via crawl method"-[Src,URI,Graph]].
+prolog:message(crawler_described(Src,URI,Graph,N)) -->
+   ["Triples from ~w about ~w loaded into ~w via describe method: ~d."-[Src,URI,Graph,N]].
+prolog:message(crawler_failed(Src,URI)) -->
+   ["Failed to load anything from ~w about ~w."-[Src,URI]].
+