view cpack/dml/lib/crawler.pl @ 0:718306e29690 tip

commiting public release
author Daniel Wolff
date Tue, 09 Feb 2016 21:05:06 +0100
parents
children
line wrap: on
line source
/* Part of DML (Digital Music Laboratory)
	Copyright 2014-2015 Samer Abdallah, University of London
	 
	This program is free software; you can redistribute it and/or
	modify it under the terms of the GNU General Public License
	as published by the Free Software Foundation; either version 2
	of the License, or (at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public
	License along with this library; if not, write to the Free Software
	Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
*/

:- module(crawler, 
      [ crawl_loaded/2
      , crawl_ui//2
      , crawl/2
      , crawl/1
      , crawl_reload_graph/1
      ]).

:- multifile authority/3.
:- multifile source/3.

:- use_module(library(settings)).
:- use_module(library(http/http_dispatch)).
:- use_module(library(http/http_parameters)).
:- use_module(library(http/http_wrapper)).
:- use_module(library(http/html_write)).
:- use_module(library(semweb/rdf_db)).
:- use_module(library(prov_schema)).
:- use_module(library(httpfiles)).
:- use_module(library(htmlutils), [element//2]).
:- use_module(library(dcg_core)).
:- use_module(library(insist)).
:- use_module(components(basics)).
:- use_module(components(messages)).
:- use_module(user(user_db)).

:- setting(enable_auto_crawl, boolean, true, "Enables automatic semantic web searches for recognised URIs").

:- http_handler(root(crawl), crawl_handler, []).

no_cache -->
   html_post(head,meta(['http-equiv'='Cache-Control', content='no-cache, no-store, must-revalidate, max-age=0'],[])),
   html_post(head,meta(['http-equiv'='Pragma', content='no-cache'],[])),
   html_post(head,meta(['http-equiv'='Expires', content=0],[])).

refresh(URL,Delay) -->
   no_cache,
   html_post(head,meta(['http-equiv'=refresh, content="~d;~w"-[Delay,URL]],[])).

%% crawl_handler(+Request) is det.
% web service to query given authority about given URI
crawl_handler(Request) :-
   authorized(write(default, load(lod))),
	http_parameters(Request,
		[	uri(URI, [optional(false), description("URI to search for")])
		,	src(Src, [optional(false), description("Source ID")])
      ,  return_to(Return, [ optional(true), description('URI to return to') ])
      ,  return_after(Delay, [ default(2) ])
      ,  messages(Msgs, [boolean, default(true)])
		]),
   debug(crawler,"Got request to consult ~w on ~w",[Src,URI]),
   return_options(Return, Delay, Options),
   (  Msgs=true
   -> call_showing_messages(insist(crawl(URI,Src)),Options)
   ;  call_without_messages(insist(crawl(URI,Src)),Options)
   ).

call_without_messages(Goal,Options) :-
   catch( (Goal, Msg='Success'), Ex, (Msg='Error', print_message(error,Ex))),
   (  option(return_to(Return),Options)
   -> option(return_after(Delay),Options,2),
      reply_html_page(cliopatria(default),[], [ h2(Msg), \refresh(Return,Delay) ], [unstable])
   ;  reply_html_page(cliopatria(default),[], [ h2(Msg) ], [unstable])
   ).

return_options(Return, _, []) :- var(Return), !.
return_options(Return, Delay, [ return_to(Return), return_after(Delay) ]).


%% crawl_ui(+URI:resource,+NT:natural) is det.
%  Component to allow user to trigger a crawl on a given URI, depending on
%  NT, the number of triples which currently have it has subject.
crawl_ui(URI,NT) -->
   (  {setof(S-Auto, uri_authority(URI,S,Auto),Sources)}
   -> (  {setof(S-A, (member(S-A,Sources), \+crawl_loaded(URI,S)), Untapped)}
         % !!! should check permissions here
      -> { http_current_request(Request), 
           memberchk(request_uri(Here), Request) 
         },
         {debug(crawler,'Untapped sources: ~q',[Untapped])},
         no_cache,
         (  {setting(enable_auto_crawl,true)},
            {setof(S, member(S-true,Untapped), AutoSources)}
         -> html([ 'The following sources have been consulted automatically: ',
                   \seqmap_with_sep(html(', '),element(code),AutoSources), '.', br([])
                 ]),
            {  debug(crawler,'consulting in parallel: ~w...',[AutoSources]),
               concurrent_maplist(consult_source(URI),AutoSources,Statuses),
               debug(crawler,'finished consulting on ~w.',[URI])
            },
            (  {member(ok,Statuses)}
            -> refresh(Here,0),
               html([b('At least one consultation succeeded; refreshing automatically.'),br([])])
            ;  html([b('All consultations failed.'),br([])])
            )
         ;  html(p([ 'Click to consult one of the following sources for more information.'
                   , br([]), \seqmap(consult_form(URI,Here),Untapped)
                   ]))
         )
      ; []
      )
   ;  {NT>0} -> []
   ;  html(p('No triples and no authorities known for this URI.'))
   ).

consult_source(URI,Source,Status) :-
   catch( (crawl(URI,Source),       Status=ok), Ex,
          (print_message(error,Ex), Status=error(Ex))).

consult_form(URI,Here,Source-_) -->
   {http_link_to_id(crawl_handler, [], FetchURL)},
   {source_name(Source,Name)},
   {source(Source,_,Opts), option(messages(Msgs),Opts,true)},
   html(form([style="display:inline-block", action(FetchURL)],
      [ \hidden(uri, URI),
        \hidden(src, Source),
        \hidden(return_to, Here),
        \hidden(messages, Msgs),
        input([ type(submit), value(Name) ])
      ])).

%% crawl_loaded(+URI:resource,+Source:atom) is semidet.
%  True when URI has already been crawled and added to the RDF database.
crawl_loaded(URI,Source) :-
   source_uri_graph(Source,URI,Graph),
   debug(crawler,'Checking if loaded: ~w in graph ~w from ~w',[URI,Graph,Source]),
	rdf(Graph,prov:wasDerivedFrom,URI,Graph).
	

%% crawl_reload_graph(+Graph:atom) is det.
%  Attempts to delete the named graph and reload it from all the URIs
%  that it was derived from.
crawl_reload_graph(Graph) :-
   findall(URI,rdf(Graph,prov:wasDerivedFrom,URI,Graph),URIs),
   length(URIs,NURIs),
   print_message(information,crawl_reload(Graph,NURIs)),
   rdf_transaction((
      rdf_unload_graph(Graph),
      maplist(reload_into(Graph),URIs)
   )).

reload_into(Graph,URI) :-
   uri_authority(URI,Source,_),
   source_uri_graph(Source,URI,Graph),
   load_into(Graph,Source,URI).

%% crawl(+URI:resource) is det.
%  Looks for information about URI on all authorities claiming authority on it.
%  Queries are made in parallel using concurrent_maplist/2.
crawl(URI) :-
   findall(S,uri_authority(URI,S,_),Sources),
   concurrent_maplist(consult_source(URI),Sources,_).
   

%% crawl(+URI:resource,+Auth:atom) is det.
%  Looks for information about URI on specified authority.
crawl(URI,Source) :-
	debug(crawler,"Consulting source ~w on ~w...",[Source,URI]),
	source_uri_graph(Source, URI, Graph),
   rdf_transaction(load_into(Graph,Source,URI)).

load_into(Graph,Source,URI) :-
   source(Source, Handler, _),
   (  call(Handler,crawl(URI,Graph)) 
   -> print_message(information,crawler_crawled(Source,URI,Graph))
   ;  setof(T, call(Handler,describe(URI,T)), Triples),
      length(Triples, NumTriples),
      forall(member(rdf(S,P,O),Triples), rdf_assert(S,P,O,Graph))
   -> print_message(information,crawler_described(Source,URI,Graph,NumTriples))
   ;  print_message(information,crawler_failed(Source,URI))
   ),
   debug(crawler,'Asserting successful load of ~w into graph ~w from ~w',[URI,Graph,Source]),
   rdf_assert(Graph, prov:wasDerivedFrom, URI, Graph).

%% uri_authority(+URI:resource, -Auth:atom, -Auto:boolean) is nondet.
%  True when Auth is declared as an authority on URI. If Auto=true,
%  then, this authority should be queried automatically when URI is the 
%  subject of a 'list_resource' page.
uri_authority(URI,Src,Auto) :-
   authority(Cond,Src,Opts),
   once(matches(Cond,URI)),
   option(auto(Auto),Opts,false).

matches(begins(Prefix),URI) :- sub_atom(URI,0,_,_,Prefix).
matches(\+Cond,URI) :- \+matches(Cond,URI).
matches((C1,C2),URI) :- matches(C1,URI), matches(C2,URI).
matches((C1;C2),URI) :- matches(C1,URI); matches(C2,URI).


%% source_uri_graph(+Auth:atom,+URI:resource,-Graph:atom) is det.
%  Derive the RDF graph name for a given authority.
source_uri_graph(Source,URI,Graph) :-
   source(Source,Handler,_),
   call(Handler,uri_graph(URI,Graph)).

source_name(Source,Name) :- source(Source,Handler,_), call(Handler,name(Name)), !.
source_name(Source,Source).

prolog:message(crawler_crawled(Src,URI,Graph)) -->
   ["Triples from ~w about ~w loaded into ~w via crawl method"-[Src,URI,Graph]].
prolog:message(crawler_described(Src,URI,Graph,N)) -->
   ["Triples from ~w about ~w loaded into ~w via describe method: ~d."-[Src,URI,Graph,N]].
prolog:message(crawler_failed(Src,URI)) -->
   ["Failed to load anything from ~w about ~w."-[Src,URI]].