annotate cpack/dml/lib/crawler.pl @ 0:718306e29690 tip

commiting public release
author Daniel Wolff
date Tue, 09 Feb 2016 21:05:06 +0100
parents
children
rev   line source
Daniel@0 1 /* Part of DML (Digital Music Laboratory)
Daniel@0 2 Copyright 2014-2015 Samer Abdallah, University of London
Daniel@0 3
Daniel@0 4 This program is free software; you can redistribute it and/or
Daniel@0 5 modify it under the terms of the GNU General Public License
Daniel@0 6 as published by the Free Software Foundation; either version 2
Daniel@0 7 of the License, or (at your option) any later version.
Daniel@0 8
Daniel@0 9 This program is distributed in the hope that it will be useful,
Daniel@0 10 but WITHOUT ANY WARRANTY; without even the implied warranty of
Daniel@0 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Daniel@0 12 GNU General Public License for more details.
Daniel@0 13
Daniel@0 14 You should have received a copy of the GNU General Public
Daniel@0 15 License along with this library; if not, write to the Free Software
Daniel@0 16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Daniel@0 17 */
Daniel@0 18
Daniel@0 19 :- module(crawler,
Daniel@0 20 [ crawl_loaded/2
Daniel@0 21 , crawl_ui//2
Daniel@0 22 , crawl/2
Daniel@0 23 , crawl/1
Daniel@0 24 , crawl_reload_graph/1
Daniel@0 25 ]).
Daniel@0 26
Daniel@0 27 :- multifile authority/3.
Daniel@0 28 :- multifile source/3.
Daniel@0 29
Daniel@0 30 :- use_module(library(settings)).
Daniel@0 31 :- use_module(library(http/http_dispatch)).
Daniel@0 32 :- use_module(library(http/http_parameters)).
Daniel@0 33 :- use_module(library(http/http_wrapper)).
Daniel@0 34 :- use_module(library(http/html_write)).
Daniel@0 35 :- use_module(library(semweb/rdf_db)).
Daniel@0 36 :- use_module(library(prov_schema)).
Daniel@0 37 :- use_module(library(httpfiles)).
Daniel@0 38 :- use_module(library(htmlutils), [element//2]).
Daniel@0 39 :- use_module(library(dcg_core)).
Daniel@0 40 :- use_module(library(insist)).
Daniel@0 41 :- use_module(components(basics)).
Daniel@0 42 :- use_module(components(messages)).
Daniel@0 43 :- use_module(user(user_db)).
Daniel@0 44
Daniel@0 45 :- setting(enable_auto_crawl, boolean, true, "Enables automatic semantic web searches for recognised URIs").
Daniel@0 46
Daniel@0 47 :- http_handler(root(crawl), crawl_handler, []).
Daniel@0 48
Daniel@0 49 no_cache -->
Daniel@0 50 html_post(head,meta(['http-equiv'='Cache-Control', content='no-cache, no-store, must-revalidate, max-age=0'],[])),
Daniel@0 51 html_post(head,meta(['http-equiv'='Pragma', content='no-cache'],[])),
Daniel@0 52 html_post(head,meta(['http-equiv'='Expires', content=0],[])).
Daniel@0 53
Daniel@0 54 refresh(URL,Delay) -->
Daniel@0 55 no_cache,
Daniel@0 56 html_post(head,meta(['http-equiv'=refresh, content="~d;~w"-[Delay,URL]],[])).
Daniel@0 57
Daniel@0 58 %% crawl_handler(+Request) is det.
Daniel@0 59 % web service to query given authority about given URI
Daniel@0 60 crawl_handler(Request) :-
Daniel@0 61 authorized(write(default, load(lod))),
Daniel@0 62 http_parameters(Request,
Daniel@0 63 [ uri(URI, [optional(false), description("URI to search for")])
Daniel@0 64 , src(Src, [optional(false), description("Source ID")])
Daniel@0 65 , return_to(Return, [ optional(true), description('URI to return to') ])
Daniel@0 66 , return_after(Delay, [ default(2) ])
Daniel@0 67 , messages(Msgs, [boolean, default(true)])
Daniel@0 68 ]),
Daniel@0 69 debug(crawler,"Got request to consult ~w on ~w",[Src,URI]),
Daniel@0 70 return_options(Return, Delay, Options),
Daniel@0 71 ( Msgs=true
Daniel@0 72 -> call_showing_messages(insist(crawl(URI,Src)),Options)
Daniel@0 73 ; call_without_messages(insist(crawl(URI,Src)),Options)
Daniel@0 74 ).
Daniel@0 75
Daniel@0 76 call_without_messages(Goal,Options) :-
Daniel@0 77 catch( (Goal, Msg='Success'), Ex, (Msg='Error', print_message(error,Ex))),
Daniel@0 78 ( option(return_to(Return),Options)
Daniel@0 79 -> option(return_after(Delay),Options,2),
Daniel@0 80 reply_html_page(cliopatria(default),[], [ h2(Msg), \refresh(Return,Delay) ], [unstable])
Daniel@0 81 ; reply_html_page(cliopatria(default),[], [ h2(Msg) ], [unstable])
Daniel@0 82 ).
Daniel@0 83
Daniel@0 84 return_options(Return, _, []) :- var(Return), !.
Daniel@0 85 return_options(Return, Delay, [ return_to(Return), return_after(Delay) ]).
Daniel@0 86
Daniel@0 87
Daniel@0 88 %% crawl_ui(+URI:resource,+NT:natural) is det.
Daniel@0 89 % Component to allow user to trigger a crawl on a given URI, depending on
Daniel@0 90 % NT, the number of triples which currently have it has subject.
Daniel@0 91 crawl_ui(URI,NT) -->
Daniel@0 92 ( {setof(S-Auto, uri_authority(URI,S,Auto),Sources)}
Daniel@0 93 -> ( {setof(S-A, (member(S-A,Sources), \+crawl_loaded(URI,S)), Untapped)}
Daniel@0 94 % !!! should check permissions here
Daniel@0 95 -> { http_current_request(Request),
Daniel@0 96 memberchk(request_uri(Here), Request)
Daniel@0 97 },
Daniel@0 98 {debug(crawler,'Untapped sources: ~q',[Untapped])},
Daniel@0 99 no_cache,
Daniel@0 100 ( {setting(enable_auto_crawl,true)},
Daniel@0 101 {setof(S, member(S-true,Untapped), AutoSources)}
Daniel@0 102 -> html([ 'The following sources have been consulted automatically: ',
Daniel@0 103 \seqmap_with_sep(html(', '),element(code),AutoSources), '.', br([])
Daniel@0 104 ]),
Daniel@0 105 { debug(crawler,'consulting in parallel: ~w...',[AutoSources]),
Daniel@0 106 concurrent_maplist(consult_source(URI),AutoSources,Statuses),
Daniel@0 107 debug(crawler,'finished consulting on ~w.',[URI])
Daniel@0 108 },
Daniel@0 109 ( {member(ok,Statuses)}
Daniel@0 110 -> refresh(Here,0),
Daniel@0 111 html([b('At least one consultation succeeded; refreshing automatically.'),br([])])
Daniel@0 112 ; html([b('All consultations failed.'),br([])])
Daniel@0 113 )
Daniel@0 114 ; html(p([ 'Click to consult one of the following sources for more information.'
Daniel@0 115 , br([]), \seqmap(consult_form(URI,Here),Untapped)
Daniel@0 116 ]))
Daniel@0 117 )
Daniel@0 118 ; []
Daniel@0 119 )
Daniel@0 120 ; {NT>0} -> []
Daniel@0 121 ; html(p('No triples and no authorities known for this URI.'))
Daniel@0 122 ).
Daniel@0 123
Daniel@0 124 consult_source(URI,Source,Status) :-
Daniel@0 125 catch( (crawl(URI,Source), Status=ok), Ex,
Daniel@0 126 (print_message(error,Ex), Status=error(Ex))).
Daniel@0 127
Daniel@0 128 consult_form(URI,Here,Source-_) -->
Daniel@0 129 {http_link_to_id(crawl_handler, [], FetchURL)},
Daniel@0 130 {source_name(Source,Name)},
Daniel@0 131 {source(Source,_,Opts), option(messages(Msgs),Opts,true)},
Daniel@0 132 html(form([style="display:inline-block", action(FetchURL)],
Daniel@0 133 [ \hidden(uri, URI),
Daniel@0 134 \hidden(src, Source),
Daniel@0 135 \hidden(return_to, Here),
Daniel@0 136 \hidden(messages, Msgs),
Daniel@0 137 input([ type(submit), value(Name) ])
Daniel@0 138 ])).
Daniel@0 139
Daniel@0 140 %% crawl_loaded(+URI:resource,+Source:atom) is semidet.
Daniel@0 141 % True when URI has already been crawled and added to the RDF database.
Daniel@0 142 crawl_loaded(URI,Source) :-
Daniel@0 143 source_uri_graph(Source,URI,Graph),
Daniel@0 144 debug(crawler,'Checking if loaded: ~w in graph ~w from ~w',[URI,Graph,Source]),
Daniel@0 145 rdf(Graph,prov:wasDerivedFrom,URI,Graph).
Daniel@0 146
Daniel@0 147
Daniel@0 148 %% crawl_reload_graph(+Graph:atom) is det.
Daniel@0 149 % Attempts to delete the named graph and reload it from all the URIs
Daniel@0 150 % that it was derived from.
Daniel@0 151 crawl_reload_graph(Graph) :-
Daniel@0 152 findall(URI,rdf(Graph,prov:wasDerivedFrom,URI,Graph),URIs),
Daniel@0 153 length(URIs,NURIs),
Daniel@0 154 print_message(information,crawl_reload(Graph,NURIs)),
Daniel@0 155 rdf_transaction((
Daniel@0 156 rdf_unload_graph(Graph),
Daniel@0 157 maplist(reload_into(Graph),URIs)
Daniel@0 158 )).
Daniel@0 159
Daniel@0 160 reload_into(Graph,URI) :-
Daniel@0 161 uri_authority(URI,Source,_),
Daniel@0 162 source_uri_graph(Source,URI,Graph),
Daniel@0 163 load_into(Graph,Source,URI).
Daniel@0 164
Daniel@0 165 %% crawl(+URI:resource) is det.
Daniel@0 166 % Looks for information about URI on all authorities claiming authority on it.
Daniel@0 167 % Queries are made in parallel using concurrent_maplist/2.
Daniel@0 168 crawl(URI) :-
Daniel@0 169 findall(S,uri_authority(URI,S,_),Sources),
Daniel@0 170 concurrent_maplist(consult_source(URI),Sources,_).
Daniel@0 171
Daniel@0 172
Daniel@0 173 %% crawl(+URI:resource,+Auth:atom) is det.
Daniel@0 174 % Looks for information about URI on specified authority.
Daniel@0 175 crawl(URI,Source) :-
Daniel@0 176 debug(crawler,"Consulting source ~w on ~w...",[Source,URI]),
Daniel@0 177 source_uri_graph(Source, URI, Graph),
Daniel@0 178 rdf_transaction(load_into(Graph,Source,URI)).
Daniel@0 179
Daniel@0 180 load_into(Graph,Source,URI) :-
Daniel@0 181 source(Source, Handler, _),
Daniel@0 182 ( call(Handler,crawl(URI,Graph))
Daniel@0 183 -> print_message(information,crawler_crawled(Source,URI,Graph))
Daniel@0 184 ; setof(T, call(Handler,describe(URI,T)), Triples),
Daniel@0 185 length(Triples, NumTriples),
Daniel@0 186 forall(member(rdf(S,P,O),Triples), rdf_assert(S,P,O,Graph))
Daniel@0 187 -> print_message(information,crawler_described(Source,URI,Graph,NumTriples))
Daniel@0 188 ; print_message(information,crawler_failed(Source,URI))
Daniel@0 189 ),
Daniel@0 190 debug(crawler,'Asserting successful load of ~w into graph ~w from ~w',[URI,Graph,Source]),
Daniel@0 191 rdf_assert(Graph, prov:wasDerivedFrom, URI, Graph).
Daniel@0 192
Daniel@0 193 %% uri_authority(+URI:resource, -Auth:atom, -Auto:boolean) is nondet.
Daniel@0 194 % True when Auth is declared as an authority on URI. If Auto=true,
Daniel@0 195 % then, this authority should be queried automatically when URI is the
Daniel@0 196 % subject of a 'list_resource' page.
Daniel@0 197 uri_authority(URI,Src,Auto) :-
Daniel@0 198 authority(Cond,Src,Opts),
Daniel@0 199 once(matches(Cond,URI)),
Daniel@0 200 option(auto(Auto),Opts,false).
Daniel@0 201
Daniel@0 202 matches(begins(Prefix),URI) :- sub_atom(URI,0,_,_,Prefix).
Daniel@0 203 matches(\+Cond,URI) :- \+matches(Cond,URI).
Daniel@0 204 matches((C1,C2),URI) :- matches(C1,URI), matches(C2,URI).
Daniel@0 205 matches((C1;C2),URI) :- matches(C1,URI); matches(C2,URI).
Daniel@0 206
Daniel@0 207
Daniel@0 208 %% source_uri_graph(+Auth:atom,+URI:resource,-Graph:atom) is det.
Daniel@0 209 % Derive the RDF graph name for a given authority.
Daniel@0 210 source_uri_graph(Source,URI,Graph) :-
Daniel@0 211 source(Source,Handler,_),
Daniel@0 212 call(Handler,uri_graph(URI,Graph)).
Daniel@0 213
Daniel@0 214 source_name(Source,Name) :- source(Source,Handler,_), call(Handler,name(Name)), !.
Daniel@0 215 source_name(Source,Source).
Daniel@0 216
Daniel@0 217 prolog:message(crawler_crawled(Src,URI,Graph)) -->
Daniel@0 218 ["Triples from ~w about ~w loaded into ~w via crawl method"-[Src,URI,Graph]].
Daniel@0 219 prolog:message(crawler_described(Src,URI,Graph,N)) -->
Daniel@0 220 ["Triples from ~w about ~w loaded into ~w via describe method: ~d."-[Src,URI,Graph,N]].
Daniel@0 221 prolog:message(crawler_failed(Src,URI)) -->
Daniel@0 222 ["Failed to load anything from ~w about ~w."-[Src,URI]].
Daniel@0 223