comparison cpack/dml/lib/crawler.pl @ 0:718306e29690 tip

committing public release
author Daniel Wolff
date Tue, 09 Feb 2016 21:05:06 +0100
/* Part of DML (Digital Music Laboratory)
   Copyright 2014-2015 Samer Abdallah, University of London

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License
   as published by the Free Software Foundation; either version 2
   of the License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public
   License along with this library; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/

:- module(crawler,
    [ crawl_loaded/2
    , crawl_ui//2
    , crawl/2
    , crawl/1
    , crawl_reload_graph/1
    ]).

:- multifile authority/3.
:- multifile source/3.
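
% authority/3 and source/3 are hook predicates supplied by crawler plugins
% elsewhere in the package. Based on how they are used below, a hypothetical
% plugin for an invented source 'examplepedia' might add clauses like these
% (the source ID, URI prefix and handler name are assumptions for illustration):
%
%   ==
%   crawler:authority(begins('http://examplepedia.org/resource/'),
%                     examplepedia, [auto(true)]).
%   crawler:source(examplepedia, examplepedia_handler, [messages(false)]).
%   ==
%
% where examplepedia_handler/1 implements the requests made via call/2 below:
% crawl(URI,Graph) or describe(URI,Triple), plus uri_graph(URI,Graph) and
% name(Name); see the sketch near the end of this file.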

:- use_module(library(settings)).
:- use_module(library(http/http_dispatch)).
:- use_module(library(http/http_parameters)).
:- use_module(library(http/http_wrapper)).
:- use_module(library(http/html_write)).
:- use_module(library(semweb/rdf_db)).
:- use_module(library(prov_schema)).
:- use_module(library(httpfiles)).
:- use_module(library(htmlutils), [element//2]).
:- use_module(library(dcg_core)).
:- use_module(library(insist)).
:- use_module(components(basics)).
:- use_module(components(messages)).
:- use_module(user(user_db)).

:- setting(enable_auto_crawl, boolean, true, "Enables automatic semantic web searches for recognised URIs").

:- http_handler(root(crawl), crawl_handler, []).

no_cache -->
    html_post(head,meta(['http-equiv'='Cache-Control', content='no-cache, no-store, must-revalidate, max-age=0'],[])),
    html_post(head,meta(['http-equiv'='Pragma', content='no-cache'],[])),
    html_post(head,meta(['http-equiv'='Expires', content=0],[])).

refresh(URL,Delay) -->
    no_cache,
    html_post(head,meta(['http-equiv'=refresh, content="~d;~w"-[Delay,URL]],[])).

%% crawl_handler(+Request) is det.
% Web service to query a given source (authority) about a given URI.
crawl_handler(Request) :-
    authorized(write(default, load(lod))),
    http_parameters(Request,
        [ uri(URI, [optional(false), description("URI to search for")])
        , src(Src, [optional(false), description("Source ID")])
        , return_to(Return, [ optional(true), description('URI to return to') ])
        , return_after(Delay, [ default(2) ])
        , messages(Msgs, [boolean, default(true)])
        ]),
    debug(crawler,"Got request to consult ~w on ~w",[Src,URI]),
    return_options(Return, Delay, Options),
    (   Msgs=true
    ->  call_showing_messages(insist(crawl(URI,Src)),Options)
    ;   call_without_messages(insist(crawl(URI,Src)),Options)
    ).
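
% For illustration only, a request to this handler (mounted at root(crawl),
% i.e. /crawl under the default prefix) might look like the following; the
% URI and source ID are invented:
%
%   ==
%   /crawl?uri=http://examplepedia.org/resource/Thing&src=examplepedia&messages=false
%   ==
%
% With messages=false the reply is a minimal page that refreshes back to
% return_to (if given) after return_after seconds (default 2).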

call_without_messages(Goal,Options) :-
    catch( (Goal, Msg='Success'), Ex, (Msg='Error', print_message(error,Ex))),
    (   option(return_to(Return),Options)
    ->  option(return_after(Delay),Options,2),
        reply_html_page(cliopatria(default),[], [ h2(Msg), \refresh(Return,Delay) ], [unstable])
    ;   reply_html_page(cliopatria(default),[], [ h2(Msg) ], [unstable])
    ).

return_options(Return, _, []) :- var(Return), !.
return_options(Return, Delay, [ return_to(Return), return_after(Delay) ]).


%% crawl_ui(+URI:resource, +NT:natural)// is det.
% Component to allow the user to trigger a crawl on a given URI, depending on
% NT, the number of triples which currently have it as subject.
crawl_ui(URI,NT) -->
    (   {setof(S-Auto, uri_authority(URI,S,Auto), Sources)}
    ->  (   {setof(S-A, (member(S-A,Sources), \+crawl_loaded(URI,S)), Untapped)}
            % !!! should check permissions here
        ->  { http_current_request(Request),
              memberchk(request_uri(Here), Request)
            },
            {debug(crawler,'Untapped sources: ~q',[Untapped])},
            no_cache,
            (   {setting(enable_auto_crawl,true)},
                {setof(S, member(S-true,Untapped), AutoSources)}
            ->  html([ 'The following sources have been consulted automatically: ',
                       \seqmap_with_sep(html(', '),element(code),AutoSources), '.', br([])
                     ]),
                { debug(crawler,'consulting in parallel: ~w...',[AutoSources]),
                  concurrent_maplist(consult_source(URI),AutoSources,Statuses),
                  debug(crawler,'finished consulting on ~w.',[URI])
                },
                (   {member(ok,Statuses)}
                ->  refresh(Here,0),
                    html([b('At least one consultation succeeded; refreshing automatically.'),br([])])
                ;   html([b('All consultations failed.'),br([])])
                )
            ;   html(p([ 'Click to consult one of the following sources for more information.'
                       , br([]), \seqmap(consult_form(URI,Here),Untapped)
                       ]))
            )
        ;   []
        )
    ;   {NT>0} -> []
    ;   html(p('No triples and no authorities known for this URI.'))
    ).

consult_source(URI,Source,Status) :-
    catch( (crawl(URI,Source), Status=ok), Ex,
           (print_message(error,Ex), Status=error(Ex))).

consult_form(URI,Here,Source-_) -->
    {http_link_to_id(crawl_handler, [], FetchURL)},
    {source_name(Source,Name)},
    {source(Source,_,Opts), option(messages(Msgs),Opts,true)},
    html(form([style="display:inline-block", action(FetchURL)],
              [ \hidden(uri, URI),
                \hidden(src, Source),
                \hidden(return_to, Here),
                \hidden(messages, Msgs),
                input([ type(submit), value(Name) ])
              ])).

%% crawl_loaded(+URI:resource,+Source:atom) is semidet.
% True when URI has already been crawled and added to the RDF database.
crawl_loaded(URI,Source) :-
    source_uri_graph(Source,URI,Graph),
    debug(crawler,'Checking if loaded: ~w in graph ~w from ~w',[URI,Graph,Source]),
    rdf(Graph,prov:wasDerivedFrom,URI,Graph).
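
% A successful load is recorded by load_into/3 as a provenance triple stored
% in the target graph itself, e.g. (URI and graph name invented):
%
%   ==
%   rdf('http://dml.example.org/graph/examplepedia',
%       prov:wasDerivedFrom,
%       'http://examplepedia.org/resource/Thing',
%       'http://dml.example.org/graph/examplepedia')
%   ==
%
% crawl_loaded/2 simply checks for that marker.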


%% crawl_reload_graph(+Graph:atom) is det.
% Attempts to delete the named graph and reload it from all the URIs
% that it was derived from.
crawl_reload_graph(Graph) :-
    findall(URI,rdf(Graph,prov:wasDerivedFrom,URI,Graph),URIs),
    length(URIs,NURIs),
    print_message(information,crawl_reload(Graph,NURIs)),
    rdf_transaction((
        rdf_unload_graph(Graph),
        maplist(reload_into(Graph),URIs)
    )).

reload_into(Graph,URI) :-
    uri_authority(URI,Source,_),
    source_uri_graph(Source,URI,Graph),
    load_into(Graph,Source,URI).

%% crawl(+URI:resource) is det.
% Looks for information about URI from all sources that claim authority over it.
% Queries are made in parallel using concurrent_maplist/3.
crawl(URI) :-
    findall(S,uri_authority(URI,S,_),Sources),
    concurrent_maplist(consult_source(URI),Sources,_).


%% crawl(+URI:resource,+Source:atom) is det.
% Looks for information about URI from the specified source.
crawl(URI,Source) :-
    debug(crawler,"Consulting source ~w on ~w...",[Source,URI]),
    source_uri_graph(Source, URI, Graph),
    rdf_transaction(load_into(Graph,Source,URI)).
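
% Example calls at the Prolog top level (URI and source ID invented):
%
%   ==
%   ?- crawl('http://examplepedia.org/resource/Thing', examplepedia).
%   ?- crawl('http://examplepedia.org/resource/Thing').  % query all matching sources
%   ==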

load_into(Graph,Source,URI) :-
    source(Source, Handler, _),
    (   call(Handler,crawl(URI,Graph))
    ->  print_message(information,crawler_crawled(Source,URI,Graph))
    ;   setof(T, call(Handler,describe(URI,T)), Triples),
        length(Triples, NumTriples),
        forall(member(rdf(S,P,O),Triples), rdf_assert(S,P,O,Graph))
    ->  print_message(information,crawler_described(Source,URI,Graph,NumTriples))
    ;   print_message(information,crawler_failed(Source,URI))
    ),
    debug(crawler,'Asserting successful load of ~w into graph ~w from ~w',[URI,Graph,Source]),
    rdf_assert(Graph, prov:wasDerivedFrom, URI, Graph).

%% uri_authority(+URI:resource, -Src:atom, -Auto:boolean) is nondet.
% True when Src is declared as an authority on URI. If Auto=true,
% then this authority should be queried automatically when URI is the
% subject of a 'list_resource' page.
uri_authority(URI,Src,Auto) :-
    authority(Cond,Src,Opts),
    once(matches(Cond,URI)),
    option(auto(Auto),Opts,false).

matches(begins(Prefix),URI) :- sub_atom(URI,0,_,_,Prefix).
matches(\+Cond,URI) :- \+matches(Cond,URI).
matches((C1,C2),URI) :- matches(C1,URI), matches(C2,URI).
matches((C1;C2),URI) :- matches(C1,URI); matches(C2,URI).
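
% For example, a plugin could declare an authority/3 condition such as
% (prefixes invented):
%
%   ==
%   ( begins('http://examplepedia.org/resource/')
%   , \+begins('http://examplepedia.org/resource/Category:')
%   )
%   ==
%
% which matches resource URIs but excludes category pages.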


%% source_uri_graph(+Source:atom,+URI:resource,-Graph:atom) is det.
% Derive the RDF graph name for a given source and URI.
source_uri_graph(Source,URI,Graph) :-
    source(Source,Handler,_),
    call(Handler,uri_graph(URI,Graph)).

source_name(Source,Name) :- source(Source,Handler,_), call(Handler,name(Name)), !.
source_name(Source,Source).
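
% A source handler is any goal named by source/3 that accepts the requests
% used above: crawl(URI,Graph) or describe(URI,Triple), plus uri_graph(URI,Graph)
% and name(Name). A minimal sketch, assuming a hypothetical handler predicate
% and graph name that are not part of this module:
%
%   ==
%   examplepedia_handler(name('Examplepedia')).
%   examplepedia_handler(uri_graph(_URI, 'http://dml.example.org/graph/examplepedia')).
%   examplepedia_handler(crawl(URI, Graph)) :-
%       rdf_load(URI, [graph(Graph), silent(true)]).  % dereference the URI into Graph
%   ==
%
% A handler need only support one of the crawl or describe requests;
% load_into/3 tries crawl first and falls back to describe.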

% Message for crawl_reload_graph/1, which would otherwise be untranslatable.
prolog:message(crawl_reload(Graph,N)) -->
    ["Reloading graph ~w from ~d source URIs."-[Graph,N]].
prolog:message(crawler_crawled(Src,URI,Graph)) -->
    ["Triples from ~w about ~w loaded into ~w via crawl method."-[Src,URI,Graph]].
prolog:message(crawler_described(Src,URI,Graph,N)) -->
    ["Triples from ~w about ~w loaded into ~w via describe method: ~d."-[Src,URI,Graph,N]].
prolog:message(crawler_failed(Src,URI)) -->
    ["Failed to load anything from ~w about ~w."-[Src,URI]].