Mercurial > hg > dml-open-cliopatria
comparison cpack/dml/lib/crawler.pl @ 0:718306e29690 tip
commiting public release
author | Daniel Wolff |
---|---|
date | Tue, 09 Feb 2016 21:05:06 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:718306e29690 |
---|---|
1 /* Part of DML (Digital Music Laboratory) | |
2 Copyright 2014-2015 Samer Abdallah, University of London | |
3 | |
4 This program is free software; you can redistribute it and/or | |
5 modify it under the terms of the GNU General Public License | |
6 as published by the Free Software Foundation; either version 2 | |
7 of the License, or (at your option) any later version. | |
8 | |
9 This program is distributed in the hope that it will be useful, | |
10 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
12 GNU General Public License for more details. | |
13 | |
14 You should have received a copy of the GNU General Public | |
15 License along with this library; if not, write to the Free Software | |
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
17 */ | |
18 | |
19 :- module(crawler, | |
20 [ crawl_loaded/2 | |
21 , crawl_ui//2 | |
22 , crawl/2 | |
23 , crawl/1 | |
24 , crawl_reload_graph/1 | |
25 ]). | |
26 | |
27 :- multifile authority/3. | |
28 :- multifile source/3. | |
29 | |
30 :- use_module(library(settings)). | |
31 :- use_module(library(http/http_dispatch)). | |
32 :- use_module(library(http/http_parameters)). | |
33 :- use_module(library(http/http_wrapper)). | |
34 :- use_module(library(http/html_write)). | |
35 :- use_module(library(semweb/rdf_db)). | |
36 :- use_module(library(prov_schema)). | |
37 :- use_module(library(httpfiles)). | |
38 :- use_module(library(htmlutils), [element//2]). | |
39 :- use_module(library(dcg_core)). | |
40 :- use_module(library(insist)). | |
41 :- use_module(components(basics)). | |
42 :- use_module(components(messages)). | |
43 :- use_module(user(user_db)). | |
44 | |
45 :- setting(enable_auto_crawl, boolean, true, "Enables automatic semantic web searches for recognised URIs"). | |
46 | |
47 :- http_handler(root(crawl), crawl_handler, []). | |
48 | |
49 no_cache --> | |
50 html_post(head,meta(['http-equiv'='Cache-Control', content='no-cache, no-store, must-revalidate, max-age=0'],[])), | |
51 html_post(head,meta(['http-equiv'='Pragma', content='no-cache'],[])), | |
52 html_post(head,meta(['http-equiv'='Expires', content=0],[])). | |
53 | |
54 refresh(URL,Delay) --> | |
55 no_cache, | |
56 html_post(head,meta(['http-equiv'=refresh, content="~d;~w"-[Delay,URL]],[])). | |
57 | |
58 %% crawl_handler(+Request) is det. | |
59 % web service to query given authority about given URI | |
60 crawl_handler(Request) :- | |
61 authorized(write(default, load(lod))), | |
62 http_parameters(Request, | |
63 [ uri(URI, [optional(false), description("URI to search for")]) | |
64 , src(Src, [optional(false), description("Source ID")]) | |
65 , return_to(Return, [ optional(true), description('URI to return to') ]) | |
66 , return_after(Delay, [ default(2) ]) | |
67 , messages(Msgs, [boolean, default(true)]) | |
68 ]), | |
69 debug(crawler,"Got request to consult ~w on ~w",[Src,URI]), | |
70 return_options(Return, Delay, Options), | |
71 ( Msgs=true | |
72 -> call_showing_messages(insist(crawl(URI,Src)),Options) | |
73 ; call_without_messages(insist(crawl(URI,Src)),Options) | |
74 ). | |
75 | |
76 call_without_messages(Goal,Options) :- | |
77 catch( (Goal, Msg='Success'), Ex, (Msg='Error', print_message(error,Ex))), | |
78 ( option(return_to(Return),Options) | |
79 -> option(return_after(Delay),Options,2), | |
80 reply_html_page(cliopatria(default),[], [ h2(Msg), \refresh(Return,Delay) ], [unstable]) | |
81 ; reply_html_page(cliopatria(default),[], [ h2(Msg) ], [unstable]) | |
82 ). | |
83 | |
84 return_options(Return, _, []) :- var(Return), !. | |
85 return_options(Return, Delay, [ return_to(Return), return_after(Delay) ]). | |
86 | |
87 | |
88 %% crawl_ui(+URI:resource,+NT:natural) is det. | |
89 % Component to allow user to trigger a crawl on a given URI, depending on | |
90 % NT, the number of triples which currently have it has subject. | |
91 crawl_ui(URI,NT) --> | |
92 ( {setof(S-Auto, uri_authority(URI,S,Auto),Sources)} | |
93 -> ( {setof(S-A, (member(S-A,Sources), \+crawl_loaded(URI,S)), Untapped)} | |
94 % !!! should check permissions here | |
95 -> { http_current_request(Request), | |
96 memberchk(request_uri(Here), Request) | |
97 }, | |
98 {debug(crawler,'Untapped sources: ~q',[Untapped])}, | |
99 no_cache, | |
100 ( {setting(enable_auto_crawl,true)}, | |
101 {setof(S, member(S-true,Untapped), AutoSources)} | |
102 -> html([ 'The following sources have been consulted automatically: ', | |
103 \seqmap_with_sep(html(', '),element(code),AutoSources), '.', br([]) | |
104 ]), | |
105 { debug(crawler,'consulting in parallel: ~w...',[AutoSources]), | |
106 concurrent_maplist(consult_source(URI),AutoSources,Statuses), | |
107 debug(crawler,'finished consulting on ~w.',[URI]) | |
108 }, | |
109 ( {member(ok,Statuses)} | |
110 -> refresh(Here,0), | |
111 html([b('At least one consultation succeeded; refreshing automatically.'),br([])]) | |
112 ; html([b('All consultations failed.'),br([])]) | |
113 ) | |
114 ; html(p([ 'Click to consult one of the following sources for more information.' | |
115 , br([]), \seqmap(consult_form(URI,Here),Untapped) | |
116 ])) | |
117 ) | |
118 ; [] | |
119 ) | |
120 ; {NT>0} -> [] | |
121 ; html(p('No triples and no authorities known for this URI.')) | |
122 ). | |
123 | |
124 consult_source(URI,Source,Status) :- | |
125 catch( (crawl(URI,Source), Status=ok), Ex, | |
126 (print_message(error,Ex), Status=error(Ex))). | |
127 | |
128 consult_form(URI,Here,Source-_) --> | |
129 {http_link_to_id(crawl_handler, [], FetchURL)}, | |
130 {source_name(Source,Name)}, | |
131 {source(Source,_,Opts), option(messages(Msgs),Opts,true)}, | |
132 html(form([style="display:inline-block", action(FetchURL)], | |
133 [ \hidden(uri, URI), | |
134 \hidden(src, Source), | |
135 \hidden(return_to, Here), | |
136 \hidden(messages, Msgs), | |
137 input([ type(submit), value(Name) ]) | |
138 ])). | |
139 | |
140 %% crawl_loaded(+URI:resource,+Source:atom) is semidet. | |
141 % True when URI has already been crawled and added to the RDF database. | |
142 crawl_loaded(URI,Source) :- | |
143 source_uri_graph(Source,URI,Graph), | |
144 debug(crawler,'Checking if loaded: ~w in graph ~w from ~w',[URI,Graph,Source]), | |
145 rdf(Graph,prov:wasDerivedFrom,URI,Graph). | |
146 | |
147 | |
148 %% crawl_reload_graph(+Graph:atom) is det. | |
149 % Attempts to delete the named graph and reload it from all the URIs | |
150 % that it was derived from. | |
151 crawl_reload_graph(Graph) :- | |
152 findall(URI,rdf(Graph,prov:wasDerivedFrom,URI,Graph),URIs), | |
153 length(URIs,NURIs), | |
154 print_message(information,crawl_reload(Graph,NURIs)), | |
155 rdf_transaction(( | |
156 rdf_unload_graph(Graph), | |
157 maplist(reload_into(Graph),URIs) | |
158 )). | |
159 | |
160 reload_into(Graph,URI) :- | |
161 uri_authority(URI,Source,_), | |
162 source_uri_graph(Source,URI,Graph), | |
163 load_into(Graph,Source,URI). | |
164 | |
165 %% crawl(+URI:resource) is det. | |
166 % Looks for information about URI on all authorities claiming authority on it. | |
167 % Queries are made in parallel using concurrent_maplist/2. | |
168 crawl(URI) :- | |
169 findall(S,uri_authority(URI,S,_),Sources), | |
170 concurrent_maplist(consult_source(URI),Sources,_). | |
171 | |
172 | |
173 %% crawl(+URI:resource,+Auth:atom) is det. | |
174 % Looks for information about URI on specified authority. | |
175 crawl(URI,Source) :- | |
176 debug(crawler,"Consulting source ~w on ~w...",[Source,URI]), | |
177 source_uri_graph(Source, URI, Graph), | |
178 rdf_transaction(load_into(Graph,Source,URI)). | |
179 | |
180 load_into(Graph,Source,URI) :- | |
181 source(Source, Handler, _), | |
182 ( call(Handler,crawl(URI,Graph)) | |
183 -> print_message(information,crawler_crawled(Source,URI,Graph)) | |
184 ; setof(T, call(Handler,describe(URI,T)), Triples), | |
185 length(Triples, NumTriples), | |
186 forall(member(rdf(S,P,O),Triples), rdf_assert(S,P,O,Graph)) | |
187 -> print_message(information,crawler_described(Source,URI,Graph,NumTriples)) | |
188 ; print_message(information,crawler_failed(Source,URI)) | |
189 ), | |
190 debug(crawler,'Asserting successful load of ~w into graph ~w from ~w',[URI,Graph,Source]), | |
191 rdf_assert(Graph, prov:wasDerivedFrom, URI, Graph). | |
192 | |
193 %% uri_authority(+URI:resource, -Auth:atom, -Auto:boolean) is nondet. | |
194 % True when Auth is declared as an authority on URI. If Auto=true, | |
195 % then, this authority should be queried automatically when URI is the | |
196 % subject of a 'list_resource' page. | |
197 uri_authority(URI,Src,Auto) :- | |
198 authority(Cond,Src,Opts), | |
199 once(matches(Cond,URI)), | |
200 option(auto(Auto),Opts,false). | |
201 | |
202 matches(begins(Prefix),URI) :- sub_atom(URI,0,_,_,Prefix). | |
203 matches(\+Cond,URI) :- \+matches(Cond,URI). | |
204 matches((C1,C2),URI) :- matches(C1,URI), matches(C2,URI). | |
205 matches((C1;C2),URI) :- matches(C1,URI); matches(C2,URI). | |
206 | |
207 | |
208 %% source_uri_graph(+Auth:atom,+URI:resource,-Graph:atom) is det. | |
209 % Derive the RDF graph name for a given authority. | |
210 source_uri_graph(Source,URI,Graph) :- | |
211 source(Source,Handler,_), | |
212 call(Handler,uri_graph(URI,Graph)). | |
213 | |
214 source_name(Source,Name) :- source(Source,Handler,_), call(Handler,name(Name)), !. | |
215 source_name(Source,Source). | |
216 | |
217 prolog:message(crawler_crawled(Src,URI,Graph)) --> | |
218 ["Triples from ~w about ~w loaded into ~w via crawl method"-[Src,URI,Graph]]. | |
219 prolog:message(crawler_described(Src,URI,Graph,N)) --> | |
220 ["Triples from ~w about ~w loaded into ~w via describe method: ~d."-[Src,URI,Graph,N]]. | |
221 prolog:message(crawler_failed(Src,URI)) --> | |
222 ["Failed to load anything from ~w about ~w."-[Src,URI]]. | |
223 |