Daniel@0
|
1 /* Part of DML (Digital Music Laboratory)
|
Daniel@0
|
2 Copyright 2014-2015 Samer Abdallah, University of London
|
Daniel@0
|
3
|
Daniel@0
|
4 This program is free software; you can redistribute it and/or
|
Daniel@0
|
5 modify it under the terms of the GNU General Public License
|
Daniel@0
|
6 as published by the Free Software Foundation; either version 2
|
Daniel@0
|
7 of the License, or (at your option) any later version.
|
Daniel@0
|
8
|
Daniel@0
|
9 This program is distributed in the hope that it will be useful,
|
Daniel@0
|
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
|
Daniel@0
|
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
Daniel@0
|
12 GNU General Public License for more details.
|
Daniel@0
|
13
|
Daniel@0
|
14 You should have received a copy of the GNU General Public
|
Daniel@0
|
15 License along with this library; if not, write to the Free Software
|
Daniel@0
|
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
Daniel@0
|
17 */
|
Daniel@0
|
18
|
Daniel@0
|
19 :- module(crawler,
|
Daniel@0
|
20 [ crawl_loaded/2
|
Daniel@0
|
21 , crawl_ui//2
|
Daniel@0
|
22 , crawl/2
|
Daniel@0
|
23 , crawl/1
|
Daniel@0
|
24 , crawl_reload_graph/1
|
Daniel@0
|
25 ]).
|
Daniel@0
|
26
|
Daniel@0
|
27 :- multifile authority/3.
|
Daniel@0
|
28 :- multifile source/3.
|
Daniel@0
|
29
|
Daniel@0
|
30 :- use_module(library(settings)).
|
Daniel@0
|
31 :- use_module(library(http/http_dispatch)).
|
Daniel@0
|
32 :- use_module(library(http/http_parameters)).
|
Daniel@0
|
33 :- use_module(library(http/http_wrapper)).
|
Daniel@0
|
34 :- use_module(library(http/html_write)).
|
Daniel@0
|
35 :- use_module(library(semweb/rdf_db)).
|
Daniel@0
|
36 :- use_module(library(prov_schema)).
|
Daniel@0
|
37 :- use_module(library(httpfiles)).
|
Daniel@0
|
38 :- use_module(library(htmlutils), [element//2]).
|
Daniel@0
|
39 :- use_module(library(dcg_core)).
|
Daniel@0
|
40 :- use_module(library(insist)).
|
Daniel@0
|
41 :- use_module(components(basics)).
|
Daniel@0
|
42 :- use_module(components(messages)).
|
Daniel@0
|
43 :- use_module(user(user_db)).
|
Daniel@0
|
44
|
Daniel@0
|
45 :- setting(enable_auto_crawl, boolean, true, "Enables automatic semantic web searches for recognised URIs").
|
Daniel@0
|
46
|
Daniel@0
|
47 :- http_handler(root(crawl), crawl_handler, []).
|
Daniel@0
|
48
|
Daniel@0
|
49 no_cache -->
|
Daniel@0
|
50 html_post(head,meta(['http-equiv'='Cache-Control', content='no-cache, no-store, must-revalidate, max-age=0'],[])),
|
Daniel@0
|
51 html_post(head,meta(['http-equiv'='Pragma', content='no-cache'],[])),
|
Daniel@0
|
52 html_post(head,meta(['http-equiv'='Expires', content=0],[])).
|
Daniel@0
|
53
|
Daniel@0
|
54 refresh(URL,Delay) -->
|
Daniel@0
|
55 no_cache,
|
Daniel@0
|
56 html_post(head,meta(['http-equiv'=refresh, content="~d;~w"-[Delay,URL]],[])).
|
Daniel@0
|
57
|
Daniel@0
|
58 %% crawl_handler(+Request) is det.
|
Daniel@0
|
59 % web service to query given authority about given URI
|
Daniel@0
|
60 crawl_handler(Request) :-
|
Daniel@0
|
61 authorized(write(default, load(lod))),
|
Daniel@0
|
62 http_parameters(Request,
|
Daniel@0
|
63 [ uri(URI, [optional(false), description("URI to search for")])
|
Daniel@0
|
64 , src(Src, [optional(false), description("Source ID")])
|
Daniel@0
|
65 , return_to(Return, [ optional(true), description('URI to return to') ])
|
Daniel@0
|
66 , return_after(Delay, [ default(2) ])
|
Daniel@0
|
67 , messages(Msgs, [boolean, default(true)])
|
Daniel@0
|
68 ]),
|
Daniel@0
|
69 debug(crawler,"Got request to consult ~w on ~w",[Src,URI]),
|
Daniel@0
|
70 return_options(Return, Delay, Options),
|
Daniel@0
|
71 ( Msgs=true
|
Daniel@0
|
72 -> call_showing_messages(insist(crawl(URI,Src)),Options)
|
Daniel@0
|
73 ; call_without_messages(insist(crawl(URI,Src)),Options)
|
Daniel@0
|
74 ).
|
Daniel@0
|
75
|
Daniel@0
|
76 call_without_messages(Goal,Options) :-
|
Daniel@0
|
77 catch( (Goal, Msg='Success'), Ex, (Msg='Error', print_message(error,Ex))),
|
Daniel@0
|
78 ( option(return_to(Return),Options)
|
Daniel@0
|
79 -> option(return_after(Delay),Options,2),
|
Daniel@0
|
80 reply_html_page(cliopatria(default),[], [ h2(Msg), \refresh(Return,Delay) ], [unstable])
|
Daniel@0
|
81 ; reply_html_page(cliopatria(default),[], [ h2(Msg) ], [unstable])
|
Daniel@0
|
82 ).
|
Daniel@0
|
83
|
Daniel@0
|
84 return_options(Return, _, []) :- var(Return), !.
|
Daniel@0
|
85 return_options(Return, Delay, [ return_to(Return), return_after(Delay) ]).
|
Daniel@0
|
86
|
Daniel@0
|
87
|
Daniel@0
|
88 %% crawl_ui(+URI:resource,+NT:natural) is det.
|
Daniel@0
|
89 % Component to allow user to trigger a crawl on a given URI, depending on
|
Daniel@0
|
90 % NT, the number of triples which currently have it has subject.
|
Daniel@0
|
91 crawl_ui(URI,NT) -->
|
Daniel@0
|
92 ( {setof(S-Auto, uri_authority(URI,S,Auto),Sources)}
|
Daniel@0
|
93 -> ( {setof(S-A, (member(S-A,Sources), \+crawl_loaded(URI,S)), Untapped)}
|
Daniel@0
|
94 % !!! should check permissions here
|
Daniel@0
|
95 -> { http_current_request(Request),
|
Daniel@0
|
96 memberchk(request_uri(Here), Request)
|
Daniel@0
|
97 },
|
Daniel@0
|
98 {debug(crawler,'Untapped sources: ~q',[Untapped])},
|
Daniel@0
|
99 no_cache,
|
Daniel@0
|
100 ( {setting(enable_auto_crawl,true)},
|
Daniel@0
|
101 {setof(S, member(S-true,Untapped), AutoSources)}
|
Daniel@0
|
102 -> html([ 'The following sources have been consulted automatically: ',
|
Daniel@0
|
103 \seqmap_with_sep(html(', '),element(code),AutoSources), '.', br([])
|
Daniel@0
|
104 ]),
|
Daniel@0
|
105 { debug(crawler,'consulting in parallel: ~w...',[AutoSources]),
|
Daniel@0
|
106 concurrent_maplist(consult_source(URI),AutoSources,Statuses),
|
Daniel@0
|
107 debug(crawler,'finished consulting on ~w.',[URI])
|
Daniel@0
|
108 },
|
Daniel@0
|
109 ( {member(ok,Statuses)}
|
Daniel@0
|
110 -> refresh(Here,0),
|
Daniel@0
|
111 html([b('At least one consultation succeeded; refreshing automatically.'),br([])])
|
Daniel@0
|
112 ; html([b('All consultations failed.'),br([])])
|
Daniel@0
|
113 )
|
Daniel@0
|
114 ; html(p([ 'Click to consult one of the following sources for more information.'
|
Daniel@0
|
115 , br([]), \seqmap(consult_form(URI,Here),Untapped)
|
Daniel@0
|
116 ]))
|
Daniel@0
|
117 )
|
Daniel@0
|
118 ; []
|
Daniel@0
|
119 )
|
Daniel@0
|
120 ; {NT>0} -> []
|
Daniel@0
|
121 ; html(p('No triples and no authorities known for this URI.'))
|
Daniel@0
|
122 ).
|
Daniel@0
|
123
|
Daniel@0
|
124 consult_source(URI,Source,Status) :-
|
Daniel@0
|
125 catch( (crawl(URI,Source), Status=ok), Ex,
|
Daniel@0
|
126 (print_message(error,Ex), Status=error(Ex))).
|
Daniel@0
|
127
|
Daniel@0
|
128 consult_form(URI,Here,Source-_) -->
|
Daniel@0
|
129 {http_link_to_id(crawl_handler, [], FetchURL)},
|
Daniel@0
|
130 {source_name(Source,Name)},
|
Daniel@0
|
131 {source(Source,_,Opts), option(messages(Msgs),Opts,true)},
|
Daniel@0
|
132 html(form([style="display:inline-block", action(FetchURL)],
|
Daniel@0
|
133 [ \hidden(uri, URI),
|
Daniel@0
|
134 \hidden(src, Source),
|
Daniel@0
|
135 \hidden(return_to, Here),
|
Daniel@0
|
136 \hidden(messages, Msgs),
|
Daniel@0
|
137 input([ type(submit), value(Name) ])
|
Daniel@0
|
138 ])).
|
Daniel@0
|
139
|
Daniel@0
|
140 %% crawl_loaded(+URI:resource,+Source:atom) is semidet.
|
Daniel@0
|
141 % True when URI has already been crawled and added to the RDF database.
|
Daniel@0
|
142 crawl_loaded(URI,Source) :-
|
Daniel@0
|
143 source_uri_graph(Source,URI,Graph),
|
Daniel@0
|
144 debug(crawler,'Checking if loaded: ~w in graph ~w from ~w',[URI,Graph,Source]),
|
Daniel@0
|
145 rdf(Graph,prov:wasDerivedFrom,URI,Graph).
|
Daniel@0
|
146
|
Daniel@0
|
147
|
Daniel@0
|
148 %% crawl_reload_graph(+Graph:atom) is det.
|
Daniel@0
|
149 % Attempts to delete the named graph and reload it from all the URIs
|
Daniel@0
|
150 % that it was derived from.
|
Daniel@0
|
151 crawl_reload_graph(Graph) :-
|
Daniel@0
|
152 findall(URI,rdf(Graph,prov:wasDerivedFrom,URI,Graph),URIs),
|
Daniel@0
|
153 length(URIs,NURIs),
|
Daniel@0
|
154 print_message(information,crawl_reload(Graph,NURIs)),
|
Daniel@0
|
155 rdf_transaction((
|
Daniel@0
|
156 rdf_unload_graph(Graph),
|
Daniel@0
|
157 maplist(reload_into(Graph),URIs)
|
Daniel@0
|
158 )).
|
Daniel@0
|
159
|
Daniel@0
|
160 reload_into(Graph,URI) :-
|
Daniel@0
|
161 uri_authority(URI,Source,_),
|
Daniel@0
|
162 source_uri_graph(Source,URI,Graph),
|
Daniel@0
|
163 load_into(Graph,Source,URI).
|
Daniel@0
|
164
|
Daniel@0
|
165 %% crawl(+URI:resource) is det.
|
Daniel@0
|
166 % Looks for information about URI on all authorities claiming authority on it.
|
Daniel@0
|
167 % Queries are made in parallel using concurrent_maplist/2.
|
Daniel@0
|
168 crawl(URI) :-
|
Daniel@0
|
169 findall(S,uri_authority(URI,S,_),Sources),
|
Daniel@0
|
170 concurrent_maplist(consult_source(URI),Sources,_).
|
Daniel@0
|
171
|
Daniel@0
|
172
|
Daniel@0
|
173 %% crawl(+URI:resource,+Auth:atom) is det.
|
Daniel@0
|
174 % Looks for information about URI on specified authority.
|
Daniel@0
|
175 crawl(URI,Source) :-
|
Daniel@0
|
176 debug(crawler,"Consulting source ~w on ~w...",[Source,URI]),
|
Daniel@0
|
177 source_uri_graph(Source, URI, Graph),
|
Daniel@0
|
178 rdf_transaction(load_into(Graph,Source,URI)).
|
Daniel@0
|
179
|
Daniel@0
|
180 load_into(Graph,Source,URI) :-
|
Daniel@0
|
181 source(Source, Handler, _),
|
Daniel@0
|
182 ( call(Handler,crawl(URI,Graph))
|
Daniel@0
|
183 -> print_message(information,crawler_crawled(Source,URI,Graph))
|
Daniel@0
|
184 ; setof(T, call(Handler,describe(URI,T)), Triples),
|
Daniel@0
|
185 length(Triples, NumTriples),
|
Daniel@0
|
186 forall(member(rdf(S,P,O),Triples), rdf_assert(S,P,O,Graph))
|
Daniel@0
|
187 -> print_message(information,crawler_described(Source,URI,Graph,NumTriples))
|
Daniel@0
|
188 ; print_message(information,crawler_failed(Source,URI))
|
Daniel@0
|
189 ),
|
Daniel@0
|
190 debug(crawler,'Asserting successful load of ~w into graph ~w from ~w',[URI,Graph,Source]),
|
Daniel@0
|
191 rdf_assert(Graph, prov:wasDerivedFrom, URI, Graph).
|
Daniel@0
|
192
|
Daniel@0
|
193 %% uri_authority(+URI:resource, -Auth:atom, -Auto:boolean) is nondet.
|
Daniel@0
|
194 % True when Auth is declared as an authority on URI. If Auto=true,
|
Daniel@0
|
195 % then, this authority should be queried automatically when URI is the
|
Daniel@0
|
196 % subject of a 'list_resource' page.
|
Daniel@0
|
197 uri_authority(URI,Src,Auto) :-
|
Daniel@0
|
198 authority(Cond,Src,Opts),
|
Daniel@0
|
199 once(matches(Cond,URI)),
|
Daniel@0
|
200 option(auto(Auto),Opts,false).
|
Daniel@0
|
201
|
Daniel@0
|
202 matches(begins(Prefix),URI) :- sub_atom(URI,0,_,_,Prefix).
|
Daniel@0
|
203 matches(\+Cond,URI) :- \+matches(Cond,URI).
|
Daniel@0
|
204 matches((C1,C2),URI) :- matches(C1,URI), matches(C2,URI).
|
Daniel@0
|
205 matches((C1;C2),URI) :- matches(C1,URI); matches(C2,URI).
|
Daniel@0
|
206
|
Daniel@0
|
207
|
Daniel@0
|
208 %% source_uri_graph(+Auth:atom,+URI:resource,-Graph:atom) is det.
|
Daniel@0
|
209 % Derive the RDF graph name for a given authority.
|
Daniel@0
|
210 source_uri_graph(Source,URI,Graph) :-
|
Daniel@0
|
211 source(Source,Handler,_),
|
Daniel@0
|
212 call(Handler,uri_graph(URI,Graph)).
|
Daniel@0
|
213
|
Daniel@0
|
214 source_name(Source,Name) :- source(Source,Handler,_), call(Handler,name(Name)), !.
|
Daniel@0
|
215 source_name(Source,Source).
|
Daniel@0
|
216
|
Daniel@0
|
217 prolog:message(crawler_crawled(Src,URI,Graph)) -->
|
Daniel@0
|
218 ["Triples from ~w about ~w loaded into ~w via crawl method"-[Src,URI,Graph]].
|
Daniel@0
|
219 prolog:message(crawler_described(Src,URI,Graph,N)) -->
|
Daniel@0
|
220 ["Triples from ~w about ~w loaded into ~w via describe method: ~d."-[Src,URI,Graph,N]].
|
Daniel@0
|
221 prolog:message(crawler_failed(Src,URI)) -->
|
Daniel@0
|
222 ["Failed to load anything from ~w about ~w."-[Src,URI]].
|
Daniel@0
|
223
|