Mercurial > hg > dml-open-cliopatria
comparison cpack/dml/lib/bl_p2r.pl @ 0:718306e29690 tip
commiting public release
author | Daniel Wolff |
---|---|
date | Tue, 09 Feb 2016 21:05:06 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:718306e29690 |
---|---|
1 /* Part of DML (Digital Music Laboratory) | |
2 Copyright 2014-2015 Samer Abdallah, University of London | |
3 | |
4 This program is free software; you can redistribute it and/or | |
5 modify it under the terms of the GNU General Public License | |
6 as published by the Free Software Foundation; either version 2 | |
7 of the License, or (at your option) any later version. | |
8 | |
9 This program is distributed in the hope that it will be useful, | |
10 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
12 GNU General Public License for more details. | |
13 | |
14 You should have received a copy of the GNU General Public | |
15 License along with this library; if not, write to the Free Software | |
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
17 */ | |
18 | |
19 :- module(bl_p2r,[ audio_file/3, audio_link/2, scrape_audio_link/2, has_available_audio/1 ]). | |
20 | |
21 :- use_module(library(semweb/rdf_db)). | |
22 :- use_module(library(xmlarchive)). | |
23 :- use_module(library(xpath)). | |
24 :- use_module(library(settings)). | |
25 :- use_module(library(sandbox)). | |
26 :- use_module(library(fileutils)). | |
27 :- use_module(library(termutils)). | |
28 :- use_module(library(rdfutils)). | |
29 :- use_module(library(insist)). | |
30 :- use_module(library(memo)). | |
31 :- use_module(entailment(p2r)). | |
32 :- use_module(cliopatria(hooks)). | |
33 :- use_module(library(http/http_client)). | |
34 | |
35 :- set_prolog_flag(double_quotes,string). | |
36 | |
37 /* | |
38 METS to RDF conversion. | |
39 | |
40 Useful info: | |
41 0. Each top-level mets:mets element contains several sections. | |
42 1,4 x dmdSec .. 1 x mdWrap | |
43 1 x amdSec .. [ N x techMD, {0,1} x rightsMD, {0,1} x sourceMD ] | |
44 1 x fileSec .. 1-4 x fileGrp .. 1-36 x file | |
45 1 x structLink .. {1,3} x smLink | |
46 2-3 x structMap .. 1 x div | |
47 | |
48 1. Each dmdSec only ever contains one mdWrap element | |
49 5. each techMD or sourceMD contains exactly 1 mdWrap | |
50 6. each rightsMD contains an mdRef | |
51 7. The rightsMD is not interesting to look at. | |
52 8. dmdSec mdWraps can be empty | |
53 | |
54 List of all dmdSec tags in dataset | |
55 blapsi:id3album | |
56 blapsi:id3artist | |
57 blapsi:id3comment | |
58 blapsi:id3songtitle | |
59 blapsi:id3year | |
60 dc:description | |
61 dc:identifier | |
62 dc:language | |
63 dc:rights | |
64 dc:source | |
65 dc:subject | |
66 dc:title | |
67 dc:type | |
68 dcterms:created | |
69 dcterms:isPartOf | |
70 dcterms:spatial | |
71 dcterms:temporal | |
72 marcrel:CMP | |
73 marcrel:CND | |
74 marcrel:IVE | |
75 marcrel:IVR | |
76 marcrel:LYR | |
77 marcrel:PRF | |
78 marcrel:RCE | |
79 marcrel:SPK | |
80 mods:accessCondition | |
81 mods:identifier | |
82 mods:name | |
83 mods:recordInfo | |
84 mods:titleInfo | |
85 | |
86 amdSec tags | |
87 blapsi:audioObject | |
88 blapsi:file_bitrate | |
89 blapsi:file_channels | |
90 blapsi:file_duration | |
91 blapsi:file_sample | |
92 blapsi:resolution | |
93 | |
94 sourceMD .. blapsi:audioObject | |
95 blapsi:face | |
96 blapsi:format | |
97 blapsi:physicalProperties | |
98 blapsi:face | |
99 blapsi:format | |
100 blapsi:physicalProperties | |
101 blapsi:primaryIdentifier | |
102 blapsi:secondaryIdentifier | |
103 blapsi:primaryIdentifier | |
104 blapsi:secondaryIdentifier | |
105 | |
106 would prefer to have mo predicates to foaf:Person resources for these. | |
107 marcrel:'CMP',dml:composer). | |
108 marcrel:'CND',dml:conductor). | |
109 marcrel:'PRF',dml:performer). | |
110 | |
111 */ | |
112 | |
113 :- setting(audio_root,ground,nothing,'Root directory of audio file collection'). | |
114 :- setting(archive_pattern,atom,'~/lib/datasets/mets/BL_metadata_complete.7z','Pattern to match METS metadata files'). | |
115 | |
116 :- rdf_register_prefix(bldata,'http://sounds.bl.uk/resource/'). | |
117 :- rdf_register_prefix(marcrel,'http://id.loc.gov/vocabulary/relators/'). | |
118 :- rdf_register_prefix(blapsi,'http://sounds.bl.uk/blapsi#'). | |
119 :- rdf_register_prefix(blterms,'http://www.bl.uk/schemas/bibliographic/blterms#'). | |
120 | |
121 % https://code.google.com/p/libarchive/issues/detail?id=328&colspec=ID%20Type%20Status%20Priority%20Milestone%20OpSys%20Owner%20Summary | |
122 | |
123 :- public import/0. | |
124 import :- assert_all(bl_p2r). | |
125 | |
126 rdf(dml:blpage,rdfs:range,foaf:'Document') <== true. | |
127 rdf(dml:blpage,rdfs:subPropertyOf,foaf:page) <== true. | |
128 rdf(S,P,O) <== bl_archive_triple(T), once(expand_triple(T,rdf(S,P,O))). | |
129 | |
130 expand_triple(rdf(SS,PP,OO),rdf(S,P,O)) :- | |
131 uripattern:pattern_uri(SS,S), | |
132 uripattern:pattern_uri(PP,P), | |
133 ( OO=literal(_) -> rdf_global_object(OO,O) | |
134 ; uripattern:pattern_uri(OO,O) | |
135 ). | |
136 | |
137 %% import_directory(+Dir:atom, +Graph:atom) is det. | |
138 % | |
139 % Import contents of a given directory into a named RDF graph. | |
140 % The directory must be a subdirectory of the directory named in the | |
141 % bl_p2r:audio_root setting (whose value is either =|just(Dir)|= or | |
142 % =|nothing|=). | |
143 import_directory(Dir,Graph) :- | |
144 forall( directory_triple(Dir,T), (once(expand_triple(T,rdf(S,P,O))), rdf_assert(S,P,O,Graph))). | |
145 | |
146 directory_triple(Dir,T) :- | |
147 Ext=txt, | |
148 setting(audio_root,just(AudioRoot)), | |
149 atom_concat(AudioRoot,'/',Prefix), | |
150 find_files(under(AudioRoot/Dir),FullPath), | |
151 atom_concat(Prefix,Path,FullPath), | |
152 split_path(Path,Loc,Ext), | |
153 with_stream(Str,open(FullPath,read,Str), ext_loc_stream_triple(Ext,Loc,Str,T)). | |
154 | |
155 | |
156 %% bl_archive_triple(-T:rdf) is nondet. | |
157 % This predicate generates triples from the metadata archive files whose names | |
158 % match the pattern stored in the setting bl_p2r:archive_pattern. | |
159 bl_archive_triple(T) :- | |
160 setting(archive_pattern, ArchivePattern), | |
161 find_files(like(ArchivePattern), Archive), | |
162 with_archive_stream(Archive, Path, path_triple_stream(Path,T)). | |
163 | |
164 path_triple_stream(Path,T,S) :- | |
165 status('Importing ~s',[Path]), | |
166 catch(( insist(split_path(Path,Loc,Ext)), | |
167 ext_loc_stream_triple(Ext,Loc,S,T) | |
168 ), Ex, (nl,print_message(warning,Ex),fail)). | |
169 | |
170 split_path(Path,Dirs-Base,Ext) :- | |
171 % split_string(Path,'/','',Parts), | |
172 atomic_list_concat(Parts,'/',Path), | |
173 exclude(ignore_dir,Parts,Parts1), | |
174 append(Dirs,[Name],Parts1), | |
175 sub_atom(Name,_,3,0,Ext), % NB this assumes three character extension | |
176 sub_atom(Name,0,_,4,Base). | |
177 | |
178 %% ext_loc_stream_triple(+Ext:atom, +Loc:pair(list(atom), atom), +S:stream, -T:rdf) is nondet. | |
179 % | |
180 % This predicate succeeds once for each RDF triple that can be derived from | |
181 % an archive stream whose path in the archive Loc=Dirs-Base consist of the directories | |
182 % Dirs and whose name is Base,'.',Ext. It understands entries with extensions | |
183 % xml and txt. XML streams are decoded as full METS documents. TXT streams are decoded | |
184 % as the DMD section of a METS document. Other extensions generate a warning and then fail. | |
185 | |
186 ext_loc_stream_triple(xml,Dirs-Base,S,T) :- !, | |
187 insist(load_xmlns(S,Doc)), | |
188 insist(member(element(mets:mets,_,METS),Doc)), | |
189 ( insist(get_bl_url(Base,Dirs,URL)), | |
190 T=rdf(bldata:Base,dml:blpage,URL) | |
191 ; T=rdf(bldata:Base,rdf:type,mo:'Signal') | |
192 ; insist(multi,member(element(Tag,_,Content),METS),no_content(mets,Dirs-Base)), | |
193 mets_triple(Tag,Content,Dirs,Base,T) | |
194 ). | |
195 | |
196 ext_loc_stream_triple(txt,Dirs-Base,S,T) :- !, | |
197 % TXT streams appear to be LATIN-1 encoded, not UTF-8 | |
198 Base\=combined, % exclude combined metadata files | |
199 set_stream(S,encoding(iso_latin_1)), | |
200 insist(load_xmlns(S,Doc)), | |
201 txt_triple(Doc,Dirs,Base,T). | |
202 | |
203 ext_loc_stream_triple(Ext,Dirs-Base,_,_) :- | |
204 warning(unrecognised_extension(Ext,Dirs-Base)). | |
205 | |
206 | |
207 %% get_bl_url(+Name:atom,+Dirs:list(atom),-URL:atom) is det. | |
208 % Deduces the sounds.bl.uk URL from entry name and directory. | |
209 get_bl_url(Name,Dirs,URL) :- | |
210 sub_atom(Name,0,4,_,CatCode), | |
211 category(CatCode,Category), | |
212 append(_,[Collection],Dirs), | |
213 atomic_list_concat([ 'http://sounds.bl.uk', | |
214 Category,Collection,Name],'/',URL). | |
215 | |
216 category('021M','Oral-history'). | |
217 category('025M','World-and-traditional-music'). | |
218 category('023M','Jazz-and-popular-music'). | |
219 category('026M','Classical-music'). | |
220 category('028M','Jazz-and-popular-music'). | |
221 category('020A','Classical-music'). | |
222 | |
223 ignore_dir('_Metadata'). | |
224 ignore_dir('_Non-music'). | |
225 ignore_dir('_Audio_Metadata'). | |
226 | |
227 % --------------------- TOP LEVEL STRUCTURE ------------------------------- | |
228 | |
229 unwrap([element(mets:mdWrap,MDAttr,[element(mets:xmlData,_,XMLContent)])],MDAttr,XMLContent). | |
230 | |
231 % for complete METS documents | |
232 mets_triple(mets:dmdSec,DMDContent,_,ID,T) :- !, | |
233 unwrap(DMDContent,MDAttr,XMLContent), | |
234 member('MDTYPE'='DC',MDAttr), | |
235 dmd_triple(XMLContent,bldata:ID,T). | |
236 | |
237 mets_triple(mets:amdSec,AMDContent,_,ID,T) :- !, | |
238 member(element(Tag,Attr,Content),AMDContent), | |
239 amd_triple(Tag,Attr,Content,ID,T). | |
240 | |
241 mets_triple(mets:fileSec,FileSecContent,Dirs,ID,T) :- !, | |
242 member(element(T1,GAttr,GroupContent),FileSecContent), insist(T1=mets:fileGrp), | |
243 member(element(T2,FAttr,FileContent),GroupContent), insist(T2=mets:file), | |
244 \+member('ID'='WEBRESOURCES',GAttr), | |
245 FileContent=[element(Tag,Attr,Content)], | |
246 file_triple(Tag,Attr,Content,GAttr-FAttr,Dirs,ID,T). | |
247 | |
248 mets_triple(mets:metsHdr,_,_,_,_) :- !, fail. | |
249 mets_triple(mets:structLink,_,_,_,_) :- !, fail. | |
250 mets_triple(mets:structMap,_,_,_,_) :- !, fail. | |
251 mets_triple(Tag,_,_,_,_) :- warning(unrecognised_tag(Tag,mets:mets)). | |
252 | |
253 % for txt, partial XML documents | |
254 txt_triple(_,_,ID,rdf(bldata:ID,rdf:type,mo:'Signal')). | |
255 txt_triple(Doc,Dirs,ID,T) :- | |
256 insist(multi,member(element(Tag,_,Content),Doc),no_content(txt)), | |
257 txt_tag_triple(Tag,Content,Dirs,ID,T). | |
258 | |
259 identifier_file_ext(F,F1,mp3) :- sub_atom(F,Bef,_,_,'.mp3'), !, insist(sub_atom(F,0,Bef,_,F1)). | |
260 identifier_file_ext(F,F1,wav) :- sub_atom(F,Bef,_,0,'.wav'), !, insist(sub_atom(F,0,Bef,_,F1)). | |
261 identifier_file_ext(F,F1,m4a) :- sub_atom(F,Bef,_,0,'.m4a'), !, insist(sub_atom(F,0,Bef,_,F1)). | |
262 | |
263 txt_tag_triple(dc:identifier, [F], Dirs, ID, rdf(bldata:ID, bldata:path, literal(Path))) :- !, | |
264 % NB some of the txt files have the file name written twice. Hence I am going to discard | |
265 % everything after the first dot. Relies on sub_atom returning matches starting from the beginning | |
266 ( identifier_file_ext(F,F1,Ext) | |
267 -> file_name_extension(F1,Ext,Name), | |
268 atomics_to_string(Dirs,"/",Dir), | |
269 directory_file_path(Dir,Name,Path) | |
270 ; print_message(warning,txt_triple_identifier_fail(ID,F)), | |
271 fail | |
272 ). | |
273 | |
274 % !!! MUSICALS only. Should not really have mo:duration in them either... | |
275 txt_tag_triple(dml:rating,Content, _, ID, rdf(bldata:ID, dml:rating, literal(Content))) :- !. | |
276 txt_tag_triple(mo:duration,Content, _, ID, rdf(bldata:ID, mo:duration, literal(type(xsd:float,Millis)))) :- !, | |
277 insist(Content=[Dur],bad_content(Content,mo:duration)), | |
278 insist(atom_number(Dur,Millis)). | |
279 txt_tag_triple(blapsi:file_duration,Content, _, ID, rdf(bldata:ID, mo:duration, literal(type(xsd:float,Millis)))) :- !, | |
280 insist(Content=[Dur],bad_content(Content,blapsi:file_duration)), | |
281 % insist(parse_duration_millis(Dur,Millis)). | |
282 ( parse_duration_millis(Dur,Millis) -> true | |
283 ; warning(bad_duration(ID,Dur)) | |
284 ). | |
285 txt_tag_triple(Tag,Content,_,ID,T) :- dmd_tag_triple(Tag,Content,bldata:ID,T). | |
286 | |
287 | |
288 % --------------- Document meta data -------------------------- | |
289 | |
290 dmd_triple(DMD,URI,T) :- | |
291 member(element(Tag,_,Content),DMD), | |
292 dmd_tag_triple(Tag,Content,URI,T). | |
293 | |
294 dmd_tag_triple(dcterms:contributor,Content,ID,T) :- !, dmd_triple(Content,bldata:ID,T). | |
295 dmd_tag_triple(dc:contributor,Content,ID,T) :- !, dmd_triple(Content,bldata:ID,T). | |
296 dmd_tag_triple(marcrel:REL,Content,URI,rdf(URI,marcrel:Rel,literal(Lit))) :- !, | |
297 Content=[Lit],%empty_tag(marcrel:REL,Content)), | |
298 downcase_atom(REL,Rel). | |
299 dmd_tag_triple(Tag,Content,URI,rdf(URI,Tag,literal(Lit))) :- keep_tag(Tag), !, Content=[Lit]. | |
300 dmd_tag_triple(Tag,_,_,_) :- ignore_tag(Tag), !, fail. | |
301 dmd_tag_triple(Tag,_Content,URI,_) :- warning(unrecognised_tag(Tag,dmd,URI)). | |
302 | |
303 % !!!FIXME - sometimes dates are given in D/M/Y instead of Y-M-D | |
304 keep_tag(dc:title). | |
305 keep_tag(dc:description). | |
306 keep_tag(dc:source). | |
307 keep_tag(dc:subject). | |
308 keep_tag(dc:language). | |
309 keep_tag(dc:created). | |
310 keep_tag(dcterms:language). | |
311 keep_tag(dcterms:abstract). | |
312 keep_tag(dcterms:created). | |
313 keep_tag(dcterms:spatial). | |
314 keep_tag(dcterms:temporal). | |
315 keep_tag(dcterms:extent). % !!!FIXME need to parse this | |
316 keep_tag(blterms:mechanism). | |
317 keep_tag(dcterms:isPartOf). | |
318 keep_tag(blapsi:format). | |
319 | |
320 ignore_tag(dc:identifier). | |
321 ignore_tag(blapsi:marker). | |
322 ignore_tag(dc:rights). | |
323 ignore_tag(dc:type). | |
324 ignore_tag(rdf:about). | |
325 ignore_tag('ARK'). | |
326 | |
327 % ------------------------- ADMINISTRATIVE METADATA SECTION ----------------------------- | |
328 | |
329 amd_triple(mets:sourceMD,_,SMDContent,ID,T) :- | |
330 insist(unwrap(SMDContent,_,XMLContent),no_xml_content(SMDContent,smd)), | |
331 atom_concat(ID,'#source',Src), | |
332 ( T=rdf(bldata:ID,dml:source,bldata:Src) | |
333 ; insist(multi,member(element(Tag,Attr,Content),XMLContent),no_xml_content(smd)), | |
334 smd_xml_triple(Tag,Attr,Content,bldata:Src,T) | |
335 ). | |
336 | |
337 amd_triple(mets:techMD,Attr,TMDContent,ID,T) :- | |
338 insist(member('ID'=TMDId,Attr)), | |
339 unwrap(TMDContent,_,XMLContent), | |
340 ( T=rdf(bldata:ID/TMDId, mo:sampled_version_of, bldata:ID) | |
341 % ; T=rdf(bldata:ID/TMDId, dml:annotation,literal(Label)), member('LABEL'=Label,TMDAttr) | |
342 ; member(element(Tag,_,Content),XMLContent), | |
343 blapsi_triple(Tag, Content, bldata:ID/TMDId, T) | |
344 ). | |
345 | |
346 blapsi_triple(blapsi:Tag, [Text], Signal, rdf(Signal, Pred, literal(Lit))) :- | |
347 insist(blapsi_info(Tag, Text, Pred, Lit)). | |
348 | |
349 % ------------ Source --------------- | |
350 | |
351 smd_xml_triple(blapsi:audioObject,Attr,AOContent,SrcURI,rdf(SrcURI,Pred,literal(Lit))) :- !, | |
352 ( member(A=Lit,Attr), A\='ID', Pred=bldata:A | |
353 ; insist(multi,member(element(Tag,Attr1,Content),AOContent),no_content(blapsi:audioObject,AOContent)), | |
354 ao_tag_info(Tag,Attr1,Content,Pred,Lit) | |
355 ). | |
356 | |
357 smd_xml_triple(blapsi:Tag,Content,SrcURI,rdf(SrcURI,blapsi:Tag,literal(Lit))) :- !, | |
358 insist(Content=[Lit],bad_content(blapsi:Tag,Content,smd_xml_triple)). | |
359 | |
360 ao_tag_info(blapsi:primaryIdentifier,_,_,_,_) :- !, fail. | |
361 ao_tag_info(blapsi:secondaryIdentifier,_,_,_,_) :- !, fail. | |
362 ao_tag_info(blapsi:format,_,Content,blapsi:format,Lit) :- !, | |
363 insist(Content=[Lit],bad_content(Content,blapsi:format)). | |
364 ao_tag_info(blapsi:face, Attr, Content, Pred, Lit) :- !, | |
365 %insist(member('ID'=ID,Attr)), | |
366 insist(member('label'=Label,Attr)), | |
367 insist(Content=[],non_empty_content(blapsi:face,Content)), | |
368 ( fail % Pred=bldata:face_id, Lit=ID IGNORE FOR NOW | |
369 ; Pred=bldata:face_label, Lit=Label | |
370 ). | |
371 ao_tag_info(blapsi:physicalProperties, _, PPContent, Tag, Lit) :- !, | |
372 insist(multi,member(element(Tag,_,Content),PPContent),no_content(blapsi:physicalProperties)), | |
373 insist(Content=[Lit],bad_content(Tag,Content)). | |
374 ao_tag_info(Tag,_,_,_,_) :- | |
375 warning(unrecognised_tag(Tag,blapsi:audioObject)). | |
376 | |
377 % identifier_pred('ASR Root ID',asr_root_id). | |
378 % identifier_pred('Sound Archive accession number',accession_number). | |
379 | |
380 | |
381 | |
382 % -------------- FILE SECTION --------------------------- | |
383 | |
384 file_triple(mets:'FLocat',Attr,LocContent,GAttr-FAttr,Dirs,ID,T) :- !, | |
385 ( member('MIMETYPE'=MimeType,FAttr) | |
386 -> audio_mimetype(MimeType), | |
387 insist(member('AMDID'=TMDId1,FAttr)), | |
388 insist(member('LOCTYPE'='URL',Attr)), | |
389 insist(member((xlink:href)=Link,Attr)), | |
390 insist(LocContent=[],non_empty(mets:'FLocat',LocContent)), | |
391 insist(member('USE'=Use1,FAttr);member('USE'=Use1,GAttr)), | |
392 % TMDId1 is sometimes "techMDxx digiprovXX" - need to get rid of second word | |
393 atomic_list_concat([TMDId|_],' ',TMDId1), | |
394 downcase_atom(Use1,Use), | |
395 ( T=rdf(bldata:ID/TMDId,dml:mimetype,literal(MimeType)) | |
396 ; file_path_triple(bldata:ID/TMDId,Dirs,Link,T) | |
397 ; T=rdf(bldata:ID/TMDId,bldata:use,literal(Use)) | |
398 ) | |
399 ; insist(\+member('AMDID'=_,FAttr)), | |
400 insist(member('ID'=FileID,FAttr)), | |
401 insist(member('LOCTYPE'='URL',Attr)), | |
402 insist(member((xlink:href)=Link,Attr)), | |
403 sub_atom(Link,_,3,0,Ext), | |
404 audio_extension(Ext), | |
405 ( T=rdf(bldata:ID/FileID,mo:sampled_version_of,bldata:ID) | |
406 ; file_path_triple(bldata:ID/FileID,Dirs,Link,T) | |
407 ; member(element(Tag,_,Content),LocContent), | |
408 blapsi_triple(Tag,Content,bldata:ID/FileID,T) | |
409 ) | |
410 ). | |
411 | |
412 file_triple(mets:'Fcontent',_,_,_,_,_,_) :- !, fail. | |
413 file_triple(Tag,_,_,_,_,_,_) :- warning(unrecognised_tag(Tag,file)). | |
414 | |
415 file_path_triple(URI,Dirs,Link,rdf(URI,bldata:path,literal(Path))) :- | |
416 atomics_to_string(Parts,"/",Link), | |
417 atomics_to_string(Dirs,"/",Dir), | |
418 append(_,[Name],Parts), | |
419 directory_file_path(Dir,Name,Path). | |
420 | |
421 audio_mimetype('sound/wav'). | |
422 audio_mimetype('sound/wma'). | |
423 audio_mimetype('sound/mp3'). | |
424 audio_mimetype('sound/ogg'). | |
425 | |
426 audio_extension(wav). | |
427 audio_extension(mp3). | |
428 audio_extension(wma). | |
429 | |
430 % ---------------------------- BLAPSI INFO --------------------------------- | |
431 | |
432 blapsi_info(file_sample, X, mo:sample_rate, type(xsd:float,SampleRate)) :- atom_number(X,SampleRate). | |
433 blapsi_info(file_resolution, X, mo:bitsPerSample, type(xsd:int,Bits)) :- atom_number(X,Bits). | |
434 blapsi_info(resolution, X, mo:bitsPerSample, type(xsd:int,Bits)) :- atom_number(X,Bits). | |
435 blapsi_info(file_channels, X, mo:channels, type(xsd:int,Channels)) :- atom_number(X,Channels). | |
436 blapsi_info(file_duration, X, mo:duration, type(xsd:float,Millis)) :- parse_duration_millis(X,Millis). | |
437 blapsi_info(file_bitrate, X, blapsi:file_bitrate, X). % !!! FIXME should be attached to file, not to signal | |
438 blapsi_info(file_size, X, blapsi:file_size, type(xsd:int,Size)) :- atom_number(X,Size). | |
439 blapsi_info(file_length, X, blapsi:file_length, type(xsd:int,Size)) :- atom_number(X,Size). | |
440 | |
441 % --------------------------- SUPPORTING PREDICATES -------------------------- | |
442 | |
443 :- use_module(library(async)). | |
444 :- public scrape_all/2. | |
445 :- volatile_memo scrape_all(+options:list,-count:nonneg). | |
446 scrape_all(Opts,Count) :- | |
447 option(spacing(Sleep),Opts,1), | |
448 option(timeout(Timeout),Opts,10), | |
449 findall( R, rdf(R,dml:blpage,_), Rs), | |
450 with_progress_stack(map_with_progress(scrape_then_sleep(Sleep,Timeout),Rs,Ss)), | |
451 exclude(=(ok),Ss, Failures), | |
452 (Failures=[] -> length(Rs,Count); throw(scrape_errors(Failures))). | |
453 | |
454 scrape_then_sleep(D,T,R,Status) :- | |
455 ( audio_link(R,_) -> Status=ok | |
456 ; memo:reify(bl_p2r:slow(D,call_with_time_limit(T,scrape_audio_link(R,_))),Status), | |
457 (Status=ex(abort(Reason)) -> throw(abort(Reason)); true) | |
458 ). | |
459 | |
460 slow(Delay,Goal) :- call(Goal), sleep(Delay). | |
461 | |
462 %% has_available_audio(+R:uri) is semidet. | |
463 %% has_available_audio(-R:uri) is nondet. | |
464 % True when R is a recording in the BL collection whose audio is | |
465 % publicly available. | |
466 has_available_audio(R) :- | |
467 rdf(R,dml:blpage,_), | |
468 scrape_audio_link(R,_). | |
469 | |
470 :- public audio_link/2. | |
471 audio_link(URI,AudioURL) :- | |
472 browse(scrape_audio_link(URI,AudioURL)). | |
473 | |
474 :- public audio_file/3. | |
475 audio_file(URI,Path,just(mp3)) :- | |
476 setting(audio_root,just(Root)), | |
477 ( rdf(URI,bldata:path,literal(RelPath)), | |
478 rdf(URI,rdf:type,mo:'Signal') | |
479 ; rdf(URI2,mo:sampled_version_of,URI), | |
480 rdf(URI2,dml:mimetype,literal('sound/mp3')), | |
481 rdf(URI2,bldata:path,literal(RelPath)) | |
482 ), | |
483 atomic_list_concat([Root,RelPath],'/',Path). | |
484 | |
485 :- volatile_memo scrape_audio_link(+atom,-atom). | |
486 | |
487 scrape_audio_link(URI,AudioURL) :- | |
488 rdf(URI,dml:blpage,PageURL), | |
489 debug(bl_p2r,'Scraping audio link for ~w...',[URI]), | |
490 atom_concat('http://sounds.bl.uk/',_,PageURL), | |
491 http_get(PageURL,Doc,[]), | |
492 xpath(Doc,//li(@class=mainTrack)/a(@id),ID), | |
493 string_concat("MNT-",Key,ID), | |
494 string_concat('http://sounds.bl.uk/GT/',Key,AudioURL). | |
495 | |
496 sandbox:safe_primitive(bl_p2r:audio_link(_,_)). | |
497 sandbox:safe_primitive(bl_p2r:scrape_audio_link(_,_)). | |
498 | |
499 xpath(Prop,E,Path,Val) :- xpath(E,Path,I), xpath(I,/self(Prop),Val). | |
500 | |
501 warning(Term) :- nl, print_message(warning,Term), fail. |