Daniel@0
|
1 /* Part of DML (Digital Music Laboratory)
|
Daniel@0
|
2 Copyright 2014-2015 Samer Abdallah, University of London
|
Daniel@0
|
3
|
Daniel@0
|
4 This program is free software; you can redistribute it and/or
|
Daniel@0
|
5 modify it under the terms of the GNU General Public License
|
Daniel@0
|
6 as published by the Free Software Foundation; either version 2
|
Daniel@0
|
7 of the License, or (at your option) any later version.
|
Daniel@0
|
8
|
Daniel@0
|
9 This program is distributed in the hope that it will be useful,
|
Daniel@0
|
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
|
Daniel@0
|
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
Daniel@0
|
12 GNU General Public License for more details.
|
Daniel@0
|
13
|
Daniel@0
|
14 You should have received a copy of the GNU General Public
|
Daniel@0
|
15 License along with this library; if not, write to the Free Software
|
Daniel@0
|
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
Daniel@0
|
17 */
|
Daniel@0
|
18
|
Daniel@0
|
19 :- module(bl_p2r,[ audio_file/3, audio_link/2, scrape_audio_link/2, has_available_audio/1 ]).
|
Daniel@0
|
20
|
Daniel@0
|
21 :- use_module(library(semweb/rdf_db)).
|
Daniel@0
|
22 :- use_module(library(xmlarchive)).
|
Daniel@0
|
23 :- use_module(library(xpath)).
|
Daniel@0
|
24 :- use_module(library(settings)).
|
Daniel@0
|
25 :- use_module(library(sandbox)).
|
Daniel@0
|
26 :- use_module(library(fileutils)).
|
Daniel@0
|
27 :- use_module(library(termutils)).
|
Daniel@0
|
28 :- use_module(library(rdfutils)).
|
Daniel@0
|
29 :- use_module(library(insist)).
|
Daniel@0
|
30 :- use_module(library(memo)).
|
Daniel@0
|
31 :- use_module(entailment(p2r)).
|
Daniel@0
|
32 :- use_module(cliopatria(hooks)).
|
Daniel@0
|
33 :- use_module(library(http/http_client)).
|
Daniel@0
|
34
|
Daniel@0
|
35 :- set_prolog_flag(double_quotes,string).
|
Daniel@0
|
36
|
Daniel@0
|
37 /*
|
Daniel@0
|
38 METS to RDF conversion.
|
Daniel@0
|
39
|
Daniel@0
|
40 Useful info:
|
Daniel@0
|
41 0. Each top-level mets:mets element contains several sections.
|
Daniel@0
|
42 1,4 x dmdSec .. 1 x mdWrap
|
Daniel@0
|
43 1 x amdSec .. [ N x techMD, {0,1} x rightsMD, {0,1} x sourceMD ]
|
Daniel@0
|
44 1 x fileSec .. 1-4 x fileGrp .. 1-36 x file
|
Daniel@0
|
45 1 x structLink .. {1,3} x smLink
|
Daniel@0
|
46 2-3 x structMap .. 1 x div
|
Daniel@0
|
47
|
Daniel@0
|
48 1. Each dmdSec only ever contains one mdWrap element
|
Daniel@0
|
49 5. each techMD or sourceMD contains exactly 1 mdWrap
|
Daniel@0
|
50 6. each rightsMD contains an mdRef
|
Daniel@0
|
51 7. The rightsMD is not interesting to look at.
|
Daniel@0
|
52 8. dmdSec mdWraps can be empty
|
Daniel@0
|
53
|
Daniel@0
|
54 List of all dmdSec tags in dataset
|
Daniel@0
|
55 blapsi:id3album
|
Daniel@0
|
56 blapsi:id3artist
|
Daniel@0
|
57 blapsi:id3comment
|
Daniel@0
|
58 blapsi:id3songtitle
|
Daniel@0
|
59 blapsi:id3year
|
Daniel@0
|
60 dc:description
|
Daniel@0
|
61 dc:identifier
|
Daniel@0
|
62 dc:language
|
Daniel@0
|
63 dc:rights
|
Daniel@0
|
64 dc:source
|
Daniel@0
|
65 dc:subject
|
Daniel@0
|
66 dc:title
|
Daniel@0
|
67 dc:type
|
Daniel@0
|
68 dcterms:created
|
Daniel@0
|
69 dcterms:isPartOf
|
Daniel@0
|
70 dcterms:spatial
|
Daniel@0
|
71 dcterms:temporal
|
Daniel@0
|
72 marcrel:CMP
|
Daniel@0
|
73 marcrel:CND
|
Daniel@0
|
74 marcrel:IVE
|
Daniel@0
|
75 marcrel:IVR
|
Daniel@0
|
76 marcrel:LYR
|
Daniel@0
|
77 marcrel:PRF
|
Daniel@0
|
78 marcrel:RCE
|
Daniel@0
|
79 marcrel:SPK
|
Daniel@0
|
80 mods:accessCondition
|
Daniel@0
|
81 mods:identifier
|
Daniel@0
|
82 mods:name
|
Daniel@0
|
83 mods:recordInfo
|
Daniel@0
|
84 mods:titleInfo
|
Daniel@0
|
85
|
Daniel@0
|
86 amdSec tags
|
Daniel@0
|
87 blapsi:audioObject
|
Daniel@0
|
88 blapsi:file_bitrate
|
Daniel@0
|
89 blapsi:file_channels
|
Daniel@0
|
90 blapsi:file_duration
|
Daniel@0
|
91 blapsi:file_sample
|
Daniel@0
|
92 blapsi:resolution
|
Daniel@0
|
93
|
Daniel@0
|
94 sourceMD .. blapsi:audioObject
|
Daniel@0
|
95 blapsi:face
|
Daniel@0
|
96 blapsi:format
|
Daniel@0
|
97 blapsi:physicalProperties
|
Daniel@0
|
98 blapsi:face
|
Daniel@0
|
99 blapsi:format
|
Daniel@0
|
100 blapsi:physicalProperties
|
Daniel@0
|
101 blapsi:primaryIdentifier
|
Daniel@0
|
102 blapsi:secondaryIdentifier
|
Daniel@0
|
103 blapsi:primaryIdentifier
|
Daniel@0
|
104 blapsi:secondaryIdentifier
|
Daniel@0
|
105
|
Daniel@0
|
106 would prefer to have mo predicates to foaf:Person resources for these.
|
Daniel@0
|
107 marcrel:'CMP',dml:composer).
|
Daniel@0
|
108 marcrel:'CND',dml:conductor).
|
Daniel@0
|
109 marcrel:'PRF',dml:performer).
|
Daniel@0
|
110
|
Daniel@0
|
111 */
|
Daniel@0
|
112
|
Daniel@0
|
113 :- setting(audio_root,ground,nothing,'Root directory of audio file collection').
|
Daniel@0
|
114 :- setting(archive_pattern,atom,'~/lib/datasets/mets/BL_metadata_complete.7z','Pattern to match METS metadata files').
|
Daniel@0
|
115
|
Daniel@0
|
116 :- rdf_register_prefix(bldata,'http://sounds.bl.uk/resource/').
|
Daniel@0
|
117 :- rdf_register_prefix(marcrel,'http://id.loc.gov/vocabulary/relators/').
|
Daniel@0
|
118 :- rdf_register_prefix(blapsi,'http://sounds.bl.uk/blapsi#').
|
Daniel@0
|
119 :- rdf_register_prefix(blterms,'http://www.bl.uk/schemas/bibliographic/blterms#').
|
Daniel@0
|
120
|
Daniel@0
|
121 % https://code.google.com/p/libarchive/issues/detail?id=328&colspec=ID%20Type%20Status%20Priority%20Milestone%20OpSys%20Owner%20Summary
|
Daniel@0
|
122
|
Daniel@0
|
123 :- public import/0.
|
Daniel@0
|
124 import :- assert_all(bl_p2r).
|
Daniel@0
|
125
|
Daniel@0
|
126 rdf(dml:blpage,rdfs:range,foaf:'Document') <== true.
|
Daniel@0
|
127 rdf(dml:blpage,rdfs:subPropertyOf,foaf:page) <== true.
|
Daniel@0
|
128 rdf(S,P,O) <== bl_archive_triple(T), once(expand_triple(T,rdf(S,P,O))).
|
Daniel@0
|
129
|
Daniel@0
|
130 expand_triple(rdf(SS,PP,OO),rdf(S,P,O)) :-
|
Daniel@0
|
131 uripattern:pattern_uri(SS,S),
|
Daniel@0
|
132 uripattern:pattern_uri(PP,P),
|
Daniel@0
|
133 ( OO=literal(_) -> rdf_global_object(OO,O)
|
Daniel@0
|
134 ; uripattern:pattern_uri(OO,O)
|
Daniel@0
|
135 ).
|
Daniel@0
|
136
|
Daniel@0
|
137 %% import_directory(+Dir:atom, +Graph:atom) is det.
|
Daniel@0
|
138 %
|
Daniel@0
|
139 % Import contents of a given directory into a named RDF graph.
|
Daniel@0
|
140 % The directory must be a subdirectory of the directory named in the
|
Daniel@0
|
141 % bl_p2r:audio_root setting (whose value is either =|just(Dir)|= or
|
Daniel@0
|
142 % =|nothing|=).
|
Daniel@0
|
143 import_directory(Dir,Graph) :-
|
Daniel@0
|
144 forall( directory_triple(Dir,T), (once(expand_triple(T,rdf(S,P,O))), rdf_assert(S,P,O,Graph))).
|
Daniel@0
|
145
|
Daniel@0
|
146 directory_triple(Dir,T) :-
|
Daniel@0
|
147 Ext=txt,
|
Daniel@0
|
148 setting(audio_root,just(AudioRoot)),
|
Daniel@0
|
149 atom_concat(AudioRoot,'/',Prefix),
|
Daniel@0
|
150 find_files(under(AudioRoot/Dir),FullPath),
|
Daniel@0
|
151 atom_concat(Prefix,Path,FullPath),
|
Daniel@0
|
152 split_path(Path,Loc,Ext),
|
Daniel@0
|
153 with_stream(Str,open(FullPath,read,Str), ext_loc_stream_triple(Ext,Loc,Str,T)).
|
Daniel@0
|
154
|
Daniel@0
|
155
|
Daniel@0
|
156 %% bl_archive_triple(-T:rdf) is nondet.
|
Daniel@0
|
157 % This predicate generates triples from the metadata archive files whose names
|
Daniel@0
|
158 % match the pattern stored in the setting bl_p2r:archive_pattern.
|
Daniel@0
|
159 bl_archive_triple(T) :-
|
Daniel@0
|
160 setting(archive_pattern, ArchivePattern),
|
Daniel@0
|
161 find_files(like(ArchivePattern), Archive),
|
Daniel@0
|
162 with_archive_stream(Archive, Path, path_triple_stream(Path,T)).
|
Daniel@0
|
163
|
Daniel@0
|
164 path_triple_stream(Path,T,S) :-
|
Daniel@0
|
165 status('Importing ~s',[Path]),
|
Daniel@0
|
166 catch(( insist(split_path(Path,Loc,Ext)),
|
Daniel@0
|
167 ext_loc_stream_triple(Ext,Loc,S,T)
|
Daniel@0
|
168 ), Ex, (nl,print_message(warning,Ex),fail)).
|
Daniel@0
|
169
|
Daniel@0
|
170 split_path(Path,Dirs-Base,Ext) :-
|
Daniel@0
|
171 % split_string(Path,'/','',Parts),
|
Daniel@0
|
172 atomic_list_concat(Parts,'/',Path),
|
Daniel@0
|
173 exclude(ignore_dir,Parts,Parts1),
|
Daniel@0
|
174 append(Dirs,[Name],Parts1),
|
Daniel@0
|
175 sub_atom(Name,_,3,0,Ext), % NB this assumes three character extension
|
Daniel@0
|
176 sub_atom(Name,0,_,4,Base).
|
Daniel@0
|
177
|
Daniel@0
|
178 %% ext_loc_stream_triple(+Ext:atom, +Loc:pair(list(atom), atom), +S:stream, -T:rdf) is nondet.
|
Daniel@0
|
179 %
|
Daniel@0
|
180 % This predicate succeeds once for each RDF triple that can be derived from
|
Daniel@0
|
181 % an archive stream whose path in the archive Loc=Dirs-Base consist of the directories
|
Daniel@0
|
182 % Dirs and whose name is Base,'.',Ext. It understands entries with extensions
|
Daniel@0
|
183 % xml and txt. XML streams are decoded as full METS documents. TXT streams are decoded
|
Daniel@0
|
184 % as the DMD section of a METS document. Other extensions generate a warning and then fail.
|
Daniel@0
|
185
|
Daniel@0
|
186 ext_loc_stream_triple(xml,Dirs-Base,S,T) :- !,
|
Daniel@0
|
187 insist(load_xmlns(S,Doc)),
|
Daniel@0
|
188 insist(member(element(mets:mets,_,METS),Doc)),
|
Daniel@0
|
189 ( insist(get_bl_url(Base,Dirs,URL)),
|
Daniel@0
|
190 T=rdf(bldata:Base,dml:blpage,URL)
|
Daniel@0
|
191 ; T=rdf(bldata:Base,rdf:type,mo:'Signal')
|
Daniel@0
|
192 ; insist(multi,member(element(Tag,_,Content),METS),no_content(mets,Dirs-Base)),
|
Daniel@0
|
193 mets_triple(Tag,Content,Dirs,Base,T)
|
Daniel@0
|
194 ).
|
Daniel@0
|
195
|
Daniel@0
|
196 ext_loc_stream_triple(txt,Dirs-Base,S,T) :- !,
|
Daniel@0
|
197 % TXT streams appear to be LATIN-1 encoded, not UTF-8
|
Daniel@0
|
198 Base\=combined, % exclude combined metadata files
|
Daniel@0
|
199 set_stream(S,encoding(iso_latin_1)),
|
Daniel@0
|
200 insist(load_xmlns(S,Doc)),
|
Daniel@0
|
201 txt_triple(Doc,Dirs,Base,T).
|
Daniel@0
|
202
|
Daniel@0
|
203 ext_loc_stream_triple(Ext,Dirs-Base,_,_) :-
|
Daniel@0
|
204 warning(unrecognised_extension(Ext,Dirs-Base)).
|
Daniel@0
|
205
|
Daniel@0
|
206
|
Daniel@0
|
207 %% get_bl_url(+Name:atom,+Dirs:list(atom),-URL:atom) is det.
|
Daniel@0
|
208 % Deduces the sounds.bl.uk URL from entry name and directory.
|
Daniel@0
|
209 get_bl_url(Name,Dirs,URL) :-
|
Daniel@0
|
210 sub_atom(Name,0,4,_,CatCode),
|
Daniel@0
|
211 category(CatCode,Category),
|
Daniel@0
|
212 append(_,[Collection],Dirs),
|
Daniel@0
|
213 atomic_list_concat([ 'http://sounds.bl.uk',
|
Daniel@0
|
214 Category,Collection,Name],'/',URL).
|
Daniel@0
|
215
|
Daniel@0
|
216 category('021M','Oral-history').
|
Daniel@0
|
217 category('025M','World-and-traditional-music').
|
Daniel@0
|
218 category('023M','Jazz-and-popular-music').
|
Daniel@0
|
219 category('026M','Classical-music').
|
Daniel@0
|
220 category('028M','Jazz-and-popular-music').
|
Daniel@0
|
221 category('020A','Classical-music').
|
Daniel@0
|
222
|
Daniel@0
|
223 ignore_dir('_Metadata').
|
Daniel@0
|
224 ignore_dir('_Non-music').
|
Daniel@0
|
225 ignore_dir('_Audio_Metadata').
|
Daniel@0
|
226
|
Daniel@0
|
227 % --------------------- TOP LEVEL STRUCTURE -------------------------------
|
Daniel@0
|
228
|
Daniel@0
|
229 unwrap([element(mets:mdWrap,MDAttr,[element(mets:xmlData,_,XMLContent)])],MDAttr,XMLContent).
|
Daniel@0
|
230
|
Daniel@0
|
231 % for complete METS documents
|
Daniel@0
|
232 mets_triple(mets:dmdSec,DMDContent,_,ID,T) :- !,
|
Daniel@0
|
233 unwrap(DMDContent,MDAttr,XMLContent),
|
Daniel@0
|
234 member('MDTYPE'='DC',MDAttr),
|
Daniel@0
|
235 dmd_triple(XMLContent,bldata:ID,T).
|
Daniel@0
|
236
|
Daniel@0
|
237 mets_triple(mets:amdSec,AMDContent,_,ID,T) :- !,
|
Daniel@0
|
238 member(element(Tag,Attr,Content),AMDContent),
|
Daniel@0
|
239 amd_triple(Tag,Attr,Content,ID,T).
|
Daniel@0
|
240
|
Daniel@0
|
241 mets_triple(mets:fileSec,FileSecContent,Dirs,ID,T) :- !,
|
Daniel@0
|
242 member(element(T1,GAttr,GroupContent),FileSecContent), insist(T1=mets:fileGrp),
|
Daniel@0
|
243 member(element(T2,FAttr,FileContent),GroupContent), insist(T2=mets:file),
|
Daniel@0
|
244 \+member('ID'='WEBRESOURCES',GAttr),
|
Daniel@0
|
245 FileContent=[element(Tag,Attr,Content)],
|
Daniel@0
|
246 file_triple(Tag,Attr,Content,GAttr-FAttr,Dirs,ID,T).
|
Daniel@0
|
247
|
Daniel@0
|
248 mets_triple(mets:metsHdr,_,_,_,_) :- !, fail.
|
Daniel@0
|
249 mets_triple(mets:structLink,_,_,_,_) :- !, fail.
|
Daniel@0
|
250 mets_triple(mets:structMap,_,_,_,_) :- !, fail.
|
Daniel@0
|
251 mets_triple(Tag,_,_,_,_) :- warning(unrecognised_tag(Tag,mets:mets)).
|
Daniel@0
|
252
|
Daniel@0
|
253 % for txt, partial XML documents
|
Daniel@0
|
254 txt_triple(_,_,ID,rdf(bldata:ID,rdf:type,mo:'Signal')).
|
Daniel@0
|
255 txt_triple(Doc,Dirs,ID,T) :-
|
Daniel@0
|
256 insist(multi,member(element(Tag,_,Content),Doc),no_content(txt)),
|
Daniel@0
|
257 txt_tag_triple(Tag,Content,Dirs,ID,T).
|
Daniel@0
|
258
|
Daniel@0
|
259 identifier_file_ext(F,F1,mp3) :- sub_atom(F,Bef,_,_,'.mp3'), !, insist(sub_atom(F,0,Bef,_,F1)).
|
Daniel@0
|
260 identifier_file_ext(F,F1,wav) :- sub_atom(F,Bef,_,0,'.wav'), !, insist(sub_atom(F,0,Bef,_,F1)).
|
Daniel@0
|
261 identifier_file_ext(F,F1,m4a) :- sub_atom(F,Bef,_,0,'.m4a'), !, insist(sub_atom(F,0,Bef,_,F1)).
|
Daniel@0
|
262
|
Daniel@0
|
263 txt_tag_triple(dc:identifier, [F], Dirs, ID, rdf(bldata:ID, bldata:path, literal(Path))) :- !,
|
Daniel@0
|
264 % NB some of the txt files have the file name written twice. Hence I am going to discard
|
Daniel@0
|
265 % everything after the first dot. Relies on sub_atom returning matches starting from the beginning
|
Daniel@0
|
266 ( identifier_file_ext(F,F1,Ext)
|
Daniel@0
|
267 -> file_name_extension(F1,Ext,Name),
|
Daniel@0
|
268 atomics_to_string(Dirs,"/",Dir),
|
Daniel@0
|
269 directory_file_path(Dir,Name,Path)
|
Daniel@0
|
270 ; print_message(warning,txt_triple_identifier_fail(ID,F)),
|
Daniel@0
|
271 fail
|
Daniel@0
|
272 ).
|
Daniel@0
|
273
|
Daniel@0
|
274 % !!! MUSICALS only. Should not really have mo:duration in them either...
|
Daniel@0
|
275 txt_tag_triple(dml:rating,Content, _, ID, rdf(bldata:ID, dml:rating, literal(Content))) :- !.
|
Daniel@0
|
276 txt_tag_triple(mo:duration,Content, _, ID, rdf(bldata:ID, mo:duration, literal(type(xsd:float,Millis)))) :- !,
|
Daniel@0
|
277 insist(Content=[Dur],bad_content(Content,mo:duration)),
|
Daniel@0
|
278 insist(atom_number(Dur,Millis)).
|
Daniel@0
|
279 txt_tag_triple(blapsi:file_duration,Content, _, ID, rdf(bldata:ID, mo:duration, literal(type(xsd:float,Millis)))) :- !,
|
Daniel@0
|
280 insist(Content=[Dur],bad_content(Content,blapsi:file_duration)),
|
Daniel@0
|
281 % insist(parse_duration_millis(Dur,Millis)).
|
Daniel@0
|
282 ( parse_duration_millis(Dur,Millis) -> true
|
Daniel@0
|
283 ; warning(bad_duration(ID,Dur))
|
Daniel@0
|
284 ).
|
Daniel@0
|
285 txt_tag_triple(Tag,Content,_,ID,T) :- dmd_tag_triple(Tag,Content,bldata:ID,T).
|
Daniel@0
|
286
|
Daniel@0
|
287
|
Daniel@0
|
288 % --------------- Document meta data --------------------------
|
Daniel@0
|
289
|
Daniel@0
|
290 dmd_triple(DMD,URI,T) :-
|
Daniel@0
|
291 member(element(Tag,_,Content),DMD),
|
Daniel@0
|
292 dmd_tag_triple(Tag,Content,URI,T).
|
Daniel@0
|
293
|
Daniel@0
|
294 dmd_tag_triple(dcterms:contributor,Content,ID,T) :- !, dmd_triple(Content,bldata:ID,T).
|
Daniel@0
|
295 dmd_tag_triple(dc:contributor,Content,ID,T) :- !, dmd_triple(Content,bldata:ID,T).
|
Daniel@0
|
296 dmd_tag_triple(marcrel:REL,Content,URI,rdf(URI,marcrel:Rel,literal(Lit))) :- !,
|
Daniel@0
|
297 Content=[Lit],%empty_tag(marcrel:REL,Content)),
|
Daniel@0
|
298 downcase_atom(REL,Rel).
|
Daniel@0
|
299 dmd_tag_triple(Tag,Content,URI,rdf(URI,Tag,literal(Lit))) :- keep_tag(Tag), !, Content=[Lit].
|
Daniel@0
|
300 dmd_tag_triple(Tag,_,_,_) :- ignore_tag(Tag), !, fail.
|
Daniel@0
|
301 dmd_tag_triple(Tag,_Content,URI,_) :- warning(unrecognised_tag(Tag,dmd,URI)).
|
Daniel@0
|
302
|
Daniel@0
|
303 % !!!FIXME - sometimes dates are given in D/M/Y instead of Y-M-D
|
Daniel@0
|
304 keep_tag(dc:title).
|
Daniel@0
|
305 keep_tag(dc:description).
|
Daniel@0
|
306 keep_tag(dc:source).
|
Daniel@0
|
307 keep_tag(dc:subject).
|
Daniel@0
|
308 keep_tag(dc:language).
|
Daniel@0
|
309 keep_tag(dc:created).
|
Daniel@0
|
310 keep_tag(dcterms:language).
|
Daniel@0
|
311 keep_tag(dcterms:abstract).
|
Daniel@0
|
312 keep_tag(dcterms:created).
|
Daniel@0
|
313 keep_tag(dcterms:spatial).
|
Daniel@0
|
314 keep_tag(dcterms:temporal).
|
Daniel@0
|
315 keep_tag(dcterms:extent). % !!!FIXME need to parse this
|
Daniel@0
|
316 keep_tag(blterms:mechanism).
|
Daniel@0
|
317 keep_tag(dcterms:isPartOf).
|
Daniel@0
|
318 keep_tag(blapsi:format).
|
Daniel@0
|
319
|
Daniel@0
|
320 ignore_tag(dc:identifier).
|
Daniel@0
|
321 ignore_tag(blapsi:marker).
|
Daniel@0
|
322 ignore_tag(dc:rights).
|
Daniel@0
|
323 ignore_tag(dc:type).
|
Daniel@0
|
324 ignore_tag(rdf:about).
|
Daniel@0
|
325 ignore_tag('ARK').
|
Daniel@0
|
326
|
Daniel@0
|
327 % ------------------------- ADMINISTRATIVE METADATA SECTION -----------------------------
|
Daniel@0
|
328
|
Daniel@0
|
329 amd_triple(mets:sourceMD,_,SMDContent,ID,T) :-
|
Daniel@0
|
330 insist(unwrap(SMDContent,_,XMLContent),no_xml_content(SMDContent,smd)),
|
Daniel@0
|
331 atom_concat(ID,'#source',Src),
|
Daniel@0
|
332 ( T=rdf(bldata:ID,dml:source,bldata:Src)
|
Daniel@0
|
333 ; insist(multi,member(element(Tag,Attr,Content),XMLContent),no_xml_content(smd)),
|
Daniel@0
|
334 smd_xml_triple(Tag,Attr,Content,bldata:Src,T)
|
Daniel@0
|
335 ).
|
Daniel@0
|
336
|
Daniel@0
|
337 amd_triple(mets:techMD,Attr,TMDContent,ID,T) :-
|
Daniel@0
|
338 insist(member('ID'=TMDId,Attr)),
|
Daniel@0
|
339 unwrap(TMDContent,_,XMLContent),
|
Daniel@0
|
340 ( T=rdf(bldata:ID/TMDId, mo:sampled_version_of, bldata:ID)
|
Daniel@0
|
341 % ; T=rdf(bldata:ID/TMDId, dml:annotation,literal(Label)), member('LABEL'=Label,TMDAttr)
|
Daniel@0
|
342 ; member(element(Tag,_,Content),XMLContent),
|
Daniel@0
|
343 blapsi_triple(Tag, Content, bldata:ID/TMDId, T)
|
Daniel@0
|
344 ).
|
Daniel@0
|
345
|
Daniel@0
|
346 blapsi_triple(blapsi:Tag, [Text], Signal, rdf(Signal, Pred, literal(Lit))) :-
|
Daniel@0
|
347 insist(blapsi_info(Tag, Text, Pred, Lit)).
|
Daniel@0
|
348
|
Daniel@0
|
349 % ------------ Source ---------------
|
Daniel@0
|
350
|
Daniel@0
|
351 smd_xml_triple(blapsi:audioObject,Attr,AOContent,SrcURI,rdf(SrcURI,Pred,literal(Lit))) :- !,
|
Daniel@0
|
352 ( member(A=Lit,Attr), A\='ID', Pred=bldata:A
|
Daniel@0
|
353 ; insist(multi,member(element(Tag,Attr1,Content),AOContent),no_content(blapsi:audioObject,AOContent)),
|
Daniel@0
|
354 ao_tag_info(Tag,Attr1,Content,Pred,Lit)
|
Daniel@0
|
355 ).
|
Daniel@0
|
356
|
Daniel@0
|
357 smd_xml_triple(blapsi:Tag,Content,SrcURI,rdf(SrcURI,blapsi:Tag,literal(Lit))) :- !,
|
Daniel@0
|
358 insist(Content=[Lit],bad_content(blapsi:Tag,Content,smd_xml_triple)).
|
Daniel@0
|
359
|
Daniel@0
|
360 ao_tag_info(blapsi:primaryIdentifier,_,_,_,_) :- !, fail.
|
Daniel@0
|
361 ao_tag_info(blapsi:secondaryIdentifier,_,_,_,_) :- !, fail.
|
Daniel@0
|
362 ao_tag_info(blapsi:format,_,Content,blapsi:format,Lit) :- !,
|
Daniel@0
|
363 insist(Content=[Lit],bad_content(Content,blapsi:format)).
|
Daniel@0
|
364 ao_tag_info(blapsi:face, Attr, Content, Pred, Lit) :- !,
|
Daniel@0
|
365 %insist(member('ID'=ID,Attr)),
|
Daniel@0
|
366 insist(member('label'=Label,Attr)),
|
Daniel@0
|
367 insist(Content=[],non_empty_content(blapsi:face,Content)),
|
Daniel@0
|
368 ( fail % Pred=bldata:face_id, Lit=ID IGNORE FOR NOW
|
Daniel@0
|
369 ; Pred=bldata:face_label, Lit=Label
|
Daniel@0
|
370 ).
|
Daniel@0
|
371 ao_tag_info(blapsi:physicalProperties, _, PPContent, Tag, Lit) :- !,
|
Daniel@0
|
372 insist(multi,member(element(Tag,_,Content),PPContent),no_content(blapsi:physicalProperties)),
|
Daniel@0
|
373 insist(Content=[Lit],bad_content(Tag,Content)).
|
Daniel@0
|
374 ao_tag_info(Tag,_,_,_,_) :-
|
Daniel@0
|
375 warning(unrecognised_tag(Tag,blapsi:audioObject)).
|
Daniel@0
|
376
|
Daniel@0
|
377 % identifier_pred('ASR Root ID',asr_root_id).
|
Daniel@0
|
378 % identifier_pred('Sound Archive accession number',accession_number).
|
Daniel@0
|
379
|
Daniel@0
|
380
|
Daniel@0
|
381
|
Daniel@0
|
382 % -------------- FILE SECTION ---------------------------
|
Daniel@0
|
383
|
Daniel@0
|
384 file_triple(mets:'FLocat',Attr,LocContent,GAttr-FAttr,Dirs,ID,T) :- !,
|
Daniel@0
|
385 ( member('MIMETYPE'=MimeType,FAttr)
|
Daniel@0
|
386 -> audio_mimetype(MimeType),
|
Daniel@0
|
387 insist(member('AMDID'=TMDId1,FAttr)),
|
Daniel@0
|
388 insist(member('LOCTYPE'='URL',Attr)),
|
Daniel@0
|
389 insist(member((xlink:href)=Link,Attr)),
|
Daniel@0
|
390 insist(LocContent=[],non_empty(mets:'FLocat',LocContent)),
|
Daniel@0
|
391 insist(member('USE'=Use1,FAttr);member('USE'=Use1,GAttr)),
|
Daniel@0
|
392 % TMDId1 is sometimes "techMDxx digiprovXX" - need to get rid of second word
|
Daniel@0
|
393 atomic_list_concat([TMDId|_],' ',TMDId1),
|
Daniel@0
|
394 downcase_atom(Use1,Use),
|
Daniel@0
|
395 ( T=rdf(bldata:ID/TMDId,dml:mimetype,literal(MimeType))
|
Daniel@0
|
396 ; file_path_triple(bldata:ID/TMDId,Dirs,Link,T)
|
Daniel@0
|
397 ; T=rdf(bldata:ID/TMDId,bldata:use,literal(Use))
|
Daniel@0
|
398 )
|
Daniel@0
|
399 ; insist(\+member('AMDID'=_,FAttr)),
|
Daniel@0
|
400 insist(member('ID'=FileID,FAttr)),
|
Daniel@0
|
401 insist(member('LOCTYPE'='URL',Attr)),
|
Daniel@0
|
402 insist(member((xlink:href)=Link,Attr)),
|
Daniel@0
|
403 sub_atom(Link,_,3,0,Ext),
|
Daniel@0
|
404 audio_extension(Ext),
|
Daniel@0
|
405 ( T=rdf(bldata:ID/FileID,mo:sampled_version_of,bldata:ID)
|
Daniel@0
|
406 ; file_path_triple(bldata:ID/FileID,Dirs,Link,T)
|
Daniel@0
|
407 ; member(element(Tag,_,Content),LocContent),
|
Daniel@0
|
408 blapsi_triple(Tag,Content,bldata:ID/FileID,T)
|
Daniel@0
|
409 )
|
Daniel@0
|
410 ).
|
Daniel@0
|
411
|
Daniel@0
|
412 file_triple(mets:'Fcontent',_,_,_,_,_,_) :- !, fail.
|
Daniel@0
|
413 file_triple(Tag,_,_,_,_,_,_) :- warning(unrecognised_tag(Tag,file)).
|
Daniel@0
|
414
|
Daniel@0
|
415 file_path_triple(URI,Dirs,Link,rdf(URI,bldata:path,literal(Path))) :-
|
Daniel@0
|
416 atomics_to_string(Parts,"/",Link),
|
Daniel@0
|
417 atomics_to_string(Dirs,"/",Dir),
|
Daniel@0
|
418 append(_,[Name],Parts),
|
Daniel@0
|
419 directory_file_path(Dir,Name,Path).
|
Daniel@0
|
420
|
Daniel@0
|
421 audio_mimetype('sound/wav').
|
Daniel@0
|
422 audio_mimetype('sound/wma').
|
Daniel@0
|
423 audio_mimetype('sound/mp3').
|
Daniel@0
|
424 audio_mimetype('sound/ogg').
|
Daniel@0
|
425
|
Daniel@0
|
426 audio_extension(wav).
|
Daniel@0
|
427 audio_extension(mp3).
|
Daniel@0
|
428 audio_extension(wma).
|
Daniel@0
|
429
|
Daniel@0
|
430 % ---------------------------- BLAPSI INFO ---------------------------------
|
Daniel@0
|
431
|
Daniel@0
|
432 blapsi_info(file_sample, X, mo:sample_rate, type(xsd:float,SampleRate)) :- atom_number(X,SampleRate).
|
Daniel@0
|
433 blapsi_info(file_resolution, X, mo:bitsPerSample, type(xsd:int,Bits)) :- atom_number(X,Bits).
|
Daniel@0
|
434 blapsi_info(resolution, X, mo:bitsPerSample, type(xsd:int,Bits)) :- atom_number(X,Bits).
|
Daniel@0
|
435 blapsi_info(file_channels, X, mo:channels, type(xsd:int,Channels)) :- atom_number(X,Channels).
|
Daniel@0
|
436 blapsi_info(file_duration, X, mo:duration, type(xsd:float,Millis)) :- parse_duration_millis(X,Millis).
|
Daniel@0
|
437 blapsi_info(file_bitrate, X, blapsi:file_bitrate, X). % !!! FIXME should be attached to file, not to signal
|
Daniel@0
|
438 blapsi_info(file_size, X, blapsi:file_size, type(xsd:int,Size)) :- atom_number(X,Size).
|
Daniel@0
|
439 blapsi_info(file_length, X, blapsi:file_length, type(xsd:int,Size)) :- atom_number(X,Size).
|
Daniel@0
|
440
|
Daniel@0
|
441 % --------------------------- SUPPORTING PREDICATES --------------------------
|
Daniel@0
|
442
|
Daniel@0
|
443 :- use_module(library(async)).
|
Daniel@0
|
444 :- public scrape_all/2.
|
Daniel@0
|
445 :- volatile_memo scrape_all(+options:list,-count:nonneg).
|
Daniel@0
|
446 scrape_all(Opts,Count) :-
|
Daniel@0
|
447 option(spacing(Sleep),Opts,1),
|
Daniel@0
|
448 option(timeout(Timeout),Opts,10),
|
Daniel@0
|
449 findall( R, rdf(R,dml:blpage,_), Rs),
|
Daniel@0
|
450 with_progress_stack(map_with_progress(scrape_then_sleep(Sleep,Timeout),Rs,Ss)),
|
Daniel@0
|
451 exclude(=(ok),Ss, Failures),
|
Daniel@0
|
452 (Failures=[] -> length(Rs,Count); throw(scrape_errors(Failures))).
|
Daniel@0
|
453
|
Daniel@0
|
454 scrape_then_sleep(D,T,R,Status) :-
|
Daniel@0
|
455 ( audio_link(R,_) -> Status=ok
|
Daniel@0
|
456 ; memo:reify(bl_p2r:slow(D,call_with_time_limit(T,scrape_audio_link(R,_))),Status),
|
Daniel@0
|
457 (Status=ex(abort(Reason)) -> throw(abort(Reason)); true)
|
Daniel@0
|
458 ).
|
Daniel@0
|
459
|
Daniel@0
|
460 slow(Delay,Goal) :- call(Goal), sleep(Delay).
|
Daniel@0
|
461
|
Daniel@0
|
462 %% has_available_audio(+R:uri) is semidet.
|
Daniel@0
|
463 %% has_available_audio(-R:uri) is nondet.
|
Daniel@0
|
464 % True when R is a recording in the BL collection whose audio is
|
Daniel@0
|
465 % publicly available.
|
Daniel@0
|
466 has_available_audio(R) :-
|
Daniel@0
|
467 rdf(R,dml:blpage,_),
|
Daniel@0
|
468 scrape_audio_link(R,_).
|
Daniel@0
|
469
|
Daniel@0
|
470 :- public audio_link/2.
|
Daniel@0
|
471 audio_link(URI,AudioURL) :-
|
Daniel@0
|
472 browse(scrape_audio_link(URI,AudioURL)).
|
Daniel@0
|
473
|
Daniel@0
|
474 :- public audio_file/3.
|
Daniel@0
|
475 audio_file(URI,Path,just(mp3)) :-
|
Daniel@0
|
476 setting(audio_root,just(Root)),
|
Daniel@0
|
477 ( rdf(URI,bldata:path,literal(RelPath)),
|
Daniel@0
|
478 rdf(URI,rdf:type,mo:'Signal')
|
Daniel@0
|
479 ; rdf(URI2,mo:sampled_version_of,URI),
|
Daniel@0
|
480 rdf(URI2,dml:mimetype,literal('sound/mp3')),
|
Daniel@0
|
481 rdf(URI2,bldata:path,literal(RelPath))
|
Daniel@0
|
482 ),
|
Daniel@0
|
483 atomic_list_concat([Root,RelPath],'/',Path).
|
Daniel@0
|
484
|
Daniel@0
|
485 :- volatile_memo scrape_audio_link(+atom,-atom).
|
Daniel@0
|
486
|
Daniel@0
|
487 scrape_audio_link(URI,AudioURL) :-
|
Daniel@0
|
488 rdf(URI,dml:blpage,PageURL),
|
Daniel@0
|
489 debug(bl_p2r,'Scraping audio link for ~w...',[URI]),
|
Daniel@0
|
490 atom_concat('http://sounds.bl.uk/',_,PageURL),
|
Daniel@0
|
491 http_get(PageURL,Doc,[]),
|
Daniel@0
|
492 xpath(Doc,//li(@class=mainTrack)/a(@id),ID),
|
Daniel@0
|
493 string_concat("MNT-",Key,ID),
|
Daniel@0
|
494 string_concat('http://sounds.bl.uk/GT/',Key,AudioURL).
|
Daniel@0
|
495
|
Daniel@0
|
496 sandbox:safe_primitive(bl_p2r:audio_link(_,_)).
|
Daniel@0
|
497 sandbox:safe_primitive(bl_p2r:scrape_audio_link(_,_)).
|
Daniel@0
|
498
|
Daniel@0
|
499 xpath(Prop,E,Path,Val) :- xpath(E,Path,I), xpath(I,/self(Prop),Val).
|
Daniel@0
|
500
|
Daniel@0
|
501 warning(Term) :- nl, print_message(warning,Term), fail.
|