annotate cpack/dml/lib/bl_p2r.pl @ 0:718306e29690 tip

commiting public release
author Daniel Wolff
date Tue, 09 Feb 2016 21:05:06 +0100
parents
children
rev   line source
Daniel@0 1 /* Part of DML (Digital Music Laboratory)
Daniel@0 2 Copyright 2014-2015 Samer Abdallah, University of London
Daniel@0 3
Daniel@0 4 This program is free software; you can redistribute it and/or
Daniel@0 5 modify it under the terms of the GNU General Public License
Daniel@0 6 as published by the Free Software Foundation; either version 2
Daniel@0 7 of the License, or (at your option) any later version.
Daniel@0 8
Daniel@0 9 This program is distributed in the hope that it will be useful,
Daniel@0 10 but WITHOUT ANY WARRANTY; without even the implied warranty of
Daniel@0 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Daniel@0 12 GNU General Public License for more details.
Daniel@0 13
Daniel@0 14 You should have received a copy of the GNU General Public
Daniel@0 15 License along with this library; if not, write to the Free Software
Daniel@0 16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Daniel@0 17 */
Daniel@0 18
Daniel@0 19 :- module(bl_p2r,[ audio_file/3, audio_link/2, scrape_audio_link/2, has_available_audio/1 ]).
Daniel@0 20
Daniel@0 21 :- use_module(library(semweb/rdf_db)).
Daniel@0 22 :- use_module(library(xmlarchive)).
Daniel@0 23 :- use_module(library(xpath)).
Daniel@0 24 :- use_module(library(settings)).
Daniel@0 25 :- use_module(library(sandbox)).
Daniel@0 26 :- use_module(library(fileutils)).
Daniel@0 27 :- use_module(library(termutils)).
Daniel@0 28 :- use_module(library(rdfutils)).
Daniel@0 29 :- use_module(library(insist)).
Daniel@0 30 :- use_module(library(memo)).
Daniel@0 31 :- use_module(entailment(p2r)).
Daniel@0 32 :- use_module(cliopatria(hooks)).
Daniel@0 33 :- use_module(library(http/http_client)).
Daniel@0 34
Daniel@0 35 :- set_prolog_flag(double_quotes,string).
Daniel@0 36
Daniel@0 37 /*
Daniel@0 38 METS to RDF conversion.
Daniel@0 39
Daniel@0 40 Useful info:
Daniel@0 41 0. Each top-level mets:mets element contains several sections.
Daniel@0 42 1,4 x dmdSec .. 1 x mdWrap
Daniel@0 43 1 x amdSec .. [ N x techMD, {0,1} x rightsMD, {0,1} x sourceMD ]
Daniel@0 44 1 x fileSec .. 1-4 x fileGrp .. 1-36 x file
Daniel@0 45 1 x structLink .. {1,3} x smLink
Daniel@0 46 2-3 x structMap .. 1 x div
Daniel@0 47
Daniel@0 48 1. Each dmdSec only ever contains one mdWrap element
Daniel@0 49 5. each techMD or sourceMD contains exactly 1 mdWrap
Daniel@0 50 6. each rightsMD contains an mdRef
Daniel@0 51 7. The rightsMD is not interesting to look at.
Daniel@0 52 8. dmdSec mdWraps can be empty
Daniel@0 53
Daniel@0 54 List of all dmdSec tags in dataset
Daniel@0 55 blapsi:id3album
Daniel@0 56 blapsi:id3artist
Daniel@0 57 blapsi:id3comment
Daniel@0 58 blapsi:id3songtitle
Daniel@0 59 blapsi:id3year
Daniel@0 60 dc:description
Daniel@0 61 dc:identifier
Daniel@0 62 dc:language
Daniel@0 63 dc:rights
Daniel@0 64 dc:source
Daniel@0 65 dc:subject
Daniel@0 66 dc:title
Daniel@0 67 dc:type
Daniel@0 68 dcterms:created
Daniel@0 69 dcterms:isPartOf
Daniel@0 70 dcterms:spatial
Daniel@0 71 dcterms:temporal
Daniel@0 72 marcrel:CMP
Daniel@0 73 marcrel:CND
Daniel@0 74 marcrel:IVE
Daniel@0 75 marcrel:IVR
Daniel@0 76 marcrel:LYR
Daniel@0 77 marcrel:PRF
Daniel@0 78 marcrel:RCE
Daniel@0 79 marcrel:SPK
Daniel@0 80 mods:accessCondition
Daniel@0 81 mods:identifier
Daniel@0 82 mods:name
Daniel@0 83 mods:recordInfo
Daniel@0 84 mods:titleInfo
Daniel@0 85
Daniel@0 86 amdSec tags
Daniel@0 87 blapsi:audioObject
Daniel@0 88 blapsi:file_bitrate
Daniel@0 89 blapsi:file_channels
Daniel@0 90 blapsi:file_duration
Daniel@0 91 blapsi:file_sample
Daniel@0 92 blapsi:resolution
Daniel@0 93
Daniel@0 94 sourceMD .. blapsi:audioObject
Daniel@0 95 blapsi:face
Daniel@0 96 blapsi:format
Daniel@0 97 blapsi:physicalProperties
Daniel@0 98 blapsi:face
Daniel@0 99 blapsi:format
Daniel@0 100 blapsi:physicalProperties
Daniel@0 101 blapsi:primaryIdentifier
Daniel@0 102 blapsi:secondaryIdentifier
Daniel@0 103 blapsi:primaryIdentifier
Daniel@0 104 blapsi:secondaryIdentifier
Daniel@0 105
Daniel@0 106 would prefer to have mo predicates to foaf:Person resources for these.
Daniel@0 107 marcrel:'CMP',dml:composer).
Daniel@0 108 marcrel:'CND',dml:conductor).
Daniel@0 109 marcrel:'PRF',dml:performer).
Daniel@0 110
Daniel@0 111 */
Daniel@0 112
Daniel@0 113 :- setting(audio_root,ground,nothing,'Root directory of audio file collection').
Daniel@0 114 :- setting(archive_pattern,atom,'~/lib/datasets/mets/BL_metadata_complete.7z','Pattern to match METS metadata files').
Daniel@0 115
Daniel@0 116 :- rdf_register_prefix(bldata,'http://sounds.bl.uk/resource/').
Daniel@0 117 :- rdf_register_prefix(marcrel,'http://id.loc.gov/vocabulary/relators/').
Daniel@0 118 :- rdf_register_prefix(blapsi,'http://sounds.bl.uk/blapsi#').
Daniel@0 119 :- rdf_register_prefix(blterms,'http://www.bl.uk/schemas/bibliographic/blterms#').
Daniel@0 120
Daniel@0 121 % https://code.google.com/p/libarchive/issues/detail?id=328&colspec=ID%20Type%20Status%20Priority%20Milestone%20OpSys%20Owner%20Summary
Daniel@0 122
Daniel@0 123 :- public import/0.
Daniel@0 124 import :- assert_all(bl_p2r).
Daniel@0 125
Daniel@0 126 rdf(dml:blpage,rdfs:range,foaf:'Document') <== true.
Daniel@0 127 rdf(dml:blpage,rdfs:subPropertyOf,foaf:page) <== true.
Daniel@0 128 rdf(S,P,O) <== bl_archive_triple(T), once(expand_triple(T,rdf(S,P,O))).
Daniel@0 129
Daniel@0 130 expand_triple(rdf(SS,PP,OO),rdf(S,P,O)) :-
Daniel@0 131 uripattern:pattern_uri(SS,S),
Daniel@0 132 uripattern:pattern_uri(PP,P),
Daniel@0 133 ( OO=literal(_) -> rdf_global_object(OO,O)
Daniel@0 134 ; uripattern:pattern_uri(OO,O)
Daniel@0 135 ).
Daniel@0 136
Daniel@0 137 %% import_directory(+Dir:atom, +Graph:atom) is det.
Daniel@0 138 %
Daniel@0 139 % Import contents of a given directory into a named RDF graph.
Daniel@0 140 % The directory must be a subdirectory of the directory named in the
Daniel@0 141 % bl_p2r:audio_root setting (whose value is either =|just(Dir)|= or
Daniel@0 142 % =|nothing|=).
Daniel@0 143 import_directory(Dir,Graph) :-
Daniel@0 144 forall( directory_triple(Dir,T), (once(expand_triple(T,rdf(S,P,O))), rdf_assert(S,P,O,Graph))).
Daniel@0 145
Daniel@0 146 directory_triple(Dir,T) :-
Daniel@0 147 Ext=txt,
Daniel@0 148 setting(audio_root,just(AudioRoot)),
Daniel@0 149 atom_concat(AudioRoot,'/',Prefix),
Daniel@0 150 find_files(under(AudioRoot/Dir),FullPath),
Daniel@0 151 atom_concat(Prefix,Path,FullPath),
Daniel@0 152 split_path(Path,Loc,Ext),
Daniel@0 153 with_stream(Str,open(FullPath,read,Str), ext_loc_stream_triple(Ext,Loc,Str,T)).
Daniel@0 154
Daniel@0 155
Daniel@0 156 %% bl_archive_triple(-T:rdf) is nondet.
Daniel@0 157 % This predicate generates triples from the metadata archive files whose names
Daniel@0 158 % match the pattern stored in the setting bl_p2r:archive_pattern.
Daniel@0 159 bl_archive_triple(T) :-
Daniel@0 160 setting(archive_pattern, ArchivePattern),
Daniel@0 161 find_files(like(ArchivePattern), Archive),
Daniel@0 162 with_archive_stream(Archive, Path, path_triple_stream(Path,T)).
Daniel@0 163
Daniel@0 164 path_triple_stream(Path,T,S) :-
Daniel@0 165 status('Importing ~s',[Path]),
Daniel@0 166 catch(( insist(split_path(Path,Loc,Ext)),
Daniel@0 167 ext_loc_stream_triple(Ext,Loc,S,T)
Daniel@0 168 ), Ex, (nl,print_message(warning,Ex),fail)).
Daniel@0 169
Daniel@0 170 split_path(Path,Dirs-Base,Ext) :-
Daniel@0 171 % split_string(Path,'/','',Parts),
Daniel@0 172 atomic_list_concat(Parts,'/',Path),
Daniel@0 173 exclude(ignore_dir,Parts,Parts1),
Daniel@0 174 append(Dirs,[Name],Parts1),
Daniel@0 175 sub_atom(Name,_,3,0,Ext), % NB this assumes three character extension
Daniel@0 176 sub_atom(Name,0,_,4,Base).
Daniel@0 177
Daniel@0 178 %% ext_loc_stream_triple(+Ext:atom, +Loc:pair(list(atom), atom), +S:stream, -T:rdf) is nondet.
Daniel@0 179 %
Daniel@0 180 % This predicate succeeds once for each RDF triple that can be derived from
Daniel@0 181 % an archive stream whose path in the archive Loc=Dirs-Base consist of the directories
Daniel@0 182 % Dirs and whose name is Base,'.',Ext. It understands entries with extensions
Daniel@0 183 % xml and txt. XML streams are decoded as full METS documents. TXT streams are decoded
Daniel@0 184 % as the DMD section of a METS document. Other extensions generate a warning and then fail.
Daniel@0 185
Daniel@0 186 ext_loc_stream_triple(xml,Dirs-Base,S,T) :- !,
Daniel@0 187 insist(load_xmlns(S,Doc)),
Daniel@0 188 insist(member(element(mets:mets,_,METS),Doc)),
Daniel@0 189 ( insist(get_bl_url(Base,Dirs,URL)),
Daniel@0 190 T=rdf(bldata:Base,dml:blpage,URL)
Daniel@0 191 ; T=rdf(bldata:Base,rdf:type,mo:'Signal')
Daniel@0 192 ; insist(multi,member(element(Tag,_,Content),METS),no_content(mets,Dirs-Base)),
Daniel@0 193 mets_triple(Tag,Content,Dirs,Base,T)
Daniel@0 194 ).
Daniel@0 195
Daniel@0 196 ext_loc_stream_triple(txt,Dirs-Base,S,T) :- !,
Daniel@0 197 % TXT streams appear to be LATIN-1 encoded, not UTF-8
Daniel@0 198 Base\=combined, % exclude combined metadata files
Daniel@0 199 set_stream(S,encoding(iso_latin_1)),
Daniel@0 200 insist(load_xmlns(S,Doc)),
Daniel@0 201 txt_triple(Doc,Dirs,Base,T).
Daniel@0 202
Daniel@0 203 ext_loc_stream_triple(Ext,Dirs-Base,_,_) :-
Daniel@0 204 warning(unrecognised_extension(Ext,Dirs-Base)).
Daniel@0 205
Daniel@0 206
Daniel@0 207 %% get_bl_url(+Name:atom,+Dirs:list(atom),-URL:atom) is det.
Daniel@0 208 % Deduces the sounds.bl.uk URL from entry name and directory.
Daniel@0 209 get_bl_url(Name,Dirs,URL) :-
Daniel@0 210 sub_atom(Name,0,4,_,CatCode),
Daniel@0 211 category(CatCode,Category),
Daniel@0 212 append(_,[Collection],Dirs),
Daniel@0 213 atomic_list_concat([ 'http://sounds.bl.uk',
Daniel@0 214 Category,Collection,Name],'/',URL).
Daniel@0 215
Daniel@0 216 category('021M','Oral-history').
Daniel@0 217 category('025M','World-and-traditional-music').
Daniel@0 218 category('023M','Jazz-and-popular-music').
Daniel@0 219 category('026M','Classical-music').
Daniel@0 220 category('028M','Jazz-and-popular-music').
Daniel@0 221 category('020A','Classical-music').
Daniel@0 222
Daniel@0 223 ignore_dir('_Metadata').
Daniel@0 224 ignore_dir('_Non-music').
Daniel@0 225 ignore_dir('_Audio_Metadata').
Daniel@0 226
Daniel@0 227 % --------------------- TOP LEVEL STRUCTURE -------------------------------
Daniel@0 228
Daniel@0 229 unwrap([element(mets:mdWrap,MDAttr,[element(mets:xmlData,_,XMLContent)])],MDAttr,XMLContent).
Daniel@0 230
Daniel@0 231 % for complete METS documents
Daniel@0 232 mets_triple(mets:dmdSec,DMDContent,_,ID,T) :- !,
Daniel@0 233 unwrap(DMDContent,MDAttr,XMLContent),
Daniel@0 234 member('MDTYPE'='DC',MDAttr),
Daniel@0 235 dmd_triple(XMLContent,bldata:ID,T).
Daniel@0 236
Daniel@0 237 mets_triple(mets:amdSec,AMDContent,_,ID,T) :- !,
Daniel@0 238 member(element(Tag,Attr,Content),AMDContent),
Daniel@0 239 amd_triple(Tag,Attr,Content,ID,T).
Daniel@0 240
Daniel@0 241 mets_triple(mets:fileSec,FileSecContent,Dirs,ID,T) :- !,
Daniel@0 242 member(element(T1,GAttr,GroupContent),FileSecContent), insist(T1=mets:fileGrp),
Daniel@0 243 member(element(T2,FAttr,FileContent),GroupContent), insist(T2=mets:file),
Daniel@0 244 \+member('ID'='WEBRESOURCES',GAttr),
Daniel@0 245 FileContent=[element(Tag,Attr,Content)],
Daniel@0 246 file_triple(Tag,Attr,Content,GAttr-FAttr,Dirs,ID,T).
Daniel@0 247
Daniel@0 248 mets_triple(mets:metsHdr,_,_,_,_) :- !, fail.
Daniel@0 249 mets_triple(mets:structLink,_,_,_,_) :- !, fail.
Daniel@0 250 mets_triple(mets:structMap,_,_,_,_) :- !, fail.
Daniel@0 251 mets_triple(Tag,_,_,_,_) :- warning(unrecognised_tag(Tag,mets:mets)).
Daniel@0 252
Daniel@0 253 % for txt, partial XML documents
Daniel@0 254 txt_triple(_,_,ID,rdf(bldata:ID,rdf:type,mo:'Signal')).
Daniel@0 255 txt_triple(Doc,Dirs,ID,T) :-
Daniel@0 256 insist(multi,member(element(Tag,_,Content),Doc),no_content(txt)),
Daniel@0 257 txt_tag_triple(Tag,Content,Dirs,ID,T).
Daniel@0 258
Daniel@0 259 identifier_file_ext(F,F1,mp3) :- sub_atom(F,Bef,_,_,'.mp3'), !, insist(sub_atom(F,0,Bef,_,F1)).
Daniel@0 260 identifier_file_ext(F,F1,wav) :- sub_atom(F,Bef,_,0,'.wav'), !, insist(sub_atom(F,0,Bef,_,F1)).
Daniel@0 261 identifier_file_ext(F,F1,m4a) :- sub_atom(F,Bef,_,0,'.m4a'), !, insist(sub_atom(F,0,Bef,_,F1)).
Daniel@0 262
Daniel@0 263 txt_tag_triple(dc:identifier, [F], Dirs, ID, rdf(bldata:ID, bldata:path, literal(Path))) :- !,
Daniel@0 264 % NB some of the txt files have the file name written twice. Hence I am going to discard
Daniel@0 265 % everything after the first dot. Relies on sub_atom returning matches starting from the beginning
Daniel@0 266 ( identifier_file_ext(F,F1,Ext)
Daniel@0 267 -> file_name_extension(F1,Ext,Name),
Daniel@0 268 atomics_to_string(Dirs,"/",Dir),
Daniel@0 269 directory_file_path(Dir,Name,Path)
Daniel@0 270 ; print_message(warning,txt_triple_identifier_fail(ID,F)),
Daniel@0 271 fail
Daniel@0 272 ).
Daniel@0 273
Daniel@0 274 % !!! MUSICALS only. Should not really have mo:duration in them either...
Daniel@0 275 txt_tag_triple(dml:rating,Content, _, ID, rdf(bldata:ID, dml:rating, literal(Content))) :- !.
Daniel@0 276 txt_tag_triple(mo:duration,Content, _, ID, rdf(bldata:ID, mo:duration, literal(type(xsd:float,Millis)))) :- !,
Daniel@0 277 insist(Content=[Dur],bad_content(Content,mo:duration)),
Daniel@0 278 insist(atom_number(Dur,Millis)).
Daniel@0 279 txt_tag_triple(blapsi:file_duration,Content, _, ID, rdf(bldata:ID, mo:duration, literal(type(xsd:float,Millis)))) :- !,
Daniel@0 280 insist(Content=[Dur],bad_content(Content,blapsi:file_duration)),
Daniel@0 281 % insist(parse_duration_millis(Dur,Millis)).
Daniel@0 282 ( parse_duration_millis(Dur,Millis) -> true
Daniel@0 283 ; warning(bad_duration(ID,Dur))
Daniel@0 284 ).
Daniel@0 285 txt_tag_triple(Tag,Content,_,ID,T) :- dmd_tag_triple(Tag,Content,bldata:ID,T).
Daniel@0 286
Daniel@0 287
Daniel@0 288 % --------------- Document meta data --------------------------
Daniel@0 289
Daniel@0 290 dmd_triple(DMD,URI,T) :-
Daniel@0 291 member(element(Tag,_,Content),DMD),
Daniel@0 292 dmd_tag_triple(Tag,Content,URI,T).
Daniel@0 293
Daniel@0 294 dmd_tag_triple(dcterms:contributor,Content,ID,T) :- !, dmd_triple(Content,bldata:ID,T).
Daniel@0 295 dmd_tag_triple(dc:contributor,Content,ID,T) :- !, dmd_triple(Content,bldata:ID,T).
Daniel@0 296 dmd_tag_triple(marcrel:REL,Content,URI,rdf(URI,marcrel:Rel,literal(Lit))) :- !,
Daniel@0 297 Content=[Lit],%empty_tag(marcrel:REL,Content)),
Daniel@0 298 downcase_atom(REL,Rel).
Daniel@0 299 dmd_tag_triple(Tag,Content,URI,rdf(URI,Tag,literal(Lit))) :- keep_tag(Tag), !, Content=[Lit].
Daniel@0 300 dmd_tag_triple(Tag,_,_,_) :- ignore_tag(Tag), !, fail.
Daniel@0 301 dmd_tag_triple(Tag,_Content,URI,_) :- warning(unrecognised_tag(Tag,dmd,URI)).
Daniel@0 302
Daniel@0 303 % !!!FIXME - sometimes dates are given in D/M/Y instead of Y-M-D
Daniel@0 304 keep_tag(dc:title).
Daniel@0 305 keep_tag(dc:description).
Daniel@0 306 keep_tag(dc:source).
Daniel@0 307 keep_tag(dc:subject).
Daniel@0 308 keep_tag(dc:language).
Daniel@0 309 keep_tag(dc:created).
Daniel@0 310 keep_tag(dcterms:language).
Daniel@0 311 keep_tag(dcterms:abstract).
Daniel@0 312 keep_tag(dcterms:created).
Daniel@0 313 keep_tag(dcterms:spatial).
Daniel@0 314 keep_tag(dcterms:temporal).
Daniel@0 315 keep_tag(dcterms:extent). % !!!FIXME need to parse this
Daniel@0 316 keep_tag(blterms:mechanism).
Daniel@0 317 keep_tag(dcterms:isPartOf).
Daniel@0 318 keep_tag(blapsi:format).
Daniel@0 319
Daniel@0 320 ignore_tag(dc:identifier).
Daniel@0 321 ignore_tag(blapsi:marker).
Daniel@0 322 ignore_tag(dc:rights).
Daniel@0 323 ignore_tag(dc:type).
Daniel@0 324 ignore_tag(rdf:about).
Daniel@0 325 ignore_tag('ARK').
Daniel@0 326
Daniel@0 327 % ------------------------- ADMINISTRATIVE METADATA SECTION -----------------------------
Daniel@0 328
Daniel@0 329 amd_triple(mets:sourceMD,_,SMDContent,ID,T) :-
Daniel@0 330 insist(unwrap(SMDContent,_,XMLContent),no_xml_content(SMDContent,smd)),
Daniel@0 331 atom_concat(ID,'#source',Src),
Daniel@0 332 ( T=rdf(bldata:ID,dml:source,bldata:Src)
Daniel@0 333 ; insist(multi,member(element(Tag,Attr,Content),XMLContent),no_xml_content(smd)),
Daniel@0 334 smd_xml_triple(Tag,Attr,Content,bldata:Src,T)
Daniel@0 335 ).
Daniel@0 336
Daniel@0 337 amd_triple(mets:techMD,Attr,TMDContent,ID,T) :-
Daniel@0 338 insist(member('ID'=TMDId,Attr)),
Daniel@0 339 unwrap(TMDContent,_,XMLContent),
Daniel@0 340 ( T=rdf(bldata:ID/TMDId, mo:sampled_version_of, bldata:ID)
Daniel@0 341 % ; T=rdf(bldata:ID/TMDId, dml:annotation,literal(Label)), member('LABEL'=Label,TMDAttr)
Daniel@0 342 ; member(element(Tag,_,Content),XMLContent),
Daniel@0 343 blapsi_triple(Tag, Content, bldata:ID/TMDId, T)
Daniel@0 344 ).
Daniel@0 345
Daniel@0 346 blapsi_triple(blapsi:Tag, [Text], Signal, rdf(Signal, Pred, literal(Lit))) :-
Daniel@0 347 insist(blapsi_info(Tag, Text, Pred, Lit)).
Daniel@0 348
Daniel@0 349 % ------------ Source ---------------
Daniel@0 350
Daniel@0 351 smd_xml_triple(blapsi:audioObject,Attr,AOContent,SrcURI,rdf(SrcURI,Pred,literal(Lit))) :- !,
Daniel@0 352 ( member(A=Lit,Attr), A\='ID', Pred=bldata:A
Daniel@0 353 ; insist(multi,member(element(Tag,Attr1,Content),AOContent),no_content(blapsi:audioObject,AOContent)),
Daniel@0 354 ao_tag_info(Tag,Attr1,Content,Pred,Lit)
Daniel@0 355 ).
Daniel@0 356
Daniel@0 357 smd_xml_triple(blapsi:Tag,Content,SrcURI,rdf(SrcURI,blapsi:Tag,literal(Lit))) :- !,
Daniel@0 358 insist(Content=[Lit],bad_content(blapsi:Tag,Content,smd_xml_triple)).
Daniel@0 359
Daniel@0 360 ao_tag_info(blapsi:primaryIdentifier,_,_,_,_) :- !, fail.
Daniel@0 361 ao_tag_info(blapsi:secondaryIdentifier,_,_,_,_) :- !, fail.
Daniel@0 362 ao_tag_info(blapsi:format,_,Content,blapsi:format,Lit) :- !,
Daniel@0 363 insist(Content=[Lit],bad_content(Content,blapsi:format)).
Daniel@0 364 ao_tag_info(blapsi:face, Attr, Content, Pred, Lit) :- !,
Daniel@0 365 %insist(member('ID'=ID,Attr)),
Daniel@0 366 insist(member('label'=Label,Attr)),
Daniel@0 367 insist(Content=[],non_empty_content(blapsi:face,Content)),
Daniel@0 368 ( fail % Pred=bldata:face_id, Lit=ID IGNORE FOR NOW
Daniel@0 369 ; Pred=bldata:face_label, Lit=Label
Daniel@0 370 ).
Daniel@0 371 ao_tag_info(blapsi:physicalProperties, _, PPContent, Tag, Lit) :- !,
Daniel@0 372 insist(multi,member(element(Tag,_,Content),PPContent),no_content(blapsi:physicalProperties)),
Daniel@0 373 insist(Content=[Lit],bad_content(Tag,Content)).
Daniel@0 374 ao_tag_info(Tag,_,_,_,_) :-
Daniel@0 375 warning(unrecognised_tag(Tag,blapsi:audioObject)).
Daniel@0 376
Daniel@0 377 % identifier_pred('ASR Root ID',asr_root_id).
Daniel@0 378 % identifier_pred('Sound Archive accession number',accession_number).
Daniel@0 379
Daniel@0 380
Daniel@0 381
Daniel@0 382 % -------------- FILE SECTION ---------------------------
Daniel@0 383
Daniel@0 384 file_triple(mets:'FLocat',Attr,LocContent,GAttr-FAttr,Dirs,ID,T) :- !,
Daniel@0 385 ( member('MIMETYPE'=MimeType,FAttr)
Daniel@0 386 -> audio_mimetype(MimeType),
Daniel@0 387 insist(member('AMDID'=TMDId1,FAttr)),
Daniel@0 388 insist(member('LOCTYPE'='URL',Attr)),
Daniel@0 389 insist(member((xlink:href)=Link,Attr)),
Daniel@0 390 insist(LocContent=[],non_empty(mets:'FLocat',LocContent)),
Daniel@0 391 insist(member('USE'=Use1,FAttr);member('USE'=Use1,GAttr)),
Daniel@0 392 % TMDId1 is sometimes "techMDxx digiprovXX" - need to get rid of second word
Daniel@0 393 atomic_list_concat([TMDId|_],' ',TMDId1),
Daniel@0 394 downcase_atom(Use1,Use),
Daniel@0 395 ( T=rdf(bldata:ID/TMDId,dml:mimetype,literal(MimeType))
Daniel@0 396 ; file_path_triple(bldata:ID/TMDId,Dirs,Link,T)
Daniel@0 397 ; T=rdf(bldata:ID/TMDId,bldata:use,literal(Use))
Daniel@0 398 )
Daniel@0 399 ; insist(\+member('AMDID'=_,FAttr)),
Daniel@0 400 insist(member('ID'=FileID,FAttr)),
Daniel@0 401 insist(member('LOCTYPE'='URL',Attr)),
Daniel@0 402 insist(member((xlink:href)=Link,Attr)),
Daniel@0 403 sub_atom(Link,_,3,0,Ext),
Daniel@0 404 audio_extension(Ext),
Daniel@0 405 ( T=rdf(bldata:ID/FileID,mo:sampled_version_of,bldata:ID)
Daniel@0 406 ; file_path_triple(bldata:ID/FileID,Dirs,Link,T)
Daniel@0 407 ; member(element(Tag,_,Content),LocContent),
Daniel@0 408 blapsi_triple(Tag,Content,bldata:ID/FileID,T)
Daniel@0 409 )
Daniel@0 410 ).
Daniel@0 411
Daniel@0 412 file_triple(mets:'Fcontent',_,_,_,_,_,_) :- !, fail.
Daniel@0 413 file_triple(Tag,_,_,_,_,_,_) :- warning(unrecognised_tag(Tag,file)).
Daniel@0 414
Daniel@0 415 file_path_triple(URI,Dirs,Link,rdf(URI,bldata:path,literal(Path))) :-
Daniel@0 416 atomics_to_string(Parts,"/",Link),
Daniel@0 417 atomics_to_string(Dirs,"/",Dir),
Daniel@0 418 append(_,[Name],Parts),
Daniel@0 419 directory_file_path(Dir,Name,Path).
Daniel@0 420
Daniel@0 421 audio_mimetype('sound/wav').
Daniel@0 422 audio_mimetype('sound/wma').
Daniel@0 423 audio_mimetype('sound/mp3').
Daniel@0 424 audio_mimetype('sound/ogg').
Daniel@0 425
Daniel@0 426 audio_extension(wav).
Daniel@0 427 audio_extension(mp3).
Daniel@0 428 audio_extension(wma).
Daniel@0 429
Daniel@0 430 % ---------------------------- BLAPSI INFO ---------------------------------
Daniel@0 431
Daniel@0 432 blapsi_info(file_sample, X, mo:sample_rate, type(xsd:float,SampleRate)) :- atom_number(X,SampleRate).
Daniel@0 433 blapsi_info(file_resolution, X, mo:bitsPerSample, type(xsd:int,Bits)) :- atom_number(X,Bits).
Daniel@0 434 blapsi_info(resolution, X, mo:bitsPerSample, type(xsd:int,Bits)) :- atom_number(X,Bits).
Daniel@0 435 blapsi_info(file_channels, X, mo:channels, type(xsd:int,Channels)) :- atom_number(X,Channels).
Daniel@0 436 blapsi_info(file_duration, X, mo:duration, type(xsd:float,Millis)) :- parse_duration_millis(X,Millis).
Daniel@0 437 blapsi_info(file_bitrate, X, blapsi:file_bitrate, X). % !!! FIXME should be attached to file, not to signal
Daniel@0 438 blapsi_info(file_size, X, blapsi:file_size, type(xsd:int,Size)) :- atom_number(X,Size).
Daniel@0 439 blapsi_info(file_length, X, blapsi:file_length, type(xsd:int,Size)) :- atom_number(X,Size).
Daniel@0 440
Daniel@0 441 % --------------------------- SUPPORTING PREDICATES --------------------------
Daniel@0 442
Daniel@0 443 :- use_module(library(async)).
Daniel@0 444 :- public scrape_all/2.
Daniel@0 445 :- volatile_memo scrape_all(+options:list,-count:nonneg).
Daniel@0 446 scrape_all(Opts,Count) :-
Daniel@0 447 option(spacing(Sleep),Opts,1),
Daniel@0 448 option(timeout(Timeout),Opts,10),
Daniel@0 449 findall( R, rdf(R,dml:blpage,_), Rs),
Daniel@0 450 with_progress_stack(map_with_progress(scrape_then_sleep(Sleep,Timeout),Rs,Ss)),
Daniel@0 451 exclude(=(ok),Ss, Failures),
Daniel@0 452 (Failures=[] -> length(Rs,Count); throw(scrape_errors(Failures))).
Daniel@0 453
Daniel@0 454 scrape_then_sleep(D,T,R,Status) :-
Daniel@0 455 ( audio_link(R,_) -> Status=ok
Daniel@0 456 ; memo:reify(bl_p2r:slow(D,call_with_time_limit(T,scrape_audio_link(R,_))),Status),
Daniel@0 457 (Status=ex(abort(Reason)) -> throw(abort(Reason)); true)
Daniel@0 458 ).
Daniel@0 459
Daniel@0 460 slow(Delay,Goal) :- call(Goal), sleep(Delay).
Daniel@0 461
Daniel@0 462 %% has_available_audio(+R:uri) is semidet.
Daniel@0 463 %% has_available_audio(-R:uri) is nondet.
Daniel@0 464 % True when R is a recording in the BL collection whose audio is
Daniel@0 465 % publicly available.
Daniel@0 466 has_available_audio(R) :-
Daniel@0 467 rdf(R,dml:blpage,_),
Daniel@0 468 scrape_audio_link(R,_).
Daniel@0 469
Daniel@0 470 :- public audio_link/2.
Daniel@0 471 audio_link(URI,AudioURL) :-
Daniel@0 472 browse(scrape_audio_link(URI,AudioURL)).
Daniel@0 473
Daniel@0 474 :- public audio_file/3.
Daniel@0 475 audio_file(URI,Path,just(mp3)) :-
Daniel@0 476 setting(audio_root,just(Root)),
Daniel@0 477 ( rdf(URI,bldata:path,literal(RelPath)),
Daniel@0 478 rdf(URI,rdf:type,mo:'Signal')
Daniel@0 479 ; rdf(URI2,mo:sampled_version_of,URI),
Daniel@0 480 rdf(URI2,dml:mimetype,literal('sound/mp3')),
Daniel@0 481 rdf(URI2,bldata:path,literal(RelPath))
Daniel@0 482 ),
Daniel@0 483 atomic_list_concat([Root,RelPath],'/',Path).
Daniel@0 484
Daniel@0 485 :- volatile_memo scrape_audio_link(+atom,-atom).
Daniel@0 486
Daniel@0 487 scrape_audio_link(URI,AudioURL) :-
Daniel@0 488 rdf(URI,dml:blpage,PageURL),
Daniel@0 489 debug(bl_p2r,'Scraping audio link for ~w...',[URI]),
Daniel@0 490 atom_concat('http://sounds.bl.uk/',_,PageURL),
Daniel@0 491 http_get(PageURL,Doc,[]),
Daniel@0 492 xpath(Doc,//li(@class=mainTrack)/a(@id),ID),
Daniel@0 493 string_concat("MNT-",Key,ID),
Daniel@0 494 string_concat('http://sounds.bl.uk/GT/',Key,AudioURL).
Daniel@0 495
Daniel@0 496 sandbox:safe_primitive(bl_p2r:audio_link(_,_)).
Daniel@0 497 sandbox:safe_primitive(bl_p2r:scrape_audio_link(_,_)).
Daniel@0 498
Daniel@0 499 xpath(Prop,E,Path,Val) :- xpath(E,Path,I), xpath(I,/self(Prop),Val).
Daniel@0 500
Daniel@0 501 warning(Term) :- nl, print_message(warning,Term), fail.