| Daniel@0 | 1 /* Part of DML (Digital Music Laboratory) | 
| Daniel@0 | 2 	Copyright 2014-2015 Samer Abdallah, University of London | 
| Daniel@0 | 3 | 
| Daniel@0 | 4 	This program is free software; you can redistribute it and/or | 
| Daniel@0 | 5 	modify it under the terms of the GNU General Public License | 
| Daniel@0 | 6 	as published by the Free Software Foundation; either version 2 | 
| Daniel@0 | 7 	of the License, or (at your option) any later version. | 
| Daniel@0 | 8 | 
| Daniel@0 | 9 	This program is distributed in the hope that it will be useful, | 
| Daniel@0 | 10 	but WITHOUT ANY WARRANTY; without even the implied warranty of | 
| Daniel@0 | 11 	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | 
| Daniel@0 | 12 	GNU General Public License for more details. | 
| Daniel@0 | 13 | 
| Daniel@0 | 14 	You should have received a copy of the GNU General Public | 
| Daniel@0 | 15 	License along with this library; if not, write to the Free Software | 
| Daniel@0 | 16 	Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA | 
| Daniel@0 | 17 */ | 
| Daniel@0 | 18 | 
| Daniel@0 | 19 :- module(bl_p2r,[ audio_file/3, audio_link/2, scrape_audio_link/2, has_available_audio/1 ]). | 
| Daniel@0 | 20 | 
| Daniel@0 | 21 :- use_module(library(semweb/rdf_db)). | 
| Daniel@0 | 22 :- use_module(library(xmlarchive)). | 
| Daniel@0 | 23 :- use_module(library(xpath)). | 
| Daniel@0 | 24 :- use_module(library(settings)). | 
| Daniel@0 | 25 :- use_module(library(sandbox)). | 
| Daniel@0 | 26 :- use_module(library(fileutils)). | 
| Daniel@0 | 27 :- use_module(library(termutils)). | 
| Daniel@0 | 28 :- use_module(library(rdfutils)). | 
| Daniel@0 | 29 :- use_module(library(insist)). | 
| Daniel@0 | 30 :- use_module(library(memo)). | 
| Daniel@0 | 31 :- use_module(entailment(p2r)). | 
| Daniel@0 | 32 :- use_module(cliopatria(hooks)). | 
| Daniel@0 | 33 :- use_module(library(http/http_client)). | 
| Daniel@0 | 34 | 
| Daniel@0 | 35 :- set_prolog_flag(double_quotes,string). | 
| Daniel@0 | 36 | 
| Daniel@0 | 37 /* | 
| Daniel@0 | 38    METS to RDF conversion. | 
| Daniel@0 | 39 | 
| Daniel@0 | 40    Useful info: | 
| Daniel@0 | 41    0. Each top-level mets:mets element contains several sections. | 
| Daniel@0 | 42       1,4 x dmdSec     .. 1 x mdWrap | 
| Daniel@0 | 43       1   x amdSec     .. [ N x techMD, {0,1} x rightsMD, {0,1} x sourceMD ] | 
| Daniel@0 | 44       1   x fileSec    .. 1-4 x fileGrp .. 1-36 x file | 
| Daniel@0 | 45       1   x structLink .. {1,3} x smLink | 
| Daniel@0 | 46       2-3 x structMap  .. 1 x div | 
| Daniel@0 | 47 | 
| Daniel@0 | 48    1. Each dmdSec only ever contains one mdWrap element | 
| Daniel@0 | 49    5. each techMD or sourceMD contains exactly 1 mdWrap | 
| Daniel@0 | 50    6. each rightsMD contains an mdRef | 
| Daniel@0 | 51    7. The rightsMD is not interesting to look at. | 
| Daniel@0 | 52    8. dmdSec mdWraps can be empty | 
| Daniel@0 | 53 | 
| Daniel@0 | 54    List of all dmdSec tags in dataset | 
| Daniel@0 | 55       blapsi:id3album | 
| Daniel@0 | 56       blapsi:id3artist | 
| Daniel@0 | 57       blapsi:id3comment | 
| Daniel@0 | 58       blapsi:id3songtitle | 
| Daniel@0 | 59       blapsi:id3year | 
| Daniel@0 | 60       dc:description | 
| Daniel@0 | 61       dc:identifier | 
| Daniel@0 | 62       dc:language | 
| Daniel@0 | 63       dc:rights | 
| Daniel@0 | 64       dc:source | 
| Daniel@0 | 65       dc:subject | 
| Daniel@0 | 66       dc:title | 
| Daniel@0 | 67       dc:type | 
| Daniel@0 | 68       dcterms:created | 
| Daniel@0 | 69       dcterms:isPartOf | 
| Daniel@0 | 70       dcterms:spatial | 
| Daniel@0 | 71       dcterms:temporal | 
| Daniel@0 | 72       marcrel:CMP | 
| Daniel@0 | 73       marcrel:CND | 
| Daniel@0 | 74       marcrel:IVE | 
| Daniel@0 | 75       marcrel:IVR | 
| Daniel@0 | 76       marcrel:LYR | 
| Daniel@0 | 77       marcrel:PRF | 
| Daniel@0 | 78       marcrel:RCE | 
| Daniel@0 | 79       marcrel:SPK | 
| Daniel@0 | 80       mods:accessCondition | 
| Daniel@0 | 81       mods:identifier | 
| Daniel@0 | 82       mods:name | 
| Daniel@0 | 83       mods:recordInfo | 
| Daniel@0 | 84       mods:titleInfo | 
| Daniel@0 | 85 | 
| Daniel@0 | 86    amdSec tags | 
| Daniel@0 | 87       blapsi:audioObject | 
| Daniel@0 | 88       blapsi:file_bitrate | 
| Daniel@0 | 89       blapsi:file_channels | 
| Daniel@0 | 90       blapsi:file_duration | 
| Daniel@0 | 91       blapsi:file_sample | 
| Daniel@0 | 92       blapsi:resolution | 
| Daniel@0 | 93 | 
| Daniel@0 | 94    sourceMD .. blapsi:audioObject | 
| Daniel@0 | 95       blapsi:face | 
| Daniel@0 | 96       blapsi:format | 
| Daniel@0 | 97       blapsi:physicalProperties | 
| Daniel@0 | 98          blapsi:face | 
| Daniel@0 | 99          blapsi:format | 
| Daniel@0 | 100          blapsi:physicalProperties | 
| Daniel@0 | 101          blapsi:primaryIdentifier | 
| Daniel@0 | 102          blapsi:secondaryIdentifier | 
| Daniel@0 | 103       blapsi:primaryIdentifier | 
| Daniel@0 | 104       blapsi:secondaryIdentifier | 
| Daniel@0 | 105 | 
| Daniel@0 | 106    would prefer to have mo predicates to foaf:Person resources for these. | 
| Daniel@0 | 107       marcrel:'CMP',dml:composer). | 
| Daniel@0 | 108       marcrel:'CND',dml:conductor). | 
| Daniel@0 | 109       marcrel:'PRF',dml:performer). | 
| Daniel@0 | 110 | 
| Daniel@0 | 111  */ | 
| Daniel@0 | 112 | 
| Daniel@0 | 113 :- setting(audio_root,ground,nothing,'Root directory of audio file collection'). | 
| Daniel@0 | 114 :- setting(archive_pattern,atom,'~/lib/datasets/mets/BL_metadata_complete.7z','Pattern to match METS metadata files'). | 
| Daniel@0 | 115 | 
| Daniel@0 | 116 :- rdf_register_prefix(bldata,'http://sounds.bl.uk/resource/'). | 
| Daniel@0 | 117 :- rdf_register_prefix(marcrel,'http://id.loc.gov/vocabulary/relators/'). | 
| Daniel@0 | 118 :- rdf_register_prefix(blapsi,'http://sounds.bl.uk/blapsi#'). | 
| Daniel@0 | 119 :- rdf_register_prefix(blterms,'http://www.bl.uk/schemas/bibliographic/blterms#'). | 
| Daniel@0 | 120 | 
| Daniel@0 | 121 % https://code.google.com/p/libarchive/issues/detail?id=328&colspec=ID%20Type%20Status%20Priority%20Milestone%20OpSys%20Owner%20Summary | 
| Daniel@0 | 122 | 
| Daniel@0 | 123 :- public import/0. | 
| Daniel@0 | 124 import :- assert_all(bl_p2r). | 
| Daniel@0 | 125 | 
| Daniel@0 | 126 rdf(dml:blpage,rdfs:range,foaf:'Document') <== true. | 
| Daniel@0 | 127 rdf(dml:blpage,rdfs:subPropertyOf,foaf:page) <== true. | 
| Daniel@0 | 128 rdf(S,P,O) <== bl_archive_triple(T), once(expand_triple(T,rdf(S,P,O))). | 
| Daniel@0 | 129 | 
| Daniel@0 | 130 expand_triple(rdf(SS,PP,OO),rdf(S,P,O)) :- | 
| Daniel@0 | 131    uripattern:pattern_uri(SS,S), | 
| Daniel@0 | 132    uripattern:pattern_uri(PP,P), | 
| Daniel@0 | 133    (  OO=literal(_) -> rdf_global_object(OO,O) | 
| Daniel@0 | 134    ;  uripattern:pattern_uri(OO,O) | 
| Daniel@0 | 135    ). | 
| Daniel@0 | 136 | 
| Daniel@0 | 137 %% import_directory(+Dir:atom, +Graph:atom) is det. | 
| Daniel@0 | 138 % | 
| Daniel@0 | 139 %  Import contents of a given directory into a named RDF graph. | 
| Daniel@0 | 140 %  The directory must be a subdirectory of the directory named in the | 
| Daniel@0 | 141 %  bl_p2r:audio_root setting (whose value is either =|just(Dir)|= or | 
| Daniel@0 | 142 %  =|nothing|=). | 
| Daniel@0 | 143 import_directory(Dir,Graph) :- | 
| Daniel@0 | 144    forall( directory_triple(Dir,T), (once(expand_triple(T,rdf(S,P,O))), rdf_assert(S,P,O,Graph))). | 
| Daniel@0 | 145 | 
| Daniel@0 | 146 directory_triple(Dir,T) :- | 
| Daniel@0 | 147    Ext=txt, | 
| Daniel@0 | 148    setting(audio_root,just(AudioRoot)), | 
| Daniel@0 | 149    atom_concat(AudioRoot,'/',Prefix), | 
| Daniel@0 | 150    find_files(under(AudioRoot/Dir),FullPath), | 
| Daniel@0 | 151    atom_concat(Prefix,Path,FullPath), | 
| Daniel@0 | 152    split_path(Path,Loc,Ext), | 
| Daniel@0 | 153    with_stream(Str,open(FullPath,read,Str), ext_loc_stream_triple(Ext,Loc,Str,T)). | 
| Daniel@0 | 154 | 
| Daniel@0 | 155 | 
| Daniel@0 | 156 %% bl_archive_triple(-T:rdf) is nondet. | 
| Daniel@0 | 157 %  This predicate generates triples from the metadata archive files whose names | 
| Daniel@0 | 158 %  match the pattern stored in the setting bl_p2r:archive_pattern. | 
| Daniel@0 | 159 bl_archive_triple(T) :- | 
| Daniel@0 | 160    setting(archive_pattern, ArchivePattern), | 
| Daniel@0 | 161    find_files(like(ArchivePattern), Archive), | 
| Daniel@0 | 162    with_archive_stream(Archive, Path, path_triple_stream(Path,T)). | 
| Daniel@0 | 163 | 
| Daniel@0 | 164 path_triple_stream(Path,T,S) :- | 
| Daniel@0 | 165   status('Importing ~s',[Path]), | 
| Daniel@0 | 166   catch(( insist(split_path(Path,Loc,Ext)), | 
| Daniel@0 | 167            ext_loc_stream_triple(Ext,Loc,S,T) | 
| Daniel@0 | 168         ), Ex, (nl,print_message(warning,Ex),fail)). | 
| Daniel@0 | 169 | 
| Daniel@0 | 170 split_path(Path,Dirs-Base,Ext) :- | 
| Daniel@0 | 171    % split_string(Path,'/','',Parts), | 
| Daniel@0 | 172    atomic_list_concat(Parts,'/',Path), | 
| Daniel@0 | 173    exclude(ignore_dir,Parts,Parts1), | 
| Daniel@0 | 174    append(Dirs,[Name],Parts1), | 
| Daniel@0 | 175    sub_atom(Name,_,3,0,Ext), % NB this assumes three character extension | 
| Daniel@0 | 176    sub_atom(Name,0,_,4,Base). | 
| Daniel@0 | 177 | 
| Daniel@0 | 178 %% ext_loc_stream_triple(+Ext:atom, +Loc:pair(list(atom), atom), +S:stream, -T:rdf) is nondet. | 
| Daniel@0 | 179 % | 
| Daniel@0 | 180 %  This predicate succeeds once for each RDF triple that can be derived from | 
| Daniel@0 | 181 %  an archive stream whose path in the archive Loc=Dirs-Base consist of the directories | 
| Daniel@0 | 182 %  Dirs and whose name is Base,'.',Ext. It understands entries with extensions | 
| Daniel@0 | 183 %  xml and txt. XML streams are decoded as full METS documents. TXT streams are decoded | 
| Daniel@0 | 184 %  as the DMD section of a METS document. Other extensions generate a warning and then fail. | 
| Daniel@0 | 185 | 
| Daniel@0 | 186 ext_loc_stream_triple(xml,Dirs-Base,S,T) :- !, | 
| Daniel@0 | 187    insist(load_xmlns(S,Doc)), | 
| Daniel@0 | 188    insist(member(element(mets:mets,_,METS),Doc)), | 
| Daniel@0 | 189    (  insist(get_bl_url(Base,Dirs,URL)), | 
| Daniel@0 | 190       T=rdf(bldata:Base,dml:blpage,URL) | 
| Daniel@0 | 191    ;  T=rdf(bldata:Base,rdf:type,mo:'Signal') | 
| Daniel@0 | 192    ;  insist(multi,member(element(Tag,_,Content),METS),no_content(mets,Dirs-Base)), | 
| Daniel@0 | 193       mets_triple(Tag,Content,Dirs,Base,T) | 
| Daniel@0 | 194    ). | 
| Daniel@0 | 195 | 
| Daniel@0 | 196 ext_loc_stream_triple(txt,Dirs-Base,S,T) :- !, | 
| Daniel@0 | 197    % TXT streams appear to be LATIN-1 encoded, not UTF-8 | 
| Daniel@0 | 198    Base\=combined, % exclude combined metadata files | 
| Daniel@0 | 199    set_stream(S,encoding(iso_latin_1)), | 
| Daniel@0 | 200    insist(load_xmlns(S,Doc)), | 
| Daniel@0 | 201    txt_triple(Doc,Dirs,Base,T). | 
| Daniel@0 | 202 | 
| Daniel@0 | 203 ext_loc_stream_triple(Ext,Dirs-Base,_,_) :- | 
| Daniel@0 | 204    warning(unrecognised_extension(Ext,Dirs-Base)). | 
| Daniel@0 | 205 | 
| Daniel@0 | 206 | 
| Daniel@0 | 207 %% get_bl_url(+Name:atom,+Dirs:list(atom),-URL:atom) is det. | 
| Daniel@0 | 208 %  Deduces the sounds.bl.uk URL from entry name and directory. | 
| Daniel@0 | 209 get_bl_url(Name,Dirs,URL) :- | 
| Daniel@0 | 210    sub_atom(Name,0,4,_,CatCode), | 
| Daniel@0 | 211    category(CatCode,Category), | 
| Daniel@0 | 212    append(_,[Collection],Dirs), | 
| Daniel@0 | 213    atomic_list_concat([ 'http://sounds.bl.uk', | 
| Daniel@0 | 214                         Category,Collection,Name],'/',URL). | 
| Daniel@0 | 215 | 
| Daniel@0 | 216 category('021M','Oral-history'). | 
| Daniel@0 | 217 category('025M','World-and-traditional-music'). | 
| Daniel@0 | 218 category('023M','Jazz-and-popular-music'). | 
| Daniel@0 | 219 category('026M','Classical-music'). | 
| Daniel@0 | 220 category('028M','Jazz-and-popular-music'). | 
| Daniel@0 | 221 category('020A','Classical-music'). | 
| Daniel@0 | 222 | 
| Daniel@0 | 223 ignore_dir('_Metadata'). | 
| Daniel@0 | 224 ignore_dir('_Non-music'). | 
| Daniel@0 | 225 ignore_dir('_Audio_Metadata'). | 
| Daniel@0 | 226 | 
| Daniel@0 | 227 % --------------------- TOP LEVEL STRUCTURE ------------------------------- | 
| Daniel@0 | 228 | 
| Daniel@0 | 229 unwrap([element(mets:mdWrap,MDAttr,[element(mets:xmlData,_,XMLContent)])],MDAttr,XMLContent). | 
| Daniel@0 | 230 | 
| Daniel@0 | 231 % for complete METS documents | 
| Daniel@0 | 232 mets_triple(mets:dmdSec,DMDContent,_,ID,T) :- !, | 
| Daniel@0 | 233    unwrap(DMDContent,MDAttr,XMLContent), | 
| Daniel@0 | 234    member('MDTYPE'='DC',MDAttr), | 
| Daniel@0 | 235    dmd_triple(XMLContent,bldata:ID,T). | 
| Daniel@0 | 236 | 
| Daniel@0 | 237 mets_triple(mets:amdSec,AMDContent,_,ID,T) :- !, | 
| Daniel@0 | 238    member(element(Tag,Attr,Content),AMDContent), | 
| Daniel@0 | 239    amd_triple(Tag,Attr,Content,ID,T). | 
| Daniel@0 | 240 | 
| Daniel@0 | 241 mets_triple(mets:fileSec,FileSecContent,Dirs,ID,T) :- !, | 
| Daniel@0 | 242    member(element(T1,GAttr,GroupContent),FileSecContent), insist(T1=mets:fileGrp), | 
| Daniel@0 | 243    member(element(T2,FAttr,FileContent),GroupContent), insist(T2=mets:file), | 
| Daniel@0 | 244    \+member('ID'='WEBRESOURCES',GAttr), | 
| Daniel@0 | 245    FileContent=[element(Tag,Attr,Content)], | 
| Daniel@0 | 246    file_triple(Tag,Attr,Content,GAttr-FAttr,Dirs,ID,T). | 
| Daniel@0 | 247 | 
| Daniel@0 | 248 mets_triple(mets:metsHdr,_,_,_,_) :- !, fail. | 
| Daniel@0 | 249 mets_triple(mets:structLink,_,_,_,_) :- !, fail. | 
| Daniel@0 | 250 mets_triple(mets:structMap,_,_,_,_) :- !, fail. | 
| Daniel@0 | 251 mets_triple(Tag,_,_,_,_) :- warning(unrecognised_tag(Tag,mets:mets)). | 
| Daniel@0 | 252 | 
| Daniel@0 | 253 % for txt, partial XML documents | 
| Daniel@0 | 254 txt_triple(_,_,ID,rdf(bldata:ID,rdf:type,mo:'Signal')). | 
| Daniel@0 | 255 txt_triple(Doc,Dirs,ID,T) :- | 
| Daniel@0 | 256    insist(multi,member(element(Tag,_,Content),Doc),no_content(txt)), | 
| Daniel@0 | 257    txt_tag_triple(Tag,Content,Dirs,ID,T). | 
| Daniel@0 | 258 | 
| Daniel@0 | 259 identifier_file_ext(F,F1,mp3) :- sub_atom(F,Bef,_,_,'.mp3'), !, insist(sub_atom(F,0,Bef,_,F1)). | 
| Daniel@0 | 260 identifier_file_ext(F,F1,wav) :- sub_atom(F,Bef,_,0,'.wav'), !, insist(sub_atom(F,0,Bef,_,F1)). | 
| Daniel@0 | 261 identifier_file_ext(F,F1,m4a) :- sub_atom(F,Bef,_,0,'.m4a'), !, insist(sub_atom(F,0,Bef,_,F1)). | 
| Daniel@0 | 262 | 
| Daniel@0 | 263 txt_tag_triple(dc:identifier, [F], Dirs, ID, rdf(bldata:ID, bldata:path, literal(Path))) :- !, | 
| Daniel@0 | 264    % NB some of the txt files have the file name written twice. Hence I am going to discard | 
| Daniel@0 | 265    % everything after the first dot. Relies on sub_atom returning matches starting from the beginning | 
| Daniel@0 | 266    (  identifier_file_ext(F,F1,Ext) | 
| Daniel@0 | 267    -> file_name_extension(F1,Ext,Name), | 
| Daniel@0 | 268       atomics_to_string(Dirs,"/",Dir), | 
| Daniel@0 | 269       directory_file_path(Dir,Name,Path) | 
| Daniel@0 | 270    ;  print_message(warning,txt_triple_identifier_fail(ID,F)), | 
| Daniel@0 | 271       fail | 
| Daniel@0 | 272    ). | 
| Daniel@0 | 273 | 
| Daniel@0 | 274 % !!! MUSICALS only. Should not really have mo:duration in them either... | 
| Daniel@0 | 275 txt_tag_triple(dml:rating,Content, _, ID, rdf(bldata:ID, dml:rating, literal(Content))) :- !. | 
| Daniel@0 | 276 txt_tag_triple(mo:duration,Content, _, ID, rdf(bldata:ID, mo:duration, literal(type(xsd:float,Millis)))) :- !, | 
| Daniel@0 | 277    insist(Content=[Dur],bad_content(Content,mo:duration)), | 
| Daniel@0 | 278    insist(atom_number(Dur,Millis)). | 
| Daniel@0 | 279 txt_tag_triple(blapsi:file_duration,Content, _, ID, rdf(bldata:ID, mo:duration, literal(type(xsd:float,Millis)))) :- !, | 
| Daniel@0 | 280    insist(Content=[Dur],bad_content(Content,blapsi:file_duration)), | 
| Daniel@0 | 281    % insist(parse_duration_millis(Dur,Millis)). | 
| Daniel@0 | 282    (  parse_duration_millis(Dur,Millis) -> true | 
| Daniel@0 | 283    ;  warning(bad_duration(ID,Dur)) | 
| Daniel@0 | 284    ). | 
| Daniel@0 | 285 txt_tag_triple(Tag,Content,_,ID,T) :- dmd_tag_triple(Tag,Content,bldata:ID,T). | 
| Daniel@0 | 286 | 
| Daniel@0 | 287 | 
| Daniel@0 | 288 % --------------- Document meta data -------------------------- | 
| Daniel@0 | 289 | 
| Daniel@0 | 290 dmd_triple(DMD,URI,T) :- | 
| Daniel@0 | 291    member(element(Tag,_,Content),DMD), | 
| Daniel@0 | 292    dmd_tag_triple(Tag,Content,URI,T). | 
| Daniel@0 | 293 | 
| Daniel@0 | 294 dmd_tag_triple(dcterms:contributor,Content,ID,T) :- !, dmd_triple(Content,bldata:ID,T). | 
| Daniel@0 | 295 dmd_tag_triple(dc:contributor,Content,ID,T) :- !, dmd_triple(Content,bldata:ID,T). | 
| Daniel@0 | 296 dmd_tag_triple(marcrel:REL,Content,URI,rdf(URI,marcrel:Rel,literal(Lit))) :- !, | 
| Daniel@0 | 297    Content=[Lit],%empty_tag(marcrel:REL,Content)), | 
| Daniel@0 | 298    downcase_atom(REL,Rel). | 
| Daniel@0 | 299 dmd_tag_triple(Tag,Content,URI,rdf(URI,Tag,literal(Lit))) :- keep_tag(Tag), !, Content=[Lit]. | 
| Daniel@0 | 300 dmd_tag_triple(Tag,_,_,_) :- ignore_tag(Tag), !, fail. | 
| Daniel@0 | 301 dmd_tag_triple(Tag,_Content,URI,_) :- warning(unrecognised_tag(Tag,dmd,URI)). | 
| Daniel@0 | 302 | 
| Daniel@0 | 303 % !!!FIXME - sometimes dates are given in D/M/Y instead of Y-M-D | 
| Daniel@0 | 304 keep_tag(dc:title). | 
| Daniel@0 | 305 keep_tag(dc:description). | 
| Daniel@0 | 306 keep_tag(dc:source). | 
| Daniel@0 | 307 keep_tag(dc:subject). | 
| Daniel@0 | 308 keep_tag(dc:language). | 
| Daniel@0 | 309 keep_tag(dc:created). | 
| Daniel@0 | 310 keep_tag(dcterms:language). | 
| Daniel@0 | 311 keep_tag(dcterms:abstract). | 
| Daniel@0 | 312 keep_tag(dcterms:created). | 
| Daniel@0 | 313 keep_tag(dcterms:spatial). | 
| Daniel@0 | 314 keep_tag(dcterms:temporal). | 
| Daniel@0 | 315 keep_tag(dcterms:extent). % !!!FIXME need to parse this | 
| Daniel@0 | 316 keep_tag(blterms:mechanism). | 
| Daniel@0 | 317 keep_tag(dcterms:isPartOf). | 
| Daniel@0 | 318 keep_tag(blapsi:format). | 
| Daniel@0 | 319 | 
| Daniel@0 | 320 ignore_tag(dc:identifier). | 
| Daniel@0 | 321 ignore_tag(blapsi:marker). | 
| Daniel@0 | 322 ignore_tag(dc:rights). | 
| Daniel@0 | 323 ignore_tag(dc:type). | 
| Daniel@0 | 324 ignore_tag(rdf:about). | 
| Daniel@0 | 325 ignore_tag('ARK'). | 
| Daniel@0 | 326 | 
| Daniel@0 | 327 % ------------------------- ADMINISTRATIVE METADATA SECTION ----------------------------- | 
| Daniel@0 | 328 | 
| Daniel@0 | 329 amd_triple(mets:sourceMD,_,SMDContent,ID,T) :- | 
| Daniel@0 | 330    insist(unwrap(SMDContent,_,XMLContent),no_xml_content(SMDContent,smd)), | 
| Daniel@0 | 331    atom_concat(ID,'#source',Src), | 
| Daniel@0 | 332    (  T=rdf(bldata:ID,dml:source,bldata:Src) | 
| Daniel@0 | 333    ;  insist(multi,member(element(Tag,Attr,Content),XMLContent),no_xml_content(smd)), | 
| Daniel@0 | 334       smd_xml_triple(Tag,Attr,Content,bldata:Src,T) | 
| Daniel@0 | 335    ). | 
| Daniel@0 | 336 | 
| Daniel@0 | 337 amd_triple(mets:techMD,Attr,TMDContent,ID,T) :- | 
| Daniel@0 | 338    insist(member('ID'=TMDId,Attr)), | 
| Daniel@0 | 339    unwrap(TMDContent,_,XMLContent), | 
| Daniel@0 | 340    (  T=rdf(bldata:ID/TMDId, mo:sampled_version_of, bldata:ID) | 
| Daniel@0 | 341 %  ;  T=rdf(bldata:ID/TMDId, dml:annotation,literal(Label)), member('LABEL'=Label,TMDAttr) | 
| Daniel@0 | 342    ;  member(element(Tag,_,Content),XMLContent), | 
| Daniel@0 | 343       blapsi_triple(Tag, Content, bldata:ID/TMDId, T) | 
| Daniel@0 | 344    ). | 
| Daniel@0 | 345 | 
| Daniel@0 | 346 blapsi_triple(blapsi:Tag, [Text], Signal, rdf(Signal, Pred, literal(Lit))) :- | 
| Daniel@0 | 347    insist(blapsi_info(Tag, Text, Pred, Lit)). | 
| Daniel@0 | 348 | 
| Daniel@0 | 349 % ------------ Source --------------- | 
| Daniel@0 | 350 | 
| Daniel@0 | 351 smd_xml_triple(blapsi:audioObject,Attr,AOContent,SrcURI,rdf(SrcURI,Pred,literal(Lit))) :- !, | 
| Daniel@0 | 352    (  member(A=Lit,Attr), A\='ID', Pred=bldata:A | 
| Daniel@0 | 353    ;  insist(multi,member(element(Tag,Attr1,Content),AOContent),no_content(blapsi:audioObject,AOContent)), | 
| Daniel@0 | 354       ao_tag_info(Tag,Attr1,Content,Pred,Lit) | 
| Daniel@0 | 355    ). | 
| Daniel@0 | 356 | 
| Daniel@0 | 357 smd_xml_triple(blapsi:Tag,Content,SrcURI,rdf(SrcURI,blapsi:Tag,literal(Lit))) :- !, | 
| Daniel@0 | 358    insist(Content=[Lit],bad_content(blapsi:Tag,Content,smd_xml_triple)). | 
| Daniel@0 | 359 | 
| Daniel@0 | 360 ao_tag_info(blapsi:primaryIdentifier,_,_,_,_) :- !, fail. | 
| Daniel@0 | 361 ao_tag_info(blapsi:secondaryIdentifier,_,_,_,_) :- !, fail. | 
| Daniel@0 | 362 ao_tag_info(blapsi:format,_,Content,blapsi:format,Lit) :- !, | 
| Daniel@0 | 363    insist(Content=[Lit],bad_content(Content,blapsi:format)). | 
| Daniel@0 | 364 ao_tag_info(blapsi:face, Attr, Content, Pred, Lit) :- !, | 
| Daniel@0 | 365    %insist(member('ID'=ID,Attr)), | 
| Daniel@0 | 366    insist(member('label'=Label,Attr)), | 
| Daniel@0 | 367    insist(Content=[],non_empty_content(blapsi:face,Content)), | 
| Daniel@0 | 368    (  fail % Pred=bldata:face_id, Lit=ID    IGNORE FOR NOW | 
| Daniel@0 | 369    ;  Pred=bldata:face_label, Lit=Label | 
| Daniel@0 | 370    ). | 
| Daniel@0 | 371 ao_tag_info(blapsi:physicalProperties, _, PPContent, Tag, Lit) :- !, | 
| Daniel@0 | 372    insist(multi,member(element(Tag,_,Content),PPContent),no_content(blapsi:physicalProperties)), | 
| Daniel@0 | 373    insist(Content=[Lit],bad_content(Tag,Content)). | 
| Daniel@0 | 374 ao_tag_info(Tag,_,_,_,_) :- | 
| Daniel@0 | 375    warning(unrecognised_tag(Tag,blapsi:audioObject)). | 
| Daniel@0 | 376 | 
| Daniel@0 | 377 % identifier_pred('ASR Root ID',asr_root_id). | 
| Daniel@0 | 378 % identifier_pred('Sound Archive accession number',accession_number). | 
| Daniel@0 | 379 | 
| Daniel@0 | 380 | 
| Daniel@0 | 381 | 
| Daniel@0 | 382 % -------------- FILE SECTION --------------------------- | 
| Daniel@0 | 383 | 
| Daniel@0 | 384 file_triple(mets:'FLocat',Attr,LocContent,GAttr-FAttr,Dirs,ID,T) :- !, | 
| Daniel@0 | 385    (  member('MIMETYPE'=MimeType,FAttr) | 
| Daniel@0 | 386    -> audio_mimetype(MimeType), | 
| Daniel@0 | 387       insist(member('AMDID'=TMDId1,FAttr)), | 
| Daniel@0 | 388       insist(member('LOCTYPE'='URL',Attr)), | 
| Daniel@0 | 389       insist(member((xlink:href)=Link,Attr)), | 
| Daniel@0 | 390       insist(LocContent=[],non_empty(mets:'FLocat',LocContent)), | 
| Daniel@0 | 391       insist(member('USE'=Use1,FAttr);member('USE'=Use1,GAttr)), | 
| Daniel@0 | 392       % TMDId1 is sometimes "techMDxx digiprovXX" - need to get rid of second word | 
| Daniel@0 | 393       atomic_list_concat([TMDId|_],' ',TMDId1), | 
| Daniel@0 | 394       downcase_atom(Use1,Use), | 
| Daniel@0 | 395       (  T=rdf(bldata:ID/TMDId,dml:mimetype,literal(MimeType)) | 
| Daniel@0 | 396       ;  file_path_triple(bldata:ID/TMDId,Dirs,Link,T) | 
| Daniel@0 | 397       ;  T=rdf(bldata:ID/TMDId,bldata:use,literal(Use)) | 
| Daniel@0 | 398       ) | 
| Daniel@0 | 399    ;  insist(\+member('AMDID'=_,FAttr)), | 
| Daniel@0 | 400       insist(member('ID'=FileID,FAttr)), | 
| Daniel@0 | 401       insist(member('LOCTYPE'='URL',Attr)), | 
| Daniel@0 | 402       insist(member((xlink:href)=Link,Attr)), | 
| Daniel@0 | 403       sub_atom(Link,_,3,0,Ext), | 
| Daniel@0 | 404       audio_extension(Ext), | 
| Daniel@0 | 405       (  T=rdf(bldata:ID/FileID,mo:sampled_version_of,bldata:ID) | 
| Daniel@0 | 406       ;  file_path_triple(bldata:ID/FileID,Dirs,Link,T) | 
| Daniel@0 | 407       ;  member(element(Tag,_,Content),LocContent), | 
| Daniel@0 | 408          blapsi_triple(Tag,Content,bldata:ID/FileID,T) | 
| Daniel@0 | 409       ) | 
| Daniel@0 | 410    ). | 
| Daniel@0 | 411 | 
| Daniel@0 | 412 file_triple(mets:'Fcontent',_,_,_,_,_,_) :- !, fail. | 
| Daniel@0 | 413 file_triple(Tag,_,_,_,_,_,_) :- warning(unrecognised_tag(Tag,file)). | 
| Daniel@0 | 414 | 
| Daniel@0 | 415 file_path_triple(URI,Dirs,Link,rdf(URI,bldata:path,literal(Path))) :- | 
| Daniel@0 | 416    atomics_to_string(Parts,"/",Link), | 
| Daniel@0 | 417    atomics_to_string(Dirs,"/",Dir), | 
| Daniel@0 | 418    append(_,[Name],Parts), | 
| Daniel@0 | 419    directory_file_path(Dir,Name,Path). | 
| Daniel@0 | 420 | 
| Daniel@0 | 421 audio_mimetype('sound/wav'). | 
| Daniel@0 | 422 audio_mimetype('sound/wma'). | 
| Daniel@0 | 423 audio_mimetype('sound/mp3'). | 
| Daniel@0 | 424 audio_mimetype('sound/ogg'). | 
| Daniel@0 | 425 | 
| Daniel@0 | 426 audio_extension(wav). | 
| Daniel@0 | 427 audio_extension(mp3). | 
| Daniel@0 | 428 audio_extension(wma). | 
| Daniel@0 | 429 | 
| Daniel@0 | 430 % ---------------------------- BLAPSI INFO --------------------------------- | 
| Daniel@0 | 431 | 
| Daniel@0 | 432 blapsi_info(file_sample,     X, mo:sample_rate,   type(xsd:float,SampleRate)) :- atom_number(X,SampleRate). | 
| Daniel@0 | 433 blapsi_info(file_resolution, X, mo:bitsPerSample, type(xsd:int,Bits)) :- atom_number(X,Bits). | 
| Daniel@0 | 434 blapsi_info(resolution,      X, mo:bitsPerSample, type(xsd:int,Bits)) :- atom_number(X,Bits). | 
| Daniel@0 | 435 blapsi_info(file_channels,   X, mo:channels,      type(xsd:int,Channels)) :- atom_number(X,Channels). | 
| Daniel@0 | 436 blapsi_info(file_duration,   X, mo:duration,      type(xsd:float,Millis)) :- parse_duration_millis(X,Millis). | 
| Daniel@0 | 437 blapsi_info(file_bitrate,    X, blapsi:file_bitrate, X). % !!! FIXME should be attached to file, not to signal | 
| Daniel@0 | 438 blapsi_info(file_size,       X, blapsi:file_size, type(xsd:int,Size)) :- atom_number(X,Size). | 
| Daniel@0 | 439 blapsi_info(file_length,     X, blapsi:file_length, type(xsd:int,Size)) :- atom_number(X,Size). | 
| Daniel@0 | 440 | 
| Daniel@0 | 441 % --------------------------- SUPPORTING PREDICATES -------------------------- | 
| Daniel@0 | 442 | 
| Daniel@0 | 443 :- use_module(library(async)). | 
| Daniel@0 | 444 :- public scrape_all/2. | 
| Daniel@0 | 445 :- volatile_memo scrape_all(+options:list,-count:nonneg). | 
| Daniel@0 | 446 scrape_all(Opts,Count) :- | 
| Daniel@0 | 447    option(spacing(Sleep),Opts,1), | 
| Daniel@0 | 448    option(timeout(Timeout),Opts,10), | 
| Daniel@0 | 449    findall( R, rdf(R,dml:blpage,_), Rs), | 
| Daniel@0 | 450    with_progress_stack(map_with_progress(scrape_then_sleep(Sleep,Timeout),Rs,Ss)), | 
| Daniel@0 | 451    exclude(=(ok),Ss, Failures), | 
| Daniel@0 | 452    (Failures=[] -> length(Rs,Count); throw(scrape_errors(Failures))). | 
| Daniel@0 | 453 | 
| Daniel@0 | 454 scrape_then_sleep(D,T,R,Status) :- | 
| Daniel@0 | 455    (  audio_link(R,_) -> Status=ok | 
| Daniel@0 | 456    ;  memo:reify(bl_p2r:slow(D,call_with_time_limit(T,scrape_audio_link(R,_))),Status), | 
| Daniel@0 | 457       (Status=ex(abort(Reason)) -> throw(abort(Reason)); true) | 
| Daniel@0 | 458    ). | 
| Daniel@0 | 459 | 
| Daniel@0 | 460 slow(Delay,Goal) :- call(Goal), sleep(Delay). | 
| Daniel@0 | 461 | 
| Daniel@0 | 462 %% has_available_audio(+R:uri) is semidet. | 
| Daniel@0 | 463 %% has_available_audio(-R:uri) is nondet. | 
| Daniel@0 | 464 %  True when R is a recording in the BL collection whose audio is | 
| Daniel@0 | 465 %  publicly available. | 
| Daniel@0 | 466 has_available_audio(R) :- | 
| Daniel@0 | 467    rdf(R,dml:blpage,_), | 
| Daniel@0 | 468    scrape_audio_link(R,_). | 
| Daniel@0 | 469 | 
| Daniel@0 | 470 :- public audio_link/2. | 
| Daniel@0 | 471 audio_link(URI,AudioURL) :- | 
| Daniel@0 | 472    browse(scrape_audio_link(URI,AudioURL)). | 
| Daniel@0 | 473 | 
| Daniel@0 | 474 :- public audio_file/3. | 
| Daniel@0 | 475 audio_file(URI,Path,just(mp3)) :- | 
| Daniel@0 | 476    setting(audio_root,just(Root)), | 
| Daniel@0 | 477    (  rdf(URI,bldata:path,literal(RelPath)), | 
| Daniel@0 | 478       rdf(URI,rdf:type,mo:'Signal') | 
| Daniel@0 | 479    ;  rdf(URI2,mo:sampled_version_of,URI), | 
| Daniel@0 | 480       rdf(URI2,dml:mimetype,literal('sound/mp3')), | 
| Daniel@0 | 481       rdf(URI2,bldata:path,literal(RelPath)) | 
| Daniel@0 | 482    ), | 
| Daniel@0 | 483    atomic_list_concat([Root,RelPath],'/',Path). | 
| Daniel@0 | 484 | 
| Daniel@0 | 485 :- volatile_memo scrape_audio_link(+atom,-atom). | 
| Daniel@0 | 486 | 
| Daniel@0 | 487 scrape_audio_link(URI,AudioURL) :- | 
| Daniel@0 | 488    rdf(URI,dml:blpage,PageURL), | 
| Daniel@0 | 489    debug(bl_p2r,'Scraping audio link for ~w...',[URI]), | 
| Daniel@0 | 490    atom_concat('http://sounds.bl.uk/',_,PageURL), | 
| Daniel@0 | 491    http_get(PageURL,Doc,[]), | 
| Daniel@0 | 492    xpath(Doc,//li(@class=mainTrack)/a(@id),ID), | 
| Daniel@0 | 493    string_concat("MNT-",Key,ID), | 
| Daniel@0 | 494    string_concat('http://sounds.bl.uk/GT/',Key,AudioURL). | 
| Daniel@0 | 495 | 
| Daniel@0 | 496 sandbox:safe_primitive(bl_p2r:audio_link(_,_)). | 
| Daniel@0 | 497 sandbox:safe_primitive(bl_p2r:scrape_audio_link(_,_)). | 
| Daniel@0 | 498 | 
| Daniel@0 | 499 xpath(Prop,E,Path,Val) :- xpath(E,Path,I), xpath(I,/self(Prop),Val). | 
| Daniel@0 | 500 | 
| Daniel@0 | 501 warning(Term) :- nl, print_message(warning,Term), fail. |