Daniel@0: /* Part of DML (Digital Music Laboratory) Daniel@0: Copyright 2014-2015 Samer Abdallah, University of London Daniel@0: Daniel@0: This program is free software; you can redistribute it and/or Daniel@0: modify it under the terms of the GNU General Public License Daniel@0: as published by the Free Software Foundation; either version 2 Daniel@0: of the License, or (at your option) any later version. Daniel@0: Daniel@0: This program is distributed in the hope that it will be useful, Daniel@0: but WITHOUT ANY WARRANTY; without even the implied warranty of Daniel@0: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Daniel@0: GNU General Public License for more details. Daniel@0: Daniel@0: You should have received a copy of the GNU General Public Daniel@0: License along with this library; if not, write to the Free Software Daniel@0: Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Daniel@0: */ Daniel@0: Daniel@0: :- module(bl_p2r,[ audio_file/3, audio_link/2, scrape_audio_link/2, has_available_audio/1 ]). Daniel@0: Daniel@0: :- use_module(library(semweb/rdf_db)). Daniel@0: :- use_module(library(xmlarchive)). Daniel@0: :- use_module(library(xpath)). Daniel@0: :- use_module(library(settings)). Daniel@0: :- use_module(library(sandbox)). Daniel@0: :- use_module(library(fileutils)). Daniel@0: :- use_module(library(termutils)). Daniel@0: :- use_module(library(rdfutils)). Daniel@0: :- use_module(library(insist)). Daniel@0: :- use_module(library(memo)). Daniel@0: :- use_module(entailment(p2r)). Daniel@0: :- use_module(cliopatria(hooks)). Daniel@0: :- use_module(library(http/http_client)). Daniel@0: Daniel@0: :- set_prolog_flag(double_quotes,string). Daniel@0: Daniel@0: /* Daniel@0: METS to RDF conversion. Daniel@0: Daniel@0: Useful info: Daniel@0: 0. Each top-level mets:mets element contains several sections. Daniel@0: 1,4 x dmdSec .. 1 x mdWrap Daniel@0: 1 x amdSec .. 
[ N x techMD, {0,1} x rightsMD, {0,1} x sourceMD ] Daniel@0: 1 x fileSec .. 1-4 x fileGrp .. 1-36 x file Daniel@0: 1 x structLink .. {1,3} x smLink Daniel@0: 2-3 x structMap .. 1 x div Daniel@0: Daniel@0: 1. Each dmdSec only ever contains one mdWrap element Daniel@0: 5. each techMD or sourceMD contains exactly 1 mdWrap Daniel@0: 6. each rightsMD contains an mdRef Daniel@0: 7. The rightsMD is not interesting to look at. Daniel@0: 8. dmdSec mdWraps can be empty Daniel@0: Daniel@0: List of all dmdSec tags in dataset Daniel@0: blapsi:id3album Daniel@0: blapsi:id3artist Daniel@0: blapsi:id3comment Daniel@0: blapsi:id3songtitle Daniel@0: blapsi:id3year Daniel@0: dc:description Daniel@0: dc:identifier Daniel@0: dc:language Daniel@0: dc:rights Daniel@0: dc:source Daniel@0: dc:subject Daniel@0: dc:title Daniel@0: dc:type Daniel@0: dcterms:created Daniel@0: dcterms:isPartOf Daniel@0: dcterms:spatial Daniel@0: dcterms:temporal Daniel@0: marcrel:CMP Daniel@0: marcrel:CND Daniel@0: marcrel:IVE Daniel@0: marcrel:IVR Daniel@0: marcrel:LYR Daniel@0: marcrel:PRF Daniel@0: marcrel:RCE Daniel@0: marcrel:SPK Daniel@0: mods:accessCondition Daniel@0: mods:identifier Daniel@0: mods:name Daniel@0: mods:recordInfo Daniel@0: mods:titleInfo Daniel@0: Daniel@0: amdSec tags Daniel@0: blapsi:audioObject Daniel@0: blapsi:file_bitrate Daniel@0: blapsi:file_channels Daniel@0: blapsi:file_duration Daniel@0: blapsi:file_sample Daniel@0: blapsi:resolution Daniel@0: Daniel@0: sourceMD .. blapsi:audioObject Daniel@0: blapsi:face Daniel@0: blapsi:format Daniel@0: blapsi:physicalProperties Daniel@0: blapsi:face Daniel@0: blapsi:format Daniel@0: blapsi:physicalProperties Daniel@0: blapsi:primaryIdentifier Daniel@0: blapsi:secondaryIdentifier Daniel@0: blapsi:primaryIdentifier Daniel@0: blapsi:secondaryIdentifier Daniel@0: Daniel@0: would prefer to have mo predicates to foaf:Person resources for these. Daniel@0: marcrel:'CMP',dml:composer). Daniel@0: marcrel:'CND',dml:conductor). 
marcrel:'PRF',dml:performer).

*/

% ------------------------------ SETTINGS ---------------------------------
% audio_root defaults to `nothing`; import_directory/2 and audio_file/3 read it
% as just(Dir). archive_pattern is the pattern handed to find_files(like(_),_).
:- setting(audio_root, ground, nothing, 'Root directory of audio file collection').
:- setting(archive_pattern, atom, '~/lib/datasets/mets/BL_metadata_complete.7z', 'Pattern to match METS metadata files').

% RDF namespace prefixes used by the triples generated in this module.
:- rdf_register_prefix(bldata,  'http://sounds.bl.uk/resource/').
:- rdf_register_prefix(marcrel, 'http://id.loc.gov/vocabulary/relators/').
:- rdf_register_prefix(blapsi,  'http://sounds.bl.uk/blapsi#').
:- rdf_register_prefix(blterms, 'http://www.bl.uk/schemas/bibliographic/blterms#').

% https://code.google.com/p/libarchive/issues/detail?id=328&colspec=ID%20Type%20Status%20Priority%20Milestone%20OpSys%20Owner%20Summary

%% import is det.
%  Runs assert_all/1 for this module's p2r rules.
%  NOTE(review): assert_all/1 comes from entailment(p2r); its exact effect is
%  not visible in this file.
:- public import/0.
import :- assert_all(bl_p2r).

% p2r rules: two fixed schema triples about dml:blpage, plus one triple for
% every solution of bl_archive_triple/1 after prefix/pattern expansion.
rdf(dml:blpage, rdfs:range, foaf:'Document') <== true.
rdf(dml:blpage, rdfs:subPropertyOf, foaf:page) <== true.
rdf(S,P,O) <== bl_archive_triple(T), once(expand_triple(T,rdf(S,P,O))).

%% expand_triple(+Raw:rdf, -Expanded:rdf) is semidet.
%  Expands subject and predicate with uripattern:pattern_uri/2. An object of
%  the form literal(_) is passed through rdf_global_object/2; any other
%  object is expanded with pattern_uri/2 as well.
expand_triple(rdf(S0,P0,O0), rdf(S,P,O)) :-
   uripattern:pattern_uri(S0, S),
   uripattern:pattern_uri(P0, P),
   (  O0 = literal(_)
   -> rdf_global_object(O0, O)
   ;  uripattern:pattern_uri(O0, O)
   ).

%% import_directory(+Dir:atom, +Graph:atom) is det.
%
%  Import contents of a given directory into a named RDF graph.
%  The directory must be a subdirectory of the directory named in the
%  bl_p2r:audio_root setting (whose value is either =|just(Dir)|= or
%  =|nothing|=).
%
%  Every solution of directory_triple/2 is expanded once and asserted into
%  Graph; forall/2 makes the whole call fail if any single triple cannot be
%  expanded or asserted.
import_directory(Dir, Graph) :-
   forall( directory_triple(Dir, Raw),
           (  once(expand_triple(Raw, rdf(S,P,O))),
              rdf_assert(S, P, O, Graph)
           )).

%% directory_triple(+Dir:atom, -T:rdf) is nondet.
%  Enumerates triples derived from the txt metadata files found under
%  AudioRoot/Dir, where AudioRoot comes from the audio_root setting (which
%  must be just(AudioRoot)). Paths handed to split_path/3 are relative to
%  AudioRoot. Only files whose extension is txt are considered, because Ext
%  is bound before split_path/3 is called.
%  NOTE(review): with_stream/3 is a project helper; presumably it closes the
%  stream when the goal is finished - confirm against library(fileutils).
directory_triple(Dir,T) :-
   Ext=txt,
   setting(audio_root,just(AudioRoot)),
   atom_concat(AudioRoot,'/',Prefix),
   find_files(under(AudioRoot/Dir),FullPath),
   atom_concat(Prefix,Path,FullPath),
   split_path(Path,Loc,Ext),
   with_stream(Str,open(FullPath,read,Str), ext_loc_stream_triple(Ext,Loc,Str,T)).


%% bl_archive_triple(-T:rdf) is nondet.
%  This predicate generates triples from the metadata archive files whose names
%  match the pattern stored in the setting bl_p2r:archive_pattern.
bl_archive_triple(T) :-
   setting(archive_pattern, ArchivePattern),
   find_files(like(ArchivePattern), Archive),
   with_archive_stream(Archive, Path, path_triple_stream(Path,T)).

%% path_triple_stream(+Path:atom, -T:rdf, +S:stream) is nondet.
%  Reports progress for the archive entry Path, splits the path and produces
%  triples from the entry's stream S. Any exception raised while doing so is
%  printed as a warning and turned into failure, so one bad entry does not
%  abort the whole import.
path_triple_stream(Path,T,S) :-
   status('Importing ~s',[Path]),
   catch((  insist(split_path(Path,Loc,Ext)),
            ext_loc_stream_triple(Ext,Loc,S,T)
         ), Ex, (nl,print_message(warning,Ex),fail)).

%% split_path(+Path:atom, -Loc:pair(list(atom),atom), ?Ext:atom) is semidet.
%  Splits a '/'-separated Path into Loc=Dirs-Base and extension Ext.
%  Components listed in ignore_dir/1 are dropped before splitting. The last
%  remaining component is divided into Base and Ext at its final dot using
%  file_name_extension/3, so extensions of any length are handled (the
%  previous implementation assumed exactly three characters and produced a
%  Base with a trailing dot for e.g. 'x.flac'). A name without any dot
%  yields Ext=''. If Ext is bound, the call fails unless the name has that
%  extension.
split_path(Path,Dirs-Base,Ext) :-
   atomic_list_concat(Parts,'/',Path),
   exclude(ignore_dir,Parts,Parts1),
   append(Dirs,[Name],Parts1),
   file_name_extension(Base,Ext,Name).

%% ext_loc_stream_triple(+Ext:atom, +Loc:pair(list(atom), atom), +S:stream, -T:rdf) is nondet.
%
%  This predicate succeeds once for each RDF triple that can be derived from
%  an archive stream whose path in the archive Loc=Dirs-Base consist of the directories
%  Dirs and whose name is Base,'.',Ext. It understands entries with extensions
%  xml and txt. XML streams are decoded as full METS documents. TXT streams are decoded
%  as the DMD section of a METS document. Other extensions generate a warning and then fail.

ext_loc_stream_triple(xml,Dirs-Base,S,T) :- !,
   insist(load_xmlns(S,Doc)),
   insist(member(element(mets:mets,_,METS),Doc)),
   (  % link to the recording's page on sounds.bl.uk
      insist(get_bl_url(Base,Dirs,URL)),
      T=rdf(bldata:Base,dml:blpage,URL)
   ;  T=rdf(bldata:Base,rdf:type,mo:'Signal')
   ;  % one or more triples per top-level METS section
      insist(multi,member(element(Tag,_,Content),METS),no_content(mets,Dirs-Base)),
      mets_triple(Tag,Content,Dirs,Base,T)
   ).

ext_loc_stream_triple(txt,Dirs-Base,S,T) :- !,
   % TXT streams appear to be LATIN-1 encoded, not UTF-8
   Base\=combined, % exclude combined metadata files
   set_stream(S,encoding(iso_latin_1)),
   insist(load_xmlns(S,Doc)),
   txt_triple(Doc,Dirs,Base,T).

ext_loc_stream_triple(Ext,Dirs-Base,_,_) :-
   warning(unrecognised_extension(Ext,Dirs-Base)).


%% get_bl_url(+Name:atom,+Dirs:list(atom),-URL:atom) is det.
%  Deduces the sounds.bl.uk URL from entry name and directory.
%  The first four characters of Name select the category via category/2 and
%  the last element of Dirs is used as the collection. Fails if the category
%  code is unknown or Dirs is empty.
get_bl_url(Name,Dirs,URL) :-
   sub_atom(Name,0,4,_,CatCode),
   category(CatCode,Category),
   append(_,[Collection],Dirs),
   atomic_list_concat([ 'http://sounds.bl.uk',
                        Category,Collection,Name],'/',URL).

%% category(?Code:atom, ?Category:atom) is nondet.
%  Maps the four-character prefix of an entry name to the category path
%  segment used in sounds.bl.uk URLs.
category('021M','Oral-history').
category('025M','World-and-traditional-music').
category('023M','Jazz-and-popular-music').
category('026M','Classical-music').
category('028M','Jazz-and-popular-music').
category('020A','Classical-music').

%% ignore_dir(?Dir:atom) is nondet.
%  Path components that split_path/3 removes before splitting.
ignore_dir('_Metadata').
ignore_dir('_Non-music').
ignore_dir('_Audio_Metadata').

% --------------------- TOP LEVEL STRUCTURE -------------------------------

%% unwrap(+Content:list, -MDAttr:list, -XMLContent:list) is semidet.
%  Succeeds when Content is exactly one mets:mdWrap element that contains
%  exactly one mets:xmlData element; returns the mdWrap attributes and the
%  xmlData content.
unwrap([element(mets:mdWrap,MDAttr,[element(mets:xmlData,_,XMLContent)])],MDAttr,XMLContent).

%% mets_triple(+Tag, +Content:list, +Dirs:list(atom), +ID:atom, -T:rdf) is nondet.
%  Triples for one top-level section of a complete METS document.
%  dmdSec, amdSec and fileSec are decoded; metsHdr, structLink and structMap
%  are silently skipped; any other tag produces a warning and fails.

% Descriptive metadata: only mdWraps whose MDTYPE attribute is 'DC' are used.
mets_triple(mets:dmdSec,DMDContent,_,ID,T) :- !,
   unwrap(DMDContent,MDAttr,XMLContent),
   member('MDTYPE'='DC',MDAttr),
   dmd_triple(XMLContent,bldata:ID,T).

% Administrative metadata: each child element is handled by amd_triple/5.
mets_triple(mets:amdSec,AMDContent,_,ID,T) :- !,
   member(element(Tag,Attr,Content),AMDContent),
   amd_triple(Tag,Attr,Content,ID,T).

% File section: every file of every group except the group whose ID
% attribute is 'WEBRESOURCES'. Each file must contain exactly one element.
mets_triple(mets:fileSec,FileSecContent,Dirs,ID,T) :- !,
   member(element(T1,GAttr,GroupContent),FileSecContent), insist(T1=mets:fileGrp),
   member(element(T2,FAttr,FileContent),GroupContent), insist(T2=mets:file),
   \+member('ID'='WEBRESOURCES',GAttr),
   FileContent=[element(Tag,Attr,Content)],
   file_triple(Tag,Attr,Content,GAttr-FAttr,Dirs,ID,T).

mets_triple(mets:metsHdr,_,_,_,_) :- !, fail.
mets_triple(mets:structLink,_,_,_,_) :- !, fail.
mets_triple(mets:structMap,_,_,_,_) :- !, fail.
mets_triple(Tag,_,_,_,_) :- warning(unrecognised_tag(Tag,mets:mets)).

%% txt_triple(+Doc:list, +Dirs:list(atom), +ID:atom, -T:rdf) is nondet.
%  Triples for txt entries, which are partial XML documents. The first
%  solution types the resource as mo:'Signal'; the rest come from the
%  elements of Doc via txt_tag_triple/5.
txt_triple(_,_,ID,rdf(bldata:ID,rdf:type,mo:'Signal')).
txt_triple(Doc,Dirs,ID,T) :-
   insist(multi,member(element(Tag,_,Content),Doc),no_content(txt)),
   txt_tag_triple(Tag,Content,Dirs,ID,T).

%% identifier_file_ext(+F:atom, -F1:atom, -Ext:atom) is semidet.
%  Splits an identifier F into the text F1 before the FIRST occurrence of
%  '.mp3', '.wav' or '.m4a' (tried in that order) and the matching
%  extension Ext. Everything after that first occurrence is discarded,
%  because some txt files contain the file name written twice.
%
%  All three extensions use the same first-occurrence rule; previously
%  only mp3 did, while wav and m4a matched the suffix at the very end of F,
%  so a doubled name such as 'x.wavx.wav' yielded 'x.wavx'. Relies on
%  sub_atom/5 enumerating matches from the start of the atom.
identifier_file_ext(F,F1,Ext) :-
   member(Ext,[mp3,wav,m4a]),
   atom_concat('.',Ext,Suffix),
   sub_atom(F,Bef,_,_,Suffix), !,
   % cannot fail for unbound F1: Bef is a valid offset into F
   sub_atom(F,0,Bef,_,F1).

%% txt_tag_triple(+Tag, +Content:list, +Dirs:list(atom), +ID:atom, -T:rdf) is nondet.
%  Triples for one element of a txt (partial METS) document. Handles the
%  tags that are specific to txt files and defers everything else to
%  dmd_tag_triple/4 with subject bldata:ID.

% dc:identifier holds the audio file name; it becomes a bldata:path literal
% made of Dirs joined with "/" plus the cleaned-up file name.
txt_tag_triple(dc:identifier, [F], Dirs, ID, rdf(bldata:ID, bldata:path, literal(Path))) :- !,
   % NB some of the txt files have the file name written twice. Hence I am going to discard
   % everything after the first dot. Relies on sub_atom returning matches starting from the beginning
   (  identifier_file_ext(F,F1,Ext)
   -> file_name_extension(F1,Ext,Name),
      atomics_to_string(Dirs,"/",Dir),
      directory_file_path(Dir,Name,Path)
   ;  print_message(warning,txt_triple_identifier_fail(ID,F)),
      fail
   ).

% !!! MUSICALS only. Should not really have mo:duration in them either...
txt_tag_triple(dml:rating,Content, _, ID, rdf(bldata:ID, dml:rating, literal(Content))) :- !.
txt_tag_triple(mo:duration,Content, _, ID, rdf(bldata:ID, mo:duration, literal(type(xsd:float,Millis)))) :- !,
   insist(Content=[Dur],bad_content(Content,mo:duration)),
   insist(atom_number(Dur,Millis)).
txt_tag_triple(blapsi:file_duration,Content, _, ID, rdf(bldata:ID, mo:duration, literal(type(xsd:float,Millis)))) :- !,
   insist(Content=[Dur],bad_content(Content,blapsi:file_duration)),
   % An unparsable duration only produces a warning (warning/1 then fails),
   % rather than an exception as insist/1 would.
   (  parse_duration_millis(Dur,Millis) -> true
   ;  warning(bad_duration(ID,Dur))
   ).
txt_tag_triple(Tag,Content,_,ID,T) :- dmd_tag_triple(Tag,Content,bldata:ID,T).


% --------------- Document meta data --------------------------

%% dmd_triple(+DMD:list, +URI, -T:rdf) is nondet.
%  Triples about subject URI for every element in the descriptive metadata
%  content DMD.
dmd_triple(DMD,URI,T) :-
   member(element(Tag,_,Content),DMD),
   dmd_tag_triple(Tag,Content,URI,T).

%% dmd_tag_triple(+Tag, +Content:list, +URI, -T:rdf) is nondet.
%  Triple(s) about subject URI for a single descriptive metadata element.
%  URI is already a complete subject term (callers pass bldata:ID).
%
%  * contributor elements are containers: their children are processed
%    recursively for the SAME subject. The subject is passed through
%    unchanged; wrapping it in bldata: again would produce the malformed
%    subject bldata:(bldata:ID).
%  * marcrel:REL becomes predicate marcrel:rel (lower-cased relator code).
%  * tags listed in keep_tag/1 are copied with their single text child.
%  * tags listed in ignore_tag/1 fail silently.
%  * anything else prints a warning and fails.
dmd_tag_triple(dcterms:contributor,Content,URI,T) :- !, dmd_triple(Content,URI,T).
dmd_tag_triple(dc:contributor,Content,URI,T) :- !, dmd_triple(Content,URI,T).
dmd_tag_triple(marcrel:REL,Content,URI,rdf(URI,marcrel:Rel,literal(Lit))) :- !,
   Content=[Lit],   % fails quietly for empty or multi-part content
   downcase_atom(REL,Rel).
dmd_tag_triple(Tag,Content,URI,rdf(URI,Tag,literal(Lit))) :- keep_tag(Tag), !, Content=[Lit].
dmd_tag_triple(Tag,_,_,_) :- ignore_tag(Tag), !, fail.
dmd_tag_triple(Tag,_Content,URI,_) :- warning(unrecognised_tag(Tag,dmd,URI)).

%% keep_tag(?Tag) is nondet.
%  Tags whose single text child is kept verbatim as a literal.
% !!!FIXME - sometimes dates are given in D/M/Y instead of Y-M-D
keep_tag(dc:title).
keep_tag(dc:description).
keep_tag(dc:source).
keep_tag(dc:subject).
keep_tag(dc:language).
keep_tag(dc:created).
keep_tag(dcterms:language).
keep_tag(dcterms:abstract).
keep_tag(dcterms:created).
keep_tag(dcterms:spatial).
keep_tag(dcterms:temporal).
keep_tag(dcterms:extent). % !!!FIXME need to parse this
keep_tag(blterms:mechanism).
keep_tag(dcterms:isPartOf).
keep_tag(blapsi:format).

%% ignore_tag(?Tag) is nondet.
%  Tags that are dropped without a warning.
ignore_tag(dc:identifier).
ignore_tag(blapsi:marker).
ignore_tag(dc:rights).
ignore_tag(dc:type).
ignore_tag(rdf:about).
ignore_tag('ARK').

% ------------------------- ADMINISTRATIVE METADATA SECTION -----------------------------

%% amd_triple(+Tag, +Attr:list, +Content:list, +ID:atom, -T:rdf) is nondet.
%  Triples for one child of a mets:amdSec. This clause handles mets:sourceMD:
%  it links bldata:ID to a source resource named ID#source and then
%  describes that source from the wrapped XML content.
amd_triple(mets:sourceMD,_,SMDContent,ID,T) :-
   insist(unwrap(SMDContent,_,XMLContent),no_xml_content(SMDContent,smd)),
   atom_concat(ID,'#source',Src),
   (  T=rdf(bldata:ID,dml:source,bldata:Src)
   ;  insist(multi,member(element(Tag,Attr,Content),XMLContent),no_xml_content(smd)),
      smd_xml_triple(Tag,Attr,Content,bldata:Src,T)
   ).

% This clause of amd_triple/5 handles mets:techMD: the technical metadata
% section with attribute ID=TMDId describes the resource bldata:ID/TMDId,
% which is declared a sampled version of bldata:ID and then annotated from
% the blapsi elements in the wrapped XML content.
amd_triple(mets:techMD,Attr,TMDContent,ID,T) :-
   insist(member('ID'=TMDId,Attr)),
   unwrap(TMDContent,_,XMLContent),
   (  T=rdf(bldata:ID/TMDId, mo:sampled_version_of, bldata:ID)
   % ; T=rdf(bldata:ID/TMDId, dml:annotation,literal(Label)), member('LABEL'=Label,TMDAttr)
   ;  member(element(Tag,_,Content),XMLContent),
      blapsi_triple(Tag, Content, bldata:ID/TMDId, T)
   ).

%% blapsi_triple(+Tag, +Content:list, +Signal, -T:rdf) is det.
%  Triple about Signal for a blapsi:Tag element with a single text child.
%  Predicate and literal value are looked up with blapsi_info/4, wrapped in
%  insist/1. Fails for non-blapsi tags or content that is not one text node.
blapsi_triple(blapsi:Tag, [Text], Signal, rdf(Signal, Pred, literal(Lit))) :-
   insist(blapsi_info(Tag, Text, Pred, Lit)).

% ------------ Source ---------------

%% smd_xml_triple(+Tag, +Attr:list, +Content:list, +SrcURI, -T:rdf) is nondet.
%  Triples about the source resource SrcURI for one element of a sourceMD
%  section.
%
%  * blapsi:audioObject yields one triple per attribute other than 'ID'
%    (predicate bldata:AttrName) and further triples from its child
%    elements via ao_tag_info/5.
%  * any other blapsi:Tag must contain exactly one text node, which becomes
%    the object of predicate blapsi:Tag.
%
%  The second clause previously had only four arguments and so could never
%  be reached from amd_triple/5, which calls this predicate with five; it
%  now takes (and ignores) the attribute list like the first clause.
smd_xml_triple(blapsi:audioObject,Attr,AOContent,SrcURI,rdf(SrcURI,Pred,literal(Lit))) :- !,
   (  member(A=Lit,Attr), A\='ID', Pred=bldata:A
   ;  insist(multi,member(element(Tag,Attr1,Content),AOContent),no_content(blapsi:audioObject,AOContent)),
      ao_tag_info(Tag,Attr1,Content,Pred,Lit)
   ).

smd_xml_triple(blapsi:Tag,_Attr,Content,SrcURI,rdf(SrcURI,blapsi:Tag,literal(Lit))) :- !,
   insist(Content=[Lit],bad_content(blapsi:Tag,Content,smd_xml_triple)).

%% ao_tag_info(+Tag, +Attr:list, +Content:list, -Pred, -Lit) is nondet.
%  Predicate/literal pairs derived from one child element of a
%  blapsi:audioObject. Primary and secondary identifiers are skipped.
ao_tag_info(blapsi:primaryIdentifier,_,_,_,_) :- !, fail.
ao_tag_info(blapsi:secondaryIdentifier,_,_,_,_) :- !, fail.
ao_tag_info(blapsi:format,_,Content,blapsi:format,Lit) :- !,
   insist(Content=[Lit],bad_content(Content,blapsi:format)).
ao_tag_info(blapsi:face, Attr, Content, Pred, Lit) :- !,
   % Only the 'label' attribute is reported; the face 'ID' attribute
   % (would-be predicate bldata:face_id) is deliberately ignored for now.
   insist(member('label'=Label,Attr)),
   insist(Content=[],non_empty_content(blapsi:face,Content)),
   Pred=bldata:face_label, Lit=Label.
% physicalProperties is a container: each child element with a single text
% node is reported with the child's own tag as predicate.
ao_tag_info(blapsi:physicalProperties, _, Props, Tag, Lit) :- !,
   insist(multi,member(element(Tag,_,Content),Props),no_content(blapsi:physicalProperties)),
   insist(Content=[Lit],bad_content(Tag,Content)).
% Any other child of blapsi:audioObject: warn and fail.
ao_tag_info(Tag,_,_,_,_) :-
   warning(unrecognised_tag(Tag,blapsi:audioObject)).

% identifier_pred('ASR Root ID',asr_root_id).
% identifier_pred('Sound Archive accession number',accession_number).



% -------------- FILE SECTION ---------------------------

%% file_triple(+Tag, +Attr:list, +Content:list, +GroupFileAttrs:pair, +Dirs:list(atom), +ID:atom, -T:rdf) is nondet.
%  Triples for the single child element of a mets:file. GroupFileAttrs is
%  GAttr-FAttr, the attribute lists of the enclosing fileGrp and file.
%
%  This clause handles mets:'FLocat' in two layouts:
%
%  1. The file element has a MIMETYPE attribute. Only audio mimetypes are
%     accepted. The file is tied to the techMD section named by the first
%     word of its AMDID attribute, and yields a mimetype triple, a path
%     triple and a (lower-cased) use triple about bldata:ID/TMDId. USE is
%     taken from the file attributes, falling back to the group attributes.
%  2. No MIMETYPE attribute. The file must have no AMDID; the resource is
%     bldata:ID/FileID. Only links ending in a known three-character audio
%     extension are accepted. Yields a sampled_version_of triple, a path
%     triple and triples from blapsi elements nested inside the FLocat.
file_triple(mets:'FLocat',LocAttr,LocContent,GAttr-FAttr,Dirs,ID,T) :- !,
   (  member('MIMETYPE'=Mime,FAttr)
   -> audio_mimetype(Mime),
      insist(member('AMDID'=AmdRef,FAttr)),
      insist(member('LOCTYPE'='URL',LocAttr)),
      insist(member((xlink:href)=Href,LocAttr)),
      insist(LocContent=[],non_empty(mets:'FLocat',LocContent)),
      insist(member('USE'=RawUse,FAttr);member('USE'=RawUse,GAttr)),
      % AmdRef is sometimes "techMDxx digiprovXX" - need to get rid of second word
      atomic_list_concat([TMDId|_],' ',AmdRef),
      downcase_atom(RawUse,Use),
      Signal = bldata:ID/TMDId,
      (  T=rdf(Signal,dml:mimetype,literal(Mime))
      ;  file_path_triple(Signal,Dirs,Href,T)
      ;  T=rdf(Signal,bldata:use,literal(Use))
      )
   ;  insist(\+member('AMDID'=_,FAttr)),
      insist(member('ID'=FileID,FAttr)),
      insist(member('LOCTYPE'='URL',LocAttr)),
      insist(member((xlink:href)=Href,LocAttr)),
      sub_atom(Href,_,3,0,Ext),
      audio_extension(Ext),
      Signal = bldata:ID/FileID,
      (  T=rdf(Signal,mo:sampled_version_of,bldata:ID)
      ;  file_path_triple(Signal,Dirs,Href,T)
      ;  member(element(Tag,_,Content),LocContent),
         blapsi_triple(Tag,Content,Signal,T)
      )
   ).

% Embedded file content is skipped silently; any other child of mets:file
% produces a warning and fails.
file_triple(mets:'Fcontent',_,_,_,_,_,_) :- !, fail.
file_triple(Tag,_,_,_,_,_,_) :- warning(unrecognised_tag(Tag,file)).

%% file_path_triple(+URI, +Dirs:list(atom), +Link, -T:rdf) is nondet.
%  A bldata:path triple for URI. The path is Dirs joined with "/" followed
%  by the last "/"-separated component of Link.
file_path_triple(URI, Dirs, Link, rdf(URI,bldata:path,literal(Path))) :-
   atomics_to_string(Segments, "/", Link),
   atomics_to_string(Dirs, "/", Dir),
   append(_, [Name], Segments),
   directory_file_path(Dir, Name, Path).

%% audio_mimetype(?Mime:atom) is nondet.
%  Mimetypes accepted as audio by file_triple/7.
audio_mimetype('sound/wav').
audio_mimetype('sound/wma').
audio_mimetype('sound/mp3').
audio_mimetype('sound/ogg').

%% audio_extension(?Ext:atom) is nondet.
%  Link extensions accepted as audio by file_triple/7.
audio_extension(wav).
audio_extension(mp3).
audio_extension(wma).

% ---------------------------- BLAPSI INFO ---------------------------------

%% blapsi_info(+Tag:atom, +Text:atom, -Pred, -Lit) is semidet.
%  Maps the local name Tag of a blapsi element and its text to an RDF
%  predicate and literal value. Numeric fields are parsed with
%  atom_number/2, durations with parse_duration_millis/2, and the bitrate
%  is kept as the raw text.
blapsi_info(file_sample,     Text, mo:sample_rate,     type(xsd:float,Rate))   :- atom_number(Text, Rate).
blapsi_info(file_resolution, Text, mo:bitsPerSample,   type(xsd:int,Bits))     :- atom_number(Text, Bits).
blapsi_info(resolution,      Text, mo:bitsPerSample,   type(xsd:int,Bits))     :- atom_number(Text, Bits).
blapsi_info(file_channels,   Text, mo:channels,        type(xsd:int,Chans))    :- atom_number(Text, Chans).
blapsi_info(file_duration,   Text, mo:duration,        type(xsd:float,Millis)) :- parse_duration_millis(Text, Millis).
blapsi_info(file_bitrate,    Text, blapsi:file_bitrate, Text). % !!! FIXME should be attached to file, not to signal
blapsi_info(file_size,       Text, blapsi:file_size,   type(xsd:int,Bytes))    :- atom_number(Text, Bytes).
blapsi_info(file_length,     Text, blapsi:file_length, type(xsd:int,Bytes))    :- atom_number(Text, Bytes).

% --------------------------- SUPPORTING PREDICATES --------------------------

:- use_module(library(async)).
:- public scrape_all/2.
:- volatile_memo scrape_all(+options:list,-count:nonneg).
%% scrape_all(+Opts:list, -Count:nonneg) is det.
%  Tries to scrape the audio link of every resource that has a dml:blpage
%  property. Count is the number of such resources.
%  Options:
%     * spacing(Seconds)  - pause after each successful scrape (default 1)
%     * timeout(Seconds)  - time limit for each scrape (default 10)
%  Throws scrape_errors(Failures) if any per-resource status differs from ok.
scrape_all(Opts, Count) :-
   option(spacing(Sleep), Opts, 1),
   option(timeout(Timeout), Opts, 10),
   findall(R, rdf(R,dml:blpage,_), Recordings),
   with_progress_stack(
      map_with_progress(scrape_then_sleep(Sleep,Timeout), Recordings, Statuses)),
   exclude(=(ok), Statuses, Failures),
   (  Failures = []
   -> length(Recordings, Count)
   ;  throw(scrape_errors(Failures))
   ).

%% scrape_then_sleep(+Delay, +Timeout, +R, -Status) is det.
%  Status is ok if a link for R is already found by audio_link/2; otherwise
%  the scrape is run under a time limit and Status is whatever memo:reify/2
%  reports. An ex(abort(Reason)) status is re-thrown as abort(Reason) so
%  that the whole run stops.
scrape_then_sleep(Delay, Timeout, R, Status) :-
   (  audio_link(R, _)
   -> Status = ok
   ;  memo:reify(bl_p2r:slow(Delay,call_with_time_limit(Timeout,scrape_audio_link(R,_))), Status),
      (  Status = ex(abort(Reason))
      -> throw(abort(Reason))
      ;  true
      )
   ).

%% slow(+Delay, :Goal) is nondet.
%  Calls Goal and, each time it succeeds, sleeps for Delay seconds.
slow(Delay, Goal) :-
   call(Goal),
   sleep(Delay).

%% has_available_audio(+R:uri) is semidet.
%% has_available_audio(-R:uri) is nondet.
%  True when R is a recording in the BL collection whose audio is
%  publicly available.
has_available_audio(R) :-
   rdf(R, dml:blpage, _),
   scrape_audio_link(R, _).

%% audio_link(?URI, ?AudioURL) is nondet.
%  Runs scrape_audio_link/2 through browse/1.
%  NOTE(review): browse/1 is from library(memo); presumably it consults
%  only already-memoised results - confirm.
:- public audio_link/2.
audio_link(URI, AudioURL) :-
   browse(scrape_audio_link(URI,AudioURL)).

%% audio_file(?URI, -Path, -Format) is nondet.
%  Path is the location, under the audio_root setting, of an mp3 file for
%  URI; Format is always just(mp3). The relative path is taken either from
%  URI itself, when it is typed mo:'Signal', or from a sampled version of
%  URI whose mimetype is 'sound/mp3'.
:- public audio_file/3.
audio_file(URI, Path, just(mp3)) :-
   setting(audio_root, just(Root)),
   (  rdf(URI, bldata:path, literal(RelPath)),
      rdf(URI, rdf:type, mo:'Signal')
   ;  rdf(Version, mo:sampled_version_of, URI),
      rdf(Version, dml:mimetype, literal('sound/mp3')),
      rdf(Version, bldata:path, literal(RelPath))
   ),
   atomic_list_concat([Root,RelPath], '/', Path).

:- volatile_memo scrape_audio_link(+atom,-atom).

%% scrape_audio_link(+URI:atom, -AudioURL) is nondet.
%  Fetches the sounds.bl.uk page linked to URI by dml:blpage and extracts
%  the audio URL from it. The page must be under 'http://sounds.bl.uk/'.
%  The id attribute of the anchor inside the li element of class mainTrack
%  is expected to look like "MNT-<Key>"; AudioURL is
%  'http://sounds.bl.uk/GT/' followed by Key.
scrape_audio_link(URI, AudioURL) :-
   rdf(URI, dml:blpage, PageURL),
   debug(bl_p2r,'Scraping audio link for ~w...',[URI]),
   atom_concat('http://sounds.bl.uk/', _, PageURL),
   http_get(PageURL, Page, []),
   xpath(Page, //li(@class=mainTrack)/a(@id), AnchorId),
   string_concat("MNT-", Key, AnchorId),
   string_concat('http://sounds.bl.uk/GT/', Key, AudioURL).

% Declare the link predicates as safe primitives for library(sandbox).
sandbox:safe_primitive(bl_p2r:audio_link(_,_)).
sandbox:safe_primitive(bl_p2r:scrape_audio_link(_,_)).

%% xpath(+Prop, +E, +Path, -Val) is nondet.
%  Val is property Prop of each node that Path selects in E.
xpath(Prop, E, Path, Val) :-
   xpath(E, Path, Node),
   xpath(Node, /self(Prop), Val).

%% warning(+Term) is failure.
%  Prints Term as a warning message on a fresh line and then fails, so it
%  can be used as the last goal of a "give up on this item" clause.
warning(Term) :-
   nl,
   print_message(warning, Term),
   fail.