Mercurial > hg > dml-open-cliopatria
diff cpack/dml/lib/dml_crawler.pl @ 0:718306e29690 tip
commiting public release
author | Daniel Wolff |
---|---|
date | Tue, 09 Feb 2016 21:05:06 +0100 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpack/dml/lib/dml_crawler.pl Tue Feb 09 21:05:06 2016 +0100 @@ -0,0 +1,445 @@ +/* Part of DML (Digital Music Laboratory) + Copyright 2014-2015 Samer Abdallah, University of London + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +:- module(dml_crawler, []). + +:- use_module(library(http/html_write)). +:- use_module(library(semweb/rdf_db)). +:- use_module(library(semweb/rdfs)). +:- use_module(library(semweb/rdf_label)). +:- use_module(library(sparkle)). +:- use_module(library(crawler)). +:- use_module(library(musicbrainz)). +:- use_module(library(memo)). +:- use_module(applications(browse)). +:- use_module(api(lod_crawler)). +:- use_module(cliopatria(hooks)). + +:- sparql_endpoint(mb, 'http://dbtune.org/musicbrainz/sparql'). +:- sparql_endpoint(lb, 'http://linkedbrainz.org/sparql/'). +:- sparql_endpoint(dbp, 'http://dbpedia.org/sparql/'). +:- sparql_endpoint(self, 'http://localhost:3020/sparql/'). + +% :- sparql_endpoint(bbc, 'http://dbtune.org/bbc/programmes/sparql/'). +% :- sparql_endpoint(peel,'http://dbtune.org/bbc/peel/sparql/'), +% :- sparql_endpoint(classical,'http://dbtune.org/classical/sparql/'), +% :- sparql_endpoint(jamendo,'http://dbtune.org/jamendo/sparql/'), +% :- sparql_endpoint(magnatune,'http://dbtune.org/magnatune/sparql/'), +% :- sparql_endpoint(henry,'http://dbtune.org/henry/sparql/'), + + +crawler:source(dbp, dml_crawler:sparql_crawler(dbp), []). +crawler:source(lbz, dml_crawler:sparql_crawler(lb), []). +crawler:source(mbz, dml_crawler:mbz_crawler, []). +crawler:source(lod, dml_crawler:lod_crawler, []). + +crawler:authority( begins('http://dbpedia.org'), dbp, [auto(true)]). +crawler:authority( begins('http://musicbrainz.org'), lbz, [auto(true)]). +crawler:authority( begins('http://musicbrainz.org'), mbz, [auto(true)]). +crawler:authority( begins('http://id.loc.gov/vocabulary'), lod, [auto(true)]). +crawler:authority( begins('http://yago-knowledge.org'), lod, [auto(true)]). +crawler:authority( ( (begins('http://'); begins('https://')), + \+begins('http://dbpedia.org/'), + \+begins('http://musicbrainz.org/'), + \+begins('http://dml.org/'), + \+begins('http://sounds.bl.uk/') + ), lod, [auto(false)]). + +:- volatile_memo instrument(+label:atom,-uri:atom). + +instrument(Label,Instr) :- + rdf(Instr,skos:prefLabel,literal(exact(Label),_)). + +:- rdf_meta censor(+,t). +censor(lb, rdf(_,'http://purl.org/muto/core#taggedResource',_)). +censor(lb, rdf(_,mo:musicbrainz_guid,_)). +censor(lb, rdf(_,foaf:made,_)). + +% not needed after all: it was me giving the recordings type 'Signal', not lb. +% sparql_crawler:modify(lb,Tin,Tout) :- +% debug(sparql_crawler,"checking ~q",[Tin]), +% once(relink_brainz(Tin,Tout)). + +% :- rdf_meta relink_brainz(t,t). +% relink_brainz(rdf(MBZRecording,rdf:type,mo:'Track'),rdf(MBZRecording,rdf:type,mo:'Signal')):- +% mb_id_uri(recording,_,MBZRecording). + +% connects patched ClioPatria resource viewer to sparql_crawler module +cliopatria:resource_crawler(URI,NT) --> html(div(class(crawler),\crawl_ui(URI,NT))). + +% --------- SPARQL crawler using sparkle --------------- + +sparql_crawler(dbp,name('DBPedia')). +sparql_crawler(lb,name('LinkedBrainz')). + +sparql_crawler(EP,uri_graph(_,Graph)) :- + current_sparql_endpoint(EP,Host,Port,Path,_), + parse_url(Graph,[protocol(http),host(Host),port(Port),path(Path)]). + +sparql_crawler(EP,describe(URI,Triple)) :- + describe(EP,URI,Triple), + \+censor(EP,Triple). + + +describe(EP,URI,rdf(Subj,Pred,URI)) :- EP ?? rdf(Subj,Pred,URI). +describe(EP,URI,rdf(URI,Pred,Obj)) :- EP ?? rdf(URI,Pred,Obj). +% !!! FIXME This was to slow for URIs with many linked resources +% Need to get asynchronous crawling working first.. +% describe(EP,URI,rdf(Subj,Pred,Obj)) :- +% (EP ?? rdf(URI,P1,O1)), +% ( Subj=URI, Pred=P1, Obj=O1 +% ; Subj=O1, Pred=P2, Obj=O2, +% O1\=literal(_), +% (EP ?? rdf(O1,P2,O2)) +% ). + +% ----------- LOD crawler --------------------------- +lod_crawler(name('LOD Cloud')). +lod_crawler(uri_graph(URI,Graph)) :- + uri_components(URI, uri_components(Sch,Auth,_,_,_)), + uri_components(Graph, uri_components(Sch,Auth,_,_,_)). + +lod_crawler(crawl(URI,Graph)) :- + lod_uri_graph(URI,URL), + rdf_load(URL,[graph(Graph)]). + +% ----------- MusicBrainz crawler ------------------- +mbz_crawler(uri_graph(_,'http://musicbrainz.org/ws/2')). +mbz_crawler(name('MusicBrainz')). +mbz_crawler(describe(URI,Triple)) :- + debug(crawler,'Doing ~q.',[mbz_crawler(describe(URI,Triple))]), + ( mb_id_uri(Type,_,URI) -> Context=Type + ; event_uri(URI,EvType,BaseURI,Extra), + mb_id_uri(BaseType,_,BaseURI) + -> Context=event(BaseType,BaseURI,EvType,Extra) + ; debug(crawler,'Unrecognised URI: ~w',[URI]), + fail + ), + mbz_triple(Context,URI,Triple1), + rdf_global_term(Triple1,Triple). + + +mbz_triple(artist,URI,rdf(URI,rdf:type,mo:'MusicArtist')). +mbz_triple(artist,URI,Triple) :- + mb_lookup(URI,[inc([aliases]), rels([recording,work,artist])],Artist), + subject_triple(Artist-URI,artist,Triple). + +mbz_triple(recording,URI,rdf(URI,rdf:type,mo:'Signal')). +mbz_triple(recording,URI,rdf(Event,rdf:type,mo:'Recording')) :- + event_uri(Event,production,URI). + +mbz_triple(recording,URI,Triple) :- + mb_lookup(URI,[inc([artists,'artist-credits']),rels([artist,work])],Recording), + event_uri(Event,production,URI), + subject_triple(Recording-URI, recording(Event), Triple). + +mbz_triple(work,URI,rdf(URI,rdf:type,mo:'MusicalWork')). +mbz_triple(work,URI,Triple) :- + mb_lookup(URI,[inc([aliases]),rels([recording,artist,work])],Work), + subject_triple(Work-URI, work, Triple). + +mbz_triple(event(work,_,composition,''),Event,rdf(Event,rdf:type:mo:'Composition')). +mbz_triple(event(work,W,composition,''),Event,Triple) :- + mb_lookup(W,[rels([artist,work])],Work), % need work rels here? + event_uri(Event,composition,WorkURI), + subject_triple(Work-WorkURI, work, Triple). + +mbz_triple(event(recording,Signal,production,''),Event,Triple) :- + production_triple(Signal,Event,Triple). + +production_triple(_, Ev, rdf(Ev,rdf:type,mo:'Performance')). +production_triple(_, Ev, rdf(Ev,rdf:type,mo:'Recording')). +production_triple(S, Ev, rdf(Ev,mo:produced_signal,S)). +production_triple(S, Ev, T) :- + mb_lookup(S,[inc([artists,'artist-credits']),rels([work,artist])],Recording), + mb_uri(Recording,Signal), + subject_triple(Recording-Signal,recording(Ev),T). + +%% subject_triple( +Subj:pair(element,resource), +Content:ground, -T:triple) is nondet. +% Produce triples relating to this subject without doing any more musicbrainz queries. + +% first deal with the non-relation facets +subject_triple(E-URI, Context, T) :- + dif(Facet,relation(_,_)), + mb_facet(E,Facet), + facet_triple(URI, Context, Facet, T). + +% then deal with relations +subject_triple(E1-URI1, _Context, T) :- + mb_relation(E1, E2, Name, Dir, Opts), + mb_uri(E2,URI2), + ( mb_class(E2,C2), + subject_triple(E2-URI2, C2, T) % do all facets of related object + ; normalise_direction(Dir,URI1,URI2,URI_A,URI_B), + relation_triple(Name,URI_A,URI_B,Opts,T) + ). + +normalise_direction(fwd,E1,E2,E1,E2). +normalise_direction(bwd,E1,E2,E2,E1). + + +facet_triple(_, _, credit(A), T) :- mb_uri(A,Agent), subject_triple(A-Agent,artist,T). +facet_triple(URI, _, title(T), rdf(URI,dc:title,literal(T))). +facet_triple(URI, artist, name(N), rdf(URI,foaf:name,literal(N))). +facet_triple(URI, artist, alias(A), rdf(URI,dml:alias,literal(A))). +facet_triple(URI, artist, gender(G), rdf(URI,foaf:gender,literal(GG))) :- downcase_atom(G,GG). +facet_triple(URI, artist, type(Type), rdf(URI,rdf:type,foaf:Type)) :- member(Type,['Person','Group']). +facet_triple(URI, artist, born(X), T) :- life_event_triple(URI,birth,time(X),T). +facet_triple(URI, artist, died(X), T) :- life_event_triple(URI,death,time(X),T). +facet_triple(URI, artist, birth_place(X), T) :- life_event_triple(URI,birth,area(X),T). +facet_triple(URI, artist, death_place(X), T) :- life_event_triple(URI,death,area(X),T). + +facet_triple(_, recording(E), credit(A), rdf(E,event:agent,Agent)) :- mb_uri(A,Agent). +facet_triple(URI, recording(_), length(L), rdf(URI,mo:duration,literal(type(xsd:float,LenA)))) :- + atom_number(LenA,L). + +life_event_triple(Agent,Type,Property,T) :- + event_uri(Event,Type,Agent), + ( T=rdf(Event,rdf:type,event:'Event') + ; T=rdf(Event,event:agent,Agent) + ; life_event_property_triple(Event,Property,T) + ). + + +life_event_property_triple(Event,area(A),rdf(Event,event:place,URI)) :- mb_uri(A,URI). +life_event_property_triple(Event,time(T),Triple) :- + prefix_uri('/time',Event,Time), + ( Triple=rdf(Event,event:time,Time) + ; time_instant_triple(Time-T,Triple) + ). + +% ----------------------------------------------------------------------------- +% relation_triple + +relation_triple(parts, Whole, Part, _, rdf(Part,dml:part_of,Whole)). +relation_triple(composer, Agent, Work, _, rdf(Work,dml:composer,Agent)). +relation_triple('is person', Agent, Person, _, rdf(Agent,dml:persona_of,Person)). +relation_triple('performing orchestra', _, Group, _, rdf(Group,rdf:type,mo:'Orchestra')). +relation_triple(Name, _, Group, _, rdf(Group,rdf:type,mo:'MusicGroup')) :- membership_role(Name). +relation_triple(Name, Agent, Group, _, rdf(Group,mo:member,Agent)) :- membership_role(Name). + +relation_triple(Role, Agent, _, _, rdf(Agent,rdf:type,mo:'MusicArtist')) :- + musical_role(Role). +relation_triple(Name, R1, R2, Opts, T) :- + relation_event(Name, R1, R2, Opts, Event, Relation), + relation_event_triple(Relation, Event, T). + +relation_event( based_on, Orig, Deriv, _, Ev, based_on(Orig)) :- event_uri(Ev,composition,Deriv). +relation_event( composer, A, W, O, Ev, composition(composer,A,W,O)) :- event_uri(Ev,composition,W). +relation_event( lyricist, A, W, _, Ev, composition(lyricist,A,W,[])) :- event_uri(Ev,composition,W). +relation_event( writer, A, W, _, Ev, composition(writer,A,W,[])) :- event_uri(Ev,composition,W). +relation_event( arrangement, W1, W2, O, Ev, arrangement(W1,W2,O)) :- event_uri(Ev,arrangement,W2). +relation_event( performance, Sig, W, O, Ev, performance(W,O)) :- event_uri(Ev,production,Sig). +relation_event( performance, Sig, _, _, Ev, recording(Sig)) :- event_uri(Ev,production,Sig). +relation_event( performer, _, Sig, _, Ev, recording(Sig)) :- event_uri(Ev,production,Sig). +relation_event( 'performing orchestra', _, Sig, _, Ev, recording(Sig)) :- event_uri(Ev,production,Sig). +relation_event( vocal, Ag, Sig, O, Ev, ED) :- relation_event( instrument, Ag, Sig, [attribute(voice)|O], Ev, ED). +relation_event( 'instrument arranger', Ag, Sig, O, Ev, ED) :- relation_event(instrument,Ag,Sig,O,Ev,ED). +relation_event( 'instrument arranger', Ag, Sig, O, Ev, ED) :- relation_event(arranger,Ag,Sig,O,Ev,ED). +relation_event( instrument, _, Sig, _, Ev, recording(Sig)) :- event_uri(Ev,production,Sig). +relation_event( instrument, Ag, Sig, O, Ev, instrument(Prod,Ag,O)) :- + mb_id_uri(_,PerformerID,Ag), + event_uri(Prod,production,Sig), + event_uri(Ev,performance,Sig,PerformerID). +relation_event( Name, Agent, Sig, _, Ev, role(Pred,Agent)) :- + production_role(Name,Pred), + mb_id_uri(recording,_,Sig), + event_uri(Ev,production,Sig). + +relation_event( Name, Agent, Group, Opts, Ev, membership(Agent,Group,[role(Role)|Opts])) :- + membership_role(Name,Role), + mb_id_uri(_,AID,Agent), + term_hash(t(AID,Role,Opts),Hash), + number_string(Hash,HashString), + event_uri(Ev,membership,Group,HashString). + +production_role(producer,mo:producer). +production_role(arranger,mo:arranger). +production_role(conductor,mo:conductor). +production_role(performer,mo:performer). +production_role('performing orchestra',mo:performer). +production_role(vocal,mo:performer). + +membership_role(Name) :- membership_role(Name,_). +membership_role('member of band',member). +membership_role('conductor position',conductor). +membership_role('vocal supporting musician',vocal_support). + +musical_role(composer). +musical_role(arranger). +musical_role(lyricist). +musical_role(instrument). +musical_role(performer). +musical_role('performing orchestra'). +musical_role('vocal supporting musician'). +musical_role(Name) :- membership_role(Name). + +relation_event_triple( based_on(_), Ev, rdf(Ev,rdf:type,mo:'Composition')). +relation_event_triple( based_on(Orig), Ev, rdf(Ev,event:factor,Orig)). + +relation_event_triple( membership(_,_,_), Ev, rdf(Ev,rdf:type,mo:'Membership')). +relation_event_triple( membership(A,_,_), Ev, rdf(Ev,mo:artist,A)). +relation_event_triple( membership(_,G,_), Ev, rdf(Ev,mo:group,G)). +relation_event_triple( membership(_,_,O), Ev, T) :- membership_triple(Ev,O,T). +relation_event_triple( membership(_,_,O), Ev, T) :- event_time_triple(Ev,O,T). + +relation_event_triple( composition(_,_,_,_), Ev, rdf(Ev,rdf:type,mo:'Composition')). +relation_event_triple( composition(_,_,W,_), Ev, rdf(Ev,mo:produced_work,W)). +relation_event_triple( composition(R,A,_,_), Ev, rdf(Ev,mo:R,A)). +relation_event_triple( composition(_,_,_,O), Ev, T) :- event_time_triple(Ev,O,T). + +relation_event_triple( arrangement(_,_,_), Ev, rdf(Ev,rdf:type,mo:'Arrangement')). +relation_event_triple( arrangement(W,_,_), Ev, rdf(Ev,mo:arrangement_of,W)). +relation_event_triple( arrangement(_,W,_), Ev, rdf(Ev,mo:produced_work,W)). +relation_event_triple( arrangement(_,_,O), Ev, T) :- event_time_triple(Ev,O,T). + +relation_event_triple( performance(_,_), Ev, rdf(Ev,rdf:type,mo:'Performance')). +relation_event_triple( performance(_,O), Ev, T) :- event_time_triple(Ev,O,T). +relation_event_triple( performance(W,O), Ev, rdf(Ev,Pred,W)) :- + ( member(attribute(partial),O) + -> Pred=mo:partial_performance_of + ; Pred=mo:performance_of + ). + +relation_event_triple( recording(_), Ev, rdf(Ev,rdf:type,mo:'Recording')). +relation_event_triple( recording(S), Ev, rdf(Ev,mo:produced_signal,S)). + +relation_event_triple( instrument(_,_,_), Ev, rdf(Ev,rdf:type,mo:'Performance')). +relation_event_triple( instrument(_,A,_), Ev, rdf(Ev,mo:performer,A)). +relation_event_triple( instrument(P,_,_), Ev, rdf(P,event:sub_event,Ev)). +relation_event_triple( instrument(_,_,O), Ev, T) :- event_time_triple(Ev,O,T). +% relation_event_triple( instrument(_,_,O), Ev, rdf(Ev,mo:instrument,literal(Instr))) :- +% member(attribute(Instr),O). +relation_event_triple( instrument(_,_,O), Ev, rdf(Ev,mo:instrument,Instr)) :- + member(attribute(Label),O), + ( instrument(Label,Instr) -> true + ; Instr=literal(Label) + ). + +membership_triple(E, O, rdf(E,dml:role,literal(R))) :- member(role(R),O), R\=member. +membership_triple(E, O, rdf(E,dml:modifier,literal(A))) :- member(attribute(A),O). + +event_time_triple(Event,Opts,T) :- + (memberchk(begin(_),Opts); memberchk(end(_),Opts); memberchk(in(_),Opts)), + prefix_uri('/time',Event,Time), + ( T=rdf(Event,event:time,Time) + ; ( member(in(DT),Opts) + -> time_instant_triple(Time-DT,T) + ; ( member(begin(DT),Opts), prefix_uri('/begin',Time,Pt), Pred=time:hasBeginning + ; member(end(DT),Opts), prefix_uri('/end',Time,Pt), Pred=time:hasEnd + ), + time_interval_triple(Time,Pred,Pt-DT,T) + ) + ). + +time_instant_triple(Time-_, rdf(Time,rdf:type,time:'Instant')). +time_instant_triple(Time-DT, rdf(Time,time:inXSDDateTime,literal(type(xsd:dateTime,DT)))). + +time_interval_triple(T, _, _, rdf(T,rdf:type, time:'Interval')). +time_interval_triple(T, Pred, Pt-_, rdf(T,Pred,Pt)). +time_interval_triple(_, _, PtDT, T) :- time_instant_triple(PtDT,T). + + +%% event_uri(-EventURI,+Type,+BaseURI) is det. +%% event_uri(-EventURI,+Type,+BaseURI) is det. +event_uri(EventURI,Type,BaseURI) :- event_uri(EventURI,Type,BaseURI,''). + +%% event_uri(+EventURI,-Type,-BaseURI,-Extra) is det. +%% event_uri(-EventURI,+Type,+BaseURI,+Extra) is det. +event_uri(EventURI,Type,BaseURI,Extra) :- + ( var(EventURI) + -> uri_components(BaseURI,uri_components(Sc,Ho,Pa,Extra,'_')), + uri_components(EventURI,uri_components(Sc,Ho,Pa,Extra,Type)), Type\='_' + ; uri_components(EventURI,uri_components(Sc,Ho,Pa,Extra,Type)), Type\='_', + uri_components(BaseURI,uri_components(Sc,Ho,Pa,Extra,'_')) + ). + +% prefix_uri(_,_,N) :- rdf_bnode(N), !. +prefix_uri(Prefix,URI,PURI) :- + uri_components(URI,uri_components(Sc,Ho,Pa,Extra,Type)), atom_concat(Prefix,Pa,TPa), + uri_components(PURI,uri_components(Sc,Ho,TPa,Extra,Type)). + +% -------------- display hooks --------------- +:- rdf_meta label(r,r,-). +label(time:'Instant',URI, Label) :- + ( rdf_has(URI,time:inXSDDateTime,literal(type(xsd:dateTime,DT))) + -> format(string(Label),'~w',[DT]) + ; Label='<unknown>' + ). + +label(time:'Interval',URI, Label) :- + (rdf_has(URI,time:hasBeginning,Begin) -> rdf_display_label(Begin,L1); L1='<unknown>'), + (rdf_has(URI,time:hasEnd,End) -> rdf_display_label(End,L2); L2='<unknown>'), + ( L1=L2 + -> format(string(Label),'~s',[L1]) + ; format(string(Label),'~s to ~s',[L1,L2]) + ). + +label(mo:'Recording',Ev, Label) :- + rdf_has(Ev,mo:produced_signal,Sig), !, + rdf_display_label(Sig,Title), + ( rdf_has(Ev,event:agent,Agent) + -> rdf_display_label(Agent,Name), + format(string(Label),'Recording of ~w by ~w',[Title,Name]) + ; format(string(Label),'Recording of ~w',[Title]) + ). + +label(mo:'Membership',Ev, Label) :- + rdf_has(Ev,mo:artist,A), + rdf_has(Ev,mo:group,G), !, + rdf_display_label(A,AL), + rdf_display_label(G,GL), + format(string(Label),'Membership of ~w in ~w',[AL,GL]). + +label(mo:'Performance',Ev, Label) :- + ( rdf_has(Ev,mo:produced_signal,Sig) + ; rdf_has(SEv,event:sub_event,Ev), + rdf_has(SEv,mo:produced_signal,Sig) + ), !, + rdf_display_label(Sig,Title), + rdf_has(Ev,mo:performer,Performer), rdf_display_label(Performer,Name), + rdf_has(Ev,mo:instrument,Instr), rdf_display_label(Instr,IName), + format(string(Label),'~w playing ~w on ~w',[Name,IName,Title]). + + +label(mo:'Composition',E,Label) :- + rdf_has(E,mo:produced_work,Work), + rdfs_individual_of(Work,mo:'MusicalWork'), !, + rdf_display_label(Work,Title), + format(string(Label),'Composition of ~w',[Title]). + +rdf_label:display_label_hook(URI, _, Label) :- + rdf(URI,rdf:type,Class), + label(Class,URI,Label). + +rdf_label:display_label_hook(URI, _, Label) :- + atom(URI), + sub_atom(URI,0,_,_,'http://musicbrainz.org'), + event_uri(URI,EvType,Base), + event_label(EvType,URI,Base,Label). + +event_label(birth,_,Person,Label) :- + rdfs_individual_of(Person,foaf:'Person'), !, + rdf_display_label(Person,Name), + format(string(Label),'Birth of ~w',[Name]). +event_label(death,_,Person,Label) :- + rdfs_individual_of(Person,foaf:'Person'), !, + rdf_display_label(Person,Name), + format(string(Label),'Death of ~w',[Name]). + +