Mercurial > hg > dml-open-cliopatria
view cpack/dml/lib/dml_crawler.pl @ 0:718306e29690 tip
commiting public release
author | Daniel Wolff |
---|---|
date | Tue, 09 Feb 2016 21:05:06 +0100 |
parents | |
children |
line wrap: on
line source
/* Part of DML (Digital Music Laboratory) Copyright 2014-2015 Samer Abdallah, University of London This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ :- module(dml_crawler, []). :- use_module(library(http/html_write)). :- use_module(library(semweb/rdf_db)). :- use_module(library(semweb/rdfs)). :- use_module(library(semweb/rdf_label)). :- use_module(library(sparkle)). :- use_module(library(crawler)). :- use_module(library(musicbrainz)). :- use_module(library(memo)). :- use_module(applications(browse)). :- use_module(api(lod_crawler)). :- use_module(cliopatria(hooks)). :- sparql_endpoint(mb, 'http://dbtune.org/musicbrainz/sparql'). :- sparql_endpoint(lb, 'http://linkedbrainz.org/sparql/'). :- sparql_endpoint(dbp, 'http://dbpedia.org/sparql/'). :- sparql_endpoint(self, 'http://localhost:3020/sparql/'). % :- sparql_endpoint(bbc, 'http://dbtune.org/bbc/programmes/sparql/'). % :- sparql_endpoint(peel,'http://dbtune.org/bbc/peel/sparql/'), % :- sparql_endpoint(classical,'http://dbtune.org/classical/sparql/'), % :- sparql_endpoint(jamendo,'http://dbtune.org/jamendo/sparql/'), % :- sparql_endpoint(magnatune,'http://dbtune.org/magnatune/sparql/'), % :- sparql_endpoint(henry,'http://dbtune.org/henry/sparql/'), crawler:source(dbp, dml_crawler:sparql_crawler(dbp), []). crawler:source(lbz, dml_crawler:sparql_crawler(lb), []). crawler:source(mbz, dml_crawler:mbz_crawler, []). crawler:source(lod, dml_crawler:lod_crawler, []). crawler:authority( begins('http://dbpedia.org'), dbp, [auto(true)]). crawler:authority( begins('http://musicbrainz.org'), lbz, [auto(true)]). crawler:authority( begins('http://musicbrainz.org'), mbz, [auto(true)]). crawler:authority( begins('http://id.loc.gov/vocabulary'), lod, [auto(true)]). crawler:authority( begins('http://yago-knowledge.org'), lod, [auto(true)]). crawler:authority( ( (begins('http://'); begins('https://')), \+begins('http://dbpedia.org/'), \+begins('http://musicbrainz.org/'), \+begins('http://dml.org/'), \+begins('http://sounds.bl.uk/') ), lod, [auto(false)]). :- volatile_memo instrument(+label:atom,-uri:atom). instrument(Label,Instr) :- rdf(Instr,skos:prefLabel,literal(exact(Label),_)). :- rdf_meta censor(+,t). censor(lb, rdf(_,'http://purl.org/muto/core#taggedResource',_)). censor(lb, rdf(_,mo:musicbrainz_guid,_)). censor(lb, rdf(_,foaf:made,_)). % not needed after all: it was me giving the recordings type 'Signal', not lb. % sparql_crawler:modify(lb,Tin,Tout) :- % debug(sparql_crawler,"checking ~q",[Tin]), % once(relink_brainz(Tin,Tout)). % :- rdf_meta relink_brainz(t,t). % relink_brainz(rdf(MBZRecording,rdf:type,mo:'Track'),rdf(MBZRecording,rdf:type,mo:'Signal')):- % mb_id_uri(recording,_,MBZRecording). % connects patched ClioPatria resource viewer to sparql_crawler module cliopatria:resource_crawler(URI,NT) --> html(div(class(crawler),\crawl_ui(URI,NT))). % --------- SPARQL crawler using sparkle --------------- sparql_crawler(dbp,name('DBPedia')). sparql_crawler(lb,name('LinkedBrainz')). sparql_crawler(EP,uri_graph(_,Graph)) :- current_sparql_endpoint(EP,Host,Port,Path,_), parse_url(Graph,[protocol(http),host(Host),port(Port),path(Path)]). sparql_crawler(EP,describe(URI,Triple)) :- describe(EP,URI,Triple), \+censor(EP,Triple). describe(EP,URI,rdf(Subj,Pred,URI)) :- EP ?? rdf(Subj,Pred,URI). describe(EP,URI,rdf(URI,Pred,Obj)) :- EP ?? rdf(URI,Pred,Obj). % !!! FIXME This was to slow for URIs with many linked resources % Need to get asynchronous crawling working first.. % describe(EP,URI,rdf(Subj,Pred,Obj)) :- % (EP ?? rdf(URI,P1,O1)), % ( Subj=URI, Pred=P1, Obj=O1 % ; Subj=O1, Pred=P2, Obj=O2, % O1\=literal(_), % (EP ?? rdf(O1,P2,O2)) % ). % ----------- LOD crawler --------------------------- lod_crawler(name('LOD Cloud')). lod_crawler(uri_graph(URI,Graph)) :- uri_components(URI, uri_components(Sch,Auth,_,_,_)), uri_components(Graph, uri_components(Sch,Auth,_,_,_)). lod_crawler(crawl(URI,Graph)) :- lod_uri_graph(URI,URL), rdf_load(URL,[graph(Graph)]). % ----------- MusicBrainz crawler ------------------- mbz_crawler(uri_graph(_,'http://musicbrainz.org/ws/2')). mbz_crawler(name('MusicBrainz')). mbz_crawler(describe(URI,Triple)) :- debug(crawler,'Doing ~q.',[mbz_crawler(describe(URI,Triple))]), ( mb_id_uri(Type,_,URI) -> Context=Type ; event_uri(URI,EvType,BaseURI,Extra), mb_id_uri(BaseType,_,BaseURI) -> Context=event(BaseType,BaseURI,EvType,Extra) ; debug(crawler,'Unrecognised URI: ~w',[URI]), fail ), mbz_triple(Context,URI,Triple1), rdf_global_term(Triple1,Triple). mbz_triple(artist,URI,rdf(URI,rdf:type,mo:'MusicArtist')). mbz_triple(artist,URI,Triple) :- mb_lookup(URI,[inc([aliases]), rels([recording,work,artist])],Artist), subject_triple(Artist-URI,artist,Triple). mbz_triple(recording,URI,rdf(URI,rdf:type,mo:'Signal')). mbz_triple(recording,URI,rdf(Event,rdf:type,mo:'Recording')) :- event_uri(Event,production,URI). mbz_triple(recording,URI,Triple) :- mb_lookup(URI,[inc([artists,'artist-credits']),rels([artist,work])],Recording), event_uri(Event,production,URI), subject_triple(Recording-URI, recording(Event), Triple). mbz_triple(work,URI,rdf(URI,rdf:type,mo:'MusicalWork')). mbz_triple(work,URI,Triple) :- mb_lookup(URI,[inc([aliases]),rels([recording,artist,work])],Work), subject_triple(Work-URI, work, Triple). mbz_triple(event(work,_,composition,''),Event,rdf(Event,rdf:type:mo:'Composition')). mbz_triple(event(work,W,composition,''),Event,Triple) :- mb_lookup(W,[rels([artist,work])],Work), % need work rels here? event_uri(Event,composition,WorkURI), subject_triple(Work-WorkURI, work, Triple). mbz_triple(event(recording,Signal,production,''),Event,Triple) :- production_triple(Signal,Event,Triple). production_triple(_, Ev, rdf(Ev,rdf:type,mo:'Performance')). production_triple(_, Ev, rdf(Ev,rdf:type,mo:'Recording')). production_triple(S, Ev, rdf(Ev,mo:produced_signal,S)). production_triple(S, Ev, T) :- mb_lookup(S,[inc([artists,'artist-credits']),rels([work,artist])],Recording), mb_uri(Recording,Signal), subject_triple(Recording-Signal,recording(Ev),T). %% subject_triple( +Subj:pair(element,resource), +Content:ground, -T:triple) is nondet. % Produce triples relating to this subject without doing any more musicbrainz queries. % first deal with the non-relation facets subject_triple(E-URI, Context, T) :- dif(Facet,relation(_,_)), mb_facet(E,Facet), facet_triple(URI, Context, Facet, T). % then deal with relations subject_triple(E1-URI1, _Context, T) :- mb_relation(E1, E2, Name, Dir, Opts), mb_uri(E2,URI2), ( mb_class(E2,C2), subject_triple(E2-URI2, C2, T) % do all facets of related object ; normalise_direction(Dir,URI1,URI2,URI_A,URI_B), relation_triple(Name,URI_A,URI_B,Opts,T) ). normalise_direction(fwd,E1,E2,E1,E2). normalise_direction(bwd,E1,E2,E2,E1). facet_triple(_, _, credit(A), T) :- mb_uri(A,Agent), subject_triple(A-Agent,artist,T). facet_triple(URI, _, title(T), rdf(URI,dc:title,literal(T))). facet_triple(URI, artist, name(N), rdf(URI,foaf:name,literal(N))). facet_triple(URI, artist, alias(A), rdf(URI,dml:alias,literal(A))). facet_triple(URI, artist, gender(G), rdf(URI,foaf:gender,literal(GG))) :- downcase_atom(G,GG). facet_triple(URI, artist, type(Type), rdf(URI,rdf:type,foaf:Type)) :- member(Type,['Person','Group']). facet_triple(URI, artist, born(X), T) :- life_event_triple(URI,birth,time(X),T). facet_triple(URI, artist, died(X), T) :- life_event_triple(URI,death,time(X),T). facet_triple(URI, artist, birth_place(X), T) :- life_event_triple(URI,birth,area(X),T). facet_triple(URI, artist, death_place(X), T) :- life_event_triple(URI,death,area(X),T). facet_triple(_, recording(E), credit(A), rdf(E,event:agent,Agent)) :- mb_uri(A,Agent). facet_triple(URI, recording(_), length(L), rdf(URI,mo:duration,literal(type(xsd:float,LenA)))) :- atom_number(LenA,L). life_event_triple(Agent,Type,Property,T) :- event_uri(Event,Type,Agent), ( T=rdf(Event,rdf:type,event:'Event') ; T=rdf(Event,event:agent,Agent) ; life_event_property_triple(Event,Property,T) ). life_event_property_triple(Event,area(A),rdf(Event,event:place,URI)) :- mb_uri(A,URI). life_event_property_triple(Event,time(T),Triple) :- prefix_uri('/time',Event,Time), ( Triple=rdf(Event,event:time,Time) ; time_instant_triple(Time-T,Triple) ). % ----------------------------------------------------------------------------- % relation_triple relation_triple(parts, Whole, Part, _, rdf(Part,dml:part_of,Whole)). relation_triple(composer, Agent, Work, _, rdf(Work,dml:composer,Agent)). relation_triple('is person', Agent, Person, _, rdf(Agent,dml:persona_of,Person)). relation_triple('performing orchestra', _, Group, _, rdf(Group,rdf:type,mo:'Orchestra')). relation_triple(Name, _, Group, _, rdf(Group,rdf:type,mo:'MusicGroup')) :- membership_role(Name). relation_triple(Name, Agent, Group, _, rdf(Group,mo:member,Agent)) :- membership_role(Name). relation_triple(Role, Agent, _, _, rdf(Agent,rdf:type,mo:'MusicArtist')) :- musical_role(Role). relation_triple(Name, R1, R2, Opts, T) :- relation_event(Name, R1, R2, Opts, Event, Relation), relation_event_triple(Relation, Event, T). relation_event( based_on, Orig, Deriv, _, Ev, based_on(Orig)) :- event_uri(Ev,composition,Deriv). relation_event( composer, A, W, O, Ev, composition(composer,A,W,O)) :- event_uri(Ev,composition,W). relation_event( lyricist, A, W, _, Ev, composition(lyricist,A,W,[])) :- event_uri(Ev,composition,W). relation_event( writer, A, W, _, Ev, composition(writer,A,W,[])) :- event_uri(Ev,composition,W). relation_event( arrangement, W1, W2, O, Ev, arrangement(W1,W2,O)) :- event_uri(Ev,arrangement,W2). relation_event( performance, Sig, W, O, Ev, performance(W,O)) :- event_uri(Ev,production,Sig). relation_event( performance, Sig, _, _, Ev, recording(Sig)) :- event_uri(Ev,production,Sig). relation_event( performer, _, Sig, _, Ev, recording(Sig)) :- event_uri(Ev,production,Sig). relation_event( 'performing orchestra', _, Sig, _, Ev, recording(Sig)) :- event_uri(Ev,production,Sig). relation_event( vocal, Ag, Sig, O, Ev, ED) :- relation_event( instrument, Ag, Sig, [attribute(voice)|O], Ev, ED). relation_event( 'instrument arranger', Ag, Sig, O, Ev, ED) :- relation_event(instrument,Ag,Sig,O,Ev,ED). relation_event( 'instrument arranger', Ag, Sig, O, Ev, ED) :- relation_event(arranger,Ag,Sig,O,Ev,ED). relation_event( instrument, _, Sig, _, Ev, recording(Sig)) :- event_uri(Ev,production,Sig). relation_event( instrument, Ag, Sig, O, Ev, instrument(Prod,Ag,O)) :- mb_id_uri(_,PerformerID,Ag), event_uri(Prod,production,Sig), event_uri(Ev,performance,Sig,PerformerID). relation_event( Name, Agent, Sig, _, Ev, role(Pred,Agent)) :- production_role(Name,Pred), mb_id_uri(recording,_,Sig), event_uri(Ev,production,Sig). relation_event( Name, Agent, Group, Opts, Ev, membership(Agent,Group,[role(Role)|Opts])) :- membership_role(Name,Role), mb_id_uri(_,AID,Agent), term_hash(t(AID,Role,Opts),Hash), number_string(Hash,HashString), event_uri(Ev,membership,Group,HashString). production_role(producer,mo:producer). production_role(arranger,mo:arranger). production_role(conductor,mo:conductor). production_role(performer,mo:performer). production_role('performing orchestra',mo:performer). production_role(vocal,mo:performer). membership_role(Name) :- membership_role(Name,_). membership_role('member of band',member). membership_role('conductor position',conductor). membership_role('vocal supporting musician',vocal_support). musical_role(composer). musical_role(arranger). musical_role(lyricist). musical_role(instrument). musical_role(performer). musical_role('performing orchestra'). musical_role('vocal supporting musician'). musical_role(Name) :- membership_role(Name). relation_event_triple( based_on(_), Ev, rdf(Ev,rdf:type,mo:'Composition')). relation_event_triple( based_on(Orig), Ev, rdf(Ev,event:factor,Orig)). relation_event_triple( membership(_,_,_), Ev, rdf(Ev,rdf:type,mo:'Membership')). relation_event_triple( membership(A,_,_), Ev, rdf(Ev,mo:artist,A)). relation_event_triple( membership(_,G,_), Ev, rdf(Ev,mo:group,G)). relation_event_triple( membership(_,_,O), Ev, T) :- membership_triple(Ev,O,T). relation_event_triple( membership(_,_,O), Ev, T) :- event_time_triple(Ev,O,T). relation_event_triple( composition(_,_,_,_), Ev, rdf(Ev,rdf:type,mo:'Composition')). relation_event_triple( composition(_,_,W,_), Ev, rdf(Ev,mo:produced_work,W)). relation_event_triple( composition(R,A,_,_), Ev, rdf(Ev,mo:R,A)). relation_event_triple( composition(_,_,_,O), Ev, T) :- event_time_triple(Ev,O,T). relation_event_triple( arrangement(_,_,_), Ev, rdf(Ev,rdf:type,mo:'Arrangement')). relation_event_triple( arrangement(W,_,_), Ev, rdf(Ev,mo:arrangement_of,W)). relation_event_triple( arrangement(_,W,_), Ev, rdf(Ev,mo:produced_work,W)). relation_event_triple( arrangement(_,_,O), Ev, T) :- event_time_triple(Ev,O,T). relation_event_triple( performance(_,_), Ev, rdf(Ev,rdf:type,mo:'Performance')). relation_event_triple( performance(_,O), Ev, T) :- event_time_triple(Ev,O,T). relation_event_triple( performance(W,O), Ev, rdf(Ev,Pred,W)) :- ( member(attribute(partial),O) -> Pred=mo:partial_performance_of ; Pred=mo:performance_of ). relation_event_triple( recording(_), Ev, rdf(Ev,rdf:type,mo:'Recording')). relation_event_triple( recording(S), Ev, rdf(Ev,mo:produced_signal,S)). relation_event_triple( instrument(_,_,_), Ev, rdf(Ev,rdf:type,mo:'Performance')). relation_event_triple( instrument(_,A,_), Ev, rdf(Ev,mo:performer,A)). relation_event_triple( instrument(P,_,_), Ev, rdf(P,event:sub_event,Ev)). relation_event_triple( instrument(_,_,O), Ev, T) :- event_time_triple(Ev,O,T). % relation_event_triple( instrument(_,_,O), Ev, rdf(Ev,mo:instrument,literal(Instr))) :- % member(attribute(Instr),O). relation_event_triple( instrument(_,_,O), Ev, rdf(Ev,mo:instrument,Instr)) :- member(attribute(Label),O), ( instrument(Label,Instr) -> true ; Instr=literal(Label) ). membership_triple(E, O, rdf(E,dml:role,literal(R))) :- member(role(R),O), R\=member. membership_triple(E, O, rdf(E,dml:modifier,literal(A))) :- member(attribute(A),O). event_time_triple(Event,Opts,T) :- (memberchk(begin(_),Opts); memberchk(end(_),Opts); memberchk(in(_),Opts)), prefix_uri('/time',Event,Time), ( T=rdf(Event,event:time,Time) ; ( member(in(DT),Opts) -> time_instant_triple(Time-DT,T) ; ( member(begin(DT),Opts), prefix_uri('/begin',Time,Pt), Pred=time:hasBeginning ; member(end(DT),Opts), prefix_uri('/end',Time,Pt), Pred=time:hasEnd ), time_interval_triple(Time,Pred,Pt-DT,T) ) ). time_instant_triple(Time-_, rdf(Time,rdf:type,time:'Instant')). time_instant_triple(Time-DT, rdf(Time,time:inXSDDateTime,literal(type(xsd:dateTime,DT)))). time_interval_triple(T, _, _, rdf(T,rdf:type, time:'Interval')). time_interval_triple(T, Pred, Pt-_, rdf(T,Pred,Pt)). time_interval_triple(_, _, PtDT, T) :- time_instant_triple(PtDT,T). %% event_uri(-EventURI,+Type,+BaseURI) is det. %% event_uri(-EventURI,+Type,+BaseURI) is det. event_uri(EventURI,Type,BaseURI) :- event_uri(EventURI,Type,BaseURI,''). %% event_uri(+EventURI,-Type,-BaseURI,-Extra) is det. %% event_uri(-EventURI,+Type,+BaseURI,+Extra) is det. event_uri(EventURI,Type,BaseURI,Extra) :- ( var(EventURI) -> uri_components(BaseURI,uri_components(Sc,Ho,Pa,Extra,'_')), uri_components(EventURI,uri_components(Sc,Ho,Pa,Extra,Type)), Type\='_' ; uri_components(EventURI,uri_components(Sc,Ho,Pa,Extra,Type)), Type\='_', uri_components(BaseURI,uri_components(Sc,Ho,Pa,Extra,'_')) ). % prefix_uri(_,_,N) :- rdf_bnode(N), !. prefix_uri(Prefix,URI,PURI) :- uri_components(URI,uri_components(Sc,Ho,Pa,Extra,Type)), atom_concat(Prefix,Pa,TPa), uri_components(PURI,uri_components(Sc,Ho,TPa,Extra,Type)). % -------------- display hooks --------------- :- rdf_meta label(r,r,-). label(time:'Instant',URI, Label) :- ( rdf_has(URI,time:inXSDDateTime,literal(type(xsd:dateTime,DT))) -> format(string(Label),'~w',[DT]) ; Label='<unknown>' ). label(time:'Interval',URI, Label) :- (rdf_has(URI,time:hasBeginning,Begin) -> rdf_display_label(Begin,L1); L1='<unknown>'), (rdf_has(URI,time:hasEnd,End) -> rdf_display_label(End,L2); L2='<unknown>'), ( L1=L2 -> format(string(Label),'~s',[L1]) ; format(string(Label),'~s to ~s',[L1,L2]) ). label(mo:'Recording',Ev, Label) :- rdf_has(Ev,mo:produced_signal,Sig), !, rdf_display_label(Sig,Title), ( rdf_has(Ev,event:agent,Agent) -> rdf_display_label(Agent,Name), format(string(Label),'Recording of ~w by ~w',[Title,Name]) ; format(string(Label),'Recording of ~w',[Title]) ). label(mo:'Membership',Ev, Label) :- rdf_has(Ev,mo:artist,A), rdf_has(Ev,mo:group,G), !, rdf_display_label(A,AL), rdf_display_label(G,GL), format(string(Label),'Membership of ~w in ~w',[AL,GL]). label(mo:'Performance',Ev, Label) :- ( rdf_has(Ev,mo:produced_signal,Sig) ; rdf_has(SEv,event:sub_event,Ev), rdf_has(SEv,mo:produced_signal,Sig) ), !, rdf_display_label(Sig,Title), rdf_has(Ev,mo:performer,Performer), rdf_display_label(Performer,Name), rdf_has(Ev,mo:instrument,Instr), rdf_display_label(Instr,IName), format(string(Label),'~w playing ~w on ~w',[Name,IName,Title]). label(mo:'Composition',E,Label) :- rdf_has(E,mo:produced_work,Work), rdfs_individual_of(Work,mo:'MusicalWork'), !, rdf_display_label(Work,Title), format(string(Label),'Composition of ~w',[Title]). rdf_label:display_label_hook(URI, _, Label) :- rdf(URI,rdf:type,Class), label(Class,URI,Label). rdf_label:display_label_hook(URI, _, Label) :- atom(URI), sub_atom(URI,0,_,_,'http://musicbrainz.org'), event_uri(URI,EvType,Base), event_label(EvType,URI,Base,Label). event_label(birth,_,Person,Label) :- rdfs_individual_of(Person,foaf:'Person'), !, rdf_display_label(Person,Name), format(string(Label),'Birth of ~w',[Name]). event_label(death,_,Person,Label) :- rdfs_individual_of(Person,foaf:'Person'), !, rdf_display_label(Person,Name), format(string(Label),'Death of ~w',[Name]).