view cpack/dml/lib/dml_crawler.pl @ 0:718306e29690 tip

committing public release
author Daniel Wolff
date Tue, 09 Feb 2016 21:05:06 +0100
parents
children
line wrap: on
line source
/* Part of DML (Digital Music Laboratory)
	Copyright 2014-2015 Samer Abdallah, University of London
	 
	This program is free software; you can redistribute it and/or
	modify it under the terms of the GNU General Public License
	as published by the Free Software Foundation; either version 2
	of the License, or (at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public
	License along with this library; if not, write to the Free Software
	Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
*/

:- module(dml_crawler, []).

:- use_module(library(http/html_write)).
:- use_module(library(semweb/rdf_db)).
:- use_module(library(semweb/rdfs)).
:- use_module(library(semweb/rdf_label)).
:- use_module(library(sparkle)).
:- use_module(library(crawler)).
:- use_module(library(musicbrainz)).
:- use_module(library(memo)).
:- use_module(applications(browse)).
:- use_module(api(lod_crawler)).
:- use_module(cliopatria(hooks)).

% Register named SPARQL endpoints.  The short names dbp and lb are the ones
% referenced by the sparql_crawler sources defined in this file.
% NOTE(review): sparql_endpoint/2 is presumably provided by library(sparkle);
% mb and self are not referenced anywhere in the visible part of this file.
:- sparql_endpoint(mb,  'http://dbtune.org/musicbrainz/sparql').
:- sparql_endpoint(lb,  'http://linkedbrainz.org/sparql/').
:- sparql_endpoint(dbp, 'http://dbpedia.org/sparql/').
:- sparql_endpoint(self, 'http://localhost:3020/sparql/').

% :- sparql_endpoint(bbc, 'http://dbtune.org/bbc/programmes/sparql/').
% :- sparql_endpoint(peel,'http://dbtune.org/bbc/peel/sparql/'),
% :- sparql_endpoint(classical,'http://dbtune.org/classical/sparql/'),
% :- sparql_endpoint(jamendo,'http://dbtune.org/jamendo/sparql/'),
% :- sparql_endpoint(magnatune,'http://dbtune.org/magnatune/sparql/'),
% :- sparql_endpoint(henry,'http://dbtune.org/henry/sparql/'),


% crawler:source(Name, Handler, Options)
% Crawl sources contributed to the crawler module.  Handler is a
% module-qualified (partial) goal defined in this file: sparql_crawler/2 with
% its endpoint argument already supplied, mbz_crawler/1 or lod_crawler/1.
% NOTE(review): how library(crawler) invokes Handler and interprets Options
% is not visible here -- confirm against that library.
crawler:source(dbp, dml_crawler:sparql_crawler(dbp), []).
crawler:source(lbz, dml_crawler:sparql_crawler(lb), []).
crawler:source(mbz, dml_crawler:mbz_crawler, []).
crawler:source(lod, dml_crawler:lod_crawler, []).

% crawler:authority(Condition, Source, Options)
% Associates URI conditions with one of the sources named above.  Condition is
% built from begins(Prefix) tests combined with ','/2, ';'/2 and \+/1.
% Note that 'http://musicbrainz.org' is claimed by both lbz and mbz.
% NOTE(review): auto(true)/auto(false) presumably selects whether the source is
% crawled automatically -- confirm against library(crawler).
crawler:authority( begins('http://dbpedia.org'),     dbp, [auto(true)]).
crawler:authority( begins('http://musicbrainz.org'), lbz, [auto(true)]).
crawler:authority( begins('http://musicbrainz.org'), mbz, [auto(true)]).
crawler:authority( begins('http://id.loc.gov/vocabulary'), lod, [auto(true)]).
crawler:authority( begins('http://yago-knowledge.org'), lod, [auto(true)]).
% Catch-all for any other http(s) URI, excluding prefixes handled elsewhere
% or local to this installation; registered with auto(false).
crawler:authority( ( (begins('http://'); begins('https://')), 
                     \+begins('http://dbpedia.org/'),
                     \+begins('http://musicbrainz.org/'),
                     \+begins('http://dml.org/'),
                     \+begins('http://sounds.bl.uk/')
                   ), lod, [auto(false)]).

% instrument(+Label, -Instr)
% Instr is a resource in the local RDF store whose skos:prefLabel matches
% Label exactly (literal(exact(Label),_) query form).  Used to map instrument
% attribute names from MusicBrainz relations onto resources.
% NOTE(review): the volatile_memo declaration (library(memo)) presumably
% caches results for the lifetime of the session -- confirm.
:- volatile_memo instrument(+label:atom,-uri:atom).

instrument(Label,Instr) :-
   rdf(Instr,skos:prefLabel,literal(exact(Label),_)).

% censor(+Endpoint, ?Triple)
% Triple patterns that sparql_crawler/2 drops from the results of describe/3
% for the given endpoint.  The rdf_meta declaration expands the prefixed
% predicate names (mo:, foaf:) inside the rdf/3 terms.
:- rdf_meta censor(+,t).
censor(lb, rdf(_,'http://purl.org/muto/core#taggedResource',_)).
censor(lb, rdf(_,mo:musicbrainz_guid,_)).
censor(lb, rdf(_,foaf:made,_)).

% not needed after all: it was me giving the recordings type 'Signal', not lb.
% sparql_crawler:modify(lb,Tin,Tout) :- 
%    debug(sparql_crawler,"checking ~q",[Tin]),
%    once(relink_brainz(Tin,Tout)).

% :- rdf_meta relink_brainz(t,t).
% relink_brainz(rdf(MBZRecording,rdf:type,mo:'Track'),rdf(MBZRecording,rdf:type,mo:'Signal')):-
%    mb_id_uri(recording,_,MBZRecording).

% Connects the patched ClioPatria resource viewer to the crawler: this hook
% emits a div of class 'crawler' whose content is rendered by crawl_ui//2.
% NOTE(review): crawl_ui//2 is not defined in this file; presumably it comes
% from api(lod_crawler) or library(crawler).
cliopatria:resource_crawler(URI,NT) --> html(div(class(crawler),\crawl_ui(URI,NT))).

% --------- SPARQL crawler using sparkle ---------------

% sparql_crawler(+Endpoint, ?Request)
% Crawler handler backed by a named SPARQL endpoint.  Request is one of
% name(-Name), uri_graph(_, -Graph) or describe(+URI, -Triple).
sparql_crawler(dbp, name('DBPedia')).
sparql_crawler(lb,  name('LinkedBrainz')).

% The graph is the HTTP URL assembled from the endpoint's host, port and path.
sparql_crawler(Endpoint, uri_graph(_, GraphURL)) :-
   current_sparql_endpoint(Endpoint, Host, Port, Path, _),
   parse_url(GraphURL, [protocol(http), host(Host), port(Port), path(Path)]).

% Enumerate the triples found by describe/3, dropping those matched by censor/2.
sparql_crawler(Endpoint, describe(Resource, Statement)) :-
   describe(Endpoint, Resource, Statement),
   \+ censor(Endpoint, Statement).


% describe(+EP, +URI, -Triple)
% Query endpoint EP (via the ??/2 operator) for triples that mention URI:
% first those with URI in object position, then those with URI as subject.
describe(EP,URI,rdf(Subj,Pred,URI)) :- EP ?? rdf(Subj,Pred,URI).
describe(EP,URI,rdf(URI,Pred,Obj)) :- EP ?? rdf(URI,Pred,Obj).
% !!! FIXME This was too slow for URIs with many linked resources.
% Need to get asynchronous crawling working first.
% describe(EP,URI,rdf(Subj,Pred,Obj)) :- 
%    (EP ?? rdf(URI,P1,O1)),
%    (  Subj=URI, Pred=P1, Obj=O1
%    ;  Subj=O1, Pred=P2, Obj=O2,
%       O1\=literal(_),
%       (EP ?? rdf(O1,P2,O2))
%    ).

% ----------- LOD crawler ---------------------------
% lod_crawler(?Request)
% Crawler handler for generic Linked Open Data resources.
lod_crawler(name('LOD Cloud')).

% The graph URI keeps only the scheme and authority of the resource URI.
lod_crawler(uri_graph(Resource, Graph)) :-
   uri_components(Resource, uri_components(Scheme, Authority, _, _, _)),
   uri_components(Graph,    uri_components(Scheme, Authority, _, _, _)).

% Resolve the resource to a loadable URL and load it into Graph.
lod_crawler(crawl(Resource, Graph)) :-
   lod_uri_graph(Resource, Location),
   rdf_load(Location, [graph(Graph)]).

% ----------- MusicBrainz crawler -------------------
% mbz_crawler(?Request)
% Crawler handler that derives triples from the MusicBrainz web service.
mbz_crawler(uri_graph(_, 'http://musicbrainz.org/ws/2')).
mbz_crawler(name('MusicBrainz')).
mbz_crawler(describe(Resource, Triple)) :-
   debug(crawler, 'Doing ~q.', [mbz_crawler(describe(Resource,Triple))]),
   mbz_uri_context(Resource, Context),
   mbz_triple(Context, Resource, LocalTriple),
   rdf_global_term(LocalTriple, Triple).

% mbz_uri_context(+URI, -Context)
% Classify URI: either a plain MusicBrainz entity URI (Context is its type) or
% an event URI derived from one (Context is event(BaseType,BaseURI,EvType,Extra)).
% Logs and fails for anything else.
mbz_uri_context(URI, Context) :-
   mb_id_uri(Type, _, URI), !,
   Context = Type.
mbz_uri_context(URI, Context) :-
   event_uri(URI, EvType, BaseURI, Extra),
   mb_id_uri(BaseType, _, BaseURI), !,
   Context = event(BaseType, BaseURI, EvType, Extra).
mbz_uri_context(URI, _) :-
   debug(crawler, 'Unrecognised URI: ~w', [URI]),
   fail.


% mbz_triple(+Context, +URI, -Triple) -- clauses for plain entity URIs.
% Each entity kind first yields its rdf:type and then whatever
% subject_triple/3 derives from a MusicBrainz lookup of the entity.

% Artists.
mbz_triple(artist, Artist, rdf(Artist, rdf:type, mo:'MusicArtist')).
mbz_triple(artist, Artist, Statement) :-
   mb_lookup(Artist, [inc([aliases]), rels([recording,work,artist])], Element),
   subject_triple(Element-Artist, artist, Statement).

% Recordings: the URI itself is the signal; the derived production event is
% the mo:Recording.
mbz_triple(recording, Signal, rdf(Signal, rdf:type, mo:'Signal')).
mbz_triple(recording, Signal, rdf(Production, rdf:type, mo:'Recording')) :-
   event_uri(Production, production, Signal).

mbz_triple(recording, Signal, Statement) :-
   mb_lookup(Signal, [inc([artists,'artist-credits']), rels([artist,work])], Element),
   event_uri(Production, production, Signal),
   subject_triple(Element-Signal, recording(Production), Statement).

% Works.
mbz_triple(work, Work, rdf(Work, rdf:type, mo:'MusicalWork')).
mbz_triple(work, Work, Statement) :-
   mb_lookup(Work, [inc([aliases]), rels([recording,artist,work])], Element),
   subject_triple(Element-Work, work, Statement).

% mbz_triple(+Context, +URI, -Triple) -- composition event of a work.
% Context is event(work, WorkURI, composition, '') as built by mbz_crawler/1.
% The first clause types the event itself.  (The separator between rdf:type
% and mo:'Composition' must be a comma: with a colon the head would carry an
% rdf/1 term rather than an rdf/3 triple.)
mbz_triple(event(work,_,composition,''),Event,rdf(Event,rdf:type,mo:'Composition')).
% The second clause looks the work up and emits everything subject_triple/3
% derives from it; the work URI is recovered from the event URI.
mbz_triple(event(work,W,composition,''),Event,Triple) :-
   mb_lookup(W,[rels([artist,work])],Work), % need work rels here?
   event_uri(Event,composition,WorkURI),
   subject_triple(Work-WorkURI, work, Triple).

% mbz_triple(+Context, +URI, -Triple) -- production event of a recording.
mbz_triple(event(recording,Signal,production,''), Production, Statement) :-
   production_triple(Signal, Production, Statement).

% production_triple(+Signal, +Event, -Triple)
% The event is typed both as a performance and as a recording and linked to
% the signal it produced; the remaining triples come from a MusicBrainz
% lookup of the recording.
production_triple(Signal, Event, Statement) :-
   member(Statement,
          [ rdf(Event, rdf:type, mo:'Performance'),
            rdf(Event, rdf:type, mo:'Recording'),
            rdf(Event, mo:produced_signal, Signal)
          ]).
production_triple(Signal, Event, Statement) :-
   mb_lookup(Signal, [inc([artists,'artist-credits']), rels([work,artist])], Element),
   mb_uri(Element, SignalURI),
   subject_triple(Element-SignalURI, recording(Event), Statement).

%% subject_triple( +Subj:pair(element,resource), +Context:ground, -T:triple) is nondet.
%  Produce triples about this subject from an element that has already been
%  fetched, i.e. without issuing further MusicBrainz queries.

% Non-relation facets: the dif/2 constraint is posted first so that
% mb_facet/2 cannot bind Facet to a relation(_,_) term.
subject_triple(Element-Resource, Context, Triple) :-
   dif(Facet, relation(_,_)),
   mb_facet(Element, Facet),
   facet_triple(Resource, Context, Facet, Triple).

% Relations: for every related element emit (a) all triples about the related
% element itself and (b) the triples describing the relation.
subject_triple(Element-Resource, _Context, Triple) :-
   mb_relation(Element, Related, Name, Direction, Opts),
   mb_uri(Related, RelatedURI),
   (  mb_class(Related, RelatedClass),
      subject_triple(Related-RelatedURI, RelatedClass, Triple)
   ;  normalise_direction(Direction, Resource, RelatedURI, From, To),
      relation_triple(Name, From, To, Opts, Triple)
   ).

% normalise_direction(+Dir, +E1, +E2, -A, -B)
% Order a pair according to the relation direction: fwd keeps (E1,E2),
% bwd swaps it to (E2,E1).
normalise_direction(fwd,E1,E2,E1,E2).
normalise_direction(bwd,E1,E2,E2,E1).


% facet_triple(+URI, +Context, +Facet, -Triple)
% Translate one non-relation facet of a MusicBrainz element into triples.
% Context is artist, work, recording(ProductionEvent), ...

% Facets valid in any context.
facet_triple(_, _, credit(Artist), Triple) :-
   mb_uri(Artist, ArtistURI),
   subject_triple(Artist-ArtistURI, artist, Triple).
facet_triple(Subject, _, title(Title), rdf(Subject,dc:title,literal(Title))).

% Artist facets.
facet_triple(Subject, artist, name(Name),   rdf(Subject,foaf:name,literal(Name))).
facet_triple(Subject, artist, alias(Alias), rdf(Subject,dml:alias,literal(Alias))).
facet_triple(Subject, artist, gender(Gender), rdf(Subject,foaf:gender,literal(Lower))) :-
   downcase_atom(Gender, Lower).
facet_triple(Subject, artist, type(Class), rdf(Subject,rdf:type,foaf:Class)) :-
   member(Class, ['Person','Group']).
facet_triple(Subject, artist, born(When), Triple) :-
   life_event_triple(Subject, birth, time(When), Triple).
facet_triple(Subject, artist, died(When), Triple) :-
   life_event_triple(Subject, death, time(When), Triple).
facet_triple(Subject, artist, birth_place(Area), Triple) :-
   life_event_triple(Subject, birth, area(Area), Triple).
facet_triple(Subject, artist, death_place(Area), Triple) :-
   life_event_triple(Subject, death, area(Area), Triple).

% Recording facets: credits attach to the production event, the length to
% the signal itself (as an xsd:float literal holding the number's text).
facet_triple(_, recording(Production), credit(Artist), rdf(Production,event:agent,ArtistURI)) :-
   mb_uri(Artist, ArtistURI).
facet_triple(Subject, recording(_), length(Length),
             rdf(Subject,mo:duration,literal(type(xsd:float,LengthAtom)))) :-
   atom_number(LengthAtom, Length).

% life_event_triple(+Agent, +Type, +Property, -Triple)
% Triples about a life event (Type is birth or death in this file) of Agent.
% The event URI is derived from the agent URI; the event is typed, linked to
% the agent and then described by Property (time(_) or area(_)).
life_event_triple(Agent, Type, Property, Triple) :-
   event_uri(Event, Type, Agent),
   (  member(Triple, [ rdf(Event, rdf:type, event:'Event'),
                       rdf(Event, event:agent, Agent)
                     ])
   ;  life_event_property_triple(Event, Property, Triple)
   ).


% life_event_property_triple(+Event, +Property, -Triple)
% area(A) gives the place of the event; time(T) gives a time resource (the
% event URI with '/time' prefixed to its path) described as an instant.
life_event_property_triple(Event, area(Area), rdf(Event,event:place,Place)) :-
   mb_uri(Area, Place).
life_event_property_triple(Event, time(When), Triple) :-
   prefix_uri('/time', Event, Instant),
   (  Triple = rdf(Event, event:time, Instant)
   ;  time_instant_triple(Instant-When, Triple)
   ).

% -----------------------------------------------------------------------------
% relation_triple

% relation_triple(+Name, +A, +B, +Opts, -Triple)
% Triples for a MusicBrainz relation called Name between A and B (already
% ordered by normalise_direction/5).

% Direct translations.
relation_triple(parts,       Whole,  Part,   _, rdf(Part,dml:part_of,Whole)).
relation_triple(composer,    Artist, Work,   _, rdf(Work,dml:composer,Artist)).
relation_triple('is person', Artist, Person, _, rdf(Artist,dml:persona_of,Person)).

% Typing of groups and group membership.
relation_triple('performing orchestra', _, Ensemble, _, rdf(Ensemble,rdf:type,mo:'Orchestra')).
relation_triple(Name, _, Ensemble, _, rdf(Ensemble,rdf:type,mo:'MusicGroup')) :-
   membership_role(Name).
relation_triple(Name, Artist, Ensemble, _, rdf(Ensemble,mo:member,Artist)) :-
   membership_role(Name).

% Anyone appearing in a musical role is a music artist.
relation_triple(Role, Artist, _, _, rdf(Artist,rdf:type,mo:'MusicArtist')) :-
   musical_role(Role).

% Relations that are modelled as events.
relation_triple(Name, A, B, Opts, Triple) :-
   relation_event(Name, A, B, Opts, Event, Descriptor),
   relation_event_triple(Descriptor, Event, Triple).

% relation_event(+Name, +A, +B, +Opts, -Event, -Descriptor)
% Map a relation called Name between A and B onto an event URI and a
% descriptor term that relation_event_triple/3 turns into triples.  Event URIs
% are derived with event_uri/3,4 from the work, signal or group involved.

% Work-level relations: the event is the composition (or arrangement) of the
% target work.
relation_event( based_on, Orig, Deriv, _, Ev, based_on(Orig))   :- event_uri(Ev,composition,Deriv).
relation_event( composer, A, W, O, Ev, composition(composer,A,W,O)) :- event_uri(Ev,composition,W).
relation_event( lyricist, A, W, _, Ev, composition(lyricist,A,W,[])) :- event_uri(Ev,composition,W).
relation_event( writer,   A, W, _, Ev, composition(writer,A,W,[])) :- event_uri(Ev,composition,W).
relation_event( arrangement, W1, W2, O, Ev, arrangement(W1,W2,O)) :- event_uri(Ev,arrangement,W2).
% Recording-level relations: the event is the production event of the signal.
relation_event( performance, Sig, W, O, Ev, performance(W,O)) :- event_uri(Ev,production,Sig).
relation_event( performance, Sig, _, _, Ev, recording(Sig)) :- event_uri(Ev,production,Sig).
relation_event( performer, _, Sig, _, Ev, recording(Sig)) :- event_uri(Ev,production,Sig).
relation_event( 'performing orchestra', _, Sig, _, Ev, recording(Sig)) :- event_uri(Ev,production,Sig).
% vocal is treated as an instrument relation with an extra attribute(voice);
% 'instrument arranger' is treated both as instrument and as arranger.
relation_event( vocal, Ag, Sig, O, Ev, ED) :- relation_event( instrument, Ag, Sig, [attribute(voice)|O], Ev, ED).
relation_event( 'instrument arranger', Ag, Sig, O, Ev, ED) :- relation_event(instrument,Ag,Sig,O,Ev,ED).
relation_event( 'instrument arranger', Ag, Sig, O, Ev, ED) :- relation_event(arranger,Ag,Sig,O,Ev,ED).
relation_event( instrument, _, Sig, _, Ev, recording(Sig)) :- event_uri(Ev,production,Sig).
% Per-performer sub-event: a 'performance' event URI on the signal whose
% Extra component is the performer's MusicBrainz id; Prod is the enclosing
% production event.
relation_event( instrument, Ag, Sig, O, Ev, instrument(Prod,Ag,O)) :-
   mb_id_uri(_,PerformerID,Ag),
   event_uri(Prod,production,Sig),
   event_uri(Ev,performance,Sig,PerformerID).
% Production roles (see production_role/2) apply only when the target is a
% recording; the descriptor is role(Pred,Agent).
relation_event( Name, Agent, Sig, _, Ev, role(Pred,Agent)) :- 
   production_role(Name,Pred), 
   mb_id_uri(recording,_,Sig),
   event_uri(Ev,production,Sig).

% Group membership: the 'membership' event URI on the group carries, as its
% Extra component, a hash of the agent id, role and options.
relation_event( Name, Agent, Group, Opts, Ev, membership(Agent,Group,[role(Role)|Opts])) :- 
   membership_role(Name,Role),
   mb_id_uri(_,AID,Agent),
   term_hash(t(AID,Role,Opts),Hash),
   number_string(Hash,HashString),
   event_uri(Ev,membership,Group,HashString).

% production_role(?RelationName, ?Predicate)
% Relation names that denote a role in the production of a recording, and the
% Music Ontology predicate used for each.  Consulted by relation_event/6.
production_role(producer,mo:producer).
production_role(arranger,mo:arranger).
production_role(conductor,mo:conductor).
production_role(performer,mo:performer).
production_role('performing orchestra',mo:performer).
production_role(vocal,mo:performer).

% membership_role(?RelationName)
% membership_role(?RelationName, ?Role)
% Relation names that denote membership of an artist in a group, with the
% role recorded for the membership event.
membership_role(Name) :- membership_role(Name,_).
membership_role('member of band',member).
membership_role('conductor position',conductor).
membership_role('vocal supporting musician',vocal_support).

% musical_role(?RelationName)
% Relation names whose source agent is to be typed as a music artist.
% Every membership role counts as a musical role through the last clause, so
% 'vocal supporting musician' is not listed explicitly: listing it as well
% made musical_role/1 succeed twice for it and produced duplicate triples.
musical_role(composer).
musical_role(arranger).
musical_role(lyricist).
musical_role(instrument).
musical_role(performer).
musical_role('performing orchestra').
musical_role(Name) :- membership_role(Name).

% relation_event_triple(+Descriptor, +Event, -Triple)
% Turn an event descriptor produced by relation_event/6 into the triples that
% describe Event.

% based_on(Orig): Event is a composition that had Orig as a factor.
relation_event_triple( based_on(_),          Ev, rdf(Ev,rdf:type,mo:'Composition')).
relation_event_triple( based_on(Orig),       Ev, rdf(Ev,event:factor,Orig)).

% membership(Artist,Group,Opts)
relation_event_triple( membership(_,_,_),    Ev, rdf(Ev,rdf:type,mo:'Membership')).
relation_event_triple( membership(A,_,_),    Ev, rdf(Ev,mo:artist,A)).
relation_event_triple( membership(_,G,_),    Ev, rdf(Ev,mo:group,G)).
relation_event_triple( membership(_,_,O),    Ev, T) :- membership_triple(Ev,O,T).
relation_event_triple( membership(_,_,O),    Ev, T) :- event_time_triple(Ev,O,T).

% composition(Role,Agent,Work,Opts): Role doubles as the local name of the
% mo: predicate linking the event to the agent.
relation_event_triple( composition(_,_,_,_), Ev, rdf(Ev,rdf:type,mo:'Composition')).
relation_event_triple( composition(_,_,W,_), Ev, rdf(Ev,mo:produced_work,W)).
relation_event_triple( composition(R,A,_,_), Ev, rdf(Ev,mo:R,A)).
relation_event_triple( composition(_,_,_,O), Ev, T) :- event_time_triple(Ev,O,T).

% arrangement(Original,Arranged,Opts)
relation_event_triple( arrangement(_,_,_),   Ev, rdf(Ev,rdf:type,mo:'Arrangement')).
relation_event_triple( arrangement(W,_,_),   Ev, rdf(Ev,mo:arrangement_of,W)).
relation_event_triple( arrangement(_,W,_),   Ev, rdf(Ev,mo:produced_work,W)).
relation_event_triple( arrangement(_,_,O),   Ev, T) :- event_time_triple(Ev,O,T).

% performance(Work,Opts): attribute(partial) selects the partial-performance
% predicate.
relation_event_triple( performance(_,_), Ev, rdf(Ev,rdf:type,mo:'Performance')).
relation_event_triple( performance(_,O), Ev, T) :- event_time_triple(Ev,O,T).
relation_event_triple( performance(W,O), Ev, rdf(Ev,Pred,W)) :-
   (  member(attribute(partial),O)
   -> Pred=mo:partial_performance_of
   ;  Pred=mo:performance_of
   ).

% recording(Signal)
relation_event_triple( recording(_), Ev, rdf(Ev,rdf:type,mo:'Recording')).
relation_event_triple( recording(S), Ev, rdf(Ev,mo:produced_signal,S)).

% role(Pred,Agent): production roles (producer, arranger, conductor, ...)
% generated by relation_event/6.  Without this clause those descriptors were
% silently dropped and no triple linked the agent to the production event.
relation_event_triple( role(Pred,A), Ev, rdf(Ev,Pred,A)).

% instrument(Production,Agent,Opts): Event is a per-performer sub-event of
% the production event.
relation_event_triple( instrument(_,_,_), Ev, rdf(Ev,rdf:type,mo:'Performance')).
relation_event_triple( instrument(_,A,_), Ev, rdf(Ev,mo:performer,A)).
relation_event_triple( instrument(P,_,_), Ev, rdf(P,event:sub_event,Ev)).
relation_event_triple( instrument(_,_,O), Ev, T) :- event_time_triple(Ev,O,T).
% relation_event_triple( instrument(_,_,O), Ev, rdf(Ev,mo:instrument,literal(Instr))) :-
%    member(attribute(Instr),O). 
% Use the instrument resource when instrument/2 knows the label, otherwise
% fall back to a plain literal.
relation_event_triple( instrument(_,_,O), Ev, rdf(Ev,mo:instrument,Instr)) :-
   member(attribute(Label),O), 
   (  instrument(Label,Instr) -> true
   ;  Instr=literal(Label)
   ).

% membership_triple(+Event, +Opts, -Triple)
% Extra triples for a membership event: every role(_) option other than the
% plain 'member' role, then every attribute(_) option as a modifier.
membership_triple(Event, Opts, rdf(Event,dml:role,literal(Role))) :-
   member(role(Role), Opts),
   Role \= member.
membership_triple(Event, Opts, rdf(Event,dml:modifier,literal(Attribute))) :-
   member(attribute(Attribute), Opts).

% event_time_triple(+Event, +Opts, -T)
% Triples describing when Event happened, taken from the in(_), begin(_) and
% end(_) options.  Fails if none of them is present.  The time resource is the
% event URI with '/time' prefixed to its path; in(DT) makes it an instant,
% begin/end make it an interval with '/begin' and '/end' instants.
event_time_triple(Event,Opts,T) :-
   % Commit to the first matching option: as a bare disjunction this guard
   % succeeded once per option kind present, so an event with both begin(_)
   % and end(_) had all of its time triples generated twice.
   once((  memberchk(begin(_),Opts)
        ;  memberchk(end(_),Opts)
        ;  memberchk(in(_),Opts)
        )),
   prefix_uri('/time',Event,Time),
   (  T=rdf(Event,event:time,Time)
   ;  (  member(in(DT),Opts)
      -> time_instant_triple(Time-DT,T)
      ;  (  member(begin(DT),Opts), prefix_uri('/begin',Time,Pt), Pred=time:hasBeginning
         ;  member(end(DT),Opts),   prefix_uri('/end',Time,Pt),   Pred=time:hasEnd
         ),
         time_interval_triple(Time,Pred,Pt-DT,T)
      )
   ).

% time_instant_triple(+Instant-DateTime, -Triple)
% Type the resource as a time:Instant and attach its xsd:dateTime value.
time_instant_triple(Instant-DateTime, Triple) :-
   member(Triple,
          [ rdf(Instant, rdf:type, time:'Instant'),
            rdf(Instant, time:inXSDDateTime, literal(type(xsd:dateTime,DateTime)))
          ]).

% time_interval_triple(+Interval, +Pred, +Point-DateTime, -Triple)
% Type the resource as a time:Interval, link it to one end point through Pred
% and describe that end point as an instant.
time_interval_triple(Interval, _,    _,       rdf(Interval, rdf:type, time:'Interval')).
time_interval_triple(Interval, Pred, Point-_, rdf(Interval, Pred, Point)).
time_interval_triple(_,        _,    PointDT, Triple) :-
   time_instant_triple(PointDT, Triple).


%% event_uri(+EventURI,-Type,-BaseURI) is det.
%% event_uri(-EventURI,+Type,+BaseURI) is det.
%  Three-argument form of event_uri/4 with the Extra component fixed to ''.
event_uri(EventURI,Type,BaseURI) :- event_uri(EventURI,Type,BaseURI,'').

%% event_uri(+EventURI,-Type,-BaseURI,-Extra) is det.
%% event_uri(-EventURI,+Type,+BaseURI,+Extra) is det.
%  Relates an event URI to the base URI it is derived from.  Both share scheme,
%  authority, path and the fourth (query) component, which is Extra.  The base
%  URI has fragment '_' while the event URI has fragment Type, which must not
%  be '_'.
%  The mode is chosen by whether EventURI is unbound: if so, BaseURI is taken
%  apart and EventURI assembled; otherwise EventURI is taken apart and BaseURI
%  assembled.
%  NOTE(review): uri_components/2 is presumably the library(uri) predicate;
%  how it renders an empty ('') query component is not visible here.
event_uri(EventURI,Type,BaseURI,Extra) :-
   (  var(EventURI) 
   -> uri_components(BaseURI,uri_components(Sc,Ho,Pa,Extra,'_')),
      uri_components(EventURI,uri_components(Sc,Ho,Pa,Extra,Type)), Type\='_'
   ;  uri_components(EventURI,uri_components(Sc,Ho,Pa,Extra,Type)), Type\='_',
      uri_components(BaseURI,uri_components(Sc,Ho,Pa,Extra,'_'))
   ).

% prefix_uri(_,_,N) :- rdf_bnode(N), !.
% prefix_uri(+Prefix, +URI, -PrefixedURI)
% PrefixedURI is URI with Prefix prepended to its path component; all other
% components are carried over unchanged.
prefix_uri(Prefix, URI, PrefixedURI) :-
   uri_components(URI,
                  uri_components(Scheme, Authority, Path, Search, Fragment)),
   atom_concat(Prefix, Path, PrefixedPath),
   uri_components(PrefixedURI,
                  uri_components(Scheme, Authority, PrefixedPath, Search, Fragment)).

% -------------- display hooks ---------------
% label(+Class, +Resource, -Label)
% Class-specific display labels, used by the display_label_hook clauses
% defined in this file.  The rdf_meta declaration expands the prefixed class
% names in the clause heads.
:- rdf_meta label(r,r,-).

% An instant is labelled by its xsd:dateTime value when it has one.
label(time:'Instant', Instant, Label) :-
   (  rdf_has(Instant, time:inXSDDateTime, literal(type(xsd:dateTime,DateTime)))
   -> format(string(Label), '~w', [DateTime])
   ;  Label = '<unknown>'
   ).

% An interval is labelled 'Begin to End' using the display labels of its end
% points ('<unknown>' for a missing one), or by a single label when both are
% equal.
% The labels are written with ~w, as in the other label/3 clauses: the end
% point labels are atoms here (e.g. '<unknown>'), whereas ~s is documented
% for code lists and strings.  ~w prints atoms and strings identically.
label(time:'Interval',URI, Label) :-
   (rdf_has(URI,time:hasBeginning,Begin) -> rdf_display_label(Begin,L1); L1='<unknown>'),
   (rdf_has(URI,time:hasEnd,End) -> rdf_display_label(End,L2); L2='<unknown>'),
   (  L1=L2
   -> format(string(Label),'~w',[L1])
   ;  format(string(Label),'~w to ~w',[L1,L2])
   ).

% A recording event is labelled by the signal it produced and, when one is
% recorded, by an agent of the event.
label(mo:'Recording', Event, Label) :-
   rdf_has(Event, mo:produced_signal, Signal), !,
   rdf_display_label(Signal, Title),
   (  rdf_has(Event, event:agent, Agent)
   -> rdf_display_label(Agent, AgentName),
      Format = 'Recording of ~w by ~w',
      Args = [Title,AgentName]
   ;  Format = 'Recording of ~w',
      Args = [Title]
   ),
   format(string(Label), Format, Args).

% A membership event is labelled by its artist and group.
label(mo:'Membership', Event, Label) :-
   rdf_has(Event, mo:artist, Artist),
   rdf_has(Event, mo:group, Group), !,
   rdf_display_label(Artist, ArtistName),
   rdf_display_label(Group, GroupName),
   format(string(Label), 'Membership of ~w in ~w', [ArtistName,GroupName]).

% A performance is labelled by performer, instrument and signal.  The signal
% is taken from the event itself or from the event it is a sub-event of.
label(mo:'Performance', Event, Label) :-
   (  rdf_has(Event, mo:produced_signal, Signal)
   ;  rdf_has(Parent, event:sub_event, Event),
      rdf_has(Parent, mo:produced_signal, Signal)
   ), !,
   rdf_display_label(Signal, Title),
   rdf_has(Event, mo:performer, Performer),
   rdf_display_label(Performer, PerformerName),
   rdf_has(Event, mo:instrument, Instrument),
   rdf_display_label(Instrument, InstrumentName),
   format(string(Label), '~w playing ~w on ~w', [PerformerName,InstrumentName,Title]).


% A composition event is labelled by the musical work it produced.
label(mo:'Composition', Event, Label) :-
   rdf_has(Event, mo:produced_work, Work),
   rdfs_individual_of(Work, mo:'MusicalWork'), !,
   rdf_display_label(Work, Title),
   format(string(Label), 'Composition of ~w', [Title]).

% Display-label hook: label a resource according to one of its rdf:type
% classes, using label/3.
rdf_label:display_label_hook(Resource, _, Label) :-
   rdf(Resource, rdf:type, Class),
   label(Class, Resource, Label).

% Display-label hook: label MusicBrainz-derived event URIs from the base
% resource they were derived from, using event_label/4.
rdf_label:display_label_hook(Resource, _, Label) :-
   atom(Resource),
   atom_concat('http://musicbrainz.org', _, Resource),
   event_uri(Resource, EventType, BaseURI),
   event_label(EventType, Resource, BaseURI, Label).

% event_label(+EventType, +EventURI, +BaseURI, -Label)
% Labels for the birth and death events of a resource known to be a
% foaf:Person.
event_label(birth, _, Person, Label) :-
   rdfs_individual_of(Person, foaf:'Person'), !,
   named_event_label('Birth of ~w', Person, Label).
event_label(death, _, Person, Label) :-
   rdfs_individual_of(Person, foaf:'Person'), !,
   named_event_label('Death of ~w', Person, Label).

% named_event_label(+Format, +Resource, -Label)
% Label is Format filled in with the display label of Resource.
named_event_label(Format, Resource, Label) :-
   rdf_display_label(Resource, Name),
   format(string(Label), Format, [Name]).