danielebarchiesi@4
|
1 <?php
|
danielebarchiesi@4
|
2 /**
|
danielebarchiesi@4
|
3 * ARC2 CrunchBase API JSON Parser
|
danielebarchiesi@4
|
4 *
|
danielebarchiesi@4
|
5 * @author Benjamin Nowack <bnowack@semsol.com>
|
danielebarchiesi@4
|
6 * @license http://arc.semsol.org/license
|
danielebarchiesi@4
|
7 * @homepage <http://arc.semsol.org/>
|
danielebarchiesi@4
|
8 * @package ARC2
|
danielebarchiesi@4
|
9 * @version 2010-11-16
|
danielebarchiesi@4
|
10 */
|
danielebarchiesi@4
|
11
|
danielebarchiesi@4
|
12 ARC2::inc('JSONParser');
|
danielebarchiesi@4
|
13
|
danielebarchiesi@4
|
/**
 * Converts a decoded CrunchBase API JSON structure into RDF triples.
 *
 * Resources that carry a permalink get a URI below $this->base, everything
 * else is identified by a generated blank node or a positional sub-URI.
 * Triples are emitted through the inherited addT() method.
 */
class ARC2_CBJSONParser extends ARC2_JSONParser {

  function __construct($a, &$caller) {
    parent::__construct($a, $caller);
  }

  /**
   * Sets the base URI, the RDF namespace, the default property namespace
   * and the initial namespace-prefix map.
   */
  function __init() {/* reader */
    parent::__init();
    $this->base = 'http://cb.semsol.org/';
    $this->rdf = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
    $this->default_ns = $this->base . 'ns#';
    $this->nsp = array($this->rdf => 'rdf');
  }

  /* */

  /**
   * Triggers the RDF extraction.
   * NOTE(review): presumably invoked by the parent parser once the JSON
   * has been decoded into $this->struct - confirm against ARC2_JSONParser.
   */
  function done() {
    $this->extractRDF();
  }

  /**
   * Emits the rdf:type triple for the top-level structure and then walks
   * its properties. Nothing is emitted when the type cannot be detected.
   */
  function extractRDF() {
    $struct = $this->struct;
    if ($type = $this->getStructType($struct)) {
      $s = $this->getResourceID($struct, $type);
      /* the id is a bnode when the struct has no permalink */
      $s_type = $this->getCBTermType($s);
      /* rdf:type */
      $this->addT($s, $this->rdf . 'type', $this->default_ns . $this->camelCase($type), $s_type, 'uri');
      /* explicit triples */
      $this->extractResourceRDF($struct, $s);
    }
  }

  /**
   * Returns 'bnode' for identifiers starting with "_:" and 'uri' otherwise.
   *
   * @param string $id resource identifier
   * @return string 'bnode' or 'uri'
   */
  function getCBTermType($id) {
    return (strpos((string) $id, '_:') === 0) ? 'bnode' : 'uri';
  }

  /**
   * Detects the CrunchBase entity type of a structure.
   *
   * Detection order: path segment of "crunchbase_url", then the name of the
   * relation pointing at the structure, then characteristic keys.
   *
   * @param mixed $struct decoded JSON object (associative array)
   * @param mixed $rel name of the property that references $struct
   * @return string type slug (e.g. 'company'), or '' when unknown
   */
  function getStructType($struct, $rel = '') {
    /* url-based */
    if ($url = $this->v('crunchbase_url', '', $struct)) {
      /* only trust the URL if it has the expected shape, otherwise fall through */
      if (is_string($url) && preg_match('/^.*crunchbase\.com\/([^\/]+)\/.*$/', $url, $m)) {
        return $m[1];
      }
    }
    /* rel-based (table lookup avoids loose comparison of int keys with strings) */
    $rel_types = array(
      'person' => 'person',
      'company' => 'company',
      'acquiring_company' => 'company',
      'firm' => 'company',
      'provider' => 'service-provider'
    );
    if (is_string($rel) && isset($rel_types[$rel])) return $rel_types[$rel];
    /* struct-based */
    if (isset($struct['_type'])) return $struct['_type'];
    if (isset($struct['round_code'])) return 'funding_round';
    if (isset($struct['products'])) return 'company';
    if (isset($struct['first_name'])) return 'person';
    if (isset($struct['investments'])) return 'financial-organization';
    if (isset($struct['launched_year'])) return 'product';
    if (isset($struct['providerships']) && is_array($struct['providerships'])) return 'service-provider';
    return '';
  }

  /**
   * Builds the identifier of a resource.
   *
   * @param mixed $struct decoded JSON object
   * @param string $type type slug as returned by getStructType()
   * @return string "<base><type>/<permalink>#self", or a new bnode id when
   *                the type is empty or the struct has no permalink
   */
  function getResourceID($struct, $type) {
    if ($type && isset($struct['permalink'])) {
      return $this->base . $type . '/' . $struct['permalink'] . '#self';
    }
    return $this->createBnodeID();
  }

  /**
   * Maps a JSON key to a property URI, singularising known plural keys.
   *
   * NOTE(review): the singularisation pattern is not anchored at the end of
   * the key, so any key that merely starts with one of the listed plurals is
   * shortened as well. Left as-is to keep generated property URIs stable.
   *
   * @param string $name JSON key
   * @param string $ns namespace URI, defaults to $this->default_ns
   * @return string property URI
   */
  function getPropertyURI($name, $ns = '') {
    if (!$ns) $ns = $this->default_ns;
    if (preg_match('/^(product|funding_round|investment|acquisition|.+ship|office|milestone|.+embed|.+link|degree|fund)s/', $name, $m)) $name = $m[1];
    if ($name == 'tag_list') $name = 'tag';
    if ($name == 'competitions') $name = 'competitor';
    return $ns . $name;
  }

  /**
   * Derives an identifier for a nested, untyped structure from its parent.
   *
   * @param string $s parent identifier ("#self" is replaced by "/")
   * @param string $k JSON key of the nested structure
   * @param int $pos zero-based position, rendered one-based
   * @return string identifier of the form "<parent>/<key>-<n>#self"
   */
  function createSubURI($s, $k, $pos) {
    $s = str_replace('#self', '/', $s);
    if (preg_match('/(office|ship|investment|milestone|fund|embed|link)s$/', $k)) $k = substr($k, 0, -1);
    return $s . $k . '-' . ($pos + 1) . '#self';
  }

  /* */

  /**
   * Emits triples for every property of a structure.
   *
   * Keys with a dedicated extract<CamelCasedKey>RDF() method are delegated to
   * that method; scalars become literal/URI objects, nested objects become
   * linked resources, and lists are processed element by element.
   * After the walk, "<prefix>_date" triples are inferred for every
   * <prefix>_year/_month/_day group that was seen.
   *
   * @param mixed $struct decoded JSON object
   * @param string $s subject identifier
   * @param int $pos position of $struct inside its parent list
   */
  function extractResourceRDF($struct, $s, $pos = 0) {
    if (!is_array($struct)) return;
    $s_type = $this->getCBTermType($s);
    $date_prefixes = array();
    foreach ($struct as $k => $v) {
      if ($k === 'acquisition') $k = 'exit';
      if (preg_match('/^(.*)\_(year|month|day)$/', $k, $m)) {
        if (!in_array($m[1], $date_prefixes, true)) $date_prefixes[] = $m[1];
      }
      /* key-specific handler; the two generic walkers have a different signature and must never be a dispatch target */
      $sub_m = 'extract' . $this->camelCase($k) . 'RDF';
      $is_generic = in_array(strtolower($sub_m), array('extractrdf', 'extractresourcerdf'), true);
      if (!$is_generic && method_exists($this, $sub_m)) {
        $this->$sub_m($s, $s_type, $v);
        continue;
      }
      $p = $this->getPropertyURI($k);
      /* NOTE: this also drops 0, "0" and false values */
      if (!$v) continue;
      /* simple, single v */
      if (!is_array($v)) {
        /* trim before classifying, so padded URLs are still detected as URIs */
        $v = trim((string) $v);
        if ($v === '') continue;
        $o_type = preg_match('/^[a-z]+\:[^\s]+$/is', $v) ? 'uri' : 'literal';
        /* normalise bare-host URLs */
        if (preg_match('/^https?\:\/\/[^\/]+$/', $v)) $v .= '/';
        $this->addT($s, $p, $v, $s_type, $o_type);
        /* rdfs:label */
        if ($k === 'name') $this->addT($s, 'http://www.w3.org/2000/01/rdf-schema#label', $v, $s_type, $o_type);
      }
      /* structured, single v */
      elseif (!$this->isFlatArray($v)) {
        if ($struct_type = $this->getStructType($v, $k)) {/* known type */
          $o = $this->getResourceID($v, $struct_type);
          $o_term = $this->getCBTermType($o);
          $this->addT($s, $p, $o, $s_type, $o_term);
          $this->addT($o, $this->rdf . 'type', $this->default_ns . $this->camelCase($struct_type), $o_term, 'uri');
        }
        else {/* unknown type */
          $o = $this->createSubURI($s, $k, $pos);
          /* same term type the recursive call will assign to $o as subject */
          $this->addT($s, $p, $o, $s_type, $this->getCBTermType($o));
          $this->extractResourceRDF($v, $o);
        }
      }
      /* value list */
      else {
        foreach ($v as $sub_pos => $sub_v) {
          $this->extractResourceRDF(array($k => $sub_v), $s, $sub_pos);
        }
      }
    }
    /* infer XSD triples */
    foreach ($date_prefixes as $prefix) {
      $this->inferDate($prefix, $s, $struct);
    }
  }

  /**
   * Tells whether an array looks like a list, judged by its first key only.
   *
   * @param mixed $v value to inspect
   * @return int 1 if the first key is numeric, 0 otherwise (also for empty or non-array input)
   */
  function isFlatArray($v) {
    if (!is_array($v) || !$v) return 0;
    reset($v);
    return is_numeric(key($v)) ? 1 : 0;
  }

  /* */

  /**
   * Emits one "tag" literal per entry of a ", "-separated tag string.
   *
   * @return int|null 0 when there is nothing to extract
   */
  function extractTagListRDF($s, $s_type, $v) {
    if (!$v || !is_string($v)) return 0;
    $p = $this->getPropertyURI('tag');
    foreach (preg_split('/\, /', $v) as $tag) {
      $tag = trim($tag);
      if ($tag === '') continue;
      $this->addT($s, $p, $tag, $s_type, 'literal');
    }
  }

  /**
   * Emits image triples (plus width and height of each image) for every
   * entry in $v['available_sizes'].
   * Each entry is read as array(array(width, height), relative_path).
   *
   * @param string $rel property name used to link the image
   * @return int|null 1 when there is nothing to extract
   */
  function extractImageRDF($s, $s_type, $v, $rel = 'image') {
    if (!$v) return 1;
    if (!is_array($v) || !isset($v['available_sizes']) || !is_array($v['available_sizes'])) return 1;
    $p = $this->getPropertyURI($rel);
    foreach ($v['available_sizes'] as $size) {
      /* skip malformed size entries */
      if (!is_array($size) || !isset($size[1]) || !is_array($size[0]) || !isset($size[0][0], $size[0][1])) continue;
      $w = $size[0][0];
      $h = $size[0][1];
      $img = 'http://www.crunchbase.com/' . $size[1];
      $this->addT($s, $p, $img, $s_type, 'uri');
      $this->addT($img, $this->getPropertyURI('width'), $w, 'uri', 'literal');
      $this->addT($img, $this->getPropertyURI('height'), $h, 'uri', 'literal');
    }
  }

  /**
   * Emits "screenshot" image triples for each entry of a screenshot list.
   *
   * @return int|null 1 when there is nothing to extract
   */
  function extractScreenshotsRDF($s, $s_type, $v) {
    if (!$v || !is_array($v)) return 1;
    foreach ($v as $sub_v) {
      $this->extractImageRDF($s, $s_type, $sub_v, 'screenshot');
    }
  }

  /**
   * Links the subject to each of its products.
   */
  function extractProductsRDF($s, $s_type, $v) {
    if (!is_array($v)) return;
    $p = $this->getPropertyURI('product');
    foreach ($v as $sub_v) {
      $o = $this->getResourceID($sub_v, 'product');
      $this->addT($s, $p, $o, $s_type, $this->getCBTermType($o));
    }
  }

  /**
   * Links the subject to each competitor company.
   */
  function extractCompetitionsRDF($s, $s_type, $v) {
    if (!is_array($v)) return;
    $p = $this->getPropertyURI('competitor');
    foreach ($v as $sub_v) {
      /* entries without competitor data carry no information */
      if (!isset($sub_v['competitor'])) continue;
      $o = $this->getResourceID($sub_v['competitor'], 'company');
      $this->addT($s, $p, $o, $s_type, $this->getCBTermType($o));
    }
  }

  /**
   * Creates a positional sub-resource for each funding round and extracts
   * its properties.
   */
  function extractFundingRoundsRDF($s, $s_type, $v) {
    if (!is_array($v)) return;
    $p = $this->getPropertyURI('funding_round');
    foreach ($v as $pos => $sub_v) {
      $o = $this->createSubURI($s, 'funding_round', $pos);
      $this->addT($s, $p, $o, $s_type, $this->getCBTermType($o));
      $this->extractResourceRDF($sub_v, $o, $pos);
    }
  }

  /**
   * Emits "investment" triples: links to investing persons, companies and
   * financial organizations, and positional sub-resources for rounds the
   * subject invested in.
   */
  function extractInvestmentsRDF($s, $s_type, $v) {
    if (!is_array($v)) return;
    $p = $this->getPropertyURI('investment');
    $investor_types = array('person' => 'person', 'company' => 'company', 'financial_org' => 'financial-organization');
    foreach ($v as $pos => $sub_v) {
      if (!is_array($sub_v)) continue;
      /* incoming */
      foreach ($investor_types as $k => $type) {
        if (!isset($sub_v[$k])) continue;
        $o = $this->getResourceID($sub_v[$k], $type);
        $this->addT($s, $p, $o, $s_type, $this->getCBTermType($o));
      }
      /* outgoing */
      if (isset($sub_v['funding_round'])) {
        $o = $this->createSubURI($s, 'investment', $pos);
        $this->addT($s, $p, $o, $s_type, $this->getCBTermType($o));
        $this->extractResourceRDF($sub_v['funding_round'], $o, $pos);
      }
    }
  }

  /**
   * Emits "external_link" triples and the title of each linked page.
   */
  function extractExternalLinksRDF($s, $s_type, $v) {
    $this->addCBLinkTriples($s, $s_type, $v, 'external_link');
  }

  /**
   * Emits "web_presence" triples and the title of each linked page.
   */
  function extractWebPresencesRDF($s, $s_type, $v) {
    $this->addCBLinkTriples($s, $s_type, $v, 'web_presence');
  }

  /**
   * Shared worker for link lists whose entries carry "external_url" and
   * "title". The name deliberately does not follow the extract...RDF
   * pattern so that it cannot be reached via key-based dispatch.
   *
   * @param string $rel property name used to link the URL
   */
  function addCBLinkTriples($s, $s_type, $v, $rel) {
    if (!is_array($v)) return;
    $p = $this->getPropertyURI($rel);
    foreach ($v as $sub_v) {
      if (!isset($sub_v['external_url']) || !$sub_v['external_url']) continue;
      $href = $sub_v['external_url'];
      if (preg_match('/^https?\:\/\/[^\/]+$/', $href)) $href .= '/';
      $this->addT($s, $p, $href, $s_type, 'uri');
      /* the title describes the linked page, so the subject is always a URI */
      if (isset($sub_v['title']) && $sub_v['title'] !== '') {
        $this->addT($href, $this->getPropertyURI('title'), $sub_v['title'], 'uri', 'literal');
      }
    }
  }

  /**
   * Emits the creation timestamp as an XSD-style literal.
   */
  function extractCreatedAtRDF($s, $s_type, $v) {
    $v = $this->getAPIDateXSD($v);
    $this->addT($s, $this->getPropertyURI('created_at'), $v, $s_type, 'literal');
  }

  /**
   * Emits the modification timestamp as an XSD-style literal.
   */
  function extractUpdatedAtRDF($s, $s_type, $v) {
    $v = $this->getAPIDateXSD($v);
    $this->addT($s, $this->getPropertyURI('updated_at'), $v, $s_type, 'literal');
  }

  /**
   * Converts an API timestamp such as "Fri Jan 16 21:11:48 UTC 2009" to
   * "2009-01-16T21:11:48Z".
   *
   * @param mixed $val API timestamp
   * @return string converted timestamp, or '2000-01-01' when $val cannot be parsed
   */
  function getAPIDateXSD($val) {
    if (is_string($val) && preg_match('/^[a-z]+ ([a-z]+) +([0-9]{1,2}) ([0-9]{2}\:[0-9]{2}\:[0-9]{2}) UTC ([0-9]{4})/i', $val, $m)) {
      $months = array('Jan' => '01', 'Feb' => '02', 'Mar' =>'03', 'Apr' => '04', 'May' => '05', 'Jun' => '06', 'Jul' => '07', 'Aug' => '08', 'Sep' => '09', 'Oct' => '10', 'Nov' => '11', 'Dec' => '12');
      /* the pattern is case-insensitive, so normalise before the lookup */
      $month = ucfirst(strtolower($m[1]));
      if (isset($months[$month])) {
        /* the day must have two digits */
        return $m[4] . '-' . $months[$month] . '-' . str_pad($m[2], 2, '0', STR_PAD_LEFT) . 'T' . $m[3] . 'Z';
      }
    }
    return '2000-01-01';
  }

  /* */

  /**
   * Combines <prefix>_year, <prefix>_month and <prefix>_day into a single
   * "<prefix>_date" literal. Missing parts are rendered as "00"; no triple
   * is emitted when all three parts are missing.
   *
   * @param string $prefix key prefix, e.g. 'founded'
   * @param string $s subject identifier
   * @param array $struct structure holding the date parts
   */
  function inferDate($prefix, $s, $struct) {
    $s_type = $this->getCBTermType($s);
    $parts = array();
    foreach (array('year', 'month', 'day') as $suffix) {
      $val = $this->v1($prefix . '_' . $suffix, '00', $struct);
      $parts[] = str_pad($val, 2, '0', STR_PAD_LEFT);
    }
    $r = implode('-', $parts);
    if ($r !== '00-00-00') {
      $this->addT($s, $this->getPropertyURI($prefix . '_date'), $r, $s_type, 'literal');
    }
  }

}
|