annotate sites/all/libraries/ARC2/arc/parsers/ARC2_CBJSONParser.php @ 11:b0ee71395280

deleted .DS_Store files
author danieleb <danielebarchiesi@me.com>
date Mon, 28 Oct 2013 16:12:13 +0000
parents ce11bbd8f642
children
rev   line source
danielebarchiesi@4 1 <?php
danielebarchiesi@4 2 /**
danielebarchiesi@4 3 * ARC2 CrunchBase API JSON Parser
danielebarchiesi@4 4 *
danielebarchiesi@4 5 * @author Benjamin Nowack <bnowack@semsol.com>
danielebarchiesi@4 6 * @license http://arc.semsol.org/license
danielebarchiesi@4 7 * @homepage <http://arc.semsol.org/>
danielebarchiesi@4 8 * @package ARC2
danielebarchiesi@4 9 * @version 2010-11-16
danielebarchiesi@4 10 */
danielebarchiesi@4 11
danielebarchiesi@4 12 ARC2::inc('JSONParser');
danielebarchiesi@4 13
class ARC2_CBJSONParser extends ARC2_JSONParser {

  /**
   * Forwards construction to the parent parser.
   *
   * @param mixed $a      Passed through unchanged to the parent constructor.
   * @param mixed $caller Passed on (by reference) to the parent constructor.
   */
  function __construct($a, &$caller) {
    parent::__construct($a, $caller);
  }

  /**
   * Runs the parent initialisation, then sets the CrunchBase base URI,
   * the RDF namespace, the default property/class namespace and the
   * namespace-prefix map.
   */
  function __init() {/* reader */
    parent::__init();
    $this->base = 'http://cb.semsol.org/';
    $this->rdf = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
    $this->default_ns = $this->base . 'ns#';
    $this->nsp = array($this->rdf => 'rdf');
  }

  /* */

  /**
   * Triggers triple extraction.
   * NOTE(review): presumably invoked by the parent parser once the JSON
   * document has been read into $this->struct - confirm against the parent.
   */
  function done() {
    $this->extractRDF();
  }

  /**
   * Extracts triples for the top-level structure in $this->struct.
   *
   * Nothing is emitted when no type can be determined for the structure.
   */
  function extractRDF() {
    $struct = $this->struct;
    if ($type = $this->getStructType($struct)) {
      $s = $this->getResourceID($struct, $type);
      /* rdf:type (the subject is a bnode when the struct has no permalink) */
      $this->addT($s, $this->rdf . 'type', $this->default_ns . $this->camelCase($type), $this->getCBTermType($s), 'uri');
      /* explicit triples */
      $this->extractResourceRDF($struct, $s);
    }
  }

  /**
   * Guesses the CrunchBase entity type of a structure.
   *
   * Tried in order: the path segment following "crunchbase.com/" in the
   * struct's crunchbase_url, the name of the relation pointing at the
   * struct, and finally the presence of characteristic keys.
   *
   * @param array  $struct Decoded JSON structure.
   * @param string $rel    Key under which the structure was found (optional).
   * @return string Type name (e.g. 'company', 'person') or '' if unknown.
   */
  function getStructType($struct, $rel = '') {
    /* url-based; a URL that does not match falls through to the other
       heuristics instead of being returned verbatim as the type */
    if ($url = $this->v('crunchbase_url', '', $struct)) {
      if (preg_match('/^.*crunchbase\.com\/([^\/]+)\/.*$/', $url, $m)) return $m[1];
    }
    /* rel-based */
    $rel_types = array(
      'person' => 'person',
      'company' => 'company',
      'acquiring_company' => 'company',
      'firm' => 'company',
      'provider' => 'service-provider'
    );
    if (is_string($rel) && isset($rel_types[$rel])) return $rel_types[$rel];
    /* struct-based */
    if (isset($struct['_type'])) return $struct['_type'];
    if (isset($struct['round_code'])) return 'funding_round';
    if (isset($struct['products'])) return 'company';
    if (isset($struct['first_name'])) return 'person';
    if (isset($struct['investments'])) return 'financial-organization';
    if (isset($struct['launched_year'])) return 'product';
    if (isset($struct['providerships']) && is_array($struct['providerships'])) return 'service-provider';
    return '';
  }

  /**
   * Builds the identifier for a structure.
   *
   * @param array  $struct Decoded JSON structure.
   * @param string $type   Type name as returned by getStructType().
   * @return string "<base><type>/<permalink>#self" when both a type and a
   *                permalink are available, otherwise the result of the
   *                inherited createBnodeID().
   */
  function getResourceID($struct, $type) {
    if ($type && isset($struct['permalink'])) {
      return $this->base . $type . '/' . $struct['permalink'] . '#self';
    }
    return $this->createBnodeID();
  }

  /**
   * Maps a JSON key to a property URI.
   *
   * Keys starting with one of the listed plural collection names are reduced
   * to the singular form; 'tag_list' and 'competitions' get dedicated names.
   *
   * @param string $name JSON key.
   * @param string $ns   Namespace to prepend; defaults to $this->default_ns.
   * @return string Property URI.
   */
  function getPropertyURI($name, $ns = '') {
    if (!$ns) $ns = $this->default_ns;
    if (preg_match('/^(product|funding_round|investment|acquisition|.+ship|office|milestone|.+embed|.+link|degree|fund)s/', $name, $m)) $name = $m[1];
    if ($name == 'tag_list') $name = 'tag';
    if ($name == 'competitions') $name = 'competitor';
    return $ns . $name;
  }

  /**
   * Derives an identifier for a nested, untyped structure from its parent.
   *
   * @param string $s   Parent identifier; a '#self' suffix is replaced by '/'.
   * @param string $k   JSON key of the nested structure (singularised for
   *                    the listed collection suffixes).
   * @param int    $pos Zero-based position; the identifier uses $pos + 1.
   * @return string "<parent><key>-<pos + 1>#self".
   */
  function createSubURI($s, $k, $pos) {
    $s = str_replace('#self', '/', $s);
    if (preg_match('/(office|ship|investment|milestone|fund|embed|link)s$/', $k)) $k = substr($k, 0, -1);
    return $s . $k . '-' . ($pos + 1) . '#self';
  }

  /**
   * Classifies an identifier as blank node or URI, using the same "_:"
   * prefix test applied to subjects throughout this class.
   *
   * @param string $id Resource identifier.
   * @return string 'bnode' or 'uri'.
   */
  private function getCBTermType($id) {
    return preg_match('/^\_\:/', $id) ? 'bnode' : 'uri';
  }

  /* */

  /**
   * Emits triples for every key/value pair of a structure.
   *
   * Keys with a matching extract<CamelCasedKey>RDF() method are delegated to
   * that method. Other values are handled generically: scalars become a
   * single triple, associative arrays become linked resources, and numerically
   * indexed arrays are processed element by element. Finally, a combined
   * "<prefix>_date" value is inferred for every <prefix>_year/_month/_day
   * group encountered.
   *
   * @param array  $struct Decoded JSON structure.
   * @param string $s      Subject identifier.
   * @param int    $pos    Position of $struct inside its parent list; used
   *                       when naming nested untyped structures.
   */
  function extractResourceRDF($struct, $s, $pos = 0) {
    /* nothing to iterate (e.g. a null entry inside a list) */
    if (!is_array($struct)) return;
    $s_type = $this->getCBTermType($s);
    $date_prefixes = array();
    /* generic entry points that must never be selected by a JSON key
       (a key such as "resource" or "" would otherwise resolve to them) */
    $generic_methods = array('extractrdf', 'extractresourcerdf');
    foreach ($struct as $k => $v) {
      if ($k === 'acquisition') $k = 'exit';
      if (preg_match('/^(.*)\_(year|month|day)$/', $k, $m)) {
        if (!in_array($m[1], $date_prefixes, true)) $date_prefixes[] = $m[1];
      }
      $sub_m = 'extract' . $this->camelCase($k) . 'RDF';
      if (!in_array(strtolower($sub_m), $generic_methods, true) && method_exists($this, $sub_m)) {
        $this->$sub_m($s, $s_type, $v);
        continue;
      }
      $p = $this->getPropertyURI($k);
      if (!$v) continue;
      /* simple, single v */
      if (!is_array($v)) {
        /* trim first so surrounding whitespace cannot hide a URI */
        $v = trim($v);
        $o_type = preg_match('/^[a-z]+\:[^\s]+$/is', $v) ? 'uri' : 'literal';
        /* host-only http(s) URLs get a trailing slash */
        if (preg_match('/^https?\:\/\/[^\/]+$/', $v)) $v .= '/';
        $this->addT($s, $p, $v, $s_type, $o_type);
        /* rdfs:label */
        if ($k == 'name') $this->addT($s, 'http://www.w3.org/2000/01/rdf-schema#label', $v, $s_type, $o_type);
      }
      /* structured, single v */
      elseif (!$this->isFlatArray($v)) {
        if ($o_type = $this->getStructType($v, $k)) {/* known type */
          $o = $this->getResourceID($v, $o_type);
          /* $o is a bnode when the nested struct has no permalink */
          $o_term = $this->getCBTermType($o);
          $this->addT($s, $p, $o, $s_type, $o_term);
          $this->addT($o, $this->rdf . 'type', $this->default_ns . $this->camelCase($o_type), $o_term, 'uri');
        }
        else {/* unknown type */
          $o = $this->createSubURI($s, $k, $pos);
          $this->addT($s, $p, $o, $s_type, $this->getCBTermType($o));
          $this->extractResourceRDF($v, $o);
        }
      }
      /* value list */
      else {
        foreach ($v as $sub_pos => $sub_v) {
          $this->extractResourceRDF(array($k => $sub_v), $s, $sub_pos);
        }
      }
    }
    /* infer XSD triples */
    foreach ($date_prefixes as $prefix) {
      $this->inferDate($prefix, $s, $struct);
    }
  }

  /**
   * Tells whether an array looks like a list, judged by its first key only.
   *
   * @param array $v Array to inspect.
   * @return int 1 if the first key is numeric, 0 otherwise (including for
   *             empty or non-array input).
   */
  function isFlatArray($v) {
    if (!is_array($v) || !$v) return 0;
    reset($v);
    return is_numeric(key($v)) ? 1 : 0;
  }

  /* */

  /**
   * Emits one "tag" literal per entry of a ", "-separated tag string.
   *
   * @param string $s      Subject identifier.
   * @param string $s_type Subject term type ('uri' or 'bnode').
   * @param string $v      Tag list.
   * @return int|null 0 when $v is empty, otherwise nothing.
   */
  function extractTagListRDF($s, $s_type, $v) {
    if (!$v) return 0;
    $tags = preg_split('/\, /', $v);
    foreach ($tags as $tag) {
      if (!trim($tag)) continue;
      $this->addT($s, $this->getPropertyURI('tag'), $tag, $s_type, 'literal');
    }
  }

  /**
   * Emits image triples, plus width and height literals per image, for each
   * entry of $v['available_sizes'].
   *
   * Each entry is read as array(array(width, height), path); incomplete
   * entries are skipped.
   *
   * @param string $s      Subject identifier.
   * @param string $s_type Subject term type.
   * @param array  $v      Image structure.
   * @param string $rel    Property name linking subject and image.
   * @return int|null 1 when $v is empty, otherwise nothing.
   */
  function extractImageRDF($s, $s_type, $v, $rel = 'image') {
    if (!$v) return 1;
    if (!is_array($v) || !isset($v['available_sizes']) || !is_array($v['available_sizes'])) return;
    foreach ($v['available_sizes'] as $size) {
      if (!isset($size[0][0], $size[0][1], $size[1])) continue;
      $w = $size[0][0];
      $h = $size[0][1];
      $img = 'http://www.crunchbase.com/' . $size[1];
      $this->addT($s, $this->getPropertyURI($rel), $img, $s_type, 'uri');
      $this->addT($img, $this->getPropertyURI('width'), $w, 'uri', 'literal');
      $this->addT($img, $this->getPropertyURI('height'), $h, 'uri', 'literal');
    }
  }

  /**
   * Emits "screenshot" image triples for every entry of a screenshot list.
   *
   * @param string $s      Subject identifier.
   * @param string $s_type Subject term type.
   * @param array  $v      List of image structures.
   * @return int|null 1 when $v is empty, otherwise nothing.
   */
  function extractScreenshotsRDF($s, $s_type, $v) {
    if (!$v) return 1;
    if (!is_array($v)) return;
    foreach ($v as $sub_v) {
      $this->extractImageRDF($s, $s_type, $sub_v, 'screenshot');
    }
  }

  /**
   * Links the subject to each listed product.
   *
   * @param string $s      Subject identifier.
   * @param string $s_type Subject term type.
   * @param array  $v      List of product structures.
   */
  function extractProductsRDF($s, $s_type, $v) {
    if (!$v || !is_array($v)) return;
    foreach ($v as $sub_v) {
      $o = $this->getResourceID($sub_v, 'product');
      $this->addT($s, $this->getPropertyURI('product'), $o, $s_type, $this->getCBTermType($o));
    }
  }

  /**
   * Links the subject to the company found under each entry's 'competitor'
   * key; entries without that key are skipped.
   *
   * @param string $s      Subject identifier.
   * @param string $s_type Subject term type.
   * @param array  $v      List of competition structures.
   */
  function extractCompetitionsRDF($s, $s_type, $v) {
    if (!$v || !is_array($v)) return;
    foreach ($v as $sub_v) {
      if (!isset($sub_v['competitor'])) continue;
      $o = $this->getResourceID($sub_v['competitor'], 'company');
      $this->addT($s, $this->getPropertyURI('competitor'), $o, $s_type, $this->getCBTermType($o));
    }
  }

  /**
   * Creates one sub-resource per funding round and extracts its details.
   *
   * @param string $s      Subject identifier.
   * @param string $s_type Subject term type.
   * @param array  $v      List of funding-round structures.
   */
  function extractFundingRoundsRDF($s, $s_type, $v) {
    if (!$v || !is_array($v)) return;
    foreach ($v as $pos => $sub_v) {
      $o = $this->createSubURI($s, 'funding_round', $pos);
      $this->addT($s, $this->getPropertyURI('funding_round'), $o, $s_type, $this->getCBTermType($o));
      $this->extractResourceRDF($sub_v, $o, $pos);
    }
  }

  /**
   * Emits "investment" triples for a list of investment structures.
   *
   * Entries carrying a person, company or financial_org link the subject to
   * that investor; entries carrying a funding_round get a sub-resource whose
   * details are extracted from the round.
   *
   * @param string $s      Subject identifier.
   * @param string $s_type Subject term type.
   * @param array  $v      List of investment structures.
   */
  function extractInvestmentsRDF($s, $s_type, $v) {
    if (!$v || !is_array($v)) return;
    $investor_types = array('person' => 'person', 'company' => 'company', 'financial_org' => 'financial-organization');
    foreach ($v as $pos => $sub_v) {
      /* incoming */
      foreach ($investor_types as $k => $type) {
        if (!isset($sub_v[$k])) continue;
        $o = $this->getResourceID($sub_v[$k], $type);
        $this->addT($s, $this->getPropertyURI('investment'), $o, $s_type, $this->getCBTermType($o));
      }
      /* outgoing */
      if (isset($sub_v['funding_round'])) {
        $o = $this->createSubURI($s, 'investment', $pos);
        $this->addT($s, $this->getPropertyURI('investment'), $o, $s_type, $this->getCBTermType($o));
        $this->extractResourceRDF($sub_v['funding_round'], $o, $pos);
      }
    }
  }

  /**
   * Shared implementation for link lists whose entries carry an
   * 'external_url' and an optional 'title'.
   *
   * Deliberately not named extract...RDF so that it can never be selected
   * by the key-based dispatch in extractResourceRDF().
   *
   * @param string $s      Subject identifier.
   * @param string $s_type Subject term type.
   * @param array  $v      List of link structures.
   * @param string $rel    Property name linking subject and URL.
   */
  private function addTitledLinks($s, $s_type, $v, $rel) {
    if (!$v || !is_array($v)) return;
    foreach ($v as $sub_v) {
      if (empty($sub_v['external_url'])) continue;
      $href = $sub_v['external_url'];
      /* host-only http(s) URLs get a trailing slash */
      if (preg_match('/^https?\:\/\/[^\/]+$/', $href)) $href .= '/';
      $this->addT($s, $this->getPropertyURI($rel), $href, $s_type, 'uri');
      /* the title describes the link target, which is always a URI,
         regardless of the parent subject's term type */
      if (isset($sub_v['title'])) {
        $this->addT($href, $this->getPropertyURI('title'), $sub_v['title'], 'uri', 'literal');
      }
    }
  }

  /**
   * Emits "external_link" triples plus a title for each link target.
   *
   * @param string $s      Subject identifier.
   * @param string $s_type Subject term type.
   * @param array  $v      List of link structures.
   */
  function extractExternalLinksRDF($s, $s_type, $v) {
    $this->addTitledLinks($s, $s_type, $v, 'external_link');
  }

  /**
   * Emits "web_presence" triples plus a title for each link target.
   *
   * @param string $s      Subject identifier.
   * @param string $s_type Subject term type.
   * @param array  $v      List of link structures.
   */
  function extractWebPresencesRDF($s, $s_type, $v) {
    $this->addTitledLinks($s, $s_type, $v, 'web_presence');
  }

  /**
   * Emits the "created_at" literal, converted by getAPIDateXSD().
   *
   * @param string $s      Subject identifier.
   * @param string $s_type Subject term type.
   * @param string $v      Timestamp as delivered by the API.
   */
  function extractCreatedAtRDF($s, $s_type, $v) {
    $v = $this->getAPIDateXSD($v);
    $this->addT($s, $this->getPropertyURI('created_at'), $v, $s_type, 'literal');
  }

  /**
   * Emits the "updated_at" literal, converted by getAPIDateXSD().
   *
   * @param string $s      Subject identifier.
   * @param string $s_type Subject term type.
   * @param string $v      Timestamp as delivered by the API.
   */
  function extractUpdatedAtRDF($s, $s_type, $v) {
    $v = $this->getAPIDateXSD($v);
    $this->addT($s, $this->getPropertyURI('updated_at'), $v, $s_type, 'literal');
  }

  /**
   * Converts a timestamp such as "Fri Jan 16 21:11:48 UTC 2009" into
   * "2009-01-16T21:11:48Z".
   *
   * @param string $val Timestamp string.
   * @return string Converted timestamp, or '2000-01-01' when $val is not a
   *                string, does not match the expected format, or names an
   *                unknown month.
   */
  function getAPIDateXSD($val) {
    //Fri Jan 16 21:11:48 UTC 2009
    $months = array('Jan' => '01', 'Feb' => '02', 'Mar' =>'03', 'Apr' => '04', 'May' => '05', 'Jun' => '06', 'Jul' => '07', 'Aug' => '08', 'Sep' => '09', 'Oct' => '10', 'Nov' => '11', 'Dec' => '12');
    if (is_string($val) && preg_match('/^[a-z]+ ([a-z]+) ([0-9]+) ([0-9]{2}\:[0-9]{2}\:[0-9]{2}) UTC ([0-9]{4})/i', $val, $m)) {
      /* the pattern is case-insensitive, so normalise before the lookup */
      $month = ucfirst(strtolower($m[1]));
      if (isset($months[$month])) {
        $day = str_pad($m[2], 2, '0', STR_PAD_LEFT);
        return $m[4] . '-' . $months[$month] . '-' . $day . 'T' . $m[3] . 'Z';
      }
    }
    return '2000-01-01';
  }

  /* */

  /**
   * Combines <prefix>_year, <prefix>_month and <prefix>_day into a single
   * "<prefix>_date" literal of the form YYYY-MM-DD.
   *
   * Each part is read through the inherited v1() with '00' as default and is
   * left-padded to two characters. No triple is emitted when all three parts
   * are '00'.
   *
   * @param string $prefix Key prefix shared by the three date parts.
   * @param string $s      Subject identifier.
   * @param array  $struct Structure holding the date parts.
   */
  function inferDate($prefix, $s, $struct) {
    $s_type = $this->getCBTermType($s);
    $parts = array();
    foreach (array('year', 'month', 'day') as $suffix) {
      $val = $this->v1($prefix . '_' . $suffix, '00', $struct);
      $parts[] = str_pad($val, 2, '0', STR_PAD_LEFT);
    }
    $r = implode('-', $parts);
    if ($r != '00-00-00') {
      $this->addT($s, $this->getPropertyURI($prefix . '_date'), $r, $s_type, 'literal');
    }
  }

}