<?php
/**
 * ARC2 CrunchBase JSON parser: turns a decoded CrunchBase JSON structure
 * into RDF triples.
 *
 * NOTE(review): the opening lines of this header (title/author) were lost
 * in extraction and have not been reconstructed.
 *
 * @license http://arc.semsol.org/license
 * @homepage (value lost in extraction; presumably http://arc.semsol.org/ - TODO confirm)
 * @package ARC2
 * @version 2010-11-16
 */

ARC2::inc('JSONParser');

/**
 * Walks the structure left in $this->struct by the parent JSON parser and
 * reports triples through the inherited addT($s, $p, $o, $s_type, $o_type).
 *
 * Keys of the JSON structure are dispatched to a method named
 * extract<CamelCasedKey>RDF() when one exists; everything else is handled
 * generically by extractResourceRDF(). Because of that dispatch, do not add
 * helper methods whose names match extract*RDF unless they accept exactly
 * ($s, $s_type, $v).
 */
class ARC2_CBJSONParser extends ARC2_JSONParser {

  function __construct($a, &$caller) {
    parent::__construct($a, $caller);
  }

  /**
   * Sets the base URI, the rdf: namespace, the default property/class
   * namespace (base + "ns#") and the namespace prefix table.
   */
  function __init() {/* reader */
    parent::__init();
    $this->base = 'http://cb.semsol.org/';
    $this->rdf = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
    $this->default_ns = $this->base . 'ns#';
    $this->nsp = array($this->rdf => 'rdf');
  }

  /* */

  /**
   * Hook that triggers the RDF extraction.
   * NOTE(review): presumably invoked by the parent parser once parsing has
   * finished - confirm against ARC2_JSONParser.
   */
  function done() {
    $this->extractRDF();
  }

  /**
   * Extracts triples for the top-level structure: an rdf:type triple for the
   * detected type, followed by the triples of all contained keys. Nothing is
   * emitted when the structure is not an array or its type is unknown.
   */
  function extractRDF() {
    $struct = $this->struct;
    if (!is_array($struct)) return;
    $type = $this->getStructType($struct);
    if (!$type) return;
    $s = $this->getResourceID($struct, $type);
    /* the ID is a blank node when the structure has no permalink */
    $s_type = $this->getCBTermType($s);
    /* rdf:type */
    $this->addT($s, $this->rdf . 'type', $this->default_ns . $this->camelCase($type), $s_type, 'uri');
    /* explicit triples */
    $this->extractResourceRDF($struct, $s);
  }

  /**
   * Returns 'bnode' for identifiers starting with "_:", 'uri' otherwise.
   *
   * @param string $id Resource identifier.
   * @return string 'bnode' or 'uri'.
   */
  function getCBTermType($id) {
    return preg_match('/^\_\:/', $id) ? 'bnode' : 'uri';
  }

  /**
   * Guesses the CrunchBase type name of a structure.
   *
   * Tried in order: the path segment after "crunchbase.com/" in the
   * structure's crunchbase_url, the name of the key ($rel) under which the
   * structure was found, and finally the presence of characteristic keys.
   *
   * @param array  $struct Decoded JSON object.
   * @param string $rel    Key under which $struct was found (optional).
   * @return string Type name, or '' when no rule applies.
   */
  function getStructType($struct, $rel = '') {
    /* url-based; a URL that does not match falls through to the other rules
       instead of being returned verbatim as the type name */
    $url = $this->v('crunchbase_url', '', $struct);
    if ($url && preg_match('/^.*crunchbase\.com\/([^\/]+)\/.*$/', $url, $m)) {
      return $m[1];
    }
    /* rel-based; map lookup instead of loose "==" so that an integer key 0
       is not treated as equal to 'person' on PHP < 8 */
    $rel_types = array(
      'person' => 'person',
      'company' => 'company',
      'acquiring_company' => 'company',
      'firm' => 'company',
      'provider' => 'service-provider',
    );
    if (is_string($rel) && isset($rel_types[$rel])) return $rel_types[$rel];
    /* struct-based */
    if (isset($struct['_type'])) return $struct['_type'];
    if (isset($struct['round_code'])) return 'funding_round';
    if (isset($struct['products'])) return 'company';
    if (isset($struct['first_name'])) return 'person';
    if (isset($struct['investments'])) return 'financial-organization';
    if (isset($struct['launched_year'])) return 'product';
    if (isset($struct['providerships']) && is_array($struct['providerships'])) return 'service-provider';
    return '';
  }

  /**
   * Builds the identifier of a resource.
   *
   * @param mixed  $struct Decoded JSON object (anything else yields a bnode).
   * @param string $type   Type name as returned by getStructType().
   * @return string base + type + "/" + permalink + "#self" when a type and a
   *                permalink are available, otherwise a fresh blank node ID
   *                from the inherited createBnodeID().
   */
  function getResourceID($struct, $type) {
    if ($type && isset($struct['permalink'])) {
      return $this->base . $type . '/' . $struct['permalink'] . '#self';
    }
    return $this->createBnodeID();
  }

  /**
   * Maps a JSON key to a property URI.
   *
   * Selected plural keys are reduced to their singular stem, "tag_list"
   * becomes "tag" and "competitions" becomes "competitor".
   *
   * @param string $name JSON key.
   * @param string $ns   Namespace to prepend; defaults to $this->default_ns.
   * @return string Property URI.
   */
  function getPropertyURI($name, $ns = '') {
    if (!$ns) $ns = $this->default_ns;
    if (preg_match('/^(product|funding_round|investment|acquisition|.+ship|office|milestone|.+embed|.+link|degree|fund)s/', $name, $m)) $name = $m[1];
    if ($name == 'tag_list') $name = 'tag';
    if ($name == 'competitions') $name = 'competitor';
    return $ns . $name;
  }

  /**
   * Derives the identifier of a nested resource from its parent's identifier.
   *
   * @param string $s   Parent identifier; a "#self" suffix is replaced by "/".
   * @param string $k   JSON key; selected plural keys lose their trailing "s".
   * @param int    $pos Zero-based position; emitted one-based.
   * @return string Identifier of the form <parent><key>-<pos+1>#self.
   */
  function createSubURI($s, $k, $pos) {
    $s = str_replace('#self', '/', $s);
    if (preg_match('/(office|ship|investment|milestone|fund|embed|link)s$/', $k)) $k = substr($k, 0, -1);
    return $s . $k . '-' . ($pos + 1) . '#self';
  }

  /* */

  /**
   * Emits the triples for every key of a structure.
   *
   * Per key: a dedicated extract<Key>RDF() handler is used when present.
   * Otherwise scalars become a single triple, associative arrays become a
   * linked resource, and lists are unrolled item by item. Keys ending in
   * _year/_month/_day additionally produce a combined <prefix>_date triple.
   *
   * @param array  $struct Decoded JSON object; other values are ignored.
   * @param string $s      Subject identifier.
   * @param int    $pos    Position of $struct within its parent list; used
   *                       when naming nested resources of unknown type.
   */
  function extractResourceRDF($struct, $s, $pos = 0) {
    if (!is_array($struct)) return;
    $s_type = $this->getCBTermType($s);
    $date_prefixes = array();
    foreach ($struct as $k => $v) {
      if ($k === 'acquisition') $k = 'exit';
      if (preg_match('/^(.*)\_(year|month|day)$/', $k, $m)) {
        if (!in_array($m[1], $date_prefixes, true)) $date_prefixes[] = $m[1];
      }
      $sub_m = 'extract' . $this->camelCase($k) . 'RDF';
      /* keys such as "" or "resource" would resolve to the generic entry
         points, which take different arguments; never dispatch to those */
      $is_entry_point = in_array(strtolower($sub_m), array('extractrdf', 'extractresourcerdf'), true);
      if (!$is_entry_point && method_exists($this, $sub_m)) {
        $this->$sub_m($s, $s_type, $v);
        continue;
      }
      if (!$v) continue;
      $p = $this->getPropertyURI($k);
      /* simple, single v */
      if (!is_array($v)) {
        $o_type = preg_match('/^[a-z]+\:[^\s]+$/is', $v) ? 'uri' : 'literal';
        $v = trim($v);
        /* give bare host URLs a trailing slash */
        if (preg_match('/^https?\:\/\/[^\/]+$/', $v)) $v .= '/';
        $this->addT($s, $p, $v, $s_type, $o_type);
        /* rdfs:label */
        if ($k == 'name') $this->addT($s, 'http://www.w3.org/2000/01/rdf-schema#label', $v, $s_type, $o_type);
      }
      /* structured, single v */
      elseif (!$this->isFlatArray($v)) {
        if ($struct_type = $this->getStructType($v, $k)) {/* known type */
          $o = $this->getResourceID($v, $struct_type);
          /* blank node when the nested structure has no permalink */
          $o_term_type = $this->getCBTermType($o);
          $this->addT($s, $p, $o, $s_type, $o_term_type);
          $this->addT($o, $this->rdf . 'type', $this->default_ns . $this->camelCase($struct_type), $o_term_type, 'uri');
        }
        else {/* unknown type */
          $o = $this->createSubURI($s, $k, $pos);
          /* derived from $s, so it is a bnode ID whenever $s is one */
          $this->addT($s, $p, $o, $s_type, $this->getCBTermType($o));
          $this->extractResourceRDF($v, $o);
        }
      }
      /* value list */
      else {
        foreach ($v as $sub_pos => $sub_v) {
          $this->extractResourceRDF(array($k => $sub_v), $s, $sub_pos);
        }
      }
    }
    /* infer XSD triples */
    foreach ($date_prefixes as $prefix) {
      $this->inferDate($prefix, $s, $struct);
    }
  }

  /**
   * Tells lists from associative arrays by looking at the first key only.
   *
   * @param array $v Array to inspect.
   * @return int 1 when the first key is numeric, 0 otherwise (also for an
   *             empty array).
   */
  function isFlatArray($v) {
    foreach ($v as $k => $sub_v) {
      return is_numeric($k) ? 1 : 0;
    }
    return 0;
  }

  /* */

  /**
   * Handler for "tag_list": splits a ", "-separated string and emits one
   * "tag" literal per non-empty entry (surrounding whitespace removed).
   *
   * @return int|null 0 when there is nothing to extract.
   */
  function extractTagListRDF($s, $s_type, $v) {
    if (!$v || !is_string($v)) return 0;
    $p = $this->getPropertyURI('tag');
    foreach (preg_split('/\, /', $v) as $tag) {
      $tag = trim($tag);
      if ($tag === '') continue;
      $this->addT($s, $p, $tag, $s_type, 'literal');
    }
  }

  /**
   * Handler for "image": for every entry of $v['available_sizes'] (read as
   * array(array(width, height), path)) links the image URI to $s and
   * attaches width and height literals to the image. Incomplete entries
   * are skipped.
   *
   * @param string $rel Property name used for the link ('image' by default).
   * @return int|null 1 when there is nothing to extract.
   */
  function extractImageRDF($s, $s_type, $v, $rel = 'image') {
    if (!$v) return 1;
    if (!isset($v['available_sizes']) || !is_array($v['available_sizes'])) return 1;
    foreach ($v['available_sizes'] as $size) {
      if (!isset($size[0][0], $size[0][1], $size[1])) continue;
      $img = 'http://www.crunchbase.com/' . $size[1];
      $this->addT($s, $this->getPropertyURI($rel), $img, $s_type, 'uri');
      $this->addT($img, $this->getPropertyURI('width'), $size[0][0], 'uri', 'literal');
      $this->addT($img, $this->getPropertyURI('height'), $size[0][1], 'uri', 'literal');
    }
  }

  /**
   * Handler for "screenshots": treats every entry like an image, linked via
   * the "screenshot" property.
   *
   * @return int|null 1 when there is nothing to extract.
   */
  function extractScreenshotsRDF($s, $s_type, $v) {
    if (!$v || !is_array($v)) return 1;
    foreach ($v as $sub_v) {
      $this->extractImageRDF($s, $s_type, $sub_v, 'screenshot');
    }
  }

  /**
   * Handler for "products": links $s to each product's resource ID.
   */
  function extractProductsRDF($s, $s_type, $v) {
    if (!is_array($v)) return;
    $p = $this->getPropertyURI('product');
    foreach ($v as $sub_v) {
      $o = $this->getResourceID($sub_v, 'product');
      $this->addT($s, $p, $o, $s_type, $this->getCBTermType($o));
    }
  }

  /**
   * Handler for "competitions": links $s to the company found under each
   * entry's "competitor" key.
   */
  function extractCompetitionsRDF($s, $s_type, $v) {
    if (!is_array($v)) return;
    $p = $this->getPropertyURI('competitor');
    foreach ($v as $sub_v) {
      /* an entry without "competitor" still yields a blank node, as before */
      $competitor = isset($sub_v['competitor']) ? $sub_v['competitor'] : array();
      $o = $this->getResourceID($competitor, 'company');
      $this->addT($s, $p, $o, $s_type, $this->getCBTermType($o));
    }
  }

  /**
   * Handler for "funding_rounds": creates a positional sub-resource per
   * round, links it to $s and extracts the round's own triples.
   */
  function extractFundingRoundsRDF($s, $s_type, $v) {
    if (!is_array($v)) return;
    $p = $this->getPropertyURI('funding_round');
    foreach ($v as $pos => $sub_v) {
      $o = $this->createSubURI($s, 'funding_round', $pos);
      $this->addT($s, $p, $o, $s_type, $this->getCBTermType($o));
      $this->extractResourceRDF($sub_v, $o, $pos);
    }
  }

  /**
   * Handler for "investments".
   *
   * Entries naming a person, company or financial_org link $s directly to
   * that resource. Entries carrying a funding_round get a positional
   * sub-resource whose triples are extracted from the round.
   */
  function extractInvestmentsRDF($s, $s_type, $v) {
    if (!is_array($v)) return;
    $p = $this->getPropertyURI('investment');
    $investor_types = array('person' => 'person', 'company' => 'company', 'financial_org' => 'financial-organization');
    foreach ($v as $pos => $sub_v) {
      /* incoming */
      foreach ($investor_types as $k => $type) {
        if (!isset($sub_v[$k])) continue;
        $o = $this->getResourceID($sub_v[$k], $type);
        $this->addT($s, $p, $o, $s_type, $this->getCBTermType($o));
      }
      /* outgoing */
      if (isset($sub_v['funding_round'])) {
        $o = $this->createSubURI($s, 'investment', $pos);
        $this->addT($s, $p, $o, $s_type, $this->getCBTermType($o));
        $this->extractResourceRDF($sub_v['funding_round'], $o, $pos);
      }
    }
  }

  /**
   * Shared implementation of the link-list handlers: links $s to each
   * entry's external_url via property $rel and attaches the entry's title
   * to the URL. Entries without external_url are skipped.
   *
   * Deliberately not named extract*RDF so that it cannot be reached through
   * the key dispatch in extractResourceRDF().
   */
  function addLinkTriples($s, $s_type, $v, $rel) {
    if (!is_array($v)) return;
    foreach ($v as $sub_v) {
      if (!isset($sub_v['external_url'])) continue;
      $href = $sub_v['external_url'];
      /* give bare host URLs a trailing slash */
      if (preg_match('/^https?\:\/\/[^\/]+$/', $href)) $href .= '/';
      $this->addT($s, $this->getPropertyURI($rel), $href, $s_type, 'uri');
      if (isset($sub_v['title'])) {
        /* the subject here is the link itself, hence always a URI */
        $this->addT($href, $this->getPropertyURI('title'), $sub_v['title'], 'uri', 'literal');
      }
    }
  }

  /**
   * Handler for "external_links".
   */
  function extractExternalLinksRDF($s, $s_type, $v) {
    $this->addLinkTriples($s, $s_type, $v, 'external_link');
  }

  /**
   * Handler for "web_presences".
   */
  function extractWebPresencesRDF($s, $s_type, $v) {
    $this->addLinkTriples($s, $s_type, $v, 'web_presence');
  }

  /**
   * Handler for "created_at": emits the timestamp converted by
   * getAPIDateXSD().
   */
  function extractCreatedAtRDF($s, $s_type, $v) {
    $v = $this->getAPIDateXSD($v);
    $this->addT($s, $this->getPropertyURI('created_at'), $v, $s_type, 'literal');
  }

  /**
   * Handler for "updated_at": emits the timestamp converted by
   * getAPIDateXSD().
   */
  function extractUpdatedAtRDF($s, $s_type, $v) {
    $v = $this->getAPIDateXSD($v);
    $this->addT($s, $this->getPropertyURI('updated_at'), $v, $s_type, 'literal');
  }

  /**
   * Converts an API timestamp such as "Fri Jan 16 21:11:48 UTC 2009" to
   * "2009-01-16T21:11:48Z".
   *
   * @param string $val API timestamp.
   * @return string Converted timestamp, or '2000-01-01' when $val does not
   *                have the expected format.
   */
  function getAPIDateXSD($val) {
    //Fri Jan 16 21:11:48 UTC 2009
    if (is_string($val) && preg_match('/^[a-z]+ ([a-z]+) ([0-9]+) ([0-9]{2}\:[0-9]{2}\:[0-9]{2}) UTC ([0-9]{4})/i', $val, $m)) {
      $months = array('Jan' => '01', 'Feb' => '02', 'Mar' => '03', 'Apr' => '04', 'May' => '05', 'Jun' => '06', 'Jul' => '07', 'Aug' => '08', 'Sep' => '09', 'Oct' => '10', 'Nov' => '11', 'Dec' => '12');
      /* the pattern is case-insensitive, so normalise before the lookup */
      $month = ucfirst(strtolower($m[1]));
      if (isset($months[$month])) {
        /* single-digit days need a leading zero */
        $day = str_pad($m[2], 2, '0', STR_PAD_LEFT);
        return $m[4] . '-' . $months[$month] . '-' . $day . 'T' . $m[3] . 'Z';
      }
    }
    return '2000-01-01';
  }

  /* */

  /**
   * Combines <prefix>_year, <prefix>_month and <prefix>_day into a single
   * "<prefix>_date" literal of the form Y-M-D. Missing or empty parts are
   * written as "00"; nothing is emitted when all three parts are missing.
   *
   * @param string $prefix Key prefix shared by the three date parts.
   * @param string $s      Subject identifier.
   * @param array  $struct Structure holding the date parts.
   */
  function inferDate($prefix, $s, $struct) {
    $parts = array();
    foreach (array('year', 'month', 'day') as $suffix) {
      $val = $this->v1($prefix . '_' . $suffix, '00', $struct);
      $parts[] = str_pad($val, 2, '0', STR_PAD_LEFT);
    }
    $r = implode('-', $parts);
    if ($r != '00-00-00') {
      $this->addT($s, $this->getPropertyURI($prefix . '_date'), $r, $this->getCBTermType($s), 'literal');
    }
  }

}