danielebarchiesi@4
|
1 <?php
|
danielebarchiesi@4
|
2 /**
|
danielebarchiesi@4
|
3 * ARC2 JSON Parser
|
danielebarchiesi@4
|
4 * Does not extract triples, needs sub-class for RDF extraction
|
danielebarchiesi@4
|
5 *
|
danielebarchiesi@4
|
6 * @author Benjamin Nowack <bnowack@semsol.com>
|
danielebarchiesi@4
|
7 * @license http://arc.semsol.org/license
|
danielebarchiesi@4
|
8 * @homepage <http://arc.semsol.org/>
|
danielebarchiesi@4
|
9 * @package ARC2
|
danielebarchiesi@4
|
10 * @version 2010-11-16
|
danielebarchiesi@4
|
11 */
|
danielebarchiesi@4
|
12
|
danielebarchiesi@4
|
13 ARC2::inc('RDFParser');
|
danielebarchiesi@4
|
14
|
danielebarchiesi@4
|
15 class ARC2_JSONParser extends ARC2_RDFParser {
|
danielebarchiesi@4
|
16
|
danielebarchiesi@4
|
/**
 * Creates the parser by handing both arguments to the parent constructor unchanged.
 *
 * @param mixed $a       Forwarded as-is (presumably the ARC2 configuration array - confirm against the parent class).
 * @param mixed $caller  Forwarded as-is; taken by reference.
 */
function __construct($a, &$caller) {
  /* no JSON-specific construction work; everything is delegated */
  parent::__construct($a, $caller);
}
|
danielebarchiesi@4
|
20
|
danielebarchiesi@4
|
/**
 * Initialisation hook; adds nothing of its own and only runs the parent's __init().
 */
function __init() {
  /* kept as an explicit override so sub-classes have a single place to extend */
  parent::__init();
}
|
danielebarchiesi@4
|
24
|
danielebarchiesi@4
|
25 /* */
|
danielebarchiesi@4
|
26
|
danielebarchiesi@4
|
/**
 * Matches $re against the start of $v after dropping any leading comments.
 *
 * Leading comment blocks (slash-star ... star-slash) are removed one at a time,
 * the shortest remaining input seen so far is remembered in $this->unparsed_code,
 * and the actual matching is delegated to ARC2::x().
 *
 * @param string $re       Regular expression fragment passed on to ARC2::x().
 * @param string $v        Input text.
 * @param string $options  Pattern modifiers passed on to ARC2::x(); defaults to 'si'.
 * @return mixed Whatever ARC2::x() returns for the comment-free input.
 */
function x($re, $v, $options = 'si') {
  /* comment removal: keep only what follows each leading comment */
  while (preg_match('/^\s*(\/\*.*\*\/)(.*)$/Usi', $v, $parts)) {
    $v = $parts[2];
  }
  /* track the shortest remainder reached so far */
  $is_shorter = strlen($this->unparsed_code) > strlen($v);
  $this->unparsed_code = $is_shorter ? $v : $this->unparsed_code;
  return ARC2::x($re, $v, $options);
}
|
danielebarchiesi@4
|
34
|
danielebarchiesi@4
|
/**
 * Reads a JSON document and stores its decoded structure in $this->struct.
 *
 * Steps: reset the state, make sure a reader exists, fetch the complete
 * document through the reader, trim everything outside the outermost
 * braces, decode via extractObject() and finish with done().
 *
 * @param string $path  Passed to the reader's activate() call.
 * @param string $data  Passed to the reader's activate() call; defaults to ''.
 * @return mixed The return value of $this->done() (defined outside this file section).
 */
function parse($path, $data = '') {
  $this->state = 0;
  /* reader: only create one when none has been provided */
  if (!$this->v('reader')) {
    ARC2::inc('Reader');
    $this->reader = new ARC2_Reader($this->a, $this);
  }
  $this->reader->setAcceptHeader('Accept: application/json; q=0.9, */*; q=0.1');
  $this->reader->activate($path, $data);
  /* base: a non-empty configured 'base' wins over the reader's base */
  if (isset($this->a['base']) && $this->a['base']) {
    $this->x_base = $this->a['base'];
  }
  else {
    $this->x_base = $this->reader->base;
  }
  /* parse: collect chunks until the reader hands back a falsy one */
  $doc = '';
  for (;;) {
    $chunk = $this->reader->readStream();
    if (!$chunk) break;
    $doc .= $chunk;
  }
  $this->reader->closeStream();
  unset($this->reader);
  /* keep only the text from the first "{" to the last "}" */
  $doc = preg_replace('/^[^\{]*(.*\})[^\}]*$/is', '\\1', $doc);
  $this->unparsed_code = $doc;
  /* only the decoded structure is kept; the unparsed remainder is not used */
  $parsed = $this->extractObject($doc);
  $this->struct = $parsed[0];
  return $this->done();
}
|
danielebarchiesi@4
|
57
|
danielebarchiesi@4
|
58 /* */
|
danielebarchiesi@4
|
59
|
danielebarchiesi@4
|
/**
 * Decodes the JSON value at the start of $v.
 *
 * When json_decode() is available its (associative) result is returned
 * directly with an empty remainder. Otherwise a hand-written parser handles
 * objects, lists and scalar values.
 *
 * Fallback parser notes:
 * - an entry or list item that evaluates to false (e.g. an empty array or
 *   an empty string) ends the surrounding object/list loop;
 * - when nothing could be extracted the result is an empty array.
 *
 * @param string $v  JSON source text.
 * @return array Two-element array: the decoded value and the unparsed rest.
 */
function extractObject($v) {
  /* native decoder */
  if (function_exists('json_decode')) {
    return array(json_decode($v, true), '');
  }
  $result = array();
  if ($open = $this->x('\{', $v)) {
    /* sub-object: consume entries until extractEntry() reports none */
    $v = $open[1];
    while (true) {
      list($entry, $v) = $this->extractEntry($v);
      if (!$entry) break;
      $result[$entry['key']] = $entry['value'];
    }
    if ($close = $this->x('\}', $v)) $v = $close[1];
  }
  elseif ($open = $this->x('\[', $v)) {
    /* sub-list: consume items, skipping the commas between them */
    $v = $open[1];
    while (true) {
      list($item, $v) = $this->extractObject($v);
      if (!$item) break;
      $result[] = $item;
      $v = ltrim($v, ',');
    }
    if ($close = $this->x('\]', $v)) $v = $close[1];
  }
  else {
    /* sub-value: $v is advanced even when no scalar is found */
    list($scalar, $v) = $this->extractValue($v);
    if ($scalar !== false) {
      $result = $scalar;
    }
  }
  return array($result, $v);
}
|
danielebarchiesi@4
|
86
|
danielebarchiesi@4
|
/**
 * Extracts one "key": value entry of a JSON object from the start of $v.
 *
 * A leading comma is skipped first. The key must be a non-empty
 * double-quoted string followed by a colon; the value is decoded with
 * extractObject().
 *
 * @param string $v  Remaining JSON source.
 * @return array Two-element array. On success: array('key' => ..., 'value' => ...)
 *               and the unparsed rest. Otherwise: 0 and $v (minus a leading comma).
 */
function extractEntry($v) {
  /* entry separator */
  $comma = $this->x('\,', $v);
  if ($comma) $v = $comma[1];
  /* k */
  $key_match = $this->x('\"([^\"]+)\"\s*\:', $v);
  if (!$key_match) {
    return array(0, $v);
  }
  /* value: whatever follows the colon */
  $parsed = $this->extractObject($key_match[2]);
  if (!$parsed) {
    return array(0, $v);
  }
  list($value, $rest) = $parsed;
  return array(
    array('key' => $key_match[1], 'value' => $value),
    $rest
  );
}
|
danielebarchiesi@4
|
102
|
danielebarchiesi@4
|
/**
 * Extracts a scalar JSON value (null, boolean, number or string) from the start of $v.
 *
 * A leading comma is skipped first. Booleans and numbers are returned as
 * their matched source text (e.g. 'true', '12.5'), not as PHP booleans or
 * numbers; extractObject() relies on a real boolean false meaning "no value".
 * Strings are returned with their JSON escape sequences decoded.
 *
 * @param string $v  Remaining JSON source.
 * @return array Two-element array: the value and the unparsed rest. The value
 *               is false when no scalar could be read.
 */
function extractValue($v) {
  if ($r = $this->x('\,', $v)) $v = $r[1];
  if ($sub_r = $this->x('null', $v)) {
    return array(null, $sub_r[1]);
  }
  if ($sub_r = $this->x('(true|false)', $v)) {
    return array($sub_r[1], $sub_r[2]);
  }
  if ($sub_r = $this->x('([\-\+]?[0-9\.]+)', $v)) {
    return array($sub_r[1], $sub_r[2]);
  }
  if ($sub_r = $this->x('\"', $v)) {
    $rest = $sub_r[1];
    /* string body: runs of plain chars and backslash+char pairs, up to the
       first quote that is not part of an escape (\x5c is the backslash) */
    if (preg_match('/^([^"\x5c]*(?:\x5c.[^"\x5c]*)*)"(.*)$/s', $rest, $m)) {
      /* decode every escape in one pass so that the output of one escape
         (e.g. the backslash from "\\") is never re-read as the start of
         another; the /e modifier used previously is not supported by PHP 7+ */
      $val = preg_replace_callback(
        '/\x5c(u[dD][89abAB][0-9a-fA-F]{2}\x5cu[dD][c-fC-F][0-9a-fA-F]{2}|u[0-9a-fA-F]{4}|.)/s',
        array($this, 'unescapeJSONSequence'),
        $m[1]
      );
      return array($val, $m[2]);
    }
  }
  return array(false, $v);
}

/**
 * preg_replace_callback() helper for extractValue(): decodes one JSON escape.
 *
 * $m[1] is the text after the backslash: a single character, "uXXXX", or a
 * surrogate pair "uXXXX\uXXXX". Unicode escapes are emitted as UTF-8 (the
 * encoding json_decode() produces). Unknown escapes and lone surrogates are
 * returned unchanged.
 *
 * @param array $m  Match array: [0] full escape sequence, [1] text after the backslash.
 * @return string Decoded replacement text.
 */
function unescapeJSONSequence($m) {
  $esc = $m[1];
  $len = strlen($esc);
  if ($len === 1) {
    /* "\b" is not a PHP string escape, hence the explicit \x08 */
    $map = array(
      '"' => '"', '\\' => '\\', '/' => '/',
      'b' => "\x08", 'f' => "\x0C", 'n' => "\n", 'r' => "\r", 't' => "\t"
    );
    return isset($map[$esc]) ? $map[$esc] : $m[0];
  }
  $cp = hexdec(substr($esc, 1, 4));
  if ($len === 11) {
    /* high + low surrogate => one code point beyond the BMP */
    $low = hexdec(substr($esc, 7, 4));
    $cp = 0x10000 + (($cp - 0xD800) << 10) + ($low - 0xDC00);
  }
  elseif ($cp >= 0xD800 && $cp <= 0xDFFF) {
    /* lone surrogate: not encodable, leave the escape as it is */
    return $m[0];
  }
  /* UTF-8 encode the code point */
  if ($cp < 0x80) {
    return chr($cp);
  }
  if ($cp < 0x800) {
    return chr(0xC0 | ($cp >> 6)) . chr(0x80 | ($cp & 0x3F));
  }
  if ($cp < 0x10000) {
    return chr(0xE0 | ($cp >> 12)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
  }
  return chr(0xF0 | ($cp >> 18)) . chr(0x80 | (($cp >> 12) & 0x3F)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
}
|
danielebarchiesi@4
|
130
|
danielebarchiesi@4
|
131 /* */
|
danielebarchiesi@4
|
132
|
danielebarchiesi@4
|
/**
 * Returns the decoded structure stored by parse().
 *
 * @return mixed The 'struct' value as resolved by $this->v(), with an empty array as the default.
 */
function getObject() {
  $default = array();
  return $this->v('struct', $default);
}
|
danielebarchiesi@4
|
136
|
danielebarchiesi@4
|
/**
 * Returns the triples collected via addT().
 *
 * @return mixed The 'triples' value as resolved by $this->v(), with an empty array as the default.
 */
function getTriples() {
  $default = array();
  return $this->v('triples', $default);
}
|
danielebarchiesi@4
|
140
|
danielebarchiesi@4
|
/**
 * Returns the triple counter that addT() increments for each stored triple.
 *
 * @return mixed The current value of $this->t_count.
 */
function countTriples() {
  return $this->t_count;
}
|
danielebarchiesi@4
|
144
|
danielebarchiesi@4
|
/**
 * Stores one triple in $this->triples and increments $this->t_count.
 *
 * The object value is passed through $this->toUTF8() first. When
 * $this->skip_dupes is set, a triple whose serialized form has been seen
 * before (tracked by md5 hash in $this->added_triples) is not stored again.
 *
 * @param string $s       Subject.
 * @param string $p       Predicate.
 * @param string $o       Object.
 * @param string $s_type  Stored under 's_type'.
 * @param string $o_type  Stored under 'o_type'.
 * @param string $o_dt    Stored under 'o_datatype'.
 * @param string $o_lang  Stored under 'o_lang'.
 */
function addT($s = '', $p = '', $o = '', $s_type = '', $o_type = '', $o_dt = '', $o_lang = '') {
  $o = $this->toUTF8($o);
  $triple = array(
    's' => $s,
    'p' => $p,
    'o' => $o,
    's_type' => $s_type,
    'o_type' => $o_type,
    'o_datatype' => $o_dt,
    'o_lang' => $o_lang
  );
  if ($this->skip_dupes) {
    /* duplicate check: bail out if this exact triple is already registered */
    $hash = md5(serialize($triple));
    if (isset($this->added_triples[$hash])) {
      return;
    }
    $this->added_triples[$hash] = true;
  }
  $this->triples[$this->t_count] = $triple;
  $this->t_count++;
}
|
danielebarchiesi@4
|
162
|
danielebarchiesi@4
|
163 /* */
|
danielebarchiesi@4
|
164
|
danielebarchiesi@4
|
165 }
|