danielebarchiesi@4
|
1 <?php
|
danielebarchiesi@4
|
2 /*
|
danielebarchiesi@4
|
3 homepage: http://arc.semsol.org/
|
danielebarchiesi@4
|
4 license: http://arc.semsol.org/license
|
danielebarchiesi@4
|
5
|
danielebarchiesi@4
|
6 class: ARC2 Semantic HTML Parser (ARC2_SemHTMLParser)
|
danielebarchiesi@4
|
7 author: Benjamin Nowack
|
danielebarchiesi@4
|
8 version: 2010-11-16
|
danielebarchiesi@4
|
9 */
|
danielebarchiesi@4
|
10
|
danielebarchiesi@4
|
11 ARC2::inc('LegacyXMLParser');
|
danielebarchiesi@4
|
12
|
danielebarchiesi@4
|
13 class ARC2_SemHTMLParser extends ARC2_LegacyXMLParser {
|
danielebarchiesi@4
|
14
|
danielebarchiesi@4
|
/**
 * Builds the parser by handing both arguments straight to the parent
 * constructor; no additional setup happens here.
 *
 * @param mixed $a      Configuration forwarded unchanged to the parent.
 * @param mixed $caller Calling component (received by reference), forwarded unchanged.
 */
function __construct($a, &$caller) {
  parent::__construct($a, $caller);
}
|
danielebarchiesi@4
|
18
|
danielebarchiesi@4
|
/**
 * Runs the parent initialiser and then resets this parser's own state.
 *
 * Three values (bnode_prefix, auto_extract, keep_cdata_whitespace) are looked
 * up through $this->v() with $this->a as source; v() is defined outside this
 * file and presumably returns the configured value or the supplied default.
 */
function __init() {
  parent::__init();
  /* formats tried by extractRDF() when nothing else is configured */
  $this->default_sem_html_formats = 'dc openid erdf rdfa posh-rdf microformats';
  /* triple store and duplicate tracking */
  $this->triples = array();
  $this->target_encoding = '';
  $this->t_count = 0;
  $this->added_triples = array();
  $this->skip_dupes = false;
  /* blank node naming: "arc" + 4 hex characters + "b" unless a prefix is supplied */
  $fallback_prefix = 'arc' . substr(md5(uniqid(rand())), 0, 4) . 'b';
  $this->bnode_prefix = $this->v('bnode_prefix', $fallback_prefix, $this->a);
  $this->bnode_id = 0;
  /* extraction bookkeeping */
  $this->auto_extract = $this->v('auto_extract', 1, $this->a);
  $this->extracted_formats = array();
  $this->cache = array();
  $this->detected_formats = array();
  /* whether xCData() re-attaches leading whitespace to character data */
  $this->keep_cdata_ws = $this->v('keep_cdata_whitespace', 0, $this->a);
}
|
danielebarchiesi@4
|
35
|
danielebarchiesi@4
|
36 /* */
|
danielebarchiesi@4
|
37
|
danielebarchiesi@4
|
/**
 * Matches $re at the start of $v after skipping leading whitespace.
 *
 * The pattern is compiled as "/^" . $re . "(.*)$/" . $options, so the last
 * capture group of the result always holds the unconsumed remainder.
 *
 * @param string $re      Regex fragment (no delimiters, "/" must be escaped by the caller).
 * @param string $v       Input text.
 * @param string $options PCRE modifiers appended after the closing delimiter.
 * @param mixed  $keep_ws When truthy, the skipped leading whitespace is prepended to match group 1.
 * @return array|false    preg_match() result array on success, false otherwise.
 */
function x($re, $v, $options = 'si', $keep_ws = 0) {
  /* split off leading whitespace; fall back to the untouched input if the split fails */
  $lead = '';
  if (preg_match('/^(\s*)(.*)$/s', $v, $parts)) {
    $lead = $parts[1];
    $v = $parts[2];
  }
  $pattern = "/^" . $re . "(.*)$/" . $options;
  if (!preg_match($pattern, $v, $hit)) {
    return false;
  }
  if ($keep_ws) {
    $hit[1] = $lead . $hit[1];
  }
  return $hit;
}
|
danielebarchiesi@4
|
46
|
danielebarchiesi@4
|
47 /* */
|
danielebarchiesi@4
|
48
|
danielebarchiesi@4
|
/**
 * Stores the given reader on the parser; parse() only creates its own
 * reader when $this->v('reader') is falsy.
 *
 * @param object $reader Reader instance (received by reference).
 */
function setReader(&$reader) {
  $this->reader = $reader;
}
|
danielebarchiesi@4
|
52
|
danielebarchiesi@4
|
/**
 * Advances the blank node counter and returns the next blank node label.
 *
 * @return string "_:" followed by the bnode prefix and the new counter value.
 */
function createBnodeID(){
  $this->bnode_id++;
  $label = $this->bnode_prefix . $this->bnode_id;
  return '_:' . $label;
}
|
danielebarchiesi@4
|
57
|
danielebarchiesi@4
|
/**
 * Appends a triple to $this->triples and bumps $this->t_count.
 *
 * The object value ($t['o']) is passed through html_entity_decode() first.
 * When $this->skip_dupes is on, a triple whose serialized form was already
 * added is ignored.
 *
 * @param array $t Triple; must contain the key 'o'.
 */
function addT($t) {
  if (function_exists('html_entity_decode')) {
    $t['o'] = html_entity_decode($t['o']);
  }
  if ($this->skip_dupes) {
    /* fingerprint is taken after entity decoding, so it reflects the stored form */
    $fingerprint = md5(serialize($t));
    if (isset($this->added_triples[$fingerprint])) {
      return;
    }
    $this->added_triples[$fingerprint] = true;
  }
  $this->triples[$this->t_count] = $t;
  $this->t_count++;
}
|
danielebarchiesi@4
|
75
|
danielebarchiesi@4
|
/**
 * Returns the value obtained from $this->v('triples', array()).
 *
 * v() is defined outside this file; presumably this yields the collected
 * triples, or an empty array when none are set — confirm against v().
 *
 * @return mixed
 */
function getTriples() {
  $triples = $this->v('triples', array());
  return $triples;
}
|
danielebarchiesi@4
|
79
|
danielebarchiesi@4
|
/**
 * Reports the running triple counter maintained by addT().
 *
 * @return int Current value of $this->t_count.
 */
function countTriples() {
  $total = $this->t_count;
  return $total;
}
|
danielebarchiesi@4
|
83
|
danielebarchiesi@4
|
/**
 * Passes this parser's triples to the static ARC2::getSimpleIndex() helper
 * and returns its result unchanged.
 *
 * @param mixed $flatten_objects Forwarded as-is to ARC2::getSimpleIndex().
 * @param mixed $vals            Forwarded as-is to ARC2::getSimpleIndex().
 * @return mixed                 Whatever ARC2::getSimpleIndex() returns.
 */
function getSimpleIndex($flatten_objects = 1, $vals = '') {
  $triples = $this->getTriples();
  return ARC2::getSimpleIndex($triples, $flatten_objects, $vals);
}
|
danielebarchiesi@4
|
87
|
danielebarchiesi@4
|
88 /* */
|
danielebarchiesi@4
|
89
|
danielebarchiesi@4
|
/**
 * Reads a document through the reader, tokenises it chunk by chunk with
 * processData(), then finishes with done().
 *
 * @param mixed $path         Forwarded to the reader's activate().
 * @param mixed $data         Forwarded to the reader's activate().
 * @param mixed $iso_fallback Not used by this method.
 * @return mixed              The result of $this->done().
 */
function parse($path, $data = '', $iso_fallback = 'ignore') {
  /* reset node state */
  $this->nodes = array();
  $this->node_count = 0;
  $this->level = 0;
  /* reader: create one unless $this->v('reader') already yields something truthy */
  if (!$this->v('reader')) {
    ARC2::inc('Reader');
    $this->reader = new ARC2_Reader($this->a, $this);
  }
  $this->reader->setAcceptHeader('Accept: text/html, application/xhtml, */*; q=0.9');
  $this->reader->activate($path, $data);
  $this->target_encoding = $this->reader->getEncoding(false);
  /* a non-empty configured base takes precedence over the reader's base */
  if (!empty($this->a['base'])) {
    $this->x_base = $this->a['base'];
  }
  else {
    $this->x_base = $this->reader->base;
  }
  $this->base = $this->x_base;
  $this->doc_url = $this->reader->base;
  /* parse: text that processData() could not consume is prepended to the next chunk */
  $pending = '';
  $this->cur_tag = '';
  while (true) {
    $chunk = $this->reader->readStream(1);
    if (!$chunk) {
      break;
    }
    $pending = $this->processData($pending . $chunk);
  }
  $this->reader->closeStream();
  unset($this->reader);
  return $this->done();
}
|
danielebarchiesi@4
|
115
|
danielebarchiesi@4
|
116 /* */
|
danielebarchiesi@4
|
117
|
danielebarchiesi@4
|
/**
 * Returns the encoding recorded in $this->target_encoding (set by parse()).
 *
 * @param mixed $src Not used by this method.
 * @return mixed     Current value of $this->target_encoding.
 */
function getEncoding($src = 'ignore') {
  $encoding = $this->target_encoding;
  return $encoding;
}
|
danielebarchiesi@4
|
121
|
danielebarchiesi@4
|
122 /* */
|
danielebarchiesi@4
|
123
|
danielebarchiesi@4
|
/**
 * Post-parse hook: triggers extractRDF() when auto extraction is enabled.
 * Returns nothing.
 */
function done() {
  if (!$this->auto_extract) {
    return;
  }
  $this->extractRDF();
}
|
danielebarchiesi@4
|
129
|
danielebarchiesi@4
|
130 /* */
|
danielebarchiesi@4
|
131
|
danielebarchiesi@4
|
/**
 * Tokenises as much of $v as possible and returns the unconsumed remainder.
 *
 * On each pass the token types are tried in a fixed order: comment, doctype,
 * whitespace (only while $this->level is truthy), opening tag, closing tag,
 * character data. A recognised token is reported through open()/close()/cData()
 * and the loop restarts; when nothing matches, the loop ends. Opening tags and
 * doctypes are also inspected for hints that the document uses eRDF, poshRDF
 * or RDFa (recorded in $this->detected_formats).
 *
 * Each x*() helper returns array(result, rest); a falsy result means "no match".
 *
 * @param string $v Buffered markup.
 * @return string   The part of $v that could not be tokenised yet.
 */
function processData($v) {
  $rest = $v;
  while (true) {
    /* comment */
    list($hit, $rest) = $this->xComment($rest);
    if ($hit) {
      $this->open(0, 'comment', array('value' => $hit));
      $this->close(0, 'comment');
      continue;
    }
    /* doctype */
    list($hit, $rest) = $this->xDoctype($rest);
    if ($hit) {
      $this->open(0, 'doctype', array('value' => $hit));
      $this->close(0, 'doctype');
      /* RDFa detection */
      if (preg_match('/rdfa /i', $hit)) {
        $this->detected_formats['rdfa'] = 1;
      }
      continue;
    }
    /* whitespace, only tried while inside an element */
    if ($this->level) {
      list($hit, $rest) = $this->xWS($rest);
      if ($hit) {
        $this->cData(0, $hit);
        continue;
      }
    }
    /* opening tag */
    list($hit, $rest) = $this->xOpen($rest);
    if ($hit) {
      $this->open(0, $hit['tag'], $hit['a']);
      $this->cur_tag = $hit['tag'];
      if ($hit['empty']) {
        $this->close(0, $hit['tag'], 1);
        $this->cur_tag = '';
      }
      /* eRDF detection */
      if (
        !isset($this->detected_formats['erdf'])
        && isset($hit['a']['profile m'])
        && in_array('http://purl.org/NET/erdf/profile', $hit['a']['profile m'])
      ) {
        $this->detected_formats['erdf'] = 1;
      }
      /* poshRDF detection */
      if (
        !isset($this->detected_formats['posh-rdf'])
        && isset($hit['a']['class m'])
        && in_array('rdf-p', $hit['a']['class m'])
      ) {
        $this->detected_formats['posh-rdf'] = 1;
      }
      /* RDFa detection: html version attribute, a prefixed xmlns declaration, or RDFa attributes */
      if (!isset($this->detected_formats['rdfa'])) {
        $is_rdfa = (($this->cur_tag == 'html') && isset($hit['a']['version m']) && in_array('XHTML+RDFa', $hit['a']['version m']))
          || (isset($hit['a']['xmlns']) && $hit['a']['xmlns'] && $this->isRDFNSDecl($hit['a']['xmlns']))
          || array_intersect(array('about', 'typeof', 'property'), array_keys($hit['a']));
        if ($is_rdfa) {
          $this->detected_formats['rdfa'] = 1;
        }
      }
      continue;
    }
    /* closing tag */
    list($hit, $rest) = $this->xClose($rest);
    if ($hit) {
      /* tags in this list were already closed implicitly when opened.
         NOTE(review): 'xmeta' is reproduced as found; it looks like 'meta' was deliberately disabled - confirm */
      if (!preg_match('/^(area|base|br|col|frame|hr|input|img|link|xmeta|param)$/', $hit['tag'])) {
        $this->close(0, $hit['tag']);
        $this->cur_tag = '';
      }
      continue;
    }
    /* character data */
    list($hit, $rest) = $this->xCData($rest);
    if ($hit) {
      $this->cData(0, $hit);
      continue;
    }
    /* nothing recognisable at the front of the buffer */
    break;
  }
  return $rest;
}
|
danielebarchiesi@4
|
185
|
danielebarchiesi@4
|
186 /* */
|
danielebarchiesi@4
|
187
|
danielebarchiesi@4
|
/**
 * Tells whether a namespace map contains at least one truthy key, i.e. a
 * declaration with a non-empty prefix (xAttributes() stores the default
 * namespace under the empty-string key).
 *
 * @param array $ns Map of prefix => namespace URI.
 * @return int      1 if any key is truthy, 0 otherwise.
 */
function isRDFNSDecl($ns) {
  $found = 0;
  foreach ($ns as $prefix => $uri) {
    if ($prefix) {
      $found = 1;
      break;
    }
  }
  return $found;
}
|
danielebarchiesi@4
|
194
|
danielebarchiesi@4
|
195 /* */
|
danielebarchiesi@4
|
196
|
danielebarchiesi@4
|
/**
 * Tries to read an HTML comment at the start of $v.
 *
 * @param string $v Input text.
 * @return array    array(comment text, rest) on success, array(0, $v) when
 *                  $v does not start with a complete comment.
 */
function xComment($v) {
  $opened = $this->x('\<\!\-\-', $v);
  if (!$opened) {
    return array(0, $v);
  }
  /* ungreedy: stop at the first "-->" */
  $body = $this->x('(.*)\-\-\>', $opened[1], 'Us');
  if (!$body) {
    return array(0, $v);
  }
  return array($body[1], $body[2]);
}
|
danielebarchiesi@4
|
205
|
danielebarchiesi@4
|
/**
 * Tries to read a doctype declaration at the start of $v.
 *
 * @param string $v Input text.
 * @return array    array(text between "<!DOCTYPE" and ">", rest) on success,
 *                  array(0, $v) otherwise.
 */
function xDoctype($v) {
  $opened = $this->x('\<\!DOCTYPE', $v);
  if (!$opened) {
    return array(0, $v);
  }
  $decl = $this->x('([^\>]+)\>', $opened[1]);
  if (!$decl) {
    return array(0, $v);
  }
  return array($decl[1], $decl[2]);
}
|
danielebarchiesi@4
|
214
|
danielebarchiesi@4
|
/**
 * Tries to read whitespace at the start of $v.
 *
 * NOTE(review): this calls the static ARC2::x() (defined outside this file)
 * instead of $this->x(). $this->x() removes leading whitespace before
 * matching, so it could never match a pattern beginning with \s+ - presumably
 * that is why the static variant is used; confirm against ARC2::x().
 *
 * @param string $v Input text.
 * @return array    array(match group 1, match group 2) on success, array(0, $v) otherwise.
 */
function xWS($v) {
  $m = ARC2::x('(\s+)', $v);
  if (!$m) {
    return array(0, $v);
  }
  return array($m[1], $m[2]);
}
|
danielebarchiesi@4
|
221
|
danielebarchiesi@4
|
222 /* */
|
danielebarchiesi@4
|
223
|
danielebarchiesi@4
|
/**
 * Tries to read an opening tag at the start of $v.
 *
 * The tag name is lower-cased once and that normalised name is used both in
 * the result and for the empty-element check, so upper-case void elements
 * such as <BR> or <IMG> are flagged as empty just like their lower-case forms
 * (xClose() already compares lower-cased names).
 *
 * @param string $v Input text.
 * @return array    array(array('tag' => lower-cased name, 'a' => attributes,
 *                  'empty' => 1|0), rest) on success, array(0, $v) otherwise.
 */
function xOpen($v) {
  $m = $this->x('\<([^\s\/\>]+)([^\>]*)\>', $v);
  if (!$m) {
    return array(0, $v);
  }
  $tag = strtolower($m[1]);
  /* $m[2] is the raw attribute text between the tag name and ">" */
  list($attrs) = $this->xAttributes($m[2]);
  $info = array(
    'tag' => $tag,
    'a' => $attrs,
    'empty' => $this->isEmpty($tag, $m[2]),
  );
  return array($info, $m[3]);
}
|
danielebarchiesi@4
|
231
|
danielebarchiesi@4
|
232 /* */
|
danielebarchiesi@4
|
233
|
danielebarchiesi@4
|
/**
 * Reads all attributes from the attribute text of a tag.
 *
 * Namespace declarations (names starting with "xmlns", optionally followed by
 * ":prefix") are announced through nsDecl() and collected under the 'xmlns'
 * key, indexed by prefix. Every other attribute is stored twice: under its
 * name (single value) and under "<name> m" (list of values).
 *
 * @param string $v Attribute text.
 * @return array    array(attribute map, unconsumed rest).
 */
function xAttributes($v) {
  $attrs = array();
  while (true) {
    list($attr, $v) = $this->xAttribute($v);
    if (!$attr) {
      break;
    }
    $ns = $this->x('xmlns\:?(.*)', $attr['k']);
    if ($ns) {
      $prefix = $ns[1];
      $this->nsDecl(0, $prefix, $attr['value']);
      $attrs['xmlns'][$prefix] = $attr['value'];
    }
    else {
      $attrs[$attr['k']] = $attr['value'];
      $attrs[$attr['k'] . ' m'] = $attr['values'];
    }
  }
  return array($attrs, $v);
}
|
danielebarchiesi@4
|
248
|
danielebarchiesi@4
|
249 /* */
|
danielebarchiesi@4
|
250
|
danielebarchiesi@4
|
/**
 * Reads a single attribute from the start of $v.
 *
 * Three shapes are handled: a bare name without "=" (value 1; a lone "/" is
 * treated as "no attribute"), an unquoted value (runs up to the next
 * whitespace), and a quoted value (collected character by character until
 * the quote character, or a backslash followed by it, is found).
 *
 * @param string $v Attribute text.
 * @return array    array(array('k' => name, 'value' => value, 'values' => list), rest)
 *                  on success, array(0, rest) when no attribute was read.
 */
function xAttribute($v) {
  $head = $this->x('([^\s\=]+)\s*(\=)?\s*([\'\"]?)', $v);
  if (!$head) {
    return array(0, $v);
  }
  $name = $head[1];
  $quote = $head[3];
  $tail = $head[4];
  /* no '=' */
  if (!$head[2]) {
    if ($name == '/') {
      return array(0, $tail);
    }
    return array(array('k' => $name, 'value' => 1, 'values' => array(1)), $tail);
  }
  /* no quots */
  if (!$quote) {
    $bare = $this->x('([^\s]+)', $tail);
    if ($bare) {
      return array(array('k' => $name, 'value' => $bare[1], 'values' => array($bare[1])), $bare[2]);
    }
    return array(array('k' => $name, 'value' => '', 'values' => array()), $tail);
  }
  /* quoted: move one character at a time into $val until the closing pattern matches.
     x() skips leading whitespace before matching, so whitespace directly in front of
     the closing quote does not end up in the value. */
  $closing = '(\x5c\\' .$quote. '|\\' .$quote. ')';
  $val = '';
  $rest = $tail;
  $closer = false;
  while ($rest) {
    $closer = $this->x($closing, $rest);
    if ($closer) {
      break;
    }
    $val .= substr($rest, 0, 1);
    $rest = substr($rest, 1);
  }
  /* $rest is only still truthy when the loop stopped on a closing match */
  if ($rest) {
    $rest = $closer[2];
  }
  $vals = explode(' ', $val);
  return array(array('k' => $name, 'value' => $val, 'values' => $vals), $rest);
}
|
danielebarchiesi@4
|
278
|
danielebarchiesi@4
|
279 /* */
|
danielebarchiesi@4
|
280
|
danielebarchiesi@4
|
/**
 * Decides whether an opening tag is an empty element.
 *
 * A tag is empty when its name is in the list of implicitly closed elements
 * or when its attribute text ends with "/" (self-closing syntax). The name
 * comparison is case-insensitive because xOpen() passes the name as written
 * in the markup, so "<BR>" must be treated like "<br>".
 *
 * NOTE(review): the list contains 'xmeta', not 'meta'; this looks like a
 * deliberate way to exclude meta elements - confirm before changing.
 *
 * @param string $t Tag name, in any letter case.
 * @param string $v Raw attribute text of the tag (everything between the name and ">").
 * @return int      1 if the element is empty, 0 otherwise.
 */
function isEmpty($t, $v) {
  if (preg_match('/^(area|base|br|col|frame|hr|input|img|link|xmeta|param)$/i', $t)) {
    return 1;
  }
  return preg_match('/\/$/', $v) ? 1 : 0;
}
|
danielebarchiesi@4
|
290
|
danielebarchiesi@4
|
291 /* */
|
danielebarchiesi@4
|
292
|
danielebarchiesi@4
|
/**
 * Tries to read a closing tag at the start of $v.
 *
 * @param string $v Input text.
 * @return array    array(array('tag' => lower-cased name), rest) on success,
 *                  array(0, $v) otherwise.
 */
function xClose($v) {
  $m = $this->x('\<\/([^\s\>]+)\>', $v);
  if (!$m) {
    return array(0, $v);
  }
  $tag = strtolower($m[1]);
  return array(array('tag' => $tag), $m[2]);
}
|
danielebarchiesi@4
|
299
|
danielebarchiesi@4
|
300 /* */
|
danielebarchiesi@4
|
301
|
danielebarchiesi@4
|
/**
 * Tries to read character data at the start of $v.
 *
 * Inside an element whose name contains "script" or "style", everything up to
 * the matching closing tag is taken as data and the closing tag itself is left
 * in the returned rest. Elsewhere, data runs up to the next "<".
 *
 * The current tag name is escaped with preg_quote() before it is placed into
 * the pattern: tag names come from the document and may contain regex
 * metacharacters (e.g. "<script(>"), which would otherwise make the pattern
 * fail to compile.
 *
 * @param string $v Input text.
 * @return array    array(data, rest) on success, array(0, $v) otherwise.
 */
function xCData($v) {
  if (preg_match('/(script|style)/i', $this->cur_tag)) {
    $closing_tag = '(\<\/' . preg_quote($this->cur_tag, '/') . '\>)';
    /* ungreedy: stop at the first closing tag; requires at least one character of data */
    if ($r = $this->x('(.+)' . $closing_tag, $v, 'Uis')) {
      return array($r[1], $r[2] . $r[3]);
    }
  }
  elseif ($r = $this->x('([^\<]+)', $v, 'si', $this->keep_cdata_ws)) {
    /* with keep_cdata_ws set, x() re-attaches the leading whitespace to the data */
    return array($r[1], $r[2]);
  }
  return array(0, $v);
}
|
danielebarchiesi@4
|
313
|
danielebarchiesi@4
|
314 /* */
|
danielebarchiesi@4
|
315
|
danielebarchiesi@4
|
/**
 * Runs the extractor component for each requested format once.
 *
 * For a format name F the component "<CamelCase(F)>Extractor" is loaded via
 * ARC2::inc(); if that succeeds, class "ARC2_<component>" is instantiated and
 * its extractRDF() is called. A format is recorded as extracted whether or
 * not its component could be loaded, so it is never attempted twice.
 *
 * @param string $formats Space-separated format names; when empty, the value of
 *                        $this->v('sem_html_formats', <default list>, $this->a) is used.
 */
function extractRDF($formats = '') {
  $this->node_index = $this->getNodeIndex();
  if (!$formats) {
    $formats = $this->v('sem_html_formats', $this->default_sem_html_formats, $this->a);
  }
  $names = preg_split('/ /', $formats);
  foreach ($names as $format) {
    if (in_array($format, $this->extracted_formats)) {
      continue;
    }
    $comp = $this->camelCase($format) . 'Extractor';
    if (ARC2::inc($comp)) {
      $cls = 'ARC2_' . $comp;
      $extractor = new $cls($this->a, $this);
      $extractor->extractRDF();
    }
    $this->extracted_formats[] = $format;
  }
}
|
danielebarchiesi@4
|
332
|
danielebarchiesi@4
|
/**
 * Looks up a node by id in $this->nodes.
 *
 * @param mixed $id Node key.
 * @return mixed    The stored node, or 0 when no entry is set for $id.
 */
function getNode($id) {
  if (isset($this->nodes[$id])) {
    return $this->nodes[$id];
  }
  return 0;
}
|
danielebarchiesi@4
|
336
|
danielebarchiesi@4
|
337 /* */
|
danielebarchiesi@4
|
338
|
danielebarchiesi@4
|
339 } |