Mercurial > hg > rr-repo
comparison sites/all/libraries/ARC2/arc/parsers/ARC2_SemHTMLParser.php @ 4:ce11bbd8f642
added modules
author | danieleb <danielebarchiesi@me.com> |
---|---|
date | Thu, 19 Sep 2013 10:38:44 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
3:b28be78d8160 | 4:ce11bbd8f642 |
---|---|
1 <?php | |
2 /* | |
3 homepage: http://arc.semsol.org/ | |
4 license: http://arc.semsol.org/license | |
5 | |
6 class: ARC2 SemHTML Parser | |
7 author: Benjamin Nowack | |
8 version: 2010-11-16 | |
9 */ | |
10 | |
11 ARC2::inc('LegacyXMLParser'); | |
12 | |
13 class ARC2_SemHTMLParser extends ARC2_LegacyXMLParser { | |
14 | |
/**
 * Creates the parser; all setup is delegated to the parent constructor.
 *
 * @param mixed $a       Configuration, passed through unchanged.
 * @param mixed $caller  Calling component, received by reference and passed through.
 */
function __construct($a, &$caller) {
  parent::__construct($a, $caller);
}
18 | |
/**
 * Resets the parser state after running the parent's __init().
 *
 * Three settings are looked up through $this->v() with $this->a as the
 * source (presumably the configuration array -- v() is defined outside
 * this file): 'bnode_prefix', 'auto_extract' and 'keep_cdata_whitespace'.
 */
function __init() {
  parent::__init();

  /* formats tried by extractRDF() when the caller does not name any */
  $this->default_sem_html_formats = 'dc openid erdf rdfa posh-rdf microformats';
  $this->target_encoding = '';

  /* triple store and duplicate tracking used by addT() */
  $this->triples = array();
  $this->t_count = 0;
  $this->added_triples = array();
  $this->skip_dupes = false;

  /* blank node ids: a random-looking default prefix plus a counter */
  $fallback_prefix = 'arc' . substr(md5(uniqid(rand())), 0, 4) . 'b';
  $this->bnode_prefix = $this->v('bnode_prefix', $fallback_prefix, $this->a);
  $this->bnode_id = 0;

  /* extraction bookkeeping */
  $this->auto_extract = $this->v('auto_extract', 1, $this->a);
  $this->extracted_formats = array();
  $this->cache = array();
  $this->detected_formats = array();

  /* forwarded to x() by xCData() as its $keep_ws flag */
  $this->keep_cdata_ws = $this->v('keep_cdata_whitespace', 0, $this->a);
}
35 | |
36 /* */ | |
37 | |
/**
 * Matches the regex fragment $re at the start of $v, skipping leading whitespace.
 *
 * The pattern actually run is "/^" . $re . "(.*)$/" . $options, so the last
 * entry of the returned match array is always the unconsumed remainder of $v.
 *
 * @param string $re       Regex fragment (no delimiters, no anchors).
 * @param string $v        Input to match against.
 * @param string $options  PCRE modifiers appended after the closing delimiter.
 * @param mixed  $keep_ws  When truthy, the skipped leading whitespace is put
 *                         back in front of the first captured group.
 * @return array|false     preg_match() groups on success, false otherwise.
 */
function x($re, $v, $options = 'si', $keep_ws = 0) {
  /*
   * Without "D", "$" also matches just before a final newline. Combined with
   * the "U" (ungreedy) modifier the trailing "(.*)$" then stops one character
   * early and the remainder silently loses its last "\n". Forcing "D" keeps
   * the remainder complete; greedy patterns are unaffected.
   */
  if (strpos($options, 'D') === false) {
    $options .= 'D';
  }
  /* split off leading whitespace so $re is applied to the first visible char */
  $ws = '';
  if (preg_match('/^(\s*)(.*)$/s', $v, $m)) {
    $ws = $m[1];
    $v = $m[2];
  }
  if (!preg_match('/^' . $re . '(.*)$/' . $options, $v, $m)) {
    return false;
  }
  if ($keep_ws) {
    $m[1] = $ws . $m[1];
  }
  return $m;
}
46 | |
47 /* */ | |
48 | |
/**
 * Stores the given reader on the parser; parse() uses it instead of
 * creating its own when one is already set.
 *
 * @param object $reader  Reader instance, received by reference.
 */
function setReader(&$reader) {
  $this->reader = $reader;
}
52 | |
/**
 * Advances the blank node counter and returns the next blank node id.
 *
 * @return string  '_:' followed by the configured prefix and the new counter value.
 */
function createBnodeID(){
  ++$this->bnode_id;
  $label = $this->bnode_prefix . $this->bnode_id;
  return '_:' . $label;
}
57 | |
/**
 * Appends one triple to $this->triples and bumps $this->t_count.
 *
 * The 'o' entry is run through html_entity_decode() first (when that
 * function exists). With $this->skip_dupes enabled, a triple whose
 * serialized form was already added is dropped.
 *
 * @param array $t  Triple; must carry an 'o' key.
 */
function addT($t) {
  if (function_exists('html_entity_decode')) {
    $t['o'] = html_entity_decode($t['o']);
  }
  if ($this->skip_dupes) {
    /* fingerprint of the (already decoded) triple */
    $fingerprint = md5(serialize($t));
    if (isset($this->added_triples[$fingerprint])) {
      return;
    }
    $this->added_triples[$fingerprint] = true;
  }
  $this->triples[$this->t_count] = $t;
  $this->t_count++;
}
75 | |
/**
 * Returns the collected triples, looked up through $this->v() with an
 * empty array as the default.
 *
 * @return array
 */
function getTriples() {
  $fallback = array();
  return $this->v('triples', $fallback);
}
79 | |
/**
 * Returns the running triple counter maintained by addT().
 *
 * @return int
 */
function countTriples() {
  $total = $this->t_count;
  return $total;
}
83 | |
/**
 * Hands the collected triples to ARC2::getSimpleIndex() together with the
 * two options, and returns whatever that call returns.
 *
 * @param mixed $flatten_objects  Forwarded unchanged.
 * @param mixed $vals             Forwarded unchanged.
 * @return mixed
 */
function getSimpleIndex($flatten_objects = 1, $vals = '') {
  $triples = $this->getTriples();
  return ARC2::getSimpleIndex($triples, $flatten_objects, $vals);
}
87 | |
88 /* */ | |
89 | |
/**
 * Reads a document through the reader and feeds it chunk-wise to processData().
 *
 * Resets the node state, creates an ARC2_Reader when none was set, records
 * the target encoding and the base / document URLs, then loops over the
 * stream. The reader is closed and unset before done() is called.
 *
 * @param string $path          Passed to the reader's activate().
 * @param string $data          Passed to the reader's activate().
 * @param string $iso_fallback  Not used by this method.
 * @return mixed                Whatever done() returns.
 */
function parse($path, $data = '', $iso_fallback = 'ignore') {
  $this->nodes = array();
  $this->node_count = 0;
  $this->level = 0;
  /* reader */
  if (!$this->v('reader')) {
    ARC2::inc('Reader');
    $this->reader = new ARC2_Reader($this->a, $this);
  }
  $this->reader->setAcceptHeader('Accept: text/html, application/xhtml, */*; q=0.9');
  $this->reader->activate($path, $data);
  $this->target_encoding = $this->reader->getEncoding(false);
  /* an explicitly configured base wins over the one reported by the reader */
  $this->x_base = !empty($this->a['base']) ? $this->a['base'] : $this->reader->base;
  $this->base = $this->x_base;
  $this->doc_url = $this->reader->base;
  /* parse */
  $rest = '';
  $this->cur_tag = '';
  /*
   * A chunk consisting of just "0" is falsy in PHP and used to end the loop
   * early; accept it explicitly. Every other falsy value still ends the loop.
   * NOTE(review): assumes readStream() never returns the string "0" as its
   * end-of-stream marker -- confirm against ARC2_Reader.
   */
  while (($d = $this->reader->readStream(1)) || $d === '0') {
    /* unparsed tail of the previous chunk is prepended to the next one */
    $rest = $this->processData($rest . $d);
  }
  $this->reader->closeStream();
  unset($this->reader);
  return $this->done();
}
115 | |
116 /* */ | |
117 | |
/**
 * Returns the target encoding recorded by parse().
 *
 * @param mixed $src  Ignored here.
 * @return string
 */
function getEncoding($src = 'ignore') {
  $encoding = $this->target_encoding;
  return $encoding;
}
121 | |
122 /* */ | |
123 | |
/**
 * Called by parse() once the stream is exhausted; triggers RDF extraction
 * unless auto extraction is switched off. Returns nothing.
 */
function done() {
  if (!$this->auto_extract) {
    return;
  }
  $this->extractRDF();
}
129 | |
130 /* */ | |
131 | |
/**
 * Consumes as many complete tokens as possible from $v and returns the rest.
 *
 * Per iteration, in this order: comment, doctype, whitespace (only inside an
 * element), opening tag, closing tag, character data. Each token is reported
 * through $this->open() / $this->close() / $this->cData() (defined outside
 * this file). Opening tags and doctypes also feed the format detection flags
 * in $this->detected_formats.
 *
 * The x*() helpers signal "no match" with the integer 0 as first element.
 * Comments and character data are compared strictly against that marker:
 * a plain truthiness test also rejects an empty / whitespace-only comment
 * (which xOpen() would then mistake for an element named "!----") and a text
 * node "0" (which was dropped and stopped the loop).
 *
 * @param string $v  Buffered input (leftover of the previous call + new chunk).
 * @return string    Unconsumed tail, to be prepended to the next chunk.
 */
function processData($v) {
  $sub_v = $v;
  do {
    $proceed = 1;
    if ((list($sub_r, $sub_v) = $this->xComment($sub_v)) && ($sub_r !== 0)) {
      $this->open(0, 'comment', array('value' => $sub_r));
      $this->close(0, 'comment');
      continue;
    }
    if ((list($sub_r, $sub_v) = $this->xDoctype($sub_v)) && $sub_r) {
      $this->open(0, 'doctype', array('value' => $sub_r));
      $this->close(0, 'doctype');
      /* RDFa detection */
      if (preg_match('/rdfa /i', $sub_r)) $this->detected_formats['rdfa'] = 1;
      continue;
    }
    /* whitespace only counts as character data once an element is open */
    if ($this->level && ((list($sub_r, $sub_v) = $this->xWS($sub_v)) && $sub_r)) {
      $this->cData(0, $sub_r);
    }
    elseif ((list($sub_r, $sub_v) = $this->xOpen($sub_v)) && $sub_r) {
      $this->open(0, $sub_r['tag'], $sub_r['a']);
      $this->cur_tag = $sub_r['tag'];
      if ($sub_r['empty']) {
        $this->close(0, $sub_r['tag'], 1);
        $this->cur_tag = '';
      }
      /* eRDF detection */
      if (!isset($this->detected_formats['erdf']) && isset($sub_r['a']['profile m']) && in_array('http://purl.org/NET/erdf/profile', $sub_r['a']['profile m'])) $this->detected_formats['erdf'] = 1;
      /* poshRDF detection */
      if (!isset($this->detected_formats['posh-rdf']) && isset($sub_r['a']['class m']) && in_array('rdf-p', $sub_r['a']['class m'])) $this->detected_formats['posh-rdf'] = 1;
      /* RDFa detection */
      if (!isset($this->detected_formats['rdfa']) && ($this->cur_tag == 'html') && isset($sub_r['a']['version m']) && in_array('XHTML+RDFa', $sub_r['a']['version m'])) $this->detected_formats['rdfa'] = 1;
      if (!isset($this->detected_formats['rdfa']) && isset($sub_r['a']['xmlns']) && $sub_r['a']['xmlns'] && $this->isRDFNSDecl($sub_r['a']['xmlns'])) $this->detected_formats['rdfa'] = 1;
      if (!isset($this->detected_formats['rdfa']) && array_intersect(array('about', 'typeof', 'property'), array_keys($sub_r['a']))) $this->detected_formats['rdfa'] = 1;
    }
    elseif ((list($sub_r, $sub_v) = $this->xClose($sub_v)) && $sub_r) {
      if (preg_match('/^(area|base|br|col|frame|hr|input|img|link|xmeta|param)$/', $sub_r['tag'])) {
        /* already implicitly closed */
      }
      else {
        $this->close(0, $sub_r['tag']);
        $this->cur_tag = '';
      }
    }
    elseif ((list($sub_r, $sub_v) = $this->xCData($sub_v)) && ($sub_r !== 0)) {
      $this->cData(0, $sub_r);
    }
    else {
      /* nothing complete left in the buffer: wait for more data */
      $proceed = 0;
    }
  } while ($proceed);
  return $sub_v;
}
185 | |
186 /* */ | |
187 | |
/**
 * Tells whether a namespace map holds at least one entry with a truthy key,
 * i.e. a declaration that carries a prefix.
 *
 * @param array $ns  Map of prefix => namespace URI, as built by xAttributes().
 * @return int       1 if such an entry exists, 0 otherwise.
 */
function isRDFNSDecl($ns) {
  $found = 0;
  foreach ($ns as $prefix => $uri) {
    if ($prefix) {
      $found = 1;
      break;
    }
  }
  return $found;
}
194 | |
195 /* */ | |
196 | |
/**
 * Tries to read one complete "<!-- ... -->" comment from the start of $v.
 *
 * @param string $v  Input buffer.
 * @return array     array(comment text, remaining input) on success,
 *                   array(0, $v) when no complete comment starts here.
 */
function xComment($v) {
  $opener = $this->x('\<\!\-\-', $v);
  if (!$opener) {
    return array(0, $v);
  }
  /* ungreedy, so the comment ends at the first "-->" */
  $body = $this->x('(.*)\-\-\>', $opener[1], 'Us');
  if (!$body) {
    return array(0, $v);
  }
  return array($body[1], $body[2]);
}
205 | |
/**
 * Tries to read one complete "<!DOCTYPE ...>" declaration from the start of $v.
 *
 * @param string $v  Input buffer.
 * @return array     array(declaration text up to ">", remaining input) on
 *                   success, array(0, $v) otherwise.
 */
function xDoctype($v) {
  $opener = $this->x('\<\!DOCTYPE', $v);
  if (!$opener) {
    return array(0, $v);
  }
  $decl = $this->x('([^\>]+)\>', $opener[1]);
  if (!$decl) {
    return array(0, $v);
  }
  return array($decl[1], $decl[2]);
}
214 | |
/**
 * Tries to read whitespace from the start of $v.
 *
 * Deliberately goes through the static ARC2::x() (defined outside this file)
 * and not $this->x(): the latter strips leading whitespace before applying
 * the pattern, so '(\s+)' could never match through it.
 *
 * @param string $v  Input buffer.
 * @return array     array(first group, second group) of the match on success,
 *                   array(0, $v) otherwise.
 */
function xWS($v) {
  $m = ARC2::x('(\s+)', $v);
  return $m ? array($m[1], $m[2]) : array(0, $v);
}
221 | |
222 /* */ | |
223 | |
/**
 * Tries to read one opening tag from the start of $v.
 *
 * @param string $v  Input buffer.
 * @return array     On success array(info, remaining input) where info has
 *                   'tag' (lower-cased name), 'a' (attributes as returned by
 *                   xAttributes()) and 'empty' (isEmpty() result);
 *                   array(0, $v) otherwise.
 */
function xOpen($v) {
  if ($r = $this->x('\<([^\s\/\>]+)([^\>]*)\>', $v)) {
    /*
     * Lower-case once and use the same name everywhere: isEmpty() matches its
     * void-element list case-sensitively, so passing the raw name made
     * "<BR>" / "<IMG>" look like elements that still need a closing tag.
     */
    $tag = strtolower($r[1]);
    list($sub_r, $sub_v) = $this->xAttributes($r[2]);
    return array(array('tag' => $tag, 'a' => $sub_r, 'empty' => $this->isEmpty($tag, $r[2])), $r[3]);
  }
  return array(0, $v);
}
231 | |
232 /* */ | |
233 | |
/**
 * Parses the attribute part of an opening tag.
 *
 * Attributes whose name starts with "xmlns" are reported through
 * $this->nsDecl() (defined outside this file) and collected under the
 * 'xmlns' key, indexed by prefix. Every other attribute is stored twice:
 * under its name (full value) and under "<name> m" (list of values).
 *
 * @param string $v  Attribute source text.
 * @return array     array(attribute map, unparsed remainder).
 */
function xAttributes($v) {
  $attrs = array();
  while (true) {
    list($attr, $v) = $this->xAttribute($v);
    if (!$attr) {
      break;
    }
    $ns = $this->x('xmlns\:?(.*)', $attr['k']);
    if ($ns) {
      $prefix = $ns[1];
      $this->nsDecl(0, $prefix, $attr['value']);
      $attrs['xmlns'][$prefix] = $attr['value'];
      continue;
    }
    $name = $attr['k'];
    $attrs[$name] = $attr['value'];
    $attrs[$name . ' m'] = $attr['values'];
  }
  return array($attrs, $v);
}
248 | |
249 /* */ | |
250 | |
/**
 * Reads a single attribute from the start of $v.
 *
 * Handles three shapes: a bare name (value 1), an unquoted value (runs up to
 * the next whitespace) and a quoted value (collected character by character
 * until the closing quote, or a backslash followed by that quote, is next).
 *
 * @param string $v  Attribute source text.
 * @return array     array(info, remainder) with info holding 'k' (name),
 *                   'value' and 'values' (value split on single spaces);
 *                   array(0, remainder) when nothing was read or a lone "/"
 *                   was found.
 */
function xAttribute($v) {
  $head = $this->x('([^\s\=]+)\s*(\=)?\s*([\'\"]?)', $v);
  if (!$head) {
    return array(0, $v);
  }
  $name = $head[1];
  $quote = $head[3];
  $tail = $head[4];

  if (!$head[2]) {/* no '=' */
    if ($name == '/') {
      /* trailing slash of a self-closing tag, not an attribute */
      return array(0, $tail);
    }
    return array(array('k' => $name, 'value' => 1, 'values' => array(1)), $tail);
  }

  if (!$quote) {/* no quots */
    $bare = $this->x('([^\s]+)', $tail);
    if ($bare) {
      return array(array('k' => $name, 'value' => $bare[1], 'values' => array($bare[1])), $bare[2]);
    }
    return array(array('k' => $name, 'value' => '', 'values' => array()), $tail);
  }

  /* quoted value: move one character at a time until the terminator is next */
  $terminator = '(\x5c\\' . $quote . '|\\' . $quote . ')';
  $val = '';
  $sub_v = $tail;
  while ($sub_v && (!$hit = $this->x($terminator, $sub_v))) {
    $val .= substr($sub_v, 0, 1);
    $sub_v = substr($sub_v, 1);
  }
  /* when the input ran out before a terminator, keep the (empty) remainder */
  $sub_v = $sub_v ? $hit[2] : $sub_v;
  $vals = preg_split('/ /', $val);
  return array(array('k' => $name, 'value' => $val, 'values' => $vals), $sub_v);
}
278 | |
279 /* */ | |
280 | |
/**
 * Tells whether an opening tag needs no matching closing tag.
 *
 * That is the case when the tag name is in the fixed list below, or when the
 * attribute text ends in "/" (self-closing syntax).
 *
 * The name is matched case-insensitively: HTML tag names are not case
 * sensitive, and a case-sensitive match let "<BR>" or "<IMG>" through as
 * elements that were never closed.
 *
 * @param string $t  Tag name, in any case.
 * @param string $v  Raw text between the tag name and ">".
 * @return int       1 if the element is empty, 0 otherwise.
 */
function isEmpty($t, $v) {
  /* NOTE(review): "xmeta" looks like a deliberately disabled "meta" -- kept as is */
  if (preg_match('/^(area|base|br|col|frame|hr|input|img|link|xmeta|param)$/i', $t)) {
    return 1;
  }
  if (preg_match('/\/$/', $v)) {
    return 1;
  }
  return 0;
}
290 | |
291 /* */ | |
292 | |
/**
 * Tries to read one closing tag from the start of $v.
 *
 * Whitespace between the tag name and ">" is accepted ("</div >"). Without
 * that, such a tag matched none of the branches in processData(), which then
 * returned the buffer unchanged on every call and never advanced past it.
 *
 * @param string $v  Input buffer.
 * @return array     array(array('tag' => lower-cased name), remaining input)
 *                   on success, array(0, $v) otherwise.
 */
function xClose($v) {
  if ($r = $this->x('\<\/([^\s\>]+)\s*\>', $v)) {
    return array(array('tag' => strtolower($r[1])), $r[2]);
  }
  return array(0, $v);
}
299 | |
300 /* */ | |
301 | |
/**
 * Tries to read character data from the start of $v.
 *
 * When the current tag name contains "script" or "style", everything up to
 * the matching closing tag is taken as one piece and the closing tag is left
 * in the remainder. Otherwise the text up to the next "<" is taken, keeping
 * leading whitespace only if $this->keep_cdata_ws is set.
 *
 * @param string $v  Input buffer.
 * @return array     array(text, remaining input) on success, array(0, $v) otherwise.
 */
function xCData($v) {
  $in_raw_text = preg_match('/(script|style)/i', $this->cur_tag);
  if ($in_raw_text) {
    $m = $this->x('(.+)(\<\/' . $this->cur_tag . '\>)', $v, 'Uis');
    if ($m) {
      /* hand the closing tag back so xClose() sees it */
      return array($m[1], $m[2] . $m[3]);
    }
    return array(0, $v);
  }
  $m = $this->x('([^\<]+)', $v, 'si', $this->keep_cdata_ws);
  if ($m) {
    return array($m[1], $m[2]);
  }
  return array(0, $v);
}
313 | |
314 /* */ | |
315 | |
/**
 * Runs one extractor per requested format over the parsed document.
 *
 * The format list is a space-separated string; when empty it is looked up
 * through $this->v('sem_html_formats', ...) with the built-in default list.
 * For each format not handled before, the component "<CamelCase>Extractor"
 * is loaded via ARC2::inc() and, if that succeeds, instantiated and run.
 * The format is recorded as handled either way.
 *
 * @param string $formats  Space-separated format names, or '' for the default.
 */
function extractRDF($formats = '') {
  $this->node_index = $this->getNodeIndex();
  if (!$formats) {
    $formats = $this->v('sem_html_formats', $this->default_sem_html_formats, $this->a);
  }
  foreach (preg_split('/ /', $formats) as $format) {
    if (in_array($format, $this->extracted_formats)) {
      continue;
    }
    $component = $this->camelCase($format) . 'Extractor';
    if (ARC2::inc($component)) {
      $class_name = 'ARC2_' . $component;
      $extractor = new $class_name($this->a, $this);
      $extractor->extractRDF();
    }
    $this->extracted_formats[] = $format;
  }
}
332 | |
/**
 * Looks up a parsed node by id.
 *
 * @param mixed $id  Key into $this->nodes.
 * @return mixed     The stored node, or 0 when no node is set under that id.
 */
function getNode($id) {
  if (isset($this->nodes[$id])) {
    return $this->nodes[$id];
  }
  return 0;
}
336 | |
337 /* */ | |
338 | |
339 } |