<?php
/**
 * ARC2 streaming SPOG parser
 *
 * @author Benjamin Nowack
 * @license <http://arc.semsol.org/license>
 * @homepage <http://arc.semsol.org/>
 * @package ARC2
 * @version 2010-11-16
 */

/* load the base class that ARC2_SPOGParser extends */
ARC2::inc('RDFParser');

/**
 * Streaming parser for XML result documents whose <result> elements carry
 * <binding> children named "s", "p", "o" and (optionally) "g".
 *
 * Each closed <result> element is converted into one triple array (see addT()).
 * The document is read chunk-wise from an ARC2_Reader and fed to PHP's
 * event-based XML parser; open(), close() and cData() are the SAX callbacks.
 *
 * NOTE(review): v(), addError(), done() and the properties triples, t_count,
 * added_triples and skip_dupes are not defined in this class; they are
 * presumably provided by the parent classes - confirm against ARC2_RDFParser.
 */
class ARC2_SPOGParser extends ARC2_RDFParser {

  /**
   * @param array  $a      configuration array, forwarded to the parent
   * @param object $caller calling component, forwarded to the parent (by reference)
   */
  function __construct($a, &$caller) {
    parent::__construct($a, $caller);
  }

  /**
   * Sets up the parser defaults: configured encoding (false when not set),
   * the xml/rdf namespace URIs and the initial namespace-to-prefix map.
   */
  function __init() {/* reader */
    parent::__init();
    $this->encoding = $this->v('encoding', false, $this->a);
    $this->xml = 'http://www.w3.org/XML/1998/namespace';
    $this->rdf = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
    $this->nsp = array($this->xml => 'xml', $this->rdf => 'rdf');
    $this->target_encoding = '';
  }

  /* */

  /**
   * Reads the document at $path (or the inline $data) and parses it chunk by chunk.
   *
   * @param string $path         location handed to the reader
   * @param string $data         optional inline document, handed to the reader
   * @param bool   $iso_fallback when true, the XML declaration of the first chunk
   *                             is replaced by one that declares ISO-8859-1
   * @return mixed result of done() on success, result of addError() on an XML error
   */
  function parse($path, $data = '', $iso_fallback = false) {
    $this->state = 0;
    /* reader */
    if (!$this->v('reader')) {
      ARC2::inc('Reader');
      $this->reader = new ARC2_Reader($this->a, $this);
    }
    $this->reader->setAcceptHeader('Accept: sparql-results+xml; q=0.9, */*; q=0.1');
    $this->reader->activate($path, $data);
    /* an explicitly configured base wins over the one detected by the reader */
    $this->x_base = isset($this->a['base']) && $this->a['base'] ? $this->a['base'] : $this->reader->base;
    /* xml parser */
    $this->initXMLParser();
    /* parse */
    $first = true;
    while ($d = $this->reader->readStream()) {
      if ($iso_fallback && $first) {
        /* force ISO-8859-1 by swapping the XML declaration of the first chunk */
        $d = '<?xml version="1.0" encoding="ISO-8859-1"?>' . "\n" . preg_replace('/^\<\?xml [^\>]+\?\>\s*/s', '', $d);
        $first = false;
      }
      if (!xml_parse($this->xml_parser, $d, false)) {
        $error_str = xml_error_string(xml_get_error_code($this->xml_parser));
        $line = xml_get_current_line_number($this->xml_parser);
        $this->tmp_error = 'XML error: "' . $error_str . '" at line ' . $line . ' (parsing as ' . $this->getEncoding() . ')';
        /* the failing chunk is appended raw and URL-encoded (kept from the original message format) */
        $this->tmp_error .= $d . urlencode($d);
        /*
         * Release the parser and the stream before bailing out. Leaving
         * $this->xml_parser set would make initXMLParser() reuse a parser
         * that is already in an error state on the next parse() call.
         * (The former automatic ISO-8859-1 retry was disabled with "if (0 && ...)"
         * and has been dropped as unreachable code.)
         */
        xml_parser_free($this->xml_parser);
        unset($this->xml_parser);
        $this->reader->closeStream();
        return $this->addError($this->tmp_error);
      }
    }
    $this->target_encoding = xml_parser_get_option($this->xml_parser, XML_OPTION_TARGET_ENCODING);
    xml_parser_free($this->xml_parser);
    /* unset as well, otherwise initXMLParser() would skip creating a fresh parser next time */
    unset($this->xml_parser);
    $this->reader->closeStream();
    unset($this->reader);
    return $this->done();
  }

  /* */

  /**
   * Creates the XML parser and registers this object's callbacks,
   * unless a parser is already set on the instance.
   */
  function initXMLParser() {
    if (!isset($this->xml_parser)) {
      /* only these three encodings are passed on; anything else falls back to UTF-8 */
      $enc = preg_match('/^(utf\-8|iso\-8859\-1|us\-ascii)$/i', $this->getEncoding(), $m) ? $m[1] : 'UTF-8';
      $parser = xml_parser_create($enc);
      xml_parser_set_option($parser, XML_OPTION_SKIP_WHITE, 0);
      xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, 0);
      xml_set_element_handler($parser, 'open', 'close');
      xml_set_character_data_handler($parser, 'cData');
      xml_set_start_namespace_decl_handler($parser, 'nsDecl');
      xml_set_object($parser, $this);
      $this->xml_parser = $parser;
    }
  }

  /* */

  /**
   * Returns the encoding for the requested source.
   *
   * @param string $src 'parser' for the target encoding recorded by the last
   *                    successful parse(), 'config' for the configured encoding
   * @return string for 'config' without a configured encoding (or any other
   *                value of $src) the reader's encoding, or 'UTF-8' when no
   *                reader is available (parse() unsets it after a successful run)
   */
  function getEncoding($src = 'config') {
    if ($src == 'parser') {
      return $this->target_encoding;
    }
    elseif (($src == 'config') && $this->encoding) {
      return $this->encoding;
    }
    if (isset($this->reader) && $this->reader) {
      return $this->reader->getEncoding();
    }
    return 'UTF-8';
  }

  /* */

  /**
   * @return array the collected triples (empty array when none are set)
   */
  function getTriples() {
    return $this->v('triples', array());
  }

  /**
   * @return int the value of the triple counter
   */
  function countTriples() {
    return $this->t_count;
  }

  /**
   * Appends one triple to the result list.
   *
   * Subject and predicate must be non-empty; the object must not be the empty
   * string, but a literal "0" is accepted (a plain truthiness test would drop it).
   * When skip_dupes is on, a triple whose serialized form was already added is ignored.
   *
   * @return int|null 0 when the triple is rejected as incomplete, nothing otherwise
   */
  function addT($s = '', $p = '', $o = '', $s_type = '', $o_type = '', $o_dt = '', $o_lang = '', $g = '') {
    if (!$s || !$p || ((string) $o === '')) return 0;
    $t = array('s' => $s, 'p' => $p, 'o' => $o, 's_type' => $s_type, 'o_type' => $o_type, 'o_datatype' => $o_dt, 'o_lang' => $o_lang, 'g' => $g);
    if ($this->skip_dupes) {
      $h = md5(serialize($t));
      if (isset($this->added_triples[$h])) {
        return;
      }
      $this->added_triples[$h] = true;
    }
    $this->triples[$this->t_count] = $t;
    $this->t_count++;
  }

  /* */

  /**
   * Start-element callback.
   *
   * @param mixed  $p the XML parser handle
   * @param string $t element name
   * @param array  $a element attributes
   */
  function open($p, $t, $a) {
    $this->state = $t;
    if ($t == 'result') {
      /* start a fresh row */
      $this->t = array();
    }
    elseif ($t == 'binding') {
      /* v() avoids an undefined-index notice when the name attribute is missing */
      $this->binding = $this->v('name', '', $a);
      $this->t[$this->binding] = '';
    }
    elseif ($t == 'literal') {
      $this->t[$this->binding . '_dt'] = $this->v('datatype', '', $a);
      $this->t[$this->binding . '_lang'] = $this->v('xml:lang', '', $a);
      $this->t[$this->binding . '_type'] = 'literal';
    }
    elseif ($t == 'uri') {
      $this->t[$this->binding . '_type'] = 'uri';
    }
    elseif ($t == 'bnode') {
      $this->t[$this->binding . '_type'] = 'bnode';
      /* the bnode label delivered via cData() is appended to this prefix */
      $this->t[$this->binding] = '_:';
    }
  }

  /**
   * End-element callback; a closing "result" element emits the collected row as a triple.
   *
   * @param mixed  $p the XML parser handle
   * @param string $t element name
   */
  function close($p, $t) {
    $this->prev_state = $this->state;
    $this->state = '';
    if ($t == 'result') {
      $this->addT(
        $this->v('s', '', $this->t),
        $this->v('p', '', $this->t),
        $this->v('o', '', $this->t),
        $this->v('s_type', '', $this->t),
        $this->v('o_type', '', $this->t),
        $this->v('o_dt', '', $this->t),
        $this->v('o_lang', '', $this->t),
        $this->v('g', '', $this->t)
      );
    }
  }

  /**
   * Character-data callback; text is appended to the current binding only
   * while inside a uri, bnode or literal element.
   *
   * @param mixed  $p the XML parser handle
   * @param string $d character data (may arrive in several pieces)
   */
  function cData($p, $d) {
    /* strict comparison: the integer start state 0 must not match a tag name */
    if (in_array($this->state, array('uri', 'bnode', 'literal'), true)) {
      $this->t[$this->binding] .= $d;
    }
  }

  /**
   * Namespace-declaration callback; the first prefix seen for a URI is kept.
   *
   * @param mixed  $p   the XML parser handle
   * @param string $prf declared prefix
   * @param string $uri namespace URI
   */
  function nsDecl($p, $prf, $uri) {
    $this->nsp[$uri] = isset($this->nsp[$uri]) ? $this->nsp[$uri] : $prf;
  }

  /* */

}