Chris@0
|
1 <?php
|
Chris@0
|
2 namespace Masterminds;
|
Chris@0
|
3
|
Chris@0
|
4 use Masterminds\HTML5\Parser\FileInputStream;
|
Chris@0
|
5 use Masterminds\HTML5\Parser\StringInputStream;
|
Chris@0
|
6 use Masterminds\HTML5\Parser\DOMTreeBuilder;
|
Chris@0
|
7 use Masterminds\HTML5\Parser\Scanner;
|
Chris@0
|
8 use Masterminds\HTML5\Parser\Tokenizer;
|
Chris@0
|
9 use Masterminds\HTML5\Serializer\OutputRules;
|
Chris@0
|
10 use Masterminds\HTML5\Serializer\Traverser;
|
Chris@0
|
11
|
Chris@0
|
/**
 * This class offers convenience methods for parsing and serializing HTML5.
 * It is roughly designed to mirror the \DOMDocument class that is
 * provided with most versions of PHP.
 *
 * EXPERIMENTAL. This may change or be completely replaced.
 */
class HTML5
{

    /**
     * Global options for the parser and serializer.
     *
     * @var array
     */
    protected $options = array(
        // If the serializer should encode all entities.
        'encode_entities' => false
    );

    /**
     * Errors reported by the tree builder during the most recent parse.
     *
     * @var array
     */
    protected $errors = array();

    /**
     * @param array $options
     *   Options that override (or extend) the built-in defaults for every
     *   parse and serialize call made through this instance.
     */
    public function __construct(array $options = array())
    {
        $this->options = array_merge($this->options, $options);
    }

    /**
     * Get the options in effect for this instance.
     *
     * @return array The built-in defaults merged with the options that were
     *   passed to the constructor.
     */
    public function getOptions()
    {
        return $this->options;
    }

    /**
     * Load and parse an HTML file.
     *
     * This will apply the HTML5 parser, which is tolerant of many
     * varieties of HTML, including XHTML 1, HTML 4, and well-formed HTML
     * 3. Note that in these cases, not all of the old data will be
     * preserved. For example, XHTML's XML declaration will be removed.
     *
     * The rules governing parsing are set out in the HTML 5 spec.
     *
     * @param string|resource $file
     *   The path to the file to parse. If this is a resource, it is
     *   assumed to be an open stream whose pointer is set to the first
     *   byte of input.
     * @param array $options
     *   Configuration options when parsing the HTML.
     *
     * @return \DOMDocument A DOM document. This object type is defined by the libxml
     *   library, and should have been included with your version of PHP.
     */
    public function load($file, array $options = array())
    {
        // Handle the case where file is a resource.
        if (is_resource($file)) {
            // FIXME: We need a StreamInputStream class.
            return $this->loadHTML(stream_get_contents($file), $options);
        }

        $input = new FileInputStream($file);

        return $this->parse($input, $options);
    }

    /**
     * Parse a HTML Document from a string.
     *
     * Take a string of HTML 5 (or earlier) and parse it into a
     * DOMDocument.
     *
     * @param string $string
     *   A html5 document as a string.
     * @param array $options
     *   Configuration options when parsing the HTML.
     *
     * @return \DOMDocument A DOM document. DOM is part of libxml, which is included with
     *   almost all distributions of PHP.
     */
    public function loadHTML($string, array $options = array())
    {
        $input = new StringInputStream($string);

        return $this->parse($input, $options);
    }

    /**
     * Convenience function to load an HTML file.
     *
     * This is here to provide backwards compatibility with the
     * PHP DOM implementation. It simply calls load().
     *
     * @param string|resource $file
     *   The path to the file to parse. If this is a resource, it is
     *   assumed to be an open stream whose pointer is set to the first
     *   byte of input.
     * @param array $options
     *   Configuration options when parsing the HTML.
     *
     * @return \DOMDocument A DOM document. This object type is defined by the libxml
     *   library, and should have been included with your version of PHP.
     */
    public function loadHTMLFile($file, array $options = array())
    {
        return $this->load($file, $options);
    }

    /**
     * Parse a HTML fragment from a string.
     *
     * @param string $string
     *   The html5 fragment as a string.
     * @param array $options
     *   Configuration options when parsing the HTML.
     *
     * @return \DOMDocumentFragment A DOM fragment. The DOM is part of libxml, which is included with
     *   almost all distributions of PHP.
     */
    public function loadHTMLFragment($string, array $options = array())
    {
        $input = new StringInputStream($string);

        return $this->parseFragment($input, $options);
    }

    /**
     * Return all errors encountered during the most recent parsing phase.
     *
     * @return array
     */
    public function getErrors()
    {
        return $this->errors;
    }

    /**
     * Return true if some errors were encountered during the most recent
     * parsing phase.
     *
     * @return bool
     */
    public function hasErrors()
    {
        return count($this->errors) > 0;
    }

    /**
     * Parse an input stream.
     *
     * Lower-level loading function. This requires an input stream instead
     * of a string, file, or resource.
     *
     * @param \Masterminds\HTML5\Parser\InputStream $input
     *   The stream to read the document from.
     * @param array $options
     *   Per-call options; these take precedence over the instance options.
     *
     * @return mixed Whatever the tree builder's document() returns; the
     *   load*() wrappers document this as a \DOMDocument.
     */
    public function parse(\Masterminds\HTML5\Parser\InputStream $input, array $options = array())
    {
        return $this->runParser($input, false, $options)->document();
    }

    /**
     * Parse an input stream where the stream is a fragment.
     *
     * Lower-level loading function. This requires an input stream instead
     * of a string, file, or resource.
     *
     * @param \Masterminds\HTML5\Parser\InputStream $input
     *   The stream to read the fragment from.
     * @param array $options
     *   Per-call options; these take precedence over the instance options.
     *
     * @return mixed Whatever the tree builder's fragment() returns;
     *   loadHTMLFragment() documents this as a \DOMDocumentFragment.
     */
    public function parseFragment(\Masterminds\HTML5\Parser\InputStream $input, array $options = array())
    {
        return $this->runParser($input, true, $options)->fragment();
    }

    /**
     * Tokenize an input stream into a tree builder and record its errors.
     *
     * Shared by parse() and parseFragment() so both reset and collect the
     * error list in exactly the same way.
     *
     * @param \Masterminds\HTML5\Parser\InputStream $input
     *   The stream to read from.
     * @param bool $isFragment
     *   Passed as the first DOMTreeBuilder constructor argument; true when
     *   called from parseFragment(), false when called from parse().
     * @param array $options
     *   Per-call options; merged over the instance options.
     *
     * @return DOMTreeBuilder The tree builder after the tokenizer has run.
     */
    private function runParser(\Masterminds\HTML5\Parser\InputStream $input, $isFragment, array $options)
    {
        // Forget errors from any previous run, even if this one throws.
        $this->errors = array();

        $options = array_merge($this->getOptions(), $options);
        $events = new DOMTreeBuilder($isFragment, $options);
        $scanner = new Scanner($input);

        // A non-empty "xmlNamespaces" option switches the tokenizer mode.
        $mode = !empty($options['xmlNamespaces']) ? Tokenizer::CONFORMANT_XML : Tokenizer::CONFORMANT_HTML;
        $parser = new Tokenizer($scanner, $events, $mode);

        $parser->parse();
        $this->errors = $events->getErrors();

        return $events;
    }

    /**
     * Save a DOM into a given file as HTML5.
     *
     * @param mixed $dom
     *   The DOM to be serialized.
     * @param string|resource $file
     *   The filename to be written, or an already open writable stream.
     *   A stream passed in by the caller is written to but never closed.
     * @param array $options
     *   Configuration options when serializing the DOM. These include:
     *   - encode_entities: Text written to the output is escaped by default and not all
     *     entities are encoded. If this is set to true all entities will be encoded.
     *     Defaults to false.
     *
     * @throws \RuntimeException If $file is a path that cannot be opened for writing.
     */
    public function save($dom, $file, $options = array())
    {
        // Only close the stream when this method opened it.
        $close = !is_resource($file);
        if ($close) {
            $stream = fopen($file, 'w');
            if ($stream === false) {
                throw new \RuntimeException(sprintf('Unable to open "%s" for writing.', $file));
            }
        } else {
            $stream = $file;
        }

        $options = array_merge($this->getOptions(), $options);
        $rules = new OutputRules($stream, $options);
        $trav = new Traverser($dom, $stream, $rules, $options);

        try {
            $trav->walk();
        } catch (\Exception $e) {
            // Do not leak the file handle when serialization fails.
            if ($close) {
                fclose($stream);
            }
            throw $e;
        }

        if ($close) {
            fclose($stream);
        }
    }

    /**
     * Convert a DOM into an HTML5 string.
     *
     * @param mixed $dom
     *   The DOM to be serialized.
     * @param array $options
     *   Configuration options when serializing the DOM. These include:
     *   - encode_entities: Text written to the output is escaped by default and not all
     *     entities are encoded. If this is set to true all entities will be encoded.
     *     Defaults to false.
     *
     * @return string A HTML5 document generated from the DOM.
     */
    public function saveHTML($dom, $options = array())
    {
        $stream = fopen('php://temp', 'w');

        try {
            $this->save($dom, $stream, array_merge($this->getOptions(), $options));
        } catch (\Exception $e) {
            // save() never closes a stream it was handed, so do it here.
            fclose($stream);
            throw $e;
        }

        // Read everything back starting from offset 0.
        $html = stream_get_contents($stream, - 1, 0);
        fclose($stream);

        return $html;
    }
}
|