Chris@0
|
1 <?php
|
Chris@17
|
2
|
Chris@0
|
3 namespace Masterminds;
|
Chris@0
|
4
|
Chris@0
|
5 use Masterminds\HTML5\Parser\DOMTreeBuilder;
|
Chris@0
|
6 use Masterminds\HTML5\Parser\Scanner;
|
Chris@0
|
7 use Masterminds\HTML5\Parser\Tokenizer;
|
Chris@0
|
8 use Masterminds\HTML5\Serializer\OutputRules;
|
Chris@0
|
9 use Masterminds\HTML5\Serializer\Traverser;
|
Chris@0
|
10
|
Chris@0
|
/**
 * This class offers convenience methods for parsing and serializing HTML5.
 * It is roughly designed to mirror the \DOMDocument native class.
 */
class HTML5
{
    /**
     * Global options for the parser and serializer.
     *
     * Per-call options passed to the load/parse/save methods are merged on top
     * of these defaults.
     *
     * @var array
     */
    private $defaultOptions = array(
        // Whether the serializer should aggressively encode all characters as entities.
        'encode_entities' => false,

        // Prevents the parser from automatically assigning the HTML5 namespace to the DOM document.
        'disable_html_ns' => false,
    );

    /**
     * Errors reported by the tree builder during the most recent parse.
     *
     * @var array
     */
    protected $errors = array();

    /**
     * @param array $defaultOptions Options that override the built-in defaults for every
     *                              subsequent parse/serialize call on this instance.
     */
    public function __construct(array $defaultOptions = array())
    {
        $this->defaultOptions = array_merge($this->defaultOptions, $defaultOptions);
    }

    /**
     * Get the current default options.
     *
     * @return array
     */
    public function getOptions()
    {
        return $this->defaultOptions;
    }

    /**
     * Load and parse an HTML file.
     *
     * This will apply the HTML5 parser, which is tolerant of many
     * varieties of HTML, including XHTML 1, HTML 4, and well-formed HTML
     * 3. Note that in these cases, not all of the old data will be
     * preserved. For example, XHTML's XML declaration will be removed.
     *
     * The rules governing parsing are set out in the HTML 5 spec.
     *
     * @param string|resource $file    The path to the file to parse. If this is a resource, it is
     *                                 assumed to be an open stream whose pointer is set to the first
     *                                 byte of input.
     * @param array           $options Configuration options when parsing the HTML.
     *
     * @return \DOMDocument A DOM document. This object type is defined by the libxml
     *                      library, and should have been included with your version of PHP.
     */
    public function load($file, array $options = array())
    {
        // A resource is read from its current position; anything else is treated as a path.
        // The read result is handed to parse() as-is (no failure check is performed here).
        $contents = is_resource($file) ? stream_get_contents($file) : file_get_contents($file);

        return $this->parse($contents, $options);
    }

    /**
     * Parse a HTML Document from a string.
     *
     * Take a string of HTML 5 (or earlier) and parse it into a
     * DOMDocument.
     *
     * @param string $string  A html5 document as a string.
     * @param array  $options Configuration options when parsing the HTML.
     *
     * @return \DOMDocument A DOM document. DOM is part of libxml, which is included with
     *                      almost all distributions of PHP.
     */
    public function loadHTML($string, array $options = array())
    {
        return $this->parse($string, $options);
    }

    /**
     * Convenience function to load an HTML file.
     *
     * This is here to provide backwards compatibility with the
     * PHP DOM implementation. It simply calls load().
     *
     * @param string|resource $file    The path to the file to parse. If this is a resource, it is
     *                                 assumed to be an open stream whose pointer is set to the first
     *                                 byte of input.
     * @param array           $options Configuration options when parsing the HTML.
     *
     * @return \DOMDocument A DOM document. This object type is defined by the libxml
     *                      library, and should have been included with your version of PHP.
     */
    public function loadHTMLFile($file, array $options = array())
    {
        return $this->load($file, $options);
    }

    /**
     * Parse a HTML fragment from a string.
     *
     * @param string $string  The HTML5 fragment as a string.
     * @param array  $options Configuration options when parsing the HTML.
     *
     * @return \DOMDocumentFragment A DOM fragment. The DOM is part of libxml, which is included with
     *                              almost all distributions of PHP.
     */
    public function loadHTMLFragment($string, array $options = array())
    {
        return $this->parseFragment($string, $options);
    }

    /**
     * Return all errors encountered during the parsing phase.
     *
     * @return array
     */
    public function getErrors()
    {
        return $this->errors;
    }

    /**
     * Return true if some errors were encountered during the parsing phase.
     *
     * @return bool
     */
    public function hasErrors()
    {
        return count($this->errors) > 0;
    }

    /**
     * Parse an input string into a full document.
     *
     * Recognized per-call options read here:
     *  - encoding: input encoding handed to the scanner; 'UTF-8' when empty or absent.
     *  - xmlNamespaces: when non-empty, the tokenizer runs in Tokenizer::CONFORMANT_XML
     *    mode instead of Tokenizer::CONFORMANT_HTML.
     * The merged option set is also passed to the tree builder.
     *
     * @param string $input
     * @param array  $options
     *
     * @return \DOMDocument
     */
    public function parse($input, array $options = array())
    {
        return $this->runParser($input, $options, false)->document();
    }

    /**
     * Parse an input string where the input is a fragment.
     *
     * Lower-level loading function. It accepts the same options as parse().
     *
     * @param string $input   The input data to parse in the form of a string.
     * @param array  $options An array of options.
     *
     * @return \DOMDocumentFragment
     */
    public function parseFragment($input, array $options = array())
    {
        return $this->runParser($input, $options, true)->fragment();
    }

    /**
     * Save a DOM into a given file as HTML5.
     *
     * @param mixed           $dom     The DOM to be serialized.
     * @param string|resource $file    The filename to be written or resource to write to.
     *                                 A resource passed in is left open; a file opened here
     *                                 is closed once serialization completes.
     * @param array           $options Configuration options when serializing the DOM. These include:
     *                                 - encode_entities: Text written to the output is escaped by default and not all
     *                                 entities are encoded. If this is set to true all entities will be encoded.
     *                                 Defaults to false.
     *
     * @throws \RuntimeException If $file is a path that cannot be opened for writing.
     */
    public function save($dom, $file, $options = array())
    {
        // Only close streams this method opened itself; caller-owned resources stay open.
        $close = !is_resource($file);
        $stream = $close ? fopen($file, 'wb') : $file;

        if (false === $stream) {
            // Fail before serializing instead of handing `false` to the serializer and fclose().
            throw new \RuntimeException(sprintf('Unable to open "%s" for writing.', $file));
        }

        $options = array_merge($this->defaultOptions, $options);
        $rules = new OutputRules($stream, $options);
        $trav = new Traverser($dom, $stream, $rules, $options);

        $trav->walk();

        if ($close) {
            fclose($stream);
        }
    }

    /**
     * Convert a DOM into an HTML5 string.
     *
     * @param mixed $dom     The DOM to be serialized.
     * @param array $options Configuration options when serializing the DOM. These include:
     *                       - encode_entities: Text written to the output is escaped by default and not all
     *                       entities are encoded. If this is set to true all entities will be encoded.
     *                       Defaults to false.
     *
     * @return string A HTML5 document generated from the DOM.
     */
    public function saveHTML($dom, $options = array())
    {
        $stream = fopen('php://temp', 'wb');
        $this->save($dom, $stream, array_merge($this->defaultOptions, $options));

        // Read everything back from offset 0, then release the temporary stream:
        // save() never closes a resource it was handed, so it must be closed here.
        $html = stream_get_contents($stream, -1, 0);
        fclose($stream);

        return $html;
    }

    /**
     * Run the scanner/tokenizer pipeline over the input and record any parse errors.
     *
     * Shared implementation of parse() and parseFragment(). The error list is
     * cleared up front so a parse that throws never leaves errors from an
     * earlier run behind.
     *
     * @param string $input      The markup to parse.
     * @param array  $options    Per-call options; merged over the instance defaults.
     * @param bool   $isFragment Passed as the first argument of DOMTreeBuilder
     *                           (true from parseFragment(), false from parse()).
     *
     * @return DOMTreeBuilder The tree builder holding the parsed result.
     */
    private function runParser($input, array $options, $isFragment)
    {
        $this->errors = array();
        $options = array_merge($this->defaultOptions, $options);

        $events = new DOMTreeBuilder($isFragment, $options);
        $scanner = new Scanner($input, !empty($options['encoding']) ? $options['encoding'] : 'UTF-8');
        $mode = !empty($options['xmlNamespaces']) ? Tokenizer::CONFORMANT_XML : Tokenizer::CONFORMANT_HTML;
        $parser = new Tokenizer($scanner, $events, $mode);

        $parser->parse();
        $this->errors = $events->getErrors();

        return $events;
    }
}
|