annotate vendor/masterminds/html5/src/HTML5.php @ 19:fa3358dc1485 tip

Add ndrum files
author Chris Cannam
date Wed, 28 Aug 2019 13:14:47 +0100
parents af1871eacc83
children
rev   line source
Chris@0 1 <?php
Chris@17 2
Chris@0 3 namespace Masterminds;
Chris@0 4
Chris@0 5 use Masterminds\HTML5\Parser\DOMTreeBuilder;
Chris@0 6 use Masterminds\HTML5\Parser\Scanner;
Chris@0 7 use Masterminds\HTML5\Parser\Tokenizer;
Chris@0 8 use Masterminds\HTML5\Serializer\OutputRules;
Chris@0 9 use Masterminds\HTML5\Serializer\Traverser;
Chris@0 10
/**
 * This class offers convenience methods for parsing and serializing HTML5.
 * It is roughly designed to mirror the \DOMDocument native class.
 */
class HTML5
{
    /**
     * Global options for the parser and serializer.
     *
     * Per-call options passed to the load/parse/save methods are merged on
     * top of these defaults.
     *
     * @var array
     */
    private $defaultOptions = array(
        // Whether the serializer should aggressively encode all characters as entities.
        'encode_entities' => false,

        // Prevents the parser from automatically assigning the HTML5 namespace to the DOM document.
        'disable_html_ns' => false,
    );

    /**
     * Errors reported by the tree builder during the most recent parse.
     *
     * @var array
     */
    protected $errors = array();

    /**
     * @param array $defaultOptions Options that override the built-in defaults for
     *                              every subsequent call made through this instance.
     */
    public function __construct(array $defaultOptions = array())
    {
        $this->defaultOptions = array_merge($this->defaultOptions, $defaultOptions);
    }

    /**
     * Get the current default options.
     *
     * @return array
     */
    public function getOptions()
    {
        return $this->defaultOptions;
    }

    /**
     * Load and parse an HTML file.
     *
     * This will apply the HTML5 parser, which is tolerant of many
     * varieties of HTML, including XHTML 1, HTML 4, and well-formed HTML
     * 3. Note that in these cases, not all of the old data will be
     * preserved. For example, XHTML's XML declaration will be removed.
     *
     * The rules governing parsing are set out in the HTML 5 spec.
     *
     * @param string|resource $file    The path to the file to parse. If this is a resource, it is
     *                                 assumed to be an open stream whose pointer is set to the first
     *                                 byte of input.
     * @param array           $options Configuration options when parsing the HTML.
     *
     * @return \DOMDocument A DOM document. This object type is defined by the libxml
     *                      library, and should have been included with your version of PHP.
     */
    public function load($file, array $options = array())
    {
        // Handle the case where file is a resource: read from the current
        // stream position to the end. The stream is left open for the caller.
        if (is_resource($file)) {
            return $this->parse(stream_get_contents($file), $options);
        }

        // NOTE(review): file_get_contents() returns false when the file cannot
        // be read; that value is passed straight to parse(). Confirm whether
        // callers expect an exception here instead.
        return $this->parse(file_get_contents($file), $options);
    }

    /**
     * Parse an HTML document from a string.
     *
     * Take a string of HTML 5 (or earlier) and parse it into a
     * DOMDocument.
     *
     * @param string $string  An HTML5 document as a string.
     * @param array  $options Configuration options when parsing the HTML.
     *
     * @return \DOMDocument A DOM document. DOM is part of libxml, which is included with
     *                      almost all distributions of PHP.
     */
    public function loadHTML($string, array $options = array())
    {
        return $this->parse($string, $options);
    }

    /**
     * Convenience function to load an HTML file.
     *
     * This is here to provide backwards compatibility with the
     * PHP DOM implementation. It simply calls load().
     *
     * @param string|resource $file    The path to the file to parse. If this is a resource, it is
     *                                 assumed to be an open stream whose pointer is set to the first
     *                                 byte of input.
     * @param array           $options Configuration options when parsing the HTML.
     *
     * @return \DOMDocument A DOM document. This object type is defined by the libxml
     *                      library, and should have been included with your version of PHP.
     */
    public function loadHTMLFile($file, array $options = array())
    {
        return $this->load($file, $options);
    }

    /**
     * Parse an HTML fragment from a string.
     *
     * @param string $string  The HTML5 fragment as a string.
     * @param array  $options Configuration options when parsing the HTML.
     *
     * @return \DOMDocumentFragment A DOM fragment. The DOM is part of libxml, which is included with
     *                              almost all distributions of PHP.
     */
    public function loadHTMLFragment($string, array $options = array())
    {
        return $this->parseFragment($string, $options);
    }

    /**
     * Return all errors encountered during the most recent parsing phase.
     *
     * @return array
     */
    public function getErrors()
    {
        return $this->errors;
    }

    /**
     * Return true if any errors were encountered during the most recent parsing phase.
     *
     * @return bool
     */
    public function hasErrors()
    {
        return count($this->errors) > 0;
    }

    /**
     * Parse an input string into a full document.
     *
     * @param string $input   The HTML to parse.
     * @param array  $options Per-call options, merged over the instance defaults. The
     *                        'encoding' option selects the input encoding (UTF-8 when
     *                        empty); a non-empty 'xmlNamespaces' option switches the
     *                        tokenizer to Tokenizer::CONFORMANT_XML.
     *
     * @return \DOMDocument
     */
    public function parse($input, array $options = array())
    {
        return $this->buildTree($input, $options, false)->document();
    }

    /**
     * Parse an input string where the input is a fragment.
     *
     * @param string $input   The input data to parse in the form of a string.
     * @param array  $options Per-call options, merged over the instance defaults. The
     *                        'encoding' option selects the input encoding (UTF-8 when
     *                        empty); a non-empty 'xmlNamespaces' option switches the
     *                        tokenizer to Tokenizer::CONFORMANT_XML.
     *
     * @return \DOMDocumentFragment
     */
    public function parseFragment($input, array $options = array())
    {
        return $this->buildTree($input, $options, true)->fragment();
    }

    /**
     * Save a DOM into a given file as HTML5.
     *
     * @param mixed           $dom     The DOM to be serialized.
     * @param string|resource $file    The filename to be written or resource to write to.
     * @param array           $options Configuration options when serializing the DOM. These include:
     *                                 - encode_entities: Text written to the output is escaped by default and not all
     *                                 entities are encoded. If this is set to true all entities will be encoded.
     *                                 Defaults to false.
     */
    public function save($dom, $file, $options = array())
    {
        // Only close streams this method opened itself; a caller-supplied
        // resource stays open and remains the caller's responsibility.
        $close = true;
        if (is_resource($file)) {
            $stream = $file;
            $close = false;
        } else {
            // NOTE(review): fopen() returns false on failure and that value is
            // handed to the serializer unchanged; confirm the desired handling.
            $stream = fopen($file, 'wb');
        }
        $options = array_merge($this->defaultOptions, $options);
        $rules = new OutputRules($stream, $options);
        $trav = new Traverser($dom, $stream, $rules, $options);

        $trav->walk();

        if ($close) {
            fclose($stream);
        }
    }

    /**
     * Convert a DOM into an HTML5 string.
     *
     * @param mixed $dom     The DOM to be serialized.
     * @param array $options Configuration options when serializing the DOM. These include:
     *                       - encode_entities: Text written to the output is escaped by default and not all
     *                       entities are encoded. If this is set to true all entities will be encoded.
     *                       Defaults to false.
     *
     * @return string An HTML5 document generated from the DOM.
     */
    public function saveHTML($dom, $options = array())
    {
        $stream = fopen('php://temp', 'wb');
        $this->save($dom, $stream, array_merge($this->defaultOptions, $options));

        // Read everything back from offset 0, then release the temporary
        // stream: save() never closes a resource it was handed, so leaving
        // it open here would leak one stream per call.
        $html = stream_get_contents($stream, -1, 0);
        fclose($stream);

        return $html;
    }

    /**
     * Run the scanner/tokenizer/tree-builder pipeline shared by parse() and parseFragment().
     *
     * Clears the stored errors before parsing and replaces them with the
     * tree builder's errors afterwards.
     *
     * @param string $input      The markup to parse.
     * @param array  $options    Per-call options; merged over the instance defaults.
     * @param bool   $isFragment Passed to the tree builder: true to build a fragment,
     *                           false to build a full document.
     *
     * @return DOMTreeBuilder The tree builder after parsing has completed.
     */
    private function buildTree($input, array $options, $isFragment)
    {
        // Reset first so that a parse which aborts part-way never leaves the
        // previous call's errors visible through getErrors()/hasErrors().
        $this->errors = array();

        $options = array_merge($this->defaultOptions, $options);
        $encoding = !empty($options['encoding']) ? $options['encoding'] : 'UTF-8';
        $mode = !empty($options['xmlNamespaces']) ? Tokenizer::CONFORMANT_XML : Tokenizer::CONFORMANT_HTML;

        $events = new DOMTreeBuilder($isFragment, $options);
        $scanner = new Scanner($input, $encoding);
        $parser = new Tokenizer($scanner, $events, $mode);

        $parser->parse();
        $this->errors = $events->getErrors();

        return $events;
    }
}