annotate vendor/masterminds/html5/src/HTML5.php @ 0:c75dbcec494b

Initial commit from drush-created site
author Chris Cannam
date Thu, 05 Jul 2018 14:24:15 +0000
parents
children a9cd425dd02b
rev   line source
Chris@0 1 <?php
Chris@0 2 namespace Masterminds;
Chris@0 3
Chris@0 4 use Masterminds\HTML5\Parser\FileInputStream;
Chris@0 5 use Masterminds\HTML5\Parser\StringInputStream;
Chris@0 6 use Masterminds\HTML5\Parser\DOMTreeBuilder;
Chris@0 7 use Masterminds\HTML5\Parser\Scanner;
Chris@0 8 use Masterminds\HTML5\Parser\Tokenizer;
Chris@0 9 use Masterminds\HTML5\Serializer\OutputRules;
Chris@0 10 use Masterminds\HTML5\Serializer\Traverser;
Chris@0 11
/**
 * Convenience facade for parsing and serializing HTML5.
 *
 * The API is roughly designed to mirror the \DOMDocument class that is
 * provided with most versions of PHP.
 *
 * EXPERIMENTAL. This may change or be completely replaced.
 */
class HTML5
{

    /**
     * Global options for the parser and serializer.
     *
     * @var array
     */
    protected $options = array(
        // If the serializer should encode all entities.
        'encode_entities' => false
    );

    /**
     * Errors reported by the tree builder during the most recent parse.
     *
     * @var array
     */
    protected $errors = array();

    /**
     * @param array $options
     *   Options merged over the built-in defaults; they apply to every
     *   subsequent parse/serialize call made through this instance.
     */
    public function __construct(array $options = array())
    {
        $this->options = array_merge($this->options, $options);
    }

    /**
     * Get the options in effect for this instance.
     *
     * @return array The built-in defaults merged with the constructor options.
     */
    public function getOptions()
    {
        return $this->options;
    }

    /**
     * Load and parse an HTML file.
     *
     * This will apply the HTML5 parser, which is tolerant of many
     * varieties of HTML, including XHTML 1, HTML 4, and well-formed HTML
     * 3. Note that in these cases, not all of the old data will be
     * preserved. For example, XHTML's XML declaration will be removed.
     *
     * The rules governing parsing are set out in the HTML 5 spec.
     *
     * @param string|resource $file
     *   The path to the file to parse. If this is a resource, it is
     *   assumed to be an open stream whose pointer is set to the first
     *   byte of input.
     * @param array $options
     *   Configuration options when parsing the HTML.
     *
     * @return \DOMDocument A DOM document. This object type is defined by the libxml
     *   library, and should have been included with your version of PHP.
     */
    public function load($file, array $options = array())
    {
        // Handle the case where file is a resource.
        if (is_resource($file)) {
            // FIXME: We need a StreamInputStream class.
            return $this->loadHTML(stream_get_contents($file), $options);
        }

        $input = new FileInputStream($file);

        return $this->parse($input, $options);
    }

    /**
     * Parse an HTML document from a string.
     *
     * Take a string of HTML 5 (or earlier) and parse it into a
     * DOMDocument.
     *
     * @param string $string
     *   An HTML5 document as a string.
     * @param array $options
     *   Configuration options when parsing the HTML.
     *
     * @return \DOMDocument A DOM document. DOM is part of libxml, which is included with
     *   almost all distributions of PHP.
     */
    public function loadHTML($string, array $options = array())
    {
        $input = new StringInputStream($string);

        return $this->parse($input, $options);
    }

    /**
     * Convenience function to load an HTML file.
     *
     * This is here to provide backwards compatibility with the
     * PHP DOM implementation. It simply calls load().
     *
     * @param string|resource $file
     *   The path to the file to parse. If this is a resource, it is
     *   assumed to be an open stream whose pointer is set to the first
     *   byte of input.
     * @param array $options
     *   Configuration options when parsing the HTML.
     *
     * @return \DOMDocument A DOM document. This object type is defined by the libxml
     *   library, and should have been included with your version of PHP.
     */
    public function loadHTMLFile($file, array $options = array())
    {
        return $this->load($file, $options);
    }

    /**
     * Parse an HTML fragment from a string.
     *
     * @param string $string
     *   The HTML5 fragment as a string.
     * @param array $options
     *   Configuration options when parsing the HTML.
     *
     * @return \DOMDocumentFragment A DOM fragment. The DOM is part of libxml, which is included with
     *   almost all distributions of PHP.
     */
    public function loadHTMLFragment($string, array $options = array())
    {
        $input = new StringInputStream($string);

        return $this->parseFragment($input, $options);
    }

    /**
     * Return all errors encountered during the most recent parsing phase.
     *
     * @return array
     */
    public function getErrors()
    {
        return $this->errors;
    }

    /**
     * Return true if any errors were encountered during the most recent
     * parsing phase.
     *
     * @return bool
     */
    public function hasErrors()
    {
        return count($this->errors) > 0;
    }

    /**
     * Parse an input stream.
     *
     * Lower-level loading function. This requires an input stream instead
     * of a string, file, or resource.
     *
     * @param \Masterminds\HTML5\Parser\InputStream $input
     *   The stream to read the document from.
     * @param array $options
     *   Per-call options, merged over the instance options.
     *
     * @return \DOMDocument The parsed document.
     */
    public function parse(\Masterminds\HTML5\Parser\InputStream $input, array $options = array())
    {
        return $this->runParser($input, false, $options)->document();
    }

    /**
     * Parse an input stream where the stream is a fragment.
     *
     * Lower-level loading function. This requires an input stream instead
     * of a string, file, or resource.
     *
     * @param \Masterminds\HTML5\Parser\InputStream $input
     *   The stream to read the fragment from.
     * @param array $options
     *   Per-call options, merged over the instance options.
     *
     * @return \DOMDocumentFragment The parsed fragment.
     */
    public function parseFragment(\Masterminds\HTML5\Parser\InputStream $input, array $options = array())
    {
        return $this->runParser($input, true, $options)->fragment();
    }

    /**
     * Run the scanner/tokenizer/tree-builder pipeline shared by parse() and
     * parseFragment(), recording the tree builder's errors on this instance.
     *
     * @param \Masterminds\HTML5\Parser\InputStream $input
     *   The stream to parse.
     * @param bool $isFragment
     *   Passed through as the first DOMTreeBuilder constructor argument:
     *   false for a whole document, true for a fragment.
     * @param array $options
     *   Per-call options, merged over the instance options.
     *
     * @return DOMTreeBuilder The tree builder holding the parse result.
     */
    private function runParser(\Masterminds\HTML5\Parser\InputStream $input, $isFragment, array $options)
    {
        // Reset up front so that a parse that throws part-way through cannot
        // leave the previous run's errors visible through getErrors().
        $this->errors = array();

        $options = array_merge($this->getOptions(), $options);
        $events = new DOMTreeBuilder($isFragment, $options);
        $scanner = new Scanner($input);

        // A non-empty 'xmlNamespaces' option switches the tokenizer mode.
        $mode = !empty($options['xmlNamespaces']) ? Tokenizer::CONFORMANT_XML : Tokenizer::CONFORMANT_HTML;
        $parser = new Tokenizer($scanner, $events, $mode);

        $parser->parse();
        $this->errors = $events->getErrors();

        return $events;
    }

    /**
     * Save a DOM into a given file as HTML5.
     *
     * @param mixed $dom
     *   The DOM to be serialized.
     * @param string|resource $file
     *   The filename to be written. If this is a resource, it is written to
     *   directly and left open for the caller to close.
     * @param array $options
     *   Configuration options when serializing the DOM. These include:
     *   - encode_entities: Text written to the output is escaped by default and not all
     *     entities are encoded. If this is set to true all entities will be encoded.
     *     Defaults to false.
     *
     * @throws \RuntimeException If $file is a path that cannot be opened for writing.
     */
    public function save($dom, $file, $options = array())
    {
        // Merge before touching the filesystem so that an invalid $options
        // value fails before the target file is created or truncated.
        $options = array_merge($this->getOptions(), $options);

        // Only streams opened here are closed here; a caller-supplied
        // resource stays open.
        $close = !is_resource($file);
        $stream = $close ? fopen($file, 'w') : $file;
        if ($stream === false) {
            throw new \RuntimeException(sprintf('Unable to open "%s" for writing.', $file));
        }

        try {
            $rules = new OutputRules($stream, $options);
            $trav = new Traverser($dom, $stream, $rules, $options);

            $trav->walk();
        } catch (\Exception $e) {
            // catch-and-rethrow rather than finally, to stay compatible with
            // PHP versions that predate the finally keyword.
            if ($close) {
                fclose($stream);
            }
            throw $e;
        }

        if ($close) {
            fclose($stream);
        }
    }

    /**
     * Convert a DOM into an HTML5 string.
     *
     * @param mixed $dom
     *   The DOM to be serialized.
     * @param array $options
     *   Configuration options when serializing the DOM. These include:
     *   - encode_entities: Text written to the output is escaped by default and not all
     *     entities are encoded. If this is set to true all entities will be encoded.
     *     Defaults to false.
     *
     * @return string An HTML5 document generated from the DOM.
     */
    public function saveHTML($dom, $options = array())
    {
        $stream = fopen('php://temp', 'w');

        try {
            $this->save($dom, $stream, array_merge($this->getOptions(), $options));
        } catch (\Exception $e) {
            // save() does not close a caller-supplied resource, so release
            // the temporary stream here before propagating the failure.
            fclose($stream);
            throw $e;
        }

        // Read everything back from offset 0, then release the temp stream.
        $html = stream_get_contents($stream, -1, 0);
        fclose($stream);

        return $html;
    }
}