false,

        // Prevents the parser from automatically assigning the HTML5 namespace to the DOM document.
        'disable_html_ns' => false,
    );

    /**
     * Errors reported by the most recent parse() or parseFragment() call.
     *
     * @var array
     */
    protected $errors = array();

    /**
     * Create a new instance, optionally overriding the built-in default options.
     *
     * @param array $defaultOptions Option values merged on top of the built-in defaults.
     */
    public function __construct(array $defaultOptions = array())
    {
        $this->defaultOptions = array_merge($this->defaultOptions, $defaultOptions);
    }

    /**
     * Get the current default options.
     *
     * @return array
     */
    public function getOptions()
    {
        return $this->defaultOptions;
    }

    /**
     * Load and parse an HTML file.
     *
     * This will apply the HTML5 parser, which is tolerant of many
     * varieties of HTML, including XHTML 1, HTML 4, and well-formed HTML
     * 3. Note that in these cases, not all of the old data will be
     * preserved. For example, XHTML's XML declaration will be removed.
     *
     * The rules governing parsing are set out in the HTML 5 spec.
     *
     * @param string|resource $file    The path to the file to parse. If this is a resource, it is
     *                                 assumed to be an open stream whose pointer is set to the first
     *                                 byte of input.
     * @param array           $options Configuration options when parsing the HTML.
     *
     * @return \DOMDocument A DOM document. This object type is defined by the libxml
     *                      library, and should have been included with your version of PHP.
     */
    public function load($file, array $options = array())
    {
        // NOTE(review): stream_get_contents() and file_get_contents() return false on
        // failure; that value is forwarded to parse() unchecked.

        // Handle the case where file is a resource.
        if (is_resource($file)) {
            return $this->parse(stream_get_contents($file), $options);
        }

        return $this->parse(file_get_contents($file), $options);
    }

    /**
     * Parse a HTML Document from a string.
     *
     * Take a string of HTML 5 (or earlier) and parse it into a
     * DOMDocument.
     *
     * @param string $string  A html5 document as a string.
     * @param array  $options Configuration options when parsing the HTML.
     *
     * @return \DOMDocument A DOM document. DOM is part of libxml, which is included with
     *                      almost all distributions of PHP.
     */
    public function loadHTML($string, array $options = array())
    {
        return $this->parse($string, $options);
    }

    /**
     * Convenience function to load an HTML file.
     *
     * This is here to provide backwards compatibility with the
     * PHP DOM implementation. It simply calls load().
     *
     * @param string|resource $file    The path to the file to parse. If this is a resource, it is
     *                                 assumed to be an open stream whose pointer is set to the first
     *                                 byte of input.
     * @param array           $options Configuration options when parsing the HTML.
     *
     * @return \DOMDocument A DOM document. This object type is defined by the libxml
     *                      library, and should have been included with your version of PHP.
     */
    public function loadHTMLFile($file, array $options = array())
    {
        return $this->load($file, $options);
    }

    /**
     * Parse a HTML fragment from a string.
     *
     * @param string $string  The HTML5 fragment as a string.
     * @param array  $options Configuration options when parsing the HTML.
     *
     * @return \DOMDocumentFragment A DOM fragment. The DOM is part of libxml, which is included with
     *                              almost all distributions of PHP.
     */
    public function loadHTMLFragment($string, array $options = array())
    {
        return $this->parseFragment($string, $options);
    }

    /**
     * Return all errors encountered during the parsing phase.
     *
     * @return array
     */
    public function getErrors()
    {
        return $this->errors;
    }

    /**
     * Return true if some errors were encountered during the parsing phase.
     *
     * @return bool
     */
    public function hasErrors()
    {
        return count($this->errors) > 0;
    }

    /**
     * Parse an input string into a full document.
     *
     * The per-call options are merged on top of the instance defaults. The
     * error list is cleared before parsing and replaced with the errors
     * reported by the tree builder afterwards.
     *
     * @param string $input   The input data to parse in the form of a string.
     * @param array  $options An array of options.
     *
     * @return \DOMDocument
     */
    public function parse($input, array $options = array())
    {
        $this->errors = array();
        $options = array_merge($this->defaultOptions, $options);
        $events = new DOMTreeBuilder(false, $options);
        $scanner = new Scanner($input, !empty($options['encoding']) ? $options['encoding'] : 'UTF-8');
        $parser = new Tokenizer($scanner, $events, !empty($options['xmlNamespaces']) ? Tokenizer::CONFORMANT_XML : Tokenizer::CONFORMANT_HTML);

        $parser->parse();
        $this->errors = $events->getErrors();

        return $events->document();
    }

    /**
     * Parse an input string where the input is a fragment.
     *
     * Lower-level counterpart of loadHTMLFragment(). The per-call options are
     * merged on top of the instance defaults. The error list is cleared before
     * parsing and replaced with the errors reported by the tree builder
     * afterwards.
     *
     * @param string $input   The input data to parse in the form of a string.
     * @param array  $options An array of options.
     *
     * @return \DOMDocumentFragment
     */
    public function parseFragment($input, array $options = array())
    {
        // Reset up front, as parse() does, so errors from an earlier run never
        // linger if the tokenizer bails out part-way through.
        $this->errors = array();
        $options = array_merge($this->defaultOptions, $options);
        $events = new DOMTreeBuilder(true, $options);
        $scanner = new Scanner($input, !empty($options['encoding']) ? $options['encoding'] : 'UTF-8');
        $parser = new Tokenizer($scanner, $events, !empty($options['xmlNamespaces']) ? Tokenizer::CONFORMANT_XML : Tokenizer::CONFORMANT_HTML);

        $parser->parse();
        $this->errors = $events->getErrors();

        return $events->fragment();
    }

    /**
     * Save a DOM into a given file as HTML5.
     *
     * When $file is a resource it is written to and left open; otherwise it is
     * treated as a filename, opened for binary writing and closed afterwards.
     *
     * @param mixed           $dom     The DOM to be serialized.
     * @param string|resource $file    The filename to be written or resource to write to.
     * @param array           $options Configuration options when serializing the DOM. These include:
     *                                 - encode_entities: Text written to the output is escaped by default and not all
     *                                 entities are encoded. If this is set to true all entities will be encoded.
     *                                 Defaults to false.
     */
    public function save($dom, $file, $options = array())
    {
        $close = true;
        if (is_resource($file)) {
            // The caller owns this stream; do not close it.
            $stream = $file;
            $close = false;
        } else {
            // NOTE(review): fopen() returns false on failure; the result is used unchecked.
            $stream = fopen($file, 'wb');
        }
        $options = array_merge($this->defaultOptions, $options);
        $rules = new OutputRules($stream, $options);
        $trav = new Traverser($dom, $stream, $rules, $options);

        $trav->walk();

        if ($close) {
            fclose($stream);
        }
    }

    /**
     * Convert a DOM into an HTML5 string.
     *
     * @param mixed $dom     The DOM to be serialized.
     * @param array $options Configuration options when serializing the DOM. These include:
     *                       - encode_entities: Text written to the output is escaped by default and not all
     *                       entities are encoded. If this is set to true all entities will be encoded.
     *                       Defaults to false.
     *
     * @return string A HTML5 document generated from the DOM.
     */
    public function saveHTML($dom, $options = array())
    {
        $stream = fopen('php://temp', 'wb');
        $this->save($dom, $stream, array_merge($this->defaultOptions, $options));

        // Read the whole buffer back from offset 0, then release the temporary
        // stream. save() leaves caller-supplied resources open, so it must be
        // closed here.
        $html = stream_get_contents($stream, -1, 0);
        fclose($stream);

        return $html;
    }
}