Mercurial > hg > isophonics-drupal-site
diff core/lib/Drupal/Component/Gettext/PoStreamReader.php @ 0:4c8ae668cc8c
Initial import (non-working)
author | Chris Cannam |
---|---|
date | Wed, 29 Nov 2017 16:09:58 +0000 |
parents | |
children | 1fec387a4317 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/core/lib/Drupal/Component/Gettext/PoStreamReader.php Wed Nov 29 16:09:58 2017 +0000 @@ -0,0 +1,594 @@ +<?php + +namespace Drupal\Component\Gettext; + +use Drupal\Component\Utility\SafeMarkup; + +/** + * Implements Gettext PO stream reader. + * + * The PO file format parsing is implemented according to the documentation at + * http://www.gnu.org/software/gettext/manual/gettext.html#PO-Files + */ +class PoStreamReader implements PoStreamInterface, PoReaderInterface { + + /** + * Source line number of the stream being parsed. + * + * @var int + */ + private $_line_number = 0; + + /** + * Parser context for the stream reader state machine. + * + * Possible contexts are: + * - 'COMMENT' (#) + * - 'MSGID' (msgid) + * - 'MSGID_PLURAL' (msgid_plural) + * - 'MSGCTXT' (msgctxt) + * - 'MSGSTR' (msgstr or msgstr[]) + * - 'MSGSTR_ARR' (msgstr_arg) + * + * @var string + */ + private $_context = 'COMMENT'; + + /** + * Current entry being read. Incomplete. + * + * @var array + */ + private $_current_item = []; + + /** + * Current plural index for plural translations. + * + * @var int + */ + private $_current_plural_index = 0; + + /** + * URI of the PO stream that is being read. + * + * @var string + */ + private $_uri = ''; + + /** + * Language code for the PO stream being read. + * + * @var string + */ + private $_langcode = NULL; + + /** + * File handle of the current PO stream. + * + * @var resource + */ + private $_fd; + + /** + * The PO stream header. + * + * @var \Drupal\Component\Gettext\PoHeader + */ + private $_header; + + /** + * Object wrapper for the last read source/translation pair. + * + * @var \Drupal\Component\Gettext\PoItem + */ + private $_last_item; + + /** + * Indicator of whether the stream reading is finished. + * + * @var bool + */ + private $_finished; + + /** + * Array of translated error strings recorded on reading this stream so far. + * + * @var array + */ + private $_errors; + + /** + * {@inheritdoc} + */ + public function getLangcode() { + return $this->_langcode; + } + + /** + * {@inheritdoc} + */ + public function setLangcode($langcode) { + $this->_langcode = $langcode; + } + + /** + * {@inheritdoc} + */ + public function getHeader() { + return $this->_header; + } + + /** + * Implements Drupal\Component\Gettext\PoMetadataInterface::setHeader(). + * + * Not applicable to stream reading and therefore not implemented. + */ + public function setHeader(PoHeader $header) { + } + + /** + * {@inheritdoc} + */ + public function getURI() { + return $this->_uri; + } + + /** + * {@inheritdoc} + */ + public function setURI($uri) { + $this->_uri = $uri; + } + + /** + * Implements Drupal\Component\Gettext\PoStreamInterface::open(). + * + * Opens the stream and reads the header. The stream is ready for reading + * items after. + * + * @throws Exception + * If the URI is not yet set. + */ + public function open() { + if (!empty($this->_uri)) { + $this->_fd = fopen($this->_uri, 'rb'); + $this->readHeader(); + } + else { + throw new \Exception('Cannot open stream without URI set.'); + } + } + + /** + * Implements Drupal\Component\Gettext\PoStreamInterface::close(). + * + * @throws Exception + * If the stream is not open. + */ + public function close() { + if ($this->_fd) { + fclose($this->_fd); + } + else { + throw new \Exception('Cannot close stream that is not open.'); + } + } + + /** + * {@inheritdoc} + */ + public function readItem() { + // Clear out the last item. + $this->_last_item = NULL; + + // Read until finished with the stream or a complete item was identified. + while (!$this->_finished && is_null($this->_last_item)) { + $this->readLine(); + } + + return $this->_last_item; + } + + /** + * Sets the seek position for the current PO stream. + * + * @param int $seek + * The new seek position to set. + */ + public function setSeek($seek) { + fseek($this->_fd, $seek); + } + + /** + * Gets the pointer position of the current PO stream. + */ + public function getSeek() { + return ftell($this->_fd); + } + + /** + * Read the header from the PO stream. + * + * The header is a special case PoItem, using the empty string as source and + * key-value pairs as translation. We just reuse the item reader logic to + * read the header. + */ + private function readHeader() { + $item = $this->readItem(); + // Handle the case properly when the .po file is empty (0 bytes). + if (!$item) { + return; + } + $header = new PoHeader(); + $header->setFromString(trim($item->getTranslation())); + $this->_header = $header; + } + + /** + * Reads a line from the PO stream and stores data internally. + * + * Expands $this->_current_item based on new data for the current item. If + * this line ends the current item, it is saved with setItemFromArray() with + * data from $this->_current_item. + * + * An internal state machine is maintained in this reader using + * $this->_context as the reading state. PO items are in between COMMENT + * states (when items have at least one line or comment in between them) or + * indicated by MSGSTR or MSGSTR_ARR followed immediately by an MSGID or + * MSGCTXT (when items closely follow each other). + * + * @return + * FALSE if an error was logged, NULL otherwise. The errors are considered + * non-blocking, so reading can continue, while the errors are collected + * for later presentation. + */ + private function readLine() { + // Read a line and set the stream finished indicator if it was not + // possible anymore. + $line = fgets($this->_fd); + $this->_finished = ($line === FALSE); + + if (!$this->_finished) { + + if ($this->_line_number == 0) { + // The first line might come with a UTF-8 BOM, which should be removed. + $line = str_replace("\xEF\xBB\xBF", '', $line); + // Current plurality for 'msgstr[]'. + $this->_current_plural_index = 0; + } + + // Track the line number for error reporting. + $this->_line_number++; + + // Initialize common values for error logging. + $log_vars = [ + '%uri' => $this->getURI(), + '%line' => $this->_line_number, + ]; + + // Trim away the linefeed. \\n might appear at the end of the string if + // another line continuing the same string follows. We can remove that. + $line = trim(strtr($line, ["\\\n" => ""])); + + if (!strncmp('#', $line, 1)) { + // Lines starting with '#' are comments. + + if ($this->_context == 'COMMENT') { + // Already in comment context, add to current comment. + $this->_current_item['#'][] = substr($line, 1); + } + elseif (($this->_context == 'MSGSTR') || ($this->_context == 'MSGSTR_ARR')) { + // We are currently in string context, save current item. + $this->setItemFromArray($this->_current_item); + + // Start a new entry for the comment. + $this->_current_item = []; + $this->_current_item['#'][] = substr($line, 1); + + $this->_context = 'COMMENT'; + return; + } + else { + // A comment following any other context is a syntax error. + $this->_errors[] = SafeMarkup::format('The translation stream %uri contains an error: "msgstr" was expected but not found on line %line.', $log_vars); + return FALSE; + } + return; + } + elseif (!strncmp('msgid_plural', $line, 12)) { + // A plural form for the current source string. + + if ($this->_context != 'MSGID') { + // A plural form can only be added to an msgid directly. + $this->_errors[] = SafeMarkup::format('The translation stream %uri contains an error: "msgid_plural" was expected but not found on line %line.', $log_vars); + return FALSE; + } + + // Remove 'msgid_plural' and trim away whitespace. + $line = trim(substr($line, 12)); + + // Only the plural source string is left, parse it. + $quoted = $this->parseQuoted($line); + if ($quoted === FALSE) { + // The plural form must be wrapped in quotes. + $this->_errors[] = SafeMarkup::format('The translation stream %uri contains a syntax error on line %line.', $log_vars); + return FALSE; + } + + // Append the plural source to the current entry. + if (is_string($this->_current_item['msgid'])) { + // The first value was stored as string. Now we know the context is + // plural, it is converted to array. + $this->_current_item['msgid'] = [$this->_current_item['msgid']]; + } + $this->_current_item['msgid'][] = $quoted; + + $this->_context = 'MSGID_PLURAL'; + return; + } + elseif (!strncmp('msgid', $line, 5)) { + // Starting a new message. + + if (($this->_context == 'MSGSTR') || ($this->_context == 'MSGSTR_ARR')) { + // We are currently in string context, save current item. + $this->setItemFromArray($this->_current_item); + + // Start a new context for the msgid. + $this->_current_item = []; + } + elseif ($this->_context == 'MSGID') { + // We are currently already in the context, meaning we passed an id with no data. + $this->_errors[] = SafeMarkup::format('The translation stream %uri contains an error: "msgid" is unexpected on line %line.', $log_vars); + return FALSE; + } + + // Remove 'msgid' and trim away whitespace. + $line = trim(substr($line, 5)); + + // Only the message id string is left, parse it. + $quoted = $this->parseQuoted($line); + if ($quoted === FALSE) { + // The message id must be wrapped in quotes. + $this->_errors[] = SafeMarkup::format('The translation stream %uri contains an error: invalid format for "msgid" on line %line.', $log_vars, $log_vars); + return FALSE; + } + + $this->_current_item['msgid'] = $quoted; + $this->_context = 'MSGID'; + return; + } + elseif (!strncmp('msgctxt', $line, 7)) { + // Starting a new context. + + if (($this->_context == 'MSGSTR') || ($this->_context == 'MSGSTR_ARR')) { + // We are currently in string context, save current item. + $this->setItemFromArray($this->_current_item); + $this->_current_item = []; + } + elseif (!empty($this->_current_item['msgctxt'])) { + // A context cannot apply to another context. + $this->_errors[] = SafeMarkup::format('The translation stream %uri contains an error: "msgctxt" is unexpected on line %line.', $log_vars); + return FALSE; + } + + // Remove 'msgctxt' and trim away whitespaces. + $line = trim(substr($line, 7)); + + // Only the msgctxt string is left, parse it. + $quoted = $this->parseQuoted($line); + if ($quoted === FALSE) { + // The context string must be quoted. + $this->_errors[] = SafeMarkup::format('The translation stream %uri contains an error: invalid format for "msgctxt" on line %line.', $log_vars); + return FALSE; + } + + $this->_current_item['msgctxt'] = $quoted; + + $this->_context = 'MSGCTXT'; + return; + } + elseif (!strncmp('msgstr[', $line, 7)) { + // A message string for a specific plurality. + + if (($this->_context != 'MSGID') && + ($this->_context != 'MSGCTXT') && + ($this->_context != 'MSGID_PLURAL') && + ($this->_context != 'MSGSTR_ARR')) { + // Plural message strings must come after msgid, msgxtxt, + // msgid_plural, or other msgstr[] entries. + $this->_errors[] = SafeMarkup::format('The translation stream %uri contains an error: "msgstr[]" is unexpected on line %line.', $log_vars); + return FALSE; + } + + // Ensure the plurality is terminated. + if (strpos($line, ']') === FALSE) { + $this->_errors[] = SafeMarkup::format('The translation stream %uri contains an error: invalid format for "msgstr[]" on line %line.', $log_vars); + return FALSE; + } + + // Extract the plurality. + $frombracket = strstr($line, '['); + $this->_current_plural_index = substr($frombracket, 1, strpos($frombracket, ']') - 1); + + // Skip to the next whitespace and trim away any further whitespace, + // bringing $line to the message text only. + $line = trim(strstr($line, " ")); + + $quoted = $this->parseQuoted($line); + if ($quoted === FALSE) { + // The string must be quoted. + $this->_errors[] = SafeMarkup::format('The translation stream %uri contains an error: invalid format for "msgstr[]" on line %line.', $log_vars); + return FALSE; + } + if (!isset($this->_current_item['msgstr']) || !is_array($this->_current_item['msgstr'])) { + $this->_current_item['msgstr'] = []; + } + + $this->_current_item['msgstr'][$this->_current_plural_index] = $quoted; + + $this->_context = 'MSGSTR_ARR'; + return; + } + elseif (!strncmp("msgstr", $line, 6)) { + // A string pair for an msgid (with optional context). + + if (($this->_context != 'MSGID') && ($this->_context != 'MSGCTXT')) { + // Strings are only valid within an id or context scope. + $this->_errors[] = SafeMarkup::format('The translation stream %uri contains an error: "msgstr" is unexpected on line %line.', $log_vars); + return FALSE; + } + + // Remove 'msgstr' and trim away away whitespaces. + $line = trim(substr($line, 6)); + + // Only the msgstr string is left, parse it. + $quoted = $this->parseQuoted($line); + if ($quoted === FALSE) { + // The string must be quoted. + $this->_errors[] = SafeMarkup::format('The translation stream %uri contains an error: invalid format for "msgstr" on line %line.', $log_vars); + return FALSE; + } + + $this->_current_item['msgstr'] = $quoted; + + $this->_context = 'MSGSTR'; + return; + } + elseif ($line != '') { + // Anything that is not a token may be a continuation of a previous token. + + $quoted = $this->parseQuoted($line); + if ($quoted === FALSE) { + // This string must be quoted. + $this->_errors[] = SafeMarkup::format('The translation stream %uri contains an error: string continuation expected on line %line.', $log_vars); + return FALSE; + } + + // Append the string to the current item. + if (($this->_context == 'MSGID') || ($this->_context == 'MSGID_PLURAL')) { + if (is_array($this->_current_item['msgid'])) { + // Add string to last array element for plural sources. + $last_index = count($this->_current_item['msgid']) - 1; + $this->_current_item['msgid'][$last_index] .= $quoted; + } + else { + // Singular source, just append the string. + $this->_current_item['msgid'] .= $quoted; + } + } + elseif ($this->_context == 'MSGCTXT') { + // Multiline context name. + $this->_current_item['msgctxt'] .= $quoted; + } + elseif ($this->_context == 'MSGSTR') { + // Multiline translation string. + $this->_current_item['msgstr'] .= $quoted; + } + elseif ($this->_context == 'MSGSTR_ARR') { + // Multiline plural translation string. + $this->_current_item['msgstr'][$this->_current_plural_index] .= $quoted; + } + else { + // No valid context to append to. + $this->_errors[] = SafeMarkup::format('The translation stream %uri contains an error: unexpected string on line %line.', $log_vars); + return FALSE; + } + return; + } + } + + // Empty line read or EOF of PO stream, close out the last entry. + if (($this->_context == 'MSGSTR') || ($this->_context == 'MSGSTR_ARR')) { + $this->setItemFromArray($this->_current_item); + $this->_current_item = []; + } + elseif ($this->_context != 'COMMENT') { + $this->_errors[] = SafeMarkup::format('The translation stream %uri ended unexpectedly at line %line.', $log_vars); + return FALSE; + } + } + + /** + * Store the parsed values as a PoItem object. + */ + public function setItemFromArray($value) { + $plural = FALSE; + + $comments = ''; + if (isset($value['#'])) { + $comments = $this->shortenComments($value['#']); + } + + if (is_array($value['msgstr'])) { + // Sort plural variants by their form index. + ksort($value['msgstr']); + $plural = TRUE; + } + + $item = new PoItem(); + $item->setContext(isset($value['msgctxt']) ? $value['msgctxt'] : ''); + $item->setSource($value['msgid']); + $item->setTranslation($value['msgstr']); + $item->setPlural($plural); + $item->setComment($comments); + $item->setLangcode($this->_langcode); + + $this->_last_item = $item; + + $this->_context = 'COMMENT'; + } + + /** + * Parses a string in quotes. + * + * @param $string + * A string specified with enclosing quotes. + * + * @return + * The string parsed from inside the quotes. + */ + public function parseQuoted($string) { + if (substr($string, 0, 1) != substr($string, -1, 1)) { + // Start and end quotes must be the same. + return FALSE; + } + $quote = substr($string, 0, 1); + $string = substr($string, 1, -1); + if ($quote == '"') { + // Double quotes: strip slashes. + return stripcslashes($string); + } + elseif ($quote == "'") { + // Simple quote: return as-is. + return $string; + } + else { + // Unrecognized quote. + return FALSE; + } + } + + /** + * Generates a short, one-string version of the passed comment array. + * + * @param $comment + * An array of strings containing a comment. + * + * @return + * Short one-string version of the comment. + */ + private function shortenComments($comment) { + $comm = ''; + while (count($comment)) { + $test = $comm . substr(array_shift($comment), 1) . ', '; + if (strlen($comm) < 130) { + $comm = $test; + } + else { + break; + } + } + return trim(substr($comm, 0, -2)); + } + +}