PhpTransliteration.php

Same filename in this branch
  1. 11.x core/lib/Drupal/Core/Transliteration/PhpTransliteration.php
Same filename and directory in other branches
  1. 9 core/lib/Drupal/Core/Transliteration/PhpTransliteration.php
  2. 9 core/lib/Drupal/Component/Transliteration/PhpTransliteration.php
  3. 8.9.x core/lib/Drupal/Core/Transliteration/PhpTransliteration.php
  4. 8.9.x core/lib/Drupal/Component/Transliteration/PhpTransliteration.php
  5. 10 core/lib/Drupal/Core/Transliteration/PhpTransliteration.php
  6. 10 core/lib/Drupal/Component/Transliteration/PhpTransliteration.php

Namespace

Drupal\Component\Transliteration

File

core/lib/Drupal/Component/Transliteration/PhpTransliteration.php

View source
<?php

namespace Drupal\Component\Transliteration;

// cspell:ignore Brion Vibber

/**
 * Implements transliteration without using the PECL extensions.
 *
 * Transliterations are done character-by-character, by looking up non-US-ASCII
 * characters in a transliteration database.
 *
 * The database comes from two types of files, both of which are searched for in
 * the PhpTransliteration::$dataDirectory directory. First, language-specific
 * overrides are searched (see PhpTransliteration::readLanguageOverrides()). If
 * there is no language-specific override for a character, the generic
 * transliteration character tables are searched (see
 * PhpTransliteration::readGenericData()). If looking up the character in the
 * generic table results in a NULL value, or an illegal character is
 * encountered, then a substitute character is returned.
 *
 * Some parts of this code were derived from the MediaWiki project's UtfNormal
 * class, Copyright © 2004 Brion Vibber <brion@pobox.com>,
 * http://www.mediawiki.org/
 */
class PhpTransliteration implements TransliterationInterface {

    /**
     * Directory where data for transliteration resides.
     *
     * The constructor sets this (by default) to subdirectory 'data' underneath
     * the directory where the class's PHP file resides.
     *
     * @var string
     */
    protected $dataDirectory;

    /**
     * Associative array of language-specific character transliteration tables.
     *
     * The outermost array keys are language codes. For each language code key,
     * the value is an array whose keys are Unicode character codes, and whose
     * values are the transliterations of those characters to US-ASCII. This is
     * set up as needed in PhpTransliteration::replace() by calling
     * PhpTransliteration::readLanguageOverrides().
     *
     * @var array
     */
    protected $languageOverrides = [];

    /**
     * Non-language-specific transliteration tables.
     *
     * Array whose keys are "bank" numbers (the Unicode character code shifted
     * right by eight bits), and whose values are arrays keyed by the low byte
     * of the character code (0-255) holding the transliteration of each
     * character. This is set up as needed in
     * PhpTransliteration::lookupReplacement() by calling
     * PhpTransliteration::readGenericData().
     *
     * @var array
     */
    protected $genericMap = [];

    /**
     * Special characters for ::removeDiacritics().
     *
     * Characters which have accented variants but their base character
     * transliterates to more than one ASCII character require special
     * treatment: we want to remove their accent and use the un-
     * transliterated base character.
     *
     * Keys are the multi-character ASCII transliterations, values are the
     * un-accented base characters to use instead.
     *
     * @var string[]
     */
    protected $fixTransliterateForRemoveDiacritics = [
        'AE' => 'Æ',
        'ae' => 'æ',
        'ZH' => 'Ʒ',
        'zh' => 'ʒ',
    ];

    /**
     * Constructs a transliteration object.
     *
     * @param string $data_directory
     *   (optional) The directory where data files reside. If omitted, defaults
     *   to subdirectory 'data' underneath the directory where the class's PHP
     *   file resides.
     */
    public function __construct($data_directory = NULL) {
        $this->dataDirectory = $data_directory ?? __DIR__ . '/data';
    }

    /**
     * {@inheritdoc}
     */
    public function removeDiacritics($string) {
        // preg_split() with the 'u' modifier returns FALSE when the subject is
        // not valid UTF-8. Accented letters cannot be identified in such input,
        // so hand it back untouched instead of iterating over FALSE (which
        // would raise a warning and drop the whole string).
        $characters = preg_split('//u', $string, 0, PREG_SPLIT_NO_EMPTY);
        if ($characters === FALSE) {
            return $string;
        }

        // Two Unicode ranges include the accented US-ASCII letters, with a few
        // characters that aren't accented letters mixed in. These are the
        // excluded characters of each range; they do not depend on the
        // character being processed, so they are defined once up front.
        $exclusions_range1 = [
            0xd0,
            0xd7,
            0xf0,
            0xf7,
            0x138,
            0x14a,
            0x14b,
        ];
        $exclusions_range2 = [
            0x1dd,
            0x1f7,
            0x21c,
            0x21d,
            0x220,
            0x221,
            0x241,
            0x242,
            0x245,
        ];

        $result = '';
        foreach ($characters as $character) {
            $code = self::ordUTF8($character);
            $in_range1 = $code > 0xbf && $code < 0x17f && !in_array($code, $exclusions_range1, TRUE);
            $in_range2 = $code > 0x1cc && $code < 0x250 && !in_array($code, $exclusions_range2, TRUE);

            // Characters outside the ranges are copied through unchanged.
            $replacement = $character;
            if ($in_range1 || $in_range2) {
                // 'xyz' is a sentinel for "no mapping": it is neither one
                // character long nor a key of the special-case table, so an
                // unmapped character is left as it is.
                $to_add = $this->lookupReplacement($code, 'xyz');
                if (strlen($to_add) === 1) {
                    $replacement = $to_add;
                }
                elseif (isset($this->fixTransliterateForRemoveDiacritics[$to_add])) {
                    $replacement = $this->fixTransliterateForRemoveDiacritics[$to_add];
                }
            }
            $result .= $replacement;
        }
        return $result;
    }

    /**
     * {@inheritdoc}
     */
    public function transliterate($string, $langcode = 'en', $unknown_character = '?', $max_length = NULL) {
        $result = '';
        $length = 0;
        $hash = FALSE;
        // Replace question marks with a unique hash if necessary. This because
        // mb_convert_encoding() replaces all invalid characters with a question
        // mark.
        if ($unknown_character != '?' && str_contains($string, '?')) {
            $hash = hash('sha256', $string);
            $string = str_replace('?', $hash, $string);
        }
        // Ensure the string is valid UTF8 for preg_split(). Unknown characters will
        // be replaced by a question mark.
        $string = mb_convert_encoding($string, 'UTF-8', 'UTF-8');
        // Use the provided unknown character instead of a question mark.
        if ($unknown_character != '?') {
            $string = str_replace('?', $unknown_character, $string);
            // Restore original question marks if necessary.
            if ($hash !== FALSE) {
                $string = str_replace($hash, '?', $string);
            }
        }
        // Split into Unicode characters and transliterate each one.
        foreach (preg_split('//u', $string, 0, PREG_SPLIT_NO_EMPTY) as $character) {
            $code = self::ordUTF8($character);
            if ($code === -1) {
                $to_add = $unknown_character;
            }
            else {
                $to_add = $this->replace($code, $langcode, $unknown_character);
            }
            // Check if this exceeds the maximum allowed length. A replacement
            // is added in full or not at all, so the result is never cut in
            // the middle of a multi-character transliteration.
            if (isset($max_length)) {
                $length += strlen($to_add);
                if ($length > $max_length) {
                    // There is no more space.
                    return $result;
                }
            }
            $result .= $to_add;
        }
        return $result;
    }

    /**
     * Finds the character code for a UTF-8 character: like ord() but for UTF-8.
     *
     * Only the lead byte and the number of available bytes are checked;
     * continuation bytes are masked, not validated.
     *
     * @param string $character
     *   A single UTF-8 character.
     *
     * @return int
     *   The character code, or -1 if an illegal character is found (empty
     *   input, an illegal lead byte, or a sequence shorter than its lead byte
     *   announces).
     */
    protected static function ordUTF8($character) {
        $available = strlen($character);
        if ($available === 0) {
            // Nothing to decode.
            return -1;
        }
        $first_byte = ord($character[0]);
        if (($first_byte & 0x80) == 0) {
            // Single-byte form: 0xxxxxxx.
            return $first_byte;
        }
        if (($first_byte & 0xe0) == 0xc0) {
            // Two-byte form: 110xxxxx 10xxxxxx.
            if ($available < 2) {
                return -1;
            }
            return (($first_byte & 0x1f) << 6) + (ord($character[1]) & 0x3f);
        }
        if (($first_byte & 0xf0) == 0xe0) {
            // Three-byte form: 1110xxxx 10xxxxxx 10xxxxxx.
            if ($available < 3) {
                return -1;
            }
            return (($first_byte & 0xf) << 12) + ((ord($character[1]) & 0x3f) << 6) + (ord($character[2]) & 0x3f);
        }
        if (($first_byte & 0xf8) == 0xf0) {
            // Four-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
            if ($available < 4) {
                return -1;
            }
            return (($first_byte & 0x7) << 18) + ((ord($character[1]) & 0x3f) << 12) + ((ord($character[2]) & 0x3f) << 6) + (ord($character[3]) & 0x3f);
        }
        // Other forms are not legal.
        return -1;
    }

    /**
     * Replaces a single Unicode character using the transliteration database.
     *
     * @param int $code
     *   The character code of a Unicode character.
     * @param string $langcode
     *   The language code of the language the character is in.
     * @param string $unknown_character
     *   The character to substitute for characters without transliterated
     *   equivalents.
     *
     * @return string
     *   US-ASCII replacement character. If it has a mapping, it is returned;
     *   otherwise, $unknown_character is returned. The replacement can contain
     *   multiple characters.
     */
    protected function replace($code, $langcode, $unknown_character) {
        if ($code < 0x80) {
            // Already lower ASCII.
            return chr($code);
        }
        // See if there is a language-specific override for this character.
        // The override table is loaded at most once per language code.
        if (!isset($this->languageOverrides[$langcode])) {
            $this->readLanguageOverrides($langcode);
        }
        if (isset($this->languageOverrides[$langcode][$code])) {
            return $this->languageOverrides[$langcode][$code];
        }
        return $this->lookupReplacement($code, $unknown_character);
    }

    /**
     * Look up the generic replacement for a Unicode character code.
     *
     * @param int $code
     *   The Unicode character code, as returned by
     *   PhpTransliteration::ordUTF8().
     * @param string $unknown_character
     *   (optional) The character to substitute for characters without entries in
     *   the replacement tables.
     *
     * @return string
     *   US-ASCII replacement characters. If it has a mapping, it is returned;
     *   otherwise, $unknown_character is returned. The replacement can contain
     *   multiple characters.
     */
    protected function lookupReplacement($code, $unknown_character = '?') {
        // The bank is everything above the low byte of the code; each bank has
        // its own data file, which is read the first time it is needed.
        $bank = $code >> 8;
        if (!isset($this->genericMap[$bank])) {
            $this->readGenericData($bank);
        }
        // Within a bank, entries are keyed by the low byte of the code. A
        // missing or NULL entry yields the substitute character.
        $code = $code & 0xff;
        return $this->genericMap[$bank][$code] ?? $unknown_character;
    }

    /**
     * Reads in language overrides for a language code.
     *
     * The data is read from files named "$langcode.php" in
     * PhpTransliteration::$dataDirectory. These files should set up an array
     * variable $overrides with an element whose key is $langcode and whose value
     * is an array whose keys are character codes, and whose values are their
     * transliterations in this language. The character codes can be for any valid
     * Unicode character, independent of the number of bytes.
     *
     * If the file is missing, or does not leave an array in
     * $overrides[$langcode], an empty table is stored for the language.
     *
     * @param string $langcode
     *   Code for the language to read.
     */
    protected function readLanguageOverrides($langcode) {
        // Figure out the file name to use by sanitizing the language code,
        // just in case.
        $file = $this->dataDirectory . '/' . preg_replace('/[^a-zA-Z\\-]/', '', $langcode) . '.php';
        // Read in this file, which should set up a variable called $overrides,
        // which will be local to this function.
        $overrides[$langcode] = [];
        if (is_file($file)) {
            include $file;
        }
        // The included file may have replaced $overrides wholesale. Always
        // store an array so that the isset() check in replace() succeeds and
        // the file is not included again for every character.
        $table = $overrides[$langcode] ?? [];
        $this->languageOverrides[$langcode] = is_array($table) ? $table : [];
    }

    /**
     * Reads in generic transliteration data for a bank of characters.
     *
     * The data is read in from a file named "x$bank.php" (with $bank in
     * hexadecimal notation, at least two digits) in
     * PhpTransliteration::$dataDirectory. These files should set up a variable
     * $base containing an array whose numerical indices are the low byte
     * (0-255) of the character code, and whose values are the
     * transliterations of these characters into US-ASCII.
     *
     * If the file is missing, or does not leave an array in $base, an empty
     * table is stored for the bank.
     *
     * @param int $bank
     *   The character code shifted right by eight bits, or 0 for the ASCII
     *   range.
     */
    protected function readGenericData($bank) {
        // Figure out the file name.
        $file = $this->dataDirectory . '/x' . sprintf('%02x', $bank) . '.php';
        // Read in this file, which should set up a variable called $base, which
        // will be local to this function.
        $base = [];
        if (is_file($file)) {
            include $file;
        }
        // Guard against a data file that leaves something other than an array
        // in $base.
        $this->genericMap[$bank] = is_array($base) ? $base : [];
    }

}

Classes

Title Deprecated Summary
PhpTransliteration Implements transliteration without using the PECL extensions.

Buggy or inaccurate documentation? Please file an issue. Need support? Need help programming? Connect with the Drupal community.