PhpTransliteration.php
Same filename in this branch
Same filename in other branches
- 9 core/lib/Drupal/Core/Transliteration/PhpTransliteration.php
- 9 core/lib/Drupal/Component/Transliteration/PhpTransliteration.php
- 8.9.x core/lib/Drupal/Core/Transliteration/PhpTransliteration.php
- 8.9.x core/lib/Drupal/Component/Transliteration/PhpTransliteration.php
- 11.x core/lib/Drupal/Core/Transliteration/PhpTransliteration.php
- 11.x core/lib/Drupal/Component/Transliteration/PhpTransliteration.php
Namespace
Drupal\Component\Transliteration
File
-
core/
lib/ Drupal/ Component/ Transliteration/ PhpTransliteration.php
View source
<?php
namespace Drupal\Component\Transliteration;
// cspell:ignore Brion Vibber
/**
 * Implements transliteration without using the PECL extensions.
 *
 * Transliterations are done character-by-character, by looking up non-US-ASCII
 * characters in a transliteration database.
 *
 * The database comes from two types of files, both of which are searched for in
 * the PhpTransliteration::$dataDirectory directory. First, language-specific
 * overrides are searched (see PhpTransliteration::readLanguageOverrides()). If
 * there is no language-specific override for a character, the generic
 * transliteration character tables are searched (see
 * PhpTransliteration::readGenericData()). If looking up the character in the
 * generic table results in a NULL value, or an illegal character is
 * encountered, then a substitute character is returned.
 *
 * Some parts of this code were derived from the MediaWiki project's UtfNormal
 * class, Copyright © 2004 Brion Vibber <brion@pobox.com>,
 * http://www.mediawiki.org/
 */
class PhpTransliteration implements TransliterationInterface {

  /**
   * Directory where data for transliteration resides.
   *
   * The constructor sets this (by default) to subdirectory 'data' underneath
   * the directory where the class's PHP file resides.
   *
   * @var string
   */
  protected $dataDirectory;

  /**
   * Associative array of language-specific character transliteration tables.
   *
   * The outermost array keys are language codes. For each language code key,
   * the value is an array whose keys are Unicode character codes, and whose
   * values are the transliterations of those characters to US-ASCII. This is
   * set up as needed in PhpTransliteration::replace() by calling
   * PhpTransliteration::readLanguageOverrides().
   *
   * @var array
   */
  protected $languageOverrides = [];

  /**
   * Non-language-specific transliteration tables.
   *
   * The outer array is keyed by bank number, which is the Unicode character
   * code shifted right by 8 bits. Each value is an array keyed by the low 8
   * bits of the character code, whose values are the transliterations. Banks
   * are loaded as needed in PhpTransliteration::lookupReplacement() by calling
   * PhpTransliteration::readGenericData().
   *
   * @var array
   */
  protected $genericMap = [];

  /**
   * Special characters for ::removeDiacritics().
   *
   * Characters which have accented variants but their base character
   * transliterates to more than one ASCII character require special
   * treatment: we want to remove their accent and use the un-
   * transliterated base character.
   *
   * Keys are the multi-character ASCII transliterations; values are the
   * unaccented base characters to use instead.
   *
   * @var string[]
   */
  protected $fixTransliterateForRemoveDiacritics = [
    'AE' => 'Æ',
    'ae' => 'æ',
    'ZH' => 'Ʒ',
    'zh' => 'ʒ',
  ];

  /**
   * Constructs a transliteration object.
   *
   * @param string $data_directory
   *   (optional) The directory where data files reside. If omitted, defaults
   *   to subdirectory 'data' underneath the directory where the class's PHP
   *   file resides.
   */
  public function __construct($data_directory = NULL) {
    $this->dataDirectory = $data_directory ?? __DIR__ . '/data';
  }

  /**
   * {@inheritdoc}
   */
  public function removeDiacritics($string) {
    // These two Unicode ranges include the accented US-ASCII letters, with a
    // few characters that aren't accented letters mixed in. The excluded
    // characters are the same for every input character, so they are defined
    // once, outside the loop.
    $exclusions_range1 = [
      0xd0,
      0xd7,
      0xf0,
      0xf7,
      0x138,
      0x14a,
      0x14b,
    ];
    $exclusions_range2 = [
      0x1dd,
      0x1f7,
      0x21c,
      0x21d,
      0x220,
      0x221,
      0x241,
      0x242,
      0x245,
    ];

    $characters = preg_split('//u', $string, 0, PREG_SPLIT_NO_EMPTY);
    if ($characters === FALSE) {
      // preg_split() fails on invalid UTF-8. Sanitize the input the same way
      // ::transliterate() does (invalid sequences become question marks) and
      // split again, rather than iterating over FALSE and returning nothing.
      $characters = preg_split('//u', mb_convert_encoding($string, 'UTF-8', 'UTF-8'), 0, PREG_SPLIT_NO_EMPTY) ?: [];
    }

    $result = '';
    foreach ($characters as $character) {
      $code = self::ordUTF8($character);
      $in_range1 = $code > 0xbf && $code < 0x17f && !in_array($code, $exclusions_range1, TRUE);
      $in_range2 = $code > 0x1cc && $code < 0x250 && !in_array($code, $exclusions_range2, TRUE);

      // By default the character is kept as it is.
      $replacement = $character;
      if ($in_range1 || $in_range2) {
        // 'xyz' is a sentinel for "no mapping": it is longer than one
        // character and is not a key of the fix-up table, so an unmapped
        // character is left untouched.
        $to_add = $this->lookupReplacement($code, 'xyz');
        if (strlen($to_add) === 1) {
          $replacement = $to_add;
        }
        elseif (isset($this->fixTransliterateForRemoveDiacritics[$to_add])) {
          $replacement = $this->fixTransliterateForRemoveDiacritics[$to_add];
        }
      }
      $result .= $replacement;
    }

    return $result;
  }

  /**
   * {@inheritdoc}
   */
  public function transliterate($string, $langcode = 'en', $unknown_character = '?', $max_length = NULL) {
    $result = '';
    $length = 0;
    $hash = FALSE;
    $custom_unknown = $unknown_character !== '?';

    // Replace question marks with a unique hash if necessary. This is because
    // mb_convert_encoding() replaces all invalid characters with a question
    // mark, and genuine question marks must be told apart from those.
    if ($custom_unknown && str_contains($string, '?')) {
      $hash = hash('sha256', $string);
      $string = str_replace('?', $hash, $string);
    }

    // Ensure the string is valid UTF-8 for preg_split(). Unknown characters
    // will be replaced by a question mark.
    $string = mb_convert_encoding($string, 'UTF-8', 'UTF-8');

    // Use the provided unknown character instead of a question mark.
    if ($custom_unknown) {
      $string = str_replace('?', $unknown_character, $string);
      // Restore original question marks if necessary.
      if ($hash !== FALSE) {
        $string = str_replace($hash, '?', $string);
      }
    }

    // Split into Unicode characters and transliterate each one.
    foreach (preg_split('//u', $string, 0, PREG_SPLIT_NO_EMPTY) as $character) {
      $code = self::ordUTF8($character);
      $to_add = $code === -1
        ? $unknown_character
        : $this->replace($code, $langcode, $unknown_character);

      // Check if this exceeds the maximum allowed length. A replacement is
      // either added completely or not at all.
      if (isset($max_length)) {
        $length += strlen($to_add);
        if ($length > $max_length) {
          // There is no more space.
          return $result;
        }
      }

      $result .= $to_add;
    }

    return $result;
  }

  /**
   * Finds the character code for a UTF-8 character: like ord() but for UTF-8.
   *
   * @param string $character
   *   A single UTF-8 character.
   *
   * @return int
   *   The character code, or -1 if an illegal character is found. An empty
   *   string, or a string shorter than its first byte announces, is treated
   *   as illegal.
   */
  protected static function ordUTF8($character) {
    $available = strlen($character);
    if ($available === 0) {
      // Nothing to decode; avoid reading an undefined string offset.
      return -1;
    }

    $first_byte = ord($character[0]);

    if (($first_byte & 0x80) === 0) {
      // Single-byte form: 0xxxxxxxx.
      return $first_byte;
    }

    if (($first_byte & 0xe0) === 0xc0) {
      // Two-byte form: 110xxxxx 10xxxxxx.
      if ($available < 2) {
        return -1;
      }
      return (($first_byte & 0x1f) << 6) + (ord($character[1]) & 0x3f);
    }

    if (($first_byte & 0xf0) === 0xe0) {
      // Three-byte form: 1110xxxx 10xxxxxx 10xxxxxx.
      if ($available < 3) {
        return -1;
      }
      return (($first_byte & 0xf) << 12) + ((ord($character[1]) & 0x3f) << 6) + (ord($character[2]) & 0x3f);
    }

    if (($first_byte & 0xf8) === 0xf0) {
      // Four-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
      if ($available < 4) {
        return -1;
      }
      return (($first_byte & 0x7) << 18) + ((ord($character[1]) & 0x3f) << 12) + ((ord($character[2]) & 0x3f) << 6) + (ord($character[3]) & 0x3f);
    }

    // Other forms are not legal.
    return -1;
  }

  /**
   * Replaces a single Unicode character using the transliteration database.
   *
   * @param int $code
   *   The character code of a Unicode character.
   * @param string $langcode
   *   The language code of the language the character is in.
   * @param string $unknown_character
   *   The character to substitute for characters without transliterated
   *   equivalents.
   *
   * @return string
   *   US-ASCII replacement character. If it has a mapping, it is returned;
   *   otherwise, $unknown_character is returned. The replacement can contain
   *   multiple characters.
   */
  protected function replace($code, $langcode, $unknown_character) {
    if ($code < 0x80) {
      // Already lower ASCII.
      return chr($code);
    }

    // Load the language-specific overrides the first time this language is
    // seen, then prefer an override over the generic tables.
    if (!isset($this->languageOverrides[$langcode])) {
      $this->readLanguageOverrides($langcode);
    }
    if (isset($this->languageOverrides[$langcode][$code])) {
      return $this->languageOverrides[$langcode][$code];
    }

    return $this->lookupReplacement($code, $unknown_character);
  }

  /**
   * Looks up the generic replacement for a Unicode character code.
   *
   * @param int $code
   *   The Unicode character code.
   * @param string $unknown_character
   *   (optional) The character to substitute for characters without entries in
   *   the replacement tables.
   *
   * @return string
   *   US-ASCII replacement characters. If it has a mapping, it is returned;
   *   otherwise, $unknown_character is returned. The replacement can contain
   *   multiple characters.
   */
  protected function lookupReplacement($code, $unknown_character = '?') {
    // The bank is everything above the low 8 bits of the character code; each
    // bank is loaded from its data file at most once.
    $bank = $code >> 8;
    if (!isset($this->genericMap[$bank])) {
      $this->readGenericData($bank);
    }

    // Within a bank, entries are indexed by the low 8 bits. A missing or NULL
    // entry yields the substitute character.
    $code = $code & 0xff;
    return $this->genericMap[$bank][$code] ?? $unknown_character;
  }

  /**
   * Reads in language overrides for a language code.
   *
   * The data is read from files named "$langcode.php" in
   * PhpTransliteration::$dataDirectory. These files should set up an array
   * variable $overrides with an element whose key is $langcode and whose value
   * is an array whose keys are character codes, and whose values are their
   * transliterations in this language. The character codes can be for any valid
   * Unicode character, independent of the number of bytes.
   *
   * If no file exists, or the file does not provide an element for $langcode,
   * an empty override table is stored for the language.
   *
   * @param string $langcode
   *   Code for the language to read.
   */
  protected function readLanguageOverrides($langcode) {
    // Figure out the file name to use by sanitizing the language code,
    // just in case.
    $file = $this->dataDirectory . '/' . preg_replace('/[^a-zA-Z\\-]/', '', $langcode) . '.php';

    // Read in this file, which should set up a variable called $overrides,
    // which will be local to this function.
    $overrides[$langcode] = [];
    if (is_file($file)) {
      include $file;
    }

    // The included file may have replaced $overrides entirely; fall back to an
    // empty table so that the result is always cached as an array.
    $this->languageOverrides[$langcode] = $overrides[$langcode] ?? [];
  }

  /**
   * Reads in generic transliteration data for a bank of characters.
   *
   * The data is read in from a file named "x$bank.php" (with $bank in
   * hexadecimal notation, zero-padded to at least two digits) in
   * PhpTransliteration::$dataDirectory. These files should set up a variable
   * $base containing an array whose numerical indices are the low 8 bits of
   * the character code, and whose values are the transliterations of these
   * characters into US-ASCII. If the file does not exist, an empty table is
   * stored for the bank.
   *
   * @param int $bank
   *   The Unicode character code shifted right by 8 bits, or 0 for the bank
   *   that contains the ASCII range.
   */
  protected function readGenericData($bank) {
    // Figure out the file name.
    $file = $this->dataDirectory . '/x' . sprintf('%02x', $bank) . '.php';

    // Read in this file, which should set up a variable called $base, which
    // will be local to this function.
    $base = [];
    if (is_file($file)) {
      include $file;
    }

    $this->genericMap[$bank] = $base;
  }

}
Classes
Title | Deprecated | Summary |
---|---|---|
PhpTransliteration | | Implements transliteration without using the PECL extensions. |
Buggy or inaccurate documentation? Please file an issue. Need support? Need help programming? Connect with the Drupal community.