| [ Index ] |
PHP Cross Reference of Drupal 6 (gatewave) |
[Summary view] [Print] [Text view]
<?php
// $Id: transliteration.inc,v 1.6.2.6 2009/11/29 15:59:06 smk Exp $

/**
 * @file
 * Transliteration processing functions.
 */

/**
 * Transliterate UTF-8 encoded text to US-ASCII.
 *
 * Based on Mediawiki's UtfNormal::quickIsNFCVerify().
 *
 * The input is scanned byte by byte. Every malformed UTF-8 sequence is
 * replaced with $unknown: stray tail bytes, truncated sequences, the bytes
 * 0xFE/0xFF, overlong encodings, UTF-16 surrogates (U+D800..U+DFFF) and code
 * points above U+10FFFF. Rejecting overlong encodings matters because they
 * would otherwise decode to plain ASCII characters such as "/" or "." and
 * end up verbatim in the output.
 *
 * @param $string
 *   UTF-8 encoded text input.
 * @param $unknown
 *   Replacement string for characters that do not have a suitable ASCII
 *   equivalent.
 * @param $source_langcode
 *   Optional ISO 639 language code that denotes the language of the input and
 *   is used to apply language-specific variations. If the source language is
 *   not known at the time of transliteration, it is recommended to set this
 *   argument to the site default language to produce consistent results.
 *   Otherwise the current display language will be used.
 * @return
 *   Transliterated text.
 */
function transliteration_process($string, $unknown = '?', $source_langcode = NULL) {
  // ASCII is always valid NFC! If we're only ever given plain ASCII, we can
  // avoid the overhead of initializing the decomposition tables by skipping
  // out early.
  if (!preg_match('/[\x80-\xff]/', $string)) {
    return $string;
  }

  static $tailBytes;

  // Smallest code point that genuinely requires a sequence of the given total
  // length (head byte plus tail bytes). A decoded value below this minimum
  // means the sequence is an overlong encoding.
  static $min_code_point = array(
    2 => 0x80,
    3 => 0x800,
    4 => 0x10000,
    5 => 0x200000,
    6 => 0x4000000,
  );

  if (!isset($tailBytes)) {
    // Each UTF-8 head byte is followed by a certain number of tail bytes.
    // Bytes that cannot start a multi-byte sequence (ASCII, tail bytes, 0xFE
    // and 0xFF) map to 0.
    $tailBytes = array();
    for ($n = 0; $n < 256; $n++) {
      if ($n < 0xc0) {
        $remaining = 0;
      }
      elseif ($n < 0xe0) {
        $remaining = 1;
      }
      elseif ($n < 0xf0) {
        $remaining = 2;
      }
      elseif ($n < 0xf8) {
        $remaining = 3;
      }
      elseif ($n < 0xfc) {
        $remaining = 4;
      }
      elseif ($n < 0xfe) {
        $remaining = 5;
      }
      else {
        $remaining = 0;
      }
      $tailBytes[chr($n)] = $remaining;
    }
  }

  // Chop the text into pure-ASCII and non-ASCII areas; large ASCII parts can
  // be handled much more quickly. Don't chop up Unicode areas for punctuation,
  // though, that wastes energy.
  preg_match_all('/[\x00-\x7f]+|[\x80-\xff][\x00-\x40\x5b-\x5f\x7b-\xff]*/', $string, $matches);

  $result = '';
  foreach ($matches[0] as $str) {
    if ($str[0] < "\x80") {
      // ASCII chunk: guaranteed to be valid UTF-8 and in normal form C, so
      // skip over it.
      $result .= $str;
      continue;
    }

    // We'll have to examine the chunk byte by byte to ensure that it consists
    // of valid UTF-8 sequences.
    //
    // Since PHP is not the fastest language on earth, some of this code is a
    // little ugly with inner loop optimizations.

    $head = '';
    $chunk = strlen($str);
    // $len counts the bytes still to be read, plus one, so that the
    // pre-decrement in the loop condition reaches zero exactly when the chunk
    // is exhausted.
    $len = $chunk + 1;

    for ($i = -1; --$len; ) {
      $c = $str[++$i];
      if ($remaining = $tailBytes[$c]) {
        // UTF-8 head byte!
        $sequence = $head = $c;
        do {
          // Look for the defined number of tail bytes...
          if (--$len && ($c = $str[++$i]) >= "\x80" && $c < "\xc0") {
            // Legal tail bytes are nice.
            $sequence .= $c;
          }
          else {
            if ($len == 0) {
              // Premature end of string! Drop a replacement character into
              // output to represent the invalid UTF-8 sequence.
              $result .= $unknown;
              break 2;
            }
            else {
              // Illegal tail byte; abandon the sequence.
              $result .= $unknown;
              // Back up and reprocess this byte; it may itself be a legal
              // ASCII or UTF-8 sequence head.
              --$i;
              ++$len;
              continue 2;
            }
          }
        } while (--$remaining);

        // Decode the code point. The head byte contributes its low
        // (7 - sequence length) bits, every tail byte its low 6 bits.
        $length = strlen($sequence);
        $ord = ord($head) & (0x7f >> $length);
        for ($j = 1; $j < $length; $j++) {
          $ord = ($ord << 6) | (ord($sequence[$j]) & 0x3f);
        }

        if ($ord < $min_code_point[$length] || $ord > 0x10ffff || ($ord >= 0xd800 && $ord <= 0xdfff)) {
          // Overlong encoding, surrogate, or value outside the Unicode range:
          // structurally complete but not valid UTF-8, so never look it up.
          $result .= $unknown;
        }
        else {
          $result .= _transliteration_replace($ord, $unknown, $source_langcode);
        }
        $head = '';
      }
      elseif ($c < "\x80") {
        // ASCII byte.
        $result .= $c;
        $head = '';
      }
      elseif ($c < "\xc0") {
        // Illegal tail bytes.
        if ($head == '') {
          $result .= $unknown;
        }
      }
      else {
        // Miscellaneous freaks (0xFE and 0xFF never occur in UTF-8).
        $result .= $unknown;
        $head = '';
      }
    }
  }
  return $result;
}

/**
 * Replace a Unicode character using the transliteration database.
 *
 * The database is split into one data file per "bank" of 256 code points
 * (the code point shifted right by 8 bits). A data file is included on first
 * use and the resulting table is cached per bank and language for the rest
 * of the request.
 *
 * @param $ord
 *   An ordinal Unicode character code.
 * @param $unknown
 *   Replacement string for characters that do not have a suitable ASCII
 *   equivalent.
 * @param $langcode
 *   Optional ISO 639 language code that denotes the language of the input and
 *   is used to apply language-specific variations. Defaults to the current
 *   display language.
 * @return
 *   ASCII replacement character.
 */
function _transliteration_replace($ord, $unknown = '?', $langcode = NULL) {
  static $map = array();

  if (!isset($langcode)) {
    global $language;
    $langcode = $language->language;
  }

  $bank = $ord >> 8;

  if (!isset($map[$bank][$langcode])) {
    $file = drupal_get_path('module', 'transliteration') . '/data/' . sprintf('x%02x', $bank) . '.php';
    if (file_exists($file)) {
      // The data file is expected to populate $base and, optionally,
      // $variant in this function's scope. Start from empty tables so that a
      // file which defines neither yields an empty mapping instead of an
      // undefined-variable notice and a NULL cache entry.
      $base = array();
      $variant = array();
      // Plain include (not include_once): the file has to be evaluated again
      // whenever the same bank is requested for another language.
      include $file;
      if ($langcode != 'en' && isset($variant[$langcode])) {
        // Merge in language specific mappings; with the array union operator
        // the variant entries take precedence over the base table.
        $map[$bank][$langcode] = $variant[$langcode] + $base;
      }
      else {
        $map[$bank][$langcode] = $base;
      }
    }
    else {
      // No data for this bank: cache an empty table so the file system is
      // not queried again.
      $map[$bank][$langcode] = array();
    }
  }

  // Index within the bank.
  $ord = $ord & 255;

  return isset($map[$bank][$langcode][$ord]) ? $map[$bank][$langcode][$ord] : $unknown;
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
| Generated: Thu Mar 24 11:18:33 2011 | Cross-referenced by PHPXref 0.7 |