[ Index ]

PHP Cross Reference of Drupal 6 (gatewave)

title

Body

[close]

/sites/all/modules/transliteration/ -> transliteration.inc (source)

   1  <?php
   2  // $Id: transliteration.inc,v 1.6.2.6 2009/11/29 15:59:06 smk Exp $
   3  
   4  /**
   5   * @file
   6   * Transliteration processing functions.
   7   */
   8  
   9  /**
  10   * Transliterate UTF-8 encoded text to US-ASCII.
  11   *
  12   * Based on Mediawiki's UtfNormal::quickIsNFCVerify().
  13   *
  14   * @param $string
  15   *   UTF-8 encoded text input.
  16   * @param $unknown
  17   *   Replacement string for characters that do not have a suitable ASCII
  18   *   equivalent.
  19   * @param $source_langcode
  20   *   Optional ISO 639 language code that denotes the language of the input and
  21   *   is used to apply language-specific variations. If the source language is
  22   *   not known at the time of transliteration, it is recommended to set this
  23   *   argument to the site default language to produce consistent results.
  24   *   Otherwise the current display language will be used.
  25   * @return
  26   *   Transliterated text.
  27   */
  28  function transliteration_process($string, $unknown = '?', $source_langcode = NULL) {
  29    // ASCII is always valid NFC! If we're only ever given plain ASCII, we can
  30    // avoid the overhead of initializing the decomposition tables by skipping
  31    // out early.
  32    if (!preg_match('/[\x80-\xff]/', $string)) {
  33      return $string;
  34    }
  35  
  36    static $tailBytes;
  37  
  38    if (!isset($tailBytes)) {
  39      // Each UTF-8 head byte is followed by a certain number of tail bytes.
  40      $tailBytes = array();
  41      for ($n = 0; $n < 256; $n++) {
  42        if ($n < 0xc0) {
  43          $remaining = 0;
  44        }
  45        elseif ($n < 0xe0) {
  46          $remaining = 1;
  47        }
  48        elseif ($n < 0xf0) {
  49          $remaining = 2;
  50        }
  51        elseif ($n < 0xf8) {
  52          $remaining = 3;
  53        }
  54        elseif ($n < 0xfc) {
  55          $remaining = 4;
  56        }
  57        elseif ($n < 0xfe) {
  58          $remaining = 5;
  59        }
  60        else {
  61          $remaining = 0;
  62        }
  63        $tailBytes[chr($n)] = $remaining;
  64      }
  65    }
  66  
  67    // Chop the text into pure-ASCII and non-ASCII areas; large ASCII parts can
  68    // be handled much more quickly. Don't chop up Unicode areas for punctuation,
  69    // though, that wastes energy.
  70    preg_match_all('/[\x00-\x7f]+|[\x80-\xff][\x00-\x40\x5b-\x5f\x7b-\xff]*/', $string, $matches);
  71  
  72    $result = '';
  73    foreach ($matches[0] as $str) {
  74      if ($str[0] < "\x80") {
  75        // ASCII chunk: guaranteed to be valid UTF-8 and in normal form C, so
  76        // skip over it.
  77        $result .= $str;
  78        continue;
  79      }
  80  
  81      // We'll have to examine the chunk byte by byte to ensure that it consists
  82      // of valid UTF-8 sequences, and to see if any of them might not be
  83      // normalized.
  84      //
  85      // Since PHP is not the fastest language on earth, some of this code is a
  86      // little ugly with inner loop optimizations.
  87  
  88      $head = '';
  89      $chunk = strlen($str);
  90      // Counting down is faster. I'm *so* sorry.
  91      $len = $chunk + 1;
  92  
  93      for ($i = -1; --$len; ) {
  94        $c = $str[++$i];
  95        if ($remaining = $tailBytes[$c]) {
  96          // UTF-8 head byte!
  97          $sequence = $head = $c;
  98          do {
  99            // Look for the defined number of tail bytes...
 100            if (--$len && ($c = $str[++$i]) >= "\x80" && $c < "\xc0") {
 101              // Legal tail bytes are nice.
 102              $sequence .= $c;
 103            }
 104            else {
 105              if ($len == 0) {
 106                // Premature end of string! Drop a replacement character into
 107                // output to represent the invalid UTF-8 sequence.
 108                $result .= $unknown;
 109                break 2;
 110              }
 111              else {
 112                // Illegal tail byte; abandon the sequence.
 113                $result .= $unknown;
 114                // Back up and reprocess this byte; it may itself be a legal
 115                // ASCII or UTF-8 sequence head.
 116                --$i;
 117                ++$len;
 118                continue 2;
 119              }
 120            }
 121          } while (--$remaining);
 122  
 123          $n = ord($head);
 124          if ($n <= 0xdf) {
 125            $ord = ($n - 192) * 64 + (ord($sequence[1]) - 128);
 126          }
 127          elseif ($n <= 0xef) {
 128            $ord = ($n - 224) * 4096 + (ord($sequence[1]) - 128) * 64 + (ord($sequence[2]) - 128);
 129          }
 130          elseif ($n <= 0xf7) {
 131            $ord = ($n - 240) * 262144 + (ord($sequence[1]) - 128) * 4096 + (ord($sequence[2]) - 128) * 64 + (ord($sequence[3]) - 128);
 132          }
 133          elseif ($n <= 0xfb) {
 134            $ord = ($n - 248) * 16777216 + (ord($sequence[1]) - 128) * 262144 + (ord($sequence[2]) - 128) * 4096 + (ord($sequence[3]) - 128) * 64 + (ord($sequence[4]) - 128);
 135          }
 136          elseif ($n <= 0xfd) {
 137            $ord = ($n - 252) * 1073741824 + (ord($sequence[1]) - 128) * 16777216 + (ord($sequence[2]) - 128) * 262144 + (ord($sequence[3]) - 128) * 4096 + (ord($sequence[4]) - 128) * 64 + (ord($sequence[5]) - 128);
 138          }
 139          $result .= _transliteration_replace($ord, $unknown, $source_langcode);
 140          $head = '';
 141        }
 142        elseif ($c < "\x80") {
 143          // ASCII byte.
 144          $result .= $c;
 145          $head = '';
 146        }
 147        elseif ($c < "\xc0") {
 148          // Illegal tail bytes.
 149          if ($head == '') {
 150            $result .= $unknown;
 151          }
 152        }
 153        else {
 154          // Miscellaneous freaks.
 155          $result .= $unknown;
 156          $head = '';
 157        }
 158      }
 159    }
 160    return $result;
 161  }
 162  
 163  /**
 164   * Replace a Unicode character using the transliteration database.
 165   *
 166   * @param $ord
 167   *   An ordinal Unicode character code.
 168   * @param $unknown
 169   *   Replacement string for characters that do not have a suitable ASCII
 170   *   equivalent.
 171   * @param $langcode
 172   *   Optional ISO 639 language code that denotes the language of the input and
 173   *   is used to apply language-specific variations.  Defaults to the current
 174   *   display language.
 175   * @return
 176   *   ASCII replacement character.
 177   */
 178  function _transliteration_replace($ord, $unknown = '?', $langcode = NULL) {
 179    static $map = array();
 180  
 181    if (!isset($langcode)) {
 182      global $language;
 183      $langcode = $language->language;
 184    }
 185  
 186    $bank = $ord >> 8;
 187  
 188    if (!isset($map[$bank][$langcode])) {
 189      $file = drupal_get_path('module', 'transliteration') . '/data/' . sprintf('x%02x', $bank) . '.php';
 190      if (file_exists($file)) {
 191        include $file;
 192        if ($langcode != 'en' && isset($variant[$langcode])) {
 193          // Merge in language specific mappings.
 194          $map[$bank][$langcode] = $variant[$langcode] + $base;
 195        }
 196        else {
 197          $map[$bank][$langcode] = $base;
 198        }
 199      }
 200      else {
 201        $map[$bank][$langcode] = array();
 202      }
 203    }
 204  
 205    $ord = $ord & 255;
 206  
 207    return isset($map[$bank][$langcode][$ord]) ? $map[$bank][$langcode][$ord] : $unknown;
 208  }
 209  


Generated: Thu Mar 24 11:18:33 2011 Cross-referenced by PHPXref 0.7