PHPXRef 0.7 : Drupal 6 (gatewave) : /includes/unicode.inc source

[Summary view] [Print] [Text view]
   1  <?php
   2  // $Id: unicode.inc,v 1.29.2.2 2010/12/15 13:50:13 goba Exp $
   3  
   4  /**
   5   * Indicates an error during check for PHP unicode support.
   6   */
   7  define('UNICODE_ERROR', -1);
   8  
   9  /**
  10   * Indicates that standard PHP (emulated) unicode support is being used.
  11   */
  12  define('UNICODE_SINGLEBYTE', 0);
  13  
  14  /**
  15   * Indicates that full unicode support with the PHP mbstring extension is being
  16   * used.
  17   */
  18  define('UNICODE_MULTIBYTE', 1);
  19  
  20  /**
  21   * Wrapper around _unicode_check().
  22   */
  23  function unicode_check() {
  24    list($GLOBALS['multibyte']) = _unicode_check();
  25  }
  26  
  27  /**
  28   * Perform checks about Unicode support in PHP, and set the right settings if
  29   * needed.
  30   *
  31   * Because Drupal needs to be able to handle text in various encodings, we do
  32   * not support mbstring function overloading. HTTP input/output conversion must
  33   * be disabled for similar reasons.
  34   *
  35   * @param $errors
  36   *   Whether to report any fatal errors with form_set_error().
  37   */
  38  function _unicode_check() {
  39    // Ensure translations don't break at install time
  40    $t = get_t();
  41  
  42    // Set the standard C locale to ensure consistent, ASCII-only string handling.
  43    setlocale(LC_CTYPE, 'C');
  44  
  45    // Check for outdated PCRE library
  46    // Note: we check if U+E2 is in the range U+E0 - U+E1. This test returns TRUE on old PCRE versions.
  47    if (preg_match('/[à-á]/u', 'â')) {
  48      return array(UNICODE_ERROR, $t('The PCRE library in your PHP installation is outdated. This will cause problems when handling Unicode text. If you are running PHP 4.3.3 or higher, make sure you are using the PCRE library supplied by PHP. Please refer to the <a href="@url">PHP PCRE documentation</a> for more information.', array('@url' => 'http://www.php.net/pcre')));
  49    }
  50  
  51    // Check for mbstring extension
  52    if (!function_exists('mb_strlen')) {
  53      return array(UNICODE_SINGLEBYTE, $t('Operations on Unicode strings are emulated on a best-effort basis. Install the <a href="@url">PHP mbstring extension</a> for improved Unicode support.', array('@url' => 'http://www.php.net/mbstring')));
  54    }
  55  
  56    // Check mbstring configuration
  57    if (ini_get('mbstring.func_overload') != 0) {
  58      return array(UNICODE_ERROR, $t('Multibyte string function overloading in PHP is active and must be disabled. Check the php.ini <em>mbstring.func_overload</em> setting. Please refer to the <a href="@url">PHP mbstring documentation</a> for more information.', array('@url' => 'http://www.php.net/mbstring')));
  59    }
  60    if (ini_get('mbstring.encoding_translation') != 0) {
  61      return array(UNICODE_ERROR, $t('Multibyte string input conversion in PHP is active and must be disabled. Check the php.ini <em>mbstring.encoding_translation</em> setting. Please refer to the <a href="@url">PHP mbstring documentation</a> for more information.', array('@url' => 'http://www.php.net/mbstring')));
  62    }
  63    if (ini_get('mbstring.http_input') != 'pass') {
  64      return array(UNICODE_ERROR, $t('Multibyte string input conversion in PHP is active and must be disabled. Check the php.ini <em>mbstring.http_input</em> setting. Please refer to the <a href="@url">PHP mbstring documentation</a> for more information.', array('@url' => 'http://www.php.net/mbstring')));
  65    }
  66    if (ini_get('mbstring.http_output') != 'pass') {
  67      return array(UNICODE_ERROR, $t('Multibyte string output conversion in PHP is active and must be disabled. Check the php.ini <em>mbstring.http_output</em> setting. Please refer to the <a href="@url">PHP mbstring documentation</a> for more information.', array('@url' => 'http://www.php.net/mbstring')));
  68    }
  69  
  70    // Set appropriate configuration
  71    mb_internal_encoding('utf-8');
  72    mb_language('uni');
  73    return array(UNICODE_MULTIBYTE, '');
  74  }
  75  
  76  /**
  77   * Return Unicode library status and errors.
  78   */
  79  function unicode_requirements() {
  80    // Ensure translations don't break at install time
  81    $t = get_t();
  82  
  83    $libraries = array(
  84      UNICODE_SINGLEBYTE => $t('Standard PHP'),
  85      UNICODE_MULTIBYTE => $t('PHP Mbstring Extension'),
  86      UNICODE_ERROR => $t('Error'),
  87    );
  88    $severities = array(
  89      UNICODE_SINGLEBYTE => REQUIREMENT_WARNING,
  90      UNICODE_MULTIBYTE => REQUIREMENT_OK,
  91      UNICODE_ERROR => REQUIREMENT_ERROR,
  92    );
  93    list($library, $description) = _unicode_check();
  94  
  95    $requirements['unicode'] = array(
  96      'title' => $t('Unicode library'),
  97      'value' => $libraries[$library],
  98    );
  99    if ($description) {
 100      $requirements['unicode']['description'] = $description;
 101    }
 102  
 103    $requirements['unicode']['severity'] = $severities[$library];
 104  
 105    return $requirements;
 106  }
 107  
 108  /**
 109   * Prepare a new XML parser.
 110   *
 111   * This is a wrapper around xml_parser_create() which extracts the encoding from
 112   * the XML data first and sets the output encoding to UTF-8. This function should
 113   * be used instead of xml_parser_create(), because PHP 4's XML parser doesn't
 114   * check the input encoding itself. "Starting from PHP 5, the input encoding is
 115   * automatically detected, so that the encoding parameter specifies only the
 116   * output encoding."
 117   *
 118   * This is also where unsupported encodings will be converted. Callers should
 119   * take this into account: $data might have been changed after the call.
 120   *
 121   * @param &$data
 122   *   The XML data which will be parsed later.
 123   * @return
 124   *   An XML parser object.
 125   */
 126  function drupal_xml_parser_create(&$data) {
 127    // Default XML encoding is UTF-8
 128    $encoding = 'utf-8';
 129    $bom = FALSE;
 130  
 131    // Check for UTF-8 byte order mark (PHP5's XML parser doesn't handle it).
 132    if (!strncmp($data, "\xEF\xBB\xBF", 3)) {
 133      $bom = TRUE;
 134      $data = substr($data, 3);
 135    }
 136  
 137    // Check for an encoding declaration in the XML prolog if no BOM was found.
 138    if (!$bom && ereg('^<\?xml[^>]+encoding="([^"]+)"', $data, $match)) {
 139      $encoding = $match[1];
 140    }
 141  
 142    // Unsupported encodings are converted here into UTF-8.
 143    $php_supported = array('utf-8', 'iso-8859-1', 'us-ascii');
 144    if (!in_array(strtolower($encoding), $php_supported)) {
 145      $out = drupal_convert_to_utf8($data, $encoding);
 146      if ($out !== FALSE) {
 147        $encoding = 'utf-8';
 148        $data = ereg_replace('^(<\?xml[^>]+encoding)="([^"]+)"', '\\1="utf-8"', $out);
 149      }
 150      else {
 151        watchdog('php', 'Could not convert XML encoding %s to UTF-8.', array('%s' => $encoding), WATCHDOG_WARNING);
 152        return 0;
 153      }
 154    }
 155  
 156    $xml_parser = xml_parser_create($encoding);
 157    xml_parser_set_option($xml_parser, XML_OPTION_TARGET_ENCODING, 'utf-8');
 158    return $xml_parser;
 159  }
 160  
 161  /**
 162   * Convert data to UTF-8
 163   *
 164   * Requires the iconv, GNU recode or mbstring PHP extension.
 165   *
 166   * @param $data
 167   *   The data to be converted.
 168   * @param $encoding
 169   *   The encoding that the data is in
 170   * @return
 171   *   Converted data or FALSE.
 172   */
 173  function drupal_convert_to_utf8($data, $encoding) {
 174    if (function_exists('iconv')) {
 175      $out = @iconv($encoding, 'utf-8', $data);
 176    }
 177    else if (function_exists('mb_convert_encoding')) {
 178      $out = @mb_convert_encoding($data, 'utf-8', $encoding);
 179    }
 180    else if (function_exists('recode_string')) {
 181      $out = @recode_string($encoding .'..utf-8', $data);
 182    }
 183    else {
 184      watchdog('php', 'Unsupported encoding %s. Please install iconv, GNU recode or mbstring for PHP.', array('%s' => $encoding), WATCHDOG_ERROR);
 185      return FALSE;
 186    }
 187  
 188    return $out;
 189  }
 190  
 191  /**
 192   * Truncate a UTF-8-encoded string safely to a number of bytes.
 193   *
 194   * If the end position is in the middle of a UTF-8 sequence, it scans backwards
 195   * until the beginning of the byte sequence.
 196   *
 197   * Use this function whenever you want to chop off a string at an unsure
 198   * location. On the other hand, if you're sure that you're splitting on a
 199   * character boundary (e.g. after using strpos() or similar), you can safely use
 200   * substr() instead.
 201   *
 202   * @param $string
 203   *   The string to truncate.
 204   * @param $len
 205   *   An upper limit on the returned string length.
 206   * @return
 207   *   The truncated string.
 208   */
 209  function drupal_truncate_bytes($string, $len) {
 210    if (strlen($string) <= $len) {
 211      return $string;
 212    }
 213    if ((ord($string[$len]) < 0x80) || (ord($string[$len]) >= 0xC0)) {
 214      return substr($string, 0, $len);
 215    }
 216    while (--$len >= 0 && ord($string[$len]) >= 0x80 && ord($string[$len]) < 0xC0) {};
 217    return substr($string, 0, $len);
 218  }
 219  
 220  /**
 221   * Truncate a UTF-8-encoded string safely to a number of characters.
 222   *
 223   * @param $string
 224   *   The string to truncate.
 225   * @param $len
 226   *   An upper limit on the returned string length.
 227   * @param $wordsafe
 228   *   Flag to truncate at last space within the upper limit. Defaults to FALSE.
 229   * @param $dots
 230   *   Flag to add trailing dots. Defaults to FALSE.
 231   * @return
 232   *   The truncated string.
 233   */
 234  function truncate_utf8($string, $len, $wordsafe = FALSE, $dots = FALSE) {
 235  
 236    if (drupal_strlen($string) <= $len) {
 237      return $string;
 238    }
 239  
 240    if ($dots) {
 241      $len -= 4;
 242    }
 243  
 244    if ($wordsafe) {
 245      $string = drupal_substr($string, 0, $len + 1); // leave one more character
 246      if ($last_space = strrpos($string, ' ')) { // space exists AND is not on position 0
 247        $string = substr($string, 0, $last_space);
 248      }
 249      else {
 250        $string = drupal_substr($string, 0, $len);
 251      }
 252    }
 253    else {
 254      $string = drupal_substr($string, 0, $len);
 255    }
 256  
 257    if ($dots) {
 258      $string .= ' ...';
 259    }
 260  
 261    return $string;
 262  }
 263  
 264  /**
 265   * Encodes MIME/HTTP header values that contain non-ASCII, UTF-8 encoded
 266   * characters.
 267   *
 268   * For example, mime_header_encode('tést.txt') returns "=?UTF-8?B?dMOpc3QudHh0?=".
 269   *
 270   * See http://www.rfc-editor.org/rfc/rfc2047.txt for more information.
 271   *
 272   * Notes:
 273   * - Only encode strings that contain non-ASCII characters.
 274   * - We progressively cut-off a chunk with truncate_utf8(). This is to ensure
 275   *   each chunk starts and ends on a character boundary.
 276   * - Using \n as the chunk separator may cause problems on some systems and may
 277   *   have to be changed to \r\n or \r.
 278   */
 279  function mime_header_encode($string) {
 280    if (preg_match('/[^\x20-\x7E]/', $string)) {
 281      $chunk_size = 47; // floor((75 - strlen("=?UTF-8?B??=")) * 0.75);
 282      $len = strlen($string);
 283      $output = '';
 284      while ($len > 0) {
 285        $chunk = drupal_truncate_bytes($string, $chunk_size);
 286        $output .= ' =?UTF-8?B?'. base64_encode($chunk) ."?=\n";
 287        $c = strlen($chunk);
 288        $string = substr($string, $c);
 289        $len -= $c;
 290      }
 291      return trim($output);
 292    }
 293    return $string;
 294  }
 295  
 296  /**
 297   * Complement to mime_header_encode
 298   */
 299  function mime_header_decode($header) {
 300    // First step: encoded chunks followed by other encoded chunks (need to collapse whitespace)
 301    $header = preg_replace_callback('/=\?([^?]+)\?(Q|B)\?([^?]+|\?(?!=))\?=\s+(?==\?)/', '_mime_header_decode', $header);
 302    // Second step: remaining chunks (do not collapse whitespace)
 303    return preg_replace_callback('/=\?([^?]+)\?(Q|B)\?([^?]+|\?(?!=))\?=/', '_mime_header_decode', $header);
 304  }
 305  
 306  /**
 307   * Helper function to mime_header_decode
 308   */
 309  function _mime_header_decode($matches) {
 310    // Regexp groups:
 311    // 1: Character set name
 312    // 2: Escaping method (Q or B)
 313    // 3: Encoded data
 314    $data = ($matches[2] == 'B') ? base64_decode($matches[3]) : str_replace('_', ' ', quoted_printable_decode($matches[3]));
 315    if (strtolower($matches[1]) != 'utf-8') {
 316      $data = drupal_convert_to_utf8($data, $matches[1]);
 317    }
 318    return $data;
 319  }
 320  
 321  /**
 322   * Decodes all HTML entities (including numerical ones) to regular UTF-8 bytes.
 323   *
 324   * Double-escaped entities will only be decoded once ("&amp;lt;" becomes "&lt;",
 325   * not "<"). Be careful when using this function, as decode_entities can revert
 326   * previous sanitization efforts (&lt;script&gt; will become <script>).
 327   *
 328   * @param $text
 329   *   The text to decode entities in.
 330   * @param $exclude
 331   *   An array of characters which should not be decoded. For example,
 332   *   array('<', '&', '"'). This affects both named and numerical entities.
 333   *
 334   * @return
 335   *   The input $text, with all HTML entities decoded once.
 336   */
 337  function decode_entities($text, $exclude = array()) {
 338    static $html_entities;
 339    if (!isset($html_entities)) {
 340      include_once  './includes/unicode.entities.inc';
 341    }
 342  
 343    // Flip the exclude list so that we can do quick lookups later.
 344    $exclude = array_flip($exclude);
 345  
 346    // Use a regexp to select all entities in one pass, to avoid decoding 
 347    // double-escaped entities twice. The PREG_REPLACE_EVAL modifier 'e' is
 348    // being used to allow for a callback (see 
 349    // http://php.net/manual/en/reference.pcre.pattern.modifiers).
 350    return preg_replace('/&(#x?)?([A-Za-z0-9]+);/e', '_decode_entities("$1", "$2", "$0", $html_entities, $exclude)', $text);
 351  }
 352  
 353  /**
 354   * Helper function for decode_entities
 355   */
 356  function _decode_entities($prefix, $codepoint, $original, &$html_entities, &$exclude) {
 357    // Named entity
 358    if (!$prefix) {
 359      // A named entity not in the exclude list.
 360      if (isset($html_entities[$original]) && !isset($exclude[$html_entities[$original]])) {
 361        return $html_entities[$original];
 362      }
 363      else {
 364        return $original;
 365      }
 366    }
 367    // Hexadecimal numerical entity
 368    if ($prefix == '#x') {
 369      $codepoint = base_convert($codepoint, 16, 10);
 370    }
 371    // Decimal numerical entity (strip leading zeros to avoid PHP octal notation)
 372    else {
 373      $codepoint = preg_replace('/^0+/', '', $codepoint);
 374    }
 375    // Encode codepoint as UTF-8 bytes
 376    if ($codepoint < 0x80) {
 377      $str = chr($codepoint);
 378    }
 379    else if ($codepoint < 0x800) {
 380      $str = chr(0xC0 | ($codepoint >> 6))
 381           . chr(0x80 | ($codepoint & 0x3F));
 382    }
 383    else if ($codepoint < 0x10000) {
 384      $str = chr(0xE0 | ( $codepoint >> 12))
 385           . chr(0x80 | (($codepoint >> 6) & 0x3F))
 386           . chr(0x80 | ( $codepoint       & 0x3F));
 387    }
 388    else if ($codepoint < 0x200000) {
 389      $str = chr(0xF0 | ( $codepoint >> 18))
 390           . chr(0x80 | (($codepoint >> 12) & 0x3F))
 391           . chr(0x80 | (($codepoint >> 6)  & 0x3F))
 392           . chr(0x80 | ( $codepoint        & 0x3F));
 393    }
 394    // Check for excluded characters
 395    if (isset($exclude[$str])) {
 396      return $original;
 397    }
 398    else {
 399      return $str;
 400    }
 401  }
 402  
 403  /**
 404   * Count the amount of characters in a UTF-8 string. This is less than or
 405   * equal to the byte count.
 406   */
 407  function drupal_strlen($text) {
 408    global $multibyte;
 409    if ($multibyte == UNICODE_MULTIBYTE) {
 410      return mb_strlen($text);
 411    }
 412    else {
 413      // Do not count UTF-8 continuation bytes.
 414      return strlen(preg_replace("/[\x80-\xBF]/", '', $text));
 415    }
 416  }
 417  
 418  /**
 419   * Uppercase a UTF-8 string.
 420   */
 421  function drupal_strtoupper($text) {
 422    global $multibyte;
 423    if ($multibyte == UNICODE_MULTIBYTE) {
 424      return mb_strtoupper($text);
 425    }
 426    else {
 427      // Use C-locale for ASCII-only uppercase
 428      $text = strtoupper($text);
 429      // Case flip Latin-1 accented letters
 430      $text = preg_replace_callback('/\xC3[\xA0-\xB6\xB8-\xBE]/', '_unicode_caseflip', $text);
 431      return $text;
 432    }
 433  }
 434  
 435  /**
 436   * Lowercase a UTF-8 string.
 437   */
 438  function drupal_strtolower($text) {
 439    global $multibyte;
 440    if ($multibyte == UNICODE_MULTIBYTE) {
 441      return mb_strtolower($text);
 442    }
 443    else {
 444      // Use C-locale for ASCII-only lowercase
 445      $text = strtolower($text);
 446      // Case flip Latin-1 accented letters
 447      $text = preg_replace_callback('/\xC3[\x80-\x96\x98-\x9E]/', '_unicode_caseflip', $text);
 448      return $text;
 449    }
 450  }
 451  
 452  /**
 453   * Helper function for case conversion of Latin-1.
 454   * Used for flipping U+C0-U+DE to U+E0-U+FD and back.
 455   */
 456  function _unicode_caseflip($matches) {
 457    return $matches[0][0] . chr(ord($matches[0][1]) ^ 32);
 458  }
 459  
 460  /**
 461   * Capitalize the first letter of a UTF-8 string.
 462   */
 463  function drupal_ucfirst($text) {
 464    // Note: no mbstring equivalent!
 465    return drupal_strtoupper(drupal_substr($text, 0, 1)) . drupal_substr($text, 1);
 466  }
 467  
 468  /**
 469   * Cut off a piece of a string based on character indices and counts. Follows
 470   * the same behavior as PHP's own substr() function.
 471   *
 472   * Note that for cutting off a string at a known character/substring
 473   * location, the usage of PHP's normal strpos/substr is safe and
 474   * much faster.
 475   */
 476  function drupal_substr($text, $start, $length = NULL) {
 477    global $multibyte;
 478    if ($multibyte == UNICODE_MULTIBYTE) {
 479      return $length === NULL ? mb_substr($text, $start) : mb_substr($text, $start, $length);
 480    }
 481    else {
 482      $strlen = strlen($text);
 483      // Find the starting byte offset
 484      $bytes = 0;
 485      if ($start > 0) {
 486        // Count all the continuation bytes from the start until we have found
 487        // $start characters
 488        $bytes = -1; $chars = -1;
 489        while ($bytes < $strlen && $chars < $start) {
 490          $bytes++;
 491          $c = ord($text[$bytes]);
 492          if ($c < 0x80 || $c >= 0xC0) {
 493            $chars++;
 494          }
 495        }
 496      }
 497      else if ($start < 0) {
 498        // Count all the continuation bytes from the end until we have found
 499        // abs($start) characters
 500        $start = abs($start);
 501        $bytes = $strlen; $chars = 0;
 502        while ($bytes > 0 && $chars < $start) {
 503          $bytes--;
 504          $c = ord($text[$bytes]);
 505          if ($c < 0x80 || $c >= 0xC0) {
 506            $chars++;
 507          }
 508        }
 509      }
 510      $istart = $bytes;
 511  
 512      // Find the ending byte offset
 513      if ($length === NULL) {
 514        $bytes = $strlen - 1;
 515      }
 516      else if ($length > 0) {
 517        // Count all the continuation bytes from the starting index until we have
 518        // found $length + 1 characters. Then backtrack one byte.
 519        $bytes = $istart; $chars = 0;
 520        while ($bytes < $strlen && $chars < $length) {
 521          $bytes++;
 522          $c = ord($text[$bytes]);
 523          if ($c < 0x80 || $c >= 0xC0) {
 524            $chars++;
 525          }
 526        }
 527        $bytes--;
 528      }
 529      else if ($length < 0) {
 530        // Count all the continuation bytes from the end until we have found
 531        // abs($length) characters
 532        $length = abs($length);
 533        $bytes = $strlen - 1; $chars = 0;
 534        while ($bytes >= 0 && $chars < $length) {
 535          $c = ord($text[$bytes]);
 536          if ($c < 0x80 || $c >= 0xC0) {
 537            $chars++;
 538          }
 539          $bytes--;
 540        }
 541      }
 542      $iend = $bytes;
 543  
 544      return substr($text, $istart, max(0, $iend - $istart + 1));
 545    }
 546  }
 547  
 548
PHP Cross Reference of Drupal 6 (gatewave)

/includes/ -> unicode.inc (source)