PHPXRef 0.7.1 : Unnamed Project : /includes/utf/utf

[Summary view] [Print] [Text view]
   1  <?php
   2  /**
   3  *
   4  * @package utf
   5  * @version $Id$
   6  * @copyright (c) 2005 phpBB Group
   7  * @license http://opensource.org/licenses/gpl-license.php GNU Public License
   8  *
   9  */
  10  
  11  /**
  12  */
  13  if (!defined('IN_PHPBB'))
  14  {
  15      exit;
  16  }
  17  
  18  /**
  19  * Some Unicode characters encoded in UTF-8
  20  *
  21  * Preserved for compatibility
  22  */
  23  define('UTF8_REPLACEMENT', "\xEF\xBF\xBD");
  24  define('UTF8_MAX', "\xF4\x8F\xBF\xBF");
  25  define('UTF8_FFFE', "\xEF\xBF\xBE");
  26  define('UTF8_FFFF', "\xEF\xBF\xBF");
  27  define('UTF8_SURROGATE_FIRST', "\xED\xA0\x80");
  28  define('UTF8_SURROGATE_LAST', "\xED\xBF\xBF");
  29  define('UTF8_HANGUL_FIRST', "\xEA\xB0\x80");
  30  define('UTF8_HANGUL_LAST', "\xED\x9E\xA3");
  31  
  32  define('UTF8_CJK_FIRST', "\xE4\xB8\x80");
  33  define('UTF8_CJK_LAST', "\xE9\xBE\xBB");
  34  define('UTF8_CJK_B_FIRST', "\xF0\xA0\x80\x80");
  35  define('UTF8_CJK_B_LAST', "\xF0\xAA\x9B\x96");
  36  
  37  // Unset global variables
  38  unset($GLOBALS['utf_jamo_index'], $GLOBALS['utf_jamo_type'], $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_combining_class'], $GLOBALS['utf_canonical_comp'], $GLOBALS['utf_canonical_decomp'], $GLOBALS['utf_nfkc_qc'], $GLOBALS['utf_compatibility_decomp']);
  39  
  40  // NFC_QC and NFKC_QC values
  41  define('UNICODE_QC_MAYBE', 0);
  42  define('UNICODE_QC_NO', 1);
  43  
  44  // Contains all the ASCII characters appearing in UTF-8, sorted by frequency
  45  define('UTF8_ASCII_RANGE', "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F");
  46  
  47  // Contains all the tail bytes that can appear in the composition of a UTF-8 char
  48  define('UTF8_TRAILING_BYTES', "\xA9\xA0\xA8\x80\xAA\x99\xA7\xBB\xAB\x89\x94\x82\xB4\xA2\xAE\x83\xB0\xB9\xB8\x93\xAF\xBC\xB3\x81\xA4\xB2\x9C\xA1\xB5\xBE\xBD\xBA\x98\xAD\xB1\x84\x95\xA6\xB6\x88\x8D\x90\xB7\xBF\x92\x85\xA5\x97\x8C\x86\xA3\x8E\x9F\x8F\x87\x91\x9D\xAC\x9E\x8B\x96\x9B\x8A\x9A");
  49  
  50  // Constants used by the Hangul [de]composition algorithms
  51  define('UNICODE_HANGUL_SBASE', 0xAC00);
  52  define('UNICODE_HANGUL_LBASE', 0x1100);
  53  define('UNICODE_HANGUL_VBASE', 0x1161);
  54  define('UNICODE_HANGUL_TBASE', 0x11A7);
  55  define('UNICODE_HANGUL_SCOUNT', 11172);
  56  define('UNICODE_HANGUL_LCOUNT', 19);
  57  define('UNICODE_HANGUL_VCOUNT', 21);
  58  define('UNICODE_HANGUL_TCOUNT', 28);
  59  define('UNICODE_HANGUL_NCOUNT', 588);
  60  define('UNICODE_JAMO_L', 0);
  61  define('UNICODE_JAMO_V', 1);
  62  define('UNICODE_JAMO_T', 2);
  63  
  64  /**
  65  * Unicode normalization routines
  66  *
  67  * @package utf
  68  */
  69  class utf_normalizer
  70  {
  71      /**
  72      * Validate, cleanup and normalize a string
  73      *
  74      * The ultimate convenience function! Clean up invalid UTF-8 sequences,
  75      * and convert to Normal Form C, canonical composition.
  76      *
  77      * @param    string    &$str    The dirty string
  78      * @return    string            The same string, all shiny and cleaned-up
  79      */
  80  	function cleanup(&$str)
  81      {
  82          // The string below is the list of all autorized characters, sorted by frequency in latin text
  83          $pos = strspn($str, "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x0D");
  84          $len = strlen($str);
  85  
  86          if ($pos == $len)
  87          {
  88              // ASCII strings with no special chars return immediately
  89              return;
  90          }
  91  
  92          // Note: we do not check for $GLOBALS['utf_canonical_decomp']. It is assumed they are always loaded together
  93          if (!isset($GLOBALS['utf_nfc_qc']))
  94          {
  95              global $phpbb_root_path, $phpEx;
  96              include($phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx);
  97          }
  98  
  99          if (!isset($GLOBALS['utf_canonical_decomp']))
 100          {
 101              global $phpbb_root_path, $phpEx;
 102              include($phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx);
 103          }
 104  
 105          // Replace any byte in the range 0x00..0x1F, except for \r, \n and \t
 106          // We replace those characters with a 0xFF byte, which is illegal in UTF-8 and will in turn be replaced with a UTF replacement char
 107          $str = strtr(
 108              $str,
 109              "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F",
 110              "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"
 111          );
 112  
 113          $str = utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);
 114      }
 115  
 116      /**
 117      * Validate and normalize a UTF string to NFC
 118      *
 119      * @param    string    &$str    Unchecked UTF string
 120      * @return    string            The string, validated and in normal form
 121      */
 122  	function nfc(&$str)
 123      {
 124          $pos = strspn($str, UTF8_ASCII_RANGE);
 125          $len = strlen($str);
 126  
 127          if ($pos == $len)
 128          {
 129              // ASCII strings return immediately
 130              return;
 131          }
 132  
 133          if (!isset($GLOBALS['utf_nfc_qc']))
 134          {
 135              global $phpbb_root_path, $phpEx;
 136              include($phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx);
 137          }
 138  
 139          if (!isset($GLOBALS['utf_canonical_decomp']))
 140          {
 141              global $phpbb_root_path, $phpEx;
 142              include($phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx);
 143          }
 144  
 145          $str = utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);
 146      }
 147  
 148      /**
 149      * Validate and normalize a UTF string to NFKC
 150      *
 151      * @param    string    &$str    Unchecked UTF string
 152      * @return    string            The string, validated and in normal form
 153      */
 154  	function nfkc(&$str)
 155      {
 156          $pos = strspn($str, UTF8_ASCII_RANGE);
 157          $len = strlen($str);
 158  
 159          if ($pos == $len)
 160          {
 161              // ASCII strings return immediately
 162              return;
 163          }
 164  
 165          if (!isset($GLOBALS['utf_nfkc_qc']))
 166          {
 167              global $phpbb_root_path, $phpEx;
 168              include($phpbb_root_path . 'includes/utf/data/utf_nfkc_qc.' . $phpEx);
 169          }
 170  
 171          if (!isset($GLOBALS['utf_compatibility_decomp']))
 172          {
 173              global $phpbb_root_path, $phpEx;
 174              include($phpbb_root_path . 'includes/utf/data/utf_compatibility_decomp.' . $phpEx);
 175          }
 176  
 177          $str = utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfkc_qc'], $GLOBALS['utf_compatibility_decomp']);
 178      }
 179  
 180      /**
 181      * Validate and normalize a UTF string to NFD
 182      *
 183      * @param    string    &$str    Unchecked UTF string
 184      * @return    string            The string, validated and in normal form
 185      */
 186  	function nfd(&$str)
 187      {
 188          $pos = strspn($str, UTF8_ASCII_RANGE);
 189          $len = strlen($str);
 190  
 191          if ($pos == $len)
 192          {
 193              // ASCII strings return immediately
 194              return;
 195          }
 196  
 197          if (!isset($GLOBALS['utf_canonical_decomp']))
 198          {
 199              global $phpbb_root_path, $phpEx;
 200              include($phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx);
 201          }
 202  
 203          $str = utf_normalizer::decompose($str, $pos, $len, $GLOBALS['utf_canonical_decomp']);
 204      }
 205  
 206      /**
 207      * Validate and normalize a UTF string to NFKD
 208      *
 209      * @param    string    &$str    Unchecked UTF string
 210      * @return    string            The string, validated and in normal form
 211      */
 212  	function nfkd(&$str)
 213      {
 214          $pos = strspn($str, UTF8_ASCII_RANGE);
 215          $len = strlen($str);
 216  
 217          if ($pos == $len)
 218          {
 219              // ASCII strings return immediately
 220              return;
 221          }
 222  
 223          if (!isset($GLOBALS['utf_compatibility_decomp']))
 224          {
 225              global $phpbb_root_path, $phpEx;
 226              include($phpbb_root_path . 'includes/utf/data/utf_compatibility_decomp.' . $phpEx);
 227          }
 228  
 229          $str = utf_normalizer::decompose($str, $pos, $len, $GLOBALS['utf_compatibility_decomp']);
 230      }
 231  
 232  
 233      /**
 234      * Recompose a UTF string
 235      *
 236      * @param    string    $str            Unchecked UTF string
 237      * @param    integer    $pos            Position of the first UTF char (in bytes)
 238      * @param    integer    $len            Length of the string (in bytes)
 239      * @param    array    &$qc            Quick-check array, passed by reference but never modified
 240      * @param    array    &$decomp_map    Decomposition mapping, passed by reference but never modified
 241      * @return    string                    The string, validated and recomposed
 242      *
 243      * @access    private
 244      */
 245  	function recompose($str, $pos, $len, &$qc, &$decomp_map)
 246      {
 247          global $utf_combining_class, $utf_canonical_comp, $utf_jamo_type, $utf_jamo_index;
 248  
 249          // Load some commonly-used tables
 250          if (!isset($utf_jamo_index, $utf_jamo_type, $utf_combining_class))
 251          {
 252              global $phpbb_root_path, $phpEx;
 253              include($phpbb_root_path . 'includes/utf/data/utf_normalizer_common.' . $phpEx);
 254          }
 255  
 256          // Load the canonical composition table
 257          if (!isset($utf_canonical_comp))
 258          {
 259              global $phpbb_root_path, $phpEx;
 260              include($phpbb_root_path . 'includes/utf/data/utf_canonical_comp.' . $phpEx);
 261          }
 262  
 263          // Buffer the last ASCII char before the UTF-8 stuff if applicable
 264          $tmp = '';
 265          $i = $tmp_pos = $last_cc = 0;
 266  
 267          $buffer = ($pos) ? array(++$i => $str[$pos - 1]) : array();
 268  
 269          // UTF char length array
 270          // This array is used to determine the length of a UTF character.
 271          // Be $c the result of ($str[$pos] & "\xF0") --where $str is the string we're operating on and $pos
 272          // the position of the cursor--, if $utf_len_mask[$c] does not exist, the byte is an ASCII char.
 273          // Otherwise, if $utf_len_mask[$c] is greater than 0, we have a the leading byte of a multibyte character
 274          // whose length is $utf_len_mask[$c] and if it is equal to 0, the byte is a trailing byte.
 275          $utf_len_mask = array(
 276              // Leading bytes masks
 277              "\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4,
 278              // Trailing bytes masks
 279              "\x80" => 0, "\x90" => 0, "\xA0" => 0, "\xB0" => 0
 280          );
 281  
 282          $extra_check = array(
 283              "\xED" => 1, "\xEF" => 1, "\xC0" => 1, "\xC1" => 1, "\xE0" => 1, "\xF0" => 1,
 284              "\xF4" => 1, "\xF5" => 1, "\xF6" => 1, "\xF7" => 1, "\xF8" => 1, "\xF9" => 1,
 285              "\xFA" => 1, "\xFB" => 1, "\xFC" => 1, "\xFD" => 1, "\xFE" => 1, "\xFF" => 1
 286          );
 287  
 288          $utf_validation_mask = array(
 289              2    => "\xE0\xC0",
 290              3    => "\xF0\xC0\xC0",
 291              4    => "\xF8\xC0\xC0\xC0"
 292          );
 293  
 294          $utf_validation_check = array(
 295              2    => "\xC0\x80",
 296              3    => "\xE0\x80\x80",
 297              4    => "\xF0\x80\x80\x80"
 298          );
 299  
 300          // Main loop
 301          do
 302          {
 303              // STEP 0: Capture the current char and buffer it
 304              $c = $str[$pos];
 305              $c_mask = $c & "\xF0";
 306  
 307              if (isset($utf_len_mask[$c_mask]))
 308              {
 309                  // Byte at $pos is either a leading byte or a missplaced trailing byte
 310                  if ($utf_len = $utf_len_mask[$c_mask])
 311                  {
 312                      // Capture the char
 313                      $buffer[++$i & 7] = $utf_char = substr($str, $pos, $utf_len);
 314  
 315                      // Let's find out if a thorough check is needed
 316                      if (isset($qc[$utf_char]))
 317                      {
 318                          // If the UTF char is in the qc array then it may not be in normal form. We do nothing here, the actual processing is below this "if" block
 319                      }
 320                      else if (isset($utf_combining_class[$utf_char]))
 321                      {
 322                          if ($utf_combining_class[$utf_char] < $last_cc)
 323                          {
 324                              // A combining character that is NOT canonically ordered
 325                          }
 326                          else
 327                          {
 328                              // A combining character that IS canonically ordered, skip to the next char
 329                              $last_cc = $utf_combining_class[$utf_char];
 330  
 331                              $pos += $utf_len;
 332                              continue;
 333                          }
 334                      }
 335                      else
 336                      {
 337                          // At this point, $utf_char holds a UTF char that we know is not a NF[K]C_QC and is not a combining character.
 338                          // It can be a singleton, a canonical composite, a replacement char or an even an ill-formed bunch of bytes. Let's find out
 339                          $last_cc = 0;
 340  
 341                          // Check that we have the correct number of trailing bytes
 342                          if (($utf_char & $utf_validation_mask[$utf_len]) != $utf_validation_check[$utf_len])
 343                          {
 344                              // Current char isn't well-formed or legal: either one or several trailing bytes are missing, or the Unicode char
 345                              // has been encoded in a five- or six- byte sequence
 346                              if ($utf_char[0] >= "\xF8")
 347                              {
 348                                  if ($utf_char[0] < "\xFC")
 349                                  {
 350                                      $trailing_bytes = 4;
 351                                  }
 352                                  else if ($utf_char[0] > "\xFD")
 353                                  {
 354                                      $trailing_bytes = 0;
 355                                  }
 356                                  else
 357                                  {
 358                                      $trailing_bytes = 5;
 359                                  }
 360                              }
 361                              else
 362                              {
 363                                  $trailing_bytes = $utf_len - 1;
 364                              }
 365  
 366                              $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
 367                              $pos += strspn($str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes);
 368                              $tmp_pos = $pos;
 369  
 370                              continue;
 371                          }
 372  
 373                          if (isset($extra_check[$c]))
 374                          {
 375                              switch ($c)
 376                              {
 377                                  // Note: 0xED is quite common in Korean
 378                                  case "\xED":
 379                                      if ($utf_char >= "\xED\xA0\x80")
 380                                      {
 381                                          // Surrogates (U+D800..U+DFFF) are not allowed in UTF-8 (UTF sequence 0xEDA080..0xEDBFBF)
 382                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
 383                                          $pos += $utf_len;
 384                                          $tmp_pos = $pos;
 385                                          continue 2;
 386                                      }
 387                                  break;
 388  
 389                                  // Note: 0xEF is quite common in Japanese
 390                                  case "\xEF":
 391                                      if ($utf_char == "\xEF\xBF\xBE" || $utf_char == "\xEF\xBF\xBF")
 392                                      {
 393                                          // U+FFFE and U+FFFF are explicitly disallowed (UTF sequence 0xEFBFBE..0xEFBFBF)
 394                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
 395                                          $pos += $utf_len;
 396                                          $tmp_pos = $pos;
 397                                          continue 2;
 398                                      }
 399                                  break;
 400  
 401                                  case "\xC0":
 402                                  case "\xC1":
 403                                      if ($utf_char <= "\xC1\xBF")
 404                                      {
 405                                          // Overlong sequence: Unicode char U+0000..U+007F encoded as a double-byte UTF char
 406                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
 407                                          $pos += $utf_len;
 408                                          $tmp_pos = $pos;
 409                                          continue 2;
 410                                      }
 411                                  break;
 412  
 413                                  case "\xE0":
 414                                      if ($utf_char <= "\xE0\x9F\xBF")
 415                                      {
 416                                          // Unicode char U+0000..U+07FF encoded in 3 bytes
 417                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
 418                                          $pos += $utf_len;
 419                                          $tmp_pos = $pos;
 420                                          continue 2;
 421                                      }
 422                                  break;
 423  
 424                                  case "\xF0":
 425                                      if ($utf_char <= "\xF0\x8F\xBF\xBF")
 426                                      {
 427                                          // Unicode char U+0000..U+FFFF encoded in 4 bytes
 428                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
 429                                          $pos += $utf_len;
 430                                          $tmp_pos = $pos;
 431                                          continue 2;
 432                                      }
 433                                  break;
 434  
 435                                  default:
 436                                      // Five- and six- byte sequences do not need being checked for here anymore
 437                                      if ($utf_char > UTF8_MAX)
 438                                      {
 439                                          // Out of the Unicode range
 440                                          if ($utf_char[0] < "\xF8")
 441                                          {
 442                                              $trailing_bytes = 3;
 443                                          }
 444                                          else if ($utf_char[0] < "\xFC")
 445                                          {
 446                                              $trailing_bytes = 4;
 447                                          }
 448                                          else if ($utf_char[0] > "\xFD")
 449                                          {
 450                                              $trailing_bytes = 0;
 451                                          }
 452                                          else
 453                                          {
 454                                              $trailing_bytes = 5;
 455                                          }
 456  
 457                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
 458                                          $pos += strspn($str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes);
 459                                          $tmp_pos = $pos;
 460                                          continue 2;
 461                                      }
 462                                  break;
 463                              }
 464                          }
 465  
 466                          // The char is a valid starter, move the cursor and go on
 467                          $pos += $utf_len;
 468                          continue;
 469                      }
 470                  }
 471                  else
 472                  {
 473                      // A trailing byte came out of nowhere, we will advance the cursor and treat the this byte and all following trailing bytes as if
 474                      // each of them was a Unicode replacement char
 475                      $spn = strspn($str, UTF8_TRAILING_BYTES, $pos);
 476                      $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . str_repeat(UTF8_REPLACEMENT, $spn);
 477  
 478                      $pos += $spn;
 479                      $tmp_pos = $pos;
 480                      continue;
 481                  }
 482  
 483  
 484                  // STEP 1: Decompose current char
 485  
 486                  // We have found a character that is either:
 487                  //  - in the NFC_QC/NFKC_QC list
 488                  //  - a non-starter char that is not canonically ordered
 489                  //
 490                  // We are going to capture the shortest UTF sequence that satisfies these two conditions:
 491                  //
 492                  //  1 - If the sequence does not start at the begginning of the string, it must begin with a starter,
 493                  // and that starter must not have the NF[K]C_QC property equal to "MAYBE"
 494                  //
 495                  //  2 - If the sequence does not end at the end of the string, it must end with a non-starter and be
 496                  // immediately followed by a starter that is not on the QC list
 497                  //
 498                  $utf_seq = array();
 499                  $last_cc = 0;
 500                  $lpos = $pos;
 501                  $pos += $utf_len;
 502  
 503                  if (isset($decomp_map[$utf_char]))
 504                  {
 505                      $_pos = 0;
 506                      $_len = strlen($decomp_map[$utf_char]);
 507  
 508                      do
 509                      {
 510                          $_utf_len =& $utf_len_mask[$decomp_map[$utf_char][$_pos] & "\xF0"];
 511  
 512                          if (isset($_utf_len))
 513                          {
 514                              $utf_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
 515                              $_pos += $_utf_len;
 516                          }
 517                          else
 518                          {
 519                              $utf_seq[] = $decomp_map[$utf_char][$_pos];
 520                              ++$_pos;
 521                          }
 522                      }
 523                      while ($_pos < $_len);
 524                  }
 525                  else
 526                  {
 527                      // The char is not decomposable
 528                      $utf_seq = array($utf_char);
 529                  }
 530  
 531  
 532                  // STEP 2: Capture the starter
 533  
 534                  // Check out the combining class of the first character of the UTF sequence
 535                  $k = 0;
 536                  if (isset($utf_combining_class[$utf_seq[0]]) || $qc[$utf_char] == UNICODE_QC_MAYBE)
 537                  {
 538                      // Not a starter, inspect previous characters
 539                      // The last 8 characters are kept in a buffer so that we don't have to capture them everytime.
 540                      // This is enough for all real-life strings but even if it wasn't, we can capture characters in backward mode,
 541                      // although it is slower than this method.
 542                      //
 543                      // In the following loop, $j starts at the previous buffered character ($i - 1, because current character is
 544                      // at offset $i) and process them in backward mode until we find a starter.
 545                      //
 546                      // $k is the index on each UTF character inside of our UTF sequence. At this time, $utf_seq contains one or more
 547                      // characters numbered 0 to n. $k starts at 0 and for each char we prepend we pre-decrement it and for numbering
 548                      $starter_found = 0;
 549                      $j_min = max(1, $i - 7);
 550  
 551                      for ($j = $i - 1; $j >= $j_min && $lpos > $tmp_pos; --$j)
 552                      {
 553                          $utf_char = $buffer[$j & 7];
 554                          $lpos -= strlen($utf_char);
 555  
 556                          if (isset($decomp_map[$utf_char]))
 557                          {
 558                              // The char is a composite, decompose for storage
 559                              $decomp_seq = array();
 560                              $_pos = 0;
 561                              $_len = strlen($decomp_map[$utf_char]);
 562  
 563                              do
 564                              {
 565                                  $c = $decomp_map[$utf_char][$_pos];
 566                                  $_utf_len =& $utf_len_mask[$c & "\xF0"];
 567  
 568                                  if (isset($_utf_len))
 569                                  {
 570                                      $decomp_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
 571                                      $_pos += $_utf_len;
 572                                  }
 573                                  else
 574                                  {
 575                                      $decomp_seq[] = $c;
 576                                      ++$_pos;
 577                                  }
 578                              }
 579                              while ($_pos < $_len);
 580  
 581                              // Prepend the UTF sequence with our decomposed sequence
 582                              if (isset($decomp_seq[1]))
 583                              {
 584                                  // The char expanded into several chars
 585                                  $decomp_cnt = sizeof($decomp_seq);
 586  
 587                                  foreach ($decomp_seq as $decomp_i => $decomp_char)
 588                                  {
 589                                      $utf_seq[$k + $decomp_i - $decomp_cnt] = $decomp_char;
 590                                  }
 591                                  $k -= $decomp_cnt;
 592                              }
 593                              else
 594                              {
 595                                  // Decomposed to a single char, easier to prepend
 596                                  $utf_seq[--$k] = $decomp_seq[0];
 597                              }
 598                          }
 599                          else
 600                          {
 601                              $utf_seq[--$k] = $utf_char;
 602                          }
 603  
 604                          if (!isset($utf_combining_class[$utf_seq[$k]]))
 605                          {
 606                              // We have found our starter
 607                              $starter_found = 1;
 608                              break;
 609                          }
 610                      }
 611  
 612                      if (!$starter_found && $lpos > $tmp_pos)
 613                      {
 614                          // The starter was not found in the buffer, let's rewind some more
 615                          do
 616                          {
 617                              // $utf_len_mask contains the masks of both leading bytes and trailing bytes. If $utf_en > 0 then it's a leading byte, otherwise it's a trailing byte.
 618                              $c = $str[--$lpos];
 619                              $c_mask = $c & "\xF0";
 620  
 621                              if (isset($utf_len_mask[$c_mask]))
 622                              {
 623                                  // UTF byte
 624                                  if ($utf_len = $utf_len_mask[$c_mask])
 625                                  {
 626                                      // UTF *leading* byte
 627                                      $utf_char = substr($str, $lpos, $utf_len);
 628  
 629                                      if (isset($decomp_map[$utf_char]))
 630                                      {
 631                                          // Decompose the character
 632                                          $decomp_seq = array();
 633                                          $_pos = 0;
 634                                          $_len = strlen($decomp_map[$utf_char]);
 635  
 636                                          do
 637                                          {
 638                                              $c = $decomp_map[$utf_char][$_pos];
 639                                              $_utf_len =& $utf_len_mask[$c & "\xF0"];
 640  
 641                                              if (isset($_utf_len))
 642                                              {
 643                                                  $decomp_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
 644                                                  $_pos += $_utf_len;
 645                                              }
 646                                              else
 647                                              {
 648                                                  $decomp_seq[] = $c;
 649                                                  ++$_pos;
 650                                              }
 651                                          }
 652                                          while ($_pos < $_len);
 653  
 654                                          // Prepend the UTF sequence with our decomposed sequence
 655                                          if (isset($decomp_seq[1]))
 656                                          {
 657                                              // The char expanded into several chars
 658                                              $decomp_cnt = sizeof($decomp_seq);
 659                                              foreach ($decomp_seq as $decomp_i => $utf_char)
 660                                              {
 661                                                  $utf_seq[$k + $decomp_i - $decomp_cnt] = $utf_char;
 662                                              }
 663                                              $k -= $decomp_cnt;
 664                                          }
 665                                          else
 666                                          {
 667                                              // Decomposed to a single char, easier to prepend
 668                                              $utf_seq[--$k] = $decomp_seq[0];
 669                                          }
 670                                      }
 671                                      else
 672                                      {
 673                                          $utf_seq[--$k] = $utf_char;
 674                                      }
 675                                  }
 676                              }
 677                              else
 678                              {
 679                                  // ASCII char
 680                                  $utf_seq[--$k] = $c;
 681                              }
 682                          }
 683                          while ($lpos > $tmp_pos);
 684                      }
 685                  }
 686  
 687  
 688                  // STEP 3: Capture following combining modifiers
 689  
 690                  while ($pos < $len)
 691                  {
 692                      $c_mask = $str[$pos] & "\xF0";
 693  
 694                      if (isset($utf_len_mask[$c_mask]))
 695                      {
 696                          if ($utf_len = $utf_len_mask[$c_mask])
 697                          {
 698                              $utf_char = substr($str, $pos, $utf_len);
 699                          }
 700                          else
 701                          {
 702                              // A trailing byte came out of nowhere
 703                              // Trailing bytes are replaced with Unicode replacement chars, we will just ignore it for now, break out of the loop
 704                              // as if it was a starter (replacement chars ARE starters) and let the next loop replace it
 705                              break;
 706                          }
 707  
 708                          if (isset($utf_combining_class[$utf_char]) || isset($qc[$utf_char]))
 709                          {
 710                              // Combining character, add it to the sequence and move the cursor
 711                              if (isset($decomp_map[$utf_char]))
 712                              {
 713                                  // Decompose the character
 714                                  $_pos = 0;
 715                                  $_len = strlen($decomp_map[$utf_char]);
 716  
 717                                  do
 718                                  {
 719                                      $c = $decomp_map[$utf_char][$_pos];
 720                                      $_utf_len =& $utf_len_mask[$c & "\xF0"];
 721  
 722                                      if (isset($_utf_len))
 723                                      {
 724                                          $utf_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
 725                                          $_pos += $_utf_len;
 726                                      }
 727                                      else
 728                                      {
 729                                          $utf_seq[] = $c;
 730                                          ++$_pos;
 731                                      }
 732                                  }
 733                                  while ($_pos < $_len);
 734                              }
 735                              else
 736                              {
 737                                  $utf_seq[] = $utf_char;
 738                              }
 739  
 740                              $pos += $utf_len;
 741                          }
 742                          else
 743                          {
 744                              // Combining class 0 and no QC, break out of the loop
 745                              // Note: we do not know if that character is valid. If it's not, the next iteration will replace it
 746                              break;
 747                          }
 748                      }
 749                      else
 750                      {
 751                          // ASCII chars are starters
 752                          break;
 753                      }
 754                  }
 755  
 756  
 757                  // STEP 4: Sort and combine
 758  
 759                  // Here we sort...
 760                  $k_max = $k + sizeof($utf_seq);
 761  
 762                  if (!$k && $k_max == 1)
 763                  {
 764                      // There is only one char in the UTF sequence, add it then jump to the next iteration of main loop
 765                          // Note: the two commented lines below can be enabled under PHP5 for a very small performance gain in most cases
 766  //                        if (substr_compare($str, $utf_seq[0], $lpos, $pos - $lpos))
 767  //                        {
 768                          $tmp .= substr($str, $tmp_pos, $lpos - $tmp_pos) . $utf_seq[0];
 769                          $tmp_pos = $pos;
 770  //                        }
 771  
 772                      continue;
 773                  }
 774  
 775                  // ...there we combine
 776                  if (isset($utf_combining_class[$utf_seq[$k]]))
 777                  {
 778                      $starter = $nf_seq = '';
 779                  }
 780                  else
 781                  {
 782                      $starter = $utf_seq[$k++];
 783                      $nf_seq = '';
 784                  }
 785                  $utf_sort = array();
 786  
 787                  // We add an empty char at the end of the UTF char sequence. It will act as a starter and trigger the sort/combine routine
 788                  // at the end of the string without altering it
 789                  $utf_seq[] = '';
 790  
 791                  do
 792                  {
 793                      $utf_char = $utf_seq[$k++];
 794  
 795                      if (isset($utf_combining_class[$utf_char]))
 796                      {
 797                          $utf_sort[$utf_combining_class[$utf_char]][] = $utf_char;
 798                      }
 799                      else
 800                      {
 801                          if (empty($utf_sort))
 802                          {
 803                              // No combining characters... check for a composite of the two starters
 804                              if (isset($utf_canonical_comp[$starter . $utf_char]))
 805                              {
 806                                  // Good ol' composite character
 807                                  $starter = $utf_canonical_comp[$starter . $utf_char];
 808                              }
 809                              else if (isset($utf_jamo_type[$utf_char]))
 810                              {
 811                                  // Current char is a composable jamo
 812                                  if (isset($utf_jamo_type[$starter]) && $utf_jamo_type[$starter] == UNICODE_JAMO_L && $utf_jamo_type[$utf_char] == UNICODE_JAMO_V)
 813                                  {
 814                                      // We have a L jamo followed by a V jamo, we are going to prefetch the next char to see if it's a T jamo
 815                                      if (isset($utf_jamo_type[$utf_seq[$k]]) && $utf_jamo_type[$utf_seq[$k]] == UNICODE_JAMO_T)
 816                                      {
 817                                          // L+V+T jamos, combine to a LVT Hangul syllable ($k is incremented)
 818                                          $cp = $utf_jamo_index[$starter] + $utf_jamo_index[$utf_char] + $utf_jamo_index[$utf_seq[$k]];
 819                                          ++$k;
 820                                      }
 821                                      else
 822                                      {
 823                                          // L+V jamos, combine to a LV Hangul syllable
 824                                          $cp = $utf_jamo_index[$starter] + $utf_jamo_index[$utf_char];
 825                                      }
 826  
 827                                      $starter = chr(0xE0 | ($cp >> 12)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
 828                                  }
 829                                  else
 830                                  {
 831                                      // Non-composable jamo, just add it to the sequence
 832                                      $nf_seq .= $starter;
 833                                      $starter = $utf_char;
 834                                  }
 835                              }
 836                              else
 837                              {
 838                                  // No composite, just add the first starter to the sequence then continue with the other one
 839                                  $nf_seq .= $starter;
 840                                  $starter = $utf_char;
 841                              }
 842                          }
 843                          else
 844                          {
 845                              ksort($utf_sort);
 846  
 847                              // For each class of combining characters
 848                              foreach ($utf_sort as $cc => $utf_chars)
 849                              {
 850                                  $j = 0;
 851  
 852                                  do
 853                                  {
 854                                      // Look for a composite
 855                                      if (isset($utf_canonical_comp[$starter . $utf_chars[$j]]))
 856                                      {
 857                                          // Found a composite, replace the starter
 858                                          $starter = $utf_canonical_comp[$starter . $utf_chars[$j]];
 859                                          unset($utf_sort[$cc][$j]);
 860                                      }
 861                                      else
 862                                      {
 863                                          // No composite, all following characters in that class are blocked
 864                                          break;
 865                                      }
 866                                  }
 867                                  while (isset($utf_sort[$cc][++$j]));
 868                              }
 869  
 870                              // Add the starter to the normalized sequence, followed by non-starters in canonical order
 871                              $nf_seq .= $starter;
 872  
 873                              foreach ($utf_sort as $utf_chars)
 874                              {
 875                                  if (!empty($utf_chars))
 876                                  {
 877                                      $nf_seq .= implode('', $utf_chars);
 878                                  }
 879                              }
 880  
 881                              // Reset the array and go on
 882                              $utf_sort = array();
 883                              $starter = $utf_char;
 884                          }
 885                      }
 886                  }
 887                  while ($k <= $k_max);
 888  
 889                  $tmp .= substr($str, $tmp_pos, $lpos - $tmp_pos) . $nf_seq;
 890                  $tmp_pos = $pos;
 891              }
 892              else
 893              {
 894                  // Only an ASCII char can make the program get here
 895                  //
 896                  // First we skip the current byte with ++$pos, then we quickly skip following ASCII chars with strspn().
 897                  //
 898                  // The first two "if"'s here can be removed, with the consequences of being faster on latin text (lots of ASCII) and slower on
 899                  // multi-byte text (where the only ASCII chars are spaces and punctuation)
 900                  if (++$pos != $len)
 901                  {
 902                      if ($str[$pos] < "\x80")
 903                      {
 904                          $pos += strspn($str, UTF8_ASCII_RANGE, ++$pos);
 905                          $buffer[++$i & 7] = $str[$pos - 1];
 906                      }
 907                      else
 908                      {
 909                          $buffer[++$i & 7] = $c;
 910                      }
 911                  }
 912              }
 913          }
 914          while ($pos < $len);
 915  
 916          // Now is time to return the string
 917          if ($tmp_pos)
 918          {
 919              // If the $tmp_pos cursor is not at the beginning of the string then at least one character was not in normal form. Replace $str with the fixed version
 920              if ($tmp_pos == $len)
 921              {
 922                  // The $tmp_pos cursor is at the end of $str, therefore $tmp holds the whole $str
 923                  return $tmp;
 924              }
 925              else
 926              {
 927                  // The rightmost chunk of $str has not been appended to $tmp yet
 928                  return $tmp . substr($str, $tmp_pos);
 929              }
 930          }
 931  
 932          // The string was already in normal form
 933          return $str;
 934      }
 935  
 936      /**
 937      * Decompose a UTF string
 938      *
 939      * @param    string    $str            UTF string
 940      * @param    integer    $pos            Position of the first UTF char (in bytes)
 941      * @param    integer    $len            Length of the string (in bytes)
 942      * @param    array    &$decomp_map    Decomposition mapping, passed by reference but never modified
 943      * @return    string                    The string, decomposed and sorted canonically
 944      *
 945      * @access    private
 946      */
 947  	function decompose($str, $pos, $len, &$decomp_map)
 948      {
 949          global $utf_combining_class;
 950  
 951          // Load some commonly-used tables
 952          if (!isset($utf_combining_class))
 953          {
 954              global $phpbb_root_path, $phpEx;
 955              include($phpbb_root_path . 'includes/utf/data/utf_normalizer_common.' . $phpEx);
 956          }
 957  
 958          // UTF char length array
 959          $utf_len_mask = array(
 960              // Leading bytes masks
 961              "\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4,
 962              // Trailing bytes masks
 963              "\x80" => 0, "\x90" => 0, "\xA0" => 0, "\xB0" => 0
 964          );
 965  
 966          // Some extra checks are triggered on the first byte of a UTF sequence
 967          $extra_check = array(
 968              "\xED" => 1, "\xEF" => 1, "\xC0" => 1, "\xC1" => 1, "\xE0" => 1, "\xF0" => 1,
 969              "\xF4" => 1, "\xF5" => 1, "\xF6" => 1, "\xF7" => 1, "\xF8" => 1, "\xF9" => 1,
 970              "\xFA" => 1, "\xFB" => 1, "\xFC" => 1, "\xFD" => 1, "\xFE" => 1, "\xFF" => 1
 971          );
 972  
 973          // These masks are used to check if a UTF sequence is well formed. Here are the only 3 lengths we acknowledge:
 974          //   - 2-byte: 110? ???? 10?? ????
 975          //   - 3-byte: 1110 ???? 10?? ???? 10?? ????
 976          //   - 4-byte: 1111 0??? 10?? ???? 10?? ???? 10?? ????
 977          // Note that 5- and 6- byte sequences are automatically discarded
 978          $utf_validation_mask = array(
 979              2    => "\xE0\xC0",
 980              3    => "\xF0\xC0\xC0",
 981              4    => "\xF8\xC0\xC0\xC0"
 982          );
 983  
 984          $utf_validation_check = array(
 985              2    => "\xC0\x80",
 986              3    => "\xE0\x80\x80",
 987              4    => "\xF0\x80\x80\x80"
 988          );
 989  
 990          $tmp = '';
 991          $starter_pos = $pos;
 992          $tmp_pos = $last_cc = $sort = $dump = 0;
 993          $utf_sort = array();
 994  
 995  
 996          // Main loop
 997          do
 998          {
 999              // STEP 0: Capture the current char
1000  
1001              $cur_mask = $str[$pos] & "\xF0";
1002              if (isset($utf_len_mask[$cur_mask]))
1003              {
1004                  if ($utf_len = $utf_len_mask[$cur_mask])
1005                  {
1006                      // Multibyte char
1007                      $utf_char = substr($str, $pos, $utf_len);
1008                      $pos += $utf_len;
1009                  }
1010                  else
1011                  {
1012                      // A trailing byte came out of nowhere, we will treat it and all following trailing bytes as if each of them was a Unicode
1013                      // replacement char and we will advance the cursor
1014                      $spn = strspn($str, UTF8_TRAILING_BYTES, $pos);
1015  
1016                      if ($dump)
1017                      {
1018                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1019  
1020                          // Dump combiners
1021                          if (!empty($utf_sort))
1022                          {
1023                              if ($sort)
1024                              {
1025                                  ksort($utf_sort);
1026                              }
1027  
1028                              foreach ($utf_sort as $utf_chars)
1029                              {
1030                                  $tmp .= implode('', $utf_chars);
1031                              }
1032                          }
1033  
1034                          $tmp .= str_repeat(UTF8_REPLACEMENT, $spn);
1035                          $dump = $sort = 0;
1036                      }
1037                      else
1038                      {
1039                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . str_repeat(UTF8_REPLACEMENT, $spn);
1040                      }
1041  
1042                      $pos += $spn;
1043                      $tmp_pos = $starter_pos = $pos;
1044  
1045                      $utf_sort = array();
1046                      $last_cc = 0;
1047  
1048                      continue;
1049                  }
1050  
1051  
1052                  // STEP 1: Decide what to do with current char
1053  
1054                  // Now, in that order:
1055                  //  - check if that character is decomposable
1056                  //  - check if that character is a non-starter
1057                  //  - check if that character requires extra checks to be performed
1058                  if (isset($decomp_map[$utf_char]))
1059                  {
1060                      // Decompose the char
1061                      $_pos = 0;
1062                      $_len = strlen($decomp_map[$utf_char]);
1063  
1064                      do
1065                      {
1066                          $c = $decomp_map[$utf_char][$_pos];
1067                          $_utf_len =& $utf_len_mask[$c & "\xF0"];
1068  
1069                          if (isset($_utf_len))
1070                          {
1071                              $_utf_char = substr($decomp_map[$utf_char], $_pos, $_utf_len);
1072                              $_pos += $_utf_len;
1073  
1074                              if (isset($utf_combining_class[$_utf_char]))
1075                              {
1076                                  // The character decomposed to a non-starter, buffer it for sorting
1077                                  $utf_sort[$utf_combining_class[$_utf_char]][] = $_utf_char;
1078  
1079                                  if ($utf_combining_class[$_utf_char] < $last_cc)
1080                                  {
1081                                      // Not canonically ordered, will require sorting
1082                                      $sort = $dump = 1;
1083                                  }
1084                                  else
1085                                  {
1086                                      $dump = 1;
1087                                      $last_cc = $utf_combining_class[$_utf_char];
1088                                  }
1089                              }
1090                              else
1091                              {
1092                                  // This character decomposition contains a starter, dump the buffer and continue
1093                                  if ($dump)
1094                                  {
1095                                      $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1096  
1097                                      // Dump combiners
1098                                      if (!empty($utf_sort))
1099                                      {
1100                                          if ($sort)
1101                                          {
1102                                              ksort($utf_sort);
1103                                          }
1104  
1105                                          foreach ($utf_sort as $utf_chars)
1106                                          {
1107                                              $tmp .= implode('', $utf_chars);
1108                                          }
1109                                      }
1110  
1111                                      $tmp .= $_utf_char;
1112                                      $dump = $sort = 0;
1113                                  }
1114                                  else
1115                                  {
1116                                      $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos) . $_utf_char;
1117                                  }
1118  
1119                                  $tmp_pos = $starter_pos = $pos;
1120                                  $utf_sort = array();
1121                                  $last_cc = 0;
1122                              }
1123                          }
1124                          else
1125                          {
1126                              // This character decomposition contains an ASCII char, which is a starter. Dump the buffer and continue
1127                              ++$_pos;
1128  
1129                              if ($dump)
1130                              {
1131                                  $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1132  
1133                                  // Dump combiners
1134                                  if (!empty($utf_sort))
1135                                  {
1136                                      if ($sort)
1137                                      {
1138                                          ksort($utf_sort);
1139                                      }
1140  
1141                                      foreach ($utf_sort as $utf_chars)
1142                                      {
1143                                          $tmp .= implode('', $utf_chars);
1144                                      }
1145                                  }
1146  
1147                                  $tmp .= $c;
1148                                  $dump = $sort = 0;
1149                              }
1150                              else
1151                              {
1152                                  $tmp .= substr($str, $tmp_pos, $pos - $utf_len - $tmp_pos) . $c;
1153                              }
1154  
1155                              $tmp_pos = $starter_pos = $pos;
1156                              $utf_sort = array();
1157                              $last_cc = 0;
1158                          }
1159                      }
1160                      while ($_pos < $_len);
1161                  }
1162                  else if (isset($utf_combining_class[$utf_char]))
1163                  {
1164                      // Combining character
1165                      if ($utf_combining_class[$utf_char] < $last_cc)
1166                      {
1167                          // Not in canonical order
1168                          $sort = $dump = 1;
1169                      }
1170                      else
1171                      {
1172                          $last_cc = $utf_combining_class[$utf_char];
1173                      }
1174  
1175                      $utf_sort[$utf_combining_class[$utf_char]][] = $utf_char;
1176                  }
1177                  else
1178                  {
1179                      // Non-decomposable starter, check out if it's a Hangul syllable
1180                      if ($utf_char < UTF8_HANGUL_FIRST || $utf_char > UTF8_HANGUL_LAST)
1181                      {
1182                          // Nope, regular UTF char, check that we have the correct number of trailing bytes
1183                          if (($utf_char & $utf_validation_mask[$utf_len]) != $utf_validation_check[$utf_len])
1184                          {
1185                              // Current char isn't well-formed or legal: either one or several trailing bytes are missing, or the Unicode char
1186                              // has been encoded in a five- or six- byte sequence.
1187                              // Move the cursor back to its original position then advance it to the position it should really be at
1188                              $pos -= $utf_len;
1189                              $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1190  
1191                              if (!empty($utf_sort))
1192                              {
1193                                  ksort($utf_sort);
1194  
1195                                  foreach ($utf_sort as $utf_chars)
1196                                  {
1197                                      $tmp .= implode('', $utf_chars);
1198                                  }
1199                                  $utf_sort = array();
1200                              }
1201  
1202                              // Add a replacement char then another replacement char for every trailing byte.
1203                              //
1204                              // @todo I'm not entirely sure that's how we're supposed to mark invalidated byte sequences, check this
1205                              $spn = strspn($str, UTF8_TRAILING_BYTES, ++$pos);
1206                              $tmp .= str_repeat(UTF8_REPLACEMENT, $spn + 1);
1207  
1208                              $dump = $sort = 0;
1209  
1210                              $pos += $spn;
1211                              $tmp_pos = $pos;
1212                              continue;
1213                          }
1214  
1215                          if (isset($extra_check[$utf_char[0]]))
1216                          {
1217                              switch ($utf_char[0])
1218                              {
1219                                  // Note: 0xED is quite common in Korean
1220                                  case "\xED":
1221                                      if ($utf_char >= "\xED\xA0\x80")
1222                                      {
1223                                          // Surrogates (U+D800..U+DFFF) are not allowed in UTF-8 (UTF sequence 0xEDA080..0xEDBFBF)
1224                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1225  
1226                                          if (!empty($utf_sort))
1227                                          {
1228                                              ksort($utf_sort);
1229  
1230                                              foreach ($utf_sort as $utf_chars)
1231                                              {
1232                                                  $tmp .= implode('', $utf_chars);
1233                                              }
1234                                              $utf_sort = array();
1235                                          }
1236  
1237                                          $tmp .= UTF8_REPLACEMENT;
1238                                          $dump = $sort = 0;
1239  
1240                                          $tmp_pos = $starter_pos = $pos;
1241                                          continue 2;
1242                                      }
1243                                  break;
1244  
1245                                  // Note: 0xEF is quite common in Japanese
1246                                  case "\xEF":
1247                                      if ($utf_char == "\xEF\xBF\xBE" || $utf_char == "\xEF\xBF\xBF")
1248                                      {
1249                                          // U+FFFE and U+FFFF are explicitly disallowed (UTF sequence 0xEFBFBE..0xEFBFBF)
1250                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1251  
1252                                          if (!empty($utf_sort))
1253                                          {
1254                                              ksort($utf_sort);
1255  
1256                                              foreach ($utf_sort as $utf_chars)
1257                                              {
1258                                                  $tmp .= implode('', $utf_chars);
1259                                              }
1260                                              $utf_sort = array();
1261                                          }
1262  
1263                                          $tmp .= UTF8_REPLACEMENT;
1264                                          $dump = $sort = 0;
1265  
1266                                          $tmp_pos = $starter_pos = $pos;
1267                                          continue 2;
1268                                      }
1269                                  break;
1270  
1271                                  case "\xC0":
1272                                  case "\xC1":
1273                                      if ($utf_char <= "\xC1\xBF")
1274                                      {
1275                                          // Overlong sequence: Unicode char U+0000..U+007F encoded as a double-byte UTF char
1276                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1277  
1278                                          if (!empty($utf_sort))
1279                                          {
1280                                              ksort($utf_sort);
1281  
1282                                              foreach ($utf_sort as $utf_chars)
1283                                              {
1284                                                  $tmp .= implode('', $utf_chars);
1285                                              }
1286                                              $utf_sort = array();
1287                                          }
1288  
1289                                          $tmp .= UTF8_REPLACEMENT;
1290                                          $dump = $sort = 0;
1291  
1292                                          $tmp_pos = $starter_pos = $pos;
1293                                          continue 2;
1294                                      }
1295                                  break;
1296  
1297                                  case "\xE0":
1298                                      if ($utf_char <= "\xE0\x9F\xBF")
1299                                      {
1300                                          // Overlong sequence: Unicode char U+0000..U+07FF encoded in 3 bytes
1301                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1302  
1303                                          if (!empty($utf_sort))
1304                                          {
1305                                              ksort($utf_sort);
1306  
1307                                              foreach ($utf_sort as $utf_chars)
1308                                              {
1309                                                  $tmp .= implode('', $utf_chars);
1310                                              }
1311                                              $utf_sort = array();
1312                                          }
1313  
1314                                          $tmp .= UTF8_REPLACEMENT;
1315                                          $dump = $sort = 0;
1316  
1317                                          $tmp_pos = $starter_pos = $pos;
1318                                          continue 2;
1319                                      }
1320                                  break;
1321  
1322                                  case "\xF0":
1323                                      if ($utf_char <= "\xF0\x8F\xBF\xBF")
1324                                      {
1325                                          // Overlong sequence: Unicode char U+0000..U+FFFF encoded in 4 bytes
1326                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1327  
1328                                          if (!empty($utf_sort))
1329                                          {
1330                                              ksort($utf_sort);
1331  
1332                                              foreach ($utf_sort as $utf_chars)
1333                                              {
1334                                                  $tmp .= implode('', $utf_chars);
1335                                              }
1336                                              $utf_sort = array();
1337                                          }
1338  
1339                                          $tmp .= UTF8_REPLACEMENT;
1340                                          $dump = $sort = 0;
1341  
1342                                          $tmp_pos = $starter_pos = $pos;
1343                                          continue 2;
1344                                      }
1345                                  break;
1346  
1347                                  default:
1348                                      if ($utf_char > UTF8_MAX)
1349                                      {
1350                                          // Out of the Unicode range
1351                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1352  
1353                                          if (!empty($utf_sort))
1354                                          {
1355                                              ksort($utf_sort);
1356  
1357                                              foreach ($utf_sort as $utf_chars)
1358                                              {
1359                                                  $tmp .= implode('', $utf_chars);
1360                                              }
1361                                              $utf_sort = array();
1362                                          }
1363  
1364                                          $tmp .= UTF8_REPLACEMENT;
1365                                          $dump = $sort = 0;
1366  
1367                                          $tmp_pos = $starter_pos = $pos;
1368                                          continue 2;
1369                                      }
1370                                  break;
1371                              }
1372                          }
1373                      }
1374                      else
1375                      {
1376                          // Hangul syllable
1377                          $idx = (((ord($utf_char[0]) & 0x0F) << 12) | ((ord($utf_char[1]) & 0x3F) << 6) | (ord($utf_char[2]) & 0x3F)) - UNICODE_HANGUL_SBASE;
1378  
1379                          // LIndex can only range from 0 to 18, therefore it cannot influence the first two bytes of the L Jamo, which allows us to hardcode them (based on LBase).
1380                          //
1381                          // The same goes for VIndex, but for TIndex there's a catch: the value of the third byte could exceed 0xBF and we would have to increment the second byte
1382                          if ($t_index = $idx % UNICODE_HANGUL_TCOUNT)
1383                          {
1384                              if ($t_index < 25)
1385                              {
1386                                  $utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x86\x00";
1387                                  $utf_char[8] = chr(0xA7 + $t_index);
1388                              }
1389                              else
1390                              {
1391                                  $utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x87\x00";
1392                                  $utf_char[8] = chr(0x67 + $t_index);
1393                              }
1394                          }
1395                          else
1396                          {
1397                              $utf_char = "\xE1\x84\x00\xE1\x85\x00";
1398                          }
1399  
1400                          $utf_char[2] = chr(0x80 + (int) ($idx / UNICODE_HANGUL_NCOUNT));
1401                          $utf_char[5] = chr(0xA1 + (int) (($idx % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT));
1402  
1403                          // Just like other decompositions, the resulting Jamos must be dumped to the tmp string
1404                          $dump = 1;
1405                      }
1406  
1407                      // Do we need to dump stuff to the tmp string?
1408                      if ($dump)
1409                      {
1410                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1411  
1412                          // Dump combiners
1413                          if (!empty($utf_sort))
1414                          {
1415                              if ($sort)
1416                              {
1417                                  ksort($utf_sort);
1418                              }
1419  
1420                              foreach ($utf_sort as $utf_chars)
1421                              {
1422                                  $tmp .= implode('', $utf_chars);
1423                              }
1424                          }
1425  
1426                          $tmp .= $utf_char;
1427                          $dump = $sort = 0;
1428                          $tmp_pos = $pos;
1429                      }
1430  
1431                      $last_cc = 0;
1432                      $utf_sort = array();
1433                      $starter_pos = $pos;
1434                  }
1435              }
1436              else
1437              {
1438                  // ASCII char, which happens to be a starter (as any other ASCII char)
1439                  if ($dump)
1440                  {
1441                      $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1442  
1443                      // Dump combiners
1444                      if (!empty($utf_sort))
1445                      {
1446                          if ($sort)
1447                          {
1448                              ksort($utf_sort);
1449                          }
1450  
1451                          foreach ($utf_sort as $utf_chars)
1452                          {
1453                              $tmp .= implode('', $utf_chars);
1454                          }
1455                      }
1456  
1457                      $tmp .= $str[$pos];
1458                      $dump = $sort = 0;
1459                      $tmp_pos = ++$pos;
1460  
1461                      $pos += strspn($str, UTF8_ASCII_RANGE, $pos);
1462                  }
1463                  else
1464                  {
1465                      $pos += strspn($str, UTF8_ASCII_RANGE, ++$pos);
1466                  }
1467  
1468                  $last_cc = 0;
1469                  $utf_sort = array();
1470                  $starter_pos = $pos;
1471              }
1472          }
1473          while ($pos < $len);
1474  
1475          // Now it is time to return the string
1476          if ($dump)
1477          {
1478              $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1479  
1480              // Dump combiners
1481              if (!empty($utf_sort))
1482              {
1483                  if ($sort)
1484                  {
1485                      ksort($utf_sort);
1486                  }
1487  
1488                  foreach ($utf_sort as $utf_chars)
1489                  {
1490                      $tmp .= implode('', $utf_chars);
1491                  }
1492              }
1493  
1494              return $tmp;
1495          }
1496          else if ($tmp_pos)
1497          {
1498              // If the $tmp_pos cursor was moved then at least one character was not in normal form. Replace $str with the fixed version
1499              if ($tmp_pos == $len)
1500              {
1501                  // The $tmp_pos cursor is at the end of $str, therefore $tmp holds the whole $str
1502                  return $tmp;
1503              }
1504              else
1505              {
1506                  // The rightmost chunk of $str has not been appended to $tmp yet
1507                  return $tmp . substr($str, $tmp_pos);
1508              }
1509          }
1510  
1511          // The string was already in normal form
1512          return $str;
1513      }
1514  }
1515  
1516  ?>
PHP Cross Reference of Unnamed Project

/includes/utf/ -> utf_normalizer.php (source)