[ Index ] |
PHP Cross Reference of Unnamed Project |
[Summary view] [Print] [Text view]
<?php
/**
*
* @package utf
* @version $Id$
* @copyright (c) 2005 phpBB Group
* @license http://opensource.org/licenses/gpl-license.php GNU Public License
*
*/

/**
*/
if (!defined('IN_PHPBB'))
{
	exit;
}

/**
* Some Unicode characters encoded in UTF-8
*
* Preserved for compatibility
*/
define('UTF8_REPLACEMENT', "\xEF\xBF\xBD");
define('UTF8_MAX', "\xF4\x8F\xBF\xBF");
define('UTF8_FFFE', "\xEF\xBF\xBE");
define('UTF8_FFFF', "\xEF\xBF\xBF");
define('UTF8_SURROGATE_FIRST', "\xED\xA0\x80");
define('UTF8_SURROGATE_LAST', "\xED\xBF\xBF");
define('UTF8_HANGUL_FIRST', "\xEA\xB0\x80");
define('UTF8_HANGUL_LAST', "\xED\x9E\xA3");

define('UTF8_CJK_FIRST', "\xE4\xB8\x80");
define('UTF8_CJK_LAST', "\xE9\xBE\xBB");
define('UTF8_CJK_B_FIRST', "\xF0\xA0\x80\x80");
define('UTF8_CJK_B_LAST', "\xF0\xAA\x9B\x96");

// Unset global variables
// The methods below only include a data file when the matching $GLOBALS entry is not set,
// so these entries are cleared before anything gets a chance to read them.
unset($GLOBALS['utf_jamo_index'], $GLOBALS['utf_jamo_type'], $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_combining_class'], $GLOBALS['utf_canonical_comp'], $GLOBALS['utf_canonical_decomp'], $GLOBALS['utf_nfkc_qc'], $GLOBALS['utf_compatibility_decomp']);

// NFC_QC and NFKC_QC values
define('UNICODE_QC_MAYBE', 0);
define('UNICODE_QC_NO', 1);

// Contains all the ASCII characters appearing in UTF-8, sorted by frequency
define('UTF8_ASCII_RANGE', "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F");

// Contains all the tail bytes that can appear in the composition of a UTF-8 char
define('UTF8_TRAILING_BYTES', "\xA9\xA0\xA8\x80\xAA\x99\xA7\xBB\xAB\x89\x94\x82\xB4\xA2\xAE\x83\xB0\xB9\xB8\x93\xAF\xBC\xB3\x81\xA4\xB2\x9C\xA1\xB5\xBE\xBD\xBA\x98\xAD\xB1\x84\x95\xA6\xB6\x88\x8D\x90\xB7\xBF\x92\x85\xA5\x97\x8C\x86\xA3\x8E\x9F\x8F\x87\x91\x9D\xAC\x9E\x8B\x96\x9B\x8A\x9A");

// Constants used by the Hangul [de]composition algorithms
define('UNICODE_HANGUL_SBASE', 0xAC00);
define('UNICODE_HANGUL_LBASE', 0x1100);
define('UNICODE_HANGUL_VBASE', 0x1161);
define('UNICODE_HANGUL_TBASE', 0x11A7);
define('UNICODE_HANGUL_SCOUNT', 11172);
define('UNICODE_HANGUL_LCOUNT', 19);
define('UNICODE_HANGUL_VCOUNT', 21);
define('UNICODE_HANGUL_TCOUNT', 28);
define('UNICODE_HANGUL_NCOUNT', 588);
define('UNICODE_JAMO_L', 0);
define('UNICODE_JAMO_V', 1);
define('UNICODE_JAMO_T', 2);

/**
* Unicode normalization routines
*
* The public entry points (cleanup, nfc, nfkc, nfd, nfkd) take the string by reference,
* rewrite it in place and return nothing.
*
* NOTE(review): the methods call each other statically (utf_normalizer::recompose() /
* utf_normalizer::decompose()) although none of them is declared "static". Recent PHP
* versions reject static calls to non-static methods - confirm the targeted PHP version
* before changing the declarations.
*
* @package utf
*/
class utf_normalizer
{
	/**
	* Validate, cleanup and normalize a string
	*
	* The ultimate convenience function! Clean up invalid UTF-8 sequences,
	* and convert to Normal Form C, canonical composition.
	*
	* Control bytes 0x00..0x1F other than \t, \n and \r are turned into 0xFF first, so that
	* recompose() treats them as invalid UTF-8.
	*
	* @param	string	&$str	The dirty string; replaced in place by its shiny, cleaned-up version
	* @return	void			Nothing is returned, the result is written back to $str
	*/
	function cleanup(&$str)
	{
		// The string below is the list of all authorized characters, sorted by frequency in latin text
		$pos = strspn($str, "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x0D");
		$len = strlen($str);

		if ($pos == $len)
		{
			// ASCII strings with no special chars return immediately
			return;
		}

		// Load the NFC quick-check table and the canonical decomposition table, each one only if
		// it is not in $GLOBALS yet
		if (!isset($GLOBALS['utf_nfc_qc']))
		{
			global $phpbb_root_path, $phpEx;
			include($phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx);
		}

		if (!isset($GLOBALS['utf_canonical_decomp']))
		{
			global $phpbb_root_path, $phpEx;
			include($phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx);
		}

		// Replace any byte in the range 0x00..0x1F, except for \r, \n and \t
		// We replace those characters with a 0xFF byte, which is illegal in UTF-8 and will in turn be replaced with a UTF replacement char
		// The replacement is byte-for-byte, so $pos and $len computed above remain valid
		$str = strtr(
			$str,
			"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F",
			"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"
		);

		$str = utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);
	}

	/**
	* Validate and normalize a UTF string to NFC
	*
	* @param	string	&$str	Unchecked UTF string; replaced in place by the validated string in normal form
	* @return	void			Nothing is returned, the result is written back to $str
	*/
	function nfc(&$str)
	{
		$pos = strspn($str, UTF8_ASCII_RANGE);
		$len = strlen($str);

		if ($pos == $len)
		{
			// ASCII strings return immediately
			return;
		}

		if (!isset($GLOBALS['utf_nfc_qc']))
		{
			global $phpbb_root_path, $phpEx;
			include($phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx);
		}

		if (!isset($GLOBALS['utf_canonical_decomp']))
		{
			global $phpbb_root_path, $phpEx;
			include($phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx);
		}

		$str = utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);
	}

	/**
	* Validate and normalize a UTF string to NFKC
	*
	* @param	string	&$str	Unchecked UTF string; replaced in place by the validated string in normal form
	* @return	void			Nothing is returned, the result is written back to $str
	*/
	function nfkc(&$str)
	{
		$pos = strspn($str, UTF8_ASCII_RANGE);
		$len = strlen($str);

		if ($pos == $len)
		{
			// ASCII strings return immediately
			return;
		}

		if (!isset($GLOBALS['utf_nfkc_qc']))
		{
			global $phpbb_root_path, $phpEx;
			include($phpbb_root_path . 'includes/utf/data/utf_nfkc_qc.' . $phpEx);
		}

		if (!isset($GLOBALS['utf_compatibility_decomp']))
		{
			global $phpbb_root_path, $phpEx;
			include($phpbb_root_path . 'includes/utf/data/utf_compatibility_decomp.' . $phpEx);
		}

		$str = utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfkc_qc'], $GLOBALS['utf_compatibility_decomp']);
	}

	/**
	* Validate and normalize a UTF string to NFD
	*
	* @param	string	&$str	Unchecked UTF string; replaced in place by the validated string in normal form
	* @return	void			Nothing is returned, the result is written back to $str
	*/
	function nfd(&$str)
	{
		$pos = strspn($str, UTF8_ASCII_RANGE);
		$len = strlen($str);

		if ($pos == $len)
		{
			// ASCII strings return immediately
			return;
		}

		if (!isset($GLOBALS['utf_canonical_decomp']))
		{
			global $phpbb_root_path, $phpEx;
			include($phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx);
		}

		$str = utf_normalizer::decompose($str, $pos, $len, $GLOBALS['utf_canonical_decomp']);
	}

	/**
	* Validate and normalize a UTF string to NFKD
	*
	* @param	string	&$str	Unchecked UTF string; replaced in place by the validated string in normal form
	* @return	void			Nothing is returned, the result is written back to $str
	*/
	function nfkd(&$str)
	{
		$pos = strspn($str, UTF8_ASCII_RANGE);
		$len = strlen($str);

		if ($pos == $len)
		{
			// ASCII strings return immediately
			return;
		}

		if (!isset($GLOBALS['utf_compatibility_decomp']))
		{
			global $phpbb_root_path, $phpEx;
			include($phpbb_root_path . 'includes/utf/data/utf_compatibility_decomp.' . $phpEx);
		}

		$str = utf_normalizer::decompose($str, $pos, $len, $GLOBALS['utf_compatibility_decomp']);
	}


	/**
	* Recompose a UTF string
	*
	* @param	string	$str			Unchecked UTF string
	* @param	integer	$pos			Position of the first UTF char (in bytes)
	* @param	integer	$len			Length of the string (in bytes)
	* @param	array	&$qc			Quick-check array, passed by reference but never modified
	* @param	array	&$decomp_map	Decomposition mapping, passed by reference but never modified
	* @return	string					The string, validated and recomposed
	*
	* @access	private
	*/
	function recompose($str, $pos, $len, &$qc, &$decomp_map)
	{
		global $utf_combining_class, $utf_canonical_comp, $utf_jamo_type, $utf_jamo_index;

		// Load some commonly-used tables
		if (!isset($utf_jamo_index, $utf_jamo_type, $utf_combining_class))
		{
			global $phpbb_root_path, $phpEx;
			include($phpbb_root_path . 'includes/utf/data/utf_normalizer_common.' . $phpEx);
		}

		// Load the canonical composition table
		if (!isset($utf_canonical_comp))
		{
			global $phpbb_root_path, $phpEx;
			include($phpbb_root_path . 'includes/utf/data/utf_canonical_comp.' .
$phpEx);
		}

		// Buffer the last ASCII char before the UTF-8 stuff if applicable
		$tmp = '';
		$i = $tmp_pos = $last_cc = 0;

		$buffer = ($pos) ? array(++$i => $str[$pos - 1]) : array();

		// UTF char length array
		// This array is used to determine the length of a UTF character.
		// Be $c the result of ($str[$pos] & "\xF0") --where $str is the string we're operating on and $pos
		// the position of the cursor--, if $utf_len_mask[$c] does not exist, the byte is an ASCII char.
		// Otherwise, if $utf_len_mask[$c] is greater than 0, we have the leading byte of a multibyte character
		// whose length is $utf_len_mask[$c] and if it is equal to 0, the byte is a trailing byte.
		$utf_len_mask = array(
			// Leading bytes masks
			"\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4,
			// Trailing bytes masks
			"\x80" => 0, "\x90" => 0, "\xA0" => 0, "\xB0" => 0
		);

		// Leading bytes that trigger the extra legality checks of the switch() in STEP 0
		$extra_check = array(
			"\xED" => 1, "\xEF" => 1, "\xC0" => 1, "\xC1" => 1, "\xE0" => 1, "\xF0" => 1,
			"\xF4" => 1, "\xF5" => 1, "\xF6" => 1, "\xF7" => 1, "\xF8" => 1, "\xF9" => 1,
			"\xFA" => 1, "\xFB" => 1, "\xFC" => 1, "\xFD" => 1, "\xFE" => 1, "\xFF" => 1
		);

		// A char of length n is well-formed when ($utf_char & $utf_validation_mask[n]) equals $utf_validation_check[n]
		$utf_validation_mask = array(
			2 => "\xE0\xC0",
			3 => "\xF0\xC0\xC0",
			4 => "\xF8\xC0\xC0\xC0"
		);

		$utf_validation_check = array(
			2 => "\xC0\x80",
			3 => "\xE0\x80\x80",
			4 => "\xF0\x80\x80\x80"
		);

		// Main loop
		do
		{
			// STEP 0: Capture the current char and buffer it
			$c = $str[$pos];
			$c_mask = $c & "\xF0";

			if (isset($utf_len_mask[$c_mask]))
			{
				// Byte at $pos is either a leading byte or a misplaced trailing byte
				if ($utf_len = $utf_len_mask[$c_mask])
				{
					// Capture the char
					$buffer[++$i & 7] = $utf_char = substr($str, $pos, $utf_len);

					// Let's find out if a thorough check is needed
					if (isset($qc[$utf_char]))
					{
						// If the UTF char is in the qc array then it may not be in normal form. We do nothing here, the actual processing is below this "if" block
					}
					else if (isset($utf_combining_class[$utf_char]))
					{
						if ($utf_combining_class[$utf_char] < $last_cc)
						{
							// A combining character that is NOT canonically ordered
						}
						else
						{
							// A combining character that IS canonically ordered, skip to the next char
							$last_cc = $utf_combining_class[$utf_char];

							$pos += $utf_len;
							continue;
						}
					}
					else
					{
						// At this point, $utf_char holds a UTF char that we know is not a NF[K]C_QC and is not a combining character.
						// It can be a singleton, a canonical composite, a replacement char or even an ill-formed bunch of bytes. Let's find out
						$last_cc = 0;

						// Check that we have the correct number of trailing bytes
						if (($utf_char & $utf_validation_mask[$utf_len]) != $utf_validation_check[$utf_len])
						{
							// Current char isn't well-formed or legal: either one or several trailing bytes are missing, or the Unicode char
							// has been encoded in a five- or six- byte sequence
							if ($utf_char[0] >= "\xF8")
							{
								if ($utf_char[0] < "\xFC")
								{
									$trailing_bytes = 4;
								}
								else if ($utf_char[0] > "\xFD")
								{
									$trailing_bytes = 0;
								}
								else
								{
									$trailing_bytes = 5;
								}
							}
							else
							{
								$trailing_bytes = $utf_len - 1;
							}

							// One replacement char stands for the leading byte and up to $trailing_bytes trailing bytes that follow it
							$tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
							$pos += strspn($str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes);
							$tmp_pos = $pos;

							continue;
						}

						if (isset($extra_check[$c]))
						{
							switch ($c)
							{
								// Note: 0xED is quite common in Korean
								case "\xED":
									if ($utf_char >= "\xED\xA0\x80")
									{
										// Surrogates (U+D800..U+DFFF) are not allowed in UTF-8 (UTF sequence 0xEDA080..0xEDBFBF)
										$tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
										$pos += $utf_len;
										$tmp_pos = $pos;
										continue 2;
									}
								break;

								// Note: 0xEF is quite common in Japanese
								case "\xEF":
									if ($utf_char == "\xEF\xBF\xBE" || $utf_char == "\xEF\xBF\xBF")
									{
										// U+FFFE and U+FFFF are explicitly disallowed (UTF sequence 0xEFBFBE..0xEFBFBF)
										$tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
										$pos += $utf_len;
										$tmp_pos = $pos;
										continue 2;
									}
								break;

								case "\xC0":
								case "\xC1":
									if ($utf_char <= "\xC1\xBF")
									{
										// Overlong sequence: Unicode char U+0000..U+007F encoded as a double-byte UTF char
										$tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
										$pos += $utf_len;
										$tmp_pos = $pos;
										continue 2;
									}
								break;

								case "\xE0":
									if ($utf_char <= "\xE0\x9F\xBF")
									{
										// Unicode char U+0000..U+07FF encoded in 3 bytes
										$tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
										$pos += $utf_len;
										$tmp_pos = $pos;
										continue 2;
									}
								break;

								case "\xF0":
									if ($utf_char <= "\xF0\x8F\xBF\xBF")
									{
										// Unicode char U+0000..U+FFFF encoded in 4 bytes
										$tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
										$pos += $utf_len;
										$tmp_pos = $pos;
										continue 2;
									}
								break;

								default:
									// Five- and six- byte sequences do not need being checked for here anymore
									if ($utf_char > UTF8_MAX)
									{
										// Out of the Unicode range
										if ($utf_char[0] < "\xF8")
										{
											$trailing_bytes = 3;
										}
										else if ($utf_char[0] < "\xFC")
										{
											$trailing_bytes = 4;
										}
										else if ($utf_char[0] > "\xFD")
										{
											$trailing_bytes = 0;
										}
										else
										{
											$trailing_bytes = 5;
										}

										$tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
										$pos += strspn($str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes);
										$tmp_pos = $pos;
										continue 2;
									}
								break;
							}
						}

						// The char is a valid starter, move the cursor and go on
						$pos += $utf_len;
						continue;
					}
				}
				else
				{
					// A trailing byte came out of nowhere, we will advance the cursor and treat this byte and all following trailing bytes as if
					// each of them was a Unicode replacement char
					$spn = strspn($str, UTF8_TRAILING_BYTES, $pos);
					$tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . str_repeat(UTF8_REPLACEMENT, $spn);

					$pos += $spn;
					$tmp_pos = $pos;
					continue;
				}


				// STEP 1: Decompose current char

				// We have found a character that is either:
				//  - in the NFC_QC/NFKC_QC list
				//  - a non-starter char that is not canonically ordered
				//
				// We are going to capture the shortest UTF sequence that satisfies these two conditions:
				//
				//  1 - If the sequence does not start at the beginning of the string, it must begin with a starter,
				// and that starter must not have the NF[K]C_QC property equal to "MAYBE"
				//
				//  2 - If the sequence does not end at the end of the string, it must end with a non-starter and be
				// immediately followed by a starter that is not on the QC list
				//
				$utf_seq = array();
				$last_cc = 0;
				$lpos = $pos;
				$pos += $utf_len;

				if (isset($decomp_map[$utf_char]))
				{
					$_pos = 0;
					$_len = strlen($decomp_map[$utf_char]);

					do
					{
						// Taking a reference lets us probe a key that may be missing (ASCII byte) without a notice;
						// isset() below is then false for such a key
						$_utf_len =& $utf_len_mask[$decomp_map[$utf_char][$_pos] & "\xF0"];

						if (isset($_utf_len))
						{
							$utf_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
							$_pos += $_utf_len;
						}
						else
						{
							$utf_seq[] = $decomp_map[$utf_char][$_pos];
							++$_pos;
						}
					}
					while ($_pos < $_len);
				}
				else
				{
					// The char is not decomposable
					$utf_seq = array($utf_char);
				}


				// STEP 2: Capture the starter

				// Check out the combining class of the first character of the UTF sequence
				$k = 0;

				// $utf_char may have reached this point as a mis-ordered combining char that is absent from $qc.
				// A missing $qc entry takes the same branch as MAYBE: this is what the former unguarded loose
				// comparison with UNICODE_QC_MAYBE (0) evaluated to, minus the undefined index notice.
				if (isset($utf_combining_class[$utf_seq[0]]) || !isset($qc[$utf_char]) || $qc[$utf_char] == UNICODE_QC_MAYBE)
				{
					// Not a starter, inspect previous characters
					// The last 8 characters are kept in a buffer so that we don't have to capture them every time.
					// This is enough for all real-life strings but even if it wasn't, we can capture characters in backward mode,
					// although it is slower than this method.
					//
					// In the following loop, $j starts at the previous buffered character ($i - 1, because current character is
					// at offset $i) and process them in backward mode until we find a starter.
					//
					// $k is the index on each UTF character inside of our UTF sequence. At this time, $utf_seq contains one or more
					// characters numbered 0 to n. $k starts at 0 and is pre-decremented for each char we prepend, so prepended
					// chars get negative indexes
					$starter_found = 0;
					$j_min = max(1, $i - 7);

					for ($j = $i - 1; $j >= $j_min && $lpos > $tmp_pos; --$j)
					{
						$utf_char = $buffer[$j & 7];
						$lpos -= strlen($utf_char);

						if (isset($decomp_map[$utf_char]))
						{
							// The char is a composite, decompose for storage
							$decomp_seq = array();
							$_pos = 0;
							$_len = strlen($decomp_map[$utf_char]);

							do
							{
								$c = $decomp_map[$utf_char][$_pos];
								$_utf_len =& $utf_len_mask[$c & "\xF0"];

								if (isset($_utf_len))
								{
									$decomp_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
									$_pos += $_utf_len;
								}
								else
								{
									$decomp_seq[] = $c;
									++$_pos;
								}
							}
							while ($_pos < $_len);

							// Prepend the UTF sequence with our decomposed sequence
							if (isset($decomp_seq[1]))
							{
								// The char expanded into several chars
								$decomp_cnt = sizeof($decomp_seq);

								foreach ($decomp_seq as $decomp_i => $decomp_char)
								{
									$utf_seq[$k + $decomp_i - $decomp_cnt] = $decomp_char;
								}
								$k -= $decomp_cnt;
							}
							else
							{
								// Decomposed to a single char, easier to prepend
								$utf_seq[--$k] = $decomp_seq[0];
							}
						}
						else
						{
							$utf_seq[--$k] = $utf_char;
						}

						if (!isset($utf_combining_class[$utf_seq[$k]]))
						{
							// We have found our starter
							$starter_found = 1;
							break;
						}
					}

					if (!$starter_found && $lpos > $tmp_pos)
					{
						// The starter was not found in the buffer, let's rewind some more
						do
						{
							// $utf_len_mask contains the masks of both leading bytes and trailing bytes. If $utf_len > 0 then it's a leading byte, otherwise it's a trailing byte.
							$c = $str[--$lpos];
							$c_mask = $c & "\xF0";

							if (isset($utf_len_mask[$c_mask]))
							{
								// UTF byte
								if ($utf_len = $utf_len_mask[$c_mask])
								{
									// UTF *leading* byte
									$utf_char = substr($str, $lpos, $utf_len);

									if (isset($decomp_map[$utf_char]))
									{
										// Decompose the character
										$decomp_seq = array();
										$_pos = 0;
										$_len = strlen($decomp_map[$utf_char]);

										do
										{
											$c = $decomp_map[$utf_char][$_pos];
											$_utf_len =& $utf_len_mask[$c & "\xF0"];

											if (isset($_utf_len))
											{
												$decomp_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
												$_pos += $_utf_len;
											}
											else
											{
												$decomp_seq[] = $c;
												++$_pos;
											}
										}
										while ($_pos < $_len);

										// Prepend the UTF sequence with our decomposed sequence
										if (isset($decomp_seq[1]))
										{
											// The char expanded into several chars
											$decomp_cnt = sizeof($decomp_seq);
											foreach ($decomp_seq as $decomp_i => $utf_char)
											{
												$utf_seq[$k + $decomp_i - $decomp_cnt] = $utf_char;
											}
											$k -= $decomp_cnt;
										}
										else
										{
											// Decomposed to a single char, easier to prepend
											$utf_seq[--$k] = $decomp_seq[0];
										}
									}
									else
									{
										$utf_seq[--$k] = $utf_char;
									}
								}
							}
							else
							{
								// ASCII char
								$utf_seq[--$k] = $c;
							}
						}
						while ($lpos > $tmp_pos);
					}
				}


				// STEP 3: Capture following combining modifiers

				while ($pos < $len)
				{
					$c_mask = $str[$pos] & "\xF0";

					if (isset($utf_len_mask[$c_mask]))
					{
						if ($utf_len = $utf_len_mask[$c_mask])
						{
							$utf_char = substr($str, $pos, $utf_len);
						}
						else
						{
							// A trailing byte came out of nowhere
							// Trailing bytes are replaced with Unicode replacement chars, we will just ignore it for now, break out of the loop
							// as if it was a starter (replacement chars ARE starters) and let the next loop replace it
							break;
						}

						if (isset($utf_combining_class[$utf_char]) || isset($qc[$utf_char]))
						{
							// Combining character, add it to the sequence and move the cursor
							if (isset($decomp_map[$utf_char]))
							{
								// Decompose the character
								$_pos = 0;
								$_len = strlen($decomp_map[$utf_char]);

								do
								{
									$c = $decomp_map[$utf_char][$_pos];
									$_utf_len =& $utf_len_mask[$c & "\xF0"];

									if (isset($_utf_len))
									{
										$utf_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
										$_pos += $_utf_len;
									}
									else
									{
										$utf_seq[] = $c;
										++$_pos;
									}
								}
								while ($_pos < $_len);
							}
							else
							{
								$utf_seq[] = $utf_char;
							}

							$pos += $utf_len;
						}
						else
						{
							// Combining class 0 and no QC, break out of the loop
							// Note: we do not know if that character is valid. If it's not, the next iteration will replace it
							break;
						}
					}
					else
					{
						// ASCII chars are starters
						break;
					}
				}


				// STEP 4: Sort and combine

				// Here we sort...
				$k_max = $k + sizeof($utf_seq);

				if (!$k && $k_max == 1)
				{
					// There is only one char in the UTF sequence, add it then jump to the next iteration of main loop
					// Note: the two commented lines below can be enabled under PHP5 for a very small performance gain in most cases
//					if (substr_compare($str, $utf_seq[0], $lpos, $pos - $lpos))
//					{
						$tmp .= substr($str, $tmp_pos, $lpos - $tmp_pos) . $utf_seq[0];
						$tmp_pos = $pos;
//					}

					continue;
				}

				// ...there we combine
				if (isset($utf_combining_class[$utf_seq[$k]]))
				{
					$starter = $nf_seq = '';
				}
				else
				{
					$starter = $utf_seq[$k++];
					$nf_seq = '';
				}
				$utf_sort = array();

				// We add an empty char at the end of the UTF char sequence. It will act as a starter and trigger the sort/combine routine
				// at the end of the string without altering it
				$utf_seq[] = '';

				do
				{
					$utf_char = $utf_seq[$k++];

					if (isset($utf_combining_class[$utf_char]))
					{
						// Bucket the non-starters by combining class, they get sorted once the next starter shows up
						$utf_sort[$utf_combining_class[$utf_char]][] = $utf_char;
					}
					else
					{
						if (empty($utf_sort))
						{
							// No combining characters... check for a composite of the two starters
							if (isset($utf_canonical_comp[$starter . $utf_char]))
							{
								// Good ol' composite character
								$starter = $utf_canonical_comp[$starter . $utf_char];
							}
							else if (isset($utf_jamo_type[$utf_char]))
							{
								// Current char is a composable jamo
								if (isset($utf_jamo_type[$starter]) && $utf_jamo_type[$starter] == UNICODE_JAMO_L && $utf_jamo_type[$utf_char] == UNICODE_JAMO_V)
								{
									// We have a L jamo followed by a V jamo, we are going to prefetch the next char to see if it's a T jamo
									if (isset($utf_jamo_type[$utf_seq[$k]]) && $utf_jamo_type[$utf_seq[$k]] == UNICODE_JAMO_T)
									{
										// L+V+T jamos, combine to a LVT Hangul syllable ($k is incremented)
										$cp = $utf_jamo_index[$starter] + $utf_jamo_index[$utf_char] + $utf_jamo_index[$utf_seq[$k]];
										++$k;
									}
									else
									{
										// L+V jamos, combine to a LV Hangul syllable
										$cp = $utf_jamo_index[$starter] + $utf_jamo_index[$utf_char];
									}

									// Encode the codepoint $cp as a three-byte UTF-8 sequence
									$starter = chr(0xE0 | ($cp >> 12)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
								}
								else
								{
									// Non-composable jamo, just add it to the sequence
									$nf_seq .= $starter;
									$starter = $utf_char;
								}
							}
							else
							{
								// No composite, just add the first starter to the sequence then continue with the other one
								$nf_seq .= $starter;
								$starter = $utf_char;
							}
						}
						else
						{
							ksort($utf_sort);

							// For each class of combining characters
							foreach ($utf_sort as $cc => $utf_chars)
							{
								$j = 0;

								do
								{
									// Look for a composite
									if (isset($utf_canonical_comp[$starter . $utf_chars[$j]]))
									{
										// Found a composite, replace the starter
										$starter = $utf_canonical_comp[$starter . $utf_chars[$j]];
										unset($utf_sort[$cc][$j]);
									}
									else
									{
										// No composite, all following characters in that class are blocked
										break;
									}
								}
								while (isset($utf_sort[$cc][++$j]));
							}

							// Add the starter to the normalized sequence, followed by non-starters in canonical order
							$nf_seq .= $starter;

							foreach ($utf_sort as $utf_chars)
							{
								if (!empty($utf_chars))
								{
									$nf_seq .= implode('', $utf_chars);
								}
							}

							// Reset the array and go on
							$utf_sort = array();
							$starter = $utf_char;
						}
					}
				}
				while ($k <= $k_max);

				$tmp .= substr($str, $tmp_pos, $lpos - $tmp_pos) . $nf_seq;
				$tmp_pos = $pos;
			}
			else
			{
				// Only an ASCII char can make the program get here
				//
				// First we skip the current byte with ++$pos, then we quickly skip following ASCII chars with strspn().
				//
				// The first two "if"'s here can be removed, with the consequences of being faster on latin text (lots of ASCII) and slower on
				// multi-byte text (where the only ASCII chars are spaces and punctuation)
				if (++$pos != $len)
				{
					if ($str[$pos] < "\x80")
					{
						$pos += strspn($str, UTF8_ASCII_RANGE, ++$pos);
						$buffer[++$i & 7] = $str[$pos - 1];
					}
					else
					{
						$buffer[++$i & 7] = $c;
					}
				}
			}
		}
		while ($pos < $len);

		// Now is time to return the string
		if ($tmp_pos)
		{
			// If the $tmp_pos cursor is not at the beginning of the string then at least one character was not in normal form. Replace $str with the fixed version
			if ($tmp_pos == $len)
			{
				// The $tmp_pos cursor is at the end of $str, therefore $tmp holds the whole $str
				return $tmp;
			}
			else
			{
				// The rightmost chunk of $str has not been appended to $tmp yet
				return $tmp . substr($str, $tmp_pos);
			}
		}

		// The string was already in normal form
		return $str;
	}

	/**
	* Decompose a UTF string
	*
	* @param	string	$str			UTF string
	* @param	integer	$pos			Position of the first UTF char (in bytes)
	* @param	integer	$len			Length of the string (in bytes)
	* @param	array	&$decomp_map	Decomposition mapping, passed by reference but never modified
	* @return	string					The string, decomposed and sorted canonically
	*
	* @access	private
	*/
	function decompose($str, $pos, $len, &$decomp_map)
	{
		global $utf_combining_class;

		// Load some commonly-used tables
		if (!isset($utf_combining_class))
		{
			global $phpbb_root_path, $phpEx;
			include($phpbb_root_path . 'includes/utf/data/utf_normalizer_common.' . $phpEx);
		}

		// UTF char length array
		$utf_len_mask = array(
			// Leading bytes masks
			"\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4,
			// Trailing bytes masks
			"\x80" => 0, "\x90" => 0, "\xA0" => 0, "\xB0" => 0
		);

		// Some extra checks are triggered on the first byte of a UTF sequence
		$extra_check = array(
			"\xED" => 1, "\xEF" => 1, "\xC0" => 1, "\xC1" => 1, "\xE0" => 1, "\xF0" => 1,
			"\xF4" => 1, "\xF5" => 1, "\xF6" => 1, "\xF7" => 1, "\xF8" => 1, "\xF9" => 1,
			"\xFA" => 1, "\xFB" => 1, "\xFC" => 1, "\xFD" => 1, "\xFE" => 1, "\xFF" => 1
		);

		// These masks are used to check if a UTF sequence is well formed. Here are the only 3 lengths we acknowledge:
		//   - 2-byte: 110? ???? 10?? ????
		//   - 3-byte: 1110 ???? 10?? ???? 10?? ????
		//   - 4-byte: 1111 0??? 10?? ???? 10?? ???? 10?? ????
977 // Note that 5- and 6- byte sequences are automatically discarded 978 $utf_validation_mask = array( 979 2 => "\xE0\xC0", 980 3 => "\xF0\xC0\xC0", 981 4 => "\xF8\xC0\xC0\xC0" 982 ); 983 984 $utf_validation_check = array( 985 2 => "\xC0\x80", 986 3 => "\xE0\x80\x80", 987 4 => "\xF0\x80\x80\x80" 988 ); 989 990 $tmp = ''; 991 $starter_pos = $pos; 992 $tmp_pos = $last_cc = $sort = $dump = 0; 993 $utf_sort = array(); 994 995 996 // Main loop 997 do 998 { 999 // STEP 0: Capture the current char 1000 1001 $cur_mask = $str[$pos] & "\xF0"; 1002 if (isset($utf_len_mask[$cur_mask])) 1003 { 1004 if ($utf_len = $utf_len_mask[$cur_mask]) 1005 { 1006 // Multibyte char 1007 $utf_char = substr($str, $pos, $utf_len); 1008 $pos += $utf_len; 1009 } 1010 else 1011 { 1012 // A trailing byte came out of nowhere, we will treat it and all following trailing bytes as if each of them was a Unicode 1013 // replacement char and we will advance the cursor 1014 $spn = strspn($str, UTF8_TRAILING_BYTES, $pos); 1015 1016 if ($dump) 1017 { 1018 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); 1019 1020 // Dump combiners 1021 if (!empty($utf_sort)) 1022 { 1023 if ($sort) 1024 { 1025 ksort($utf_sort); 1026 } 1027 1028 foreach ($utf_sort as $utf_chars) 1029 { 1030 $tmp .= implode('', $utf_chars); 1031 } 1032 } 1033 1034 $tmp .= str_repeat(UTF8_REPLACEMENT, $spn); 1035 $dump = $sort = 0; 1036 } 1037 else 1038 { 1039 $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . 
str_repeat(UTF8_REPLACEMENT, $spn); 1040 } 1041 1042 $pos += $spn; 1043 $tmp_pos = $starter_pos = $pos; 1044 1045 $utf_sort = array(); 1046 $last_cc = 0; 1047 1048 continue; 1049 } 1050 1051 1052 // STEP 1: Decide what to do with current char 1053 1054 // Now, in that order: 1055 // - check if that character is decomposable 1056 // - check if that character is a non-starter 1057 // - check if that character requires extra checks to be performed 1058 if (isset($decomp_map[$utf_char])) 1059 { 1060 // Decompose the char 1061 $_pos = 0; 1062 $_len = strlen($decomp_map[$utf_char]); 1063 1064 do 1065 { 1066 $c = $decomp_map[$utf_char][$_pos]; 1067 $_utf_len =& $utf_len_mask[$c & "\xF0"]; 1068 1069 if (isset($_utf_len)) 1070 { 1071 $_utf_char = substr($decomp_map[$utf_char], $_pos, $_utf_len); 1072 $_pos += $_utf_len; 1073 1074 if (isset($utf_combining_class[$_utf_char])) 1075 { 1076 // The character decomposed to a non-starter, buffer it for sorting 1077 $utf_sort[$utf_combining_class[$_utf_char]][] = $_utf_char; 1078 1079 if ($utf_combining_class[$_utf_char] < $last_cc) 1080 { 1081 // Not canonically ordered, will require sorting 1082 $sort = $dump = 1; 1083 } 1084 else 1085 { 1086 $dump = 1; 1087 $last_cc = $utf_combining_class[$_utf_char]; 1088 } 1089 } 1090 else 1091 { 1092 // This character decomposition contains a starter, dump the buffer and continue 1093 if ($dump) 1094 { 1095 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); 1096 1097 // Dump combiners 1098 if (!empty($utf_sort)) 1099 { 1100 if ($sort) 1101 { 1102 ksort($utf_sort); 1103 } 1104 1105 foreach ($utf_sort as $utf_chars) 1106 { 1107 $tmp .= implode('', $utf_chars); 1108 } 1109 } 1110 1111 $tmp .= $_utf_char; 1112 $dump = $sort = 0; 1113 } 1114 else 1115 { 1116 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos) . 
$_utf_char; 1117 } 1118 1119 $tmp_pos = $starter_pos = $pos; 1120 $utf_sort = array(); 1121 $last_cc = 0; 1122 } 1123 } 1124 else 1125 { 1126 // This character decomposition contains an ASCII char, which is a starter. Dump the buffer and continue 1127 ++$_pos; 1128 1129 if ($dump) 1130 { 1131 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); 1132 1133 // Dump combiners 1134 if (!empty($utf_sort)) 1135 { 1136 if ($sort) 1137 { 1138 ksort($utf_sort); 1139 } 1140 1141 foreach ($utf_sort as $utf_chars) 1142 { 1143 $tmp .= implode('', $utf_chars); 1144 } 1145 } 1146 1147 $tmp .= $c; 1148 $dump = $sort = 0; 1149 } 1150 else 1151 { 1152 $tmp .= substr($str, $tmp_pos, $pos - $utf_len - $tmp_pos) . $c; 1153 } 1154 1155 $tmp_pos = $starter_pos = $pos; 1156 $utf_sort = array(); 1157 $last_cc = 0; 1158 } 1159 } 1160 while ($_pos < $_len); 1161 } 1162 else if (isset($utf_combining_class[$utf_char])) 1163 { 1164 // Combining character 1165 if ($utf_combining_class[$utf_char] < $last_cc) 1166 { 1167 // Not in canonical order 1168 $sort = $dump = 1; 1169 } 1170 else 1171 { 1172 $last_cc = $utf_combining_class[$utf_char]; 1173 } 1174 1175 $utf_sort[$utf_combining_class[$utf_char]][] = $utf_char; 1176 } 1177 else 1178 { 1179 // Non-decomposable starter, check out if it's a Hangul syllable 1180 if ($utf_char < UTF8_HANGUL_FIRST || $utf_char > UTF8_HANGUL_LAST) 1181 { 1182 // Nope, regular UTF char, check that we have the correct number of trailing bytes 1183 if (($utf_char & $utf_validation_mask[$utf_len]) != $utf_validation_check[$utf_len]) 1184 { 1185 // Current char isn't well-formed or legal: either one or several trailing bytes are missing, or the Unicode char 1186 // has been encoded in a five- or six- byte sequence. 
1187 // Move the cursor back to its original position then advance it to the position it should really be at 1188 $pos -= $utf_len; 1189 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); 1190 1191 if (!empty($utf_sort)) 1192 { 1193 ksort($utf_sort); 1194 1195 foreach ($utf_sort as $utf_chars) 1196 { 1197 $tmp .= implode('', $utf_chars); 1198 } 1199 $utf_sort = array(); 1200 } 1201 1202 // Add a replacement char then another replacement char for every trailing byte. 1203 // 1204 // @todo I'm not entirely sure that's how we're supposed to mark invalidated byte sequences, check this 1205 $spn = strspn($str, UTF8_TRAILING_BYTES, ++$pos); 1206 $tmp .= str_repeat(UTF8_REPLACEMENT, $spn + 1); 1207 1208 $dump = $sort = 0; 1209 1210 $pos += $spn; 1211 $tmp_pos = $pos; 1212 continue; 1213 } 1214 1215 if (isset($extra_check[$utf_char[0]])) 1216 { 1217 switch ($utf_char[0]) 1218 { 1219 // Note: 0xED is quite common in Korean 1220 case "\xED": 1221 if ($utf_char >= "\xED\xA0\x80") 1222 { 1223 // Surrogates (U+D800..U+DFFF) are not allowed in UTF-8 (UTF sequence 0xEDA080..0xEDBFBF) 1224 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); 1225 1226 if (!empty($utf_sort)) 1227 { 1228 ksort($utf_sort); 1229 1230 foreach ($utf_sort as $utf_chars) 1231 { 1232 $tmp .= implode('', $utf_chars); 1233 } 1234 $utf_sort = array(); 1235 } 1236 1237 $tmp .= UTF8_REPLACEMENT; 1238 $dump = $sort = 0; 1239 1240 $tmp_pos = $starter_pos = $pos; 1241 continue 2; 1242 } 1243 break; 1244 1245 // Note: 0xEF is quite common in Japanese 1246 case "\xEF": 1247 if ($utf_char == "\xEF\xBF\xBE" || $utf_char == "\xEF\xBF\xBF") 1248 { 1249 // U+FFFE and U+FFFF are explicitly disallowed (UTF sequence 0xEFBFBE..0xEFBFBF) 1250 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); 1251 1252 if (!empty($utf_sort)) 1253 { 1254 ksort($utf_sort); 1255 1256 foreach ($utf_sort as $utf_chars) 1257 { 1258 $tmp .= implode('', $utf_chars); 1259 } 1260 $utf_sort = array(); 1261 } 1262 1263 $tmp .= 
UTF8_REPLACEMENT; 1264 $dump = $sort = 0; 1265 1266 $tmp_pos = $starter_pos = $pos; 1267 continue 2; 1268 } 1269 break; 1270 1271 case "\xC0": 1272 case "\xC1": 1273 if ($utf_char <= "\xC1\xBF") 1274 { 1275 // Overlong sequence: Unicode char U+0000..U+007F encoded as a double-byte UTF char 1276 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); 1277 1278 if (!empty($utf_sort)) 1279 { 1280 ksort($utf_sort); 1281 1282 foreach ($utf_sort as $utf_chars) 1283 { 1284 $tmp .= implode('', $utf_chars); 1285 } 1286 $utf_sort = array(); 1287 } 1288 1289 $tmp .= UTF8_REPLACEMENT; 1290 $dump = $sort = 0; 1291 1292 $tmp_pos = $starter_pos = $pos; 1293 continue 2; 1294 } 1295 break; 1296 1297 case "\xE0": 1298 if ($utf_char <= "\xE0\x9F\xBF") 1299 { 1300 // Unicode char U+0000..U+07FF encoded in 3 bytes 1301 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); 1302 1303 if (!empty($utf_sort)) 1304 { 1305 ksort($utf_sort); 1306 1307 foreach ($utf_sort as $utf_chars) 1308 { 1309 $tmp .= implode('', $utf_chars); 1310 } 1311 $utf_sort = array(); 1312 } 1313 1314 $tmp .= UTF8_REPLACEMENT; 1315 $dump = $sort = 0; 1316 1317 $tmp_pos = $starter_pos = $pos; 1318 continue 2; 1319 } 1320 break; 1321 1322 case "\xF0": 1323 if ($utf_char <= "\xF0\x8F\xBF\xBF") 1324 { 1325 // Unicode char U+0000..U+FFFF encoded in 4 bytes 1326 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); 1327 1328 if (!empty($utf_sort)) 1329 { 1330 ksort($utf_sort); 1331 1332 foreach ($utf_sort as $utf_chars) 1333 { 1334 $tmp .= implode('', $utf_chars); 1335 } 1336 $utf_sort = array(); 1337 } 1338 1339 $tmp .= UTF8_REPLACEMENT; 1340 $dump = $sort = 0; 1341 1342 $tmp_pos = $starter_pos = $pos; 1343 continue 2; 1344 } 1345 break; 1346 1347 default: 1348 if ($utf_char > UTF8_MAX) 1349 { 1350 // Out of the Unicode range 1351 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); 1352 1353 if (!empty($utf_sort)) 1354 { 1355 ksort($utf_sort); 1356 1357 foreach ($utf_sort as $utf_chars) 1358 { 1359 $tmp .= 
implode('', $utf_chars); 1360 } 1361 $utf_sort = array(); 1362 } 1363 1364 $tmp .= UTF8_REPLACEMENT; 1365 $dump = $sort = 0; 1366 1367 $tmp_pos = $starter_pos = $pos; 1368 continue 2; 1369 } 1370 break; 1371 } 1372 } 1373 } 1374 else 1375 { 1376 // Hangul syllable 1377 $idx = (((ord($utf_char[0]) & 0x0F) << 12) | ((ord($utf_char[1]) & 0x3F) << 6) | (ord($utf_char[2]) & 0x3F)) - UNICODE_HANGUL_SBASE; 1378 1379 // LIndex can only range from 0 to 18, therefore it cannot influence the first two bytes of the L Jamo, which allows us to hardcode them (based on LBase). 1380 // 1381 // The same goes for VIndex, but for TIndex there's a catch: the value of the third byte could exceed 0xBF and we would have to increment the second byte 1382 if ($t_index = $idx % UNICODE_HANGUL_TCOUNT) 1383 { 1384 if ($t_index < 25) 1385 { 1386 $utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x86\x00"; 1387 $utf_char[8] = chr(0xA7 + $t_index); 1388 } 1389 else 1390 { 1391 $utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x87\x00"; 1392 $utf_char[8] = chr(0x67 + $t_index); 1393 } 1394 } 1395 else 1396 { 1397 $utf_char = "\xE1\x84\x00\xE1\x85\x00"; 1398 } 1399 1400 $utf_char[2] = chr(0x80 + (int) ($idx / UNICODE_HANGUL_NCOUNT)); 1401 $utf_char[5] = chr(0xA1 + (int) (($idx % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT)); 1402 1403 // Just like other decompositions, the resulting Jamos must be dumped to the tmp string 1404 $dump = 1; 1405 } 1406 1407 // Do we need to dump stuff to the tmp string? 
1408 if ($dump) 1409 { 1410 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); 1411 1412 // Dump combiners 1413 if (!empty($utf_sort)) 1414 { 1415 if ($sort) 1416 { 1417 ksort($utf_sort); 1418 } 1419 1420 foreach ($utf_sort as $utf_chars) 1421 { 1422 $tmp .= implode('', $utf_chars); 1423 } 1424 } 1425 1426 $tmp .= $utf_char; 1427 $dump = $sort = 0; 1428 $tmp_pos = $pos; 1429 } 1430 1431 $last_cc = 0; 1432 $utf_sort = array(); 1433 $starter_pos = $pos; 1434 } 1435 } 1436 else 1437 { 1438 // ASCII char, which happens to be a starter (as any other ASCII char) 1439 if ($dump) 1440 { 1441 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); 1442 1443 // Dump combiners 1444 if (!empty($utf_sort)) 1445 { 1446 if ($sort) 1447 { 1448 ksort($utf_sort); 1449 } 1450 1451 foreach ($utf_sort as $utf_chars) 1452 { 1453 $tmp .= implode('', $utf_chars); 1454 } 1455 } 1456 1457 $tmp .= $str[$pos]; 1458 $dump = $sort = 0; 1459 $tmp_pos = ++$pos; 1460 1461 $pos += strspn($str, UTF8_ASCII_RANGE, $pos); 1462 } 1463 else 1464 { 1465 $pos += strspn($str, UTF8_ASCII_RANGE, ++$pos); 1466 } 1467 1468 $last_cc = 0; 1469 $utf_sort = array(); 1470 $starter_pos = $pos; 1471 } 1472 } 1473 while ($pos < $len); 1474 1475 // Now is time to return the string 1476 if ($dump) 1477 { 1478 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); 1479 1480 // Dump combiners 1481 if (!empty($utf_sort)) 1482 { 1483 if ($sort) 1484 { 1485 ksort($utf_sort); 1486 } 1487 1488 foreach ($utf_sort as $utf_chars) 1489 { 1490 $tmp .= implode('', $utf_chars); 1491 } 1492 } 1493 1494 return $tmp; 1495 } 1496 else if ($tmp_pos) 1497 { 1498 // If the $tmp_pos cursor was moved then at least one character was not in normal form. Replace $str with the fixed version 1499 if ($tmp_pos == $len) 1500 { 1501 // The $tmp_pos cursor is at the end of $str, therefore $tmp holds the whole $str 1502 return $tmp; 1503 } 1504 else 1505 { 1506 // The rightmost chunk of $str has not been appended to $tmp yet 1507 return $tmp . 
substr($str, $tmp_pos); 1508 } 1509 } 1510 1511 // The string was already in normal form 1512 return $str; 1513 } 1514 } 1515 1516 ?>
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Wed Oct 2 15:03:47 2013 | Cross-referenced by PHPXref 0.7.1 |