Revision 464
Added by Matthias over 17 years ago
functions.php | ||
---|---|---|
367 | 367 |
// converts $charset_in to $charset_out or |
368 | 368 |
// UTF-8 to HTML-ENTITIES or HTML-ENTITIES to UTF-8 |
369 | 369 |
function mb_convert_encoding_wrapper($string, $charset_out, $charset_in) { |
370 |
if ($charset_out == $charset_in) { |
|
371 |
return $string; |
|
372 |
} |
|
370 | 373 |
// try mb_convert_encoding() first. It can also convert to or from HTML-ENTITIES |
371 | 374 |
if (function_exists('mb_convert_encoding')) { |
372 | 375 |
// there's no GB2312 or ISO-8859-11 encoding in php's mb_* functions |
... | ... | |
561 | 564 |
|
562 | 565 |
if (!is_UTF8($string)) { |
563 | 566 |
$string=mb_convert_encoding_wrapper($string, 'UTF-8', $charset); |
564 |
} else { |
|
565 | 567 |
} |
566 |
|
|
567 |
// check if we really get UTF-8. We don't get UTF-8 if charset is ISO-8859-11 or GB2312 and mb_string AND iconv aren't available.
|
|
568 |
// check if we really get UTF-8. We don't get UTF-8 if charset is ISO-8859-6 or ISO-2022-JP/KR |
|
569 |
// and mb_string AND iconv aren't available. |
|
568 | 570 |
if (is_UTF8($string)) { |
569 | 571 |
$string=mb_convert_encoding_wrapper($string, 'HTML-ENTITIES', 'UTF-8'); |
570 | 572 |
$string=mb_convert_encoding_wrapper($string, 'UTF-8', 'HTML-ENTITIES'); |
571 | 573 |
} else { |
574 |
// nothing we can do here :-( |
|
572 | 575 |
} |
573 | 576 |
return($string); |
574 | 577 |
} |
575 | 578 |
|
576 | 579 |
// function to check if a string is UTF-8 |
577 |
function is_UTF8 ($string) { |
|
578 |
return preg_match('%^(?:[\x09\x0A\x0D\x20-\x7E]|[\xC2-\xDF][\x80-\xBF]|\xE0[\xA0-\xBF][\x80-\xBF]|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}|\xED[\x80-\x9F][\x80-\xBF]|\xF0[\x90-\xBF][\x80-\xBF]{2}|[\xF1-\xF3][\x80-\xBF]{3}|\xF4[\x80-\x8F][\x80-\xBF]{2})*$%xs', $string); |
|
580 |
function is_UTF8 ($str) { |
|
581 |
if (strlen($str) < 4000) { |
|
582 |
// see http://bugs.php.net/bug.php?id=24460 and http://bugs.php.net/bug.php?id=27070 and http://ilia.ws/archives/5-Top-10-ways-to-crash-PHP.html for this. |
|
583 |
// 4000 works for me ... |
|
584 |
return preg_match('/^(?:[\x09\x0A\x0D\x20-\x7E]|[\xC2-\xDF][\x80-\xBF]|\xE0[\xA0-\xBF][\x80-\xBF]|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}|\xED[\x80-\x9F][\x80-\xBF]|\xF0[\x90-\xBF][\x80-\xBF]{2}|[\xF1-\xF3][\x80-\xBF]{3}|\xF4[\x80-\x8F][\x80-\xBF]{2})*$/s', $str); |
|
585 |
} else { |
|
586 |
$isUTF8 = true; |
|
587 |
while($str{0}) { |
|
588 |
if (preg_match("/^[\x09\x0A\x0D\x20-\x7E]/", $str)) { $str = substr($str, 1); continue; } |
|
589 |
if (preg_match("/^[\xC2-\xDF][\x80-\xBF]/", $str)) { $str = substr($str, 2); continue; } |
|
590 |
if (preg_match("/^\xE0[\xA0-\xBF][\x80-\xBF]/", $str)) { $str = substr($str, 3); continue; } |
|
591 |
if (preg_match("/^[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}/", $str)) { $str = substr($str, 3); continue; } |
|
592 |
if (preg_match("/^\xED[\x80-\x9F][\x80-\xBF]/", $str)) { $str = substr($str, 3); continue; } |
|
593 |
if (preg_match("/^\xF0[\x90-\xBF][\x80-\xBF]{2}/", $str)) { $str = substr($str, 4); continue; } |
|
594 |
if (preg_match("/^[\xF1-\xF3][\x80-\xBF]{3}/", $str)) { $str = substr($str, 4); continue; } |
|
595 |
if (preg_match("/^\xF4[\x80-\x8F][\x80-\xBF]{2}/", $str)) { $str = substr($str, 4); continue; } |
|
596 |
if (preg_match("/^$/", $str)) { break; } |
|
597 |
$isUTF8 = false; |
|
598 |
break; |
|
599 |
} |
|
600 |
return ($isUTF8); |
|
601 |
} |
|
579 | 602 |
} |
580 | 603 |
|
581 | 604 |
// Function to convert a string from mixed html-entities/umlauts to pure $charset_out-umlauts |
Also available in: Unified diff
Fixed some possible page crashes which are caused by PHP when is_UTF8() runs its regular expression on long strings; the string length is now checked with strlen first (see http://bugs.php.net/bug.php?id=24460 and http://bugs.php.net/bug.php?id=27070 and http://ilia.ws/archives/5-Top-10-ways-to-crash-PHP.html for this). Thanks to Thorn.