Project

General

Profile

« Previous | Next » 

Revision 465

Added by Matthias over 17 years ago

Also applied changeset [464] to the branches

View differences:

functions.php
367 367
// converts $charset_in to $charset_out or 
368 368
// UTF-8 to HTML-ENTITIES or HTML-ENTITIES to UTF-8
369 369
function mb_convert_encoding_wrapper($string, $charset_out, $charset_in) {
370
	if ($charset_out == $charset_in) {
371
		return $string;
372
	}
370 373
	// try mb_convert_encoding(). This can handle to or from HTML-ENTITIES, too
371 374
	if (function_exists('mb_convert_encoding')) {
372 375
		// there's no GB2312 or ISO-8859-11 encoding in php's mb_* functions
......
561 564

  
562 565
	if (!is_UTF8($string)) {
563 566
		$string=mb_convert_encoding_wrapper($string, 'UTF-8', $charset);
564
	} else {
565 567
	}
566

  
567
	// check if we really get UTF-8. We don't get UTF-8 if charset is ISO-8859-11 or GB2312 and mb_string AND iconv aren't available.
568
	// check if we really get UTF-8. We don't get UTF-8 if charset is ISO-8859-6 or ISO-2022-JP/KR
569
	// and mb_string AND iconv aren't available.
568 570
	if (is_UTF8($string)) {
569 571
		$string=mb_convert_encoding_wrapper($string, 'HTML-ENTITIES', 'UTF-8');
570 572
		$string=mb_convert_encoding_wrapper($string, 'UTF-8', 'HTML-ENTITIES');
571 573
	} else {
574
		// nothing we can do here :-(
572 575
	}
573 576
	return($string);
574 577
}
575 578

  
576 579
// function to check if a string is UTF-8
577
function is_UTF8 ($string) {
578
	return preg_match('%^(?:[\x09\x0A\x0D\x20-\x7E]|[\xC2-\xDF][\x80-\xBF]|\xE0[\xA0-\xBF][\x80-\xBF]|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}|\xED[\x80-\x9F][\x80-\xBF]|\xF0[\x90-\xBF][\x80-\xBF]{2}|[\xF1-\xF3][\x80-\xBF]{3}|\xF4[\x80-\x8F][\x80-\xBF]{2})*$%xs', $string);
580
function is_UTF8 ($str) {
581
	if (strlen($str) < 4000) {
582
		// see http://bugs.php.net/bug.php?id=24460 and http://bugs.php.net/bug.php?id=27070 and http://ilia.ws/archives/5-Top-10-ways-to-crash-PHP.html for this.
583
		// 4000 works for me ...
584
		return preg_match('/^(?:[\x09\x0A\x0D\x20-\x7E]|[\xC2-\xDF][\x80-\xBF]|\xE0[\xA0-\xBF][\x80-\xBF]|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}|\xED[\x80-\x9F][\x80-\xBF]|\xF0[\x90-\xBF][\x80-\xBF]{2}|[\xF1-\xF3][\x80-\xBF]{3}|\xF4[\x80-\x8F][\x80-\xBF]{2})*$/s', $str);
585
	}	else {
586
		$isUTF8 = true;
587
		while($str{0}) {
588
			if (preg_match("/^[\x09\x0A\x0D\x20-\x7E]/", $str)) { $str = substr($str, 1); continue; }
589
			if (preg_match("/^[\xC2-\xDF][\x80-\xBF]/", $str)) { $str = substr($str, 2); continue; }
590
			if (preg_match("/^\xE0[\xA0-\xBF][\x80-\xBF]/", $str)) { $str = substr($str, 3); continue; }
591
			if (preg_match("/^[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}/", $str)) { $str = substr($str, 3); continue; }
592
			if (preg_match("/^\xED[\x80-\x9F][\x80-\xBF]/", $str)) { $str = substr($str, 3); continue; }
593
			if (preg_match("/^\xF0[\x90-\xBF][\x80-\xBF]{2}/", $str)) { $str = substr($str, 4); continue; }
594
			if (preg_match("/^[\xF1-\xF3][\x80-\xBF]{3}/", $str)) { $str = substr($str, 4); continue; }
595
			if (preg_match("/^\xF4[\x80-\x8F][\x80-\xBF]{2}/", $str)) { $str = substr($str, 4); continue; }
596
			if (preg_match("/^$/", $str)) { break; }
597
			$isUTF8 = false;
598
			break;
599
		}
600
		return ($isUTF8);
601
	}
579 602
}
580 603

  
581 604
// Function to convert a string from mixed html-entities/umlauts to pure $charset_out-umlauts

Also available in: Unified diff