Project

General

Profile

« Previous | Next » 

Revision 504

Added by Matthias over 17 years ago

Fixed a conversion issue with some ISO charsets and sped up conversion of large pages (thanks to Thorn)

View differences:

trunk/wb/framework/frontend.functions.php
84 84
		$foo = preg_replace('/('.$string.')/i', '<span class="highlight">$1</span>',$foo);
85 85
	}
86 86
	$foo = strtr($foo, array("!,,!"=>"&lt;", "!,,,!"=>"&gt;", "!,,,,!"=>"&amp;", "!,,,,,!"=>"&quot;", "!,,,,,,!"=>"&#39;"));
87
	//$foo = umlauts_to_defcharset($foo, 'UTF-8');
87 88
	if(DEFAULT_CHARSET != 'utf-8') {
88
		$foo = umlauts_to_defcharset($foo, 'UTF-8');
89
		$foo = umlauts_to_entities($foo, 'UTF-8');
89 90
	}
90 91
	return $foo;
91 92
}
trunk/wb/framework/functions.php
338 338
	return $subs;
339 339
}
340 340

  
341
// Function as replecement for php's htmlspecialchars()
341
// Function as replacement for php's htmlspecialchars()
342 342
function my_htmlspecialchars($string) {
343 343
	$string = preg_replace("/&(?=[#a-z0-9]+;)/i", "_x_", $string);
344 344
	$string = strtr($string, array("<"=>"&lt;", ">"=>"&gt;", "&"=>"&amp;", "\""=>"&quot;", "\'"=>"&#39;"));
......
613 613
	);
614 614
		
615 615
	if ($in == 'HTML-ENTITIES') {
616
		$string = strtr($string, array('&#039;'=>'&#39;')); // fix a broken entity
617 616
		$string = strtr($string, $named_to_numbered_entities);
618 617
		$string = preg_replace("/&#([0-9]+);/e", "code_to_utf8($1)", $string);
619 618
	}
620 619
	elseif ($out == 'HTML-ENTITIES') {
621
		//$string = preg_replace("/&#([0-9]+);/e", "code_to_utf8($1)", $string);
622 620
		$char = "";
623
		while (strlen($string) > 0) {
624
			preg_match("/^(.)(.*)$/su", $string, $match);
625
			if (strlen($match[1]) > 1) {
626
				$char .= "&#".uniord($match[1]).";";
627
			} else $char .= $match[1];
628
			$string = $match[2];
629
		}
621
		$i=0;
622
		$len=strlen($string);
623
		if($len==0) return $string;
624
		do {
625
			if(ord($string{$i}) <= 127) $ud = $string{$i++};
626
			elseif(ord($string{$i}) <= 223) $ud = (ord($string{$i++})-192)*64 + (ord($string{$i++})-128);
627
			elseif(ord($string{$i}) <= 239) $ud = (ord($string{$i++})-224)*4096 + (ord($string{$i++})-128)*64 + (ord($string{$i++})-128);
628
			elseif(ord($string{$i}) <= 247) $ud = (ord($string{$i++})-240)*262144 + (ord($string{$i++})-128)*4096 + (ord($string{$i++})-128)*64 + (ord($string{$i++})-128);
629
			elseif(ord($string{$i}) <= 251) $ud = ord($string{$i++}); // error!
630
			if($ud > 127) {
631
				$char .= "&#$ud;";
632
			} else {
633
				$char .= $ud;
634
			}
635
		} while($i < $len);
630 636
		$string = $char;
631 637
		$string = strtr($string, $numbered_to_named_entities);
632 638
		// do ' and "
......
636 642
}
637 643

  
638 644
// support-function for string_decode_encode_entities()
639
function uniord($c) {
640
        $ud = 0;
641
        if (ord($c{0}) >= 0 && ord($c{0}) <= 127) $ud = ord($c{0});
642
        if (ord($c{0}) >= 192 && ord($c{0}) <= 223) $ud = (ord($c{0})-192)*64 + (ord($c{1})-128);
643
        if (ord($c{0}) >= 224 && ord($c{0}) <= 239) $ud = (ord($c{0})-224)*4096 + (ord($c{1})-128)*64 + (ord($c{2})-128);
644
        if (ord($c{0}) >= 240 && ord($c{0}) <= 247) $ud = (ord($c{0})-240)*262144 + (ord($c{1})-128)*4096 + (ord($c{2})-128)*64 + (ord($c{3})-128);
645
        if (ord($c{0}) >= 248 && ord($c{0}) <= 251) $ud = (ord($c{0})-248)*16777216 + (ord($c{1})-128)*262144 + (ord($c{2})-128)*4096 + (ord($c{3})-128)*64 + (ord($c{4})-128);
646
        if (ord($c{0}) >= 252 && ord($c{0}) <= 253) $ud = (ord($c{0})-252)*1073741824 + (ord($c{1})-128)*16777216 + (ord($c{2})-128)*262144 + (ord($c{3})-128)*4096 + (ord($c{4})-128)*64 + (ord($c{5})-128);
647
        if (ord($c{0}) >= 254 && ord($c{0}) <= 255) $ud = false; // error
648
        return $ud;
649
}
650
// support-function for mb_convert_encoding_wrapper()
651 645
function code_to_utf8($num) {
652 646
	if ($num <= 0x7F) {
653 647
		return chr($num);
......
706 700
}
707 701

  
708 702
// Function to convert a string from mixed html-entities/umlauts to pure $charset_out-umlauts
709
function entities_to_umlauts($string, $charset_out=DEFAULT_CHARSET, $convert_htmlspecialchars=0) {
703
function entities_to_umlauts($string, $charset_out=DEFAULT_CHARSET) {
710 704
	$charset_out = strtoupper($charset_out);
711 705
	if ($charset_out == '') { $charset_out = 'ISO-8859-1'; }
712 706
	$charset_in = strtoupper(DEFAULT_CHARSET);
713
	
714
	// string to utf-8
715
	if ($charset_in == 'ISO-8859-1' || $charset_in == 'UTF-8') {
707
	require_once(WB_PATH.'/framework/charsets_table.php');
708
	global $iso_8859_2_to_utf8, $iso_8859_3_to_utf8, $iso_8859_4_to_utf8, $iso_8859_5_to_utf8, $iso_8859_6_to_utf8, $iso_8859_7_to_utf8, $iso_8859_8_to_utf8, $iso_8859_9_to_utf8, $iso_8859_10_to_utf8, $iso_8859_11_to_utf8;
709
	global $utf8_to_iso_8859_2, $utf8_to_iso_8859_3, $utf8_to_iso_8859_4, $utf8_to_iso_8859_5, $utf8_to_iso_8859_6, $utf8_to_iso_8859_7, $utf8_to_iso_8859_8, $utf8_to_iso_8859_9, $utf8_to_iso_8859_10, $utf8_to_iso_8859_11;
710

  
711
	// string to utf-8, entities_to_utf8
712
	if (substr($charset_in,0,8) == 'ISO-8859' || $charset_in == 'UTF-8') {
716 713
		if ($charset_in == 'ISO-8859-1') {
717 714
			$string=utf8_encode($string);
715
		} elseif ($charset_in == 'ISO-8859-2') {
716
			$string = strtr($string, $iso_8859_2_to_utf8);
717
		} elseif ($charset_in == 'ISO-8859-3') {
718
			$string = strtr($string, $iso_8859_3_to_utf8);
719
		} elseif ($charset_in == 'ISO-8859-4') {
720
			$string = strtr($string, $iso_8859_4_to_utf8);
721
		} elseif ($charset_in == 'ISO-8859-5') {
722
			$string = strtr($string, $iso_8859_5_to_utf8);
723
		} elseif ($charset_in == 'ISO-8859-6') {
724
			$string = strtr($string, $iso_8859_6_to_utf8);
725
		} elseif ($charset_in == 'ISO-8859-7') {
726
			$string = strtr($string, $iso_8859_7_to_utf8);
727
		} elseif ($charset_in == 'ISO-8859-8') {
728
			$string = strtr($string, $iso_8859_8_to_utf8);
729
		} elseif ($charset_in == 'ISO-8859-9') {
730
			$string = strtr($string, $iso_8859_9_to_utf8);
731
		} elseif ($charset_in == 'ISO-8859-10') {
732
			$string = strtr($string, $iso_8859_10_to_utf8);
733
		} elseif ($charset_in == 'ISO-8859-11') {
734
			$string = strtr($string, $iso_8859_11_to_utf8);
718 735
		}
719 736
		// decode html-entities
720 737
		if(preg_match("/&[#a-zA-Z0-9]+;/", $string)) {
721 738
			$string=string_decode_encode_entities($string, 'UTF-8', 'HTML-ENTITIES');
722
			//$string=mb_convert_encoding_wrapper($string, 'HTML-ENTITIES', 'UTF-8');
739
			//$string=mb_convert_encoding_wrapper($string, 'HTML-ENTITIES', 'UTF-8'); // alternative to string_decode_encode_entities()
723 740
			//$string=mb_convert_encoding_wrapper($string, 'UTF-8', 'HTML-ENTITIES');
724 741
		}
725 742
	}
......
729 746
	// string to $charset_out
730 747
	if($charset_out == 'ISO-8859-1') {
731 748
			$string=utf8_decode($string);
749
	} elseif($charset_out == 'ISO-8859-2') {
750
		$string = strtr($string, $utf8_to_iso_8859_2);
751
	} elseif($charset_out == 'ISO-8859-3') {
752
		$string = strtr($string, $utf8_to_iso_8859_3);
753
	} elseif($charset_out == 'ISO-8859-4') {
754
		$string = strtr($string, $utf8_to_iso_8859_4);
755
	} elseif($charset_out == 'ISO-8859-5') {
756
		$string = strtr($string, $utf8_to_iso_8859_5);
757
	} elseif($charset_out == 'ISO-8859-6') {
758
		$string = strtr($string, $utf8_to_iso_8859_6);
759
	} elseif($charset_out == 'ISO-8859-7') {
760
		$string = strtr($string, $utf8_to_iso_8859_7);
761
	} elseif($charset_out == 'ISO-8859-8') {
762
		$string = strtr($string, $utf8_to_iso_8859_8);
763
	} elseif($charset_out == 'ISO-8859-9') {
764
		$string = strtr($string, $utf8_to_iso_8859_9);
765
	} elseif($charset_out == 'ISO-8859-10') {
766
		$string = strtr($string, $utf8_to_iso_8859_10);
767
	} elseif($charset_out == 'ISO-8859-11') {
768
		$string = strtr($string, $utf8_to_iso_8859_11);
769
	} elseif($charset_out != 'UTF-8') {
770
		if(is_UTF8($string)) {
771
			$string=mb_convert_encoding_wrapper($string, $charset_out, 'UTF-8');
772
		}
732 773
	}
733
	elseif($charset_out != 'UTF-8' && is_UTF8($string)) {
734
		$string=mb_convert_encoding_wrapper($string, $charset_out, 'UTF-8');
735
	}
736 774
	return $string;
737 775
}	
738 776

  
739 777
// Function to convert a string from mixed html-entitites/$charset_in-umlauts to pure html-entities
740
function umlauts_to_entities($string, $charset_in=DEFAULT_CHARSET, $convert_htmlspecialchars=0) {
778
function umlauts_to_entities($string, $charset_in=DEFAULT_CHARSET) {
741 779
	$charset_in = strtoupper($charset_in);
742 780
	if ($charset_in == "") { $charset_in = 'ISO-8859-1'; }
781
	require_once(WB_PATH.'/framework/charsets_table.php');
782
	global $iso_8859_2_to_utf8, $iso_8859_3_to_utf8, $iso_8859_4_to_utf8, $iso_8859_5_to_utf8, $iso_8859_6_to_utf8, $iso_8859_7_to_utf8, $iso_8859_8_to_utf8, $iso_8859_9_to_utf8, $iso_8859_10_to_utf8, $iso_8859_11_to_utf8;
743 783

  
744
	// string to utf-8
745
	if ($charset_in == 'ISO-8859-1' || $charset_in == 'UTF-8') {
784
	// string to utf-8, umlauts_to_entities
785
	if ($charset_in == 'UTF-8' || substr($charset_in,0,8) == 'ISO-8859') {
746 786
		if ($charset_in == 'ISO-8859-1') {
747 787
			$string=utf8_encode($string);
788
		} elseif ($charset_in == 'ISO-8859-2') {
789
			$string = strtr($string, $iso_8859_2_to_utf8);
790
		} elseif ($charset_in == 'ISO-8859-3') {
791
			$string = strtr($string, $iso_8859_3_to_utf8);
792
		} elseif ($charset_in == 'ISO-8859-4') {
793
			$string = strtr($string, $iso_8859_4_to_utf8);
794
		} elseif ($charset_in == 'ISO-8859-5') {
795
			$string = strtr($string, $iso_8859_5_to_utf8);
796
		} elseif ($charset_in == 'ISO-8859-6') {
797
			$string = strtr($string, $iso_8859_6_to_utf8);
798
		} elseif ($charset_in == 'ISO-8859-7') {
799
			$string = strtr($string, $iso_8859_7_to_utf8);
800
		} elseif ($charset_in == 'ISO-8859-8') {
801
			$string = strtr($string, $iso_8859_8_to_utf8);
802
		} elseif ($charset_in == 'ISO-8859-9') {
803
			$string = strtr($string, $iso_8859_9_to_utf8);
804
		} elseif ($charset_in == 'ISO-8859-10') {
805
			$string = strtr($string, $iso_8859_10_to_utf8);
806
		} elseif ($charset_in == 'ISO-8859-11') {
807
			$string = strtr($string, $iso_8859_11_to_utf8);
748 808
		}
749 809
		// encode html-entities
750
		$string=string_decode_encode_entities($string, 'HTML-ENTITIES', 'UTF-8'); // this is very slow!
810
		$string=string_decode_encode_entities($string, 'HTML-ENTITIES', 'UTF-8');
751 811
		//$string=mb_convert_encoding_wrapper($string, 'HTML-ENTITIES', 'UTF-8');
752 812
	}
753 813
	else {
......
764 824
function umlauts_to_defcharset($string, $charset) {
765 825
		$charset_out = strtoupper(DEFAULT_CHARSET);
766 826
		if ($charset_out == "") { $charset_out = 'ISO-8859-1'; }
827
		require_once(WB_PATH.'/framework/charsets_table.php');
828
		global $utf8_to_iso_8859_2, $utf8_to_iso_8859_3, $utf8_to_iso_8859_4, $utf8_to_iso_8859_5, $utf8_to_iso_8859_6, $utf8_to_iso_8859_7, $utf8_to_iso_8859_8, $utf8_to_iso_8859_9, $utf8_to_iso_8859_10, $utf8_to_iso_8859_11;
767 829
		
768 830
		if($charset_out == $charset) {
769 831
			return $string;
770 832
		}
771
		if($charset_out == 'ISO-8859-1' && $charset == 'UTF-8') {
772
			$string = utf8_decode($string);
833

  
834
		if($charset == 'UTF-8') {
835
			if($charset_out == 'ISO-8859-1') {
836
				$string = utf8_decode($string);
837
			} elseif ($charset_out == 'ISO-8859-2') {
838
				$string = strtr($string, $utf8_to_iso_8859_2);
839
			} elseif ($charset_out == 'ISO-8859-3') {
840
				$string = strtr($string, $utf8_to_iso_8859_3);
841
			} elseif ($charset_out == 'ISO-8859-4') {
842
				$string = strtr($string, $utf8_to_iso_8859_4);
843
			} elseif ($charset_out == 'ISO-8859-5') {
844
				$string = strtr($string, $utf8_to_iso_8859_5);
845
			} elseif ($charset_out == 'ISO-8859-6') {
846
				$string = strtr($string, $utf8_to_iso_8859_6);
847
			} elseif ($charset_out == 'ISO-8859-7') {
848
				$string = strtr($string, $utf8_to_iso_8859_7);
849
			} elseif ($charset_out == 'ISO-8859-8') {
850
				$string = strtr($string, $utf8_to_iso_8859_8);
851
			} elseif ($charset_out == 'ISO-8859-9') {
852
				$string = strtr($string, $utf8_to_iso_8859_9);
853
			} elseif ($charset_out == 'ISO-8859-10') {
854
				$string = strtr($string, $utf8_to_iso_8859_10);
855
			} elseif ($charset_out == 'ISO-8859-11') {
856
				$string = strtr($string, $utf8_to_iso_8859_11);
857
			}
858
			else {
859
				$string=mb_convert_encoding_wrapper($string, $charset_out, $charset);
860
			}
773 861
		}
774 862
		else {
775
			$string=mb_convert_encoding_wrapper($string, $charset_out, 'UTF-8');
863
			$string=mb_convert_encoding_wrapper($string, $charset_out, $charset);
776 864
		}
777 865
		
778 866
	return $string;

Also available in: Unified diff