| 
      1
     | 
    
      <?php
 
     | 
  
  
    | 
      2
     | 
    
      
 
     | 
  
  
    | 
      3
     | 
    
      // $Id: functions-utf8.php 2009 2013-11-27 15:03:43Z darkviper $
 
     | 
  
  
    | 
      4
     | 
    
      
 
     | 
  
  
    | 
      5
     | 
    
      /*
 
     | 
  
  
    | 
      6
     | 
    
      
 
     | 
  
  
    | 
      7
     | 
    
       Website Baker Project <http://www.websitebaker.org/>
 
     | 
  
  
    | 
      8
     | 
    
       Copyright (C) 2004-2009, Ryan Djurovich
 
     | 
  
  
    | 
      9
     | 
    
      
 
     | 
  
  
    | 
      10
     | 
    
       Website Baker is free software; you can redistribute it and/or modify
 
     | 
  
  
    | 
      11
     | 
    
       it under the terms of the GNU General Public License as published by
 
     | 
  
  
    | 
      12
     | 
    
       the Free Software Foundation; either version 2 of the License, or
 
     | 
  
  
    | 
      13
     | 
    
       (at your option) any later version.
 
     | 
  
  
    | 
      14
     | 
    
      
 
     | 
  
  
    | 
      15
     | 
    
       Website Baker is distributed in the hope that it will be useful,
 
     | 
  
  
    | 
      16
     | 
    
       but WITHOUT ANY WARRANTY; without even the implied warranty of
 
     | 
  
  
    | 
      17
     | 
    
       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 
     | 
  
  
    | 
      18
     | 
    
       GNU General Public License for more details.
 
     | 
  
  
    | 
      19
     | 
    
      
 
     | 
  
  
    | 
      20
     | 
    
       You should have received a copy of the GNU General Public License
 
     | 
  
  
    | 
      21
     | 
    
       along with Website Baker; if not, write to the Free Software
 
     | 
  
  
    | 
      22
     | 
    
       Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 
     | 
  
  
    | 
      23
     | 
    
      
 
     | 
  
  
    | 
      24
     | 
    
      */
 
     | 
  
  
    | 
      25
     | 
    
      
 
     | 
  
  
    | 
      26
     | 
    
      /*
 
     | 
  
  
    | 
      27
     | 
    
       * A part of this file is based on 'utf8.php' from the DokuWiki-project.
 
     | 
  
  
    | 
      28
     | 
    
       * (http://www.splitbrain.org/projects/dokuwiki):
 
     | 
  
  
    | 
      29
     | 
    
       **
 
     | 
  
  
    | 
      30
     | 
    
       * UTF8 helper functions
 
     | 
  
  
    | 
      31
     | 
    
       * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
 
     | 
  
  
    | 
      32
     | 
    
       * @author     Andreas Gohr <andi@splitbrain.org>
 
     | 
  
  
    | 
      33
     | 
    
       **
 
     | 
  
  
    | 
      34
     | 
    
       * modified for use with Website Baker
 
     | 
  
  
    | 
      35
     | 
    
       * from thorn, Jan. 2008
 
     | 
  
  
    | 
      36
     | 
    
       *
 
     | 
  
  
    | 
      37
     | 
    
       * most of the original functions appeared to be too slow with large strings, so I replaced them with my own ones
 
     | 
  
  
    | 
      38
     | 
    
       * thorn, Mar. 2008
 
     | 
  
  
    | 
      39
     | 
    
       */
 
     | 
  
  
    | 
      40
     | 
    
      
 
     | 
  
  
    | 
      41
     | 
    
      // Functions we use in Website Baker:
 
     | 
  
  
    | 
      42
     | 
    
      //   entities_to_7bit()
 
     | 
  
  
    | 
      43
     | 
    
      //   entities_to_umlauts2()
 
     | 
  
  
    | 
      44
     | 
    
      //   umlauts_to_entities2()
 
     | 
  
  
    | 
      45
     | 
    
      /* -------------------------------------------------------- */
 
     | 
  
  
    | 
      46
     | 
    
      // Must include code to stop this file being accessed directly
 
     | 
  
  
    | 
      47
     | 
    
      if (defined('WB_PATH') === false) {
          // WB_PATH is not defined, i.e. this file was not loaded through the
          // framework: pull in globalExceptionHandler.php and abort by throwing.
          require_once dirname(__FILE__) . '/globalExceptionHandler.php';
          throw new IllegalFileException();
      }
 
     | 
  
  
    | 
      51
     | 
    
      /* -------------------------------------------------------- */
 
     | 
  
  
    | 
      52
     | 
    
      /*
 
     | 
  
  
    | 
      53
     | 
    
       * check for mb_string support
 
     | 
  
  
    | 
      54
     | 
    
       */
 
     | 
  
  
    | 
      55
     | 
    
      //define('UTF8_NOMBSTRING',1); // uncomment this to forbid use of mb_string-functions
     | 
  
  
    | 
      56
     | 
    
      // Decide once whether the mb_* functions are used: 1 when mb_substr()
      // exists and UTF8_NOMBSTRING has not been defined, otherwise 0.
      if (!defined('UTF8_MBSTRING')) {
          define(
              'UTF8_MBSTRING',
              (function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')) ? 1 : 0
          );
      }

      // When mbstring is in use, switch its internal encoding to UTF-8.
      if (UTF8_MBSTRING) {
          mb_internal_encoding('UTF-8');
      }
     | 
  
  
    | 
      65
     | 
    
      
 
     | 
  
  
    | 
      66
     | 
    
      require_once(WB_PATH.'/framework/charsets_table.php');
 
     | 
  
  
    | 
      67
     | 
    
      
 
     | 
  
  
    | 
      68
     | 
    
      /*
       * Tells whether a string is made up of 7bit ASCII bytes only
       *
       * @author thorn
       * @param  string $str  The string to inspect
       * @return bool   false as soon as one byte in the range 0x80-0xFF is found,
       *                true otherwise (including for the empty string)
       */
      function utf8_isASCII($str){
          // one byte with the high bit set is enough to disqualify the string
          $hasHighByte = preg_match('/[\x80-\xFF]/', $str);

          return $hasHighByte ? false : true;
      }
 
     | 
  
  
    | 
      79
     | 
    
      
 
     | 
  
  
    | 
      80
     | 
    
      /*
       * Tries to detect whether a string is structurally valid UTF-8
       *
       * Walks the string byte by byte: every lead byte must be followed by the
       * number of continuation bytes (10bbbbbb) its bit pattern announces.
       * Lead bytes announcing up to 5 continuation bytes are accepted; code point
       * ranges and overlong forms are not checked.
       *
       * @author <bmorel@ssi.fr>
       * @link   http://www.php.net/manual/en/function.utf8-encode.php
       * @param  string $Str  The string to check
       * @return bool   true if every sequence is well-formed, false otherwise
       */
      function utf8_check($Str) {
          $length = strlen($Str);
          $pos = 0;

          while ($pos < $length) {
              $lead = ord($Str[$pos]);

              if ($lead < 0x80) {
                  // 0bbbbbbb - plain ASCII, nothing follows
                  $pos++;
                  continue;
              }

              // how many continuation bytes does this lead byte announce?
              if (($lead & 0xE0) == 0xC0) {
                  $trail = 1; // 110bbbbb
              } elseif (($lead & 0xF0) == 0xE0) {
                  $trail = 2; // 1110bbbb
              } elseif (($lead & 0xF8) == 0xF0) {
                  $trail = 3; // 11110bbb
              } elseif (($lead & 0xFC) == 0xF8) {
                  $trail = 4; // 111110bb
              } elseif (($lead & 0xFE) == 0xFC) {
                  $trail = 5; // 1111110b
              } else {
                  return false; // matches no lead-byte model
              }

              // each announced byte must exist and look like 10bbbbbb
              for ($k = 0; $k < $trail; $k++) {
                  $pos++;
                  if ($pos == $length) {
                      return false;
                  }
                  if ((ord($Str[$pos]) & 0xC0) != 0x80) {
                      return false;
                  }
              }

              $pos++;
          }

          return true;
      }
 
     | 
  
  
    | 
      103
     | 
    
      
 
     | 
  
  
    | 
      104
     | 
    
      /*
       * Romanize a non-latin string
       *
       * Strings that are pure 7bit ASCII are returned untouched; everything else
       * is run through strtr() with the global $UTF8_ROMANIZATION table.
       *
       * @author Andreas Gohr <andi@splitbrain.org>
       * @param  string $string  The string to romanize
       * @return string The translated string
       */
      function utf8_romanize($string){
          global $UTF8_ROMANIZATION;

          if (!utf8_isASCII($string)) {
              return strtr($string, $UTF8_ROMANIZATION);
          }

          // nothing to do
          return $string;
      }
 
     | 
  
  
    | 
      115
     | 
    
      
 
     | 
  
  
    | 
      116
     | 
    
      /*
       * Removes special characters (nonalphanumeric) from a UTF-8 string
       *
       * Besides the characters listed in the global $UTF8_SPECIAL_CHARS2, the
       * bytes \x00-\x19 are stripped as well (the range is exactly what the
       * pattern below spells out, so 0x1A-0x1F are not covered).
       * The quoted special-char list is built on the first call and cached in a
       * static variable, so later changes to $UTF8_SPECIAL_CHARS2 are not seen.
       *
       * @author Andreas Gohr <andi@splitbrain.org>
       * @param  string $string     The UTF8 string to strip of special chars
       * @param  string $repl       Replace special with this string
       * @param  string $additional Additional chars to strip (inserted verbatim
       *                            into a regexp char class, so it must already
       *                            be escaped for that context)
       * @return string|null        Result of preg_replace()
       */
      function utf8_stripspecials($string,$repl='',$additional=''){
          global $UTF8_SPECIAL_CHARS2;

          static $specials = null;
          if ($specials === null) {
              $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
          }

          $pattern = '/[' . $additional . '\x00-\x19' . $specials . ']/u';

          return preg_replace($pattern, $repl, $string);
      }
 
     | 
  
  
    | 
      137
     | 
    
      
 
     | 
  
  
    | 
      138
     | 
    
      /*
 
     | 
  
  
    | 
      139
     | 
    
       * added functions - thorn
 
     | 
  
  
    | 
      140
     | 
    
       */
 
     | 
  
  
    | 
      141
     | 
    
      
 
     | 
  
  
    | 
      142
     | 
    
      /*
       * faster replacement for utf8_entities_to_umlauts()
       * not all features of utf8_entities_to_umlauts() --> utf8_unhtml() are supported!
       *
       * Decodes HTML entities in a UTF-8 string into the UTF-8 characters they stand for.
       *
       * With mbstring (UTF8_MBSTRING): the entities &amp; &gt; &lt; &quot; &#39;
       * &#039; and &nbsp; are kept as entities; everything else is decoded by
       * mb_convert_encoding().
       * Without mbstring: named entities are mapped to numeric ones through the
       * global $named_entities / $numbered_entities tables, then every decimal
       * numeric entity is encoded with code_to_utf8().
       *
       * @author thorn
       * @param  string $str  UTF-8 string that may contain HTML entities
       * @return string|null  The decoded string (null if the regex engine fails)
       */
      function utf8_fast_entities_to_umlauts($str) {
          if (UTF8_MBSTRING) {
              // Escape the markup-significant entities one more level so that the
              // decode pass below turns them back into entities instead of raw
              // characters.
              $str = strtr($str, array(
                  '&amp;'  => '&amp;amp;',
                  '&gt;'   => '&amp;gt;',
                  '&lt;'   => '&amp;lt;',
                  '&quot;' => '&amp;quot;',
                  '&#39;'  => '&amp;#39;',
                  '&#039;' => '&amp;#039;',
                  '&nbsp;' => '&amp;nbsp;',
              ));

              // Two passes on purpose (work-around noted by the original author):
              // decoding HTML-ENTITIES directly did not work for input that
              // already contains raw UTF-8 characters next to entities. So first
              // turn every raw character into an entity, then decode all of them.
              // NOTE(review): the 'HTML-ENTITIES' pseudo-encoding is deprecated
              // as of PHP 8.2 - plan a migration to html_entity_decode().
              $allEntities = mb_convert_encoding($str, 'HTML-ENTITIES', 'UTF-8');
              return mb_convert_encoding($allEntities, 'UTF-8', 'HTML-ENTITIES');
          }

          global $named_entities, $numbered_entities;

          $str = str_replace($named_entities, $numbered_entities, $str);

          // The /e modifier no longer exists (removed in PHP 7.0); use a callback.
          // The explicit int cast also makes "&#039;" decimal 39 instead of an
          // (invalid) octal literal.
          return preg_replace_callback(
              '/&#([0-9]+);/',
              function ($match) {
                  return code_to_utf8((int) $match[1]);
              },
              $str
          );
      }
 
     | 
  
  
    | 
      162
     | 
    
      /*
       * support-function for utf8_fast_entities_to_umlauts()
       *
       * Encodes a Unicode code point as a UTF-8 byte sequence.
       *
       * @param  int $num  The code point
       * @return string    1 to 4 bytes of UTF-8, or "?" when $num cannot be
       *                   encoded as valid UTF-8 (negative, a UTF-16 surrogate
       *                   U+D800-U+DFFF, or above U+10FFFF)
       */
      function code_to_utf8($num) {
          // Values outside the Unicode scalar range would yield byte sequences
          // that are not valid UTF-8, so they get the same "?" as other failures.
          if ($num < 0 || $num > 0x10FFFF || ($num >= 0xD800 && $num <= 0xDFFF)) {
              return "?";
          }

          if ($num <= 0x7F) {
              // 0bbbbbbb
              return chr($num);
          }
          if ($num <= 0x7FF) {
              // 110bbbbb 10bbbbbb
              return chr(($num >> 6) + 192) . chr(($num & 63) + 128);
          }
          if ($num <= 0xFFFF) {
              // 1110bbbb 10bbbbbb 10bbbbbb
              return chr(($num >> 12) + 224) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
          }

          // 11110bbb 10bbbbbb 10bbbbbb 10bbbbbb
          return chr(($num >> 18) + 240) . chr((($num >> 12) & 63) + 128) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
      }
 
     | 
  
  
    | 
      175
     | 
    
      
 
     | 
  
  
    | 
      176
     | 
    
      /*
       * faster replacement for utf8_umlauts_to_entities()
       * not all features of utf8_umlauts_to_entities() --> utf8_tohtml() are supported!
       *
       * Replaces every non-ASCII character of a UTF-8 string by an HTML entity.
       *
       * With mbstring (UTF8_MBSTRING) the work is delegated to
       * mb_convert_encoding() and $named_entities is not consulted.
       * Without mbstring each multi-byte sequence becomes a decimal numeric
       * entity; if $named_entities is true these are then mapped to named
       * entities through the global $numbered_entities / $named_entities tables.
       * Bytes that do not form a complete sequence (stray continuation byte,
       * invalid lead byte, truncated or broken sequence) are emitted one by one
       * as "&#<byte value>;".
       *
       * @author thorn
       * @param  string $string          UTF-8 encoded input
       * @param  bool   $named_entities  Prefer named over numeric entities
       *                                 (fallback path only)
       * @return string The string with non-ASCII characters replaced by entities
       */
      function utf8_fast_umlauts_to_entities($string, $named_entities=true) {
          if (UTF8_MBSTRING) {
              // NOTE(review): the 'HTML-ENTITIES' pseudo-encoding is deprecated
              // as of PHP 8.2 - plan a migration.
              return mb_convert_encoding($string, 'HTML-ENTITIES', 'UTF-8');
          }

          $len = strlen($string);
          if ($len == 0) {
              return $string;
          }

          $new = '';
          $i = 0;
          while ($i < $len) {
              $lead = ord($string[$i]);

              if ($lead <= 0x7F) {
                  // ASCII is copied verbatim (never compared numerically: on
                  // PHP 8 a test like "a" > 127 is a string comparison and true)
                  $new .= $string[$i];
                  $i++;
                  continue;
              }

              // number of continuation bytes announced by the lead byte
              if ($lead >= 0xC0 && $lead <= 0xDF) {
                  $extra = 1;
              } elseif ($lead >= 0xE0 && $lead <= 0xEF) {
                  $extra = 2;
              } elseif ($lead >= 0xF0 && $lead <= 0xF7) {
                  $extra = 3;
              } else {
                  $extra = 0; // stray continuation byte or invalid lead byte
              }

              // all announced bytes must exist and be of the form 10bbbbbb
              $complete = ($extra > 0 && $i + $extra < $len);
              for ($k = 1; $complete && $k <= $extra; $k++) {
                  if ((ord($string[$i + $k]) & 0xC0) != 0x80) {
                      $complete = false;
                  }
              }

              if (!$complete) {
                  // error: emit the offending byte on its own and resynchronise
                  $new .= '&#' . $lead . ';';
                  $i++;
                  continue;
              }

              // payload bits of the lead byte: 5, 4 or 3 of them
              $code = $lead & (0x3F >> $extra);
              for ($k = 1; $k <= $extra; $k++) {
                  $code = ($code << 6) | (ord($string[$i + $k]) & 0x3F);
              }
              $new .= '&#' . $code . ';';
              $i += $extra + 1;
          }

          // The lookup tables are read through $GLOBALS: a "global $named_entities"
          // statement would rebind (and thereby discard) the parameter of the
          // same name, making it impossible to switch named entities off.
          if ($named_entities && isset($GLOBALS['numbered_entities'], $GLOBALS['named_entities'])) {
              $new = str_replace($GLOBALS['numbered_entities'], $GLOBALS['named_entities'], $new);
          }

          return $new;
      }
 
     | 
  
  
    | 
      208
     | 
    
      
 
     | 
  
  
    | 
      209
     | 
    
      /*
       * Converts from various charsets to UTF-8
       *
       * Will convert a string from various charsets to UTF-8.
       * HTML-entities may be converted, too.
       * In case of error the returned string is unchanged, and a message is emitted.
       *
       * Conversion is attempted in this order:
       *  - UTF-8 or pure 7bit ASCII input: no charset conversion needed
       *  - ISO-8859-1: utf8_encode(); ISO-8859-2 ... ISO-8859-11: the global
       *    $iso_8859_N_to_utf8 translation tables
       *  - mb_convert_encoding() if UTF8_MBSTRING is set (never used for GB2312)
       *  - iconv(), if available
       *
       * @param  string $str              A string in supported encoding
       * @param  string $charset_in       The charset to convert from, defaults to
       *                                  DEFAULT_CHARSET; an empty value is treated
       *                                  as UTF-8
       * @param  bool   $decode_entities  Whether HTML-entities are decoded as well
       * @return string  A string in UTF-8-encoding, with all entities decoded, too
       *                 (if requested). String is unchanged in case of error.
       * @author thorn
       */
      function charset_to_utf8($str, $charset_in=DEFAULT_CHARSET, $decode_entities=true) {
          global $iso_8859_2_to_utf8, $iso_8859_3_to_utf8, $iso_8859_4_to_utf8, $iso_8859_5_to_utf8, $iso_8859_6_to_utf8, $iso_8859_7_to_utf8, $iso_8859_8_to_utf8, $iso_8859_9_to_utf8, $iso_8859_10_to_utf8, $iso_8859_11_to_utf8;

          $charset_in = strtoupper($charset_in);
          if ($charset_in == "") {
              $charset_in = 'UTF-8';
          }

          $have_iconv = function_exists('iconv');
          $needs_library = ($charset_in == 'BIG5' || $charset_in == 'ISO-2022-JP' || $charset_in == 'ISO-2022-KR');

          if ((!$have_iconv && !UTF8_MBSTRING && $needs_library) || (!$have_iconv && $charset_in == 'GB2312')) {
              // Nothing we can do here: the charset needs mb_convert_encoding()
              // or iconv() (GB2312: iconv() only) and neither is available.
              trigger_error("Can't convert from $charset_in without mb_convert_encoding() or iconv(). Use UTF-8 instead.", E_USER_WARNING);
              return $str;
          }

          // UTF-8 or plain ASCII: at most the HTML-entities need replacing
          if ($charset_in == 'UTF-8' || utf8_isASCII($str)) {
              if ($decode_entities && preg_match('/&[#0-9a-zA-Z]+;/', $str)) {
                  return utf8_fast_entities_to_umlauts($str);
              }
              return $str;
          }

          $converted = false; // $str now holds UTF-8
          $attempted = false; // a conversion library was tried (and may have failed)

          // 1. charsets we can handle ourselves
          switch ($charset_in) {
              case 'ISO-8859-1':
                  // NOTE(review): utf8_encode() is deprecated as of PHP 8.2
                  $str = utf8_encode($str);
                  $converted = true;
                  break;
              case 'ISO-8859-2':  $str = strtr($str, $iso_8859_2_to_utf8);  $converted = true; break;
              case 'ISO-8859-3':  $str = strtr($str, $iso_8859_3_to_utf8);  $converted = true; break;
              case 'ISO-8859-4':  $str = strtr($str, $iso_8859_4_to_utf8);  $converted = true; break;
              case 'ISO-8859-5':  $str = strtr($str, $iso_8859_5_to_utf8);  $converted = true; break;
              case 'ISO-8859-6':  $str = strtr($str, $iso_8859_6_to_utf8);  $converted = true; break;
              case 'ISO-8859-7':  $str = strtr($str, $iso_8859_7_to_utf8);  $converted = true; break;
              case 'ISO-8859-8':  $str = strtr($str, $iso_8859_8_to_utf8);  $converted = true; break;
              case 'ISO-8859-9':  $str = strtr($str, $iso_8859_9_to_utf8);  $converted = true; break;
              case 'ISO-8859-10': $str = strtr($str, $iso_8859_10_to_utf8); $converted = true; break;
              case 'ISO-8859-11': $str = strtr($str, $iso_8859_11_to_utf8); $converted = true; break;
          }

          // 2. mb_convert_encoding() - but there's no GB2312 encoding in php's mb_* functions
          if (!$converted && UTF8_MBSTRING && $charset_in != 'GB2312') {
              $attempted = true;
              try {
                  $result = mb_convert_encoding($str, 'UTF-8', $charset_in);
              } catch (\ValueError $e) {
                  // PHP 8 throws for an encoding name mbstring does not know
                  $result = false;
              }
              if (is_string($result)) {
                  $str = $result;
                  $converted = true;
              }
          }

          // 3. iconv() - also the fallback when mbstring could not do it
          if (!$converted && $have_iconv) {
              $attempted = true;
              $result = iconv($charset_in, 'UTF-8', $str);
              // iconv() returns false on failure; never hand that to the caller
              if ($result !== false) {
                  $str = $result;
                  $converted = true;
              }
          }

          if ($converted) {
              // we have utf-8, now replace HTML-entities and return
              if ($decode_entities && preg_match('/&[#0-9a-zA-Z]+;/', $str)) {
                  $str = utf8_fast_entities_to_umlauts($str);
              }
              return $str;
          }

          // Nothing we can do here: emit an error-message and return the
          // string unchanged.
          if ($attempted) {
              trigger_error("Can't convert from $charset_in to UTF-8: conversion failed. Use UTF-8 instead.", E_USER_WARNING);
          } else {
              trigger_error("Can't convert from $charset_in without mb_convert_encoding() or iconv(). Use UTF-8 instead.", E_USER_WARNING);
          }

          return $str;
      }
 
     | 
  
  
    | 
      297
     | 
    
      
 
     | 
  
  
    | 
      298
     | 
    
      /*
 
     | 
  
  
    | 
      299
     | 
    
       * Converts from UTF-8 to various charsets
 
     | 
  
  
    | 
      300
     | 
    
       *
 
     | 
  
  
    | 
      301
     | 
    
       * Will convert a string from UTF-8 to various charsets.
 
     | 
  
  
    | 
      302
     | 
    
       * HTML-entities will not! be converted.
 
     | 
  
  
    | 
      303
     | 
    
       * In case of error the returned string is unchanged, and a message is emitted.
 
     | 
  
  
    | 
      304
     | 
    
       * Supported charsets are:
 
     | 
  
  
    | 
      305
     | 
    
       * direct: iso_8859_1 iso_8859_2 iso_8859_3 iso_8859_4 iso_8859_5
 
     | 
  
  
    | 
      306
     | 
    
       *         iso_8859_6 iso_8859_7 iso_8859_8 iso_8859_9 iso_8859_10 iso_8859_11
 
     | 
  
  
    | 
      307
     | 
    
       * mb_convert_encoding: all wb charsets (except those from 'direct'); but not GB2312
 
     | 
  
  
    | 
      308
     | 
    
       * iconv:  all wb charsets (except those from 'direct')
 
     | 
  
  
    | 
      309
     | 
    
       *
 
     | 
  
  
    | 
      310
     | 
    
       * @param  string  An UTF-8 encoded string
 
     | 
  
  
    | 
      311
     | 
    
       * @param  string  The charset to convert to, defaults to DEFAULT_CHARSET
 
     | 
  
  
    | 
      312
     | 
    
       * @return string  A string in a supported encoding, with all entities decoded, too.
 
     | 
  
  
    | 
      313
     | 
    
       *                 String is unchanged in case of error.
 
     | 
  
  
    | 
      314
     | 
    
       * @author thorn
 
     | 
  
  
    | 
      315
     | 
    
       */
 
     | 
  
  
    | 
      316
     | 
    
      function utf8_to_charset($str, $charset_out=DEFAULT_CHARSET) {
     | 
  
  
    | 
      317
     | 
    
      	global $utf8_to_iso_8859_2, $utf8_to_iso_8859_3, $utf8_to_iso_8859_4, $utf8_to_iso_8859_5, $utf8_to_iso_8859_6, $utf8_to_iso_8859_7, $utf8_to_iso_8859_8, $utf8_to_iso_8859_9, $utf8_to_iso_8859_10, $utf8_to_iso_8859_11;
 
     | 
  
  
    | 
      318
     | 
    
      	$charset_out = strtoupper($charset_out);
 
     | 
  
  
    | 
      319
     | 
    
      	$wrong_ISO8859 = false;
 
     | 
  
  
    | 
      320
     | 
    
      	$converted = false;
 
     | 
  
  
    | 
      321
     | 
    
      
 
     | 
  
  
    | 
      322
     | 
    
      	if((!function_exists('iconv') && !UTF8_MBSTRING && ($charset_out=='BIG5' || $charset_out=='ISO-2022-JP' || $charset_out=='ISO-2022-KR')) || (!function_exists('iconv') && $charset_out=='GB2312')) {
     | 
  
  
    | 
      323
     | 
    
      		// Nothing we can do here :-(
 
     | 
  
  
    | 
      324
     | 
    
      		// Charset is one of those obscure ISO-2022... or BIG5, GB2312 or something
 
     | 
  
  
    | 
      325
     | 
    
      		// and we can't use mb_convert_encoding() or iconv();
 
     | 
  
  
    | 
      326
     | 
    
      		// Emit an error-message.
 
     | 
  
  
    | 
      327
     | 
    
      		trigger_error("Can't convert into $charset_out without mb_convert_encoding() or iconv(). Use UTF-8 instead.", E_USER_WARNING);
     | 
  
  
    | 
      328
     | 
    
      		return($str);
 
     | 
  
  
    | 
      329
     | 
    
      	}
 
     | 
  
  
    | 
      330
     | 
    
      	
 
     | 
  
  
    | 
      331
     | 
    
      	// the string comes from charset_to_utf8(), so we can skip this
 
     | 
  
  
    | 
      332
     | 
    
      	// replace HTML-entities first
 
     | 
  
  
    | 
      333
     | 
    
      	//if(preg_match('/&[#0-9a-zA-Z]+;/',$str))
     | 
  
  
    | 
      334
     | 
    
      	//	$str = utf8_entities_to_umlauts($str);
 
     | 
  
  
    | 
      335
     | 
    
      	
 
     | 
  
  
    | 
      336
     | 
    
      	// check if we need to convert
 
     | 
  
  
    | 
      337
     | 
    
      	if($charset_out == 'UTF-8' || utf8_isASCII($str)) {
     | 
  
  
    | 
      338
     | 
    
      		// Nothing to do. Just return
 
     | 
  
  
    | 
      339
     | 
    
      			return($str);
 
     | 
  
  
    | 
      340
     | 
    
      	}
 
     | 
  
  
    | 
      341
     | 
    
      	
 
     | 
  
  
    | 
      342
     | 
    
      	// Convert $str to $charset_out
 
     | 
  
  
    | 
      343
     | 
    
      	if(substr($charset_out,0,8) == 'ISO-8859') {
     | 
  
  
    | 
      344
     | 
    
      		switch($charset_out) {
     | 
  
  
    | 
      345
     | 
    
      			case 'ISO-8859-1': $str=utf8_decode($str); break;
 
     | 
  
  
    | 
      346
     | 
    
      			case 'ISO-8859-2': $str=strtr($str, $utf8_to_iso_8859_2); break;
 
     | 
  
  
    | 
      347
     | 
    
      			case 'ISO-8859-3': $str=strtr($str, $utf8_to_iso_8859_3); break;
 
     | 
  
  
    | 
      348
     | 
    
      			case 'ISO-8859-4': $str=strtr($str, $utf8_to_iso_8859_4); break;
 
     | 
  
  
    | 
      349
     | 
    
      			case 'ISO-8859-5': $str=strtr($str, $utf8_to_iso_8859_5); break;
 
     | 
  
  
    | 
      350
     | 
    
      			case 'ISO-8859-6': $str=strtr($str, $utf8_to_iso_8859_6); break;
 
     | 
  
  
    | 
      351
     | 
    
      			case 'ISO-8859-7': $str=strtr($str, $utf8_to_iso_8859_7); break;
 
     | 
  
  
    | 
      352
     | 
    
      			case 'ISO-8859-8': $str=strtr($str, $utf8_to_iso_8859_8); break;
 
     | 
  
  
    | 
      353
     | 
    
      			case 'ISO-8859-9': $str=strtr($str, $utf8_to_iso_8859_9); break;
 
     | 
  
  
    | 
      354
     | 
    
      			case 'ISO-8859-10': $str=strtr($str, $utf8_to_iso_8859_10); break;
 
     | 
  
  
    | 
      355
     | 
    
      			case 'ISO-8859-11': $str=strtr($str, $utf8_to_iso_8859_11); break;
 
     | 
  
  
    | 
      356
     | 
    
      			default: $wrong_ISO8859 = true;
 
     | 
  
  
    | 
      357
     | 
    
      		}
 
     | 
  
  
    | 
      358
     | 
    
      		if(!$wrong_ISO8859)
 
     | 
  
  
    | 
      359
     | 
    
      			$converted = true;
 
     | 
  
  
    | 
      360
     | 
    
      	}
 
     | 
  
  
    | 
      361
     | 
    
      	if(!$converted && UTF8_MBSTRING && $charset_out != 'GB2312') {
     | 
  
  
    | 
      362
     | 
    
      		// $charset is neither UTF-8 nor a known ISO-8859...
 
     | 
  
  
    | 
      363
     | 
    
      		// Try mb_convert_encoding() - but there's no GB2312 encoding in php's mb_* functions
 
     | 
  
  
    | 
      364
     | 
    
      		$str = mb_convert_encoding($str, $charset_out, 'UTF-8');
 
     | 
  
  
    | 
      365
     | 
    
      		$converted = true;
 
     | 
  
  
    | 
      366
     | 
    
      	} elseif(!$converted) { // Try iconv
     | 
  
  
    | 
      367
     | 
    
      		if(function_exists('iconv')) {
     | 
  
  
    | 
      368
     | 
    
      			$str = iconv('UTF-8', $charset_out, $str);
     | 
  
  
    | 
      369
     | 
    
      			$converted = true;
 
     | 
  
  
    | 
      370
     | 
    
      		}
 
     | 
  
  
    | 
      371
     | 
    
      	}
 
     | 
  
  
    | 
      372
     | 
    
      	if($converted) {
     | 
  
  
    | 
      373
     | 
    
      		return($str);
 
     | 
  
  
    | 
      374
     | 
    
      	}
 
     | 
  
  
    | 
      375
     | 
    
      	
 
     | 
  
  
    | 
      376
     | 
    
      	// Nothing we can do here :-(
 
     | 
  
  
    | 
      377
     | 
    
      	// Charset is one of those obscure ISO-2022... or BIG5, GB2312 or something
 
     | 
  
  
    | 
      378
     | 
    
      	// and we can't use mb_convert_encoding() or iconv();
 
     | 
  
  
    | 
      379
     | 
    
      	// Emit an error-message.
 
     | 
  
  
    | 
      380
     | 
    
      	trigger_error("Can't convert into $charset_out without mb_convert_encoding() or iconv(). Use UTF-8 instead.", E_USER_WARNING);
     | 
  
  
    | 
      381
     | 
    
      	
 
     | 
  
  
    | 
      382
     | 
    
      	return $str;
 
     | 
  
  
    | 
      383
     | 
    
      }
 
     | 
  
  
    | 
      384
     | 
    
      
 
     | 
  
  
    | 
      385
     | 
    
      /*
       * Convert filenames to ASCII
       *
       * Converts all non-ASCII characters and all HTML-entities to their plain 7bit equivalents.
       * Characters without an equivalent are replaced by the hexadecimal form of their
       * numeric entity value (e.g. "&#233;" becomes "e9").
       * The name entities_to_7bit() is somewhat misleading, but kept for compatibility-reasons.
       *
       * If the string is not valid UTF-8 after charset_to_utf8() (utf8_check() fails),
       * it is returned at that point without any further processing.
       *
       * @param  string  Filename to convert (all encodings from charset_to_utf8() are allowed)
       * @return string  ASCII encoded string, to use as filename in wb's page_filename() and media_filename
       * @author thorn
       */
      function entities_to_7bit($str) {
      	// convert to UTF-8
      	$str = charset_to_utf8($str);
      	if(!utf8_check($str)) {
      		return($str);
      	}
      	// replace some specials (second argument presumably is the replacement character - confirm in utf8_stripspecials())
      	$str = utf8_stripspecials($str, '_');
      	// translate non-ASCII characters to ASCII
      	$str = utf8_romanize($str);
      	// missed some? - Many UTF-8-chars can't be romanized
      	// convert to HTML-entities, and replace entities by hex-numbers
      	$str = utf8_fast_umlauts_to_entities($str, false);
      	// Turn the numeric apostrophe entity into the named one, so the numeric-to-hex
      	// step below does not touch it; the named entity is stripped further down.
      	// NOTE(review): entity literals in this function were restored from an
      	// entity-decoded listing - verify against the upstream file.
      	$str = str_replace('&#039;', '&apos;', $str);
      	// Use a named callback: the /e modifier was removed in PHP 7.0 and
      	// create_function() in PHP 8.0, while a closure would not parse on PHP < 5.3.
      	$str = preg_replace_callback('/&#([0-9]+);/', '_entities_to_7bit_dechex', $str);
      	// maybe there are some &gt; &lt; &apos; ' &quot; &amp; left, remove them too
      	$str = str_replace(array('&gt;', '&lt;', '&apos;', '\'', '&quot;', '&amp;'), '', $str);
      	// a second pass catches an '&amp;' that was re-formed by the removals above
      	$str = str_replace('&amp;', '', $str);

      	return($str);
      }

      /*
       * Callback for preg_replace_callback() in entities_to_7bit().
       *
       * @param  array   Match array of '/&#([0-9]+);/'; index 1 holds the decimal digits
       * @return string  The captured decimal number as a hexadecimal string
       */
      function _entities_to_7bit_dechex($aMatches) {
      	// the capture consists of digits only, so the cast is lossless for in-range values
      	return dechex((int)$aMatches[1]);
      }
 
     | 
  
  
    | 
      420
     | 
    
      
 
     | 
  
  
    | 
      421
     | 
    
      /*
       * Convert a string from mixed html-entities/umlauts to pure $charset_out-umlauts
       *
       * The input is first passed through charset_to_utf8() (read as DEFAULT_CHARSET),
       * then the UTF-8 result is converted to $charset_out by utf8_to_charset().
       *
       * Per the original note, all numeric and named entities are replaced except
       * the basic markup ones (&gt; &lt; &apos; &quot; &#039;).
       * NOTE(review): that list was reconstructed from an entity-decoded listing;
       * the actual entity handling lives in charset_to_utf8() - confirm there.
       *
       * @param  string  String to convert
       * @param  string  Target charset, defaults to DEFAULT_CHARSET
       * @return string  The string as returned by utf8_to_charset()
       * @author thorn
       */
      function entities_to_umlauts2($string, $charset_out=DEFAULT_CHARSET) {
      	// third argument 'true' presumably enables entity decoding - confirm in charset_to_utf8()
      	$sUtf8 = charset_to_utf8($string, DEFAULT_CHARSET, true);
      	// no utf8_check() here: it is too time-consuming (it may fail only if AddDefaultCharset is set)
      	return utf8_to_charset($sUtf8, $charset_out);
      }
 
     | 
  
  
    | 
      434
     | 
    
      
 
     | 
  
  
    | 
      435
     | 
    
      /*
       * Convert a string from mixed html-entities/umlauts to pure ASCII with HTML-entities
       *
       * The input is passed through charset_to_utf8() (read as $charset_in), and the
       * UTF-8 result is then handed to utf8_fast_umlauts_to_entities().
       *
       * @param  string  String to convert
       * @param  string  Charset of the input string, defaults to DEFAULT_CHARSET
       * @return string  The string as returned by utf8_fast_umlauts_to_entities()
       * @author thorn
       */
      function umlauts_to_entities2($string, $charset_in=DEFAULT_CHARSET) {
      	// third argument 'false' presumably leaves existing entities untouched - confirm in charset_to_utf8()
      	$sUtf8 = charset_to_utf8($string, $charset_in, false);
      	// no utf8_check() here: it is too time-consuming (it may fail only if AddDefaultCharset is set)
      	return utf8_fast_umlauts_to_entities($sUtf8, false);
      }
 
     | 
  
  
    | 
      447
     | 
    
      
 
     |