| 
      1
     | 
    
      <?php
 
     | 
  
  
    | 
      2
     | 
    
      
 
     | 
  
  
    | 
      3
     | 
    
      // $Id: functions-utf8.php 758 2008-03-18 18:44:59Z thorn $
 
     | 
  
  
    | 
      4
     | 
    
      
 
     | 
  
  
    | 
      5
     | 
    
      /*
 
     | 
  
  
    | 
      6
     | 
    
      
 
     | 
  
  
    | 
      7
     | 
    
       Website Baker Project <http://www.websitebaker.org/>
 
     | 
  
  
    | 
      8
     | 
    
       Copyright (C) 2004-2008, Ryan Djurovich
 
     | 
  
  
    | 
      9
     | 
    
      
 
     | 
  
  
    | 
      10
     | 
    
       Website Baker is free software; you can redistribute it and/or modify
 
     | 
  
  
    | 
      11
     | 
    
       it under the terms of the GNU General Public License as published by
 
     | 
  
  
    | 
      12
     | 
    
       the Free Software Foundation; either version 2 of the License, or
 
     | 
  
  
    | 
      13
     | 
    
       (at your option) any later version.
 
     | 
  
  
    | 
      14
     | 
    
      
 
     | 
  
  
    | 
      15
     | 
    
       Website Baker is distributed in the hope that it will be useful,
 
     | 
  
  
    | 
      16
     | 
    
       but WITHOUT ANY WARRANTY; without even the implied warranty of
 
     | 
  
  
    | 
      17
     | 
    
       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 
     | 
  
  
    | 
      18
     | 
    
       GNU General Public License for more details.
 
     | 
  
  
    | 
      19
     | 
    
      
 
     | 
  
  
    | 
      20
     | 
    
       You should have received a copy of the GNU General Public License
 
     | 
  
  
    | 
      21
     | 
    
       along with Website Baker; if not, write to the Free Software
 
     | 
  
  
    | 
      22
     | 
    
       Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 
     | 
  
  
    | 
      23
     | 
    
      
 
     | 
  
  
    | 
      24
     | 
    
      */
 
     | 
  
  
    | 
      25
     | 
    
      
 
     | 
  
  
    | 
      26
     | 
    
      /*
 
     | 
  
  
    | 
      27
     | 
    
       * A large part of this file is based on 'utf8.php' from the DokuWiki-project.
 
     | 
  
  
    | 
      28
     | 
    
       * (http://www.splitbrain.org/projects/dokuwiki):
 
     | 
  
  
    | 
      29
     | 
    
       **
 
     | 
  
  
    | 
      30
     | 
    
       * UTF8 helper functions
 
     | 
  
  
    | 
      31
     | 
    
       * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
 
     | 
  
  
    | 
      32
     | 
    
       * @author     Andreas Gohr <andi@splitbrain.org>
 
     | 
  
  
    | 
      33
     | 
    
       **
 
     | 
  
  
    | 
      34
     | 
    
       * modified for use with Website Baker
 
     | 
  
  
    | 
      35
     | 
    
       * from thorn, Jan. 2008
 
     | 
  
  
    | 
      36
     | 
    
       */
 
     | 
  
  
    | 
      37
     | 
    
      
 
     | 
  
  
    | 
      38
     | 
    
      // Functions we use in Website Baker:
 
     | 
  
  
    | 
      39
     | 
    
      //   entities_to_7bit()
 
     | 
  
  
    | 
      40
     | 
    
      //   entities_to_umlauts2()
 
     | 
  
  
    | 
      41
     | 
    
      //   umlauts_to_entities2()
 
     | 
  
  
    | 
      42
     | 
    
      
 
     | 
  
  
    | 
      43
     | 
    
      // Direct-access guard: when WB_URL has not been defined by the including
      // script, redirect the client to ../index.php and stop executing this file.
      if (!defined('WB_URL')) {
          header('Location: ../index.php');
          exit(0);
      }
 
     | 
  
  
    | 
      47
     | 
    
      
 
     | 
  
  
    | 
      48
     | 
    
      /*
       * check for mb_string support
       *
       * UTF8_MBSTRING is set to 1 when mb_substr() exists and UTF8_NOMBSTRING has
       * not been defined, otherwise to 0. A value that was already defined before
       * this file is loaded is left untouched.
       */
      if (!defined('UTF8_MBSTRING')) {
          if (function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')) {
              define('UTF8_MBSTRING', 1);
          } else {
              define('UTF8_MBSTRING', 0);
          }
      }

      // When mbstring is in use, make UTF-8 the internal encoding for mb_* calls
      // that do not pass an encoding explicitly.
      if (UTF8_MBSTRING) {
          mb_internal_encoding('UTF-8');
      }
     | 
  
  
    | 
      60
     | 
    
      
 
     | 
  
  
    | 
      61
     | 
    
      require_once(WB_PATH.'/framework/charsets_table.php');
 
     | 
  
  
    | 
      62
     | 
    
      
 
     | 
  
  
    | 
      63
     | 
    
      /*
       * Checks if a string contains 7bit ASCII only
       *
       * @author thorn
       * @param  string $str  string to inspect
       * @return boolean      false if any byte in the range 0x80-0xFF occurs, true otherwise
       */
      function utf8_isASCII($str){
          // The pattern has no 'u' modifier, so it is matched against single bytes.
          return !preg_match('/[\x80-\xFF]/', $str);
      }
 
     | 
  
  
    | 
      74
     | 
    
      
 
     | 
  
  
    | 
      75
     | 
    
      /*
       * Strips all highbyte chars
       *
       * Returns a pure ASCII7 string: every byte with a value of 0x80 or above is
       * dropped, all other bytes are kept in their original order.
       *
       * @author Andreas Gohr <andi@splitbrain.org>
       * @param  string $str  input string
       * @return string       the input without any byte >= 0x80
       */
      function utf8_strip($str){
          // Byte-wise removal (no 'u' modifier). This replaces the former loop over
          // $str{$i}; the curly-brace offset syntax no longer parses in PHP 8.
          return (string)preg_replace('/[\x80-\xFF]+/', '', (string)$str);
      }
 
     | 
  
  
    | 
      91
     | 
    
      
 
     | 
  
  
    | 
      92
     | 
    
      /*
       * Tries to detect if a string is in Unicode encoding
       *
       * Walks the string byte by byte. Each lead byte must be followed by exactly
       * the number of continuation bytes (10bbbbbb) announced by its bit pattern.
       * Only this structure is verified; lead bytes announcing 5- and 6-byte
       * sequences are accepted as well.
       *
       * @author <bmorel@ssi.fr>
       * @link   http://www.php.net/manual/en/function.utf8-encode.php
       * @param  string $Str  string to examine
       * @return boolean      true if all bytes fit the pattern, false otherwise
       */
      function utf8_check($Str) {
          // The length never changes inside the loops, so compute it once.
          $size = strlen($Str);
          for ($i = 0; $i < $size; $i++) {
              $b = ord($Str[$i]);
              if ($b < 0x80) continue;                  # 0bbbbbbb
              elseif (($b & 0xE0) == 0xC0) $n = 1;      # 110bbbbb
              elseif (($b & 0xF0) == 0xE0) $n = 2;      # 1110bbbb
              elseif (($b & 0xF8) == 0xF0) $n = 3;      # 11110bbb
              elseif (($b & 0xFC) == 0xF8) $n = 4;      # 111110bb
              elseif (($b & 0xFE) == 0xFC) $n = 5;      # 1111110b
              else return false;                        # Does not match any model
              for ($j = 0; $j < $n; $j++) {             # n bytes matching 10bbbbbb follow ?
                  // Running off the end or meeting a non-continuation byte is a failure.
                  if ((++$i == $size) || ((ord($Str[$i]) & 0xC0) != 0x80)) {
                      return false;
                  }
              }
          }
          return true;
      }
 
     | 
  
  
    | 
      115
     | 
    
      
 
     | 
  
  
    | 
      116
     | 
    
      /*
       * Unicode aware replacement for strlen()
       *
       * Counts characters by removing every UTF-8 continuation byte (0x80-0xBF)
       * and measuring what is left: in well-formed UTF-8 exactly one
       * non-continuation byte remains per character. This replaces the former
       * strlen(utf8_decode()) trick, because utf8_decode() is deprecated as of
       * PHP 8.2. For malformed input the count may differ from the old one.
       *
       * @author <chernyshevsky at hotmail dot com>
       * @see    strlen()
       * @param  string $string  UTF-8 encoded string
       * @return integer         number of characters
       */
      function utf8_strlen($string){
          // Byte-wise pattern (no 'u' modifier), so it cannot fail on invalid UTF-8.
          return strlen((string)preg_replace('/[\x80-\xBF]+/', '', (string)$string));
      }
 
     | 
  
  
    | 
      130
     | 
    
      
 
     | 
  
  
    | 
      131
     | 
    
      /*
       * UTF-8 aware alternative to substr
       *
       * Return part of a string given character offset (and optionally length).
       * When UTF8_MBSTRING is true the call is handed to mb_substr(); otherwise the
       * substring is extracted with a PCRE pattern using the 'u' modifier.
       *
       * @author Harry Fuecks <hfuecks@gmail.com>
       * @author Chris Smith <chris@jalakai.co.uk>
       * @param string  $str     UTF-8 encoded string
       * @param integer $offset  number of UTF-8 characters offset (from left);
       *                         negative values count from the end
       * @param integer $length  (optional) length in UTF-8 characters from offset;
       *                         negative values leave that many characters off the end
       * @return string          the substring; '' when nothing matches
       */
      function utf8_substr($str, $offset, $length = null) {
          if (UTF8_MBSTRING) {
              if ($length === null) {
                  return mb_substr($str, $offset);
              }
              return mb_substr($str, $offset, $length);
          }

          /*
           * Notes:
           *
           * no mb string support, so we'll use pcre regex's with 'u' flag
           * pcre only supports repetitions of less than 65536, in order to accept up to MAXINT values for
           * offset and length, we'll repeat a group of 65535 characters when needed (ok, up to MAXINT-65536)
           *
           * substr documentation states false can be returned in some cases (e.g. offset > string length)
           * mb_substr never returns false, it will return an empty string instead.
           *
           * calculating the number of characters in the string is a relatively expensive operation, so
           * we only carry it out when necessary. It isn't necessary for +ve offsets and no specified length
           *
           * the character count is the number of bytes that are not UTF-8 continuation bytes (0x80-0xBF);
           * this avoids utf8_decode(), which is deprecated as of PHP 8.2
           */

          // cast parameters to appropriate types to avoid multiple notices/warnings
          $str = (string)$str;
          $offset = (int)$offset;
          if (!is_null($length)) $length = (int)$length;

          // handle trivial cases
          if ($length === 0) return '';
          if ($offset < 0 && $length < 0 && $length < $offset) return '';

          $offset_pattern = '';
          $length_pattern = '';

          // normalise -ve offsets (we could use a tail anchored pattern, but they are horribly slow!)
          if ($offset < 0) {
              $strlen = strlen((string)preg_replace('/[\x80-\xBF]+/', '', $str));   // see notes
              $offset = $strlen + $offset;
              if ($offset < 0) $offset = 0;
          }

          // establish a pattern for offset, a non-captured group equal in length to offset
          if ($offset > 0) {
              $Ox = (int)($offset / 65535);
              $Oy = $offset % 65535;

              if ($Ox) $offset_pattern = '(?:.{65535}){'.$Ox.'}';
              $offset_pattern = '^(?:'.$offset_pattern.'.{'.$Oy.'})';
          } else {
              $offset_pattern = '^';                      // offset == 0; just anchor the pattern
          }

          // establish a pattern for length
          if (is_null($length)) {
              $length_pattern = '(.*)$';                  // the rest of the string
          } else {

              if (!isset($strlen)) {
                  $strlen = strlen((string)preg_replace('/[\x80-\xBF]+/', '', $str));   // see notes
              }
              if ($offset > $strlen) return '';           // another trivial case

              if ($length > 0) {

                  $length = min($strlen - $offset, $length);  // reduce any length that would go past the end of the string

                  $Lx = (int)($length / 65535);
                  $Ly = $length % 65535;

                  // +ve length requires ... a captured group of length characters
                  if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
                  $length_pattern = '('.$length_pattern.'.{'.$Ly.'})';

              } else if ($length < 0) {

                  if ($length < ($offset - $strlen)) return '';

                  $Lx = (int)((-$length) / 65535);
                  $Ly = (-$length) % 65535;

                  // -ve length requires ... capture everything except a group of -length characters
                  //                         anchored at the tail-end of the string
                  if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
                  $length_pattern = '(.*)(?:'.$length_pattern.'.{'.$Ly.'})$';
              }
          }

          // 'u' = treat subject as UTF-8, 's' = let '.' match newlines as well
          if (!preg_match('#'.$offset_pattern.$length_pattern.'#us', $str, $match)) return '';
          return $match[1];
      }
 
     | 
  
  
    | 
      232
     | 
    
      
 
     | 
  
  
    | 
      233
     | 
    
      /*
       * Unicode aware replacement for substr_replace()
       *
       * Positions are counted in UTF-8 characters. Negative $start counts from the
       * end of the string and negative $length names the number of characters
       * before the end at which replacing stops, as with substr_replace().
       * (Previously a negative $start lost the head of the string and a negative
       * $length duplicated text.) The default $length of 0 inserts $replacement
       * at $start without removing anything.
       *
       * @author Andreas Gohr <andi@splitbrain.org>
       * @see    substr_replace()
       * @param  string  $string       UTF-8 encoded input string
       * @param  string  $replacement  text to put in place of the removed part
       * @param  integer $start        character offset where replacing starts
       * @param  integer $length       number of characters to replace
       * @return string
       */
      function utf8_substr_replace($string, $replacement, $start , $length=0 ){
          $string = (string)$string;
          $start  = (int)$start;
          $length = (int)$length;
          $total  = utf8_strlen($string);

          // clamp $start into 0..$total, resolving negative values from the end
          if ($start < 0) {
              $start = max(0, $total + $start);
          } elseif ($start > $total) {
              $start = $total;
          }

          // $end is the character offset of the first character kept after the replacement
          if ($length < 0) {
              $end = max($start, $total + $length);
          } else {
              $end = min($total, $start + $length);
          }

          $ret = '';
          if ($start > 0) $ret .= utf8_substr($string, 0, $start);
          $ret .= $replacement;
          if ($end < $total) $ret .= utf8_substr($string, $end);
          return $ret;
      }
 
     | 
  
  
    | 
      246
     | 
    
      
 
     | 
  
  
    | 
      247
     | 
    
      /*
       * Unicode aware replacement for ltrim()
       *
       * Without a $charlist the native ltrim() is used. Otherwise the characters
       * of $charlist are put into a regex character class and stripped from the
       * start of $str. Returns null if the regex engine rejects the input
       * (preg_replace() with the 'u' modifier fails on invalid UTF-8).
       *
       * @author Andreas Gohr <andi@splitbrain.org>
       * @see    ltrim()
       * @param  string $str       UTF-8 encoded string
       * @param  string $charlist  characters to strip (optional)
       * @return string
       */
      function utf8_ltrim($str,$charlist=''){
          if ($charlist == '') return ltrim($str);

          // quote charlist for use in a characterclass; '^' is escaped too, because
          // a leading '^' would otherwise turn the class into its negation
          $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

          return preg_replace('/^['.$charlist.']+/u','',$str);
      }
 
     | 
  
  
    | 
      262
     | 
    
      
 
     | 
  
  
    | 
      263
     | 
    
      /*
       * Unicode aware replacement for rtrim()
       *
       * Without a $charlist the native rtrim() is used. Otherwise the characters
       * of $charlist are put into a regex character class and stripped from the
       * end of $str. Returns null if the regex engine rejects the input
       * (preg_replace() with the 'u' modifier fails on invalid UTF-8).
       *
       * @author Andreas Gohr <andi@splitbrain.org>
       * @see    rtrim()
       * @param  string $str       UTF-8 encoded string
       * @param  string $charlist  characters to strip (optional)
       * @return string
       */
      function  utf8_rtrim($str,$charlist=''){
          if ($charlist == '') return rtrim($str);

          // quote charlist for use in a characterclass; '^' is escaped too, because
          // a leading '^' would otherwise turn the class into its negation
          $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

          return preg_replace('/['.$charlist.']+$/u','',$str);
      }
 
     | 
  
  
    | 
      278
     | 
    
      
 
     | 
  
  
    | 
      279
     | 
    
      /*
       * Unicode aware replacement for trim()
       *
       * Without a $charlist the native trim() is used; otherwise the string is
       * passed through utf8_rtrim() and then utf8_ltrim() with the same charlist.
       *
       * @author Andreas Gohr <andi@splitbrain.org>
       * @see    trim()
       * @param  string $str       UTF-8 encoded string
       * @param  string $charlist  characters to strip (optional)
       * @return string
       */
      function  utf8_trim($str,$charlist='') {
          if ($charlist == '') return trim($str);

          $str = utf8_rtrim($str, $charlist);
          return utf8_ltrim($str, $charlist);
      }
 
     | 
  
  
    | 
      291
     | 
    
      
 
     | 
  
  
    | 
      292
     | 
    
      /*
       * This is a unicode aware replacement for strtolower()
       *
       * Uses mb_string extension if available (UTF8_MBSTRING true); otherwise the
       * string is translated with the global $UTF8_UPPER_TO_LOWER table, which is
       * expected to come from framework/charsets_table.php (not visible here - confirm).
       *
       * @author Leo Feyer <leo@typolight.org>
       * @see    strtolower()
       * @see    utf8_strtoupper()
       * @param  string $string  UTF-8 encoded string
       * @return string          lower-cased string
       */
      function utf8_strtolower($string){
          if (UTF8_MBSTRING) return mb_strtolower($string, 'utf-8');

          global $UTF8_UPPER_TO_LOWER;
          return strtr($string, $UTF8_UPPER_TO_LOWER);
      }
 
     | 
  
  
    | 
      307
     | 
    
      
 
     | 
  
  
    | 
      308
     | 
    
      /*
       * This is a unicode aware replacement for strtoupper()
       *
       * Uses mb_string extension if available (UTF8_MBSTRING true); otherwise the
       * string is translated with the global $UTF8_LOWER_TO_UPPER table, which is
       * expected to come from framework/charsets_table.php (not visible here - confirm).
       *
       * @author Leo Feyer <leo@typolight.org>
       * @see    strtoupper()
       * @see    utf8_strtolower()
       * @param  string $string  UTF-8 encoded string
       * @return string          upper-cased string
       */
      function utf8_strtoupper($string){
          if (UTF8_MBSTRING) return mb_strtoupper($string, 'utf-8');

          global $UTF8_LOWER_TO_UPPER;
          return strtr($string, $UTF8_LOWER_TO_UPPER);
      }
 
     | 
  
  
    | 
      323
     | 
    
      
 
     | 
  
  
    | 
      324
     | 
    
      /*
       * Romanize a non-latin string
       *
       * Pure ASCII input is returned unchanged. Anything else is translated with
       * the global $UTF8_ROMANIZATION table, which is expected to come from
       * framework/charsets_table.php (not visible here - confirm).
       *
       * @author Andreas Gohr <andi@splitbrain.org>
       * @param  string $string  UTF-8 encoded string
       * @return string          string with the table's replacements applied
       */
      function utf8_romanize($string){
          if (utf8_isASCII($string)) return $string; //nothing to do

          global $UTF8_ROMANIZATION;
          return strtr($string, $UTF8_ROMANIZATION);
      }
 
     | 
  
  
    | 
      335
     | 
    
      
 
     | 
  
  
    | 
      336
     | 
    
      /*
       * Removes special characters (nonalphanumeric) from a UTF-8 string
       *
       * This function adds the controlchars 0x00 to 0x19 to the array of
       * stripped chars (they are not included in $UTF8_SPECIAL_CHARS2)
       *
       * The quoted form of $UTF8_SPECIAL_CHARS2 is computed on the first call and
       * kept in a static variable, so later changes to the global have no effect.
       *
       * @author Andreas Gohr <andi@splitbrain.org>
       * @param  string $string     The UTF8 string to strip of special chars
       * @param  string $repl       Replace special with this string
       * @param  string $additional Additional chars to strip (used in regexp char class,
       *                            inserted unescaped)
       * @return string             The string with each matched character replaced by $repl
       */
      function utf8_stripspecials($string,$repl='',$additional=''){
          global $UTF8_SPECIAL_CHARS2;

          static $specials = null;
          if (is_null($specials)) {
              // escape regex metacharacters and the '/' delimiter once
              $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
          }

          return preg_replace('/['.$additional.'\x00-\x19'.$specials.']/u', $repl, $string);
      }
 
     | 
  
  
    | 
      357
     | 
    
      
 
     | 
  
  
    | 
      358
     | 
    
      /*
       * This is an Unicode aware replacement for strpos
       *
       * $offset and the result are counted in UTF-8 characters. The search itself
       * is done with the byte-based strpos(); whenever a hit lies before the
       * requested character offset, the byte offset is advanced and the search
       * is repeated.
       *
       * @author Leo Feyer <leo@typolight.org>
       * @see    strpos()
       * @param  string  $haystack  UTF-8 encoded string to search in
       * @param  string  $needle    string to search for
       * @param  integer $offset    character offset at which to start searching
       * @return integer            character position of the first match, or false if not found
       */
      function utf8_strpos($haystack, $needle, $offset=0){
          $comp = 0;        // extra bytes to add to $offset to reach the right byte position
          $length = null;   // character position of the latest match

          while (is_null($length) || $length < $offset) {
              $pos = strpos($haystack, $needle, $offset + $comp);

              if ($pos === false) {
                  return false;
              }

              // convert the byte position of the match into a character position
              $length = utf8_strlen(substr($haystack, 0, $pos));

              // match lies before the requested character offset: remember how many
              // more bytes than characters precede it and search again further right
              if ($length < $offset) {
                  $comp = $pos - $length;
              }
          }

          return $length;
      }
 
     | 
  
  
    | 
      386
     | 
    
      
 
     | 
  
  
    | 
      387
     | 
    
      /*
       * Encodes UTF-8 characters to HTML entities
       *
       * Code points below 0x80 are emitted as the plain character, all others as a
       * decimal numeric entity (&#NNN;). The code points come from
       * utf8_to_unicode(), which is not visible here - presumably it returns an
       * array of integers; confirm how it reports invalid input.
       *
       * @author Tom N Harris <tnharris@whoopdedo.org>
       * @author <vpribish at shopping dot com>
       * @link   http://www.php.net/manual/en/function.utf8-decode.php
       * @param  string $str  UTF-8 encoded string
       * @return string       string with non-ASCII characters as numeric entities
       */
      function utf8_tohtml ($str) {
          $ret = '';
          foreach (utf8_to_unicode($str) as $cp) {
              if ($cp < 0x80) {
                  $ret .= chr($cp);
              } else {
                  $ret .= "&#$cp;";
              }
          }
          return $ret;
      }
 
     | 
  
  
    | 
      407
     | 
    
      
 
     | 
  
  
    | 
      408
     | 
    
      /*
       * Decodes HTML entities to UTF-8 characters
       *
       * By default only numeric entities (&#NNN; and &#xHHH;) are decoded;
       * each match is handed to utf8_decode_numeric().
       *
       * Pass any non-null value (conventionally HTML_ENTITIES) as $entities
       * and named entities such as &amp;amp; or &amp;lt; are decoded as well,
       * using a utf8_entity_decoder instance.
       *
       * Named and numeric entities are replaced in one single pass, so text
       * produced by a replacement is never scanned again. This avoids the
       * double decoding that chaining two separate decoders causes: with
       * named decoding enabled the input "&amp;amp;#38;" becomes "&amp;#38;"
       * and not "&amp;".
       *
       * @author Tom N Harris <tnharris@whoopdedo.org>
       * @param  string  $str      UTF-8 encoded string
       * @param  boolean $entities Flag controlling decoding of named entities.
       * @return string  UTF-8 encoded string with numeric (and named) entities replaced.
       */
      function utf8_unhtml($str, $entities=null) {
          // Shared between calls: building the entity table is not free.
          static $decoder = null;

          if (is_null($entities)) {
              // Numeric-only decoding does not need the entity table at all.
              return preg_replace_callback('/(&#([Xx])?([0-9A-Za-z]+);)/m',
                                           'utf8_decode_numeric', $str);
          }

          // Create the decoder lazily, only once named entities are requested.
          if (is_null($decoder)) {
              $decoder = new utf8_entity_decoder();
          }

          // Group 1 is the optional '#', group 2 the optional hex marker and
          // group 3 the digits or the entity name.
          return preg_replace_callback('/&(#)?([Xx])?([0-9A-Za-z]+);/m',
                                       array(&$decoder, 'decode'), $str);
      }
 
     | 
  
  
    | 
      437
     | 
    
      /*
       * Callback for utf8_unhtml(): decodes one numeric entity match
       *
       * Expects a preg match array in which index 0 holds the complete
       * entity text, index 2 holds 'x' or 'X' for hexadecimal entities (an
       * empty string otherwise) and index 3 holds the digits.
       *
       * The patterns used by utf8_unhtml() accept any alphanumeric run as
       * "digits", so text like &#foo; or &#xZZ; can arrive here. Such
       * malformed entities are returned unchanged; converting them with
       * intval()/hexdec() would yield 0 and inject a NUL byte into the text.
       *
       * @param  array  $ent match array as described above
       * @return string UTF-8 encoding of the referenced code point, or the
       *                original entity text if the digits are malformed
       */
      function utf8_decode_numeric($ent) {
          if ($ent[2] === 'X' || $ent[2] === 'x') {
              // hexadecimal entity: &#xHHH;
              if (!preg_match('/^[0-9A-Fa-f]+$/D', $ent[3])) {
                  return $ent[0];
              }
              $cp = hexdec($ent[3]);
          } else {
              // decimal entity: &#NNN;
              if (!preg_match('/^[0-9]+$/D', $ent[3])) {
                  return $ent[0];
              }
              $cp = intval($ent[3]);
          }
          return unicode_to_utf8(array($cp));
      }
 
     | 
  
  
    | 
      449
     | 
    
      /*
       * Decoder for named and numeric HTML entities, used by utf8_unhtml()
       *
       * Holds a lookup table that maps entity text (for example "&eacute;")
       * to the UTF-8 encoding of the character it stands for.
       */
      class utf8_entity_decoder {
          // array: entity text => UTF-8 string
          var $table;

          /*
           * PHP 5+ constructor. PHP 8 no longer treats a method named like
           * the class as a constructor, so without this the table would
           * never be built. Declared first to avoid the "redefining
           * constructor" notice of some PHP 5 versions.
           */
          function __construct() {
              $this->utf8_entity_decoder();
          }

          /*
           * Builds the lookup table (also the constructor on PHP 4).
           *
           * Since PHP 5.4 get_html_translation_table() returns its characters
           * UTF-8 encoded by default, so they must not be passed through
           * makeutf8(), which only looks at the first byte. Where the
           * encoding argument exists (PHP 5.3.4+) the table is requested in
           * UTF-8 directly; older versions return ISO-8859-1 bytes which are
           * converted one by one.
           */
          function utf8_entity_decoder() {
              if (version_compare(PHP_VERSION, '5.3.4', '>=')) {
                  $table = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8');
                  $this->table = array_flip($table);
              } else {
                  $table = array_flip(get_html_translation_table(HTML_ENTITIES));
                  $this->table = array_map(array(&$this,'makeutf8'), $table);
              }
          }

          /*
           * Converts a single ISO-8859-1 character to UTF-8.
           *
           * Only the first byte of $c is used (ord()).
           *
           * @param  string $c one ISO-8859-1 encoded character
           * @return string    the same character encoded as UTF-8
           */
          function makeutf8($c) {
              return unicode_to_utf8(array(ord($c)));
          }

          /*
           * preg_replace_callback() handler for utf8_unhtml().
           *
           * @param  array  $ent match array: [0] whole entity, [1] '#' for
           *                     numeric entities, [2] hex marker, [3] digits
           *                     or entity name
           * @return string      decoded character, or the entity text itself
           *                     if the entity name is unknown
           */
          function decode($ent) {
              if ($ent[1] == '#') {
                  return utf8_decode_numeric($ent);
              }
              if (array_key_exists($ent[0], $this->table)) {
                  return $this->table[$ent[0]];
              }
              // unknown named entity: leave the text as it was
              return $ent[0];
          }
      }
 
     | 
  
  
    | 
      469
     | 
    
      
 
     | 
  
  
    | 
      470
     | 
    
      /*
       * Takes an UTF-8 string and returns an array of ints representing the
       * Unicode characters. Astral planes are supported ie. the ints in the
       * output can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
       * are not allowed.
       *
       * If $strict is set to true the function returns false if the input
       * string isn't a valid UTF-8 octet sequence and raises a PHP error at
       * level E_USER_WARNING. This includes a multi-octet sequence that is
       * cut off by the end of the string.
       *
       * If $strict is false, bytes that cannot start or continue a sequence
       * are skipped silently and code points assembled from illegal
       * sequences (overlong forms, surrogates, values above 0x10FFFF) are
       * still appended to the result.
       *
       * Note: this function has been modified slightly in this library to
       * trigger errors on encountering bad bytes
       *
       * @author <hsivonen@iki.fi>
       * @author Harry Fuecks <hfuecks@gmail.com>
       * @param  string  UTF-8 encoded string
       * @param  boolean Check for invalid sequences?
       * @return mixed array of unicode code points or false if UTF-8 invalid
       * @see    unicode_to_utf8
       * @link   http://hsivonen.iki.fi/php-utf8/
       * @link   http://sourceforge.net/projects/phputf8/
       */
      function utf8_to_unicode($str,$strict=false) {
          $mState = 0;     // cached expected number of octets after the current octet
                           // until the beginning of the next UTF8 character sequence
          $mUcs4  = 0;     // cached Unicode character
          $mBytes = 1;     // cached expected number of octets in the current sequence

          $out = array();

          $len = strlen($str);

          for($i = 0; $i < $len; $i++) {

              // Square brackets on purpose: the $str{$i} offset syntax was
              // deprecated in PHP 7.4 and is a parse error in PHP 8.
              $in = ord($str[$i]);

              if ( $mState == 0) {

                  // When mState is zero we expect either a US-ASCII character or a
                  // multi-octet sequence.
                  if (0 == (0x80 & ($in))) {
                      // US-ASCII, pass straight through.
                      $out[] = $in;
                      $mBytes = 1;

                  } else if (0xC0 == (0xE0 & ($in))) {
                      // First octet of 2 octet sequence
                      $mUcs4 = ($in);
                      $mUcs4 = ($mUcs4 & 0x1F) << 6;
                      $mState = 1;
                      $mBytes = 2;

                  } else if (0xE0 == (0xF0 & ($in))) {
                      // First octet of 3 octet sequence
                      $mUcs4 = ($in);
                      $mUcs4 = ($mUcs4 & 0x0F) << 12;
                      $mState = 2;
                      $mBytes = 3;

                  } else if (0xF0 == (0xF8 & ($in))) {
                      // First octet of 4 octet sequence
                      $mUcs4 = ($in);
                      $mUcs4 = ($mUcs4 & 0x07) << 18;
                      $mState = 3;
                      $mBytes = 4;

                  } else if (0xF8 == (0xFC & ($in))) {
                      /* First octet of 5 octet sequence.
                       *
                       * This is illegal because the encoded codepoint must be either
                       * (a) not the shortest form or
                       * (b) outside the Unicode range of 0-0x10FFFF.
                       * Rather than trying to resynchronize, we will carry on until the end
                       * of the sequence and let the later error handling code catch it.
                       */
                      $mUcs4 = ($in);
                      $mUcs4 = ($mUcs4 & 0x03) << 24;
                      $mState = 4;
                      $mBytes = 5;

                  } else if (0xFC == (0xFE & ($in))) {
                      // First octet of 6 octet sequence, see comments for 5 octet sequence.
                      $mUcs4 = ($in);
                      $mUcs4 = ($mUcs4 & 1) << 30;
                      $mState = 5;
                      $mBytes = 6;

                  } elseif($strict) {
                      /* Current octet is neither in the US-ASCII range nor a legal first
                       * octet of a multi-octet sequence.
                       */
                      trigger_error(
                              'utf8_to_unicode: Illegal sequence identifier '.
                                  'in UTF-8 at byte '.$i,
                              E_USER_WARNING
                          );
                      return false;

                  }

              } else {

                  // When mState is non-zero, we expect a continuation of the multi-octet
                  // sequence
                  if (0x80 == (0xC0 & ($in))) {

                      // Legal continuation: add its 6 payload bits at the
                      // position that belongs to this octet.
                      $shift = ($mState - 1) * 6;
                      $tmp = $in;
                      $tmp = ($tmp & 0x0000003F) << $shift;
                      $mUcs4 |= $tmp;

                      /*
                       * End of the multi-octet sequence. mUcs4 now contains the final
                       * Unicode codepoint to be output
                       */
                      if (0 == --$mState) {

                          /*
                           * Check for illegal sequences and codepoints.
                           */
                          // From Unicode 3.1, non-shortest form is illegal
                          if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                              ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                              ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                              (4 < $mBytes) ||
                              // From Unicode 3.2, surrogate characters are illegal
                              (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                              // Codepoints outside the Unicode range are illegal
                              ($mUcs4 > 0x10FFFF)) {

                              if($strict){
                                  trigger_error(
                                          'utf8_to_unicode: Illegal sequence or codepoint '.
                                              'in UTF-8 at byte '.$i,
                                          E_USER_WARNING
                                      );

                                  return false;
                              }

                          }

                          if (0xFEFF != $mUcs4) {
                              // BOM is legal but we don't want to output it
                              $out[] = $mUcs4;
                          }

                          //initialize UTF8 cache
                          $mState = 0;
                          $mUcs4  = 0;
                          $mBytes = 1;
                      }

                  } elseif($strict) {
                      /*
                       *((0xC0 & (*in) != 0x80) && (mState != 0))
                       * Incomplete multi-octet sequence.
                       */
                      trigger_error(
                              'utf8_to_unicode: Incomplete multi-octet '.
                              '   sequence in UTF-8 at byte '.$i,
                              E_USER_WARNING
                          );

                      return false;
                  }
              }
          }

          // The input ended while continuation octets were still expected:
          // the last sequence is truncated, which strict mode must reject.
          if ($strict && $mState != 0) {
              trigger_error(
                      'utf8_to_unicode: Incomplete multi-octet '.
                      '   sequence in UTF-8 at byte '.$len,
                      E_USER_WARNING
                  );

              return false;
          }

          return $out;
      }
 
     | 
  
  
    | 
      641
     | 
    
      
 
     | 
  
  
    | 
      642
     | 
    
      /*
       * Takes an array of ints representing the Unicode characters and returns
       * a UTF-8 string. Astral planes are supported ie. the ints in the
       * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
       * are not allowed.
       *
       * If $strict is set to true the function returns false if the input
       * array contains ints that represent surrogates or are outside the
       * Unicode range and raises a PHP error at level E_USER_WARNING.
       * If $strict is false such values are skipped silently.
       *
       * Note: the result is assembled by plain string concatenation. No
       * output buffer is opened, so returning early on an error cannot leave
       * a buffer behind. The array is referenced by its keys.
       *
       * @param  array of unicode code points representing a string
       * @param  boolean Check for invalid sequences?
       * @return mixed UTF-8 string or false if array contains invalid code points
       *               (an empty string if the first argument is not an array)
       * @author <hsivonen@iki.fi>
       * @author Harry Fuecks <hfuecks@gmail.com>
       * @see    utf8_to_unicode
       * @link   http://hsivonen.iki.fi/php-utf8/
       * @link   http://sourceforge.net/projects/phputf8/
       */
      function unicode_to_utf8($arr,$strict=false) {
          if (!is_array($arr)) return '';

          $result = '';

          foreach (array_keys($arr) as $k) {

              # ASCII range (including control chars)
              if ( ($arr[$k] >= 0) && ($arr[$k] <= 0x007f) ) {

                  $result .= chr($arr[$k]);

              # 2 byte sequence (the lower bound keeps negative values out)
              } else if ( ($arr[$k] > 0x007f) && ($arr[$k] <= 0x07ff) ) {

                  $result .= chr(0xc0 | ($arr[$k] >> 6));
                  $result .= chr(0x80 | ($arr[$k] & 0x003f));

              # Byte order mark (skip)
              } else if($arr[$k] == 0xFEFF) {

                  // nop -- zap the BOM

              # Test for illegal surrogates
              } else if ($arr[$k] >= 0xD800 && $arr[$k] <= 0xDFFF) {

                  // found a surrogate
                  if($strict){
                      trigger_error(
                          'unicode_to_utf8: Illegal surrogate '.
                              'at index: '.$k.', value: '.$arr[$k],
                          E_USER_WARNING
                          );
                      return false;
                  }

              # 3 byte sequence
              } else if ( ($arr[$k] > 0x07ff) && ($arr[$k] <= 0xffff) ) {

                  $result .= chr(0xe0 | ($arr[$k] >> 12));
                  $result .= chr(0x80 | (($arr[$k] >> 6) & 0x003f));
                  $result .= chr(0x80 | ($arr[$k] & 0x003f));

              # 4 byte sequence
              } else if ( ($arr[$k] > 0xffff) && ($arr[$k] <= 0x10ffff) ) {

                  $result .= chr(0xf0 | ($arr[$k] >> 18));
                  $result .= chr(0x80 | (($arr[$k] >> 12) & 0x3f));
                  $result .= chr(0x80 | (($arr[$k] >> 6) & 0x3f));
                  $result .= chr(0x80 | ($arr[$k] & 0x3f));

              # Negative or above 0x10FFFF
              } elseif($strict) {

                  trigger_error(
                      'unicode_to_utf8: Codepoint out of Unicode range '.
                          'at index: '.$k.', value: '.$arr[$k],
                      E_USER_WARNING
                      );

                  // out of range
                  return false;
              }
          }

          return $result;
      }
 
     | 
  
  
    | 
      732
     | 
    
      
 
     | 
  
  
    | 
      733
     | 
    
      /*
       * Replace bad bytes with an alternative character
       *
       * The string is consumed from left to right, one well-formed UTF-8
       * sequence or one invalid byte at a time. Well-formed sequences are
       * copied, every invalid byte is replaced by $replace.
       *
       * ASCII character is recommended for replacement char
       *
       * PCRE Pattern to locate bad bytes in a UTF-8 string
       * Comes from W3 FAQ: Multilingual Forms
       * Note: modified to include full ASCII range including control chars
       *
       * @author Harry Fuecks <hfuecks@gmail.com>
       * @see http://www.w3.org/International/questions/qa-forms-utf-8
       * @param string to search
       * @param string to replace bad bytes with (defaults to an empty string,
       *               i.e. bad bytes are removed) - use ASCII
       * @return string
       */
      function utf8_bad_replace($str, $replace = '') {
          $UTF8_BAD =
           '([\x00-\x7F]'.                          # ASCII (including control chars)
           '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
           '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
           '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
           '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
           '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
           '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
           '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
           '|(.{1}))';                              # invalid byte

          $result = '';
          $offset = 0;
          $length = strlen($str);

          // Walk through the string with a match offset instead of cutting
          // the matched part off with substr(): copying the remainder on
          // every step made the run time quadratic. The A modifier pins each
          // match to the current offset.
          while ($offset < $length &&
                 preg_match('/'.$UTF8_BAD.'/AS', $str, $matches, 0, $offset)) {
              if ( !isset($matches[2])) {
                  // one of the well-formed alternatives matched
                  $result .= $matches[0];
              } else {
                  // only the catch-all "invalid byte" group matched
                  $result .= $replace;
              }
              $offset += strlen($matches[0]);
          }

          return $result;
      }
 
     | 
  
  
    | 
      772
     | 
    
      
 
     | 
  
  
    | 
      773
     | 
    
      /*
       * URL-Encode a filename to allow unicode characters
       *
       * Slashes are not encoded
       *
       * When the second parameter is true the string will
       * be encoded only if characters other than ASCII letters, digits
       * and / _ - . % are detected -
       * This makes it safe to run it multiple times on the
       * same string (default is true)
       *
       * @author Andreas Gohr <andi@splitbrain.org>
       * @see    urlencode
       * @param  string  $file filename or path to encode
       * @param  boolean $safe return names that are already safe unchanged?
       * @return string  the encoded filename
       */
      function utf8_encodeFN($file,$safe=true){
        // The D modifier makes $ match at the very end only. Without it a
        // name ending in a newline would pass the check and be returned
        // unencoded.
        if($safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#D',$file)){
          return $file;
        }
        $file = urlencode($file);
        // keep path separators readable
        $file = str_replace('%2F','/',$file);
        return $file;
      }
 
     | 
  
  
    | 
      794
     | 
    
      
 
     | 
  
  
    | 
      795
     | 
    
      /*
       * URL-decode a filename
       *
       * Counterpart of utf8_encodeFN(); nothing more than a thin wrapper
       * around urldecode().
       *
       * @author Andreas Gohr <andi@splitbrain.org>
       * @see    urldecode
       *
       * @param  string  URL-encoded filename
       * @return string  Decoded filename
       */
      function utf8_decodeFN($file){
          return urldecode($file);
      }
 
     | 
  
  
    | 
      807
     | 
    
      
 
     | 
  
  
    | 
      808
     | 
    
      /*
 
     | 
  
  
    | 
      809
     | 
    
       * Moved some functions from framework/functions.php to here - thorn
 
     | 
  
  
    | 
      810
     | 
    
       */
 
     | 
  
  
    | 
      811
     | 
    
      
 
     | 
  
  
    | 
      812
     | 
    
      /*
       * Decode HTML entities to UTF-8 characters
       *
       * Named entities are first mapped to numeric ones through the global
       * table $named_to_numbered_entities, then the string is handed to
       * utf8_unhtml() for decoding.
       * The numeric apostrophe entity &#39; is deliberately kept as an entity.
       * According to the original header the entities
       * &gt; &lt; &apos; &quot; &#39; &nbsp;
       * are not replaced - for all but &#39; that depends on the mapping table
       * and on utf8_unhtml(), which are defined elsewhere (TODO confirm).
       *
       * @param  string UTF-8 or ASCII encoded string
       * @return string UTF-8 encoded string with numeric and named entities replaced.
       */
      function utf8_entities_to_umlauts($str) {
          global $named_to_numbered_entities;

          // We have to prevent "&#39;" from being decoded: hide it behind a
          // placeholder that does not look like an entity ...
          // NOTE(review): the entity literal was restored from an HTML-decoded
          // copy of this file - verify against the upstream source.
          $str = str_replace("&#39;", "&_#39;", $str);

          $str = strtr($str, $named_to_numbered_entities);
          $str = utf8_unhtml($str);

          // ... and bring the entity back once decoding is done
          $str = str_replace("&_#39;", "&#39;", $str);

          return($str);
      }
 
     | 
  
  
    | 
      831
     | 
    
      
 
     | 
  
  
    | 
      832
     | 
    
      /*
       * Encode UTF-8 characters to HTML entities
       *
       * The string is passed through utf8_tohtml(); if requested, the result
       * is then translated with the global table $numbered_to_named_entities
       * so that numeric entities become named ones.
       *
       * @param  string UTF-8 encoded string
       * @param  bool   Replace numbered by named entities (default: true)
       * @return string String as returned by utf8_tohtml(), optionally with
       *                numeric entities replaced by named ones
       */
      function utf8_umlauts_to_entities($str, $named_entities=true) {
          global $numbered_to_named_entities;

          $encoded = utf8_tohtml($str);

          // caller wants the plain result of utf8_tohtml()
          if (!$named_entities) {
              return $encoded;
          }

          return strtr($encoded, $numbered_to_named_entities);
      }
 
     | 
  
  
    | 
      848
     | 
    
      
 
     | 
  
  
    | 
      849
     | 
    
      /*
       * Converts from various charsets to UTF-8
       *
       * Will convert a string from various charsets to UTF-8.
       * HTML-entities will be converted, too (via utf8_entities_to_umlauts()).
       * In case of error the returned string is unchanged, and an
       * E_USER_WARNING is emitted.
       * Conversion back-ends, in the order they are tried:
       * direct: ISO-8859-1 (utf8_encode) and ISO-8859-2 ... ISO-8859-11 (lookup tables)
       * mb_convert_encoding: every other charset but GB2312 (only if UTF8_MBSTRING is set)
       * iconv:  whatever is left (only if iconv() exists)
       *
       * @param  string  A string in supported encoding
       * @param  string  The charset to convert from, defaults to DEFAULT_CHARSET.
       *                 Compared case-insensitively; an empty value means UTF-8.
       * @return string  A string in UTF-8-encoding, with all entities decoded, too.
       *                 String is unchanged in case of error.
       */
      function charset_to_utf8($str, $charset_in=DEFAULT_CHARSET) {
          global $iso_8859_2_to_utf8, $iso_8859_3_to_utf8, $iso_8859_4_to_utf8, $iso_8859_5_to_utf8, $iso_8859_6_to_utf8, $iso_8859_7_to_utf8, $iso_8859_8_to_utf8, $iso_8859_9_to_utf8, $iso_8859_10_to_utf8, $iso_8859_11_to_utf8;
          $charset_in = strtoupper($charset_in);
          if ($charset_in == "") { $charset_in = 'UTF-8'; }
          $converted = false;
          $has_iconv = function_exists('iconv');

          // Charsets that can only be handled by mb_convert_encoding() or iconv().
          // The names must be upper case here because $charset_in was upper-cased
          // above - the former lower-case comparisons could never match.
          $needs_library = ($charset_in == 'BIG5' || $charset_in == 'ISO-2022-JP' || $charset_in == 'ISO-2022-KR');
          if (!$has_iconv && ($charset_in == 'GB2312' || (!UTF8_MBSTRING && $needs_library))) {
              // Nothing we can do here :-(
              // Neither mb_convert_encoding() nor iconv() can be used for this charset.
              trigger_error("Can't convert from $charset_in without mb_convert_encoding() or iconv(). Use UTF-8 instead.", E_USER_WARNING);
              return($str);
          }

          // check if we have UTF-8 or a plain ASCII string
          if ($charset_in == 'UTF-8' || utf8_isASCII($str)) {
              // we have utf-8. Just replace HTML-entities (if any) and return
              if (preg_match('/&[#0-9a-zA-Z]+;/', $str)) {
                  return(utf8_entities_to_umlauts($str));
              }
              return($str);
          }

          // Convert $str to utf8 - first try the built-in ISO-8859 support
          if (substr($charset_in, 0, 8) == 'ISO-8859') {
              $converted = true;
              switch ($charset_in) {
                  case 'ISO-8859-1': $str = utf8_encode($str); break;
                  case 'ISO-8859-2': $str = strtr($str, $iso_8859_2_to_utf8); break;
                  case 'ISO-8859-3': $str = strtr($str, $iso_8859_3_to_utf8); break;
                  case 'ISO-8859-4': $str = strtr($str, $iso_8859_4_to_utf8); break;
                  case 'ISO-8859-5': $str = strtr($str, $iso_8859_5_to_utf8); break;
                  case 'ISO-8859-6': $str = strtr($str, $iso_8859_6_to_utf8); break;
                  case 'ISO-8859-7': $str = strtr($str, $iso_8859_7_to_utf8); break;
                  case 'ISO-8859-8': $str = strtr($str, $iso_8859_8_to_utf8); break;
                  case 'ISO-8859-9': $str = strtr($str, $iso_8859_9_to_utf8); break;
                  case 'ISO-8859-10': $str = strtr($str, $iso_8859_10_to_utf8); break;
                  case 'ISO-8859-11': $str = strtr($str, $iso_8859_11_to_utf8); break;
                  // an ISO-8859 variant without a table: leave it to mb/iconv
                  default: $converted = false;
              }
          }

          if (!$converted && UTF8_MBSTRING && $charset_in != 'GB2312') {
              // $charset is neither UTF-8 nor a known ISO-8859...
              // Try mb_convert_encoding() - but there's no GB2312 encoding in php's mb_* functions
              $result = mb_convert_encoding($str, 'UTF-8', $charset_in);
              // keep the original string if the conversion failed
              if ($result !== false) {
                  $str = $result;
                  $converted = true;
              }
          }

          if (!$converted && $has_iconv) {
              // Try iconv; it returns false on failure, in which case $str stays untouched
              $result = iconv($charset_in, 'UTF-8', $str);
              if ($result !== false) {
                  $str = $result;
                  $converted = true;
              }
          }

          if ($converted) {
              // we have utf-8, now replace HTML-entities and return
              if (preg_match('/&[#0-9a-zA-Z]+;/', $str)) {
                  $str = utf8_entities_to_umlauts($str);
              }
              // just to be sure, replace bad characters
              $str = utf8_bad_replace($str, '?');
              return($str);
          }

          // Nothing we can do here :-(
          // No back-end was able to convert this charset.
          // Emit an error-message and hand back the unchanged string.
          trigger_error("Can't convert from $charset_in without mb_convert_encoding() or iconv(). Use UTF-8 instead.", E_USER_WARNING);

          return $str;
      }
 
     | 
  
  
    | 
      938
     | 
    
      
 
     | 
  
  
    | 
      939
     | 
    
      /*
       * Converts from UTF-8 to various charsets
       *
       * Will convert a string from UTF-8 to various charsets.
       * HTML-entities are decoded first (via utf8_entities_to_umlauts()).
       * If no back-end can perform the conversion an E_USER_WARNING is emitted
       * and the string is returned in UTF-8 (entities already decoded).
       * Conversion back-ends, in the order they are tried:
       * direct: ISO-8859-1 (utf8_decode) and ISO-8859-2 ... ISO-8859-11 (lookup tables)
       * mb_convert_encoding: every other charset but GB2312 (only if UTF8_MBSTRING is set)
       * iconv:  whatever is left (only if iconv() exists)
       *
       * @param  string  An UTF-8 encoded string
       * @param  string  The charset to convert to, defaults to DEFAULT_CHARSET.
       *                 Compared case-insensitively.
       * @return string  A string in a supported encoding, with all entities decoded, too.
       *                 String is not converted in case of error.
       */
      function utf8_to_charset($str, $charset_out=DEFAULT_CHARSET) {
          global $utf8_to_iso_8859_2, $utf8_to_iso_8859_3, $utf8_to_iso_8859_4, $utf8_to_iso_8859_5, $utf8_to_iso_8859_6, $utf8_to_iso_8859_7, $utf8_to_iso_8859_8, $utf8_to_iso_8859_9, $utf8_to_iso_8859_10, $utf8_to_iso_8859_11;
          $charset_out = strtoupper($charset_out);
          $converted = false;
          $has_iconv = function_exists('iconv');

          // Charsets that can only be handled by mb_convert_encoding() or iconv().
          // The names must be upper case here because $charset_out was upper-cased
          // above - the former lower-case comparisons could never match.
          $needs_library = ($charset_out == 'BIG5' || $charset_out == 'ISO-2022-JP' || $charset_out == 'ISO-2022-KR');
          if (!$has_iconv && ($charset_out == 'GB2312' || (!UTF8_MBSTRING && $needs_library))) {
              // Nothing we can do here :-(
              // Neither mb_convert_encoding() nor iconv() can be used for this charset.
              trigger_error("Can't convert into $charset_out without mb_convert_encoding() or iconv(). Use UTF-8 instead.", E_USER_WARNING);
              return($str);
          }

          // replace HTML-entities first
          if (preg_match('/&[#0-9a-zA-Z]+;/', $str)) {
              $str = utf8_entities_to_umlauts($str);
          }

          // check if we need to convert at all
          if ($charset_out == 'UTF-8' || utf8_isASCII($str)) {
              // Nothing to do. Just return
              return($str);
          }

          // Convert $str to $charset_out - first try the built-in ISO-8859 support
          if (substr($charset_out, 0, 8) == 'ISO-8859') {
              $converted = true;
              switch ($charset_out) {
                  case 'ISO-8859-1': $str = utf8_decode($str); break;
                  case 'ISO-8859-2': $str = strtr($str, $utf8_to_iso_8859_2); break;
                  case 'ISO-8859-3': $str = strtr($str, $utf8_to_iso_8859_3); break;
                  case 'ISO-8859-4': $str = strtr($str, $utf8_to_iso_8859_4); break;
                  case 'ISO-8859-5': $str = strtr($str, $utf8_to_iso_8859_5); break;
                  case 'ISO-8859-6': $str = strtr($str, $utf8_to_iso_8859_6); break;
                  case 'ISO-8859-7': $str = strtr($str, $utf8_to_iso_8859_7); break;
                  case 'ISO-8859-8': $str = strtr($str, $utf8_to_iso_8859_8); break;
                  case 'ISO-8859-9': $str = strtr($str, $utf8_to_iso_8859_9); break;
                  case 'ISO-8859-10': $str = strtr($str, $utf8_to_iso_8859_10); break;
                  case 'ISO-8859-11': $str = strtr($str, $utf8_to_iso_8859_11); break;
                  // an ISO-8859 variant without a table: leave it to mb/iconv
                  default: $converted = false;
              }
          }

          if (!$converted && UTF8_MBSTRING && $charset_out != 'GB2312') {
              // $charset is neither UTF-8 nor a known ISO-8859...
              // Try mb_convert_encoding() - but there's no GB2312 encoding in php's mb_* functions
              $result = mb_convert_encoding($str, $charset_out, 'UTF-8');
              // keep the UTF-8 string if the conversion failed
              if ($result !== false) {
                  $str = $result;
                  $converted = true;
              }
          }

          if (!$converted && $has_iconv) {
              // Try iconv; it returns false on failure, in which case $str stays untouched
              $result = iconv('UTF-8', $charset_out, $str);
              if ($result !== false) {
                  $str = $result;
                  $converted = true;
              }
          }

          if ($converted) {
              return($str);
          }

          // Nothing we can do here :-(
          // No back-end was able to convert into this charset.
          // Emit an error-message and hand back the unconverted string.
          trigger_error("Can't convert into $charset_out without mb_convert_encoding() or iconv(). Use UTF-8 instead.", E_USER_WARNING);

          return $str;
      }
 
     | 
  
  
    | 
      1023
     | 
    
      
 
     | 
  
  
    | 
      1024
     | 
    
      /*
       * Callback for entities_to_7bit(): turn one numeric entity into hex digits
       *
       * @param  array   Match of '/&#([0-9]+);/' - index 1 holds the decimal code
       * @return string  The code as a hexadecimal number (no prefix)
       */
      function _entities_to_7bit_dechex($match) {
          return dechex((int)$match[1]);
      }

      /*
       * Convert filenames to ASCII
       *
       * Convert all non-ASCII characters and all HTML-entities to their plain 7bit equivalents.
       * Characters without an equivalent will be converted to hex-values.
       * The name entities_to_7bit() is somewhat misleading, but kept for compatibility-reasons.
       *
       * @param  string  Filename to convert (all encodings from charset_to_utf8() are allowed)
       * @return string  ASCII encoded string, to use as filename in wb's page_filename() and media_filename.
       *                 If utf8_check() rejects the converted string it is returned as is.
       */
      function entities_to_7bit($str) {
          // convert to UTF-8
          $str = charset_to_utf8($str);
          if (!utf8_check($str)) {
              return($str);
          }

          // replace some specials
          $str = utf8_stripspecials($str, '_');
          // translate non-ASCII characters to ASCII
          $str = utf8_romanize($str);

          // missed some? - Many UTF-8-chars can't be romanized.
          // Convert them to numeric HTML-entities ...
          $str = utf8_umlauts_to_entities($str, false);
          // ... keep the apostrophe out of the hex conversion by giving it a name ...
          // NOTE(review): the entity literals in this function were restored from
          // an HTML-decoded copy of the file - verify against the upstream source.
          $str = str_replace('&#039;', '&apos;', $str);
          // ... and replace the remaining numeric entities by hex-numbers.
          // preg_replace_callback() is used because the /e modifier no longer exists in PHP 7+.
          $str = preg_replace_callback('/&#([0-9]+);/', '_entities_to_7bit_dechex', $str);

          // maybe there are some &gt; &lt; &apos; &quot; &amp; &nbsp; left, replace them too
          $entities = array('&gt;'=>'_','&lt;'=>'_','&apos;'=>'_','&quot;'=>'_','&amp;'=>'_','&nbsp;'=>' ');
          $str = strtr($str, $entities);

          return($str);
      }
 
     | 
  
  
    | 
      1054
     | 
    
      
 
     | 
  
  
    | 
      1055
     | 
    
      /*
       * Convert a string from mixed html-entities/umlauts to pure $charset_out-umlauts
       *
       * The string is read as DEFAULT_CHARSET, converted to UTF-8 by
       * charset_to_utf8() and from there to $charset_out by utf8_to_charset().
       * Entity decoding, error reporting and the list of supported charsets
       * are those of these two functions.
       *
       * @param  string  A string in DEFAULT_CHARSET encoding
       * @param  string  The charset to convert to, defaults to DEFAULT_CHARSET
       * @return string  The result of utf8_to_charset() for the converted string
       */
      function entities_to_umlauts2($string, $charset_out=DEFAULT_CHARSET) {
          $utf8 = charset_to_utf8($string, DEFAULT_CHARSET);

          // no utf8_check() in between - it is too time-consuming
          return utf8_to_charset($utf8, $charset_out);
      }
 
     | 
  
  
    | 
      1077
     | 
    
      
 
     | 
  
  
    | 
      1078
     | 
    
      /*
       * Convert a string from mixed html-entities/umlauts to pure ASCII with HTML-entities
       *
       * The string is converted from $charset_in to UTF-8 by charset_to_utf8()
       * and then encoded by utf8_umlauts_to_entities() (named entities enabled).
       * Error reporting and the list of supported charsets are those of
       * charset_to_utf8().
       *
       * @param  string  A string in $charset_in encoding
       * @param  string  The charset to convert from, defaults to DEFAULT_CHARSET
       * @return string  The result of utf8_umlauts_to_entities() for the converted string
       */
      function umlauts_to_entities2($string, $charset_in=DEFAULT_CHARSET) {
          $utf8 = charset_to_utf8($string, $charset_in);

          // no utf8_check() in between - it is too time-consuming
          return utf8_umlauts_to_entities($utf8);
      }
 
     | 
  
  
    | 
      1099
     | 
    
      
 
     | 
  
  
    | 
      1100
     | 
    
      ?>
 
     |