1
|
<?php
|
2
|
|
3
|
// $Id: functions-utf8.php 554 2008-01-18 12:26:41Z Ruebenwurzel $
|
4
|
|
5
|
/*
|
6
|
|
7
|
Website Baker Project <http://www.websitebaker.org/>
|
8
|
Copyright (C) 2004-2008, Ryan Djurovich
|
9
|
|
10
|
Website Baker is free software; you can redistribute it and/or modify
|
11
|
it under the terms of the GNU General Public License as published by
|
12
|
the Free Software Foundation; either version 2 of the License, or
|
13
|
(at your option) any later version.
|
14
|
|
15
|
Website Baker is distributed in the hope that it will be useful,
|
16
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
17
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
18
|
GNU General Public License for more details.
|
19
|
|
20
|
You should have received a copy of the GNU General Public License
|
21
|
along with Website Baker; if not, write to the Free Software
|
22
|
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
23
|
|
24
|
*/
|
25
|
|
26
|
/*
|
27
|
* A large part of this file is based on 'utf8.php' from the DokuWiki-project.
|
28
|
* (http://www.splitbrain.org/projects/dokuwiki):
|
29
|
**
|
30
|
* UTF8 helper functions
|
31
|
* @license LGPL (http://www.gnu.org/copyleft/lesser.html)
|
32
|
* @author Andreas Gohr <andi@splitbrain.org>
|
33
|
**
|
34
|
* modified for use with Website Baker
|
35
|
* from thorn, Jan. 2008
|
36
|
*/
|
37
|
|
38
|
// Functions we use in Website Baker:
|
39
|
// entities_to_7bit()
|
40
|
// entities_to_umlauts2()
|
41
|
// umlauts_to_entities2()
|
42
|
|
43
|
// Refuse direct access: this file is only meant to be included by
// Website Baker after WB_URL has been defined.
if (!defined('WB_URL')) {
	header('Location: ../index.php');
	exit(0);
}

/*
 * Decide once whether the mbstring extension is used.
 * UTF8_MBSTRING may be pre-defined by the includer; defining
 * UTF8_NOMBSTRING forces the pure-PHP fallbacks.
 */
if (!defined('UTF8_MBSTRING')) {
	define(
		'UTF8_MBSTRING',
		(function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')) ? 1 : 0
	);
}

if (UTF8_MBSTRING) {
	mb_internal_encoding('UTF-8');
}

// Conversion tables used via "global" by the functions in this file.
require_once(WB_PATH.'/framework/charsets_table.php');
|
62
|
|
63
|
/*
 * Checks if a string contains 7bit ASCII only
 *
 * Implemented with a single byte-class regex instead of the former
 * $str{$i} loop: curly-brace string offsets were removed in PHP 8.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str string to inspect
 * @return bool   true if no byte is above 0x7F (also true for '')
 */
function utf8_isASCII($str){
	// No "u" modifier on purpose: we look at raw bytes, not characters.
	return preg_match('/[\x80-\xFF]/', $str) !== 1;
}
|
74
|
|
75
|
/*
 * Strips all highbyte chars
 *
 * Returns a pure ASCII7 string: every byte >= 0x80 is dropped.
 * Uses bracket string offsets; the former $str{$i} syntax was
 * removed in PHP 8.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str input string
 * @return string input with all bytes above 0x7F removed
 */
function utf8_strip($str){
	$ascii = '';
	$len = strlen($str); // hoisted: the string is not modified in the loop
	for ($i = 0; $i < $len; $i++) {
		if (ord($str[$i]) < 128) {
			$ascii .= $str[$i];
		}
	}
	return $ascii;
}
|
91
|
|
92
|
/*
 * Tries to detect if a string is in Unicode encoding
 *
 * Structural check only: every lead byte must be followed by the
 * number of 10bbbbbb continuation bytes it announces. Lead bytes
 * announcing 5 and 6 byte sequences are accepted as well, and
 * overlong forms are not rejected.
 *
 * @author <bmorel@ssi.fr>
 * @link   http://www.php.net/manual/en/function.utf8-encode.php
 * @param  string $Str string to check
 * @return bool   true if the byte structure looks like UTF-8
 */
function utf8_check($Str) {
	$total = strlen($Str);
	$pos = 0;

	while ($pos < $total) {
		$lead = ord($Str[$pos]);

		if ($lead >= 0x80) {
			// Work out how many continuation bytes the lead byte announces.
			if (($lead & 0xE0) == 0xC0) {        // 110bbbbb
				$extra = 1;
			} elseif (($lead & 0xF0) == 0xE0) {  // 1110bbbb
				$extra = 2;
			} elseif (($lead & 0xF8) == 0xF0) {  // 11110bbb
				$extra = 3;
			} elseif (($lead & 0xFC) == 0xF8) {  // 111110bb
				$extra = 4;
			} elseif (($lead & 0xFE) == 0xFC) {  // 1111110b
				$extra = 5;
			} else {
				return false;                    // matches no model
			}

			// Each announced byte must exist and look like 10bbbbbb.
			while ($extra-- > 0) {
				$pos++;
				if ($pos == $total || (ord($Str[$pos]) & 0xC0) != 0x80) {
					return false;
				}
			}
		}

		$pos++;
	}

	return true;
}
|
115
|
|
116
|
/*
 * Unicode aware replacement for strlen()
 *
 * utf8_decode() turns every character outside ISO-8859-1 into a
 * single '?', so the byte length of its result equals the number
 * of characters - which is all we need for counting.
 *
 * @author <chernyshevsky at hotmail dot com>
 * @see    strlen()
 * @see    utf8_decode()
 * @param  string $string UTF-8 encoded string
 * @return int    number of characters
 */
function utf8_strlen($string){
	$singleByte = utf8_decode($string);
	return strlen($singleByte);
}
|
130
|
|
131
|
/*
 * UTF-8 aware alternative to substr
 *
 * Return part of a string given character offset (and optionally length).
 * With mbstring enabled (UTF8_MBSTRING) this is a thin wrapper around
 * mb_substr(). Otherwise a PCRE pattern with the 'u' flag is built:
 *  - PCRE only allows repetition counts below 65536, so larger
 *    offsets/lengths are expressed as N groups of 65535 characters plus
 *    a remainder;
 *  - like mb_substr() (and unlike substr()) an empty string is returned
 *    instead of false;
 *  - counting the characters of the string is relatively expensive, so it
 *    is only done when a negative offset or an explicit length needs it.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @author Chris Smith <chris@jalakai.co.uk>
 * @param  string  $str    UTF-8 encoded string
 * @param  integer $offset number of UTF-8 characters offset (from left)
 * @param  integer $length (optional) length in UTF-8 characters from offset
 * @return string  the requested part, '' if nothing matches
 */
function utf8_substr($str, $offset, $length = null) {
	if (UTF8_MBSTRING) {
		return ($length === null)
			? mb_substr($str, $offset)
			: mb_substr($str, $offset, $length);
	}

	// cast parameters once to avoid multiple notices/warnings later on
	$str    = (string)$str;
	$offset = (int)$offset;
	if ($length !== null) {
		$length = (int)$length;
	}

	// trivial cases
	if ($length === 0) {
		return '';
	}
	if ($offset < 0 && $length < 0 && $length < $offset) {
		return '';
	}

	$charCount = null; // computed lazily, see header comment

	// normalise negative offsets (tail anchored patterns are horribly slow)
	if ($offset < 0) {
		$charCount = strlen(utf8_decode($str));
		$offset = max(0, $charCount + $offset);
	}

	// non-capturing group that skips $offset characters
	if ($offset > 0) {
		$blocks = (int)($offset / 65535);
		$rest   = $offset % 65535;
		$skip   = $blocks ? '(?:.{65535}){'.$blocks.'}' : '';
		$head   = '^(?:'.$skip.'.{'.$rest.'})';
	} else {
		$head = '^'; // offset == 0; just anchor the pattern
	}

	// capturing part that selects the wanted characters
	if ($length === null) {
		$tail = '(.*)$'; // the rest of the string
	} else {
		if ($charCount === null) {
			$charCount = strlen(utf8_decode($str));
		}
		if ($offset > $charCount) {
			return ''; // another trivial case
		}

		$tail = '';
		if ($length > 0) {
			// never ask for more characters than are left
			$length = min($charCount - $offset, $length);

			$blocks = (int)($length / 65535);
			$rest   = $length % 65535;
			$take   = $blocks ? '(?:.{65535}){'.$blocks.'}' : '';

			// captured group of exactly $length characters
			$tail = '('.$take.'.{'.$rest.'})';
		} else if ($length < 0) {
			if ($length < ($offset - $charCount)) {
				return '';
			}

			$drop   = -$length;
			$blocks = (int)($drop / 65535);
			$rest   = $drop % 65535;
			$cut    = $blocks ? '(?:.{65535}){'.$blocks.'}' : '';

			// capture everything except $drop characters anchored at the end
			$tail = '(.*)(?:'.$cut.'.{'.$rest.'})$';
		}
	}

	if (!preg_match('#'.$head.$tail.'#us', $str, $match)) {
		return '';
	}
	return $match[1];
}
|
232
|
|
233
|
/*
 * Unicode aware replacement for substr_replace()
 *
 * The result is the first $start characters (only when $start is
 * positive), followed by $replacement, followed by everything from
 * character position $start+$length onwards.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    substr_replace()
 * @param  string  $string      UTF-8 encoded string
 * @param  string  $replacement text to insert
 * @param  integer $start       character offset where the replacement starts
 * @param  integer $length      number of characters to replace (default 0)
 * @return string
 */
function utf8_substr_replace($string, $replacement, $start , $length=0 ){
	$head = ($start > 0) ? utf8_substr($string, 0, $start) : '';
	$tail = utf8_substr($string, $start + $length);

	return $head.$replacement.$tail;
}
|
246
|
|
247
|
/*
 * Unicode aware replacement for ltrim()
 *
 * Without a charlist this is plain ltrim(). With a charlist, every
 * character of it is stripped from the start of the string; the list
 * is treated literally (no "a..z" ranges).
 *
 * The charlist is quoted with preg_quote() so that every character,
 * including '^', is literal inside the character class. The former
 * hand-written quoting missed '^', so a charlist starting with it
 * negated (or broke) the class.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    ltrim()
 * @param  string $str      UTF-8 encoded string
 * @param  string $charlist characters to strip (default: whitespace as ltrim())
 * @return string
 */
function utf8_ltrim($str,$charlist=''){
	if ($charlist == '') return ltrim($str);

	// quote charlist for use in a character class
	$class = preg_quote($charlist, '/');

	return preg_replace('/^['.$class.']+/u', '', $str);
}
|
262
|
|
263
|
/*
 * Unicode aware replacement for rtrim()
 *
 * Without a charlist this is plain rtrim(). With a charlist, every
 * character of it is stripped from the end of the string; the list
 * is treated literally (no "a..z" ranges).
 *
 * The charlist is quoted with preg_quote() so that '^' is literal too
 * (the former hand-written quoting missed it). The 'D' modifier makes
 * '$' match only at the very end, so a trailing newline is no longer
 * skipped over - same as rtrim().
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    rtrim()
 * @param  string $str      UTF-8 encoded string
 * @param  string $charlist characters to strip (default: whitespace as rtrim())
 * @return string
 */
function utf8_rtrim($str,$charlist=''){
	if ($charlist == '') return rtrim($str);

	// quote charlist for use in a character class
	$class = preg_quote($charlist, '/');

	return preg_replace('/['.$class.']+$/uD', '', $str);
}
|
278
|
|
279
|
/*
 * Unicode aware replacement for trim()
 *
 * Falls back to trim() when no charlist is given; otherwise strips
 * the charlist from the right and then from the left.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    trim()
 * @param  string $str      UTF-8 encoded string
 * @param  string $charlist characters to strip
 * @return string
 */
function utf8_trim($str,$charlist='') {
	if ($charlist != '') {
		$rightTrimmed = utf8_rtrim($str, $charlist);
		return utf8_ltrim($rightTrimmed, $charlist);
	}

	return trim($str);
}
|
291
|
|
292
|
/*
 * This is a unicode aware replacement for strtolower()
 *
 * Uses the mbstring extension if available, otherwise the global
 * $UTF8_UPPER_TO_LOWER translation table.
 *
 * @author Leo Feyer <leo@typolight.org>
 * @see    strtolower()
 * @see    utf8_strtoupper()
 * @param  string $string UTF-8 encoded string
 * @return string
 */
function utf8_strtolower($string){
	if (!UTF8_MBSTRING) {
		global $UTF8_UPPER_TO_LOWER;
		return strtr($string, $UTF8_UPPER_TO_LOWER);
	}

	return mb_strtolower($string, 'utf-8');
}
|
307
|
|
308
|
/*
 * This is a unicode aware replacement for strtoupper()
 *
 * Uses the mbstring extension if available, otherwise the global
 * $UTF8_LOWER_TO_UPPER translation table.
 *
 * @author Leo Feyer <leo@typolight.org>
 * @see    strtoupper()
 * @see    utf8_strtolower()
 * @param  string $string UTF-8 encoded string
 * @return string
 */
function utf8_strtoupper($string){
	if (!UTF8_MBSTRING) {
		global $UTF8_LOWER_TO_UPPER;
		return strtr($string, $UTF8_LOWER_TO_UPPER);
	}

	return mb_strtoupper($string, 'utf-8');
}
|
323
|
|
324
|
/*
 * Romanize a non-latin string
 *
 * Pure ASCII input is returned untouched; everything else is run
 * through the global $UTF8_ROMANIZATION translation table.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string UTF-8 encoded string
 * @return string
 */
function utf8_romanize($string){
	global $UTF8_ROMANIZATION;

	if (!utf8_isASCII($string)) {
		return strtr($string, $UTF8_ROMANIZATION);
	}

	return $string; // nothing to do
}
|
335
|
|
336
|
/*
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * Besides the characters listed in the global $UTF8_SPECIAL_CHARS2,
 * the bytes 0x00 to 0x19 are stripped as well. The quoted special
 * character list is built on the first call and cached afterwards.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string     The UTF8 string to strip of special chars
 * @param  string $repl       Replace special with this string
 * @param  string $additional Additional chars to strip (inserted verbatim
 *                            into the regexp char class, so it must already
 *                            be escaped for that context)
 * @return string
 */
function utf8_stripspecials($string,$repl='',$additional=''){
	global $UTF8_SPECIAL_CHARS2;
	static $specials = null;

	if ($specials === null) {
		$specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
	}

	$pattern = '/['.$additional.'\x00-\x19'.$specials.']/u';

	return preg_replace($pattern, $repl, $string);
}
|
357
|
|
358
|
/*
 * This is an Unicode aware replacement for strpos
 *
 * $offset and the returned position are counted in characters.
 * strpos() itself works on bytes, so a hit that lies before the
 * requested character offset is skipped by shifting the byte offset
 * and searching again.
 *
 * @author Leo Feyer <leo@typolight.org>
 * @see    strpos()
 * @param  string  $haystack UTF-8 encoded string to search in
 * @param  string  $needle   string to search for
 * @param  integer $offset   character offset to start searching from
 * @return mixed   character position of the first hit, false if not found
 */
function utf8_strpos($haystack, $needle, $offset=0){
	$byteShift = 0;

	do {
		$bytePos = strpos($haystack, $needle, $offset + $byteShift);
		if ($bytePos === false) {
			return false;
		}

		// translate the byte position into a character position
		$charPos = utf8_strlen(substr($haystack, 0, $bytePos));

		if ($charPos < $offset) {
			// hit lies before the wanted character offset: compensate
			// for the multi-byte characters seen so far and retry
			$byteShift = $bytePos - $charPos;
		}
	} while ($charPos < $offset);

	return $charPos;
}
|
386
|
|
387
|
/*
 * Encodes UTF-8 characters to HTML entities
 *
 * Code points below 0x80 are kept as plain characters, everything
 * else becomes a decimal numeric entity (&#NNN;).
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @author <vpribish at shopping dot com>
 * @link   http://www.php.net/manual/en/function.utf8-decode.php
 * @param  string $str UTF-8 encoded string
 * @return string ASCII string with numeric entities
 */
function utf8_tohtml ($str) {
	$html = '';

	foreach (utf8_to_unicode($str) as $codepoint) {
		$html .= ($codepoint < 0x80)
			? chr($codepoint)
			: '&#'.$codepoint.';';
	}

	return $html;
}
|
407
|
|
408
|
/*
 * Decodes HTML entities to UTF-8 characters
 *
 * By default ($entities is null) only numeric entities - decimal
 * (&#NNN;) and hexadecimal (&#xHH;) - are converted to their code
 * points. When any non-null value (e.g. HTML_ENTITIES) is passed,
 * named entities are decoded in the same pass through
 * utf8_entity_decoder. Doing both in one pass avoids decoding the
 * output of one step again in the other.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @param  string  $str      UTF-8 encoded string
 * @param  boolean $entities Flag controlling decoding of named entities.
 * @return string  UTF-8 encoded string with numeric (and named) entities replaced.
 */
function utf8_unhtml($str, $entities=null) {
	static $decoder = null;

	if ($decoder === null) {
		$decoder = new utf8_entity_decoder();
	}

	if ($entities !== null) {
		// numeric and named entities
		return preg_replace_callback(
			'/&(#)?([Xx])?([0-9A-Za-z]+);/m',
			array(&$decoder, 'decode'),
			$str
		);
	}

	// numeric entities only
	return preg_replace_callback(
		'/(&#([Xx])?([0-9A-Za-z]+);)/m',
		'utf8_decode_numeric',
		$str
	);
}
|
437
|
/*
 * Callback for utf8_unhtml(): converts one numeric entity to UTF-8
 *
 * Expects a preg match array where [0] is the complete entity,
 * [2] is 'x'/'X' for hexadecimal entities (empty otherwise) and
 * [3] holds the digits.
 *
 * The calling patterns accept any alphanumeric run as "digits".
 * Malformed entities such as &#zz; or &#12ab; are returned unchanged;
 * they used to be decoded from whatever prefix intval()/hexdec()
 * accepted, which produced a NUL byte for &#zz;.
 *
 * @param  array  $ent preg match array as described above
 * @return string UTF-8 character, or the untouched entity if malformed
 */
function utf8_decode_numeric($ent) {
	$digits = $ent[3];

	if ($ent[2] === 'X' || $ent[2] === 'x') {
		if (!preg_match('/^[0-9A-Fa-f]+$/D', $digits)) {
			return $ent[0];
		}
		$cp = hexdec($digits);
	} else {
		if (!preg_match('/^[0-9]+$/D', $digits)) {
			return $ent[0];
		}
		$cp = intval($digits);
	}

	return unicode_to_utf8(array($cp));
}
|
449
|
/*
 * Helper for utf8_unhtml(): decodes named and numeric HTML entities
 *
 * Changes against the previous version:
 *  - the constructor is now __construct(); a method named like the
 *    class is no longer run as constructor on PHP 8, which left
 *    $table empty;
 *  - the translation table is requested explicitly in UTF-8 and used
 *    as is. The old code ran ord() on the table's characters, which
 *    only works while these are single bytes (not the case for the
 *    UTF-8 table that is the default since PHP 5.4).
 */
class utf8_entity_decoder {
	// map: named entity (e.g. '&auml;') => UTF-8 encoded character
	public $table;

	/*
	 * Builds the entity => character lookup table.
	 */
	function __construct() {
		$this->table = array_flip(
			get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8')
		);
	}

	/*
	 * Converts a single-byte (ISO-8859-1) character to UTF-8.
	 * No longer used internally; kept for existing callers.
	 *
	 * @param  string $c single-byte character
	 * @return string UTF-8 encoded character
	 */
	function makeutf8($c) {
		return unicode_to_utf8(array(ord($c)));
	}

	/*
	 * preg_replace_callback() handler used by utf8_unhtml().
	 *
	 * @param  array  $ent match array: [0] whole entity, [1] '#' for
	 *                     numeric entities, [2] hex marker, [3] name/digits
	 * @return string decoded character, or the untouched entity if unknown
	 */
	function decode($ent) {
		if ($ent[1] == '#') {
			return utf8_decode_numeric($ent);
		}
		if (array_key_exists($ent[0], $this->table)) {
			return $this->table[$ent[0]];
		}
		return $ent[0];
	}
}
|
469
|
|
470
|
/*
 * Takes an UTF-8 string and returns an array of ints representing the
 * Unicode characters. Astral planes are supported ie. the ints in the
 * output can be > 0xFFFF. Occurrences of the BOM are ignored.
 *
 * If $strict is set to true the function returns false if the input
 * string isn't a valid UTF-8 octet sequence and raises a PHP error at
 * level E_USER_WARNING. If $strict is false, bytes that cannot start or
 * continue a sequence are silently skipped, and code points that fail
 * the validity checks (overlong forms, surrogates, out of range) are
 * still added to the output.
 *
 * Bytes are read with bracket offsets ($str[$i]); the curly-brace form
 * used previously was removed in PHP 8.
 *
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @param  string  $str    UTF-8 encoded string
 * @param  boolean $strict Check for invalid sequences?
 * @return mixed   array of unicode code points or false if UTF-8 invalid
 * @see    unicode_to_utf8
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function utf8_to_unicode($str,$strict=false) {
	$mState = 0; // octets still expected before the current sequence is complete
	$mUcs4  = 0; // code point assembled so far
	$mBytes = 1; // total number of octets in the current sequence

	$out = array();
	$len = strlen($str);

	for ($i = 0; $i < $len; $i++) {

		$in = ord($str[$i]);

		if ($mState == 0) {

			// Expecting either a US-ASCII character or the first octet
			// of a multi-octet sequence.
			if (0 == (0x80 & $in)) {
				// US-ASCII, pass straight through.
				$out[] = $in;
				$mBytes = 1;

			} else if (0xC0 == (0xE0 & $in)) {
				// First octet of 2 octet sequence
				$mUcs4 = ($in & 0x1F) << 6;
				$mState = 1;
				$mBytes = 2;

			} else if (0xE0 == (0xF0 & $in)) {
				// First octet of 3 octet sequence
				$mUcs4 = ($in & 0x0F) << 12;
				$mState = 2;
				$mBytes = 3;

			} else if (0xF0 == (0xF8 & $in)) {
				// First octet of 4 octet sequence
				$mUcs4 = ($in & 0x07) << 18;
				$mState = 3;
				$mBytes = 4;

			} else if (0xF8 == (0xFC & $in)) {
				/* First octet of 5 octet sequence.
				 *
				 * This is illegal because the encoded codepoint must be either
				 * (a) not the shortest form or
				 * (b) outside the Unicode range of 0-0x10FFFF.
				 * Rather than trying to resynchronize, we carry on until the end
				 * of the sequence and let the later error handling code catch it.
				 */
				$mUcs4 = ($in & 0x03) << 24;
				$mState = 4;
				$mBytes = 5;

			} else if (0xFC == (0xFE & $in)) {
				// First octet of 6 octet sequence, see comments for 5 octet sequence.
				$mUcs4 = ($in & 1) << 30;
				$mState = 5;
				$mBytes = 6;

			} elseif ($strict) {
				// Neither US-ASCII nor a legal first octet of a
				// multi-octet sequence.
				trigger_error(
					'utf8_to_unicode: Illegal sequence identifier '.
					'in UTF-8 at byte '.$i,
					E_USER_WARNING
				);
				return false;
			}

		} else {

			// Inside a multi-octet sequence: a continuation octet is expected.
			if (0x80 == (0xC0 & $in)) {

				// Legal continuation: merge its 6 payload bits.
				$shift = ($mState - 1) * 6;
				$mUcs4 |= ($in & 0x0000003F) << $shift;

				if (0 == --$mState) {
					// End of the sequence; $mUcs4 holds the final code point.

					if (
						// From Unicode 3.1, non-shortest form is illegal
						((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
						((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
						((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
						(4 < $mBytes) ||
						// From Unicode 3.2, surrogate characters are illegal
						(($mUcs4 & 0xFFFFF800) == 0xD800) ||
						// Codepoints outside the Unicode range are illegal
						($mUcs4 > 0x10FFFF)
					) {
						if ($strict) {
							trigger_error(
								'utf8_to_unicode: Illegal sequence or codepoint '.
								'in UTF-8 at byte '.$i,
								E_USER_WARNING
							);
							return false;
						}
					}

					if (0xFEFF != $mUcs4) {
						// BOM is legal but we don't want to output it
						$out[] = $mUcs4;
					}

					// reset the decoder state for the next character
					$mState = 0;
					$mUcs4  = 0;
					$mBytes = 1;
				}

			} elseif ($strict) {
				// Incomplete multi-octet sequence.
				trigger_error(
					'utf8_to_unicode: Incomplete multi-octet '.
					'  sequence in UTF-8 at byte '.$i,
					E_USER_WARNING
				);
				return false;
			}
		}
	}

	return $out;
}
|
641
|
|
642
|
/*
 * Takes an array of ints representing the Unicode characters and returns
 * a UTF-8 string. Astral planes are supported ie. the ints in the
 * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
 * are not allowed.
 *
 * If $strict is set to true the function returns false if the input
 * array contains ints that represent surrogates or are outside the
 * Unicode range and raises a PHP error at level E_USER_WARNING.
 * If $strict is false such values are silently skipped.
 *
 * The string is built by plain concatenation. The former version used
 * output buffering and left the buffer open (ob_start() without a
 * matching ob_end_clean()) whenever it bailed out in strict mode.
 * Negative values are now treated as out of range instead of being
 * emitted as a broken 2 byte sequence.
 *
 * @param  array   $arr    unicode code points representing a string
 * @param  boolean $strict Check for invalid sequences?
 * @return mixed   UTF-8 string or false if array contains invalid code points
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    utf8_to_unicode
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function unicode_to_utf8($arr,$strict=false) {
	if (!is_array($arr)) return '';

	$utf8 = '';

	foreach ($arr as $k => $cp) {

		if ($cp < 0 || $cp > 0x10ffff) {
			// outside the Unicode range
			if ($strict) {
				trigger_error(
					'unicode_to_utf8: Codepoint out of Unicode range '.
					'at index: '.$k.', value: '.$cp,
					E_USER_WARNING
				);
				return false;
			}

		} else if ($cp <= 0x007f) {
			// ASCII range (including control chars)
			$utf8 .= chr($cp);

		} else if ($cp <= 0x07ff) {
			// 2 byte sequence
			$utf8 .= chr(0xc0 | ($cp >> 6));
			$utf8 .= chr(0x80 | ($cp & 0x003f));

		} else if ($cp == 0xFEFF) {
			// Byte order mark: skip

		} else if ($cp >= 0xD800 && $cp <= 0xDFFF) {
			// illegal surrogate
			if ($strict) {
				trigger_error(
					'unicode_to_utf8: Illegal surrogate '.
					'at index: '.$k.', value: '.$cp,
					E_USER_WARNING
				);
				return false;
			}

		} else if ($cp <= 0xffff) {
			// 3 byte sequence
			$utf8 .= chr(0xe0 | ($cp >> 12));
			$utf8 .= chr(0x80 | (($cp >> 6) & 0x003f));
			$utf8 .= chr(0x80 | ($cp & 0x003f));

		} else {
			// 4 byte sequence
			$utf8 .= chr(0xf0 | ($cp >> 18));
			$utf8 .= chr(0x80 | (($cp >> 12) & 0x3f));
			$utf8 .= chr(0x80 | (($cp >> 6) & 0x3f));
			$utf8 .= chr(0x80 | ($cp & 0x3f));
		}
	}

	return $utf8;
}
|
732
|
|
733
|
/*
 * Replace bad bytes with an alternative character
 *
 * ASCII character is recommended for replacement char
 *
 * PCRE Pattern to locate bad bytes in a UTF-8 string
 * Comes from W3 FAQ: Multilingual Forms
 * Note: modified to include full ASCII range including control chars
 *
 * The string is tokenised in a single preg_match_all() pass. The former
 * version re-ran the pattern and copied the rest of the string for every
 * character, and collected its result via output buffering.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    http://www.w3.org/International/questions/qa-forms-utf-8
 * @param  string $str     string to search
 * @param  string $replace string to replace bad bytes with (defaults to '',
 *                         i.e. bad bytes are removed) - use ASCII
 * @return string
 */
function utf8_bad_replace($str, $replace = '') {
	// Alternatives 1-8 match one well-formed character each; the last
	// one (capture group 2) catches any single byte that fits none of them.
	$UTF8_BAD =
		'([\x00-\x7F]'.                          # ASCII (including control chars)
		'|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
		'|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
		'|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
		'|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
		'|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
		'|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
		'|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
		'|(.{1}))';                              # invalid byte

	$result = '';

	// Every byte matches some alternative, so the matches are contiguous
	// and cover the whole string from left to right.
	if (preg_match_all('/'.$UTF8_BAD.'/S', $str, $matches, PREG_SET_ORDER)) {
		foreach ($matches as $m) {
			$isBad = isset($m[2]) && $m[2] !== '';
			$result .= $isBad ? $replace : $m[0];
		}
	}

	return $result;
}
|
772
|
|
773
|
/*
 * URL-Encode a filename to allow unicode characters
 *
 * Slashes are not encoded.
 *
 * When the second parameter is true the string is only encoded if
 * it contains something other than letters, digits and / _ - . % -
 * this makes it safe to run the function multiple times on the
 * same string (default is true).
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 * @param  string  $file filename (may contain path separators)
 * @param  boolean $safe skip strings that already look encoded
 * @return string
 */
function utf8_encodeFN($file,$safe=true){
	$looksEncoded = $safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#', $file);

	if (!$looksEncoded) {
		// encode everything, then restore the path separators
		return str_replace('%2F', '/', urlencode($file));
	}

	return $file;
}
|
794
|
|
795
|
/*
 * URL-Decode a filename
 *
 * This is just a wrapper around urldecode
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 * @param  string $file URL-encoded filename
 * @return string decoded filename
 */
function utf8_decodeFN($file){
	return urldecode($file);
}
|
807
|
|
808
|
/*
|
809
|
* Moved some functions from framework/functions.php to here - thorn
|
810
|
*/
|
811
|
|
812
|
/*
 * Decode HTML entities to UTF-8 characters
 *
 * Named entities are first mapped to numeric ones through the global
 * $named_to_numbered_entities table, then all numeric entities are
 * decoded by utf8_unhtml(). The entity &#39; (apostrophe) is
 * deliberately left encoded.
 *
 * NOTE(review): the previous code masked a literal "'" instead of the
 * entity "&#39;", which made the protection a no-op and contradicted
 * its own comment; this looks like entity-decoding damage to the source.
 * Which other entities stay untouched depends on the contents of
 * $named_to_numbered_entities - verify against charsets_table.php.
 *
 * @param  string $str UTF-8 or ASCII encoded string
 * @return string UTF-8 encoded string with numeric and named entities replaced.
 */
function utf8_entities_to_umlauts($str) {
	global $named_to_numbered_entities;

	// hide "&#39;" from the decoder behind a marker that is no valid entity
	$str = str_replace("&#39;", "&_#39;", $str);

	$str = strtr($str, $named_to_numbered_entities);
	$str = utf8_unhtml($str);

	// restore the protected entity
	$str = str_replace("&_#39;", "&#39;", $str);

	return($str);
}
|
831
|
|
832
|
/*
 * Encode UTF-8 characters to HTML entities
 *
 * All non-ASCII characters are turned into numeric entities by
 * utf8_tohtml(); optionally these are then mapped to named entities
 * through the global $numbered_to_named_entities table.
 *
 * @param  string $str            UTF-8 encoded string
 * @param  bool   $named_entities Replace numbered by named entities
 * @return string ASCII encoded string with all UTF-8 characters replaced by numeric/named entities
 */
function utf8_umlauts_to_entities($str, $named_entities=true) {
	global $numbered_to_named_entities;

	$encoded = utf8_tohtml($str);

	return $named_entities
		? strtr($encoded, $numbered_to_named_entities)
		: $encoded;
}
|
848
|
|
849
|
/*
 * Converts from various charsets to UTF-8
 *
 * Will convert a string from various charsets to UTF-8.
 * HTML-entities will be converted, too.
 * In case of error the returned string is unchanged, and a message is emitted.
 * Conversion is attempted in this order:
 *   direct:              ISO-8859-1 (utf8_encode) and ISO-8859-2 .. ISO-8859-11
 *                        (translation tables from charsets_table.php)
 *   mb_convert_encoding: every other charset except GB2312, if mbstring is used
 *   iconv:               whatever is left, if iconv is available
 *
 * Changes against the previous version:
 *  - the initial "can't convert" check was removed: it compared the already
 *    upper-cased charset with lower-case names and therefore never matched;
 *    the same situation is reported by the fallback at the end;
 *  - a failing iconv() (returns false) is no longer treated as success, so
 *    the input string is returned unchanged as documented.
 *
 * @param  string $str        A string in supported encoding
 * @param  string $charset_in The charset to convert from, defaults to DEFAULT_CHARSET
 * @return string A string in UTF-8-encoding, with all entities decoded, too.
 *                String is unchanged in case of error.
 */
function charset_to_utf8($str, $charset_in=DEFAULT_CHARSET) {
	global $iso_8859_2_to_utf8, $iso_8859_3_to_utf8, $iso_8859_4_to_utf8, $iso_8859_5_to_utf8, $iso_8859_6_to_utf8, $iso_8859_7_to_utf8, $iso_8859_8_to_utf8, $iso_8859_9_to_utf8, $iso_8859_10_to_utf8, $iso_8859_11_to_utf8;

	$charset_in = strtoupper($charset_in);
	if ($charset_in == "") { $charset_in = 'UTF-8'; }
	$converted = false;

	// UTF-8 or plain ASCII: nothing to convert, just decode HTML-entities
	if ($charset_in == 'UTF-8' || utf8_isASCII($str)) {
		if (preg_match('/&[#0-9a-zA-Z]+;/', $str)) {
			return(utf8_entities_to_umlauts($str));
		}
		return($str); // nothing to do
	}

	// 1. charsets we can convert ourselves
	if (substr($charset_in,0,8) == 'ISO-8859') {
		$converted = true;
		switch ($charset_in) {
			case 'ISO-8859-1':  $str = utf8_encode($str); break;
			case 'ISO-8859-2':  $str = strtr($str, $iso_8859_2_to_utf8); break;
			case 'ISO-8859-3':  $str = strtr($str, $iso_8859_3_to_utf8); break;
			case 'ISO-8859-4':  $str = strtr($str, $iso_8859_4_to_utf8); break;
			case 'ISO-8859-5':  $str = strtr($str, $iso_8859_5_to_utf8); break;
			case 'ISO-8859-6':  $str = strtr($str, $iso_8859_6_to_utf8); break;
			case 'ISO-8859-7':  $str = strtr($str, $iso_8859_7_to_utf8); break;
			case 'ISO-8859-8':  $str = strtr($str, $iso_8859_8_to_utf8); break;
			case 'ISO-8859-9':  $str = strtr($str, $iso_8859_9_to_utf8); break;
			case 'ISO-8859-10': $str = strtr($str, $iso_8859_10_to_utf8); break;
			case 'ISO-8859-11': $str = strtr($str, $iso_8859_11_to_utf8); break;
			default:            $converted = false; // ISO-8859-x without a table
		}
	}

	if (!$converted && UTF8_MBSTRING && $charset_in != 'GB2312') {
		// 2. mbstring - but there's no GB2312 encoding in php's mb_* functions
		$str = mb_convert_encoding($str, 'UTF-8', $charset_in);
		$converted = true;
	} elseif (!$converted && function_exists('iconv')) {
		// 3. iconv; keep the original string if it fails
		$iconv_result = iconv($charset_in, 'UTF-8', $str);
		if ($iconv_result !== false) {
			$str = $iconv_result;
			$converted = true;
		}
	}

	if ($converted) {
		// we have utf-8, now replace HTML-entities and return
		if (preg_match('/&[#0-9a-zA-Z]+;/', $str)) {
			$str = utf8_entities_to_umlauts($str);
		}
		// just to be sure, replace bad characters
		$str = utf8_bad_replace($str, '?');
		return($str);
	}

	// Nothing we can do here :-(
	// Charset is one of those obscure ISO-2022... or BIG5, GB2312 or something
	// and we can't use mb_convert_encoding() or iconv();
	// Emit an error-message.
	trigger_error("Can't convert from $charset_in without mb_convert_encoding() or iconv(). Use UTF-8 instead.", E_USER_WARNING);

	return $str;
}
|
938
|
|
939
|
/*
 * Converts from UTF-8 to various charsets
 *
 * HTML-entities contained in the string are decoded (via utf8_entities_to_umlauts())
 * before the charset conversion takes place.
 *
 * Conversion strategy, in this order:
 *   direct:               ISO-8859-1 (utf8_decode) and ISO-8859-2 ... ISO-8859-11
 *                         (strtr with the global $utf8_to_iso_8859_N tables)
 *   mb_convert_encoding:  everything else when UTF8_MBSTRING is set, except GB2312
 *   iconv:                everything else when iconv() is available
 *
 * @param  string  An UTF-8 encoded string
 * @param  string  The charset to convert to (case-insensitive), defaults to DEFAULT_CHARSET
 * @return string  The string in $charset_out encoding with entities decoded.
 *                 If it is known up front that no converter is available for
 *                 BIG5, ISO-2022-JP, ISO-2022-KR or GB2312, an E_USER_WARNING is
 *                 raised and the string is returned completely unchanged.
 *                 If conversion fails later on, an E_USER_WARNING is raised and the
 *                 UTF-8 string (entities already decoded) is returned.
 */
function utf8_to_charset($str, $charset_out=DEFAULT_CHARSET) {
	global $utf8_to_iso_8859_2, $utf8_to_iso_8859_3, $utf8_to_iso_8859_4, $utf8_to_iso_8859_5, $utf8_to_iso_8859_6, $utf8_to_iso_8859_7, $utf8_to_iso_8859_8, $utf8_to_iso_8859_9, $utf8_to_iso_8859_10, $utf8_to_iso_8859_11;
	// From here on every comparison against $charset_out must use upper-case literals.
	$charset_out = strtoupper($charset_out);
	$converted = false;
	$have_iconv = function_exists('iconv');

	// Charsets that can only be handled by mb_convert_encoding() or iconv().
	// GB2312 is special: it is never passed to mb_convert_encoding(), so it needs iconv().
	$multibyte_only = ($charset_out=='BIG5' || $charset_out=='ISO-2022-JP' || $charset_out=='ISO-2022-KR');
	if(!$have_iconv && ($charset_out=='GB2312' || (!UTF8_MBSTRING && $multibyte_only))) {
		// Nothing we can do here :-(
		// We can't use mb_convert_encoding() or iconv() for this charset.
		// Emit an error-message and hand back the untouched input.
		trigger_error("Can't convert into $charset_out without mb_convert_encoding() or iconv(). Use UTF-8 instead.", E_USER_WARNING);
		return($str);
	}

	// replace HTML-entities first
	if(preg_match('/&[#0-9a-zA-Z]+;/',$str))
		$str = utf8_entities_to_umlauts($str);

	// check if we need to convert
	if($charset_out == 'UTF-8' || utf8_isASCII($str)) {
		// Nothing to do. Just return
		return($str);
	}

	// Convert $str to $charset_out
	if(substr($charset_out,0,8) == 'ISO-8859') {
		// Assume a known ISO-8859 variant; the default branch revokes that.
		$converted = true;
		switch($charset_out) {
			case 'ISO-8859-1': $str=utf8_decode($str); break;
			case 'ISO-8859-2': $str=strtr($str, $utf8_to_iso_8859_2); break;
			case 'ISO-8859-3': $str=strtr($str, $utf8_to_iso_8859_3); break;
			case 'ISO-8859-4': $str=strtr($str, $utf8_to_iso_8859_4); break;
			case 'ISO-8859-5': $str=strtr($str, $utf8_to_iso_8859_5); break;
			case 'ISO-8859-6': $str=strtr($str, $utf8_to_iso_8859_6); break;
			case 'ISO-8859-7': $str=strtr($str, $utf8_to_iso_8859_7); break;
			case 'ISO-8859-8': $str=strtr($str, $utf8_to_iso_8859_8); break;
			case 'ISO-8859-9': $str=strtr($str, $utf8_to_iso_8859_9); break;
			case 'ISO-8859-10': $str=strtr($str, $utf8_to_iso_8859_10); break;
			case 'ISO-8859-11': $str=strtr($str, $utf8_to_iso_8859_11); break;
			default: $converted = false; // ISO-8859-x without a translation table
		}
	}
	if(!$converted && UTF8_MBSTRING && $charset_out != 'GB2312') {
		// $charset is neither UTF-8 nor a known ISO-8859...
		// Try mb_convert_encoding() - but GB2312 is deliberately not routed through it
		$str = mb_convert_encoding($str, $charset_out, 'UTF-8');
		$converted = true;
	} elseif(!$converted) { // Try iconv
		if($have_iconv) {
			// iconv() returns false on failure; never let that replace the string
			$iconv_result = iconv('UTF-8', $charset_out, $str);
			if($iconv_result !== false) {
				$str = $iconv_result;
				$converted = true;
			}
		}
	}
	if($converted) {
		return($str);
	}

	// Nothing we can do here :-(
	// Either no converter is available for this charset, or iconv() failed.
	// Emit an error-message.
	trigger_error("Can't convert into $charset_out without mb_convert_encoding() or iconv(). Use UTF-8 instead.", E_USER_WARNING);

	return $str;
}
|
1023
|
|
1024
|
/*
 * Callback for entities_to_7bit(): turns one numeric entity into its hex value
 *
 * @param  array   preg match; index 1 holds the decimal digits of the entity
 * @return string  The code point as lower-case hex digits (no prefix)
 */
function _entities_to_7bit_dechex($match) {
	return dechex((int)$match[1]);
}

/*
 * convert Filenames to ASCII
 *
 * Convert all non-ASCII characters and all HTML-entities to their plain 7bit equivalents
 * Characters without an equivalent will be converted to hex-values.
 * The name entities_to_7bit() is somewhat misleading, but kept for compatibility-reasons.
 *
 * @param  string  Filename to convert (all encodings from charset_to_utf8() are allowed)
 * @return string  ASCII encoded string, to use as filename in wb's page_filename() and media_filename
 */
function entities_to_7bit($str) {
	// convert to UTF-8
	$str = charset_to_utf8($str);
	// replace some specials
	$str = utf8_stripspecials($str, '_');
	// translate non-ASCII characters to ASCII
	$str = utf8_romanize($str);
	// missed some? - Many UTF-8-chars can't be romanized
	// convert to HTML-entities, and replace entities by hex-numbers
	$str = utf8_umlauts_to_entities($str, false);
	// NOTE(review): the entity literals below were reconstructed from a listing in which
	// they had been HTML-decoded; confirm them against the upstream file.
	// Turn the numeric apostrophe into a named one so it is not hex-encoded below.
	$str = str_replace('&#039;', '&apos;', $str);
	// preg_replace() with the /e modifier no longer exists in PHP 7+, so use a callback.
	$str = preg_replace_callback('/&#([0-9]+);/', '_entities_to_7bit_dechex', $str);
	// maybe there are some &gt; &lt; &apos; &quot; &amp; &nbsp; left, replace them too
	$entities = array('&gt;'=>'_','&lt;'=>'_','&apos;'=>'_','&quot;'=>'_','&amp;'=>'_','&nbsp;'=>' ');
	$str = strtr($str, $entities);

	return($str);
}
|
1052
|
|
1053
|
/*
 * Convert a string from mixed html-entities/umlauts to pure $charset_out-umlauts
 *
 * The input is first brought to UTF-8 with charset_to_utf8() (always read as
 * DEFAULT_CHARSET) and, if the result passes utf8_check(), handed on to
 * utf8_to_charset() for the target encoding.
 * Numeric and named entities are replaced by the called helpers; according to the
 * original notes the basic markup entities (&gt; &lt; &apos; &quot; &#039;) are left
 * alone - NOTE(review): that list is reconstructed, confirm against the helpers.
 * In case of error a message is emitted by the called helpers.
 * Supported charsets are:
 *   direct:               iso_8859_1 ... iso_8859_11
 *   mb_convert_encoding:  all wb charsets (except those from 'direct'); but not GB2312
 *   iconv:                all wb charsets (except those from 'direct')
 *
 * @param  string  A string in DEFAULT_CHARSET encoding
 * @param  string  The charset to convert to, defaults to DEFAULT_CHARSET
 * @return string  A string in $charset_out encoding with numeric and named entities replaced.
 *                 If the UTF-8 check fails, the result of charset_to_utf8() is returned as is.
 */
function entities_to_umlauts2($string, $charset_out=DEFAULT_CHARSET) {
	$utf8 = charset_to_utf8($string, DEFAULT_CHARSET);
	if(!utf8_check($utf8)) {
		// not valid UTF-8: skip the second conversion step
		return $utf8;
	}
	return utf8_to_charset($utf8, $charset_out);
}
|
1075
|
|
1076
|
/*
 * Convert a string from mixed html-entities/umlauts to pure ASCII with HTML-entities
 *
 * The input is brought to UTF-8 with charset_to_utf8() using $charset_in and, if the
 * result passes utf8_check(), passed through utf8_umlauts_to_entities().
 * In case of error a message is emitted by the called helpers.
 * Supported charsets are:
 *   direct:               iso_8859_1 ... iso_8859_11
 *   mb_convert_encoding:  all wb charsets (except those from 'direct'); but not GB2312
 *   iconv:                all wb charsets (except those from 'direct')
 *
 * @param  string  A string in $charset_in encoding
 * @param  string  The charset of the input, defaults to DEFAULT_CHARSET
 * @return string  A string in ASCII encoding with numeric and named entities.
 *                 If the UTF-8 check fails, the result of charset_to_utf8() is returned as is.
 */
function umlauts_to_entities2($string, $charset_in=DEFAULT_CHARSET) {
	$utf8 = charset_to_utf8($string, $charset_in);
	return utf8_check($utf8) ? utf8_umlauts_to_entities($utf8) : $utf8;
}
|
1097
|
|
1098
|
?>
|