Revision 552
Added by thorn almost 17 years ago
functions.php | ||
---|---|---|
341 | 341 |
} |
342 | 342 |
|
343 | 343 |
// Function as replacement for php's htmlspecialchars()
// Will not mangle HTML-entities: an "&" that already introduces a named or
// numeric entity (e.g. &amp; or &#39;) is kept as it is, every other "&"
// becomes &amp;. The characters < > " and ' are always escaped.
//   $string  text to escape
//   returns  the escaped text
function my_htmlspecialchars($string) {
	// Escape only those ampersands that do NOT start an entity. The negative
	// lookahead makes a temporary placeholder unnecessary, so text that happens
	// to contain such a placeholder can no longer be turned into an entity.
	$string = preg_replace('/&(?![#a-z0-9]+;)/i', '&amp;', $string);
	// "&" is already handled above, so it must not appear in this map
	$string = strtr($string, array('<'=>'&lt;', '>'=>'&gt;', '"'=>'&quot;', '\''=>'&#39;'));
	return($string);
}
350 | 351 |
|
351 |
// Function to convert a string from $from- to $to-encoding, using mysql
// The string is sent to the server as a literal and read back, with the
// connection's character_set_client set to $from and character_set_results
// set to $to. The previous connection settings are restored afterwards.
//   $string  text to convert
//   $from    mysql character set name of the input (e.g. 'gb2312')
//   $to      mysql character set name of the output (e.g. 'utf8')
//   returns  the converted text, or $string unchanged if the conversion
//            could not be carried out
// NOTE(review): mysql_error() and mysql_real_escape_string() belong to the
// legacy mysql extension -- confirm they match the connection used by $database.
function my_mysql_iconv($string, $from, $to) {
	global $database;

	// $from and $to are interpolated into SET statements below, so accept
	// plain character set identifiers only
	if(!preg_match('/^[a-z0-9_]+$/iD', $from) || !preg_match('/^[a-z0-9_]+$/iD', $to)) {
		return $string;
	}

	// keep current character set values
	$saved = array();
	foreach(array('character_set_client', 'character_set_results', 'collation_connection') as $setting) {
		$query = $database->query("SELECT @@$setting");
		if($query->numRows() > 0) {
			$res = $query->fetchRow();
			$saved[$setting] = $res["@@$setting"];
		} else { echo mysql_error()."\n<br />"; }
	}

	// set new character set values
	$database->query("SET character_set_client=$from");
	$database->query("SET character_set_results=$to");
	$database->query("SET collation_connection=utf8_unicode_ci");
	$string_escaped = mysql_real_escape_string($string);

	// convert the string; fall back to the input if the server returns no row
	$converted_string = $string;
	$query = $database->query("SELECT '$string_escaped'");
	if($query->numRows() > 0) {
		$res = $query->fetchRow();
		$converted_string = $res[0];
	} else { echo mysql_error()."\n<br />"; }

	// restore previous character set values (only those that could be read)
	foreach($saved as $setting => $value) {
		$database->query("SET $setting=$value");
	}
	return $converted_string;
}
|
387 |
|
|
388 |
// Function as wrapper for mb_convert_encoding |
|
389 |
// converts $charset_in to $charset_out or |
|
390 |
// UTF-8 to HTML-ENTITIES or HTML-ENTITIES to UTF-8 |
|
391 |
function mb_convert_encoding_wrapper($string, $charset_out, $charset_in) { |
|
392 |
if ($charset_out == $charset_in) { |
|
393 |
return $string; |
|
352 |
// init utf8-functions -- workaround to prevent functions-utf8.php and charsets_table.php (~140kB) to be loaded more than once |
|
353 |
// functions and arrays from functions-utf8.php and charsets_table.php will be in global name-space |
|
354 |
function init_utf8funcs() { |
|
355 |
static $utf8_ok=0; |
|
356 |
if($utf8_ok == 0) { |
|
357 |
++$utf8_ok; |
|
358 |
// debug XXX to be removed |
|
359 |
if($utf8_ok > 1) |
|
360 |
trigger_error("init_utf8funcs: utf8_ok > 1", E_USER_ERROR); |
|
361 |
// XXX remove end |
|
362 |
require_once(WB_PATH.'/framework/functions-utf8.php'); |
|
394 | 363 |
} |
395 |
$use_iconv = true; |
|
396 |
$use_mbstring = true; |
|
397 |
/* |
|
398 |
if(version_compare(PHP_VERSION, "5.1.0", "<")) { |
|
399 |
$use_mbstring = false; // don't rely on mb_convert_encoding if php<5.1.0 |
|
400 |
$use_iconv = false; // don't rely on iconv neither |
|
401 |
} |
|
402 |
*/ |
|
403 |
|
|
404 |
// try mb_convert_encoding(). This can handle to or from HTML-ENTITIES, too |
|
405 |
if ($use_mbstring && function_exists('mb_convert_encoding')) { |
|
406 |
// there's no GB2312 or ISO-8859-11 encoding in php's mb_* functions |
|
407 |
if ($charset_in=='ISO-8859-11' || $charset_in=='GB2312') { |
|
408 |
if ($use_iconv && function_exists('iconv')) { |
|
409 |
$string = iconv($charset_in, 'UTF-8', $string); |
|
410 |
} |
|
411 |
else { |
|
412 |
if ($charset_in == 'GB2312') { |
|
413 |
$string=my_mysql_iconv($string, 'gb2312', 'utf8'); |
|
414 |
} else { |
|
415 |
$string=my_mysql_iconv($string, 'tis620', 'utf8'); |
|
416 |
} |
|
417 |
} |
|
418 |
$charset_in='UTF-8'; |
|
419 |
if ($charset_out == 'UTF-8') { |
|
420 |
return $string; |
|
421 |
} |
|
422 |
} |
|
423 |
if ($charset_out=='ISO-8859-11' || $charset_out=='GB2312') { |
|
424 |
$string=mb_convert_encoding($string, 'UTF-8', $charset_in); |
|
425 |
if ($use_iconv && function_exists('iconv')) { |
|
426 |
$string = iconv('UTF-8', $charset_out, $string); |
|
427 |
} |
|
428 |
else { |
|
429 |
if ($charset_out == 'GB2312') { |
|
430 |
$string=my_mysql_iconv($string, 'utf8', 'gb2312'); |
|
431 |
} else { |
|
432 |
$string=my_mysql_iconv($string, 'utf8', 'tis620'); |
|
433 |
} |
|
434 |
} |
|
435 |
} else { |
|
436 |
$string = strtr($string, array("<"=>"&_lt;", ">"=>"&_gt;", "&"=>"&_amp;", """=>"&_quot;", "'"=>"&_#39;")); |
|
437 |
$string=mb_convert_encoding($string, $charset_out, $charset_in); |
|
438 |
$string = strtr($string, array("&_lt;"=>"<", "&_gt;"=>">", "&_amp;"=>"&", "&_quot;"=>""", "&_#39;"=>"'")); |
|
439 |
} |
|
440 |
return $string; |
|
441 |
} |
|
442 |
|
|
443 |
// try iconv(). This can't handle to or from HTML-ENTITIES. |
|
444 |
if ($use_iconv && function_exists('iconv') && $charset_out!='HTML-ENTITIES' && $charset_in!='HTML-ENTITIES' ) { |
|
445 |
$string = iconv($charset_in, $charset_out, $string); |
|
446 |
return $string; |
|
447 |
} |
|
448 |
|
|
449 |
// do the UTF-8->HTML-ENTITIES or HTML-ENTITIES->UTF-8 translation if mb_convert_encoding isn't available |
|
450 |
if (($charset_in=='HTML-ENTITIES' && $charset_out=='UTF-8') || ($charset_in=='UTF-8' && $charset_out=='HTML-ENTITIES')) { |
|
451 |
$string = string_decode_encode_entities($string, $charset_out, $charset_in); |
|
452 |
return $string; |
|
453 |
} |
|
454 |
|
|
455 |
// mb_convert_encoding() and iconv() aren't available, so use my_mysql_iconv() |
|
456 |
if ($charset_in == 'ISO-8859-1') { $mysqlcharset_from = 'latin1'; } |
|
457 |
elseif ($charset_in == 'ISO-8859-2') { $mysqlcharset_from = 'latin2'; } |
|
458 |
elseif ($charset_in == 'ISO-8859-3') { $mysqlcharset_from = 'latin1'; } |
|
459 |
elseif ($charset_in == 'ISO-8859-4') { $mysqlcharset_from = 'latin7'; } |
|
460 |
elseif ($charset_in == 'ISO-8859-5') { $string = convert_cyr_string ($string, "iso8859-5", "windows-1251" ); $mysqlcharset_from = 'cp1251'; } |
|
461 |
elseif ($charset_in == 'ISO-8859-6') { $mysqlcharset_from = ''; } //? |
|
462 |
elseif ($charset_in == 'ISO-8859-7') { $mysqlcharset_from = 'greek'; } |
|
463 |
elseif ($charset_in == 'ISO-8859-8') { $mysqlcharset_from = 'hebrew'; } |
|
464 |
elseif ($charset_in == 'ISO-8859-9') { $mysqlcharset_from = 'latin5'; } |
|
465 |
elseif ($charset_in == 'ISO-8859-10') { $mysqlcharset_from = 'latin1'; } |
|
466 |
elseif ($charset_in == 'BIG5') { $mysqlcharset_from = 'big5'; } |
|
467 |
elseif ($charset_in == 'ISO-2022-JP') { $mysqlcharset_from = ''; } //? |
|
468 |
elseif ($charset_in == 'ISO-2022-KR') { $mysqlcharset_from = ''; } //? |
|
469 |
elseif ($charset_in == 'GB2312') { $mysqlcharset_from = 'gb2312'; } |
|
470 |
elseif ($charset_in == 'ISO-8859-11') { $mysqlcharset_from = 'tis620'; } |
|
471 |
elseif ($charset_in == 'UTF-8') { $mysqlcharset_from = 'utf8'; } |
|
472 |
else { $mysqlcharset_from = 'latin1'; } |
|
473 |
|
|
474 |
if ($charset_out == 'ISO-8859-1') { $mysqlcharset_to = 'latin1'; } |
|
475 |
elseif ($charset_out == 'ISO-8859-2') { $mysqlcharset_to = 'latin2'; } |
|
476 |
elseif ($charset_out == 'ISO-8859-3') { $mysqlcharset_to = 'latin1'; } |
|
477 |
elseif ($charset_out == 'ISO-8859-4') { $mysqlcharset_to = 'latin7'; } |
|
478 |
elseif ($charset_out == 'ISO-8859-5') { $mysqlcharset_to = 'cp1251'; } // use convert_cyr_string afterwards |
|
479 |
elseif ($charset_out == 'ISO-8859-6') { $mysqlcharset_to = ''; } //? |
|
480 |
elseif ($charset_out == 'ISO-8859-7') { $mysqlcharset_to = 'greek'; } |
|
481 |
elseif ($charset_out == 'ISO-8859-8') { $mysqlcharset_to = 'hebrew'; } |
|
482 |
elseif ($charset_out == 'ISO-8859-9') { $mysqlcharset_to = 'latin5'; } |
|
483 |
elseif ($charset_out == 'ISO-8859-10') { $mysqlcharset_to = 'latin1'; } |
|
484 |
elseif ($charset_out == 'BIG5') { $mysqlcharset_to = 'big5'; } |
|
485 |
elseif ($charset_out == 'ISO-2022-JP') { $mysqlcharset_to = ''; } //? |
|
486 |
elseif ($charset_out == 'ISO-2022-KR') { $mysqlcharset_to = ''; } //? |
|
487 |
elseif ($charset_out == 'GB2312') { $mysqlcharset_to = 'gb2312'; } |
|
488 |
elseif ($charset_out == 'ISO-8859-11') { $mysqlcharset_to = 'tis620'; } |
|
489 |
elseif ($charset_out == 'UTF-8') { $mysqlcharset_to = 'utf8'; } |
|
490 |
else { $mysqlcharset_to = 'latin1'; } |
|
491 |
|
|
492 |
if ($mysqlcharset_from!="" && $mysqlcharset_to!="" && $mysqlcharset_from!=$mysqlcharset_to) { |
|
493 |
$string=my_mysql_iconv($string, $mysqlcharset_from, $mysqlcharset_to); |
|
494 |
if ($mysqlcharset_to == 'cp1251') { |
|
495 |
$string = convert_cyr_string ($string, "windows-1251", "iso-8859-5" ); |
|
496 |
} |
|
497 |
return($string); |
|
498 |
} |
|
499 |
|
|
500 |
// $string is unchanged. This will happen if we have to deal with ISO-8859-6 or ISO-2022-JP or -KR |
|
501 |
// and mbstring _and_ iconv aren't available. |
|
502 |
return $string; |
|
503 | 364 |
} |
504 | 365 |
|
505 |
// Decodes or encodes html-entities. Works for utf-8 only! |
|
506 |
function string_decode_encode_entities($string, $out='HTML-ENTITIES', $in='UTF-8') { |
|
507 |
if(!(($in=='UTF-8' || $in=='HTML-ENTITIES') && ($out=='UTF-8' || $out=='HTML-ENTITIES'))) { |
|
508 |
return $string; |
|
509 |
} |
|
510 |
$named_to_numbered_entities=array( |
|
511 |
'Á'=>'Á','á'=>'á', |
|
512 |
'Â'=>'Â','â'=>'â','´'=>'´','Æ'=>'Æ','æ'=>'æ', |
|
513 |
'À'=>'À','à'=>'à','ℵ'=>'ℵ','Α'=>'Α','α'=>'α', |
|
514 |
'∧'=>'∧','∠'=>'∠','''=>''','Å'=>'Å','å'=>'å', |
|
515 |
'≈'=>'≈','Ã'=>'Ã','ã'=>'ã','Ä'=>'Ä','ä'=>'ä', |
|
516 |
'„'=>'„','Β'=>'Β','β'=>'β','¦'=>'¦','•'=>'•', |
|
517 |
'∩'=>'∩','Ç'=>'Ç','ç'=>'ç','¸'=>'¸','¢'=>'¢', |
|
518 |
'Χ'=>'Χ','χ'=>'χ','ˆ'=>'ˆ','♣'=>'♣','≅'=>'≅', |
|
519 |
'©'=>'©','↵'=>'↵','∪'=>'∪','¤'=>'¤','‡'=>'‡', |
|
520 |
'†'=>'†','⇓'=>'⇓','↓'=>'↓','°'=>'°','Δ'=>'Δ', |
|
521 |
'δ'=>'δ','♦'=>'&v#9830;','÷'=>'÷','É'=>'É','é'=>'é', |
|
522 |
'Ê'=>'Ê','ê'=>'ê','È'=>'È','è'=>'è','∅'=>'∅', |
|
523 |
' '=>' ',' '=>' ','Ε'=>'Ε','ε'=>'ε','≡'=>'≡', |
|
524 |
'Η'=>'Η','η'=>'η','Ð'=>'Ð','ð'=>'ð','Ë'=>'Ë','ë'=>'ë', |
|
525 |
'€'=>'€','∃'=>'∃','ƒ'=>'ƒ','∀'=>'∀','½'=>'½', |
|
526 |
'¼'=>'¼','¾'=>'¾','⁄'=>'⁄','Γ'=>'Γ','γ'=>'γ', |
|
527 |
'≥'=>'≥','⇔'=>'⇔','↔'=>'↔','♥'=>'♥', |
|
528 |
'…'=>'…','Í'=>'Í','í'=>'í','Î'=>'Î','î'=>'î', |
|
529 |
'¡'=>'¡','Ì'=>'Ì','ì'=>'ì','ℑ'=>'ℑ','∞'=>'∞', |
|
530 |
'∫'=>'∫','Ι'=>'Ι','ι'=>'ι','¿'=>'¿','∈'=>'∈', |
|
531 |
'Ï'=>'Ï','ï'=>'ï','Κ'=>'Κ','κ'=>'κ','Λ'=>'Λ', |
|
532 |
'λ'=>'λ','⟨'=>'〈','«'=>'«','⇐'=>'⇐','←'=>'←', |
|
533 |
'⌈'=>'⌈','“'=>'“','≤'=>'≤','⌊'=>'⌊','∗'=>'∗', |
|
534 |
'◊'=>'◊','‎'=>'‎','‹'=>'‹','‘'=>'‘', |
|
535 |
'¯'=>'¯','—'=>'—','µ'=>'µ','·'=>'·','−'=>'−', |
|
536 |
'Μ'=>'Μ','μ'=>'μ','∇'=>'∇',' '=>' ','–'=>'–', |
|
537 |
'≠'=>'≠','∋'=>'∋','¬'=>'¬','∉'=>'∉','⊄'=>'⊄', |
|
538 |
'Ñ'=>'Ñ','ñ'=>'ñ','Ν'=>'Ν','ν'=>'ν','Ó'=>'Ó', |
|
539 |
'ó'=>'ó','Ô'=>'Ô','ô'=>'ô','Œ'=>'Œ','œ'=>'œ', |
|
540 |
'Ò'=>'Ò','ò'=>'ò','‾'=>'‾','Ω'=>'Ω','ω'=>'ω', |
|
541 |
'Ο'=>'Ο','ο'=>'ο','⊕'=>'⊕','∨'=>'∨','ª'=>'ª', |
|
542 |
'º'=>'º','Ø'=>'Ø','ø'=>'ø','Õ'=>'Õ','õ'=>'õ', |
|
543 |
'⊗'=>'⊗','Ö'=>'Ö','ö'=>'ö','¶'=>'¶','∂'=>'∂', |
|
544 |
'‰'=>'‰','⊥'=>'⊥','Φ'=>'Φ','φ'=>'φ','Π'=>'Π', |
|
545 |
'π'=>'π','ϖ'=>'ϖ','±'=>'±','£'=>'£','″'=>'″', |
|
546 |
'′'=>'′','∏'=>'∏','∝'=>'∝','Ψ'=>'Ψ','ψ'=>'ψ', |
|
547 |
'"'=>'"','√'=>'√','⟩'=>'〉','»'=>'»','⇒'=>'⇒', |
|
548 |
'→'=>'→','⌉'=>'⌉','”'=>'”','ℜ'=>'ℜ','®'=>'®', |
|
549 |
'⌋'=>'⌋','Ρ'=>'Ρ','ρ'=>'ρ','‏'=>'‏','›'=>'›', |
|
550 |
'’'=>'’','‚'=>'‚','Š'=>'Š','š'=>'š','⋅'=>'⋅', |
|
551 |
'§'=>'§','­'=>'­','Σ'=>'Σ','σ'=>'σ','ς'=>'ς', |
|
552 |
'∼'=>'∼','♠'=>'♠','⊂'=>'⊂','⊆'=>'⊆','∑'=>'∑', |
|
553 |
'⊃'=>'⊃','¹'=>'¹','²'=>'²','³'=>'³','⊇'=>'⊇', |
|
554 |
'ß'=>'ß','Τ'=>'Τ','τ'=>'τ','∴'=>'∴','Θ'=>'Θ', |
|
555 |
'θ'=>'θ','ϑ'=>'ϑ',' '=>' ','Þ'=>'Þ','þ'=>'þ', |
|
556 |
'˜'=>'˜','×'=>'×','™'=>'™','Ú'=>'Ú','ú'=>'ú', |
|
557 |
'⇑'=>'⇑','↑'=>'↑','Û'=>'Û','û'=>'û','Ù'=>'Ù', |
|
558 |
'ù'=>'ù','¨'=>'¨','ϒ'=>'ϒ','Υ'=>'Υ','υ'=>'υ', |
|
559 |
'Ü'=>'Ü','ü'=>'ü','℘'=>'℘','Ξ'=>'Ξ','ξ'=>'ξ', |
|
560 |
'Ý'=>'Ý','ý'=>'ý','¥'=>'¥','Ÿ'=>'Ÿ','ÿ'=>'ÿ', |
|
561 |
'Ζ'=>'Ζ','ζ'=>'ζ','‍'=>'‍','‌'=>'‌' |
|
562 |
); |
|
563 |
$numbered_to_named_entities=array( |
|
564 |
'Á'=>'Á','á'=>'á','Â'=>'Â','â'=>'â','´'=>'´', |
|
565 |
'Æ'=>'Æ','æ'=>'æ','À'=>'À','à'=>'à','ℵ'=>'ℵ', |
|
566 |
'Α'=>'Α','α'=>'α','∧'=>'∧','∠'=>'∠', |
|
567 |
'''=>''','Å'=>'Å','å'=>'å','≈'=>'≈','Ã'=>'Ã', |
|
568 |
'ã'=>'ã','Ä'=>'Ä','ä'=>'ä','„'=>'„','Β'=>'Β', |
|
569 |
'β'=>'β','¦'=>'¦','•'=>'•','∩'=>'∩','Ç'=>'Ç', |
|
570 |
'ç'=>'ç','¸'=>'¸','¢'=>'¢','Χ'=>'Χ','χ'=>'χ', |
|
571 |
'ˆ'=>'ˆ','♣'=>'♣','≅'=>'≅','©'=>'©','↵'=>'↵', |
|
572 |
'∪'=>'∪','¤'=>'¤','‡'=>'‡','†'=>'†','⇓'=>'⇓', |
|
573 |
'↓'=>'↓','°'=>'°','Δ'=>'Δ','δ'=>'δ','&v#9830;'=>'♦', |
|
574 |
'÷'=>'÷','É'=>'É','é'=>'é','Ê'=>'Ê','ê'=>'ê', |
|
575 |
'È'=>'È','è'=>'è','∅'=>'∅',' '=>' ',' '=>' ', |
|
576 |
'Ε'=>'Ε','ε'=>'ε','≡'=>'≡','Η'=>'Η','η'=>'η', |
|
577 |
'Ð'=>'Ð','ð'=>'ð','Ë'=>'Ë','ë'=>'ë','€'=>'€', |
|
578 |
'∃'=>'∃','ƒ'=>'ƒ','∀'=>'∀','½'=>'½','¼'=>'¼', |
|
579 |
'¾'=>'¾','⁄'=>'⁄','Γ'=>'Γ','γ'=>'γ','≥'=>'≥', |
|
580 |
'⇔'=>'⇔','↔'=>'↔','♥'=>'♥','…'=>'…', |
|
581 |
'Í'=>'Í','í'=>'í','Î'=>'Î','î'=>'î','¡'=>'¡', |
|
582 |
'Ì'=>'Ì','ì'=>'ì','ℑ'=>'ℑ','∞'=>'∞','∫'=>'∫', |
|
583 |
'Ι'=>'Ι','ι'=>'ι','¿'=>'¿','∈'=>'∈','Ï'=>'Ï', |
|
584 |
'ï'=>'ï','Κ'=>'Κ','κ'=>'κ','Λ'=>'Λ','λ'=>'λ', |
|
585 |
'〈'=>'⟨','«'=>'«','⇐'=>'⇐','←'=>'←','⌈'=>'⌈', |
|
586 |
'“'=>'“','≤'=>'≤','⌊'=>'⌊','∗'=>'∗','◊'=>'◊', |
|
587 |
'‎'=>'‎','‹'=>'‹','‘'=>'‘','¯'=>'¯', |
|
588 |
'—'=>'—','µ'=>'µ','·'=>'·','−'=>'−','Μ'=>'Μ', |
|
589 |
'μ'=>'μ','∇'=>'∇',' '=>' ','–'=>'–','≠'=>'≠', |
|
590 |
'∋'=>'∋','¬'=>'¬','∉'=>'∉','⊄'=>'⊄','Ñ'=>'Ñ', |
|
591 |
'ñ'=>'ñ','Ν'=>'Ν','ν'=>'ν','Ó'=>'Ó','ó'=>'ó', |
|
592 |
'Ô'=>'Ô','ô'=>'ô','Œ'=>'Œ','œ'=>'œ','Ò'=>'Ò', |
|
593 |
'ò'=>'ò','‾'=>'‾','Ω'=>'Ω','ω'=>'ω','Ο'=>'Ο', |
|
594 |
'ο'=>'ο','⊕'=>'⊕','∨'=>'∨','ª'=>'ª','º'=>'º', |
|
595 |
'Ø'=>'Ø','ø'=>'ø','Õ'=>'Õ','õ'=>'õ','⊗'=>'⊗', |
|
596 |
'Ö'=>'Ö','ö'=>'ö','¶'=>'¶','∂'=>'∂','‰'=>'‰', |
|
597 |
'⊥'=>'⊥','Φ'=>'Φ','φ'=>'φ','Π'=>'Π','π'=>'π','ϖ'=>'ϖ', |
|
598 |
'±'=>'±','£'=>'£','″'=>'″','′'=>'′','∏'=>'∏', |
|
599 |
'∝'=>'∝','Ψ'=>'Ψ','ψ'=>'ψ','"'=>'"','√'=>'√', |
|
600 |
'〉'=>'⟩','»'=>'»','⇒'=>'⇒','→'=>'→','⌉'=>'⌉', |
|
601 |
'”'=>'”','ℜ'=>'ℜ','®'=>'®','⌋'=>'⌋','Ρ'=>'Ρ', |
|
602 |
'ρ'=>'ρ','‏'=>'‏','›'=>'›','’'=>'’','‚'=>'‚', |
|
603 |
'Š'=>'Š','š'=>'š','⋅'=>'⋅','§'=>'§','­'=>'­', |
|
604 |
'Σ'=>'Σ','σ'=>'σ','ς'=>'ς','∼'=>'∼','♠'=>'♠', |
|
605 |
'⊂'=>'⊂','⊆'=>'⊆','∑'=>'∑','⊃'=>'⊃','¹'=>'¹', |
|
606 |
'²'=>'²','³'=>'³','⊇'=>'⊇','ß'=>'ß','Τ'=>'Τ', |
|
607 |
'τ'=>'τ','∴'=>'∴','Θ'=>'Θ','θ'=>'θ','ϑ'=>'ϑ', |
|
608 |
' '=>' ','Þ'=>'Þ','þ'=>'þ','˜'=>'˜','×'=>'×', |
|
609 |
'™'=>'™','Ú'=>'Ú','ú'=>'ú','⇑'=>'⇑','↑'=>'↑', |
|
610 |
'Û'=>'Û','û'=>'û','Ù'=>'Ù','ù'=>'ù','¨'=>'¨', |
|
611 |
'ϒ'=>'ϒ','Υ'=>'Υ','υ'=>'υ','Ü'=>'Ü','ü'=>'ü', |
|
612 |
'℘'=>'℘','Ξ'=>'Ξ','ξ'=>'ξ','Ý'=>'Ý','ý'=>'ý', |
|
613 |
'¥'=>'¥','Ÿ'=>'Ÿ','ÿ'=>'ÿ','Ζ'=>'Ζ','ζ'=>'ζ','‍'=>'‍', |
|
614 |
'‌'=>'‌' |
|
615 |
); |
|
616 |
|
|
617 |
if ($in == 'HTML-ENTITIES') { |
|
618 |
$string = strtr($string, $named_to_numbered_entities); |
|
619 |
$string = preg_replace("/&#([0-9]+);/e", "code_to_utf8($1)", $string); |
|
620 |
} |
|
621 |
elseif ($out == 'HTML-ENTITIES') { |
|
622 |
$char = ""; |
|
623 |
$i=0; |
|
624 |
$len=strlen($string); |
|
625 |
if($len==0) return $string; |
|
626 |
do { |
|
627 |
if(ord($string{$i}) <= 127) $ud = $string{$i++}; |
|
628 |
elseif(ord($string{$i}) <= 223) $ud = (ord($string{$i++})-192)*64 + (ord($string{$i++})-128); |
|
629 |
elseif(ord($string{$i}) <= 239) $ud = (ord($string{$i++})-224)*4096 + (ord($string{$i++})-128)*64 + (ord($string{$i++})-128); |
|
630 |
elseif(ord($string{$i}) <= 247) $ud = (ord($string{$i++})-240)*262144 + (ord($string{$i++})-128)*4096 + (ord($string{$i++})-128)*64 + (ord($string{$i++})-128); |
|
631 |
elseif(ord($string{$i}) <= 251) $ud = ord($string{$i++}); // error! |
|
632 |
if($ud > 127) { |
|
633 |
$char .= "&#$ud;"; |
|
634 |
} else { |
|
635 |
$char .= $ud; |
|
636 |
} |
|
637 |
} while($i < $len); |
|
638 |
$string = $char; |
|
639 |
$string = strtr($string, $numbered_to_named_entities); |
|
640 |
// do ' and " |
|
641 |
$string = strtr($string, array('\''=>''', '\"'=>'"')); |
|
642 |
} |
|
643 |
return $string; |
|
366 |
// Convert a string from mixed html-entities/umlauts to pure $charset_out-umlauts |
|
367 |
// Will replace all numeric and named entities except &gt; &lt; &apos; &quot; &#39; |
|
368 |
// In case of error the returned string is unchanged, and a message is emitted. |
|
369 |
function entities_to_umlauts($string, $charset_out=DEFAULT_CHARSET) { |
|
370 |
//init utf8-functions -- workaround to prevent functions-utf8.php and charsets_table.php (~140kB) to be loaded more than once |
|
371 |
init_utf8funcs(); |
|
372 |
return entities_to_umlauts2($string, $charset_out); |
|
644 | 373 |
} |
645 | 374 |
|
646 |
// support-function for string_decode_encode_entities()
// Turns the numeric code point $num into its UTF-8 byte sequence (one to four
// bytes). Values above 0x1FFFFF cannot be encoded and give a single space.
function code_to_utf8($num) {
	// one byte: plain 7bit character
	if ($num <= 0x7F) {
		return chr($num);
	}
	// lowest six bits, always stored in the last (continuation) byte
	$tail = chr(0x80 | ($num & 0x3F));
	// two bytes: 110xxxxx 10xxxxxx
	if ($num <= 0x7FF) {
		return chr(0xC0 | ($num >> 6)) . $tail;
	}
	$mid = chr(0x80 | (($num >> 6) & 0x3F));
	// three bytes: 1110xxxx 10xxxxxx 10xxxxxx
	if ($num <= 0xFFFF) {
		return chr(0xE0 | ($num >> 12)) . $mid . $tail;
	}
	// four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	if ($num <= 0x1FFFFF) {
		return chr(0xF0 | ($num >> 18)) . chr(0x80 | (($num >> 12) & 0x3F)) . $mid . $tail;
	}
	return " ";
}
|
659 |
|
|
660 |
// Function to convert a string from mixed html-entities/umlauts to pure utf-8-umlauts
// $charset names the encoding used to convert $string when it does not
// already validate as UTF-8; an empty value is treated as ISO-8859-1.
function string_to_utf8($string, $charset=DEFAULT_CHARSET) {
	$charset = strtoupper($charset);
	$source_charset = ($charset == '') ? 'ISO-8859-1' : $charset;

	// re-encode only input that is not valid UTF-8 yet
	$utf8 = is_UTF8($string)
		? $string
		: mb_convert_encoding_wrapper($string, 'UTF-8', $source_charset);

	// check if we really got UTF-8. We don't get UTF-8 if charset is ISO-8859-6
	// or ISO-2022-JP/KR and mb_string AND iconv aren't available.
	// In that case there is nothing more we can do here.
	if (!is_UTF8($utf8)) {
		return($utf8);
	}

	// round trip UTF-8 -> HTML-ENTITIES -> UTF-8
	$as_entities = mb_convert_encoding_wrapper($utf8, 'HTML-ENTITIES', 'UTF-8');
	return(mb_convert_encoding_wrapper($as_entities, 'UTF-8', 'HTML-ENTITIES'));
}
|
678 |
|
|
679 |
// function to check if a string is UTF-8
// Accepted are tab, LF, CR, the printable 7bit characters \x20-\x7E and
// well-formed 2-, 3- and 4-byte sequences (no overlongs, no surrogates,
// nothing above U+10FFFF).
//   $str     string to check
//   returns  for strings shorter than 4000 bytes the result of preg_match()
//            (1 or 0, false on a regex error), otherwise true or false.
//            Use the result in boolean context.
function is_UTF8 ($str) {
	// exactly one valid character
	$char = '[\x09\x0A\x0D\x20-\x7E]|[\xC2-\xDF][\x80-\xBF]|\xE0[\xA0-\xBF][\x80-\xBF]|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}|\xED[\x80-\x9F][\x80-\xBF]|\xF0[\x90-\xBF][\x80-\xBF]{2}|[\xF1-\xF3][\x80-\xBF]{3}|\xF4[\x80-\x8F][\x80-\xBF]{2}';
	$len = strlen($str);
	if ($len < 4000) {
		// see http://bugs.php.net/bug.php?id=24460 and http://bugs.php.net/bug.php?id=27070 and http://ilia.ws/archives/5-Top-10-ways-to-crash-PHP.html for this.
		// 4000 works for me ...
		return preg_match('/^(?:'.$char.')*$/s', $str);
	}
	// Long strings are checked in chunks of at most 1000 characters, which keeps
	// the repetition count well below the limit mentioned above. \G anchors
	// each chunk at the current offset, so no byte is skipped and the string is
	// never copied. A "0" character does not end the scan early.
	$pos = 0;
	while ($pos < $len) {
		if (!preg_match('/\G(?:'.$char.'){1,1000}/', $str, $chunk, 0, $pos)) {
			return false;
		}
		$pos += strlen($chunk[0]);
	}
	return true;
}
|
703 |
|
|
704 |
// Function to convert a string from mixed html-entities/umlauts to pure $charset_out-umlauts
// The input is taken to be in DEFAULT_CHARSET. It is first brought to utf-8
// (decoding html-entities on the way) and then converted to $charset_out.
// An empty $charset_out is treated as ISO-8859-1.
function entities_to_umlauts($string, $charset_out=DEFAULT_CHARSET) {
	$charset_out = strtoupper($charset_out);
	if ($charset_out == '') { $charset_out = 'ISO-8859-1'; }
	$charset_in = strtoupper(DEFAULT_CHARSET);
	require_once(WB_PATH.'/framework/charsets_table.php');
	global $iso_8859_2_to_utf8, $iso_8859_3_to_utf8, $iso_8859_4_to_utf8, $iso_8859_5_to_utf8, $iso_8859_6_to_utf8, $iso_8859_7_to_utf8, $iso_8859_8_to_utf8, $iso_8859_9_to_utf8, $iso_8859_10_to_utf8, $iso_8859_11_to_utf8;
	global $utf8_to_iso_8859_2, $utf8_to_iso_8859_3, $utf8_to_iso_8859_4, $utf8_to_iso_8859_5, $utf8_to_iso_8859_6, $utf8_to_iso_8859_7, $utf8_to_iso_8859_8, $utf8_to_iso_8859_9, $utf8_to_iso_8859_10, $utf8_to_iso_8859_11;

	// translation tables from charsets_table.php, keyed by charset name
	$tables_in = array(
		'ISO-8859-2'  => $iso_8859_2_to_utf8,  'ISO-8859-3'  => $iso_8859_3_to_utf8,
		'ISO-8859-4'  => $iso_8859_4_to_utf8,  'ISO-8859-5'  => $iso_8859_5_to_utf8,
		'ISO-8859-6'  => $iso_8859_6_to_utf8,  'ISO-8859-7'  => $iso_8859_7_to_utf8,
		'ISO-8859-8'  => $iso_8859_8_to_utf8,  'ISO-8859-9'  => $iso_8859_9_to_utf8,
		'ISO-8859-10' => $iso_8859_10_to_utf8, 'ISO-8859-11' => $iso_8859_11_to_utf8
	);
	$tables_out = array(
		'ISO-8859-2'  => $utf8_to_iso_8859_2,  'ISO-8859-3'  => $utf8_to_iso_8859_3,
		'ISO-8859-4'  => $utf8_to_iso_8859_4,  'ISO-8859-5'  => $utf8_to_iso_8859_5,
		'ISO-8859-6'  => $utf8_to_iso_8859_6,  'ISO-8859-7'  => $utf8_to_iso_8859_7,
		'ISO-8859-8'  => $utf8_to_iso_8859_8,  'ISO-8859-9'  => $utf8_to_iso_8859_9,
		'ISO-8859-10' => $utf8_to_iso_8859_10, 'ISO-8859-11' => $utf8_to_iso_8859_11
	);

	// string to utf-8, entities_to_utf8
	$handled_here = ($charset_in == 'UTF-8') || (strpos($charset_in, 'ISO-8859') === 0);
	if (!$handled_here) {
		$string = string_to_utf8($string); // will decode html-entities, too.
	} else {
		if ($charset_in == 'ISO-8859-1') {
			$string = utf8_encode($string);
		} elseif (array_key_exists($charset_in, $tables_in)) {
			$string = strtr($string, $tables_in[$charset_in]);
		}
		// decode html-entities, but only if the string seems to contain any
		if (preg_match("/&[#a-zA-Z0-9]+;/", $string)) {
			$string = string_decode_encode_entities($string, 'UTF-8', 'HTML-ENTITIES');
		}
	}

	// string to $charset_out
	if ($charset_out == 'ISO-8859-1') {
		return utf8_decode($string);
	}
	if (array_key_exists($charset_out, $tables_out)) {
		return strtr($string, $tables_out[$charset_out]);
	}
	// any other target but utf-8 goes through the wrapper, provided we really have utf-8
	if ($charset_out != 'UTF-8' && is_UTF8($string)) {
		return mb_convert_encoding_wrapper($string, $charset_out, 'UTF-8');
	}
	return $string;
}
|
778 |
|
|
779 |
// Function to convert a string from mixed html-entities/$charset_in-umlauts to pure html-entities |
|
375 |
// Will convert a string in $charset_in encoding to a pure ASCII string with HTML-entities. |
|
376 |
// In case of error the returned string is unchanged, and a message is emitted. |
|
780 | 377 |
function umlauts_to_entities($string, $charset_in=DEFAULT_CHARSET) { |
781 |
$charset_in = strtoupper($charset_in); |
|
782 |
if ($charset_in == "") { $charset_in = 'ISO-8859-1'; } |
|
783 |
require_once(WB_PATH.'/framework/charsets_table.php'); |
|
784 |
global $iso_8859_2_to_utf8, $iso_8859_3_to_utf8, $iso_8859_4_to_utf8, $iso_8859_5_to_utf8, $iso_8859_6_to_utf8, $iso_8859_7_to_utf8, $iso_8859_8_to_utf8, $iso_8859_9_to_utf8, $iso_8859_10_to_utf8, $iso_8859_11_to_utf8; |
|
785 |
|
|
786 |
// string to utf-8, umlauts_to_entities |
|
787 |
if ($charset_in == 'UTF-8' || substr($charset_in,0,8) == 'ISO-8859') { |
|
788 |
if ($charset_in == 'ISO-8859-1') { |
|
789 |
$string=utf8_encode($string); |
|
790 |
} elseif ($charset_in == 'ISO-8859-2') { |
|
791 |
$string = strtr($string, $iso_8859_2_to_utf8); |
|
792 |
} elseif ($charset_in == 'ISO-8859-3') { |
|
793 |
$string = strtr($string, $iso_8859_3_to_utf8); |
|
794 |
} elseif ($charset_in == 'ISO-8859-4') { |
|
795 |
$string = strtr($string, $iso_8859_4_to_utf8); |
|
796 |
} elseif ($charset_in == 'ISO-8859-5') { |
|
797 |
$string = strtr($string, $iso_8859_5_to_utf8); |
|
798 |
} elseif ($charset_in == 'ISO-8859-6') { |
|
799 |
$string = strtr($string, $iso_8859_6_to_utf8); |
|
800 |
} elseif ($charset_in == 'ISO-8859-7') { |
|
801 |
$string = strtr($string, $iso_8859_7_to_utf8); |
|
802 |
} elseif ($charset_in == 'ISO-8859-8') { |
|
803 |
$string = strtr($string, $iso_8859_8_to_utf8); |
|
804 |
} elseif ($charset_in == 'ISO-8859-9') { |
|
805 |
$string = strtr($string, $iso_8859_9_to_utf8); |
|
806 |
} elseif ($charset_in == 'ISO-8859-10') { |
|
807 |
$string = strtr($string, $iso_8859_10_to_utf8); |
|
808 |
} elseif ($charset_in == 'ISO-8859-11') { |
|
809 |
$string = strtr($string, $iso_8859_11_to_utf8); |
|
810 |
} |
|
811 |
// encode html-entities |
|
812 |
$string=string_decode_encode_entities($string, 'HTML-ENTITIES', 'UTF-8'); |
|
813 |
//$string=mb_convert_encoding_wrapper($string, 'HTML-ENTITIES', 'UTF-8'); |
|
814 |
} |
|
815 |
else { |
|
816 |
$string = string_to_utf8($string, $charset_in); |
|
817 |
// encode html-entities |
|
818 |
if (is_UTF8($string)) { |
|
819 |
$string=string_decode_encode_entities($string, 'HTML-ENTITIES', 'UTF-8'); |
|
820 |
//$string=mb_convert_encoding_wrapper($string, 'HTML-ENTITIES', 'UTF-8'); |
|
821 |
} |
|
822 |
} |
|
823 |
return $string; |
|
378 |
//init utf8-functions -- workaround to prevent functions-utf8.php and charsets_table.php (~140kB) to be loaded more than once |
|
379 |
init_utf8funcs(); |
|
380 |
return umlauts_to_entities2($string, $charset_in); |
|
824 | 381 |
} |
825 | 382 |
|
826 |
// Converts $string from $charset to DEFAULT_CHARSET (ISO-8859-1 if that
// constant is empty). utf-8 input going to one of ISO-8859-1 ... ISO-8859-11
// is translated directly (utf8_decode() or the tables from charsets_table.php),
// everything else is passed to mb_convert_encoding_wrapper().
function umlauts_to_defcharset($string, $charset) {
	$charset_out = strtoupper(DEFAULT_CHARSET);
	if ($charset_out == "") { $charset_out = 'ISO-8859-1'; }
	require_once(WB_PATH.'/framework/charsets_table.php');
	global $utf8_to_iso_8859_2, $utf8_to_iso_8859_3, $utf8_to_iso_8859_4, $utf8_to_iso_8859_5, $utf8_to_iso_8859_6, $utf8_to_iso_8859_7, $utf8_to_iso_8859_8, $utf8_to_iso_8859_9, $utf8_to_iso_8859_10, $utf8_to_iso_8859_11;

	// nothing to do if the string already is in the default charset
	if ($charset_out == $charset) {
		return $string;
	}

	if ($charset == 'UTF-8') {
		if ($charset_out == 'ISO-8859-1') {
			return utf8_decode($string);
		}
		// translation tables keyed by target charset
		$tables_out = array(
			'ISO-8859-2'  => $utf8_to_iso_8859_2,  'ISO-8859-3'  => $utf8_to_iso_8859_3,
			'ISO-8859-4'  => $utf8_to_iso_8859_4,  'ISO-8859-5'  => $utf8_to_iso_8859_5,
			'ISO-8859-6'  => $utf8_to_iso_8859_6,  'ISO-8859-7'  => $utf8_to_iso_8859_7,
			'ISO-8859-8'  => $utf8_to_iso_8859_8,  'ISO-8859-9'  => $utf8_to_iso_8859_9,
			'ISO-8859-10' => $utf8_to_iso_8859_10, 'ISO-8859-11' => $utf8_to_iso_8859_11
		);
		if (array_key_exists($charset_out, $tables_out)) {
			return strtr($string, $tables_out[$charset_out]);
		}
	}

	// every remaining combination is left to the wrapper
	return mb_convert_encoding_wrapper($string, $charset_out, $charset);
}
|
870 |
|
|
871 |
// translate any latin/greek/cyrillic html-entities to their plain 7bit equivalents
// and numbered-entities into hex
//   $string  text containing html-entities
//   returns  the text with every key of $conversion_array replaced by its
//            value and every numeric entity &#NNN; replaced by the lower-case
//            hexadecimal form of NNN
function entities_to_7bit($string) {
	// convert.php is expected to set $conversion_array in the local scope
	require(WB_PATH.'/framework/convert.php');
	$string = strtr($string, $conversion_array);
	// The /e (eval) pattern modifier used here before was removed in PHP 7;
	// a callback does the same without evaluating code built from the input.
	$string = preg_replace_callback('/&#([0-9]+);/', function ($match) {
		return dechex((int) $match[1]);
	}, $string);
	return($string);
}
|
879 |
|
|
880 | 383 |
// Function to convert a page title to a page filename |
881 | 384 |
function page_filename($string) { |
882 |
$string = entities_to_7bit(umlauts_to_entities($string)); |
|
385 |
//init utf8-functions -- workaround to prevent functions-utf8.php and charsets_table.php (~140kB) to be loaded more than once |
|
386 |
init_utf8funcs(); |
|
387 |
$string = entities_to_7bit($string); |
|
883 | 388 |
// Now replace spaces with page spacer |
884 | 389 |
$string = trim($string); |
885 | 390 |
$string = preg_replace('/(\s)+/', PAGE_SPACER, $string); |
... | ... | |
903 | 408 |
|
904 | 409 |
// Function to convert a desired media filename to a clean filename |
905 | 410 |
function media_filename($string) { |
906 |
$string = entities_to_7bit(umlauts_to_entities($string)); |
|
411 |
//init utf8-functions -- workaround to prevent functions-utf8.php and charsets_table.php (~140kB) to be loaded more than once |
|
412 |
init_utf8funcs(); |
|
413 |
$string = entities_to_7bit($string); |
|
907 | 414 |
// Now remove all bad characters |
908 | 415 |
$bad = array( |
909 | 416 |
'\'', // ' |
Also available in: Unified diff
added new module-based search-function and publish-by-date code