/trunk/wb/framework/functions-utf8.php - Annotate - WB 2.08.x - Tracking

552

thorn

<?php

2

3

// $Id: $

4

5

/*

6

7

 Website Baker Project <http://www.websitebaker.org/>

8

 Copyright (C) 2004-2008, Ryan Djurovich

9

10

 Website Baker is free software; you can redistribute it and/or modify

11

 it under the terms of the GNU General Public License as published by

12

 the Free Software Foundation; either version 2 of the License, or

13

 (at your option) any later version.

14

15

 Website Baker is distributed in the hope that it will be useful,

16

 but WITHOUT ANY WARRANTY; without even the implied warranty of

17

 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

18

 GNU General Public License for more details.

19

20

 You should have received a copy of the GNU General Public License

21

 along with Website Baker; if not, write to the Free Software

22

 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

23

24

*/

25

26

/*

27

 * A large part of this file is based on 'utf8.php' from the DokuWiki-project.

28

 * (http://www.splitbrain.org/projects/dokuwiki):

29

**

30

 * UTF8 helper functions

31

 * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)

32

 * @author     Andreas Gohr <andi@splitbrain.org>

33

**

34

 * modified for use with Website Baker

35

 * from thorn, Jan. 2008

36

*/

37

38

// Functions we use in Website Baker:

39

//   entities_to_7bit()

40

//   entities_to_umlauts2()

41

//   umlauts_to_entities2()

42

43

// Refuse direct access: WB_URL is only defined when this file is included
// from within Website Baker, so redirect anything else to the index page.
if(!defined('WB_URL')) {
	header('Location: ../index.php');
	exit(0);
}

47

48

/*
 * check for mb_string support
 *
 * UTF8_MBSTRING is 1 when the mbstring extension is usable and has not been
 * disabled by defining UTF8_NOMBSTRING, 0 otherwise. A value defined by the
 * including script beforehand is respected.
 */
if(!defined('UTF8_MBSTRING')){
  if(function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')){
    define('UTF8_MBSTRING',1);
  }else{
    define('UTF8_MBSTRING',0);
  }
}

if(UTF8_MBSTRING){ mb_internal_encoding('UTF-8'); }

60

61

require_once(WB_PATH.'/framework/charsets_table.php');

62

63

/**
 * Checks if a string contains 7bit ASCII only
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str string to inspect (examined byte by byte)
 * @return boolean true if no byte is greater than 127 (also for an empty string)
 */
function utf8_isASCII($str){
  $len = strlen($str);
  for($i=0; $i<$len; $i++){
    // square-bracket offsets: the curly-brace form was removed in PHP 8
    if(ord($str[$i]) >127) return false;
  }
  return true;
}

74

75

/**
 * Strips all highbyte chars
 *
 * Returns a pure ASCII7 string
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str input string (examined byte by byte)
 * @return string the input with every byte >= 128 removed
 */
function utf8_strip($str){
  $ascii = '';
  $len = strlen($str);
  for($i=0; $i<$len; $i++){
    // square-bracket offsets: the curly-brace form was removed in PHP 8
    if(ord($str[$i]) <128){
      $ascii .= $str[$i];
    }
  }
  return $ascii;
}

91

92

/**
 * Tries to detect if a string is in Unicode encoding
 *
 * Only the byte structure is checked (lead byte followed by the right number
 * of 10bbbbbb continuation bytes); overlong forms and 5/6 byte sequences are
 * accepted by this test.
 *
 * @author <bmorel@ssi.fr>
 * @link   http://www.php.net/manual/en/function.utf8-encode.php
 * @param  string $Str string to inspect
 * @return boolean true if every byte fits the UTF-8 lead/continuation scheme
 */
function utf8_check($Str) {
 $len = strlen($Str);
 for ($i=0; $i<$len; $i++) {
  $b = ord($Str[$i]);
  if ($b < 0x80) continue; # 0bbbbbbb
  elseif (($b & 0xE0) == 0xC0) $n=1; # 110bbbbb
  elseif (($b & 0xF0) == 0xE0) $n=2; # 1110bbbb
  elseif (($b & 0xF8) == 0xF0) $n=3; # 11110bbb
  elseif (($b & 0xFC) == 0xF8) $n=4; # 111110bb
  elseif (($b & 0xFE) == 0xFC) $n=5; # 1111110b
  else return false; # Does not match any model
  for ($j=0; $j<$n; $j++) { # n bytes matching 10bbbbbb follow ?
   if ((++$i == $len) || ((ord($Str[$i]) & 0xC0) != 0x80))
    return false;
  }
 }
 return true;
}

115

116

/**
 * Unicode aware replacement for strlen()
 *
 * utf8_decode() converts characters that are not in ISO-8859-1
 * to '?', which, for the purpose of counting, is alright - It's
 * even faster than mb_strlen.
 *
 * @author <chernyshevsky at hotmail dot com>
 * @see    strlen()
 * @see    utf8_decode()
 * @param  string $string UTF-8 encoded string
 * @return integer number of characters
 */
function utf8_strlen($string){
  if(function_exists('utf8_decode')){
    return strlen(utf8_decode($string));
  }
  // utf8_decode() is deprecated in recent PHP versions; should it be missing,
  // count every byte that is not a 10bbbbbb continuation byte instead.
  return strlen(preg_replace('/[\x80-\xBF]/', '', $string));
}

130

131

/**
 * UTF-8 aware alternative to substr
 *
 * Return part of a string given character offset (and optionally length)
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @author Chris Smith <chris@jalakai.co.uk>
 * @param string
 * @param integer number of UTF-8 characters offset (from left)
 * @param integer (optional) length in UTF-8 characters from offset
 * @return mixed string or false if failure
 */
function utf8_substr($str, $offset, $length = null) {
    if(UTF8_MBSTRING){
        if( $length === null ){
            return mb_substr($str, $offset);
        }else{
            return mb_substr($str, $offset, $length);
        }
    }

    /*
     * Notes:
     *
     * no mb string support, so we'll use pcre regex's with 'u' flag
     * pcre only supports repetitions of less than 65536, in order to accept up to MAXINT values for
     * offset and length, we'll repeat a group of 65535 characters when needed (ok, up to MAXINT-65536)
     *
     * substr documentation states false can be returned in some cases (e.g. offset > string length)
     * mb_substr never returns false, it will return an empty string instead.
     *
     * calculating the number of characters in the string is a relatively expensive operation, so
     * we only carry it out when necessary. It isn't necessary for +ve offsets and no specified length
     */

    // cast parameters to appropriate types to avoid multiple notices/warnings
    $str = (string)$str;                          // generates E_NOTICE for PHP4 objects, but not PHP5 objects
    $offset = (int)$offset;
    if (!is_null($length)) $length = (int)$length;

    // handle trivial cases
    if ($length === 0) return '';
    if ($offset < 0 && $length < 0 && $length < $offset) return '';

    $offset_pattern = '';
    $length_pattern = '';

    // normalise -ve offsets (we could use a tail anchored pattern, but they are horribly slow!)
    if ($offset < 0) {
      $strlen = strlen(utf8_decode($str));        // see notes
      $offset = $strlen + $offset;
      if ($offset < 0) $offset = 0;
    }

    // establish a pattern for offset, a non-captured group equal in length to offset
    if ($offset > 0) {
      $Ox = (int)($offset/65535);
      $Oy = $offset%65535;

      if ($Ox) $offset_pattern = '(?:.{65535}){'.$Ox.'}';
      $offset_pattern = '^(?:'.$offset_pattern.'.{'.$Oy.'})';
    } else {
      $offset_pattern = '^';                      // offset == 0; just anchor the pattern
    }

    // establish a pattern for length
    if (is_null($length)) {
      $length_pattern = '(.*)$';                  // the rest of the string
    } else {

      if (!isset($strlen)) $strlen = strlen(utf8_decode($str));    // see notes
      if ($offset > $strlen) return '';           // another trivial case

      if ($length > 0) {

        $length = min($strlen-$offset, $length);  // reduce any length that would go past the end of the string

        $Lx = (int)($length/65535);
        $Ly = $length%65535;

        // +ve length requires ... a captured group of length characters
        if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
        $length_pattern = '('.$length_pattern.'.{'.$Ly.'})';

      } else if ($length < 0) {

        if ($length < ($offset - $strlen)) return '';

        $Lx = (int)((-$length)/65535);
        $Ly = (-$length)%65535;

        // -ve length requires ... capture everything except a group of -length characters
        //                         anchored at the tail-end of the string
        if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
        $length_pattern = '(.*)(?:'.$length_pattern.'.{'.$Ly.'})$';
      }
    }

    if (!preg_match('#'.$offset_pattern.$length_pattern.'#us',$str,$match)) return '';
    return $match[1];
}

232

233

/**
 * Unicode aware replacement for substr_replace()
 *
 * Offsets are counted in UTF-8 characters. Note that, unlike
 * substr_replace(), the length defaults to 0 here, i.e. the replacement is
 * inserted at $start without removing anything.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    substr_replace()
 * @param  string  $string      UTF-8 encoded input
 * @param  string  $replacement text to put in place
 * @param  integer $start       character offset where the replacement starts
 * @param  integer $length      number of characters to replace
 * @return string
 */
function utf8_substr_replace($string, $replacement, $start , $length=0 ){
  $ret = '';
  if($start>0) $ret .= utf8_substr($string, 0, $start);
  $ret .= $replacement;
  $ret .= utf8_substr($string, $start+$length);
  return $ret;
}

246

247

/**
 * Unicode aware replacement for ltrim()
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    ltrim()
 * @param  string $str      UTF-8 encoded string
 * @param  string $charlist characters to strip; empty means plain ltrim()
 * @return string
 */
function utf8_ltrim($str,$charlist=''){
  if($charlist == '') return ltrim($str);

  //quote charlist for use in a characterclass
  $charlist = preg_replace('!([\\\\\\-\\]\\[/])!','\\\${1}',$charlist);

  return preg_replace('/^['.$charlist.']+/u','',$str);
}

262

263

/**
 * Unicode aware replacement for rtrim()
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    rtrim()
 * @param  string $str      UTF-8 encoded string
 * @param  string $charlist characters to strip; empty means plain rtrim()
 * @return string
 */
function  utf8_rtrim($str,$charlist=''){
  if($charlist == '') return rtrim($str);

  //quote charlist for use in a characterclass
  $charlist = preg_replace('!([\\\\\\-\\]\\[/])!','\\\${1}',$charlist);

  return preg_replace('/['.$charlist.']+$/u','',$str);
}

278

279

/**
 * Unicode aware replacement for trim()
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    trim()
 * @param  string $str      UTF-8 encoded string
 * @param  string $charlist characters to strip; empty means plain trim()
 * @return string
 */
function  utf8_trim($str,$charlist='') {
  if($charlist == '') return trim($str);

  return utf8_ltrim(utf8_rtrim($str,$charlist),$charlist);
}

291

292

/**
 * This is a unicode aware replacement for strtolower()
 *
 * Uses mb_string extension if available, otherwise the global
 * $UTF8_UPPER_TO_LOWER translation table.
 *
 * @author Leo Feyer <leo@typolight.org>
 * @see    strtolower()
 * @see    utf8_strtoupper()
 * @param  string $string UTF-8 encoded string
 * @return string
 */
function utf8_strtolower($string){
  if(UTF8_MBSTRING) return mb_strtolower($string,'utf-8');

  global $UTF8_UPPER_TO_LOWER;
  return strtr($string,$UTF8_UPPER_TO_LOWER);
}

307

308

/**
 * This is a unicode aware replacement for strtoupper()
 *
 * Uses mb_string extension if available, otherwise the global
 * $UTF8_LOWER_TO_UPPER translation table.
 *
 * @author Leo Feyer <leo@typolight.org>
 * @see    strtoupper()
 * @see    utf8_strtolower()
 * @param  string $string UTF-8 encoded string
 * @return string
 */
function utf8_strtoupper($string){
  if(UTF8_MBSTRING) return mb_strtoupper($string,'utf-8');

  global $UTF8_LOWER_TO_UPPER;
  return strtr($string,$UTF8_LOWER_TO_UPPER);
}

323

324

/**
 * Romanize a non-latin string
 *
 * Pure ASCII input is returned untouched; anything else is run through the
 * global $UTF8_ROMANIZATION translation table.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string UTF-8 encoded string
 * @return string
 */
function utf8_romanize($string){
  if(utf8_isASCII($string)) return $string; //nothing to do

  global $UTF8_ROMANIZATION;
  return strtr($string,$UTF8_ROMANIZATION);
}

335

336

/**
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * This function adds the controlchars 0x00 to 0x19 to the array of
 * stripped chars (they are not included in $UTF8_SPECIAL_CHARS2)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string     The UTF8 string to strip of special chars
 * @param  string $repl       Replace special with this string
 * @param  string $additional Additional chars to strip (used in regexp char class)
 * @return string
 */
function utf8_stripspecials($string,$repl='',$additional=''){
  global $UTF8_SPECIAL_CHARS2;

  // the quoted character list is built once and reused by later calls
  static $specials = null;
  if(is_null($specials)){
    $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
  }

  return preg_replace('/['.$additional.'\x00-\x19'.$specials.']/u',$repl,$string);
}

357

358

/**
 * This is an Unicode aware replacement for strpos
 *
 * @author Leo Feyer <leo@typolight.org>
 * @see    strpos()
 * @param  string  $haystack UTF-8 encoded string to search in
 * @param  string  $needle   string to search for
 * @param  integer $offset   character offset to start searching from
 * @return integer|false character position of the match, false if not found
 */
function utf8_strpos($haystack, $needle, $offset=0){
    $comp = 0;
    $length = null;

    while (is_null($length) || $length < $offset) {
        // $offset counts characters while strpos() counts bytes; $comp is the
        // byte/character difference accumulated from the previous round
        $start = $offset + $comp;

        // strpos() raises an error for an offset beyond the end of the
        // haystack on PHP 8; nothing can be found there anyway
        if ($start > strlen($haystack))
            return false;

        $pos = strpos($haystack, $needle, $start);

        if ($pos === false)
            return false;

        $length = utf8_strlen(substr($haystack, 0, $pos));

        if ($length < $offset)
            $comp = $pos - $length;
    }

    return $length;
}

386

387

/**
 * Encodes UTF-8 characters to HTML entities
 *
 * ASCII characters are copied as they are, every other code point becomes a
 * decimal numeric entity (&#NNN;).
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @author <vpribish at shopping dot com>
 * @link   http://www.php.net/manual/en/function.utf8-decode.php
 * @param  string $str UTF-8 encoded string
 * @return string
 */
function utf8_tohtml ($str) {
    $ret = '';
    foreach (utf8_to_unicode($str) as $cp) {
        if ($cp < 0x80)
            $ret .= chr($cp);
        else
            $ret .= "&#$cp;";
    }
    return $ret;
}

407

408

/**
 * Decodes HTML entities to UTF-8 characters
 *
 * Convert any &#..; entity to a codepoint,
 * The entities flag defaults to only decoding numeric entities.
 * Pass HTML_ENTITIES and named entities, including &amp; &lt; etc.
 * are handled as well. Avoids the problem that would occur if you
 * had to decode "&amp;#38;&#38;amp;#38;"
 *
 * unhtmlspecialchars(utf8_unhtml($s)) -> "&#38;&#38;"
 * utf8_unhtml(unhtmlspecialchars($s)) -> "&&amp#38;"
 * what it should be                   -> "&#38;&amp#38;"
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @param  string  $str      UTF-8 encoded string
 * @param  boolean $entities Flag controlling decoding of named entities.
 * @return UTF-8 encoded string with numeric (and named) entities replaced.
 */
function utf8_unhtml($str, $entities=null) {
    // the decoder builds its entity table once and is reused by later calls
    static $decoder = null;
    if (is_null($decoder))
      $decoder = new utf8_entity_decoder();
    if (is_null($entities))
        return preg_replace_callback('/(&#([Xx])?([0-9A-Za-z]+);)/m',
                                     'utf8_decode_numeric', $str);
    else
        return preg_replace_callback('/&(#)?([Xx])?([0-9A-Za-z]+);/m',
                                     array(&$decoder, 'decode'), $str);
}

437

/**
 * preg_replace_callback() helper: turns one numeric entity match into UTF-8
 *
 * @param  array $ent match array; [2] is 'x'/'X' for hexadecimal entities
 *                    (empty otherwise), [3] holds the digits
 * @return string the UTF-8 encoding of the referenced code point
 */
function utf8_decode_numeric($ent) {
    switch ($ent[2]) {
      case 'X':
      case 'x':
          $cp = hexdec($ent[3]);
          break;
      default:
          $cp = intval($ent[3]);
          break;
    }
    return unicode_to_utf8(array($cp));
}

449

/**
 * Callback object used by utf8_unhtml() to decode numeric and named entities
 */
class utf8_entity_decoder {
    // maps a named entity (e.g. '&auml;') to its UTF-8 encoded character
    var $table;

    /**
     * Builds the entity => UTF-8 lookup table
     */
    function __construct() {
        // makeutf8() applies ord() to every table value, so the values must
        // be single ISO-8859-1 bytes. Newer PHP versions default to a UTF-8
        // table; request ISO-8859-1 explicitly where the third parameter is
        // available.
        if (version_compare(PHP_VERSION, '5.3.4', '>=')) {
            $table = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'ISO-8859-1');
        } else {
            $table = get_html_translation_table(HTML_ENTITIES);
        }
        $table = array_flip($table);
        $this->table = array_map(array(&$this,'makeutf8'), $table);
    }

    /**
     * Old-style constructor, kept for PHP 4 and for code calling it directly.
     * PHP 8 no longer treats a method named like its class as a constructor.
     */
    function utf8_entity_decoder() {
        $this->__construct();
    }

    /**
     * Converts a single-byte character to its UTF-8 encoding
     */
    function makeutf8($c) {
        return unicode_to_utf8(array(ord($c)));
    }

    /**
     * preg_replace_callback() handler; $ent[1] is '#' for numeric entities.
     * Unknown named entities are returned unchanged.
     */
    function decode($ent) {
        if ($ent[1] == '#') {
            return utf8_decode_numeric($ent);
        } elseif (array_key_exists($ent[0],$this->table)) {
            return $this->table[$ent[0]];
        } else {
            return $ent[0];
        }
    }
}

/**
 * Takes an UTF-8 string and returns an array of ints representing the
 * Unicode characters. Astral planes are supported ie. the ints in the
 * output can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
 * are not allowed.
 *
 * If $strict is set to true the function returns false if the input
 * string isn't a valid UTF-8 octet sequence and raises a PHP error at
 * level E_USER_WARNING
 *
 * Note: this function has been modified slightly in this library to
 * trigger errors on encountering bad bytes
 *
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @param  string  UTF-8 encoded string
 * @param  boolean Check for invalid sequences?
 * @return mixed array of unicode code points or false if UTF-8 invalid
 * @see    unicode_to_utf8
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function utf8_to_unicode($str,$strict=false) {
    $mState = 0;     // cached expected number of octets after the current octet
                     // until the beginning of the next UTF8 character sequence
    $mUcs4  = 0;     // cached Unicode character
    $mBytes = 1;     // cached expected number of octets in the current sequence

    $out = array();

    $len = strlen($str);

    for($i = 0; $i < $len; $i++) {

        // square-bracket offsets: the curly-brace form was removed in PHP 8
        $in = ord($str[$i]);

        if ( $mState == 0) {

            // When mState is zero we expect either a US-ASCII character or a
            // multi-octet sequence.
            if (0 == (0x80 & ($in))) {
                // US-ASCII, pass straight through.
                $out[] = $in;
                $mBytes = 1;

            } else if (0xC0 == (0xE0 & ($in))) {
                // First octet of 2 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;

            } else if (0xE0 == (0xF0 & ($in))) {
                // First octet of 3 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;

            } else if (0xF0 == (0xF8 & ($in))) {
                // First octet of 4 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;

            } else if (0xF8 == (0xFC & ($in))) {
                /* First octet of 5 octet sequence.
                 *
                 * This is illegal because the encoded codepoint must be either
                 * (a) not the shortest form or
                 * (b) outside the Unicode range of 0-0x10FFFF.
                 * Rather than trying to resynchronize, we will carry on until the end
                 * of the sequence and let the later error handling code catch it.
                 */
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x03) << 24;
                $mState = 4;
                $mBytes = 5;

            } else if (0xFC == (0xFE & ($in))) {
                // First octet of 6 octet sequence, see comments for 5 octet sequence.
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 1) << 30;
                $mState = 5;
                $mBytes = 6;

            } elseif($strict) {
                /* Current octet is neither in the US-ASCII range nor a legal first
                 * octet of a multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Illegal sequence identifier '.
                            'in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );
                return false;
            }

        } else {

            // When mState is non-zero, we expect a continuation of the multi-octet
            // sequence
            if (0x80 == (0xC0 & ($in))) {

                // Legal continuation.
                $shift = ($mState - 1) * 6;
                $tmp = $in;
                $tmp = ($tmp & 0x0000003F) << $shift;
                $mUcs4 |= $tmp;

                /*
                 * End of the multi-octet sequence. mUcs4 now contains the final
                 * Unicode codepoint to be output
                 */
                if (0 == --$mState) {

                    /*
                     * Check for illegal sequences and codepoints.
                     */
                    // From Unicode 3.1, non-shortest form is illegal
                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                        ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                        (4 < $mBytes) ||
                        // From Unicode 3.2, surrogate characters are illegal
                        (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                        // Codepoints outside the Unicode range are illegal
                        ($mUcs4 > 0x10FFFF)) {

                        if($strict){
                            trigger_error(
                                    'utf8_to_unicode: Illegal sequence or codepoint '.
                                        'in UTF-8 at byte '.$i,
                                    E_USER_WARNING
                                );

                            return false;
                        }

                    }

                    if (0xFEFF != $mUcs4) {
                        // BOM is legal but we don't want to output it
                        $out[] = $mUcs4;
                    }

                    //initialize UTF8 cache
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                }

            } elseif($strict) {
                /*
                 *((0xC0 & (*in) != 0x80) && (mState != 0))
                 * Incomplete multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Incomplete multi-octet '.
                        '   sequence in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );

                return false;
            }
        }
    }
    return $out;
}

641

642

/**
 * Takes an array of ints representing the Unicode characters and returns
 * a UTF-8 string. Astral planes are supported ie. the ints in the
 * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
 * are not allowed.
 *
 * If $strict is set to true the function returns false if the input
 * array contains ints that represent surrogates or are outside the
 * Unicode range and raises a PHP error at level E_USER_WARNING.
 * Without $strict such values are silently skipped.
 *
 * Note: the result is built by plain string concatenation. The earlier
 * output-buffering variant left its buffer open whenever strict mode
 * returned early.
 *
 * @param  array of unicode code points representing a string
 * @param  boolean Check for invalid sequences?
 * @return mixed UTF-8 string or false if array contains invalid code points
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    utf8_to_unicode
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function unicode_to_utf8($arr,$strict=false) {
    if (!is_array($arr)) return '';
    $result = '';

    foreach (array_keys($arr) as $k) {

        # ASCII range (including control chars)
        if ( ($arr[$k] >= 0) && ($arr[$k] <= 0x007f) ) {

            $result .= chr($arr[$k]);

        # 2 byte sequence
        } else if ($arr[$k] <= 0x07ff) {

            $result .= chr(0xc0 | ($arr[$k] >> 6));
            $result .= chr(0x80 | ($arr[$k] & 0x003f));

        # Byte order mark (skip)
        } else if($arr[$k] == 0xFEFF) {

            // nop -- zap the BOM

        # Test for illegal surrogates
        } else if ($arr[$k] >= 0xD800 && $arr[$k] <= 0xDFFF) {

            // found a surrogate
            if($strict){
                trigger_error(
                    'unicode_to_utf8: Illegal surrogate '.
                        'at index: '.$k.', value: '.$arr[$k],
                    E_USER_WARNING
                    );
                return false;
            }

        # 3 byte sequence
        } else if ($arr[$k] <= 0xffff) {

            $result .= chr(0xe0 | ($arr[$k] >> 12));
            $result .= chr(0x80 | (($arr[$k] >> 6) & 0x003f));
            $result .= chr(0x80 | ($arr[$k] & 0x003f));

        # 4 byte sequence
        } else if ($arr[$k] <= 0x10ffff) {

            $result .= chr(0xf0 | ($arr[$k] >> 18));
            $result .= chr(0x80 | (($arr[$k] >> 12) & 0x3f));
            $result .= chr(0x80 | (($arr[$k] >> 6) & 0x3f));
            $result .= chr(0x80 | ($arr[$k] & 0x3f));

        } elseif($strict) {

            trigger_error(
                'unicode_to_utf8: Codepoint out of Unicode range '.
                    'at index: '.$k.', value: '.$arr[$k],
                E_USER_WARNING
                );

            // out of range
            return false;
        }
    }

    return $result;
}

732

733

/**
 * Replace bad bytes with an alternative character
 *
 * ASCII character is recommended for replacement char
 *
 * PCRE Pattern to locate bad bytes in a UTF-8 string
 * Comes from W3 FAQ: Multilingual Forms
 * Note: modified to include full ASCII range including control chars
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 * @param string to search
 * @param string to replace bad bytes with (defaults to '', i.e. bad bytes
 *               are removed) - use ASCII
 * @return string
 */
function utf8_bad_replace($str, $replace = '') {
    $UTF8_BAD =
     '([\x00-\x7F]'.                          # ASCII (including control chars)
     '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
     '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
     '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
     '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
     '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
     '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
     '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
     '|(.{1}))';                              # invalid byte

    // Every byte is consumed by one of the alternatives, so the matches tile
    // the string from left to right; a single pass collects them all instead
    // of re-matching and copying the remainder for each character.
    $result = '';
    if (preg_match_all('/'.$UTF8_BAD.'/S', (string)$str, $matches, PREG_SET_ORDER)) {
        foreach ($matches as $match) {
            // group 2 is only filled by the "invalid byte" alternative
            if (isset($match[2]) && $match[2] !== '') {
                $result .= $replace;
            } else {
                $result .= $match[0];
            }
        }
    }
    return $result;
}

772

773

/**
 * URL-Encode a filename to allow unicodecharacters
 *
 * Slashes are not encoded
 *
 * When the second parameter is true the string will
 * be encoded only if non ASCII characters are detected -
 * This makes it safe to run it multiple times on the
 * same string (default is true)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 * @param  string  $file filename or path
 * @param  boolean $safe skip names that only contain a-z A-Z 0-9 / _ - . %
 * @return string
 */
function utf8_encodeFN($file,$safe=true){
  if($safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#',$file)){
    return $file;
  }
  $file = urlencode($file);
  $file = str_replace('%2F','/',$file);
  return $file;
}

794

795

/**
 * URL-Decode a filename
 *
 * This is just a wrapper around urldecode
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 * @param  string $file URL-encoded filename
 * @return string
 */
function utf8_decodeFN($file){
  $file = urldecode($file);
  return $file;
}

807

808

/*

809

 * Moved some functions from framework/functions.php to here - thorn

810

*/

811

812

/**
 * Decode HTML entities to UTF-8 characters
 *
 * Will replace all numeric and named entities, except
 * &gt; &lt; &apos; &quot; &#39; &nbsp;
 *
 * @param  string UTF-8 or ASCII encoded string
 * @return string UTF-8 encoded string with numeric and named entities replaced.
 */
function utf8_entities_to_umlauts($str) {
	global $named_to_numbered_entities;
	// we have to prevent "&#39;" from being decoded: mask it before
	// decoding and restore it afterwards
	$str = str_replace("&#39;", "&_#39;", $str);
	$str = strtr($str, $named_to_numbered_entities);
	$str = utf8_unhtml($str);
	$str = str_replace("&_#39;", "&#39;", $str);

	return($str);
}

831

832

/**
 * Encode UTF-8 characters to HTML entities
 *
 * Will replace all UTF-8 encoded characters to numeric/named entities
 *
 * @param  string UTF-8 encoded string
 * @param  bool Replace numbered by named entities
 * @return string ASCII encoded string with all UTF-8 characters replaced by numeric/named entities
 */
function utf8_umlauts_to_entities($str, $named_entities=true) {
	global $numbered_to_named_entities;
	$str = utf8_tohtml($str);
	if($named_entities) {
		$str = strtr($str, $numbered_to_named_entities);
	}
	return($str);
}

848

849

/*
 * Converts from various charsets to UTF-8
 *
 * Will convert a string from various charsets to UTF-8.
 * HTML-entities will be converted, too.
 * In case of error the returned string is unchanged, and a message is emitted.
 * Supported charsets are:
 * direct: iso_8859_1 iso_8859_2 iso_8859_3 iso_8859_4 iso_8859_5
 *         iso_8859_6 iso_8859_7 iso_8859_8 iso_8859_9 iso_8859_10 iso_8859_11
 * mb_convert_encoding: all wb charsets (except those from 'direct'); but not GB2312
 * iconv:  all wb charsets (except those from 'direct')
 *
 * @param  string  A string in supported encoding
 * @param  string  The charset to convert from, defaults to DEFAULT_CHARSET.
 *                 The name is compared case-insensitively; an empty name is treated as UTF-8.
 * @return string  A string in UTF-8-encoding, with all entities decoded, too.
 *                 String is unchanged in case of error.
*/
function charset_to_utf8($str, $charset_in=DEFAULT_CHARSET) {
	global $iso_8859_2_to_utf8, $iso_8859_3_to_utf8, $iso_8859_4_to_utf8, $iso_8859_5_to_utf8, $iso_8859_6_to_utf8, $iso_8859_7_to_utf8, $iso_8859_8_to_utf8, $iso_8859_9_to_utf8, $iso_8859_10_to_utf8, $iso_8859_11_to_utf8;

	// All charset comparisons below are done against UPPER-CASE names.
	$charset_in = strtoupper($charset_in);
	if($charset_in == "") { $charset_in = 'UTF-8'; }

	$have_iconv = function_exists('iconv');
	$converted = false;

	// Bail out early for charsets which need a converter we do not have.
	// The names have to be upper case here, because $charset_in was upper-cased above
	// (comparing against lower-case names would never match).
	$needs_mb_or_iconv = ($charset_in=='BIG5' || $charset_in=='ISO-2022-JP' || $charset_in=='ISO-2022-KR');
	if((!$have_iconv && !UTF8_MBSTRING && $needs_mb_or_iconv) || (!$have_iconv && $charset_in=='GB2312')) {
		// Nothing we can do here :-(
		// Charset is one of those obscure ISO-2022... or BIG5, GB2312 or something
		// and we can't use mb_convert_encoding() or iconv();
		// Emit an error-message.
		trigger_error("Can't convert from $charset_in without mb_convert_encoding() or iconv(). Use UTF-8 instead.", E_USER_WARNING);
		return($str);
	}

	// check if we have UTF-8 or a plain ASCII string
	if($charset_in == 'UTF-8' || utf8_isASCII($str)) {
		// we have utf-8. Just replace HTML-entities (if there are any) and return
		if(preg_match('/&[#0-9a-zA-Z]+;/',$str)) {
			return(utf8_entities_to_umlauts($str));
		}
		return($str);
	}

	// Convert $str to utf8: first try the built-in ISO-8859 handling
	if(substr($charset_in,0,8) == 'ISO-8859') {
		$converted = true;
		switch($charset_in) {
			// NOTE: utf8_encode() is deprecated as of PHP 8.2; kept here because it
			// works without the mbstring/iconv extensions.
			case 'ISO-8859-1': $str=utf8_encode($str); break;
			case 'ISO-8859-2': $str=strtr($str, $iso_8859_2_to_utf8); break;
			case 'ISO-8859-3': $str=strtr($str, $iso_8859_3_to_utf8); break;
			case 'ISO-8859-4': $str=strtr($str, $iso_8859_4_to_utf8); break;
			case 'ISO-8859-5': $str=strtr($str, $iso_8859_5_to_utf8); break;
			case 'ISO-8859-6': $str=strtr($str, $iso_8859_6_to_utf8); break;
			case 'ISO-8859-7': $str=strtr($str, $iso_8859_7_to_utf8); break;
			case 'ISO-8859-8': $str=strtr($str, $iso_8859_8_to_utf8); break;
			case 'ISO-8859-9': $str=strtr($str, $iso_8859_9_to_utf8); break;
			case 'ISO-8859-10': $str=strtr($str, $iso_8859_10_to_utf8); break;
			case 'ISO-8859-11': $str=strtr($str, $iso_8859_11_to_utf8); break;
			// an ISO-8859 variant we have no table for: leave it to mb/iconv below
			default: $converted = false;
		}
	}

	if(!$converted && UTF8_MBSTRING && $charset_in != 'GB2312') {
		// $charset is neither UTF-8 nor a known ISO-8859...
		// Try mb_convert_encoding() - but there's no GB2312 encoding in php's mb_* functions
		$result = mb_convert_encoding($str, 'UTF-8', $charset_in);
		// keep the original string if the conversion failed
		if($result !== false) {
			$str = $result;
			$converted = true;
		}
	} elseif(!$converted && $have_iconv) {
		// Try iconv; it returns false on failure, in which case $str stays untouched
		$result = iconv($charset_in, 'UTF-8', $str);
		if($result !== false) {
			$str = $result;
			$converted = true;
		}
	}

	if($converted) {
		// we have utf-8, now replace HTML-entities and return
		if(preg_match('/&[#0-9a-zA-Z]+;/',$str)) {
			$str = utf8_entities_to_umlauts($str);
		}
		// just to be sure, replace bad characters
		$str = utf8_bad_replace($str, '?');
		return($str);
	}

	// Nothing we can do here :-(
	// Charset is one of those obscure ISO-2022... or BIG5, GB2312 or something
	// and we can't use mb_convert_encoding() or iconv() (or the conversion failed);
	// Emit an error-message and hand back the unchanged string.
	trigger_error("Can't convert from $charset_in without mb_convert_encoding() or iconv(). Use UTF-8 instead.", E_USER_WARNING);

	return $str;
}

938

939

/*
 * Converts from UTF-8 to various charsets
 *
 * Will convert a string from UTF-8 to various charsets.
 * HTML-entities will be converted, too.
 * In case of error the returned string is unchanged, and a message is emitted.
 * Supported charsets are:
 * direct: iso_8859_1 iso_8859_2 iso_8859_3 iso_8859_4 iso_8859_5
 *         iso_8859_6 iso_8859_7 iso_8859_8 iso_8859_9 iso_8859_10 iso_8859_11
 * mb_convert_encoding: all wb charsets (except those from 'direct'); but not GB2312
 * iconv:  all wb charsets (except those from 'direct')
 *
 * @param  string  An UTF-8 encoded string
 * @param  string  The charset to convert to, defaults to DEFAULT_CHARSET.
 *                 The name is compared case-insensitively; an empty name is treated as UTF-8
 *                 (same convention as charset_to_utf8()).
 * @return string  A string in a supported encoding, with all entities decoded, too.
 *                 String is unchanged in case of error.
*/
function utf8_to_charset($str, $charset_out=DEFAULT_CHARSET) {
	global $utf8_to_iso_8859_2, $utf8_to_iso_8859_3, $utf8_to_iso_8859_4, $utf8_to_iso_8859_5, $utf8_to_iso_8859_6, $utf8_to_iso_8859_7, $utf8_to_iso_8859_8, $utf8_to_iso_8859_9, $utf8_to_iso_8859_10, $utf8_to_iso_8859_11;

	// All charset comparisons below are done against UPPER-CASE names.
	$charset_out = strtoupper($charset_out);
	if($charset_out == "") { $charset_out = 'UTF-8'; }

	$have_iconv = function_exists('iconv');
	$converted = false;

	// Bail out early for charsets which need a converter we do not have.
	// The names have to be upper case here, because $charset_out was upper-cased above
	// (comparing against lower-case names would never match).
	$needs_mb_or_iconv = ($charset_out=='BIG5' || $charset_out=='ISO-2022-JP' || $charset_out=='ISO-2022-KR');
	if((!$have_iconv && !UTF8_MBSTRING && $needs_mb_or_iconv) || (!$have_iconv && $charset_out=='GB2312')) {
		// Nothing we can do here :-(
		// Charset is one of those obscure ISO-2022... or BIG5, GB2312 or something
		// and we can't use mb_convert_encoding() or iconv();
		// Emit an error-message.
		trigger_error("Can't convert into $charset_out without mb_convert_encoding() or iconv(). Use UTF-8 instead.", E_USER_WARNING);
		return($str);
	}

	// replace HTML-entities first
	if(preg_match('/&[#0-9a-zA-Z]+;/',$str)) {
		$str = utf8_entities_to_umlauts($str);
	}

	// check if we need to convert
	if($charset_out == 'UTF-8' || utf8_isASCII($str)) {
		// Nothing to do. Just return
		return($str);
	}

	// Convert $str to $charset_out: first try the built-in ISO-8859 handling
	if(substr($charset_out,0,8) == 'ISO-8859') {
		$converted = true;
		switch($charset_out) {
			// NOTE: utf8_decode() is deprecated as of PHP 8.2; kept here because it
			// works without the mbstring/iconv extensions.
			case 'ISO-8859-1': $str=utf8_decode($str); break;
			case 'ISO-8859-2': $str=strtr($str, $utf8_to_iso_8859_2); break;
			case 'ISO-8859-3': $str=strtr($str, $utf8_to_iso_8859_3); break;
			case 'ISO-8859-4': $str=strtr($str, $utf8_to_iso_8859_4); break;
			case 'ISO-8859-5': $str=strtr($str, $utf8_to_iso_8859_5); break;
			case 'ISO-8859-6': $str=strtr($str, $utf8_to_iso_8859_6); break;
			case 'ISO-8859-7': $str=strtr($str, $utf8_to_iso_8859_7); break;
			case 'ISO-8859-8': $str=strtr($str, $utf8_to_iso_8859_8); break;
			case 'ISO-8859-9': $str=strtr($str, $utf8_to_iso_8859_9); break;
			case 'ISO-8859-10': $str=strtr($str, $utf8_to_iso_8859_10); break;
			case 'ISO-8859-11': $str=strtr($str, $utf8_to_iso_8859_11); break;
			// an ISO-8859 variant we have no table for: leave it to mb/iconv below
			default: $converted = false;
		}
	}

	if(!$converted && UTF8_MBSTRING && $charset_out != 'GB2312') {
		// $charset is neither UTF-8 nor a known ISO-8859...
		// Try mb_convert_encoding() - but there's no GB2312 encoding in php's mb_* functions
		$result = mb_convert_encoding($str, $charset_out, 'UTF-8');
		// keep the original string if the conversion failed
		if($result !== false) {
			$str = $result;
			$converted = true;
		}
	} elseif(!$converted && $have_iconv) {
		// Try iconv; it returns false on failure, in which case $str stays untouched
		$result = iconv('UTF-8', $charset_out, $str);
		if($result !== false) {
			$str = $result;
			$converted = true;
		}
	}

	if($converted) {
		return($str);
	}

	// Nothing we can do here :-(
	// Charset is one of those obscure ISO-2022... or BIG5, GB2312 or something
	// and we can't use mb_convert_encoding() or iconv() (or the conversion failed);
	// Emit an error-message and hand back the string.
	trigger_error("Can't convert into $charset_out without mb_convert_encoding() or iconv(). Use UTF-8 instead.", E_USER_WARNING);

	return $str;
}

1023

1024

/*
 * convert Filenames to ASCII
 *
 * Convert all non-ASCII characters and all HTML-entities to their plain 7bit equivalents
 * Characters without an equivalent will be converted to hex-values.
 * The name entities_to_7bit() is somewhat misleading, but kept for compatibility-reasons.
 *
 * @param  string  Filename to convert (all encodings from charset_to_utf8() are allowed)
 * @return string  ASCII encoded string, to use as filename in wb's page_filename() and media_filename
*/
function entities_to_7bit($str) {
	// convert to UTF-8
	$str = charset_to_utf8($str);
	// replace some specials
	$str = utf8_stripspecials($str, '_');
	// translate non-ASCII characters to ASCII
	$str = utf8_romanize($str);
	// missed some? - Many UTF-8-chars can't be romanized
	// convert to HTML-entities, and replace entities by hex-numbers
	$str = utf8_umlauts_to_entities($str, false);
	// protect the apostrophe from the hex replacement; it is mapped to '_' further down
	$str = str_replace('&#39;', '&apos;', $str);
	// A callback is used instead of the /e modifier: /e was removed in PHP 7
	// (preg_replace() would return NULL) and evaluated string-built code.
	$str = preg_replace_callback('/&#([0-9]+);/', '_entities_to_7bit_dechex', $str);
	// maybe there are some &gt; &lt; &apos; &quot; &amp; &nbsp; left, replace them too
	$entities = array('&gt;'=>'_','&lt;'=>'_','&apos;'=>'_','&quot;'=>'_','&amp;'=>'_','&nbsp;'=>' ');
	$str = strtr($str, $entities);

	return($str);
}

/*
 * Helper for entities_to_7bit(): turns the decimal number of a matched numeric entity into its hex form.
 *
 * @param  array   preg match; index 1 holds the decimal digits of the entity
 * @return string  The number as lower-case hexadecimal string (e.g. "228" => "e4")
*/
function _entities_to_7bit_dechex($match) {
	return dechex((int)$match[1]);
}

1052

1053

/*
 * Convert a string from mixed html-entities/umlauts to pure $charset_out-umlauts
 *
 * Will replace all numeric and named entities except
 * &gt; &lt; &apos; &quot; &#39; &nbsp;
 * In case of error the returned string is unchanged, and a message is emitted.
 * Supported charsets are:
 * direct: iso_8859_1 iso_8859_2 iso_8859_3 iso_8859_4 iso_8859_5
 *         iso_8859_6 iso_8859_7 iso_8859_8 iso_8859_9 iso_8859_10 iso_8859_11
 * mb_convert_encoding: all wb charsets (except those from 'direct'); but not GB2312
 * iconv:  all wb charsets (except those from 'direct')
 *
 * @param  string  A string in DEFAULT_CHARSET encoding
 * @param  string  The charset to convert to, defaults to DEFAULT_CHARSET
 * @return string  A string in $charset_out encoding with numeric and named entities replaced.
 *         The string is unchanged in case of error.
*/
function entities_to_umlauts2($string, $charset_out=DEFAULT_CHARSET) {
	// The input is always read as DEFAULT_CHARSET, independent of $charset_out.
	$string = charset_to_utf8($string, DEFAULT_CHARSET);

	// Only convert to the target charset if utf8_check() accepts the intermediate string.
	if(utf8_check($string)) {
		$string = utf8_to_charset($string, $charset_out);
	}

	return ($string);
}

1075

1076

/*
 * Convert a string from mixed html-entities/umlauts to pure ASCII with HTML-entities
 *
 * Will convert a string in $charset_in encoding to a pure ASCII string with HTML-entities.
 * In case of error the returned string is unchanged, and a message is emitted.
 * Supported charsets are:
 * direct: iso_8859_1 iso_8859_2 iso_8859_3 iso_8859_4 iso_8859_5
 *         iso_8859_6 iso_8859_7 iso_8859_8 iso_8859_9 iso_8859_10 iso_8859_11
 * mb_convert_encoding: all wb charsets (except those from 'direct'); but not GB2312
 * iconv:  all wb charsets (except those from 'direct')
 *
 * @param  string  A string in $charset_in encoding
 * @param  string  The charset of the input string, defaults to DEFAULT_CHARSET
 * @return string  A string in ASCII encoding with numeric and named entities.
 *         The string is unchanged in case of error.
*/
function umlauts_to_entities2($string, $charset_in=DEFAULT_CHARSET) {
	$string = charset_to_utf8($string, $charset_in);

	// Only encode to entities if utf8_check() accepts the intermediate string.
	if(utf8_check($string)) {
		$string = utf8_umlauts_to_entities($string);
	}

	return($string);
}

1097

1098

?>

Project

General

Profile

WB 2.08.x

wb-archiv283/trunk/wb/framework/functions-utf8.php @ 552