/trunk/framework/functions-utf8.php - Annotate - WB 2.11.0 - Tracking

2

Manuela

<?php

2

3

// $Id$

4

5

/*

6

7

 Website Baker Project <http://www.websitebaker.org/>

8

 Copyright (C) 2004-2009, Ryan Djurovich

9

10

 Website Baker is free software; you can redistribute it and/or modify

11

 it under the terms of the GNU General Public License as published by

12

 the Free Software Foundation; either version 2 of the License, or

13

 (at your option) any later version.

14

15

 Website Baker is distributed in the hope that it will be useful,

16

 but WITHOUT ANY WARRANTY; without even the implied warranty of

17

 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

18

 GNU General Public License for more details.

19

20

 You should have received a copy of the GNU General Public License

21

 along with Website Baker; if not, write to the Free Software

22

 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

23

24

*/

25

26

/*

27

 * A part of this file is based on 'utf8.php' from the DokuWiki-project.

28

 * (http://www.splitbrain.org/projects/dokuwiki):

29

**

30

 * UTF8 helper functions

31

 * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)

32

 * @author     Andreas Gohr <andi@splitbrain.org>

33

**

34

 * modified for use with Website Baker

35

 * from thorn, Jan. 2008

36

37

 * most of the original functions appeared to be too slow with large strings, so I replaced them with my own ones

38

 * thorn, Mar. 2008

39

*/

40

41

// Functions we use in Website Baker:

42

//   entities_to_7bit()

43

//   entities_to_umlauts2()

44

//   umlauts_to_entities2()

45

/* -------------------------------------------------------- */

46

// Must include code to stop this file being accessed directly
if(!defined('WB_PATH')) {
    require_once(dirname(__FILE__).'/globalExceptionHandler.php');
    throw new IllegalFileException();
}
/* -------------------------------------------------------- */
// NOTE(review): 'functions-utf8' is not a legal PHP function name, so this
// guard can never be true. Do not "fix" it by testing for a function declared
// in this file: top-level functions are hoisted when the file is compiled, so
// such a test would already succeed on the very first load and skip the setup
// below. Kept unchanged; confirm the intent before replacing it.
if (function_exists('functions-utf8')){return;}
/*
 * check for mb_string support
 *
 * UTF8_MBSTRING becomes 1 when mb_substr() exists and UTF8_NOMBSTRING is not
 * defined, otherwise 0. A value defined before this file is loaded is kept.
*/
//define('UTF8_NOMBSTRING',1); // uncomment this to forbid use of mb_string-functions
if(!defined('UTF8_MBSTRING')){
  if(function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')){
    define('UTF8_MBSTRING',1);
  }else{
    define('UTF8_MBSTRING',0);
  }
}

if(UTF8_MBSTRING){ mb_internal_encoding('UTF-8'); }

// conversion tables used as globals by the functions below
// (presumably $UTF8_ROMANIZATION, $named_entities, $iso_8859_*_to_utf8, ... - verify against charsets_table.php)
require_once(WB_PATH.'/framework/charsets_table.php');


/*
 * Checks if a string contains 7bit ASCII only
 *
 * @author thorn
 * @param  string $str  The string to inspect
 * @return bool   true if no byte in the range 0x80-0xFF occurs in $str
 *                (an empty string therefore counts as ASCII)
*/
function utf8_isASCII($str){
    // preg_match() yields 1 as soon as a single high byte is found
    return !preg_match('/[\x80-\xFF]/', $str);
}


/*
 * Tries to detect if a string is in Unicode encoding
 *
 * Walks the string byte by byte and verifies that every lead byte is followed
 * by the number of continuation bytes (10bbbbbb) its bit pattern announces.
 * Lead bytes announcing up to five continuation bytes are accepted; overlong
 * encodings and code point ranges are not examined.
 *
 * @author <bmorel@ssi.fr>
 * @link   http://www.php.net/manual/en/function.utf8-encode.php
 * @param  string $Str  The byte string to validate
 * @return bool   true if all multi-byte sequences are structurally complete
*/
function utf8_check($Str) {
 $len = strlen($Str); // the length never changes, so compute it only once
 for ($i=0; $i<$len; $i++) {
  $b = ord($Str[$i]);
  if ($b < 0x80) continue; # 0bbbbbbb
  elseif (($b & 0xE0) == 0xC0) $n=1; # 110bbbbb
  elseif (($b & 0xF0) == 0xE0) $n=2; # 1110bbbb
  elseif (($b & 0xF8) == 0xF0) $n=3; # 11110bbb
  elseif (($b & 0xFC) == 0xF8) $n=4; # 111110bb
  elseif (($b & 0xFE) == 0xFC) $n=5; # 1111110b
  else return false; # Does not match any model
  for ($j=0; $j<$n; $j++) { # n bytes matching 10bbbbbb follow ?
   // fail on a sequence cut off by the end of the string or on a wrong byte
   if ((++$i == $len) || ((ord($Str[$i]) & 0xC0) != 0x80))
    return false;
  }
 }
 return true;
}


/*
 * Romanize a non-latin string
 *
 * Pure ASCII input is returned untouched; anything else is passed through
 * strtr() with the global translation table $UTF8_ROMANIZATION.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string  The string to romanize
 * @return string The string with every key of $UTF8_ROMANIZATION replaced by
 *                its mapped value
*/
function utf8_romanize($string){
  if(utf8_isASCII($string)) return $string; //nothing to do

  // presumably provided by charsets_table.php - verify
  global $UTF8_ROMANIZATION;
  return strtr($string,$UTF8_ROMANIZATION);
}


/*
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * This function adds the controlchars 0x00 to 0x19 to the array of
 * stripped chars (they are not included in $UTF8_SPECIAL_CHARS2)
 *
 * The quoted form of $UTF8_SPECIAL_CHARS2 is computed on the first call and
 * cached in a static variable, so later changes to the global have no effect.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string     The UTF8 string to strip of special chars
 * @param  string $repl       Replace special with this string
 * @param  string $additional Additional chars to strip (used in regexp char class,
 *                            inserted unescaped - the caller must escape them)
 * @return string The result of preg_replace() on $string
*/
function utf8_stripspecials($string,$repl='',$additional=''){
  global $UTF8_SPECIAL_CHARS2;

  static $specials = null;
  if(is_null($specials)){
    $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
  }

  // one character class: caller extras + control chars + the special chars table
  return preg_replace('/['.$additional.'\x00-\x19'.$specials.']/u',$repl,$string);
}


/*

140

 * added functions - thorn

141

*/

142

143

/*
 * faster replacement for utf8_entities_to_umlauts()
 * not all features of utf8_entities_to_umlauts() --> utf8_unhtml() are supported!
 *
 * With mbstring: &amp; &gt; &lt; &quot; &#039; &nbsp; are protected from
 * decoding (&#039; comes back as &#39;), everything else is decoded.
 * Without mbstring: named entities are mapped to numeric ones via the global
 * tables $named_entities / $numbered_entities, then every decimal numeric
 * entity is decoded with code_to_utf8().
 *
 * @author thorn
 * @param  string $str  UTF-8 string which may contain HTML-entities
 * @return string UTF-8 string with the entities decoded
*/
function utf8_fast_entities_to_umlauts($str) {
    if(UTF8_MBSTRING) {
        // we need this for use with mb_convert_encoding
        $str = str_replace(array('&amp;','&gt;','&lt;','&quot;','&#039;','&nbsp;'), array('&amp;amp;','&amp;gt;','&amp;lt;','&amp;quot;','&amp;#39;','&amp;nbsp;'), $str);
        // we need two mb_convert_encoding()-calls - is this a bug?
        // mb_convert_encoding("ö&ouml;", 'UTF-8', 'HTML-ENTITIES'); // with string in utf-8-encoding doesn't work. Result: "Ã¶ö"
        // Work-around: convert all umlauts to entities first ("ö&ouml;"->"&ouml;&ouml;"), then all entities to umlauts ("&ouml;&ouml;"->"öö")
        // NOTE(review): the 'HTML-ENTITIES' mbstring encoding is deprecated as of PHP 8.2
        return(mb_convert_encoding(mb_convert_encoding($str, 'HTML-ENTITIES', 'UTF-8'),'UTF-8', 'HTML-ENTITIES'));
    }

    global $named_entities, $numbered_entities;
    $str = str_replace($named_entities, $numbered_entities, $str);
    // A callback replaces the former /e modifier, which no longer exists in
    // PHP 7+. The explicit int cast also keeps digits with a leading zero
    // decimal instead of having them evaluated as an octal literal.
    $str = preg_replace_callback(
        '/&#([0-9]+);/',
        function($matches) { return code_to_utf8((int)$matches[1]); },
        $str
    );
    return($str);
}


// support-function for utf8_fast_entities_to_umlauts()
/*
 * Encodes a single code point as a UTF-8 byte sequence
 *
 * @param  int|string $num  Code point (numeric strings are accepted)
 * @return string  1 to 4 bytes for values 0..0x1FFFFF, "?" for anything else
*/
function code_to_utf8($num) {
    // the value may arrive as a string captured by a regular expression
    $num = (int)$num;
    if ($num < 0) {
        return "?";
    }
    if ($num <= 0x7F) {
        return chr($num);
    }
    if ($num <= 0x7FF) {
        return chr(($num >> 6) + 192) . chr(($num & 63) + 128);
    }
    if ($num <= 0xFFFF) {
        return chr(($num >> 12) + 224) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
    }
    if ($num <= 0x1FFFFF) {
        return chr(($num >> 18) + 240) . chr((($num >> 12) & 63) + 128) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
    }
    return "?";
}


/*
 * faster replacement for utf8_umlauts_to_entities()
 * not all features of utf8_umlauts_to_entities() --> utf8_tohtml() are supported!
 *
 * With mbstring the whole work is done by mb_convert_encoding() and the
 * $named_entities flag has no influence on that path.
 * Without mbstring every multi-byte sequence is turned into a decimal numeric
 * entity; bytes that cannot start a sequence and sequences cut off by the end
 * of the string are emitted as the numeric entity of that single byte.
 *
 * @author thorn
 * @param  string $string          UTF-8 encoded string
 * @param  bool   $named_entities  true: replace numeric entities by named ones
 *                                 using the global tables $numbered_entities
 *                                 and $named_entities (non-mbstring path only)
 * @return string ASCII string with HTML-entities
*/
function utf8_fast_umlauts_to_entities($string, $named_entities=true) {
    if(UTF8_MBSTRING)
        return(mb_convert_encoding($string, 'HTML-ENTITIES', 'UTF-8'));

    $len = strlen($string);
    if($len == 0) return $string;

    $new = "";
    $i = 0;
    while($i < $len) {
        $lead = ord($string[$i]);
        if($lead <= 127) { // plain ASCII is copied as it is
            $new .= $string[$i++];
            continue;
        }
        // the lead byte tells how many continuation bytes follow
        if($lead >= 192 && $lead <= 223)     { $extra = 1; $ud = $lead - 192; }
        elseif($lead >= 224 && $lead <= 239) { $extra = 2; $ud = $lead - 224; }
        elseif($lead >= 240 && $lead <= 247) { $extra = 3; $ud = $lead - 240; }
        else                                 { $extra = 0; $ud = $lead; } // error!
        if($i + $extra >= $len) { // sequence is cut off: never read past the end
            $extra = 0;
            $ud = $lead;
        }
        $i++;
        for($k = 0; $k < $extra; $k++) {
            // every continuation byte contributes six bits
            $ud = $ud * 64 + (ord($string[$i++]) - 128);
        }
        $new .= "&#$ud;";
    }

    // The translation tables are read through $GLOBALS: a "global $named_entities"
    // statement would overwrite the parameter of the same name.
    if($named_entities && isset($GLOBALS['numbered_entities'], $GLOBALS['named_entities'])) {
        $new = str_replace($GLOBALS['numbered_entities'], $GLOBALS['named_entities'], $new);
    }
    return($new);
}


/*
 * Converts from various charsets to UTF-8
 *
 * Will convert a string from various charsets to UTF-8.
 * HTML-entities may be converted, too.
 * In case of error the returned string is unchanged, and a message is emitted.
 * Supported charsets are:
 * direct: iso_8859_1 iso_8859_2 iso_8859_3 iso_8859_4 iso_8859_5
 *         iso_8859_6 iso_8859_7 iso_8859_8 iso_8859_9 iso_8859_10 iso_8859_11
 * mb_convert_encoding: all wb charsets (except those from 'direct'); but not GB2312
 * iconv:  all wb charsets (except those from 'direct')
 *
 * iconv() is also tried when mb_convert_encoding() fails.
 *
 * @param  string  $str              A string in supported encoding
 * @param  string  $charset_in       The charset to convert from, defaults to DEFAULT_CHARSET;
 *                                   an empty value is treated as UTF-8
 * @param  bool    $decode_entities  Whether HTML-entities are decoded as well
 * @return string  A string in UTF-8-encoding, with all entities decoded, too
 *                 (if $decode_entities is set).
 *                 String is unchanged in case of error.
 * @author thorn
*/
function charset_to_utf8($str, $charset_in=DEFAULT_CHARSET, $decode_entities=true) {
    global $iso_8859_2_to_utf8, $iso_8859_3_to_utf8, $iso_8859_4_to_utf8, $iso_8859_5_to_utf8, $iso_8859_6_to_utf8, $iso_8859_7_to_utf8, $iso_8859_8_to_utf8, $iso_8859_9_to_utf8, $iso_8859_10_to_utf8, $iso_8859_11_to_utf8;
    $charset_in = strtoupper($charset_in);
    if ($charset_in == "") { $charset_in = 'UTF-8'; }
    $has_iconv = function_exists('iconv');
    $error = "Can't convert from $charset_in without mb_convert_encoding() or iconv(). Use UTF-8 instead.";

    if((!$has_iconv && !UTF8_MBSTRING && ($charset_in=='BIG5' || $charset_in=='ISO-2022-JP' || $charset_in=='ISO-2022-KR')) || (!$has_iconv && $charset_in=='GB2312')) {
        // Nothing we can do here :-(
        // Charset is one of those obscure ISO-2022... or BIG5, GB2312 or something
        // and we can't use mb_convert_encoding() or iconv();
        // Emit an error-message.
        trigger_error($error, E_USER_WARNING);
        return($str);
    }

    $result = $str;
    // UTF-8 or a plain ASCII string needs no conversion at all
    $converted = ($charset_in == 'UTF-8' || utf8_isASCII($str));

    // Convert $str to utf8 - known ISO-8859 charsets are handled directly
    if(!$converted && substr($charset_in,0,8) == 'ISO-8859') {
        $tables = array(
            'ISO-8859-2'  => $iso_8859_2_to_utf8,
            'ISO-8859-3'  => $iso_8859_3_to_utf8,
            'ISO-8859-4'  => $iso_8859_4_to_utf8,
            'ISO-8859-5'  => $iso_8859_5_to_utf8,
            'ISO-8859-6'  => $iso_8859_6_to_utf8,
            'ISO-8859-7'  => $iso_8859_7_to_utf8,
            'ISO-8859-8'  => $iso_8859_8_to_utf8,
            'ISO-8859-9'  => $iso_8859_9_to_utf8,
            'ISO-8859-10' => $iso_8859_10_to_utf8,
            'ISO-8859-11' => $iso_8859_11_to_utf8,
        );
        if($charset_in == 'ISO-8859-1') {
            // NOTE(review): utf8_encode() is deprecated as of PHP 8.2
            $result = utf8_encode($str);
            $converted = true;
        } elseif(isset($tables[$charset_in])) {
            $result = strtr($str, $tables[$charset_in]);
            $converted = true;
        }
        // any other ISO-8859-x (or a table which isn't loaded) is left to the generic converters below
    }

    if(!$converted && UTF8_MBSTRING && $charset_in != 'GB2312') {
        // $charset is neither UTF-8 nor a known ISO-8859...
        // Try mb_convert_encoding() - but there's no GB2312 encoding in php's mb_* functions
        try {
            $tmp = mb_convert_encoding($str, 'UTF-8', $charset_in);
        } catch(\ValueError $e) { // PHP 8 throws on an unknown encoding, older versions return false
            $tmp = false;
        }
        if($tmp !== false) {
            $result = $tmp;
            $converted = true;
        }
    }
    if(!$converted && $has_iconv) { // Try iconv
        $tmp = iconv($charset_in, 'UTF-8', $str);
        if($tmp !== false) { // false signals a failed conversion and must not be returned as result
            $result = $tmp;
            $converted = true;
        }
    }

    if($converted) {
        // we have utf-8, now replace HTML-entities and return
        if($decode_entities && preg_match('/&[#0-9a-zA-Z]+;/',$result))
            $result = utf8_fast_entities_to_umlauts($result);
        return($result);
    }

    // Nothing we can do here :-(
    // Charset is one of those obscure ISO-2022... or BIG5, GB2312 or something
    // and we can't use mb_convert_encoding() or iconv();
    // Emit an error-message.
    trigger_error($error, E_USER_WARNING);

    return $str;
}


/*
 * Converts from UTF-8 to various charsets
 *
 * Will convert a string from UTF-8 to various charsets.
 * HTML-entities will not! be converted.
 * In case of error the returned string is unchanged, and a message is emitted.
 * Supported charsets are:
 * direct: iso_8859_1 iso_8859_2 iso_8859_3 iso_8859_4 iso_8859_5
 *         iso_8859_6 iso_8859_7 iso_8859_8 iso_8859_9 iso_8859_10 iso_8859_11
 * mb_convert_encoding: all wb charsets (except those from 'direct'); but not GB2312
 * iconv:  all wb charsets (except those from 'direct')
 *
 * iconv() is also tried when mb_convert_encoding() fails.
 *
 * @param  string  $str          An UTF-8 encoded string
 * @param  string  $charset_out  The charset to convert to, defaults to DEFAULT_CHARSET
 * @return string  A string in a supported encoding.
 *                 String is unchanged in case of error.
 * @author thorn
*/
function utf8_to_charset($str, $charset_out=DEFAULT_CHARSET) {
    global $utf8_to_iso_8859_2, $utf8_to_iso_8859_3, $utf8_to_iso_8859_4, $utf8_to_iso_8859_5, $utf8_to_iso_8859_6, $utf8_to_iso_8859_7, $utf8_to_iso_8859_8, $utf8_to_iso_8859_9, $utf8_to_iso_8859_10, $utf8_to_iso_8859_11;
    $charset_out = strtoupper($charset_out);
    $has_iconv = function_exists('iconv');
    $error = "Can't convert into $charset_out without mb_convert_encoding() or iconv(). Use UTF-8 instead.";

    if((!$has_iconv && !UTF8_MBSTRING && ($charset_out=='BIG5' || $charset_out=='ISO-2022-JP' || $charset_out=='ISO-2022-KR')) || (!$has_iconv && $charset_out=='GB2312')) {
        // Nothing we can do here :-(
        // Charset is one of those obscure ISO-2022... or BIG5, GB2312 or something
        // and we can't use mb_convert_encoding() or iconv();
        // Emit an error-message.
        trigger_error($error, E_USER_WARNING);
        return($str);
    }

    // the string comes from charset_to_utf8(), so we can skip this
    // replace HTML-entities first
    //if(preg_match('/&[#0-9a-zA-Z]+;/',$str))
    //    $str = utf8_entities_to_umlauts($str);

    // check if we need to convert
    if($charset_out == 'UTF-8' || utf8_isASCII($str)) {
        // Nothing to do. Just return
        return($str);
    }

    // Convert $str to $charset_out - known ISO-8859 charsets are handled directly
    if(substr($charset_out,0,8) == 'ISO-8859') {
        $tables = array(
            'ISO-8859-2'  => $utf8_to_iso_8859_2,
            'ISO-8859-3'  => $utf8_to_iso_8859_3,
            'ISO-8859-4'  => $utf8_to_iso_8859_4,
            'ISO-8859-5'  => $utf8_to_iso_8859_5,
            'ISO-8859-6'  => $utf8_to_iso_8859_6,
            'ISO-8859-7'  => $utf8_to_iso_8859_7,
            'ISO-8859-8'  => $utf8_to_iso_8859_8,
            'ISO-8859-9'  => $utf8_to_iso_8859_9,
            'ISO-8859-10' => $utf8_to_iso_8859_10,
            'ISO-8859-11' => $utf8_to_iso_8859_11,
        );
        if($charset_out == 'ISO-8859-1') {
            // NOTE(review): utf8_decode() is deprecated as of PHP 8.2
            return(utf8_decode($str));
        }
        if(isset($tables[$charset_out])) {
            return(strtr($str, $tables[$charset_out]));
        }
        // any other ISO-8859-x (or a table which isn't loaded) is left to the generic converters below
    }

    if(UTF8_MBSTRING && $charset_out != 'GB2312') {
        // $charset is neither UTF-8 nor a known ISO-8859...
        // Try mb_convert_encoding() - but there's no GB2312 encoding in php's mb_* functions
        try {
            $tmp = mb_convert_encoding($str, $charset_out, 'UTF-8');
        } catch(\ValueError $e) { // PHP 8 throws on an unknown encoding, older versions return false
            $tmp = false;
        }
        if($tmp !== false) {
            return($tmp);
        }
    }
    if($has_iconv) { // Try iconv
        $tmp = iconv('UTF-8', $charset_out, $str);
        if($tmp !== false) { // false signals a failed conversion and must not be returned as result
            return($tmp);
        }
    }

    // Nothing we can do here :-(
    // Charset is one of those obscure ISO-2022... or BIG5, GB2312 or something
    // and we can't use mb_convert_encoding() or iconv();
    // Emit an error-message.
    trigger_error($error, E_USER_WARNING);

    return $str;
}


/*
 * convert Filenames to ASCII
 *
 * Convert all non-ASCII characters and all HTML-entities to their plain 7bit equivalents
 * Characters without an equivalent will be converted to hex-values.
 * The name entities_to_7bit() is somewhat misleading, but kept for compatibility-reasons.
 *
 * If the string is not valid UTF-8 after the charset conversion (see
 * utf8_check()), it is returned at that stage without further processing.
 *
 * @param  string  $str  Filename to convert (all encodings from charset_to_utf8() are allowed)
 * @return string  ASCII encoded string, to use as filename in wb's page_filename() and media_filename
 * @author thorn
*/
function entities_to_7bit($str) {
    // convert to UTF-8
    $str = charset_to_utf8($str);
    if(!utf8_check($str))
        return($str);
    // replace some specials
    $str = utf8_stripspecials($str, '_');
    // translate non-ASCII characters to ASCII
    $str = utf8_romanize($str);
    // missed some? - Many UTF-8-chars can't be romanized
    // convert to HTML-entities, and replace entities by hex-numbers
    $str = utf8_fast_umlauts_to_entities($str, false);
    // keep the apostrophe-entity out of the hex conversion; it is removed below
    $str = str_replace('&#039;', '&apos;', $str);
    $str = preg_replace_callback('/&#([0-9]+);/', function($matches) {return dechex((int)$matches[1]);}, $str);
    // maybe there are some &gt; &lt; &apos; &quot; &amp; (or a plain ') left, remove them too
    // NOTE(review): &nbsp; is not part of this list and stays in the string - confirm whether this is intended
    $str = str_replace(array('&gt;', '&lt;', '&apos;', '\'', '&quot;', '&amp;'), '', $str);
    return($str);
}


/*
 * Convert a string from mixed html-entities/umlauts to pure $charset_out-umlauts
 *
 * Will replace all numeric and named entities except
 * &gt; &lt; &apos; &quot; &#039; &nbsp;
 *
 * The input is always read as DEFAULT_CHARSET, decoded (including entities)
 * to UTF-8 by charset_to_utf8() and then converted by utf8_to_charset().
 *
 * @param  string $string       String in DEFAULT_CHARSET, may contain HTML-entities
 * @param  string $charset_out  The charset to convert to, defaults to DEFAULT_CHARSET
 * @return string The string in $charset_out encoding
 * @author thorn
*/
function entities_to_umlauts2($string, $charset_out=DEFAULT_CHARSET) {
    $string = charset_to_utf8($string, DEFAULT_CHARSET, true);
    //if(utf8_check($string)) // this check is too time-consuming (this may fail only if AddDefaultCharset is set)
    $string = utf8_to_charset($string, $charset_out);
    return ($string);
}


/*
 * Convert a string from mixed html-entities/umlauts to pure ASCII with HTML-entities
 *
 * Will convert a string in $charset_in encoding to a pure ASCII string with HTML-entities.
 * Entities already present in the string are not decoded first (charset_to_utf8()
 * is called with $decode_entities = false).
 *
 * @param  string $string      String in $charset_in encoding
 * @param  string $charset_in  The charset to convert from, defaults to DEFAULT_CHARSET
 * @return string The result of utf8_fast_umlauts_to_entities()
 * @author thorn
*/
function umlauts_to_entities2($string, $charset_in=DEFAULT_CHARSET) {
    $string = charset_to_utf8($string, $charset_in, false);
    //if(utf8_check($string)) // this check is too time-consuming (this may fail only if AddDefaultCharset is set)
    $string = utf8_fast_umlauts_to_entities($string, false);
    return($string);
}

Project

General

Profile

WB 2.11.0

wb-2_10_x/trunk/framework/functions-utf8.php @ 26