Project

General

Profile

1
<?php
2
/**
3
 * htmlfilter.inc
4
 * ---------------
5
 * This set of functions allows you to filter html in order to remove
6
 * any malicious tags from it. Useful in cases when you need to filter
7
 * user input for any cross-site-scripting attempts.
8
 *
9
 * Copyright (C) 2002-2004 by Duke University
10
 *
11
 * This library is free software; you can redistribute it and/or
12
 * modify it under the terms of the GNU Lesser General Public
13
 * License as published by the Free Software Foundation; either
14
 * version 2.1 of the License, or (at your option) any later version.
15
 *
16
 * This library is distributed in the hope that it will be useful,
17
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
19
 * Lesser General Public License for more details.
20
 *
21
 * You should have received a copy of the GNU Lesser General Public
22
 * License along with this library; if not, write to the Free Software
23
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
24
 * 02110-1301  USA
25
 *
26
 * @Author	Konstantin Riabitsev <icon@linux.duke.edu>
27
 * @Author  Jim Jagielski <jim@jaguNET.com / jimjag@gmail.com>
28
 * @Version 1.1 ($Date: 2017-07-02 17:14:29 +0200 (Sun, 02 Jul 2017) $)
29
 */
30

    
31
/**
 * Builds the textual representation of a tag from its name, its
 * attributes and its type. Called internally by tln_sanitize.
 *
 * Attribute values are emitted verbatim as name=value, so each value
 * must already carry its own surrounding quotes.
 *
 * @param string $tagname the name of the tag.
 * @param array $attary attribute name => (already quoted) value. A
 *                 non-array or empty value produces no attributes.
 * @param integer $tagtype 2 for a closing tag, 3 for an XHTML-style
 *                 self-closing tag; any other value is printed as an
 *                 opening tag.
 * @return string the final tag representation.
 */
function tln_tagprint($tagname, $attary, $tagtype)
{
    if ($tagtype == 2) {
        return '</' . $tagname . '>';
    }

    $fulltag = '<' . $tagname;
    if (is_array($attary) && count($attary) > 0) {
        $atts = array();
        // foreach instead of each(): each() is deprecated since PHP 7.2
        // and no longer exists in PHP 8.
        foreach ($attary as $attname => $attvalue) {
            $atts[] = "$attname=$attvalue";
        }
        $fulltag .= ' ' . implode(' ', $atts);
    }
    if ($tagtype == 3) {
        $fulltag .= ' /';
    }

    return $fulltag . '>';
}
61

    
62
/**
 * Small helper meant to be used as an array_walk callback: lower-cases
 * the value it receives by reference.
 *
 * @param string $val a value passed by-ref; replaced by its lowercase form.
 * @return void since it modifies a by-ref value.
 */
function tln_casenormalize(&$val)
{
    $val = strtolower($val);
}
73

    
74
/**
 * Skips any whitespace starting at the given position within a string
 * and returns the position of the next non-whitespace character.
 *
 * @param string $body the string
 * @param integer $offset the offset within the string where we should start
 *                 looking for the next non-whitespace character.
 * @return integer the location within $body where the next non-whitespace
 *                 char is located; $offset itself when there is no
 *                 whitespace at that position.
 */
function tln_skipspace($body, $offset)
{
    $matches = array();
    // substr() may return false on older PHP versions when $offset is past
    // the end of the string, hence the cast.
    preg_match('/^(\s*)/s', (string) substr($body, $offset), $matches);

    // $matches[1] is a string: measure it with strlen(). The previous
    // sizeof() call on a string is a TypeError on PHP 8.
    if (isset($matches[1]) && $matches[1] !== '') {
        $offset += strlen($matches[1]);
    }

    return $offset;
}
93

    
94
/**
 * Looks for the next occurrence of a string within another string. It's
 * really just a glorified "strpos", except it reports failures as the
 * length of the searched string instead of false.
 *
 * @param string $body   The string to look for needle in.
 * @param integer $offset Start looking from this position.
 * @param string $needle The character/string to look for.
 * @return integer location of the next occurrence of the needle, or
 *                 strlen($body) if needle wasn't found (including when
 *                 $offset lies beyond the end of $body).
 */
function tln_findnxstr($body, $offset, $needle)
{
    $length = strlen($body);

    // strpos() throws a ValueError on PHP 8 (and warns on earlier versions)
    // when the offset is past the end of the string; treat it as "not found".
    if ($offset > $length) {
        return $length;
    }

    $pos = strpos($body, $needle, $offset);

    return ($pos === false) ? $length : $pos;
}
113

    
114
/**
 * Takes a PCRE-style regexp fragment and tries to match it within the
 * string, starting at the given offset.
 *
 * The fragment is embedded into a pattern delimited by "%", so it must
 * not contain an unescaped "%".
 *
 * @param string $body   The string to look for needle in.
 * @param integer $offset Start looking from here.
 * @param string $reg       A PCRE-style regex to match.
 * @return array|boolean  Returns a false if no matches found (an empty
 *                 match counts as no match), or an array with the
 *                 following members:
 *                 - integer with the location of the match within $body
 *                 - string with whatever content between offset and the match
 *                 - string with whatever it is we matched
 */
function tln_findnxreg($body, $offset, $reg)
{
    $matches = array();
    $preg_rule = '%^(.*?)(' . $reg . ')%s';
    $found = preg_match($preg_rule, (string) substr($body, $offset), $matches);

    // Decide on preg_match()'s return value instead of the truthiness of
    // the matched text: a match consisting solely of "0" is a valid match
    // even though PHP considers that string falsy.
    if ($found !== 1 || $matches[0] === '') {
        return false;
    }

    return array(
        $offset + strlen($matches[1]),
        $matches[1],
        $matches[2]
    );
}
142

    
143
/**
 * Looks for the next tag in $body, starting at $offset, and parses its
 * name, attributes and type.
 *
 * @param string $body   String where to look for the next tag.
 * @param integer $offset Start looking from here.
 * @return array|boolean false if no more tags exist in the body, or
 *                 an array with the following members:
 *                 - string with the name of the tag (lowercased)
 *                 - array with attributes (lowercased names) and their
 *                   quoted values, or false when the tag name is directly
 *                   followed by ">" or "/>"
 *                 - integer with tag type (1, 2, or 3)
 *                 - integer where the tag starts (starting "<")
 *                 - integer where the tag ends (ending ">")
 *                 first three members will be false, if the tag is invalid
 *                 (this includes comments and SGML declarations).
 */
function tln_getnxtag($body, $offset)
{
    if ($offset > strlen($body)) {
        return false;
    }
    $lt = tln_findnxstr($body, $offset, '<');
    if ($lt == strlen($body)) {
        return false;
    }
    /**
     * We are here:
     * blah blah <tag attribute="value">
     * \---------^
     */
    $pos = tln_skipspace($body, $lt + 1);
    if ($pos >= strlen($body)) {
        return array(false, false, false, $lt, strlen($body));
    }
    /**
     * There are 3 kinds of tags:
     * 1. Opening tag, e.g.:
     *    <a href="blah">
     * 2. Closing tag, e.g.:
     *    </a>
     * 3. XHTML-style content-less tag, e.g.:
     *    <img src="blah"/>
     */
    switch (substr($body, $pos, 1)) {
    case '/':
        $tagtype = 2;
        $pos++;
        break;
    case '!':
        /**
         * A comment or an SGML declaration.
         */
        if (substr($body, $pos + 1, 2) == '--') {
            $gt = strpos($body, '-->', $pos);
            if ($gt === false) {
                $gt = strlen($body);
            } else {
                // point at the ">" that terminates "-->"
                $gt += 2;
            }
            return array(false, false, false, $lt, $gt);
        } else {
            $gt = tln_findnxstr($body, $pos, '>');
            return array(false, false, false, $lt, $gt);
        }
        break;
    default:
        /**
         * Assume tagtype 1 for now. If it's type 3, we'll switch values
         * later.
         */
        $tagtype = 1;
        break;
    }

    /**
     * Look for next [\W-_], which will indicate the end of the tag name.
     */
    $regary = tln_findnxreg($body, $pos, '[^\w\-_]');
    if ($regary == false) {
        return array(false, false, false, $lt, strlen($body));
    }
    list($pos, $tagname, $match) = $regary;
    $tagname = strtolower($tagname);

    /**
     * $match can be either of these:
     * '>'  indicating the end of the tag entirely.
     * '\s' indicating the end of the tag name.
     * '/'  indicating that this is type-3 xhtml tag.
     *
     * Whatever else we find there indicates an invalid tag.
     */
    switch ($match) {
    case '/':
        /**
         * This is an xhtml-style tag with a closing / at the
         * end, like so: <img src="blah"/>. Check if it's followed
         * by the closing bracket. If not, then this tag is invalid
         */
        if (substr($body, $pos, 2) == '/>') {
            $pos++;
            $tagtype = 3;
        } else {
            $gt = tln_findnxstr($body, $pos, '>');
            $retary = array(false, false, false, $lt, $gt);
            return $retary;
        }
        //intentional fall-through
    case '>':
        return array($tagname, false, $tagtype, $lt, $pos);
        break;
    default:
        /**
         * Check if it's whitespace
         */
        if (!preg_match('/\s/', $match)) {
            /**
             * This is an invalid tag! Look for the next closing ">".
             */
            $gt = tln_findnxstr($body, $lt, '>');
            return array(false, false, false, $lt, $gt);
        }
        break;
    }

    /**
     * At this point we're here:
     * <tagname  attribute='blah'>
     * \-------^
     *
     * At this point we loop in order to find all attributes.
     */
    $attary = array();

    while ($pos <= strlen($body)) {
        $pos = tln_skipspace($body, $pos);
        if ($pos == strlen($body)) {
            /**
             * Non-closed tag.
             */
            return array(false, false, false, $lt, $pos);
        }
        /**
         * See if we arrived at a ">" or "/>", which means that we reached
         * the end of the tag.
         */
        $matches = array();
        if (preg_match('%^(\s*)(>|/>)%s', substr($body, $pos), $matches)) {
            /**
             * Yep. So we did.
             */
            $pos += strlen($matches[1]);
            if ($matches[2] == '/>') {
                $tagtype = 3;
                $pos++;
            }
            return array($tagname, $attary, $tagtype, $lt, $pos);
        }

        /**
         * There are several types of attributes, with optional
         * [:space:] between members.
         * Type 1:
         *   attrname[:space:]=[:space:]'CDATA'
         * Type 2:
         *   attrname[:space:]=[:space:]"CDATA"
         * Type 3:
         *   attr[:space:]=[:space:]CDATA
         * Type 4:
         *   attrname
         *
         * We leave types 1 and 2 the same, type 3 we check for
         * '"' and convert to "&quot;" if needed, then wrap in
         * double quotes. Type 4 we convert into:
         * attrname="yes".
         */
        $regary = tln_findnxreg($body, $pos, '[^\w\-_]');
        if ($regary == false) {
            /**
             * Looks like body ended before the end of tag.
             */
            return array(false, false, false, $lt, strlen($body));
        }
        list($pos, $attname, $match) = $regary;
        $attname = strtolower($attname);
        /**
         * We arrived at the end of attribute name. Several things possible
         * here:
         * '>'  means the end of the tag and this is attribute type 4
         * '/'  if followed by '>' means the same thing as above
         * '\s' means a lot of things -- look what it's followed by.
         *      anything else means the attribute is invalid.
         */
        switch ($match) {
        case '/':
            /**
             * This is an xhtml-style tag with a closing / at the
             * end, like so: <img src="blah"/>. Check if it's followed
             * by the closing bracket. If not, then this tag is invalid
             */
            if (substr($body, $pos, 2) == '/>') {
                $pos++;
                $tagtype = 3;
            } else {
                $gt = tln_findnxstr($body, $pos, '>');
                $retary = array(false, false, false, $lt, $gt);
                return $retary;
            }
            //intentional fall-through
        case '>':
            // Square brackets: the curly-brace offset syntax is a parse
            // error on PHP 8.
            $attary[$attname] = '"yes"';
            return array($tagname, $attary, $tagtype, $lt, $pos);
            break;
        default:
            /**
             * Skip whitespace and see what we arrive at.
             */
            $pos = tln_skipspace($body, $pos);
            $char = substr($body, $pos, 1);
            /**
             * Two things are valid here:
             * '=' means this is attribute type 1 2 or 3.
             * \w means this was attribute type 4.
             * anything else we ignore and re-loop. End of tag and
             * invalid stuff will be caught by our checks at the beginning
             * of the loop.
             */
            if ($char == '=') {
                $pos++;
                $pos = tln_skipspace($body, $pos);
                /**
                 * Here are 3 possibilities:
                 * "'"  attribute type 1
                 * '"'  attribute type 2
                 * everything else is the content of tag type 3
                 */
                $quot = substr($body, $pos, 1);
                if ($quot == '\'') {
                    $regary = tln_findnxreg($body, $pos + 1, '\'');
                    if ($regary == false) {
                        return array(false, false, false, $lt, strlen($body));
                    }
                    list($pos, $attval, $match) = $regary;
                    $pos++;
                    $attary[$attname] = '\'' . $attval . '\'';
                } elseif ($quot == '"') {
                    $regary = tln_findnxreg($body, $pos + 1, '\"');
                    if ($regary == false) {
                        return array(false, false, false, $lt, strlen($body));
                    }
                    list($pos, $attval, $match) = $regary;
                    $pos++;
                    $attary[$attname] = '"' . $attval . '"';
                } else {
                    /**
                     * These are hateful. Look for \s, or >.
                     */
                    $regary = tln_findnxreg($body, $pos, '[\s>]');
                    if ($regary == false) {
                        return array(false, false, false, $lt, strlen($body));
                    }
                    list($pos, $attval, $match) = $regary;
                    /**
                     * If it's ">" it will be caught at the top.
                     */
                    $attval = preg_replace('/\"/s', '&quot;', $attval);
                    $attary[$attname] = '"' . $attval . '"';
                }
            } elseif (preg_match('|[\w/>]|', $char)) {
                /**
                 * That was attribute type 4.
                 */
                $attary[$attname] = '"yes"';
            } else {
                /**
                 * An illegal character. Find next '>' and return.
                 */
                $gt = tln_findnxstr($body, $pos, '>');
                return array(false, false, false, $lt, $gt);
            }
            break;
        }
    }
    /**
     * The fact that we got here indicates that the tag end was never
     * found. Return invalid tag indication so it gets stripped.
     */
    return array(false, false, false, $lt, strlen($body));
}
430

    
431
/**
 * Translates numeric entities into literal characters so the value can
 * be checked.
 *
 * Every full match of $regex is replaced by the single byte produced by
 * chr() from the number captured in the first group.
 *
 * @param string $attvalue the by-ref value to check; rewritten in place
 *                 when at least one entity was found.
 * @param string $regex    the regular expression to check against; its
 *                 first capture group must hold the numeric value.
 * @param boolean $hex        whether the entities are hexadecimal.
 * @return boolean            True or False depending on whether there were matches.
 */
function tln_deent(&$attvalue, $regex, $hex = false)
{
    $matches = array();
    preg_match_all($regex, $attvalue, $matches);
    if (!is_array($matches) || empty($matches[0])) {
        return false;
    }

    $repl = array();
    foreach ($matches[0] as $i => $entity) {
        $numval = $matches[1][$i];
        if ($hex) {
            $numval = hexdec($numval);
        }
        // Square brackets: the curly-brace offset syntax is a parse error
        // on PHP 8.
        $repl[$entity] = chr($numval);
    }
    $attvalue = strtr($attvalue, $repl);

    return true;
}
457

    
458
/**
 * Decodes entity-encoded and backslash-escaped characters in an
 * attribute value, in place, so that checks can be run against the
 * literal text.
 *
 * Decoding is repeated until a pass finds nothing left to decode, then
 * remaining backslashes are removed with stripslashes().
 *
 * @param string $attvalue A string to run entity check against; modified by-ref.
 * @return void since it modifies a by-ref value.
 */
function tln_defang(&$attvalue)
{
    /**
     * Skip this if there aren't ampersands or backslashes.
     */
    if (strpos($attvalue, '&') === false
        && strpos($attvalue, '\\') === false
    ) {
        return;
    }

    do {
        // Short-circuit on purpose: as soon as one decoder changed the
        // value, the whole sequence is restarted on the next pass.
        $m = tln_deent($attvalue, '/\&#0*(\d+);*/s')
            || tln_deent($attvalue, '/\&#x0*((\d|[a-f])+);*/si', true)
            || tln_deent($attvalue, '/\\\\(\d+)/s', true);
    } while ($m);

    $attvalue = stripslashes($attvalue);
}
483

    
484
/**
 * Kills any tabs, newlines, carriage returns, NUL bytes and spaces in
 * the value, in place. Some browsers treat "java[tab]script" just like
 * "javascript", so the checks have to run on the collapsed form.
 *
 * @param string $attvalue     The attribute value before extraneous spaces removed; modified by-ref.
 * @return void since it modifies a by-ref value.
 */
function tln_unspace(&$attvalue)
{
    // Only rewrite the value when it actually contains one of the characters.
    if (strcspn($attvalue, "\t\r\n\0 ") != strlen($attvalue)) {
        $attvalue = str_replace(
            array("\t", "\r", "\n", "\0", " "),
            '',
            $attvalue
        );
    }
}
501

    
502
/**
 * Runs various checks against the attributes of a tag: removes
 * blacklisted attributes, neutralises bad values, rewrites url()
 * references inside style attributes and appends forced attributes.
 *
 * @param string $tagname            String with the name of the tag.
 * @param array $attary            Array with all tag attributes.
 * @param array $rm_attnames        tag regex => list of attribute-name
 *                 regexes; matching attributes are dropped.
 * @param array $bad_attvals        tag regex => (attribute-name regex =>
 *                 array(patterns, replacements)) applied to the value
 *                 with preg_replace.
 * @param array $add_attr_to_tag tag regex => attributes merged into the
 *                 result for matching tags.
 * @param string $trans_image_path passed through to tln_fixurl.
 * @param boolean $block_external_images passed through to tln_fixurl.
 * @return array with modified attributes.
 */
function tln_fixatts(
    $tagname,
    $attary,
    $rm_attnames,
    $bad_attvals,
    $add_attr_to_tag,
    $trans_image_path,
    $block_external_images
) {
    // foreach works on a snapshot of $attary, so entries can be changed or
    // removed while iterating (each() no longer exists in PHP 8).
    foreach ($attary as $attname => $attvalue) {
        /**
         * See if this attribute should be removed.
         */
        foreach ($rm_attnames as $matchtag => $matchattrs) {
            if (preg_match($matchtag, $tagname)) {
                foreach ($matchattrs as $matchattr) {
                    if (preg_match($matchattr, $attname)) {
                        unset($attary[$attname]);
                        // Move on to the next attribute. A plain "continue"
                        // would only leave the innermost loop and let the
                        // checks below re-add the attribute just removed.
                        continue 3;
                    }
                }
            }
        }
        /**
         * Remove any backslashes, entities, or extraneous whitespace.
         */
        $oldattvalue = $attvalue;
        tln_defang($attvalue);
        if ($attname == 'style' && $attvalue !== $oldattvalue) {
            // An encoded style value is treated as hostile and replaced
            // wholesale.
            $attvalue = "idiocy";
            $attary[$attname] = $attvalue;
        }
        tln_unspace($attvalue);

        /**
         * Now let's run checks on the attvalues.
         */
        foreach ($bad_attvals as $matchtag => $matchattrs) {
            if (preg_match($matchtag, $tagname)) {
                foreach ($matchattrs as $matchattr => $valary) {
                    if (preg_match($matchattr, $attname)) {
                        /**
                         * There are two arrays in valary.
                         * First is matches.
                         * Second one is replacements
                         */
                        list($valmatch, $valrepl) = $valary;
                        $newvalue = preg_replace($valmatch, $valrepl, $attvalue);
                        if ($newvalue != $attvalue) {
                            $attary[$attname] = $newvalue;
                            $attvalue = $newvalue;
                        }
                    }
                }
            }
        }
        if ($attname == 'style') {
            if (preg_match('/[\0-\37\200-\377]+/', $attvalue)) {
                $attary[$attname] = '"disallowed character"';
            }
            preg_match_all("/url\s*\((.+)\)/si", $attvalue, $aMatch);
            if (count($aMatch)) {
                foreach ($aMatch[1] as $sMatch) {
                    $urlvalue = $sMatch;
                    tln_fixurl($attname, $urlvalue, $trans_image_path, $block_external_images);
                    $attary[$attname] = str_replace($sMatch, $urlvalue, $attvalue);
                }
            }
        }
    }
    /**
     * See if we need to append any attributes to this tag.
     */
    foreach ($add_attr_to_tag as $matchtag => $addattary) {
        if (preg_match($matchtag, $tagname)) {
            $attary = array_merge($attary, $addattary);
        }
    }
    return $attary;
}
597

    
598
/**
 * Sanitizes a URL value in place.
 *
 * Surrounding quotes are stripped first and remembered; every value this
 * function writes back is wrapped in that quote character (a double
 * quote when the input was unquoted). Anything considered unsafe is
 * replaced by $trans_image_path, or by a placeholder URL for href values
 * containing control or 8-bit characters.
 *
 * @param string $attname the attribute the URL belongs to; 'href' values
 *                 are allowed to keep external http/https/ftp/mailto URLs.
 * @param string $attvalue the by-ref URL value to check and rewrite.
 * @param string $trans_image_path replacement used for rejected URLs.
 * @param boolean $block_external_images when true, external URLs of
 *                 non-href attributes are replaced as well.
 * @return void since it modifies a by-ref value.
 */
function tln_fixurl($attname, &$attvalue, $trans_image_path, $block_external_images)
{
    $sQuote = '"';
    $attvalue = trim($attvalue);
    if ($attvalue && ($attvalue[0] == '"' || $attvalue[0] == "'")) {
        // remove the surrounding quotes, remembering which kind was used
        $sQuote = $attvalue[0];
        $attvalue = trim((string) substr($attvalue, 1, -1));
    }

    $sBlank = $sQuote . $trans_image_path . $sQuote;

    /**
     * Replace empty src tags with the blank image.  src is only used
     * for frames, images, and image inputs.  Doing a replace should
     * not affect them working as should be, however it will stop
     * IE from being kicked off when src for img tags are not set
     */
    if ($attvalue === '') {
        $attvalue = $sBlank;
        return;
    }

    // first, disallow 8 bit characters and control characters
    if (preg_match('/[\0-\37\200-\377]+/', $attvalue)) {
        if ($attname == 'href') {
            $attvalue = $sQuote . 'http://invalid-stuff-detected.example.com' . $sQuote;
        } else {
            $attvalue = $sBlank;
        }
        return;
    }

    $aUrl = parse_url($attvalue);
    if (!isset($aUrl['scheme'])) {
        if (!isset($aUrl['path']) || $aUrl['path'] != $trans_image_path) {
            // Assign to $attvalue itself. The previous "$$attvalue" was a
            // variable-variable: it left the URL untouched and could even
            // raise "Cannot re-assign $this" for the input "this".
            $attvalue = $sBlank;
        }
        return;
    }

    switch (strtolower($aUrl['scheme'])) {
        case 'mailto':
        case 'http':
        case 'https':
        case 'ftp':
            if ($attname != 'href') {
                // External resource: replaced when blocking is on or when
                // the URL has no path; otherwise left as is (unquoted).
                if ($block_external_images == true || !isset($aUrl['path'])) {
                    $attvalue = $sBlank;
                }
            } else {
                $attvalue = $sQuote . $attvalue . $sQuote;
            }
            break;
        case 'outbind':
        case 'cid':
            $attvalue = $sQuote . $attvalue . $sQuote;
            break;
        default:
            $attvalue = $sBlank;
            break;
    }
}
665

    
666
/**
 * Extracts and sanitizes the content of a <style> block.
 *
 * Scans $body from $pos up to the closing </style> tag (skipping HTML
 * comments, so a "</style>" inside a comment does not end the block),
 * then neutralises dangerous CSS constructs in the collected content.
 *
 * @param string $body the whole HTML body.
 * @param integer $pos position right after the opening <style> tag.
 * @param string $trans_image_path passed through to tln_fixurl.
 * @param boolean $block_external_images passed through to tln_fixurl.
 * @return array with two members:
 *                 - string with the sanitized style content, or false
 *                   when no closing </style> tag was found
 *                 - integer position right after the closing tag, or
 *                   strlen($body) when no closing tag was found
 */
function tln_fixstyle($body, $pos, $trans_image_path, $block_external_images)
{
    // workaround for </style> in between comments
    $content = '';
    $sToken = '';
    $bSucces = false;
    $bEndTag = false;
    for ($i = $pos, $iCount = strlen($body); $i < $iCount; ++$i) {
        // Square brackets: the curly-brace offset syntax is a parse error
        // on PHP 8.
        $char = $body[$i];
        switch ($char) {
            case '<':
                $sToken = $char;
                break;
            case '/':
                if ($sToken == '<') {
                    $sToken .= $char;
                    $bEndTag = true;
                } else {
                    $content .= $char;
                }
                break;
            case '>':
                if ($bEndTag) {
                    $sToken .= $char;
                    if (preg_match('/\<\/\s*style\s*\>/i', $sToken, $aMatch)) {
                        $newpos = $i + 1;
                        $bSucces = true;
                        break 2;
                    } else {
                        $content .= $sToken;
                    }
                    $bEndTag = false;
                } else {
                    $content .= $char;
                }
                break;
            case '!':
                if ($sToken == '<') {
                    // possible comment
                    if (isset($body[$i + 2]) && substr($body, $i, 3) == '!--') {
                        $i = strpos($body, '-->', $i + 3);
                        if ($i === false) { // no end comment
                            $i = strlen($body);
                        }
                        $sToken = '';
                    }
                } else {
                    $content .= $char;
                }
                break;
            default:
                if ($bEndTag) {
                    $sToken .= $char;
                } else {
                    $content .= $char;
                }
                break;
        }
    }
    if ($bSucces == false) {
        return array(false, strlen($body));
    }

    /**
     * First look for general BODY style declaration, which would be
     * like so:
     * body {background: blah-blah}
     * and change it to .bodyclass so we can just assign it to a <div>
     */
    $content = preg_replace("|body(\s*\{.*?\})|si", ".bodyclass\\1", $content);

    // first check for 8bit sequences and disallowed control characters
    if (preg_match('/[\16-\37\200-\377]+/', $content)) {
        $content = '<!-- style block removed by html filter due to presence of 8bit characters -->';
        return array($content, $newpos);
    }

    // remove @import line
    $content = preg_replace("/^\s*(@import.*)$/mi", "\n<!-- @import rules forbidden -->\n", $content);

    /**
     * Fix url('blah') declarations: normalise backslash-obfuscated "url"
     * keywords first, then sanitize every referenced URL.
     */
    $content = preg_replace("/(\\\\)?u(\\\\)?r(\\\\)?l(\\\\)?/i", 'url', $content);
    preg_match_all("/url\s*\((.+)\)/si", $content, $aMatch);
    if (count($aMatch)) {
        $aValue = $aReplace = array();
        foreach ($aMatch[1] as $sMatch) {
            // url value
            $urlvalue = $sMatch;
            tln_fixurl('style', $urlvalue, $trans_image_path, $block_external_images);
            $aValue[] = $sMatch;
            $aReplace[] = $urlvalue;
        }
        $content = str_replace($aValue, $aReplace, $content);
    }

    /**
     * Remove any backslashes, entities, and extraneous whitespace.
     */
    $contentTemp = $content;
    tln_defang($contentTemp);
    tln_unspace($contentTemp);

    $match   = array('/\/\*.*\*\//',
                    '/expression/i',
                    '/behaviou*r/i',
                    '/binding/i',
                    '/include-source/i',
                    '/javascript/i',
                    '/script/i',
                    '/position/i');
    $replace = array('', 'idiocy', 'idiocy', 'idiocy', 'idiocy', 'idiocy', 'idiocy', '');
    $contentNew = preg_replace($match, $replace, $contentTemp);
    // Only switch to the decoded/collapsed form when a dangerous construct
    // was actually found in it; otherwise keep the readable content.
    if ($contentNew !== $contentTemp) {
        $content = $contentNew;
    }
    return array($content, $newpos);
}
790

    
791
/**
 * Converts the attributes of a <body> tag into attributes for the <div>
 * that stands in for it.
 *
 * The result always carries class='bodyclass'. The background, bgcolor
 * and text attributes are translated into an inline style; any
 * background image is replaced by $trans_image_path.
 *
 * @param array $attary attributes of the body tag (name => quoted value).
 * @param string $trans_image_path image used instead of the original
 *                 background image.
 * @return array attributes for the replacement div.
 */
function tln_body2div($attary, $trans_image_path)
{
    $divattary = array('class' => "'bodyclass'");
    $text = '#000000';
    $has_bgc_stl = $has_txt_stl = false;
    $styledef = '';
    if (is_array($attary) && count($attary) > 0) {
        foreach ($attary as $attname => $attvalue) {
            // The first character is taken to be the quote character and
            // is stripped from the whole value.
            $quotchar = substr($attvalue, 0, 1);
            $attvalue = str_replace($quotchar, "", $attvalue);
            switch ($attname) {
                case 'background':
                    $styledef .= "background-image: url('$trans_image_path'); ";
                    break;
                case 'bgcolor':
                    $has_bgc_stl = true;
                    $styledef .= "background-color: $attvalue; ";
                    break;
                case 'text':
                    $has_txt_stl = true;
                    $styledef .= "color: $attvalue; ";
                    break;
            }
        }
        // Outlook defines a white bgcolor and no text color. This can lead to
        // white text on a white bg with certain themes.
        if ($has_bgc_stl && !$has_txt_stl) {
            $styledef .= "color: $text; ";
        }
        if (strlen($styledef) > 0) {
            // Square brackets: the curly-brace offset syntax is a parse
            // error on PHP 8.
            $divattary['style'] = "\"$styledef\"";
        }
    }
    return $divattary;
}
826

    
827
/**
 * Main entry point of the filter: walks through $body one tag at a time and
 * rebuilds it into a "trusted" string, dropping or rewriting tags and
 * attributes according to the rule sets passed in.
 *
 * The result is wrapped in "begin/end tln_sanitized html" HTML comments.
 * A <body> tag is converted into a <div> (its attributes are translated by
 * tln_body2div), and <style> blocks are handed to tln_fixstyle.
 *
 * @param string $body                    The HTML you wish to filter
 * @param array $tag_list                 The first element is a boolean flag
 *                                        that is shifted off the list:
 *                                        false means the remaining tag names
 *                                        are removed, true means only the
 *                                        remaining tag names are kept.
 * @param array $rm_tags_with_content     Opening tags with these names are
 *                                        removed together with everything up
 *                                        to their matching closing tag.
 * @param array $self_closing_tags        Opening tags with these names are
 *                                        treated as self-closing (tag type 3)
 *                                        and are not tracked as open.
 * @param boolean $force_tag_closing      When true, closing tags are appended
 *                                        for every tag still counted as open
 *                                        once the whole body was processed.
 * @param array $rm_attnames              Passed through to tln_fixatts.
 * @param array $bad_attvals              Passed through to tln_fixatts.
 * @param array $add_attr_to_tag          Passed through to tln_fixatts.
 * @param string $trans_image_path        Passed through to tln_fixstyle,
 *                                        tln_fixatts and tln_body2div.
 * @param boolean $block_external_images  Passed through to tln_fixstyle and
 *                                        tln_fixatts.
 *
 * @return string                       Sanitized html safe to show on your pages.
 */
function tln_sanitize(
    $body,
    $tag_list,
    $rm_tags_with_content,
    $self_closing_tags,
    $force_tag_closing,
    $rm_attnames,
    $bad_attvals,
    $add_attr_to_tag,
    $trans_image_path,
    $block_external_images
) {
    /**
     * The first element of tag_list tells whether it is a list of tags to
     * remove or a list of tags to allow.
     * false  means remove these tags
     * true   means allow these tags
     */
    $rm_tags = array_shift($tag_list);
    /**
     * Normalize the case of all tag name lists.
     */
    @array_walk($tag_list, 'tln_casenormalize');
    @array_walk($rm_tags_with_content, 'tln_casenormalize');
    @array_walk($self_closing_tags, 'tln_casenormalize');

    $curpos = 0;
    // Number of currently open (not yet closed) tags, keyed by tag name.
    $open_tags = array();
    $trusted = "<!-- begin tln_sanitized html -->\n";
    // Either false, or the name of the tag whose content is being dropped.
    $skip_content = false;
    /**
     * Take care of netscape's stupid javascript entities like
     * &{alert('boo')};
     */
    $body = preg_replace('/&(\{.*?\};)/si', '&amp;\\1', $body);
    while (($curtag = tln_getnxtag($body, $curpos)) != false) {
        // $lt and $gt are used below as the offsets where the tag starts
        // and ends inside $body.
        list($tagname, $attary, $tagtype, $lt, $gt) = $curtag;
        // Text between the previous tag and this one.
        $free_content = substr($body, $curpos, $lt-$curpos);
        /**
         * Take care of <style>
         */
        if ($tagname == "style" && $tagtype == 1){
            // $free_content is about to be replaced by the style sheet
            // body, so the text that precedes the <style> tag has to be
            // emitted now or it would be lost.
            if ($skip_content == false){
                $trusted .= $free_content;
            }
            list($free_content, $curpos) =
                tln_fixstyle($body, $gt+1, $trans_image_path, $block_external_images);
            if ($free_content != FALSE){
                if ( !empty($attary) ) {
                    $attary = tln_fixatts($tagname,
                                         $attary,
                                         $rm_attnames,
                                         $bad_attvals,
                                         $add_attr_to_tag,
                                         $trans_image_path,
                                         $block_external_images
                                         );
                }
                $trusted .= tln_tagprint($tagname, $attary, $tagtype);
                $trusted .= $free_content;
                $trusted .= tln_tagprint($tagname, null, 2);
            }
            continue;
        }
        if ($skip_content == false){
            $trusted .= $free_content;
        }
        if ($tagname != false) {
            if ($tagtype == 2) {
                if ($skip_content == $tagname) {
                    /**
                     * Got to the end of tag we needed to remove.
                     */
                    $tagname = false;
                    $skip_content = false;
                } else {
                    if ($skip_content == false) {
                        // <body> was opened as <div>, close it as <div>.
                        if ($tagname == "body") {
                            $tagname = "div";
                        }
                        // Only print a closing tag that matches a tag we
                        // have actually opened; drop stray closing tags.
                        if (isset($open_tags[$tagname]) &&
                            $open_tags[$tagname] > 0
                        ) {
                            $open_tags[$tagname]--;
                        } else {
                            $tagname = false;
                        }
                    }
                }
            } else {
                /**
                 * $rm_tags_with_content
                 */
                if ($skip_content == false) {
                    /**
                     * See if this is a self-closing type and change
                     * tagtype appropriately.
                     */
                    if ($tagtype == 1
                        && in_array($tagname, $self_closing_tags)
                    ) {
                        $tagtype = 3;
                    }
                    /**
                     * See if we should skip this tag and any content
                     * inside it.
                     */
                    if ($tagtype == 1
                        && in_array($tagname, $rm_tags_with_content)
                    ) {
                        $skip_content = $tagname;
                    } else {
                        if (($rm_tags == false
                             && in_array($tagname, $tag_list)) ||
                            ($rm_tags == true
                                && !in_array($tagname, $tag_list))
                        ) {
                            $tagname = false;
                        } else {
                            /**
                             * Convert body into div.
                             */
                            if ($tagname == "body"){
                                $tagname = "div";
                                $attary = tln_body2div($attary, $trans_image_path);
                            }
                            if ($tagtype == 1) {
                                if (isset($open_tags[$tagname])) {
                                    $open_tags[$tagname]++;
                                } else {
                                    $open_tags[$tagname] = 1;
                                }
                            }
                            /**
                             * This is where we run other checks.
                             */
                            if (is_array($attary) && sizeof($attary) > 0) {
                                $attary = tln_fixatts(
                                    $tagname,
                                    $attary,
                                    $rm_attnames,
                                    $bad_attvals,
                                    $add_attr_to_tag,
                                    $trans_image_path,
                                    $block_external_images
                                );
                            }
                        }
                    }
                }
            }
            if ($tagname != false && $skip_content == false) {
                $trusted .= tln_tagprint($tagname, $attary, $tagtype);
            }
        }
        $curpos = $gt + 1;
    }
    // Whatever follows the last tag.
    $trusted .= substr($body, $curpos, strlen($body) - $curpos);
    if ($force_tag_closing == true) {
        foreach ($open_tags as $tagname => $opentimes) {
            while ($opentimes > 0) {
                $trusted .= '</' . $tagname . '>';
                $opentimes--;
            }
        }
        $trusted .= "\n";
    }
    $trusted .= "<!-- end tln_sanitized html -->\n";
    return $trusted;
}
1007

    
1008
//
// Use the nifty htmlfilter library
//

/**
 * Convenience wrapper around tln_sanitize: builds a fixed set of filtering
 * rules and runs $body through them.
 *
 * The rules built here are:
 *  - a list of tags to remove (the leading false marks it as a removal list),
 *  - a list of tags to remove together with their content,
 *  - a list of tags to treat as self-closing,
 *  - attribute name patterns to remove from every tag,
 *  - attribute value patterns with their replacements, for attribute names
 *    matching the src/background, href/action and style patterns,
 *  - a target="_blank" attribute to add to <a> tags.
 * Closing of unclosed tags is always requested.
 *
 * @param string $body                    The HTML you wish to filter
 * @param string $trans_image_path        Path used as the replacement for
 *                                        rejected src/background values and,
 *                                        when external images are blocked,
 *                                        for http(s) urls in style values.
 * @param boolean $block_external_images  When true, extra rules replacing
 *                                        http(s) urls in src/background and
 *                                        style attribute values are added.
 *
 * @return string The value returned by tln_sanitize for $body.
 */
function HTMLFilter($body, $trans_image_path, $block_external_images = false)
{

    // The leading false tells tln_sanitize this is a list of tags to remove.
    $tag_list = array(
        false,
        "object",
        "meta",
        "html",
        "head",
        "base",
        "link",
        "frame",
        "iframe",
        "plaintext",
        "marquee"
    );

    $rm_tags_with_content = array(
        "script",
        "applet",
        "embed",
        "title",
        "frameset",
        "xmp",
        "xml"
    );

    $self_closing_tags =  array(
        "img",
        "br",
        "hr",
        "input",
        "outbind"
    );

    $force_tag_closing = true;

    // Tag name pattern => list of attribute name patterns to remove.
    $rm_attnames = array(
        "/.*/" =>
            array(
                // "/target/i",
                "/^on.*/i",
                "/^dynsrc/i",
                "/^data.*/i",
                "/^lowsrc.*/i"
            )
    );

    // Tag name pattern => attribute name pattern =>
    //     array(list of value patterns, list of matching replacements).
    $bad_attvals = array(
        "/.*/" =>
        array(
            "/^src|background/i" =>
            array(
                array(
                    '/^([\'"])\s*\S+script\s*:.*([\'"])/si',
                    '/^([\'"])\s*mocha\s*:*.*([\'"])/si',
                    '/^([\'"])\s*about\s*:.*([\'"])/si'
                ),
                array(
                    "\\1$trans_image_path\\2",
                    "\\1$trans_image_path\\2",
                    "\\1$trans_image_path\\2"
                )
            ),
            "/^href|action/i" =>
            array(
                array(
                    '/^([\'"])\s*\S+script\s*:.*([\'"])/si',
                    '/^([\'"])\s*mocha\s*:*.*([\'"])/si',
                    '/^([\'"])\s*about\s*:.*([\'"])/si'
                ),
                array(
                    "\\1#\\1",
                    "\\1#\\1",
                    "\\1#\\1"
                )
            ),
            "/^style/i" =>
            array(
                array(
                    "/\/\*.*\*\//",
                    "/expression/i",
                    "/binding/i",
                    "/behaviou*r/i",
                    "/include-source/i",
                    '/position\s*:/i',
                    '/(\\\\)?u(\\\\)?r(\\\\)?l(\\\\)?/i',
                    '/url\s*\(\s*([\'"])\s*\S+script\s*:.*([\'"])\s*\)/si',
                    '/url\s*\(\s*([\'"])\s*mocha\s*:.*([\'"])\s*\)/si',
                    '/url\s*\(\s*([\'"])\s*about\s*:.*([\'"])\s*\)/si',
                    '/(.*)\s*:\s*url\s*\(\s*([\'"]*)\s*\S+script\s*:.*([\'"]*)\s*\)/si'
                ),
                array(
                    "",
                    "idiocy",
                    "idiocy",
                    "idiocy",
                    "idiocy",
                    "idiocy",
                    "url",
                    "url(\\1#\\1)",
                    "url(\\1#\\1)",
                    "url(\\1#\\1)",
                    "\\1:url(\\2#\\3)"
                )
            )
        )
    );

    if ($block_external_images) {
        // Square brackets are required here: the curly brace offset syntax
        // is a parse error as of PHP 8.0.
        // Patterns and replacements are kept in parallel lists, so each
        // pattern pushed must be paired with a replacement.
        array_push(
            $bad_attvals['/.*/']['/^src|background/i'][0],
            '/^([\'\"])\s*https*:.*([\'\"])/si'
        );
        array_push(
            $bad_attvals['/.*/']['/^src|background/i'][1],
            "\\1$trans_image_path\\1"
        );
        array_push(
            $bad_attvals['/.*/']['/^style/i'][0],
            '/url\(([\'\"])\s*https*:.*([\'\"])\)/si'
        );
        array_push(
            $bad_attvals['/.*/']['/^style/i'][1],
            "url(\\1$trans_image_path\\1)"
        );
    }

    // Tag name pattern => attributes to add to matching tags.
    $add_attr_to_tag = array(
        "/^a$/i" =>
            array('target' => '"_blank"')
    );

    $trusted = tln_sanitize(
        $body,
        $tag_list,
        $rm_tags_with_content,
        $self_closing_tags,
        $force_tag_closing,
        $rm_attnames,
        $bad_attvals,
        $add_attr_to_tag,
        $trans_image_path,
        $block_external_images
    );
    return $trusted;
}
(3-3/4)