1
|
<?php
|
2
|
/**
|
3
|
* htmlfilter.inc
|
4
|
* ---------------
|
5
|
* This set of functions allows you to filter html in order to remove
|
6
|
* any malicious tags from it. Useful in cases when you need to filter
|
7
|
* user input for any cross-site-scripting attempts.
|
8
|
*
|
9
|
* Copyright (C) 2002-2004 by Duke University
|
10
|
*
|
11
|
* This library is free software; you can redistribute it and/or
|
12
|
* modify it under the terms of the GNU Lesser General Public
|
13
|
* License as published by the Free Software Foundation; either
|
14
|
* version 2.1 of the License, or (at your option) any later version.
|
15
|
*
|
16
|
* This library is distributed in the hope that it will be useful,
|
17
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
18
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
19
|
* Lesser General Public License for more details.
|
20
|
*
|
21
|
* You should have received a copy of the GNU Lesser General Public
|
22
|
* License along with this library; if not, write to the Free Software
|
23
|
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
24
|
* 02110-1301 USA
|
25
|
*
|
26
|
* @Author Konstantin Riabitsev <icon@linux.duke.edu>
|
27
|
* @Author Jim Jagielski <jim@jaguNET.com / jimjag@gmail.com>
|
28
|
* @Version 1.1 ($Date: 2017-07-02 17:14:29 +0200 (Sun, 02 Jul 2017) $)
|
29
|
*/
|
30
|
|
31
|
/**
 * Builds the textual form of a tag from its name, its attributes and its
 * type. This function is called by tln_sanitize internally.
 *
 * Attribute values are written out verbatim as name=value, so they must
 * already carry their own quotes (e.g. '"value"').
 *
 * @param string $tagname the name of the tag.
 * @param array $attary map of attribute name => (already quoted) value;
 *                      anything that is not a non-empty array is ignored.
 * @param integer $tagtype The type of the tag: 1 = opening tag,
 *                      2 = closing tag, 3 = XHTML-style self-closing tag.
 * @return string A string with the final tag representation.
 */
function tln_tagprint($tagname, $attary, $tagtype)
{
    // Closing tags never carry attributes.
    if ($tagtype == 2) {
        return '</' . $tagname . '>';
    }

    $fulltag = '<' . $tagname;
    if (is_array($attary) && count($attary) > 0) {
        $atts = array();
        // foreach instead of each(): each() was deprecated in PHP 7.2 and
        // removed in PHP 8.0.
        foreach ($attary as $attname => $attvalue) {
            $atts[] = "$attname=$attvalue";
        }
        $fulltag .= ' ' . implode(' ', $atts);
    }
    if ($tagtype == 3) {
        $fulltag .= ' /';
    }

    return $fulltag . '>';
}
|
61
|
|
62
|
/**
 * Callback for array_walk: lower-cases the element it is handed, in place.
 *
 * @param string $val the value to normalize, passed by-ref.
 * @return void the result is written back through the reference.
 */
function tln_casenormalize(&$val)
{
    $lowered = strtolower($val);
    $val = $lowered;
}
|
73
|
|
74
|
/**
 * Skips any whitespace found at the given position within a string and
 * returns the position of the next non-whitespace character.
 *
 * @param string $body the string
 * @param integer $offset the offset within the string where we should start
 *                        looking for the next non-whitespace character.
 * @return integer the location within the $body where the next
 *                 non-whitespace char is located; $offset itself when the
 *                 character at $offset is not whitespace.
 */
function tln_skipspace($body, $offset)
{
    $matches = array();
    // Only advance when there really is leading whitespace. The previous
    // code called sizeof() on the matched string, which raises a warning
    // on PHP 7.2+ and a TypeError on PHP 8.
    if (preg_match('/^\s+/', (string) substr($body, $offset), $matches)) {
        $offset += strlen($matches[0]);
    }

    return $offset;
}
|
93
|
|
94
|
/**
 * Finds the next occurrence of a string inside another string. A thin
 * wrapper around strpos() that reports "not found" as the length of the
 * haystack instead of false.
 *
 * @param string $body The string to look for needle in.
 * @param integer $offset Start looking from this position.
 * @param string $needle The character/string to look for.
 * @return integer location of the next occurrence of the needle, or
 *                 strlen($body) if needle wasn't found.
 */
function tln_findnxstr($body, $offset, $needle)
{
    $found = strpos($body, $needle, $offset);

    return ($found === false) ? strlen($body) : $found;
}
|
113
|
|
114
|
/**
 * Takes a PCRE-style regexp fragment and looks for its first match at or
 * after the given offset.
 *
 * The fragment is embedded into the pattern '%^(.*?)(' . $reg . ')%s', so
 * it must not contain an unescaped '%'.
 *
 * @param string $body The string to look for needle in.
 * @param integer $offset Start looking from here.
 * @param string $reg A PCRE-style regex to match.
 * @return array|boolean Returns a false if no matches found, or an array
 *                       with the following members:
 *                       - integer with the location of the match within $body
 *                       - string with whatever content between offset and the match
 *                       - string with whatever it is we matched
 */
function tln_findnxreg($body, $offset, $reg)
{
    $matches = array();
    $preg_rule = '%^(.*?)(' . $reg . ')%s';
    $found = preg_match($preg_rule, (string) substr($body, $offset), $matches);

    // An empty overall match still counts as "nothing found", as before.
    // The comparison is explicit because a plain truthiness test would also
    // discard a legitimate match whose text is exactly "0".
    if (!$found || $matches[0] === '') {
        return false;
    }

    return array(
        $offset + strlen($matches[1]),
        $matches[1],
        $matches[2]
    );
}
|
142
|
|
143
|
/**
 * Locates and parses the next tag in $body, starting the search at $offset.
 *
 * @param string $body String where to look for the next tag.
 * @param integer $offset Start looking from here.
 * @return array|boolean false if no more tags exist in the body, or
 *                       an array with the following members:
 *                       - string with the (lower-cased) name of the tag
 *                       - array with attributes and their (quoted) values,
 *                         or false when the tag ends right after its name
 *                       - integer with tag type (1, 2, or 3)
 *                       - integer where the tag starts (starting "<")
 *                       - integer where the tag ends (ending ">")
 *                       first three members will be false, if the tag is invalid.
 */
function tln_getnxtag($body, $offset)
{
    $bodylen = strlen($body);
    if ($offset > $bodylen) {
        return false;
    }
    $lt = tln_findnxstr($body, $offset, '<');
    if ($lt == $bodylen) {
        return false;
    }
    /**
     * We are here:
     * blah blah <tag attribute="value">
     * \---------^
     */
    $pos = tln_skipspace($body, $lt + 1);
    if ($pos >= $bodylen) {
        return array(false, false, false, $lt, $bodylen);
    }
    /**
     * There are 3 kinds of tags:
     * 1. Opening tag, e.g.:
     *    <a href="blah">
     * 2. Closing tag, e.g.:
     *    </a>
     * 3. XHTML-style content-less tag, e.g.:
     *    <img src="blah"/>
     */
    switch (substr($body, $pos, 1)) {
        case '/':
            $tagtype = 2;
            $pos++;
            break;
        case '!':
            /**
             * A comment or an SGML declaration. Either way it is reported
             * as an invalid tag spanning up to its terminator.
             */
            if (substr($body, $pos + 1, 2) == '--') {
                $gt = strpos($body, '-->', $pos);
                if ($gt === false) {
                    $gt = $bodylen;
                } else {
                    // Point at the ">" of "-->".
                    $gt += 2;
                }
            } else {
                $gt = tln_findnxstr($body, $pos, '>');
            }
            return array(false, false, false, $lt, $gt);
        default:
            /**
             * Assume tagtype 1 for now. If it's type 3, we'll switch values
             * later.
             */
            $tagtype = 1;
            break;
    }

    /**
     * Look for next [\W-_], which will indicate the end of the tag name.
     */
    $regary = tln_findnxreg($body, $pos, '[^\w\-_]');
    if ($regary == false) {
        return array(false, false, false, $lt, $bodylen);
    }
    list($pos, $tagname, $match) = $regary;
    $tagname = strtolower($tagname);

    /**
     * $match can be either of these:
     * '>'  indicating the end of the tag entirely.
     * '\s' indicating the end of the tag name.
     * '/'  indicating that this is type-3 xhtml tag.
     *
     * Whatever else we find there indicates an invalid tag.
     */
    switch ($match) {
        case '/':
            /**
             * This is an xhtml-style tag with a closing / at the
             * end, like so: <img src="blah"/>. Check if it's followed
             * by the closing bracket. If not, then this tag is invalid
             */
            if (substr($body, $pos, 2) == '/>') {
                $pos++;
                $tagtype = 3;
            } else {
                $gt = tln_findnxstr($body, $pos, '>');
                return array(false, false, false, $lt, $gt);
            }
            // intentional fall-through
        case '>':
            return array($tagname, false, $tagtype, $lt, $pos);
        default:
            /**
             * Check if it's whitespace
             */
            if (!preg_match('/\s/', $match)) {
                /**
                 * This is an invalid tag! Look for the next closing ">".
                 */
                $gt = tln_findnxstr($body, $lt, '>');
                return array(false, false, false, $lt, $gt);
            }
            break;
    }

    /**
     * At this point we're here:
     * <tagname  attribute='blah'>
     * \-------^
     *
     * At this point we loop in order to find all attributes.
     */
    $attary = array();

    while ($pos <= $bodylen) {
        $pos = tln_skipspace($body, $pos);
        if ($pos == $bodylen) {
            /**
             * Non-closed tag.
             */
            return array(false, false, false, $lt, $pos);
        }
        /**
         * See if we arrived at a ">" or "/>", which means that we reached
         * the end of the tag.
         */
        $matches = array();
        if (preg_match('%^(\s*)(>|/>)%s', substr($body, $pos), $matches)) {
            /**
             * Yep. So we did.
             */
            $pos += strlen($matches[1]);
            if ($matches[2] == '/>') {
                $tagtype = 3;
                $pos++;
            }
            return array($tagname, $attary, $tagtype, $lt, $pos);
        }

        /**
         * There are several types of attributes, with optional
         * [:space:] between members.
         * Type 1:
         *   attrname[:space:]=[:space:]'CDATA'
         * Type 2:
         *   attrname[:space:]=[:space:]"CDATA"
         * Type 3:
         *   attr[:space:]=[:space:]CDATA
         * Type 4:
         *   attrname
         *
         * We leave types 1 and 2 the same, type 3 we check for
         * '"' and convert to "&quot;" if needed, then wrap in
         * double quotes. Type 4 we convert into:
         * attrname="yes".
         */
        $regary = tln_findnxreg($body, $pos, '[^\w\-_]');
        if ($regary == false) {
            /**
             * Looks like body ended before the end of tag.
             */
            return array(false, false, false, $lt, $bodylen);
        }
        list($pos, $attname, $match) = $regary;
        $attname = strtolower($attname);
        /**
         * We arrived at the end of attribute name. Several things possible
         * here:
         * '>'  means the end of the tag and this is attribute type 4
         * '/'  if followed by '>' means the same thing as above
         * '\s' means a lot of things -- look what it's followed by.
         *      anything else means the attribute is invalid.
         *
         * Array elements are addressed with [] throughout: the {} offset
         * syntax was deprecated in PHP 7.4 and removed in PHP 8.0.
         */
        switch ($match) {
            case '/':
                /**
                 * This is an xhtml-style tag with a closing / at the
                 * end, like so: <img src="blah"/>. Check if it's followed
                 * by the closing bracket. If not, then this tag is invalid
                 */
                if (substr($body, $pos, 2) == '/>') {
                    $pos++;
                    $tagtype = 3;
                } else {
                    $gt = tln_findnxstr($body, $pos, '>');
                    return array(false, false, false, $lt, $gt);
                }
                // intentional fall-through
            case '>':
                $attary[$attname] = '"yes"';
                return array($tagname, $attary, $tagtype, $lt, $pos);
            default:
                /**
                 * Skip whitespace and see what we arrive at.
                 */
                $pos = tln_skipspace($body, $pos);
                $char = substr($body, $pos, 1);
                /**
                 * Two things are valid here:
                 * '=' means this is attribute type 1 2 or 3.
                 * \w means this was attribute type 4.
                 * anything else we ignore and re-loop. End of tag and
                 * invalid stuff will be caught by our checks at the beginning
                 * of the loop.
                 */
                if ($char == '=') {
                    $pos++;
                    $pos = tln_skipspace($body, $pos);
                    /**
                     * Here are 3 possibilities:
                     * "'"  attribute type 1
                     * '"'  attribute type 2
                     * everything else is the content of tag type 3
                     */
                    $quot = substr($body, $pos, 1);
                    if ($quot == '\'') {
                        $regary = tln_findnxreg($body, $pos + 1, '\'');
                        if ($regary == false) {
                            return array(false, false, false, $lt, $bodylen);
                        }
                        list($pos, $attval, $match) = $regary;
                        // Step over the closing quote.
                        $pos++;
                        $attary[$attname] = '\'' . $attval . '\'';
                    } elseif ($quot == '"') {
                        $regary = tln_findnxreg($body, $pos + 1, '\"');
                        if ($regary == false) {
                            return array(false, false, false, $lt, $bodylen);
                        }
                        list($pos, $attval, $match) = $regary;
                        // Step over the closing quote.
                        $pos++;
                        $attary[$attname] = '"' . $attval . '"';
                    } else {
                        /**
                         * These are hateful. Look for \s, or >.
                         */
                        $regary = tln_findnxreg($body, $pos, '[\s>]');
                        if ($regary == false) {
                            return array(false, false, false, $lt, $bodylen);
                        }
                        list($pos, $attval, $match) = $regary;
                        /**
                         * If it's ">" it will be caught at the top.
                         *
                         * The value is about to be wrapped in double
                         * quotes, so any double quote inside it has to be
                         * entity-encoded or it would terminate the
                         * attribute early.
                         */
                        $attval = str_replace('"', '&quot;', $attval);
                        $attary[$attname] = '"' . $attval . '"';
                    }
                } elseif (preg_match('|[\w/>]|', $char)) {
                    /**
                     * That was attribute type 4.
                     */
                    $attary[$attname] = '"yes"';
                } else {
                    /**
                     * An illegal character. Find next '>' and return.
                     */
                    $gt = tln_findnxstr($body, $pos, '>');
                    return array(false, false, false, $lt, $gt);
                }
                break;
        }
    }
    /**
     * The fact that we got here indicates that the tag end was never
     * found. Return invalid tag indication so it gets stripped.
     */
    return array(false, false, false, $lt, $bodylen);
}
|
430
|
|
431
|
/**
 * Translates numeric entities into literal characters so the value can be
 * checked.
 *
 * Every match of $regex is replaced by the single byte whose code is the
 * number captured in the first sub-pattern.
 *
 * @param string $attvalue the by-ref value to check; rewritten in place
 *                         when at least one entity was found.
 * @param string $regex the regular expression to check against; its first
 *                      capture group must hold the numeric code.
 * @param boolean $hex whether the entities are hexadecimal.
 * @return boolean True or False depending on whether there were matches.
 */
function tln_deent(&$attvalue, $regex, $hex = false)
{
    $matches = array();
    // preg_match_all() yields 0 (no matches) or false (error): nothing to do.
    if (!preg_match_all($regex, $attvalue, $matches)) {
        return false;
    }

    // Map each matched entity text to its decoded character. [] replaces
    // the {} offset syntax, which was removed in PHP 8.0.
    $repl = array();
    foreach ($matches[0] as $i => $entity) {
        $numval = $matches[1][$i];
        if ($hex) {
            $numval = hexdec($numval);
        }
        $repl[$entity] = chr($numval);
    }
    $attvalue = strtr($attvalue, $repl);

    return true;
}
|
457
|
|
458
|
/**
 * Decodes entity-encoded and backslash-escaped characters in an attribute
 * value, in place, so that the later checks see the literal text.
 *
 * Three forms are decoded, repeatedly, until none is left:
 * - decimal entities such as &#106; (the trailing ";" is optional),
 * - hexadecimal entities such as &#x6a;,
 * - a backslash followed by digits, with the digits read as a hex code.
 * Finally any remaining backslashes are removed with stripslashes().
 *
 * @param string $attvalue A string to run entity check against.
 * @return void the value is modified through the reference.
 */
function tln_defang(&$attvalue)
{
    /**
     * Skip this if there aren't ampersands or backslashes.
     */
    if (strpos($attvalue, '&') === false
        && strpos($attvalue, '\\') === false
    ) {
        return;
    }
    // One decoder fires per pass (|| short-circuits); the loop repeats until
    // a full pass finds nothing, which also unwinds nested encodings.
    // The entity patterns must start with the literal "&#" / "&#x" prefix;
    // without it any bare run of digits would be turned into a character.
    do {
        $m = tln_deent($attvalue, '/\&#0*(\d+);*/s')
            || tln_deent($attvalue, '/\&#x0*((\d|[a-f])+);*/si', true)
            || tln_deent($attvalue, '/\\\\(\d+)/s', true);
    } while ($m == true);
    $attvalue = stripslashes($attvalue);
}
|
483
|
|
484
|
/**
 * Strips tabs, carriage returns, newlines, NUL bytes and spaces from an
 * attribute value, in place. Some browsers treat "java[tab]script" the
 * same as "javascript", so these characters must go before the value is
 * pattern-checked.
 *
 * @param string $attvalue The attribute value before extraneous spaces removed.
 * @return void the value is modified through the reference.
 */
function tln_unspace(&$attvalue)
{
    // Length of the leading run that contains none of the unwanted
    // characters; if that is the whole string there is nothing to strip.
    $clean_prefix = strcspn($attvalue, "\t\r\n\0 ");
    if ($clean_prefix == strlen($attvalue)) {
        return;
    }

    $unwanted = array("\t", "\r", "\n", "\0", " ");
    $attvalue = str_replace($unwanted, '', $attvalue);
}
|
501
|
|
502
|
/**
 * Runs the attribute-level checks for one tag: drops forbidden attributes,
 * rewrites dangerous attribute values, sanitizes url(...) references in
 * style attributes and finally appends any attributes configured for the
 * tag.
 *
 * @param string $tagname String with the name of the tag.
 * @param array $attary Array with all tag attributes (name => quoted value).
 * @param array $rm_attnames map of tag-name regex => list of attribute-name
 *                           regexes; matching attributes are removed.
 * @param array $bad_attvals map of tag-name regex => map of attribute-name
 *                           regex => array(patterns, replacements) handed
 *                           to preg_replace() on the attribute value.
 * @param array $add_attr_to_tag map of tag-name regex => attributes merged
 *                           into the result for matching tags.
 * @param string $trans_image_path replacement handed to tln_fixurl for
 *                           rejected URLs.
 * @param boolean $block_external_images passed through to tln_fixurl.
 * @return array with modified attributes.
 */
function tln_fixatts(
    $tagname,
    $attary,
    $rm_attnames,
    $bad_attvals,
    $add_attr_to_tag,
    $trans_image_path,
    $block_external_images
) {
    // foreach walks a snapshot of the attributes, so $attary can be edited
    // freely inside the loop. (each() was removed in PHP 8.0, as was the
    // {} array offset syntax.)
    foreach ($attary as $attname => $attvalue) {
        /**
         * See if this attribute should be removed.
         */
        foreach ($rm_attnames as $matchtag => $matchattrs) {
            if (preg_match($matchtag, $tagname)) {
                foreach ($matchattrs as $matchattr) {
                    if (preg_match($matchattr, $attname)) {
                        unset($attary[$attname]);
                        // Move on to the next attribute. A plain "continue"
                        // would only leave the innermost loop, and the value
                        // checks below could then put the attribute back.
                        continue 3;
                    }
                }
            }
        }
        /**
         * Remove any backslashes, entities, or extraneous whitespace.
         */
        $oldattvalue = $attvalue;
        tln_defang($attvalue);
        // A style value that needed decoding is treated as hostile outright.
        if ($attname === 'style' && $attvalue !== $oldattvalue) {
            $attvalue = "idiocy";
            $attary[$attname] = $attvalue;
        }
        tln_unspace($attvalue);

        /**
         * Now run the configured checks on the normalized value. The
         * stored attribute is only overwritten when a pattern actually
         * changed something.
         */
        foreach ($bad_attvals as $matchtag => $matchattrs) {
            if (preg_match($matchtag, $tagname)) {
                foreach ($matchattrs as $matchattr => $valary) {
                    if (preg_match($matchattr, $attname)) {
                        /**
                         * There are two arrays in valary.
                         * First is matches.
                         * Second one is replacements
                         */
                        list($valmatch, $valrepl) = $valary;
                        $newvalue = preg_replace($valmatch, $valrepl, $attvalue);
                        if ($newvalue != $attvalue) {
                            $attary[$attname] = $newvalue;
                            $attvalue = $newvalue;
                        }
                    }
                }
            }
        }
        if ($attname === 'style') {
            // Control characters and 8-bit bytes are not allowed in styles.
            if (preg_match('/[\0-\37\200-\377]+/', $attvalue)) {
                $attary[$attname] = '"disallowed character"';
            }
            // Vet every url(...) reference found in the style value.
            preg_match_all("/url\s*\((.+)\)/si", $attvalue, $aMatch);
            if (count($aMatch)) {
                foreach ($aMatch[1] as $sMatch) {
                    $urlvalue = $sMatch;
                    tln_fixurl($attname, $urlvalue, $trans_image_path, $block_external_images);
                    $attary[$attname] = str_replace($sMatch, $urlvalue, $attvalue);
                }
            }
        }
    }
    /**
     * See if we need to append any attributes to this tag.
     */
    foreach ($add_attr_to_tag as $matchtag => $addattary) {
        if (preg_match($matchtag, $tagname)) {
            $attary = array_merge($attary, $addattary);
        }
    }

    return $attary;
}
|
597
|
|
598
|
/**
 * Checks one URL (an attribute value or the argument of a CSS url(...))
 * and rewrites it in place so that only acceptable targets survive.
 *
 * Surrounding quotes are stripped first; their style is remembered and
 * reused whenever the value is re-quoted. Then:
 * - an empty value becomes the quoted $trans_image_path;
 * - a value containing control or 8-bit characters becomes a fixed
 *   placeholder URL for href, or the quoted $trans_image_path otherwise;
 * - mailto/http/https/ftp: for href the value is kept and re-quoted; for
 *   any other attribute it is replaced by the quoted $trans_image_path when
 *   $block_external_images is true or the URL has no path, and otherwise
 *   left as it is (without quotes);
 * - outbind/cid: kept and re-quoted;
 * - any other scheme: replaced by the quoted $trans_image_path;
 * - no scheme at all: replaced by the quoted $trans_image_path unless the
 *   path already equals $trans_image_path.
 *
 * @param string $attname name of the attribute the URL belongs to.
 * @param string $attvalue the URL, passed by-ref and rewritten in place.
 * @param string $trans_image_path URL substituted for rejected values.
 * @param boolean $block_external_images whether remote URLs in non-href
 *                                       attributes are replaced.
 * @return void
 */
function tln_fixurl($attname, &$attvalue, $trans_image_path, $block_external_images)
{
    $sQuote = '"';
    $attvalue = trim($attvalue);
    if ($attvalue && ($attvalue[0] == '"' || $attvalue[0] == "'")) {
        // Remember the quote style and drop the surrounding quotes.
        $sQuote = $attvalue[0];
        $attvalue = trim(substr($attvalue, 1, -1));
    }
    $blank = $sQuote . $trans_image_path . $sQuote;

    /**
     * Replace empty src tags with the blank image. src is only used
     * for frames, images, and image inputs. Doing a replace should
     * not affect them working as should be, however it will stop
     * IE from being kicked off when src for img tags are not set
     */
    if ($attvalue == '') {
        $attvalue = $blank;
        return;
    }

    // first, disallow 8 bit characters and control characters
    if (preg_match('/[\0-\37\200-\377]+/', $attvalue)) {
        switch ($attname) {
            case 'href':
                $attvalue = $sQuote . 'http://invalid-stuff-detected.example.com' . $sQuote;
                break;
            default:
                $attvalue = $blank;
                break;
        }
        return;
    }

    $aUrl = parse_url($attvalue);
    if (!isset($aUrl['scheme'])) {
        // Scheme-less URL: only the blank image itself is let through.
        // (This used to assign to $$attvalue, a variable-variable, which
        // left the real value untouched.)
        if (!isset($aUrl['path']) || $aUrl['path'] != $trans_image_path) {
            $attvalue = $blank;
        }
        return;
    }

    switch (strtolower($aUrl['scheme'])) {
        case 'mailto':
        case 'http':
        case 'https':
        case 'ftp':
            if ($attname != 'href') {
                if ($block_external_images == true) {
                    $attvalue = $blank;
                } else {
                    if (!isset($aUrl['path'])) {
                        $attvalue = $blank;
                    }
                }
            } else {
                $attvalue = $sQuote . $attvalue . $sQuote;
            }
            break;
        case 'outbind':
            $attvalue = $sQuote . $attvalue . $sQuote;
            break;
        case 'cid':
            $attvalue = $sQuote . $attvalue . $sQuote;
            break;
        default:
            $attvalue = $blank;
            break;
    }
}
|
665
|
|
666
|
/**
 * Extracts the content of a <style> element and sanitizes it.
 *
 * The body is scanned character by character from $pos until a closing
 * </style> tag is found; HTML comments are skipped so that a "</style>"
 * inside a comment does not end the block. The collected CSS is then
 * cleaned: "body" rules are renamed to ".bodyclass", blocks containing
 * control or 8-bit characters are dropped, @import lines are removed,
 * url(...) references are vetted by tln_fixurl and a list of dangerous
 * keywords is neutralized.
 *
 * @param string $body the full HTML being filtered.
 * @param integer $pos offset of the first character after the opening
 *                     <style> tag.
 * @param string $trans_image_path replacement handed to tln_fixurl for
 *                     rejected URLs.
 * @param boolean $block_external_images passed through to tln_fixurl.
 * @return array two members: the sanitized style content and the offset
 *               just past the closing </style>; when no closing tag is
 *               found the members are false and strlen($body).
 */
function tln_fixstyle($body, $pos, $trans_image_path, $block_external_images)
{
    // workaround for </style> in between comments
    $content = '';
    $sToken = '';
    $bSucces = false;
    $bEndTag = false;
    for ($i = $pos, $iCount = strlen($body); $i < $iCount; ++$i) {
        // [] replaces the {} string offset syntax, removed in PHP 8.0.
        $char = $body[$i];
        switch ($char) {
            case '<':
                $sToken = $char;
                break;
            case '/':
                if ($sToken == '<') {
                    // "</" seen: start collecting a possible end tag.
                    $sToken .= $char;
                    $bEndTag = true;
                } else {
                    $content .= $char;
                }
                break;
            case '>':
                if ($bEndTag) {
                    $sToken .= $char;
                    if (preg_match('/\<\/\s*style\s*\>/i', $sToken, $aMatch)) {
                        $newpos = $i + 1;
                        $bSucces = true;
                        break 2;
                    } else {
                        // Some other end tag: keep it as content.
                        $content .= $sToken;
                    }
                    $bEndTag = false;
                } else {
                    $content .= $char;
                }
                break;
            case '!':
                if ($sToken == '<') {
                    // possible comment
                    if (isset($body[$i + 2]) && substr($body, $i, 3) == '!--') {
                        $i = strpos($body, '-->', $i + 3);
                        if ($i === false) { // no end comment
                            $i = strlen($body);
                        }
                        $sToken = '';
                    }
                } else {
                    $content .= $char;
                }
                break;
            default:
                if ($bEndTag) {
                    $sToken .= $char;
                } else {
                    $content .= $char;
                }
                break;
        }
    }
    if ($bSucces == false) {
        return array(false, strlen($body));
    }

    /**
     * First look for general BODY style declaration, which would be
     * like so:
     * body {background: blah-blah}
     * and change it to .bodyclass so we can just assign it to a <div>
     */
    $content = preg_replace("|body(\s*\{.*?\})|si", ".bodyclass\\1", $content);

    // first check for 8bit sequences and disallowed control characters
    if (preg_match('/[\16-\37\200-\377]+/', $content)) {
        $content = '<!-- style block removed by html filter due to presence of 8bit characters -->';
        return array($content, $newpos);
    }

    // remove @import line
    $content = preg_replace("/^\s*(@import.*)$/mi", "\n<!-- @import rules forbidden -->\n", $content);

    /**
     * Fix url('blah') declarations: undo backslash-obfuscated spellings of
     * "url" first, then vet every reference.
     */
    $content = preg_replace("/(\\\\)?u(\\\\)?r(\\\\)?l(\\\\)?/i", 'url', $content);
    preg_match_all("/url\s*\((.+)\)/si", $content, $aMatch);
    if (count($aMatch)) {
        $aValue = $aReplace = array();
        foreach ($aMatch[1] as $sMatch) {
            // url value
            $urlvalue = $sMatch;
            tln_fixurl('style', $urlvalue, $trans_image_path, $block_external_images);
            $aValue[] = $sMatch;
            $aReplace[] = $urlvalue;
        }
        $content = str_replace($aValue, $aReplace, $content);
    }

    /**
     * Remove any backslashes, entities, and extraneous whitespace. The
     * checks run on this normalized copy; it only replaces the real
     * content when one of the patterns below changed it.
     */
    $contentTemp = $content;
    tln_defang($contentTemp);
    tln_unspace($contentTemp);

    $match = array('/\/\*.*\*\//',
                   '/expression/i',
                   '/behaviou*r/i',
                   '/binding/i',
                   '/include-source/i',
                   '/javascript/i',
                   '/script/i',
                   '/position/i');
    $replace = array('', 'idiocy', 'idiocy', 'idiocy', 'idiocy', 'idiocy', 'idiocy', '');
    $contentNew = preg_replace($match, $replace, $contentTemp);
    if ($contentNew !== $contentTemp) {
        $content = $contentNew;
    }

    return array($content, $newpos);
}
|
790
|
|
791
|
/**
 * Converts the attributes of a <body> tag into attributes for the <div>
 * that replaces it.
 *
 * The result always carries class='bodyclass'. The presentational body
 * attributes are folded into one double-quoted style attribute:
 * background becomes a background-image pointing at $trans_image_path,
 * bgcolor becomes background-color and text becomes color. All other
 * attributes are dropped.
 *
 * @param array $attary attributes of the body tag (name => quoted value).
 * @param string $trans_image_path image URL used in place of any body
 *                                 background.
 * @return array attributes for the replacement div.
 */
function tln_body2div($attary, $trans_image_path)
{
    $divattary = array('class' => "'bodyclass'");
    $text = '#000000';
    $has_bgc_stl = $has_txt_stl = false;
    $styledef = '';
    if (is_array($attary) && count($attary) > 0) {
        foreach ($attary as $attname => $attvalue) {
            // Strip the value's own quote character...
            $quotchar = substr($attvalue, 0, 1);
            $attvalue = str_replace($quotchar, "", $attvalue);
            // ...and any double quote as well: the style attribute built
            // below is wrapped in double quotes, so an embedded one would
            // end it early and let the rest be parsed as new attributes.
            $attvalue = str_replace('"', '', $attvalue);
            switch ($attname) {
                case 'background':
                    $styledef .= "background-image: url('$trans_image_path'); ";
                    break;
                case 'bgcolor':
                    $has_bgc_stl = true;
                    $styledef .= "background-color: $attvalue; ";
                    break;
                case 'text':
                    $has_txt_stl = true;
                    $styledef .= "color: $attvalue; ";
                    break;
            }
        }
        // Outlook defines a white bgcolor and no text color. This can lead to
        // white text on a white bg with certain themes.
        if ($has_bgc_stl && !$has_txt_stl) {
            $styledef .= "color: $text; ";
        }
        if (strlen($styledef) > 0) {
            // [] replaces the {} offset syntax, removed in PHP 8.0.
            $divattary['style'] = "\"$styledef\"";
        }
    }

    return $divattary;
}
|
826
|
|
827
|
/**
 * Filters an HTML string: walks over it tag by tag, drops or rewrites
 * unwanted tags and attributes and returns the result wrapped in
 * "begin/end tln_sanitized html" marker comments.
 *
 * @param string $body The HTML you wish to filter
 * @param array $tag_list list of tag names preceded by a boolean flag as
 *                  its first element: false means the listed tags are
 *                  removed, true means only the listed tags are allowed.
 * @param array $rm_tags_with_content tags that are removed together with
 *                  everything up to their closing tag.
 * @param array $self_closing_tags tags that are always emitted in the
 *                  self-closing form.
 * @param boolean $force_tag_closing whether closing tags are appended for
 *                  tags still open at the end of the body.
 * @param array $rm_attnames passed to tln_fixatts (attributes to remove).
 * @param array $bad_attvals passed to tln_fixatts (attribute values to rewrite).
 * @param array $add_attr_to_tag passed to tln_fixatts (attributes to add).
 * @param string $trans_image_path URL substituted for rejected URLs.
 * @param boolean $block_external_images passed to tln_fixatts/tln_fixstyle.
 * @return string Sanitized html safe to show on your pages.
 */
function tln_sanitize(
    $body,
    $tag_list,
    $rm_tags_with_content,
    $self_closing_tags,
    $force_tag_closing,
    $rm_attnames,
    $bad_attvals,
    $add_attr_to_tag,
    $trans_image_path,
    $block_external_images
) {
    /**
     * See if tag_list is of tags to remove or tags to allow.
     * false  means remove these tags
     * true   means allow these tags
     * The flag is the first element of the list.
     */
    $rm_tags = array_shift($tag_list);
    /**
     * Normalize the tag lists to lower case.
     */
    @array_walk($tag_list, 'tln_casenormalize');
    @array_walk($rm_tags_with_content, 'tln_casenormalize');
    @array_walk($self_closing_tags, 'tln_casenormalize');

    $curpos = 0;
    // Count of currently open tags per tag name. ([] replaces the {}
    // offset syntax, which was removed in PHP 8.0.)
    $open_tags = array();
    $trusted = "<!-- begin tln_sanitized html -->\n";
    // false, or the name of the tag whose content is being skipped.
    $skip_content = false;
    /**
     * Take care of netscape's stupid javascript entities like
     * &{alert('boo')};
     * by entity-encoding the leading ampersand.
     */
    $body = preg_replace('/&(\{.*?\};)/si', '&amp;\\1', $body);

    while (($curtag = tln_getnxtag($body, $curpos)) != false) {
        list($tagname, $attary, $tagtype, $lt, $gt) = $curtag;
        $free_content = substr($body, $curpos, $lt - $curpos);
        /**
         * Take care of <style>
         * NOTE(review): the text preceding the <style> tag is overwritten
         * here and never emitted; this matches the previous behaviour --
         * confirm it is intended.
         */
        if ($tagname == "style" && $tagtype == 1) {
            list($free_content, $curpos) =
                tln_fixstyle($body, $gt + 1, $trans_image_path, $block_external_images);
            if ($free_content != false) {
                if (!empty($attary)) {
                    $attary = tln_fixatts(
                        $tagname,
                        $attary,
                        $rm_attnames,
                        $bad_attvals,
                        $add_attr_to_tag,
                        $trans_image_path,
                        $block_external_images
                    );
                }
                $trusted .= tln_tagprint($tagname, $attary, $tagtype);
                $trusted .= $free_content;
                $trusted .= tln_tagprint($tagname, null, 2);
            }
            continue;
        }
        if ($skip_content == false) {
            $trusted .= $free_content;
        }
        if ($tagname != false) {
            if ($tagtype == 2) {
                if ($skip_content == $tagname) {
                    /**
                     * Got to the end of tag we needed to remove.
                     */
                    $tagname = false;
                    $skip_content = false;
                } else {
                    if ($skip_content == false) {
                        if ($tagname == "body") {
                            $tagname = "div";
                        }
                        // Only emit a closing tag that matches an open one.
                        if (isset($open_tags[$tagname])
                            && $open_tags[$tagname] > 0
                        ) {
                            $open_tags[$tagname]--;
                        } else {
                            $tagname = false;
                        }
                    }
                }
            } else {
                /**
                 * $rm_tags_with_content
                 */
                if ($skip_content == false) {
                    /**
                     * See if this is a self-closing type and change
                     * tagtype appropriately.
                     */
                    if ($tagtype == 1
                        && in_array($tagname, $self_closing_tags)
                    ) {
                        $tagtype = 3;
                    }
                    /**
                     * See if we should skip this tag and any content
                     * inside it.
                     */
                    if ($tagtype == 1
                        && in_array($tagname, $rm_tags_with_content)
                    ) {
                        $skip_content = $tagname;
                    } else {
                        if (($rm_tags == false
                                && in_array($tagname, $tag_list))
                            || ($rm_tags == true
                                && !in_array($tagname, $tag_list))
                        ) {
                            $tagname = false;
                        } else {
                            /**
                             * Convert body into div.
                             */
                            if ($tagname == "body") {
                                $tagname = "div";
                                $attary = tln_body2div($attary, $trans_image_path);
                            }
                            if ($tagtype == 1) {
                                if (isset($open_tags[$tagname])) {
                                    $open_tags[$tagname]++;
                                } else {
                                    $open_tags[$tagname] = 1;
                                }
                            }
                            /**
                             * This is where we run other checks.
                             */
                            if (is_array($attary) && count($attary) > 0) {
                                $attary = tln_fixatts(
                                    $tagname,
                                    $attary,
                                    $rm_attnames,
                                    $bad_attvals,
                                    $add_attr_to_tag,
                                    $trans_image_path,
                                    $block_external_images
                                );
                            }
                        }
                    }
                }
            }
            if ($tagname != false && $skip_content == false) {
                $trusted .= tln_tagprint($tagname, $attary, $tagtype);
            }
        }
        $curpos = $gt + 1;
    }
    // Whatever follows the last tag.
    $trusted .= substr($body, $curpos, strlen($body) - $curpos);
    if ($force_tag_closing == true) {
        foreach ($open_tags as $tagname => $opentimes) {
            while ($opentimes > 0) {
                $trusted .= '</' . $tagname . '>';
                $opentimes--;
            }
        }
        $trusted .= "\n";
    }
    $trusted .= "<!-- end tln_sanitized html -->\n";

    return $trusted;
}
|
1007
|
|
1008
|
//
|
1009
|
// Use the nifty htmlfilter library
|
1010
|
//
|
1011
|
|
1012
|
|
1013
|
/**
 * Convenience wrapper around tln_sanitize() with a fixed filtering policy.
 *
 * Builds the tag lists, attribute-name patterns and attribute-value
 * rewrite rules below, then hands everything to tln_sanitize() and
 * returns its result unchanged.
 *
 * @param string  $body                  The HTML to filter.
 * @param string  $trans_image_path      Value substituted for src/background
 *                                       attribute values that match one of the
 *                                       bad patterns; also passed through to
 *                                       tln_sanitize().
 * @param boolean $block_external_images When true, extra rules are added that
 *                                       rewrite http(s) URLs in src/background
 *                                       attributes and in style url(...) values
 *                                       to $trans_image_path.
 * @return string The filtered HTML as returned by tln_sanitize().
 */
function HTMLFilter($body, $trans_image_path, $block_external_images = false)
{
    // NOTE(review): the leading `false` looks like the "rm_tags" mode flag
    // that tln_sanitize() takes off the front of this list -- confirm
    // against tln_sanitize() before reordering.
    $tag_list = array(
        false,
        "object",
        "meta",
        "html",
        "head",
        "base",
        "link",
        "frame",
        "iframe",
        "plaintext",
        "marquee"
    );

    // Tags listed here are dropped together with everything inside them.
    $rm_tags_with_content = array(
        "script",
        "applet",
        "embed",
        "title",
        "frameset",
        "xmp",
        "xml"
    );

    $self_closing_tags = array(
        "img",
        "br",
        "hr",
        "input",
        "outbind"
    );

    $force_tag_closing = true;

    // Outer key: regex on the tag name; inner values: regexes on attribute
    // names to strip.
    $rm_attnames = array(
        "/.*/" =>
            array(
                // "/target/i",
                "/^on.*/i",
                "/^dynsrc/i",
                "/^data.*/i",
                "/^lowsrc.*/i"
            )
    );

    // Outer key: regex on the tag name; inner key: regex on the attribute
    // name; value: a pair of parallel arrays (patterns, replacements).
    $bad_attvals = array(
        "/.*/" =>
            array(
                "/^src|background/i" =>
                    array(
                        array(
                            '/^([\'"])\s*\S+script\s*:.*([\'"])/si',
                            '/^([\'"])\s*mocha\s*:*.*([\'"])/si',
                            '/^([\'"])\s*about\s*:.*([\'"])/si'
                        ),
                        array(
                            "\\1$trans_image_path\\2",
                            "\\1$trans_image_path\\2",
                            "\\1$trans_image_path\\2"
                        )
                    ),
                "/^href|action/i" =>
                    array(
                        array(
                            '/^([\'"])\s*\S+script\s*:.*([\'"])/si',
                            '/^([\'"])\s*mocha\s*:*.*([\'"])/si',
                            '/^([\'"])\s*about\s*:.*([\'"])/si'
                        ),
                        array(
                            "\\1#\\1",
                            "\\1#\\1",
                            "\\1#\\1"
                        )
                    ),
                "/^style/i" =>
                    array(
                        array(
                            "/\/\*.*\*\//",
                            "/expression/i",
                            "/binding/i",
                            "/behaviou*r/i",
                            "/include-source/i",
                            '/position\s*:/i',
                            '/(\\\\)?u(\\\\)?r(\\\\)?l(\\\\)?/i',
                            '/url\s*\(\s*([\'"])\s*\S+script\s*:.*([\'"])\s*\)/si',
                            '/url\s*\(\s*([\'"])\s*mocha\s*:.*([\'"])\s*\)/si',
                            '/url\s*\(\s*([\'"])\s*about\s*:.*([\'"])\s*\)/si',
                            '/(.*)\s*:\s*url\s*\(\s*([\'"]*)\s*\S+script\s*:.*([\'"]*)\s*\)/si'
                        ),
                        array(
                            "",
                            "idiocy",
                            "idiocy",
                            "idiocy",
                            "idiocy",
                            "idiocy",
                            "url",
                            "url(\\1#\\1)",
                            "url(\\1#\\1)",
                            "url(\\1#\\1)",
                            "\\1:url(\\2#\\3)"
                        )
                    )
            )
    );

    if ($block_external_images) {
        // Square-bracket offsets: the curly-brace form ($a{'k'}) is deprecated
        // in PHP 7.4 and a parse error from PHP 8.0 onwards.
        // Each pattern appended to [0] must be paired with a replacement
        // appended to [1] so the two arrays stay index-aligned.
        $bad_attvals['/.*/']['/^src|background/i'][0][] = '/^([\'\"])\s*https*:.*([\'\"])/si';
        $bad_attvals['/.*/']['/^src|background/i'][1][] = "\\1$trans_image_path\\1";

        $bad_attvals['/.*/']['/^style/i'][0][] = '/url\(([\'\"])\s*https*:.*([\'\"])\)/si';
        $bad_attvals['/.*/']['/^style/i'][1][] = "url(\\1$trans_image_path\\1)";
    }

    // Key: regex on the tag name; value: attributes to add to matching tags.
    $add_attr_to_tag = array(
        "/^a$/i" =>
            array('target' => '"_blank"')
    );

    return tln_sanitize(
        $body,
        $tag_list,
        $rm_tags_with_content,
        $self_closing_tags,
        $force_tag_closing,
        $rm_attnames,
        $bad_attvals,
        $add_attr_to_tag,
        $trans_image_path,
        $block_external_images
    );
}
|