Revision 857
Added by thorn about 17 years ago
| search_modext.php | ||
|---|---|---|
| 90 | 90 |
$excerpt_array = array(); |
| 91 | 91 |
$word = '('.implode('|', $search_words).')';
|
| 92 | 92 |
// Build the regex-string |
| 93 |
// start-sign: .!?; + INVERTED EXCLAMATION MARK - INVERTED QUESTION MARK - DOUBLE EXCLAMATION MARK - INTERROBANG - EXCLAMATION QUESTION MARK - QUESTION EXCLAMATION MARK - DOUBLE QUESTION MARK - HALFWIDTH IDEOGRAPHIC FULL STOP - IDEOGRAPHIC FULL STOP - IDEOGRAPHIC COMMA |
|
| 94 |
$str1=".!?;"."\xC2\xA1"."\xC2\xBF"."\xE2\x80\xBC"."\xE2\x80\xBD"."\xE2\x81\x89"."\xE2\x81\x88"."\xE2\x81\x87"."\xEF\xBD\xA1"."\xE3\x80\x82"."\xE3\x80\x81"; |
|
| 95 |
// stop-sign: .!?; + DOUBLE EXCLAMATION MARK - INTERROBANG - EXCLAMATION QUESTION MARK - QUESTION EXCLAMATION MARK - DOUBLE QUESTION MARK - HALFWIDTH IDEOGRAPHIC FULL STOP - IDEOGRAPHIC FULL STOP - IDEOGRAPHIC COMMA |
|
| 96 |
$str2=".!?;"."\xE2\x80\xBC"."\xE2\x80\xBD"."\xE2\x81\x89"."\xE2\x81\x88"."\xE2\x81\x87"."\xEF\xBD\xA1"."\xE3\x80\x82"."\xE3\x80\x81"; |
|
| 97 |
$regex='/(?:^|\b|['.$str1.'])([^'.$str1.']{0,200}?'.$word.'[^'.$str2.']{0,200}(?:['.$str2.']|\b|$))/Sisu';
|
|
| 98 |
if(version_compare(PHP_VERSION, '4.3.3', '>=')) {
|
|
| 93 |
if(strpos(strtoupper(PHP_OS), 'WIN')===0) { // windows -> see below
|
|
| 94 |
$str1=".!?;"; |
|
| 95 |
$str2=".!?;"; |
|
| 96 |
} else { // linux & Co.
|
|
| 97 |
// start-sign: .!?; + INVERTED EXCLAMATION MARK - INVERTED QUESTION MARK - DOUBLE EXCLAMATION MARK - INTERROBANG - EXCLAMATION QUESTION MARK - QUESTION EXCLAMATION MARK - DOUBLE QUESTION MARK - HALFWIDTH IDEOGRAPHIC FULL STOP - IDEOGRAPHIC FULL STOP - IDEOGRAPHIC COMMA |
|
| 98 |
$str1=".!?;"."\xC2\xA1"."\xC2\xBF"."\xE2\x80\xBC"."\xE2\x80\xBD"."\xE2\x81\x89"."\xE2\x81\x88"."\xE2\x81\x87"."\xEF\xBD\xA1"."\xE3\x80\x82"."\xE3\x80\x81"; |
|
| 99 |
// stop-sign: .!?; + DOUBLE EXCLAMATION MARK - INTERROBANG - EXCLAMATION QUESTION MARK - QUESTION EXCLAMATION MARK - DOUBLE QUESTION MARK - HALFWIDTH IDEOGRAPHIC FULL STOP - IDEOGRAPHIC FULL STOP - IDEOGRAPHIC COMMA |
|
| 100 |
$str2=".!?;"."\xE2\x80\xBC"."\xE2\x80\xBD"."\xE2\x81\x89"."\xE2\x81\x88"."\xE2\x81\x87"."\xEF\xBD\xA1"."\xE3\x80\x82"."\xE3\x80\x81"; |
|
| 101 |
} |
|
| 102 |
$regex='/(?:^|\b|['.$str1.'])([^'.$str1.']{0,200}?'.$word.'[^'.$str2.']{0,200}(?:['.$str2.']|\b|$))/isu';
|
|
| 103 |
if(version_compare(PHP_VERSION, '4.3.3', '>=') && |
|
| 104 |
strpos(strtoupper(PHP_OS), 'WIN')!==0) { // this may crash windows server, so skip if on windows
|
|
| 99 | 105 |
// jump from match to match, get excerpt, stop if $max_excerpt_num is reached |
| 100 | 106 |
$last_end = 0; $offset = 0; |
| 101 | 107 |
while(preg_match('/'.$word.'/Sisu', $text, $match_array, PREG_OFFSET_CAPTURE, $last_end)) {
|
| 102 | 108 |
$offset = ($match_array[0][1]-206 < $last_end)?$last_end:$match_array[0][1]-206; |
| 103 | 109 |
if(preg_match($regex, $text, $matches, PREG_OFFSET_CAPTURE, $offset)) {
|
| 104 | 110 |
$last_end = $matches[1][1]+strlen($matches[1][0])-1; |
| 105 |
if(!preg_match('/\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\./S', $matches[1][0])) // skip excerpts with email-addresses
|
|
| 111 |
if(!preg_match('/\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\./', $matches[1][0])) // skip excerpts with email-addresses
|
|
| 106 | 112 |
$excerpt_array[] = trim($matches[1][0]); |
| 107 | 113 |
if(count($excerpt_array)>=$max_excerpt_num) {
|
| 108 | 114 |
$excerpt_array = array_unique($excerpt_array); |
| ... | ... | |
| 116 | 122 |
} else { // compatible, but may be very slow with many large pages
|
| 117 | 123 |
if(preg_match_all($regex, $text, $match_array)) {
|
| 118 | 124 |
foreach($match_array[1] AS $string) {
|
| 119 |
if(!preg_match('/\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\./S', $string)) // skip excerpts with email-addresses
|
|
| 125 |
if(!preg_match('/\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\./', $string)) // skip excerpts with email-addresses
|
|
| 120 | 126 |
$excerpt_array[] = trim($string); |
| 121 | 127 |
} |
| 122 | 128 |
} |
| ... | ... | |
| 233 | 239 |
if($mod_no_highlight) // no highlighting |
| 234 | 240 |
{ $mod_page_link_target = "&nohighlight=1".$mod_page_link_target; }
|
| 235 | 241 |
// clean the text: |
| 236 |
$mod_text = preg_replace('/\s+/', ' ', $mod_text);
|
|
| 237 | 242 |
$mod_text = preg_replace('#<(!--.*--|style.*</style|script.*</script)>#SiU', ' ', $mod_text);
|
| 238 | 243 |
$mod_text = preg_replace('#<(br( /)?|dt|/dd|/?(h[1-6]|tr|table|p|li|ul|pre|code|div|hr))[^>]*>#Si', '.', $mod_text);
|
| 244 |
$mod_text = preg_replace('/\s+/', ' ', $mod_text);
|
|
| 245 |
$mod_text = preg_replace('/ \./', '.', $mod_text);
|
|
| 239 | 246 |
$mod_text = entities_to_umlauts($mod_text, 'UTF-8'); |
| 240 | 247 |
$anchor_text = $mod_text; // make a copy containing html-tags |
| 241 | 248 |
$mod_text = strip_tags($mod_text); |
Also available in: Unified diff
search: fixed windows-related regex issue