Revision 869
Added by thorn about 16 years ago
trunk/CHANGELOG | ||
---|---|---|
14 | 14 |
29-Oct-2008 Thomas Hornik |
15 | 15 |
# News: total number of news wasn't calculated correctly |
16 | 16 |
! Captcha: removed IFRAME for Calculation-as-text captcha |
17 |
- removed useless init_utf8funcs()-function |
|
17 |
- removed useless init_utf8funcs()-function |
|
18 |
+ search: added search_lang for use in search-form. Using DE, the search will search ä=ae,... (alternate spelling of german umlauts) |
|
19 |
- search: removed undocumented word-boundary search |
|
20 |
! search: search_path is now anchored to the beginning of link ("link LIKE '$path%'" instead of "link LIKE '%$path%'") |
|
21 |
+ search: added key 'ext_charset' to search-extension to query external databases |
|
18 | 22 |
25-Oct-2008 Christian Sommer |
19 | 23 |
! updated basic templates according to the template guidelines |
20 | 24 |
24-Oct-2008 Dietrich Roland Pehlke |
trunk/wb/framework/frontend.functions.php | ||
---|---|---|
77 | 77 |
array_walk($arr_string, create_function('&$v,$k','$v = preg_quote($v, \'/\');')); |
78 | 78 |
$search_string = implode("|", $arr_string); |
79 | 79 |
$string = strtr($search_string, $string_ul_umlauts); |
80 |
// special-feature: '|' means word-boundary (\b). Searching for 'the|' will find 'the', but not 'thema'. |
|
81 |
$string = strtr($string, array('\\|'=>'\b')); |
|
82 | 80 |
// the highlighting |
83 | 81 |
// match $string, but not inside <style>...</style>, <script>...</script>, <!--...--> or HTML-Tags |
84 | 82 |
// split $string into pieces - "cut away" styles, scripts, comments, HTML-tags and eMail-addresses |
trunk/wb/search/search.php | ||
---|---|---|
117 | 117 |
} |
118 | 118 |
} |
119 | 119 |
|
120 |
// Get search language |
|
121 |
$search_lang = ''; |
|
122 |
if(isset($_REQUEST['search_lang'])) { |
|
123 |
$search_lang = $_REQUEST['search_lang']; |
|
124 |
if(!preg_match('~^[A-Z]{2}$~', $search_lang)) |
|
125 |
$search_lang = LANGUAGE; |
|
126 |
} else { |
|
127 |
$search_lang = LANGUAGE; |
|
128 |
} |
|
129 |
|
|
120 | 130 |
// Get the path to search into. Normally left blank |
131 |
// ATTN: since wb2.7.1 the path is evaluated as SQL: LIKE "/path%" - which will find "/path.php", "/path/info.php", ...; But not "/de/path.php" |
|
132 |
// Add a '%' in front of each path to get SQL: LIKE "%/path%" |
|
121 | 133 |
/* possible values: |
122 | 134 |
* - a single path: "/en/" - search only pages whose link starts with 'path' ("/en/machinery/bender-x09") |
123 |
* - a bunch of alternative pathes: "/en/,/machinery/,docs/" - alternatives paths, seperated by comma |
|
124 |
* - a bunch of paths to exclude: "-/about,/info,/jp/,/light" - search all, exclude these. |
|
135 |
* - a single path not to search into: "-/help" - search all, exclude /help... |
|
136 |
* - a bunch of alternative paths: "/en/,%/machinery/,/docs/" - alternative paths, separated by commas |
|
137 |
* - a bunch of paths to exclude: "-/about,%/info,/jp/,/light" - search all, exclude these. |
|
125 | 138 |
* These different styles can't be mixed. |
126 | 139 |
*/ |
127 |
$search_path_SQL = ""; |
|
128 |
$search_path = ""; |
|
140 |
// ATTN: in wb2.7.0 "/en/" matched all links with "/en/" somewhere in the link: "/info/en/intro.php", "/en/info.php", ... |
|
141 |
// since wb2.7.1 "/en/" matches only links _starting_ with "/en/": "/en/intro/info.php" |
|
142 |
// use "%/en/" (or "%/en/, %/info", ...) to get the old behavior |
|
143 |
$search_path_SQL = ''; |
|
144 |
$search_path = ''; |
|
129 | 145 |
if(isset($_REQUEST['search_path'])) { |
130 |
$search_path = $wb->add_slashes($_REQUEST['search_path']);
|
|
131 |
if(!preg_match('~^[-a-zA-Z0-9_,/ ]+$~', $search_path)) |
|
146 |
$search_path = addslashes(htmlspecialchars(strip_tags($wb->strip_slashes($_REQUEST['search_path']))));
|
|
147 |
if(!preg_match('~^[-a-zA-Z0-9_,/ %]+$~', $search_path)) // '%' must be valid in front of ANY path (e.g. "/en/,%/docs/" or "-%/info"), not only as the very first character
|
|
132 | 148 |
$search_path = ''; |
133 | 149 |
if($search_path != '') { |
134 |
$search_path_SQL = "AND ( ";
|
|
135 |
$not = "";
|
|
136 |
$op = "OR";
|
|
150 |
$search_path_SQL = 'AND ( ';
|
|
151 |
$not = '';
|
|
152 |
$op = 'OR';
|
|
137 | 153 |
if($search_path[0] == '-') { |
138 |
$not = "NOT";
|
|
139 |
$op = "AND";
|
|
154 |
$not = 'NOT';
|
|
155 |
$op = 'AND';
|
|
140 | 156 |
$paths = explode(',', substr($search_path, 1) ); |
141 | 157 |
} else { |
142 | 158 |
$paths = explode(',',$search_path); |
... | ... | |
144 | 160 |
$i=0; |
145 | 161 |
foreach($paths as $p) { |
146 | 162 |
if($i++ > 0) { |
147 |
$search_path_SQL .= " $op";
|
|
163 |
$search_path_SQL .= " $op"; // double quotes are required here: $op (OR / AND) has to be interpolated into the SQL
|
|
148 | 164 |
} |
149 |
$search_path_SQL .= " link $not LIKE '%$p%'";
|
|
165 |
$search_path_SQL .= " link $not LIKE '".$p."%'";
|
|
150 | 166 |
} |
151 |
$search_path_SQL .= " )";
|
|
167 |
$search_path_SQL .= ' )';
|
|
152 | 168 |
} |
153 | 169 |
} |
154 | 170 |
|
... | ... | |
167 | 183 |
$search_normal_string = ''; |
168 | 184 |
$search_entities_string = ''; // for SQL's LIKE |
169 | 185 |
$search_display_string = ''; // for displaying |
170 |
$search_url_string = ''; // for $_GET |
|
186 |
$search_url_string = ''; // for $_GET -- ATTN: unquoted! Will become urldecoded later
|
|
171 | 187 |
$string = ''; |
172 | 188 |
if(isset($_REQUEST['string'])) { |
173 |
if($match!='exact') { |
|
189 |
if($match!='exact') { // $string will be cleaned below
|
|
174 | 190 |
$string=str_replace(',', '', $_REQUEST['string']); |
175 | 191 |
} else { |
176 |
$string=$_REQUEST['string']; // $string will be cleaned below
|
|
192 |
$string=$_REQUEST['string']; |
|
177 | 193 |
} |
178 | 194 |
// redo possible magic quotes |
179 | 195 |
$string = $wb->strip_slashes($string); |
... | ... | |
187 | 203 |
$search_entities_string = str_replace('\\\\', '\\\\\\\\', $search_entities_string); |
188 | 204 |
// convert string to utf-8 |
189 | 205 |
$string = entities_to_umlauts($string, 'UTF-8'); |
190 |
// quote ' " and / -we need quoted / for regex |
|
191 | 206 |
$search_url_string = $string; |
192 | 207 |
$string = preg_quote($string); |
208 |
// quote ' " and / -we need quoted / for regex |
|
193 | 209 |
$search_normal_string = str_replace(array('\'','"','/'), array('\\\'','\"','\/'), $string); |
194 | 210 |
} |
195 | 211 |
// make arrays from the search_..._strings above |
212 |
if($match == 'exact') |
|
213 |
$search_url_array[] = $search_url_string; |
|
214 |
else |
|
196 | 215 |
$search_url_array = explode(' ', $search_url_string); |
197 | 216 |
$search_normal_array = array(); |
198 | 217 |
$search_entities_array = array(); |
... | ... | |
220 | 239 |
$search_words = array(); |
221 | 240 |
foreach($search_normal_array AS $str) { |
222 | 241 |
$str = strtr($str, $string_ul_umlauts); |
223 |
// special-feature: '|' means word-boundary (\b). Searching for 'the|' will find the, but not thema. |
|
224 |
// this doesn't(?) work correctly for unicode-chars: '|test' will work, but '|über' not. |
|
225 |
$str = strtr($str, array('\\|'=>'\b')); |
|
226 | 242 |
$search_words[] = $str; |
227 | 243 |
} |
228 | 244 |
|
trunk/wb/search/search_convert.php | ||
---|---|---|
32 | 32 |
header('Location: ../index.php'); |
33 | 33 |
exit(0); |
34 | 34 |
} |
35 |
if(!isset($search_lang)) $search_lang = LANGUAGE; |
|
35 | 36 |
|
36 | 37 |
//umlauts to '(upper|lower)' for preg_match() |
37 | 38 |
//this is UTF-8-encoded |
38 | 39 |
$string_ul_umlauts = array( |
40 |
"i" => "(?:i|\xc4\xb0|I)", |
|
41 |
"I" => "(?:I|\xc4\xb1|i)", |
|
42 |
"k" => "(?:k|\xe2\x84\xaa|K)", |
|
43 |
"S" => "(?:S|\xc5\xbf|s)", |
|
39 | 44 |
"\xc2\xb5" => "(?:\xc2\xb5|\xce\x9c)", |
45 |
"\xc3\x9f" => "(?:\xc3\x9f|SS|ss)", // german ß |
|
40 | 46 |
"\xc3\xa0" => "(?:\xc3\xa0|\xc3\x80)", |
41 | 47 |
"\xc3\xa1" => "(?:\xc3\xa1|\xc3\x81)", |
42 | 48 |
"\xc3\xa2" => "(?:\xc3\xa2|\xc3\x82)", |
... | ... | |
1460 | 1466 |
"\xf0\x90\x90\xa3" => "(?:\xf0\x90\x90\xa3|\xf0\x90\x91\x8b)", |
1461 | 1467 |
"\xf0\x90\x90\xa4" => "(?:\xf0\x90\x90\xa4|\xf0\x90\x91\x8c)", |
1462 | 1468 |
"\xf0\x90\x90\xa5" => "(?:\xf0\x90\x90\xa5|\xf0\x90\x91\x8d)" |
1463 |
//"i" => "(?:i|\xc4\xb0|I)", |
|
1464 |
//"I" => "(?:I|\xc4\xb1|i)", |
|
1465 |
//"k" => "(?:k|\xe2\x84\xaa|K)", |
|
1466 |
//"S" => "(?:S|\xc5\xbf|s)" |
|
1467 | 1469 |
); |
1468 | 1470 |
|
1471 |
|
|
1472 |
if($search_lang=='DE') { |
|
1473 |
$string_ul_umlauts["\xc3\x9f"] = "(?:\xc3\x9f|SS|ss)"; // german ß |
|
1474 |
$string_ul_umlauts["\xc3\xa4"] = "(?:\xc3\xa4|\xc3\x84|ae|Ae)"; // german ä |
|
1475 |
$string_ul_umlauts["\xc3\xb6"] = "(?:\xc3\xb6|\xc3\x96|oe|Oe)"; // german ö |
|
1476 |
$string_ul_umlauts["\xc3\xbc"] = "(?:\xc3\xbc|\xc3\x9c|ue|Ue)"; // german ü |
|
1477 |
$string_ul_umlauts["\xc3\x84"] = "(?:\xc3\x84|\xc3\xa4|Ae|ae)"; // german Ä |
|
1478 |
$string_ul_umlauts["\xc3\x96"] = "(?:\xc3\x96|\xc3\xb6|Oe|oe)"; // german Ö |
|
1479 |
$string_ul_umlauts["\xc3\x9c"] = "(?:\xc3\x9c|\xc3\xbc|Ue|ue)"; // german Ü |
|
1480 |
} |
|
1481 |
|
|
1469 | 1482 |
?> |
trunk/wb/search/search_modext.php | ||
---|---|---|
101 | 101 |
} |
102 | 102 |
$regex='/(?:^|\b|['.$str1.'])([^'.$str1.']{0,200}?'.$word.'[^'.$str2.']{0,200}(?:['.$str2.']|\b|$))/isu'; |
103 | 103 |
if(version_compare(PHP_VERSION, '4.3.3', '>=') && |
104 |
strpos(strtoupper(PHP_OS), 'WIN')!==0) { // this may crash windows server, so skip if on windows |
|
104 |
strpos(strtoupper(PHP_OS), 'WIN')!==0 |
|
105 |
) { // this may crash windows server, so skip if on windows |
|
105 | 106 |
// jump from match to match, get excerpt, stop if $max_excerpt_num is reached |
106 | 107 |
$last_end = 0; $offset = 0; |
107 | 108 |
while(preg_match('/'.$word.'/Sisu', $text, $match_array, PREG_OFFSET_CAPTURE, $last_end)) { |
... | ... | |
115 | 116 |
if(count($excerpt_array) >= $max_excerpt_num) |
116 | 117 |
break; |
117 | 118 |
} |
118 |
} else { // problem - preg_match failed: can't find a start- or stop-sign
|
|
119 |
} else { // problem: preg_match failed - can't find a start- or stop-sign
|
|
119 | 120 |
$last_end += 201; // jump forward and try again |
120 | 121 |
} |
121 | 122 |
} |
122 |
} else { // compatile, but may be very slow with many large pages
|
|
123 |
} else { // compatible, but may be very slow with large pages
|
|
123 | 124 |
if(preg_match_all($regex, $text, $match_array)) { |
124 | 125 |
foreach($match_array[1] AS $string) { |
125 | 126 |
if(!preg_match('/\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\./', $string)) // skip excerpts with email-addresses |
... | ... | |
173 | 174 |
if($match && is_array($match[0])) { |
174 | 175 |
$x=$match[0][1]; // position of first match |
175 | 176 |
// is there an anchor nearby? |
176 |
if(preg_match_all('/<(?:[^>]+id|\s*a[^>]+name)\s*=\s*"(.*)"/SiU', substr($text,0,$x), $match, PREG_OFFSET_CAPTURE)) {
|
|
177 |
if(preg_match_all('/<(?:[^>]+id|\s*a[^>]+name)\s*=\s*"(.*)"/iU', substr($text,0,$x), $match, PREG_OFFSET_CAPTURE)) { |
|
177 | 178 |
$anchor=''; |
178 | 179 |
foreach($match[1] AS $array) { |
179 | 180 |
if($array[1] > $x) { |
... | ... | |
234 | 235 |
if(!isset($mod_pic_link)) $mod_pic_link = ""; |
235 | 236 |
if(!isset($mod_no_highlight)) $mod_no_highlight = false; |
236 | 237 |
if(!isset($func_enable_flush)) $func_enable_flush = false; // set this in db: wb_search.cfg_enable_flush [READ THE DOC BEFORE] |
238 |
if(isset($mod_ext_charset) && $mod_ext_charset!='utf-8') $mod_ext_charset = 'utf-8'; // only utf-8 is allowed, yet. For other charset see DOCU |
|
237 | 239 |
if($mod_text == "") // nothing to do |
238 | 240 |
{ return false; } |
239 | 241 |
if($mod_no_highlight) // no highlighting |
... | ... | |
243 | 245 |
$mod_text = preg_replace('#<(br( /)?|dt|/dd|/?(h[1-6]|tr|table|p|li|ul|pre|code|div|hr))[^>]*>#Si', '.', $mod_text); |
244 | 246 |
$mod_text = preg_replace('/\s+/', ' ', $mod_text); |
245 | 247 |
$mod_text = preg_replace('/ \./', '.', $mod_text); |
248 |
if(isset($mod_ext_charset)) { // data from external database may have a different charset |
|
249 |
require_once(WB_PATH.'/framework/functions-utf8.php'); |
|
250 |
$mod_text = charset_to_utf8($mod_text, $mod_ext_charset); |
|
251 |
} else { |
|
246 | 252 |
$mod_text = entities_to_umlauts($mod_text, 'UTF-8'); |
253 |
} |
|
247 | 254 |
$anchor_text = $mod_text; // make a copy that still contains the html-tags |
248 | 255 |
$mod_text = strip_tags($mod_text); |
249 | 256 |
$mod_text = str_replace(array('&gt;','&lt;','&amp;','&quot;','&#39;','&apos;','&nbsp;'), array('>','<','&','"','\'','\'',"\xC2\xA0"), $mod_text); // NOTE(review): entity names restored after the diff viewer decoded them - confirm against the repository |
257 |
$mod_text = '.'.trim($mod_text).'.'; |
|
250 | 258 |
// Do a fast scan over $mod_text first. This will speedup things a lot. |
251 | 259 |
if($func_search_match == 'all') { |
252 | 260 |
if(!is_all_matched($mod_text, $func_search_words)) |
Also available in: Unified diff
search: added search_lang for use in search-form. Using DE, the search will search ä=ae,... (alternate spelling of german umlauts)
removed undocumented word-boundary search
search_path is now anchored to the beginning of link ("link LIKE '$path%'" instead of "link LIKE '%$path%'")
added key 'ext_charset' to search-extension to query external databases