Revision 869
Added by thorn about 17 years ago
| trunk/CHANGELOG | ||
|---|---|---|
| 14 | 14 |
29-Oct-2008 Thomas Hornik |
| 15 | 15 |
# News: total number of news wasn't calculated correctly |
| 16 | 16 |
! Captcha: removed IFRAME for Calculation-as-text captcha |
| 17 |
- removed useless init_utf8funcs()-function |
|
| 17 |
- removed useless init_utf8funcs()-function |
|
| 18 |
+ search: added search_lang for use in search-form. Using DE, the search will also match ä=ae, ... (alternate spelling of German umlauts) |
|
| 19 |
- search: removed undocumented word-boundary search |
|
| 20 |
! search: search_path is now anchored to the beginning of link ("link LIKE '$path%'" instead of "link LIKE '%$path%'")
|
|
| 21 |
+ search: added key 'ext_charset' to search-extension to query external databases |
|
| 18 | 22 |
25-Oct-2008 Christian Sommer |
| 19 | 23 |
! updated basic templates according to the template guidelines |
| 20 | 24 |
24-Oct-2008 Dietrich Roland Pehlke |
| trunk/wb/framework/frontend.functions.php | ||
|---|---|---|
| 77 | 77 |
array_walk($arr_string, create_function('&$v,$k','$v = preg_quote($v, \'/\');'));
|
| 78 | 78 |
$search_string = implode("|", $arr_string);
|
| 79 | 79 |
$string = strtr($search_string, $string_ul_umlauts); |
| 80 |
// special-feature: '|' means word-boundary (\b). Searching for 'the|' will find 'the', but not 'thema'. |
|
| 81 |
$string = strtr($string, array('\\|'=>'\b'));
|
|
| 82 | 80 |
// the highlighting |
| 83 | 81 |
// match $string, but not inside <style>...</style>, <script>...</script>, <!--...--> or HTML-Tags |
| 84 | 82 |
// split $string into pieces - "cut away" styles, scripts, comments, HTML-tags and eMail-addresses |
| trunk/wb/search/search.php | ||
|---|---|---|
| 117 | 117 |
} |
| 118 | 118 |
} |
| 119 | 119 |
|
| 120 |
// Get search language |
|
| 121 |
$search_lang = ''; |
|
| 122 |
if(isset($_REQUEST['search_lang'])) {
|
|
| 123 |
$search_lang = $_REQUEST['search_lang']; |
|
| 124 |
if(!preg_match('~^[A-Z]{2}$~', $search_lang))
|
|
| 125 |
$search_lang = LANGUAGE; |
|
| 126 |
} else {
|
|
| 127 |
$search_lang = LANGUAGE; |
|
| 128 |
} |
|
| 129 |
|
|
| 120 | 130 |
// Get the path to search into. Normally left blank |
| 131 |
// ATTN: since wb2.7.1 the path is evaluated as SQL: LIKE "/path%" - which will find "/path.php", "/path/info.php", ...; But not "/de/path.php" |
|
| 132 |
// Add a '%' in front of each path to get SQL: LIKE "%/path%" |
|
| 121 | 133 |
/* possible values: |
| 122 | 134 |
* - a single path: "/en/" - search only pages whose link contains 'path' ("/en/machinery/bender-x09")
|
| 123 |
* - a bunch of alternative pathes: "/en/,/machinery/,docs/" - alternatives paths, seperated by comma |
|
| 124 |
* - a bunch of paths to exclude: "-/about,/info,/jp/,/light" - search all, exclude these. |
|
| 135 |
* - a single path not to search into: "-/help" - search all, exclude /help... |
|
| 136 |
* - a bunch of alternative paths: "/en/,%/machinery/,/docs/" - alternative paths, separated by commas |
|
| 137 |
* - a bunch of paths to exclude: "-/about,%/info,/jp/,/light" - search all, exclude these. |
|
| 125 | 138 |
* These different styles can't be mixed. |
| 126 | 139 |
*/ |
| 127 |
$search_path_SQL = ""; |
|
| 128 |
$search_path = ""; |
|
| 140 |
// ATTN: in wb2.7.0 "/en/" matched all links with "/en/" somewhere in the link: "/info/en/intro.php", "/en/info.php", ... |
|
| 141 |
// since wb2.7.1 "/en/" matches only links _starting_ with "/en/": "/en/intro/info.php" |
|
| 142 |
// use "%/en/" (or "%/en/, %/info", ...) to get the old behavior |
|
| 143 |
$search_path_SQL = ''; |
|
| 144 |
$search_path = ''; |
|
| 129 | 145 |
if(isset($_REQUEST['search_path'])) {
|
| 130 |
$search_path = $wb->add_slashes($_REQUEST['search_path']);
|
|
| 131 |
if(!preg_match('~^[-a-zA-Z0-9_,/ ]+$~', $search_path))
|
|
| 146 |
$search_path = addslashes(htmlspecialchars(strip_tags($wb->strip_slashes($_REQUEST['search_path']))));
|
|
| 147 |
if(!preg_match('~^%?[-a-zA-Z0-9_,/ ]+$~', $search_path))
|
|
| 132 | 148 |
$search_path = ''; |
| 133 | 149 |
if($search_path != '') {
|
| 134 |
$search_path_SQL = "AND ( ";
|
|
| 135 |
$not = "";
|
|
| 136 |
$op = "OR";
|
|
| 150 |
$search_path_SQL = 'AND ( ';
|
|
| 151 |
$not = '';
|
|
| 152 |
$op = 'OR';
|
|
| 137 | 153 |
if($search_path[0] == '-') {
|
| 138 |
$not = "NOT";
|
|
| 139 |
$op = "AND";
|
|
| 154 |
$not = 'NOT';
|
|
| 155 |
$op = 'AND';
|
|
| 140 | 156 |
$paths = explode(',', substr($search_path, 1) );
|
| 141 | 157 |
} else {
|
| 142 | 158 |
$paths = explode(',',$search_path);
|
| ... | ... | |
| 144 | 160 |
$i=0; |
| 145 | 161 |
foreach($paths as $p) {
|
| 146 | 162 |
if($i++ > 0) {
|
| 147 |
$search_path_SQL .= " $op";
|
|
| 163 |
$search_path_SQL .= ' $op';
|
|
| 148 | 164 |
} |
| 149 |
$search_path_SQL .= " link $not LIKE '%$p%'";
|
|
| 165 |
$search_path_SQL .= " link $not LIKE '".$p."%'";
|
|
| 150 | 166 |
} |
| 151 |
$search_path_SQL .= " )";
|
|
| 167 |
$search_path_SQL .= ' )';
|
|
| 152 | 168 |
} |
| 153 | 169 |
} |
| 154 | 170 |
|
| ... | ... | |
| 167 | 183 |
$search_normal_string = ''; |
| 168 | 184 |
$search_entities_string = ''; // for SQL's LIKE |
| 169 | 185 |
$search_display_string = ''; // for displaying |
| 170 |
$search_url_string = ''; // for $_GET |
|
| 186 |
$search_url_string = ''; // for $_GET -- ATTN: unquoted! Will become urldecoded later
|
|
| 171 | 187 |
$string = ''; |
| 172 | 188 |
if(isset($_REQUEST['string'])) {
|
| 173 |
if($match!='exact') {
|
|
| 189 |
if($match!='exact') { // $string will be cleaned below
|
|
| 174 | 190 |
$string=str_replace(',', '', $_REQUEST['string']);
|
| 175 | 191 |
} else {
|
| 176 |
$string=$_REQUEST['string']; // $string will be cleaned below
|
|
| 192 |
$string=$_REQUEST['string']; |
|
| 177 | 193 |
} |
| 178 | 194 |
// redo possible magic quotes |
| 179 | 195 |
$string = $wb->strip_slashes($string); |
| ... | ... | |
| 187 | 203 |
$search_entities_string = str_replace('\\\\', '\\\\\\\\', $search_entities_string);
|
| 188 | 204 |
// convert string to utf-8 |
| 189 | 205 |
$string = entities_to_umlauts($string, 'UTF-8'); |
| 190 |
// quote ' " and / -we need quoted / for regex |
|
| 191 | 206 |
$search_url_string = $string; |
| 192 | 207 |
$string = preg_quote($string); |
| 208 |
// quote ' " and / -we need quoted / for regex |
|
| 193 | 209 |
$search_normal_string = str_replace(array('\'','"','/'), array('\\\'','\"','\/'), $string);
|
| 194 | 210 |
} |
| 195 | 211 |
// make arrays from the search_..._strings above |
| 212 |
if($match == 'exact') |
|
| 213 |
$search_url_array[] = $search_url_string; |
|
| 214 |
else |
|
| 196 | 215 |
$search_url_array = explode(' ', $search_url_string);
|
| 197 | 216 |
$search_normal_array = array(); |
| 198 | 217 |
$search_entities_array = array(); |
| ... | ... | |
| 220 | 239 |
$search_words = array(); |
| 221 | 240 |
foreach($search_normal_array AS $str) {
|
| 222 | 241 |
$str = strtr($str, $string_ul_umlauts); |
| 223 |
// special-feature: '|' means word-boundary (\b). Searching for 'the|' will find the, but not thema. |
|
| 224 |
// this doesn't(?) work correctly for unicode-chars: '|test' will work, but '|über' not. |
|
| 225 |
$str = strtr($str, array('\\|'=>'\b'));
|
|
| 226 | 242 |
$search_words[] = $str; |
| 227 | 243 |
} |
| 228 | 244 |
|
| trunk/wb/search/search_convert.php | ||
|---|---|---|
| 32 | 32 |
header('Location: ../index.php');
|
| 33 | 33 |
exit(0); |
| 34 | 34 |
} |
| 35 |
if(!isset($search_lang)) $search_lang = LANGUAGE; |
|
| 35 | 36 |
|
| 36 | 37 |
//umlauts to '(upper|lower)' for preg_match() |
| 37 | 38 |
//this is UTF-8-encoded |
| 38 | 39 |
$string_ul_umlauts = array( |
| 40 |
"i" => "(?:i|\xc4\xb0|I)", |
|
| 41 |
"I" => "(?:I|\xc4\xb1|i)", |
|
| 42 |
"k" => "(?:k|\xe2\x84\xaa|K)", |
|
| 43 |
"S" => "(?:S|\xc5\xbf|s)", |
|
| 39 | 44 |
"\xc2\xb5" => "(?:\xc2\xb5|\xce\x9c)", |
| 45 |
"\xc3\x9f" => "(?:\xc3\x9f|SS|ss)", // german ß |
|
| 40 | 46 |
"\xc3\xa0" => "(?:\xc3\xa0|\xc3\x80)", |
| 41 | 47 |
"\xc3\xa1" => "(?:\xc3\xa1|\xc3\x81)", |
| 42 | 48 |
"\xc3\xa2" => "(?:\xc3\xa2|\xc3\x82)", |
| ... | ... | |
| 1460 | 1466 |
"\xf0\x90\x90\xa3" => "(?:\xf0\x90\x90\xa3|\xf0\x90\x91\x8b)", |
| 1461 | 1467 |
"\xf0\x90\x90\xa4" => "(?:\xf0\x90\x90\xa4|\xf0\x90\x91\x8c)", |
| 1462 | 1468 |
"\xf0\x90\x90\xa5" => "(?:\xf0\x90\x90\xa5|\xf0\x90\x91\x8d)" |
| 1463 |
//"i" => "(?:i|\xc4\xb0|I)", |
|
| 1464 |
//"I" => "(?:I|\xc4\xb1|i)", |
|
| 1465 |
//"k" => "(?:k|\xe2\x84\xaa|K)", |
|
| 1466 |
//"S" => "(?:S|\xc5\xbf|s)" |
|
| 1467 | 1469 |
); |
| 1468 | 1470 |
|
| 1471 |
|
|
| 1472 |
if($search_lang=='DE') {
|
|
| 1473 |
$string_ul_umlauts["\xc3\x9f"] = "(?:\xc3\x9f|SS|ss)"; // german ß |
|
| 1474 |
$string_ul_umlauts["\xc3\xa4"] = "(?:\xc3\xa4|\xc3\x84|ae|Ae)"; // german ä |
|
| 1475 |
$string_ul_umlauts["\xc3\xb6"] = "(?:\xc3\xb6|\xc3\x96|oe|Oe)"; // german ö |
|
| 1476 |
$string_ul_umlauts["\xc3\xbc"] = "(?:\xc3\xbc|\xc3\x9c|ue|Ue)"; // german ü |
|
| 1477 |
$string_ul_umlauts["\xc3\x84"] = "(?:\xc3\x84|\xc3\xa4|Ae|ae)"; // german Ä |
|
| 1478 |
$string_ul_umlauts["\xc3\x96"] = "(?:\xc3\x96|\xc3\xb6|Oe|oe)"; // german Ö |
|
| 1479 |
$string_ul_umlauts["\xc3\x9c"] = "(?:\xc3\x9c|\xc3\xbc|Ue|ue)"; // german Ü |
|
| 1480 |
} |
|
| 1481 |
|
|
| 1469 | 1482 |
?> |
| trunk/wb/search/search_modext.php | ||
|---|---|---|
| 101 | 101 |
} |
| 102 | 102 |
$regex='/(?:^|\b|['.$str1.'])([^'.$str1.']{0,200}?'.$word.'[^'.$str2.']{0,200}(?:['.$str2.']|\b|$))/isu';
|
| 103 | 103 |
if(version_compare(PHP_VERSION, '4.3.3', '>=') && |
| 104 |
strpos(strtoupper(PHP_OS), 'WIN')!==0) { // this may crash windows server, so skip if on windows
|
|
| 104 |
strpos(strtoupper(PHP_OS), 'WIN')!==0 |
|
| 105 |
) { // this may crash windows server, so skip if on windows
|
|
| 105 | 106 |
// jump from match to match, get excerpt, stop if $max_excerpt_num is reached |
| 106 | 107 |
$last_end = 0; $offset = 0; |
| 107 | 108 |
while(preg_match('/'.$word.'/Sisu', $text, $match_array, PREG_OFFSET_CAPTURE, $last_end)) {
|
| ... | ... | |
| 115 | 116 |
if(count($excerpt_array) >= $max_excerpt_num) |
| 116 | 117 |
break; |
| 117 | 118 |
} |
| 118 |
} else { // problem - preg_match failed: can't find a start- or stop-sign
|
|
| 119 |
} else { // problem: preg_match failed - can't find a start- or stop-sign
|
|
| 119 | 120 |
$last_end += 201; // jump forward and try again |
| 120 | 121 |
} |
| 121 | 122 |
} |
| 122 |
} else { // compatile, but may be very slow with many large pages
|
|
| 123 |
} else { // compatible, but may be very slow with large pages
|
|
| 123 | 124 |
if(preg_match_all($regex, $text, $match_array)) {
|
| 124 | 125 |
foreach($match_array[1] AS $string) {
|
| 125 | 126 |
if(!preg_match('/\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\./', $string)) // skip excerpts with email-addresses
|
| ... | ... | |
| 173 | 174 |
if($match && is_array($match[0])) {
|
| 174 | 175 |
$x=$match[0][1]; // position of first match |
| 175 | 176 |
// is there an anchor nearby? |
| 176 |
if(preg_match_all('/<(?:[^>]+id|\s*a[^>]+name)\s*=\s*"(.*)"/SiU', substr($text,0,$x), $match, PREG_OFFSET_CAPTURE)) {
|
|
| 177 |
if(preg_match_all('/<(?:[^>]+id|\s*a[^>]+name)\s*=\s*"(.*)"/iU', substr($text,0,$x), $match, PREG_OFFSET_CAPTURE)) {
|
|
| 177 | 178 |
$anchor=''; |
| 178 | 179 |
foreach($match[1] AS $array) {
|
| 179 | 180 |
if($array[1] > $x) {
|
| ... | ... | |
| 234 | 235 |
if(!isset($mod_pic_link)) $mod_pic_link = ""; |
| 235 | 236 |
if(!isset($mod_no_highlight)) $mod_no_highlight = false; |
| 236 | 237 |
if(!isset($func_enable_flush)) $func_enable_flush = false; // set this in db: wb_search.cfg_enable_flush [READ THE DOC BEFORE] |
| 238 |
if(isset($mod_ext_charset) && $mod_ext_charset!='utf-8') $mod_ext_charset = 'utf-8'; // only utf-8 is allowed, yet. For other charset see DOCU |
|
| 237 | 239 |
if($mod_text == "") // nothing to do |
| 238 | 240 |
{ return false; }
|
| 239 | 241 |
if($mod_no_highlight) // no highlighting |
| ... | ... | |
| 243 | 245 |
$mod_text = preg_replace('#<(br( /)?|dt|/dd|/?(h[1-6]|tr|table|p|li|ul|pre|code|div|hr))[^>]*>#Si', '.', $mod_text);
|
| 244 | 246 |
$mod_text = preg_replace('/\s+/', ' ', $mod_text);
|
| 245 | 247 |
$mod_text = preg_replace('/ \./', '.', $mod_text);
|
| 248 |
if(isset($mod_ext_charset)) { // data from external database may have a different charset
|
|
| 249 |
require_once(WB_PATH.'/framework/functions-utf8.php'); |
|
| 250 |
$mod_text = charset_to_utf8($mod_text, $mod_ext_charset); |
|
| 251 |
} else {
|
|
| 246 | 252 |
$mod_text = entities_to_umlauts($mod_text, 'UTF-8'); |
| 253 |
} |
|
| 247 | 254 |
$anchor_text = $mod_text; // make an copy containing html-tags |
| 248 | 255 |
$mod_text = strip_tags($mod_text); |
| 249 | 256 |
$mod_text = str_replace(array('>','<','&','"',''',''',' '), array('>','<','&','"','\'','\'',"\xC2\xA0"), $mod_text);
|
| 257 |
$mod_text = '.'.trim($mod_text).'.'; |
|
| 250 | 258 |
// Do a fast scan over $mod_text first. This will speedup things a lot. |
| 251 | 259 |
if($func_search_match == 'all') {
|
| 252 | 260 |
if(!is_all_matched($mod_text, $func_search_words)) |
Also available in: Unified diff
search: added search_lang for use in search-form. Using DE, the search will also match ä=ae, ... (alternate spelling of German umlauts)
removed undocumented word-boundary search
search_path is now anchored to the beginning of link ("link LIKE '$path%'" instead of "link LIKE '%$path%'")
added key 'ext_charset' to search-extension to query external databases