Index: trunk/CHANGELOG
===================================================================
--- trunk/CHANGELOG	(revision 868)
+++ trunk/CHANGELOG	(revision 869)
@@ -14,7 +14,11 @@
 29-Oct-2008 Thomas Hornik
 # News: total number of news wasn't calculated correctly
 ! Captcha: removed IFRAME for Calculation-as-text captcha
-- removed useless init_utf8funcs()-function
+- removed useless init_utf8funcs()-function
++ search: added search_lang for use in the search form. With DE, the search also matches the alternate spellings of German umlauts (ä=ae, ...)
+- search: removed undocumented word-boundary search
+! search: search_path is now anchored to the beginning of link ("link LIKE '$path%'" instead of "link LIKE '%$path%'")
++ search: added key 'ext_charset' to search-extension to query external databases
 25-Oct-2008 Christian Sommer
 !	updated basic templates according the template guidelines
 24-Oct-2008 Dietrich Roland Pehlke
Index: trunk/wb/framework/frontend.functions.php
===================================================================
--- trunk/wb/framework/frontend.functions.php	(revision 868)
+++ trunk/wb/framework/frontend.functions.php	(revision 869)
@@ -77,8 +77,6 @@
 	array_walk($arr_string, create_function('&$v,$k','$v = preg_quote($v, \'/\');'));
 	$search_string = implode("|", $arr_string);
 	$string = strtr($search_string, $string_ul_umlauts);
-	// special-feature: '|' means word-boundary (\b). Searching for 'the|' will find 'the', but not 'thema'.
-	$string = strtr($string, array('\\|'=>'\b'));
 	// the highlighting
 	// match $string, but not inside <style>...</style>, <script>...</script>, <!--...--> or HTML-Tags
 	// split $string into pieces - "cut away" styles, scripts, comments, HTML-tags and eMail-addresses
Index: trunk/wb/search/search.php
===================================================================
--- trunk/wb/search/search.php	(revision 868)
+++ trunk/wb/search/search.php	(revision 869)
@@ -117,26 +117,42 @@
 	}
 }
 
+// Get search language
+$search_lang = '';
+if(isset($_REQUEST['search_lang'])) {
+	$search_lang = $_REQUEST['search_lang'];
+	if(!preg_match('~^[A-Z]{2}$~', $search_lang))
+		$search_lang = LANGUAGE;
+} else {
+	$search_lang = LANGUAGE;
+}
+
 // Get the path to search into. Normally left blank
+// ATTN: since wb2.7.1 the path is evaluated as SQL: LIKE "/path%" - which will find "/path.php", "/path/info.php", ...; But not "/de/path.php"
+// Add a '%' in front of each path to get SQL: LIKE "%/path%"
 /* possible values:
  * - a single path: "/en/" - search only pages whose link contains 'path' ("/en/machinery/bender-x09")
- * - a bunch of alternative pathes: "/en/,/machinery/,docs/" - alternatives paths, seperated by comma
- * - a bunch of paths to exclude: "-/about,/info,/jp/,/light" - search all, exclude these.
+ * - a single path not to search into: "-/help" - search all, exclude /help...
+ * - a bunch of alternative paths: "/en/,%/machinery/,/docs/" - alternative paths, separated by commas
+ * - a bunch of paths to exclude: "-/about,%/info,/jp/,/light" - search all, exclude these.
  * These different styles can't be mixed.
  */
-$search_path_SQL = "";
-$search_path = "";
+// ATTN: in wb2.7.0 "/en/" matched all links with "/en/" somewhere in the link: "/info/en/intro.php", "/en/info.php", ...
+// since wb2.7.1 "/en/" matches only links _starting_  with "/en/": "/en/intro/info.php"
+// use "%/en/" (or "%/en/,%/info", ... - no blank after the comma, it would become part of the path) to get the old behavior
+$search_path_SQL = '';
+$search_path = '';
 if(isset($_REQUEST['search_path'])) {
-	$search_path = $wb->add_slashes($_REQUEST['search_path']);
-	if(!preg_match('~^[-a-zA-Z0-9_,/ ]+$~', $search_path))
+	$search_path = addslashes(htmlspecialchars(strip_tags($wb->strip_slashes($_REQUEST['search_path']))));
+	if(!preg_match('~^[-a-zA-Z0-9_,/ %]+$~', $search_path)) // whitelist; '%' may prefix ANY path of the list ("/en/,%/info", "-%/help"), not only the first one
 		$search_path = '';
 	if($search_path != '') {
-		$search_path_SQL = "AND ( ";
-		$not = "";
-		$op = "OR";
+		$search_path_SQL = 'AND ( ';
+		$not = '';
+		$op = 'OR';
 		if($search_path[0] == '-') {
-			$not = "NOT";
-			$op = "AND";
+			$not = 'NOT';
+			$op = 'AND';
 			$paths = explode(',', substr($search_path, 1) );
 		} else {
 			$paths = explode(',',$search_path);
@@ -144,11 +160,11 @@
 		$i=0;
 		foreach($paths as $p) {
 			if($i++ > 0) {
-				$search_path_SQL .= " $op";
+				$search_path_SQL .= ' '.$op; // concatenate: inside single quotes $op would NOT be expanded and the SQL would contain the literal text "$op"
 			}
-			$search_path_SQL .= " link $not LIKE '%$p%'";			
+			$search_path_SQL .= " link $not LIKE '".$p."%'"; // anchored to the beginning of link; a leading '%' in $p restores the "contains" match
 		}
-		$search_path_SQL .= " )";
+		$search_path_SQL .= ' )';
 	}
 }
 
@@ -167,13 +183,13 @@
 $search_normal_string = '';
 $search_entities_string = ''; // for SQL's LIKE
 $search_display_string = ''; // for displaying
-$search_url_string = ''; // for $_GET
+$search_url_string = ''; // for $_GET -- ATTN: unquoted! Will become urldecoded later
 $string = '';
 if(isset($_REQUEST['string'])) {
-	if($match!='exact') {
+	if($match!='exact') { // $string will be cleaned below
 		$string=str_replace(',', '', $_REQUEST['string']);
 	} else {
-		$string=$_REQUEST['string']; // $string will be cleaned below
+		$string=$_REQUEST['string'];
 	}
 	// redo possible magic quotes
 	$string = $wb->strip_slashes($string);
@@ -187,12 +203,15 @@
 	$search_entities_string = str_replace('\\\\', '\\\\\\\\', $search_entities_string);
 	// convert string to utf-8
 	$string = entities_to_umlauts($string, 'UTF-8');
-	// quote ' " and /  -we need quoted / for regex
 	$search_url_string = $string;
 	$string = preg_quote($string);
+	// quote ' " and /  -we need quoted / for regex
 	$search_normal_string = str_replace(array('\'','"','/'), array('\\\'','\"','\/'), $string);
 }
 // make arrays from the search_..._strings above
+if($match == 'exact') // exact match: the whole phrase stays one single entry
+	$search_url_array = array($search_url_string); // assign instead of append ([]), so a value already present in $search_url_array cannot survive
+else // any other match type: one entry per blank-separated word
 $search_url_array = explode(' ', $search_url_string);
 $search_normal_array = array();
 $search_entities_array = array();
@@ -220,9 +239,6 @@
 $search_words = array();
 foreach($search_normal_array AS $str) {
 	$str = strtr($str, $string_ul_umlauts);
-	// special-feature: '|' means word-boundary (\b). Searching for 'the|' will find the, but not thema.
-	// this doesn't(?) work correctly for unicode-chars: '|test' will work, but '|über' not.
-	$str = strtr($str, array('\\|'=>'\b'));
 	$search_words[] = $str;
 }
 
Index: trunk/wb/search/search_convert.php
===================================================================
--- trunk/wb/search/search_convert.php	(revision 868)
+++ trunk/wb/search/search_convert.php	(revision 869)
@@ -32,11 +32,17 @@
 	header('Location: ../index.php');
 	exit(0);
 }
+if(!isset($search_lang)) $search_lang = LANGUAGE;
 
 //umlauts to '(upper|lower)' for preg_match()
 //this is UTF-8-encoded
 $string_ul_umlauts = array(
+	"i" => "(?:i|\xc4\xb0|I)",
+	"I" => "(?:I|\xc4\xb1|i)",
+	"k" => "(?:k|\xe2\x84\xaa|K)",
+	"S" => "(?:S|\xc5\xbf|s)",
 	"\xc2\xb5" => "(?:\xc2\xb5|\xce\x9c)",
+	"\xc3\x9f" => "(?:\xc3\x9f|SS|ss)", // german ß
 	"\xc3\xa0" => "(?:\xc3\xa0|\xc3\x80)",
 	"\xc3\xa1" => "(?:\xc3\xa1|\xc3\x81)",
 	"\xc3\xa2" => "(?:\xc3\xa2|\xc3\x82)",
@@ -1460,10 +1466,17 @@
 	"\xf0\x90\x90\xa3" => "(?:\xf0\x90\x90\xa3|\xf0\x90\x91\x8b)",
 	"\xf0\x90\x90\xa4" => "(?:\xf0\x90\x90\xa4|\xf0\x90\x91\x8c)",
 	"\xf0\x90\x90\xa5" => "(?:\xf0\x90\x90\xa5|\xf0\x90\x91\x8d)"
-	//"i" => "(?:i|\xc4\xb0|I)",
-	//"I" => "(?:I|\xc4\xb1|i)",
-	//"k" => "(?:k|\xe2\x84\xaa|K)",
-	//"S" => "(?:S|\xc5\xbf|s)"
 );
 
+
+if($search_lang=='DE') {
+	$string_ul_umlauts["\xc3\x9f"] = "(?:\xc3\x9f|SS|ss)"; // german ß
+	$string_ul_umlauts["\xc3\xa4"] = "(?:\xc3\xa4|\xc3\x84|ae|Ae)"; // german ä
+	$string_ul_umlauts["\xc3\xb6"] = "(?:\xc3\xb6|\xc3\x96|oe|Oe)"; // german ö
+	$string_ul_umlauts["\xc3\xbc"] = "(?:\xc3\xbc|\xc3\x9c|ue|Ue)"; // german ü
+	$string_ul_umlauts["\xc3\x84"] = "(?:\xc3\x84|\xc3\xa4|Ae|ae)"; // german Ä
+	$string_ul_umlauts["\xc3\x96"] = "(?:\xc3\x96|\xc3\xb6|Oe|oe)"; // german Ö
+	$string_ul_umlauts["\xc3\x9c"] = "(?:\xc3\x9c|\xc3\xbc|Ue|ue)"; // german Ü
+}
+
 ?>
\ No newline at end of file
Index: trunk/wb/search/search_modext.php
===================================================================
--- trunk/wb/search/search_modext.php	(revision 868)
+++ trunk/wb/search/search_modext.php	(revision 869)
@@ -101,7 +101,8 @@
 	}
 	$regex='/(?:^|\b|['.$str1.'])([^'.$str1.']{0,200}?'.$word.'[^'.$str2.']{0,200}(?:['.$str2.']|\b|$))/isu';
 	if(version_compare(PHP_VERSION, '4.3.3', '>=') &&
-	strpos(strtoupper(PHP_OS), 'WIN')!==0) { // this may crash windows server, so skip if on windows
+	   strpos(strtoupper(PHP_OS), 'WIN')!==0
+	) { // this may crash windows server, so skip if on windows
 		// jump from match to match, get excerpt, stop if $max_excerpt_num is reached
 		$last_end = 0; $offset = 0;
 		while(preg_match('/'.$word.'/Sisu', $text, $match_array, PREG_OFFSET_CAPTURE, $last_end)) {
@@ -115,11 +116,11 @@
 					if(count($excerpt_array) >= $max_excerpt_num)
 						break;
 				}
-			} else { // problem - preg_match failed: can't find a start- or stop-sign
+			} else { // problem: preg_match failed - can't find a start- or stop-sign
 				$last_end += 201; // jump forward and try again
 			}
 		}
-	} else { // compatile, but may be very slow with many large pages
+	} else { // compatible, but may be very slow with large pages
 		if(preg_match_all($regex, $text, $match_array)) {
 			foreach($match_array[1] AS $string) {
 				if(!preg_match('/\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\./', $string)) // skip excerpts with email-addresses
@@ -173,7 +174,7 @@
 		if($match && is_array($match[0])) {
 			$x=$match[0][1]; // position of first match
 			// is there an anchor nearby?
-			if(preg_match_all('/<(?:[^>]+id|\s*a[^>]+name)\s*=\s*"(.*)"/SiU', substr($text,0,$x), $match, PREG_OFFSET_CAPTURE)) {
+			if(preg_match_all('/<(?:[^>]+id|\s*a[^>]+name)\s*=\s*"(.*)"/iU', substr($text,0,$x), $match, PREG_OFFSET_CAPTURE)) {
 				$anchor='';
 				foreach($match[1] AS $array) {
 					if($array[1] > $x) {
@@ -234,6 +235,7 @@
 	if(!isset($mod_pic_link))           $mod_pic_link = "";
 	if(!isset($mod_no_highlight))       $mod_no_highlight = false;
 	if(!isset($func_enable_flush))      $func_enable_flush = false; // set this in db: wb_search.cfg_enable_flush [READ THE DOC BEFORE]
+	if(isset($mod_ext_charset) && $mod_ext_charset!='utf-8') $mod_ext_charset = 'utf-8'; // only utf-8 is allowed, yet. For other charset see DOCU
 	if($mod_text == "") // nothing to do
 		{ return false; }
 	if($mod_no_highlight) // no highlighting
@@ -243,10 +245,16 @@
 	$mod_text = preg_replace('#<(br( /)?|dt|/dd|/?(h[1-6]|tr|table|p|li|ul|pre|code|div|hr))[^>]*>#Si', '.', $mod_text);
 	$mod_text = preg_replace('/\s+/', ' ', $mod_text);
 	$mod_text = preg_replace('/ \./', '.', $mod_text);
+	if(isset($mod_ext_charset)) { // data from external database may have a different charset
+		require_once(WB_PATH.'/framework/functions-utf8.php');
+		$mod_text = charset_to_utf8($mod_text, $mod_ext_charset);
+	} else {
 	$mod_text = entities_to_umlauts($mod_text, 'UTF-8');
+	}
 	$anchor_text = $mod_text; // make an copy containing html-tags
 	$mod_text = strip_tags($mod_text);
 	$mod_text = str_replace(array('&gt;','&lt;','&amp;','&quot;','&#39;','&apos;','&nbsp;'), array('>','<','&','"','\'','\'',"\xC2\xA0"), $mod_text);
+	$mod_text = '.'.trim($mod_text).'.';
 	// Do a fast scan over $mod_text first. This will speedup things a lot.
 	if($func_search_match == 'all') {
 		if(!is_all_matched($mod_text, $func_search_words))
