Репозиторий ALT Linux backports/2.4
Последнее обновление: 9 июля 2008 | Пакетов: 497 | Посещений: 1587446
 поиск   регистрация   авторизация 
 
Группа :: Базы Данных
Пакет: xapian

 Главная   Изменения   Спек   Патчи   Загрузить   Bugs and FR 

Патч: xapian-qp-utf8-0.9.2.patch


diff -x *.rej -x *.orig -x Makefile -x *.lo -x *.o -x *.la -x .libs -x .deps -prNU2 xapian-core-0.9.2-orig/queryparser/Makefile.am xapian-core-0.9.2/queryparser/Makefile.am
--- xapian-core-0.9.2-orig/queryparser/Makefile.am	Fri Jul 15 12:21:35 2005
+++ xapian-core-0.9.2/queryparser/Makefile.am	Sun Sep 18 17:06:41 2005
@@ -1,5 +1,5 @@
 ## Process this file with automake to produce Makefile.in
 
-INCLUDES = -I$(top_srcdir)/include -I${top_builddir}/include -I$(top_srcdir)/common -I$(top_srcdir)/api
+INCLUDES = -I$(top_srcdir)/include -I${top_builddir}/include -I$(top_srcdir)/common -I$(top_srcdir)/api -I/usr/include/glib-2.0 -I/usr/lib/glib-2.0/include
 
 noinst_HEADERS = accentnormalisingitor.h symboltab.h queryparser_internal.h \
@@ -26,2 +26,3 @@ endif
 
 libqueryparser_la_SOURCES = queryparser.cc queryparser_internal.cc
+libqueryparser_la_LIBADD = /usr/lib/libglib-2.0.la
diff -x *.rej -x *.orig -x Makefile -x *.lo -x *.o -x *.la -x .libs -x .deps -prNU2 xapian-core-0.9.2-orig/queryparser/Makefile.in xapian-core-0.9.2/queryparser/Makefile.in
--- xapian-core-0.9.2-orig/queryparser/Makefile.in	Fri Jul 15 12:23:02 2005
+++ xapian-core-0.9.2/queryparser/Makefile.in	Fri Oct  7 04:39:35 2005
@@ -55,5 +55,5 @@ CONFIG_HEADER = $(top_builddir)/config.h
 CONFIG_CLEAN_FILES =
 LTLIBRARIES = $(noinst_LTLIBRARIES)
-libqueryparser_la_LIBADD =
+libqueryparser_la_LIBADD = /usr/lib/libglib-2.0.la
 am_libqueryparser_la_OBJECTS = queryparser.lo queryparser_internal.lo
 libqueryparser_la_OBJECTS = $(am_libqueryparser_la_OBJECTS)
@@ -201,5 +201,5 @@ sharedstatedir = @sharedstatedir@
 sysconfdir = @sysconfdir@
 target_alias = @target_alias@
-INCLUDES = -I$(top_srcdir)/include -I${top_builddir}/include -I$(top_srcdir)/common -I$(top_srcdir)/api
+INCLUDES = -I$(top_srcdir)/include -I${top_builddir}/include -I$(top_srcdir)/common -I$(top_srcdir)/api -I/usr/include/glib-2.0 -I/usr/lib/glib-2.0/include
 noinst_HEADERS = accentnormalisingitor.h symboltab.h queryparser_internal.h \
 	queryparser_token.h
diff -x *.rej -x *.orig -x Makefile -x *.lo -x *.o -x *.la -x .libs -x .deps -prNU2 xapian-core-0.9.2-orig/queryparser/accentnormalisingitor.h xapian-core-0.9.2/queryparser/accentnormalisingitor.h
--- xapian-core-0.9.2-orig/queryparser/accentnormalisingitor.h	Fri Jul 15 12:21:35 2005
+++ xapian-core-0.9.2/queryparser/accentnormalisingitor.h	Sun Sep 18 17:06:41 2005
@@ -20,4 +20,6 @@
 
 #include "symboltab.h"
+#include <glib/gmem.h>
+#include <glib/gunicode.h>
 
 #include <string>
@@ -25,32 +26,41 @@
 using std::string;
 
-/** A wrapper class for a char which returns the char if dereferenced 
+typedef gunichar char_type;
+
+/** A wrapper class for a char_type which returns the char_type if dereferenced 
  *  with *.  We need this to implement input_iterator semantics.
  */
 class CharWrapper {
     private:
-	char ch;
+	char_type ch;
     public:
-	CharWrapper(char ch_) : ch(ch_) { }
-	char operator*() const { return ch; }
+	CharWrapper(char_type ch_) : ch(ch_) { }
+	char_type operator*() const { return ch; }
 };
 
 class AccentNormalisingItor {
   private:
-    string::const_iterator itor;
-    char queued;
+    /*string::const_iterator*/const gchar * itor;
+    const gchar * end;
+
+    char_type queued;
     size_t trans;
 
   public:
     AccentNormalisingItor()
-	: itor(), queued(0), trans(0) {}
-    AccentNormalisingItor(string::const_iterator itor_)
-	: itor(itor_), queued(0), trans(0) {}
+	: itor(NULL), end(NULL), queued(0), trans(0) {}  // initialise end too, so copying a default-constructed iterator never reads an indeterminate pointer
+    explicit AccentNormalisingItor(const char * itor_)  // empty range: itor and end both point at itor_
+	: itor(itor_), end(itor_), queued(0), trans(0) {}
+    AccentNormalisingItor(const char * itor_, const char *end_)  // iterates the bytes in [itor_, end_)
+	: itor(itor_), end(end_), queued(0), trans(0) {}
+#if 0
     void operator=(string::const_iterator itor_)
     {
 	itor = itor_;
+	end = end_;
 	queued = 0;
 	trans = 0;
     }
+#endif
     bool operator==(const AccentNormalisingItor &o) const {
 	return queued == o.queued && itor == o.itor;
@@ -59,13 +69,13 @@ class AccentNormalisingItor {
 	return !(*this == o);
     }
-    char operator*() const {
+    char_type operator*() const {  // returns queued when it is non-zero, otherwise the code point decoded at itor
 	if (queued) return queued;
-	unsigned char ch = (unsigned char)*itor;
+	char_type ch = g_utf8_get_char_validated(itor, end - itor);  // per GLib docs: (gunichar)-1 / -2 for invalid / truncated input; both fail the < 0x240 test below
 	if (ch >= 160
-#if CHAR_BIT > 8 // Avoid compiler warning.
-		      && ch < 256
-#endif
-				 ) return TRANSLIT1[ch - 160];
-	return (char)ch;
+//#if CHAR_BIT > 8 // Avoid compiler warning.
+		      && ch < 0x240  // NOTE(review): indexes TRANSLIT1 up to 0x23F - 160 = 415; confirm the table (defined elsewhere) has that many entries
+//#endif
+				 ) ch = /*return*/ char_type(TRANSLIT1[ch - 160]);
+	return /*(char)*/ch;
     }
     AccentNormalisingItor & operator++() {
@@ -73,9 +83,9 @@ class AccentNormalisingItor {
 	    queued = 0;
 	} else {
-	    unsigned char ch = (unsigned char)*itor;
+	    char_type ch = g_utf8_get_char_validated(itor, end - itor);
 	    if (ch >= 160
-#if CHAR_BIT > 8 // Avoid compiler warning.
-			  && ch < 256
-#endif
+//#if CHAR_BIT > 8 // Avoid compiler warning.
+		      && ch < 0x240
+//#endif
 				     ) {
 		++trans;
@@ -87,14 +97,20 @@ class AccentNormalisingItor {
 	    }
 	}
-	++itor;
+	// ++itor;  becomes:
+	size_t skip = g_utf8_skip[*reinterpret_cast<const guchar *>(itor)];
+	if (size_t(end - itor) < skip) {
+	    itor = end;
+	} else {
+	    itor += skip;
+	}
 	return *this;
     }
     CharWrapper operator++(int) {
-	char tmp = **this;
+	char_type tmp = **this;
 	operator++();
 	return CharWrapper(tmp);
     }
     size_t transliterations() const { return trans; }
-    string::const_iterator raw() const { return itor; }
+    //string::const_iterator raw() const { return itor; }
 
     /// We implement the semantics of an STL input_iterator.
diff -x *.rej -x *.orig -x Makefile -x *.lo -x *.o -x *.la -x .libs -x .deps -prNU2 xapian-core-0.9.2-orig/queryparser/queryparser.lemony xapian-core-0.9.2/queryparser/queryparser.lemony
--- xapian-core-0.9.2-orig/queryparser/queryparser.lemony	Fri Jul 15 12:21:35 2005
+++ xapian-core-0.9.2/queryparser/queryparser.lemony	Sun Sep 18 17:06:41 2005
@@ -37,4 +37,39 @@ using namespace std;
 using namespace Xapian;
 
+static inline bool
+U_isupper(gunichar ch) {  // true only for code points below 128 that C_isupper() accepts
+    return (ch < 128 && C_isupper(ch));
+}
+
+static inline bool
+U_isspace(gunichar ch) {  // true only for code points below 128 that C_isspace() accepts
+    return (ch < 128 && C_isspace(ch));
+}
+
+static inline bool
+U_isnotspace(gunichar ch) {  // negation of U_isspace(); passed to find_if as a predicate
+    return !U_isspace(ch);
+}
+
+static inline bool
+U_isalnum(gunichar ch) {  // true only for code points below 128 that C_isalnum() accepts
+    return (ch < 128 && C_isalnum(ch));
+}
+
+static inline bool
+U_isnotalnum(gunichar ch) {  // negation of U_isalnum()
+    return !U_isalnum(ch);
+}
+
+static inline bool
+U_issign(gunichar ch) {  // true only for code points below 128 that C_issign() accepts
+    return (ch < 128 && C_issign(ch));
+}
+
+static inline bool
+G_unichar_isnotalnum(gunichar ch) {  // negation of GLib's g_unichar_isalnum(); passed to find_if as a predicate
+    return !g_unichar_isalnum(ch);
+}
+
 // Disable debug code lemon adds.
 #define NDEBUG
@@ -118,25 +153,25 @@ static inline string
 downcase_term(const string &term)
 {
-    string t;
-    t.reserve(term.size());
-    AccentNormalisingItor i(term.begin());
-    const AccentNormalisingItor end(term.end());
-    while (i != end) t += C_tolower(*i++);
+    // g_utf8_strdown() returns a newly allocated, lower-cased UTF-8 copy.
+    gchar * r = g_utf8_strdown(term.data(), term.length());
+    string t(r);
+    // Memory handed out by GLib must be released with g_free(), not free().
+    g_free(r);
     return t;
 }
 
 static inline bool
-is_phrase_generator(unsigned char ch)
+is_phrase_generator(gunichar ch)
 {
     // These characters generate a phrase search.
     // Ordered mostly by frequency of calls to this function done when
     // running queryparsertest.
-    return (ch && strchr(".-/':\\_@", ch) != NULL);
+    return (ch && ch < 128 && strchr(".-/':\\_@", ch) != NULL);  // code points >= 128 are rejected before strchr(), which compares its argument as a single char
 }
 
 static inline bool
-prefix_needs_colon(const string & prefix, unsigned char ch)
+prefix_needs_colon(const string & prefix, gunichar ch)
 {
-    if (!C_isupper(ch)) return false;
+    if (!U_isupper(ch)) return false;
     string::size_type len = prefix.length();
     return (len > 1 && prefix[len - 1] != ':');
@@ -151,4 +186,5 @@ Query
 QueryParser::Internal::parse_query(const string &qs, unsigned flags)
 {
+    gchar ubuf[6];
 #ifndef NDEBUG
     // Set the prefix added to Lemon's debug output, if it's enabled.
@@ -161,27 +197,27 @@ QueryParser::Internal::parse_query(const
 
     termpos term_pos = 1;
-    AccentNormalisingItor it(qs.begin()), end(qs.end());
+    AccentNormalisingItor it(qs.data(), qs.data() + qs.size()), end(qs.data() + qs.size());
 
     State state(this);
 
     enum { DEFAULT, IN_QUOTES, IN_PHRASED_TERM } mode = DEFAULT;
-    unsigned char newprev = ' ';
+    gunichar newprev = ' ';
     while (it != end) {
 	if (mode == IN_PHRASED_TERM) mode = DEFAULT;
-	if (C_isspace(*it)) {
+	if (U_isspace(*it)) {
 	    newprev = ' ';
 	    ++it;
-	    it = find_if(it, end, C_isnotspace);
+	    it = find_if(it, end, U_isnotspace);
 	    if (it == end) break;
 	}
 
-	if (!C_isalnum(*it)) {
-	    unsigned char prev = newprev;
-	    unsigned char ch = *it++;
+	if (!g_unichar_isalnum(*it)) {
+	    gunichar prev = newprev;
+	    gunichar ch = *it++;
 	    if (it != end) newprev = *it;
 	    switch (ch) {
 	      case '"':
 		// Skip whitespace.
-		it = find_if(it, end, C_isnotspace);
+		it = find_if(it, end, U_isnotspace);
 		if (mode != IN_QUOTES) {
 		    if (it == end) {
@@ -216,5 +252,5 @@ QueryParser::Internal::parse_query(const
 		    continue;
 		}
-		if (C_isspace(*it) || *it == '+' || *it == '-') {
+		if (U_isspace(*it) || *it == '+' || *it == '-') {
 		    // Ignore + or - followed by a space, or further + or -.
 		    // Postfix + (such as in C++ and H+) is handled as part of
@@ -229,5 +265,5 @@ QueryParser::Internal::parse_query(const
 	      case '(':
 		// Skip whitespace.
-		it = find_if(it, end, C_isnotspace);
+		it = find_if(it, end, U_isnotspace);
 		// Ignore ( at end of query.
 		if (it == end) goto done;
@@ -264,13 +300,14 @@ QueryParser::Internal::parse_query(const
 	if (mode == DEFAULT && !prefixes.empty()) {
 	    // Check for fieldname prefixes (e.g. title:historical).
-	    AccentNormalisingItor p = find_if(it, end, C_isnotalnum);
+	    AccentNormalisingItor p = find_if(it, end, G_unichar_isnotalnum);
 	    if (p != end && *p == ':' && ++p != end) {
-		unsigned char ch = *p;
-		if (C_isalnum(ch) ||
+		gunichar ch = *p;
+		if (g_unichar_isalnum(ch) ||
 		    ((flags & FLAG_PHRASE) && ch == '"') || 
 		    ((flags & FLAG_BOOLEAN) && ch == '(')) {
 		    string field;
 		    p = it;
-		    while (*p != ':') field += *p++;
+		    while (*p != ':')
+			field += string(ubuf, g_unichar_to_utf8(*p++, ubuf));
 		    map<string, pair<bool, string> >::const_iterator f;
 		    f = prefixes.find(field);
@@ -278,8 +315,8 @@ QueryParser::Internal::parse_query(const
 			// Can't boolean prefix a subexpression or phrase.
 			bool boolean_filter = f->second.first;
-			if (!boolean_filter || C_isalnum(ch)) {
+			if (!boolean_filter || g_unichar_isalnum(ch)) {
 			    it = p;
 			    ++it;
-			    if (!C_isalnum(ch)) {
+			    if (!g_unichar_isalnum(ch)) {
 				newprev = ch;
 				++it;
@@ -300,5 +337,5 @@ QueryParser::Internal::parse_query(const
 				    prefix += ':';
 				while (it != end && *it > ' ' && *it != ')')
-				    prefix += *it++;
+				    prefix += string(ubuf, g_unichar_to_utf8(*it++, ubuf));
 				Parse(pParser, BOOLEAN_FILTER,
 				      new Term(prefix, 0), &state);
@@ -316,10 +353,10 @@ phrased_term:
 	// Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E).
 	// Don't worry if there's a trailing '.' or not.
-	if (C_isupper(*it)) {
+	if (U_isupper(*it)) {
 	    string t;
 	    AccentNormalisingItor p = it;
 	    do {
-		t += *p++;
-	    } while (p != end && *p == '.' && ++p != end && C_isupper(*p));
+		t += string(ubuf, g_unichar_to_utf8(*p++, ubuf));
+	    } while (p != end && *p == '.' && ++p != end && U_isupper(*p));
 	    // One letter does not make an acronym!  If we handled a single
 	    // uppercase letter here, we wouldn't catch M&S below.
@@ -327,5 +364,5 @@ phrased_term:
 		// Check there's not a (lower case) letter or digit
 		// immediately after it.
-		if (p == end || !C_isalnum(*p)) {
+		if (p == end || !g_unichar_isalnum(*p)) {
 		    it = p;
 		    swap(term, t);
@@ -337,5 +374,5 @@ phrased_term:
 	if (term.empty()) {
 	    while (it != end) {
-		if (!C_isalnum(*it)) {
+		if (!g_unichar_isalnum(*it)) {
 		    // Treat a single embedded '&' as a word character
 		    // (e.g. AT&T).
@@ -343,9 +380,9 @@ phrased_term:
 		    AccentNormalisingItor p = it;
 		    ++p;
-		    if (p == end || !C_isalnum(*p)) break;
+		    if (p == end || !g_unichar_isalnum(*p)) break;
 		}
-		term += *it++;
+		term.append(ubuf, g_unichar_to_utf8(*it++, ubuf));
 	    }
-	    if (it != end && (*it == '#' || C_issign(*it))) {
+	    if (it != end && (*it == '#' || U_issign(*it))) {
 		string suff_term = term;
 		AccentNormalisingItor p = it;
@@ -363,8 +400,8 @@ phrased_term:
 		    // to what combinations are allowed.
 		    do {
-			suff_term += *p++;
-		    } while (p != end && C_issign(*p));
+ 			suff_term += string(ubuf, g_unichar_to_utf8(*p++, ubuf));
+ 		    } while (p != end && U_issign(*p));
 		}
-		if (p == end || !C_isalnum(*p)) {
+ 		if (p == end || !g_unichar_isalnum(*p)) {
 		    // If the suffixed term doesn't exist, check that the
 		    // non-suffixed term does.  This also takes care of
@@ -383,5 +420,5 @@ phrased_term:
 	    // Don't want to interpret A.N.D. or аND as an AND operator.
 	    if (!was_acronym && transliterations == it.transliterations()) {
-		if (prefix.empty() && !term.empty() && C_isalpha(term[0])) {
+ 		if (prefix.empty() && !term.empty() && U_isupper(term[0])) {
 		    if (C_isupper(term[0])) {
 			if (term == "AND") {
@@ -433,5 +470,5 @@ phrased_term:
 	    // e.g. "example.com" should give a phrase search for "exampl"
 	    // and "com", not "example" and "com".
-	    if (p == end || C_isspace(*p)) {
+	    if (p == end || U_isspace(*p)) {
 		it = p;
 		// If topterms added a term with a trailing '.', it will be
@@ -439,5 +476,5 @@ phrased_term:
 		// initial in someone's name, a full stop in pasted text or
 		// something like that.
-		if (!C_isupper(term[0])) {
+		if (!U_isupper(term[0])) {
 		    unstemmed_term = term + '.';
 		    need_to_stem = false;
@@ -449,5 +486,5 @@ phrased_term:
 	term = downcase_term(term);
 	if (need_to_stem) {
-	    if (stem_action == STEM_SOME && C_isupper(unstemmed_term[0]))
+	    if (stem_action == STEM_SOME && U_isupper(unstemmed_term[0]))
 		term = 'R' + term;
 	    else 
@@ -494,5 +531,5 @@ phrased_term:
 	    // Don't generate a phrase unless the phrase generators are
 	    // immediately followed by another term.
-	    if (it != end && C_isalnum(*it)) {
+	    if (it != end && g_unichar_isalnum(*it)) {
 		mode = IN_PHRASED_TERM;
 		goto phrased_term;
diff -x *.rej -x *.orig -x Makefile -x *.lo -x *.o -x *.la -x .libs -x .deps -prNU2 xapian-core-0.9.2-orig/queryparser/queryparser_internal.cc xapian-core-0.9.2/queryparser/queryparser_internal.cc
--- xapian-core-0.9.2-orig/queryparser/queryparser_internal.cc	Fri Jul 15 12:27:03 2005
+++ xapian-core-0.9.2/queryparser/queryparser_internal.cc	Sun Sep 18 20:34:56 2005
@@ -43,4 +43,39 @@ using namespace std;
 using namespace Xapian;
 
+static inline bool
+U_isupper(gunichar ch) {  // true only for code points below 128 that C_isupper() accepts
+    return (ch < 128 && C_isupper(ch));
+}
+
+static inline bool
+U_isspace(gunichar ch) {  // true only for code points below 128 that C_isspace() accepts
+    return (ch < 128 && C_isspace(ch));
+}
+
+static inline bool
+U_isnotspace(gunichar ch) {  // negation of U_isspace(); passed to find_if as a predicate
+    return !U_isspace(ch);
+}
+
+static inline bool
+U_isalnum(gunichar ch) {  // true only for code points below 128 that C_isalnum() accepts
+    return (ch < 128 && C_isalnum(ch));
+}
+
+static inline bool
+U_isnotalnum(gunichar ch) {  // negation of U_isalnum()
+    return !U_isalnum(ch);
+}
+
+static inline bool
+U_issign(gunichar ch) {  // true only for code points below 128 that C_issign() accepts
+    return (ch < 128 && C_issign(ch));
+}
+
+static inline bool
+G_unichar_isnotalnum(gunichar ch) {  // negation of GLib's g_unichar_isalnum(); passed to find_if as a predicate
+    return !g_unichar_isalnum(ch);
+}
+
 // Disable debug code lemon adds.
 #define NDEBUG
@@ -124,25 +159,25 @@ static inline string
 downcase_term(const string &term)
 {
-    string t;
-    t.reserve(term.size());
-    AccentNormalisingItor i(term.begin());
-    const AccentNormalisingItor end(term.end());
-    while (i != end) t += C_tolower(*i++);
+    // g_utf8_strdown() returns a newly allocated, lower-cased UTF-8 copy.
+    gchar * r = g_utf8_strdown(term.data(), term.length());
+    string t(r);
+    // Memory handed out by GLib must be released with g_free(), not free().
+    g_free(r);
     return t;
 }
 
 static inline bool
-is_phrase_generator(unsigned char ch)
+is_phrase_generator(gunichar ch)
 {
     // These characters generate a phrase search.
     // Ordered mostly by frequency of calls to this function done when
     // running queryparsertest.
-    return (ch && strchr(".-/':\\_@", ch) != NULL);
+    return (ch && ch < 128 && strchr(".-/':\\_@", ch) != NULL);  // code points >= 128 are rejected before strchr(), which compares its argument as a single char
 }
 
 static inline bool
-prefix_needs_colon(const string & prefix, unsigned char ch)
+prefix_needs_colon(const string & prefix, gunichar ch)
 {
-    if (!C_isupper(ch)) return false;
+    if (!U_isupper(ch)) return false;
     string::size_type len = prefix.length();
     return (len > 1 && prefix[len - 1] != ':');
@@ -157,4 +192,5 @@ Query
 QueryParser::Internal::parse_query(const string &qs, unsigned flags)
 {
+    gchar ubuf[6];
 #ifndef NDEBUG
     // Set the prefix added to Lemon's debug output, if it's enabled.
@@ -167,27 +203,27 @@ QueryParser::Internal::parse_query(const
 
     termpos term_pos = 1;
-    AccentNormalisingItor it(qs.begin()), end(qs.end());
+    AccentNormalisingItor it(qs.data(), qs.data() + qs.size()), end(qs.data() + qs.size());
 
     State state(this);
 
     enum { DEFAULT, IN_QUOTES, IN_PHRASED_TERM } mode = DEFAULT;
-    unsigned char newprev = ' ';
+    gunichar newprev = ' ';
     while (it != end) {
 	if (mode == IN_PHRASED_TERM) mode = DEFAULT;
-	if (C_isspace(*it)) {
+	if (U_isspace(*it)) {
 	    newprev = ' ';
 	    ++it;
-	    it = find_if(it, end, C_isnotspace);
+	    it = find_if(it, end, U_isnotspace);
 	    if (it == end) break;
 	}
 
-	if (!C_isalnum(*it)) {
-	    unsigned char prev = newprev;
-	    unsigned char ch = *it++;
+	if (!g_unichar_isalnum(*it)) {
+	    gunichar prev = newprev;
+	    gunichar ch = *it++;
 	    if (it != end) newprev = *it;
 	    switch (ch) {
 	      case '"':
 		// Skip whitespace.
-		it = find_if(it, end, C_isnotspace);
+		it = find_if(it, end, U_isnotspace);
 		if (mode != IN_QUOTES) {
 		    if (it == end) {
@@ -222,5 +258,5 @@ QueryParser::Internal::parse_query(const
 		    continue;
 		}
-		if (C_isspace(*it) || *it == '+' || *it == '-') {
+		if (U_isspace(*it) || *it == '+' || *it == '-') {
 		    // Ignore + or - followed by a space, or further + or -.
 		    // Postfix + (such as in C++ and H+) is handled as part of
@@ -235,5 +271,5 @@ QueryParser::Internal::parse_query(const
 	      case '(':
 		// Skip whitespace.
-		it = find_if(it, end, C_isnotspace);
+		it = find_if(it, end, U_isnotspace);
 		// Ignore ( at end of query.
 		if (it == end) goto done;
@@ -270,13 +306,14 @@ QueryParser::Internal::parse_query(const
 	if (mode == DEFAULT && !prefixes.empty()) {
 	    // Check for fieldname prefixes (e.g. title:historical).
-	    AccentNormalisingItor p = find_if(it, end, C_isnotalnum);
+	    AccentNormalisingItor p = find_if(it, end, G_unichar_isnotalnum);
 	    if (p != end && *p == ':' && ++p != end) {
-		unsigned char ch = *p;
-		if (C_isalnum(ch) ||
+		gunichar ch = *p;
+		if (g_unichar_isalnum(ch) ||
 		    ((flags & FLAG_PHRASE) && ch == '"') || 
 		    ((flags & FLAG_BOOLEAN) && ch == '(')) {
 		    string field;
 		    p = it;
-		    while (*p != ':') field += *p++;
+		    while (*p != ':')
+			field += string(ubuf, g_unichar_to_utf8(*p++, ubuf));
 		    map<string, pair<bool, string> >::const_iterator f;
 		    f = prefixes.find(field);
@@ -284,8 +321,8 @@ QueryParser::Internal::parse_query(const
 			// Can't boolean prefix a subexpression or phrase.
 			bool boolean_filter = f->second.first;
-			if (!boolean_filter || C_isalnum(ch)) {
+			if (!boolean_filter || g_unichar_isalnum(ch)) {
 			    it = p;
 			    ++it;
-			    if (!C_isalnum(ch)) {
+			    if (!g_unichar_isalnum(ch)) {
 				newprev = ch;
 				++it;
@@ -306,5 +343,5 @@ QueryParser::Internal::parse_query(const
 				    prefix += ':';
 				while (it != end && *it > ' ' && *it != ')')
-				    prefix += *it++;
+				    prefix += string(ubuf, g_unichar_to_utf8(*it++, ubuf));
 				Parse(pParser, BOOLEAN_FILTER,
 				      new Term(prefix, 0), &state);
@@ -322,10 +359,10 @@ phrased_term:
 	// Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E).
 	// Don't worry if there's a trailing '.' or not.
-	if (C_isupper(*it)) {
+	if (U_isupper(*it)) {
 	    string t;
 	    AccentNormalisingItor p = it;
 	    do {
-		t += *p++;
-	    } while (p != end && *p == '.' && ++p != end && C_isupper(*p));
+		t += string(ubuf, g_unichar_to_utf8(*p++, ubuf));
+	    } while (p != end && *p == '.' && ++p != end && U_isupper(*p));
 	    // One letter does not make an acronym!  If we handled a single
 	    // uppercase letter here, we wouldn't catch M&S below.
@@ -333,5 +370,5 @@ phrased_term:
 		// Check there's not a (lower case) letter or digit
 		// immediately after it.
-		if (p == end || !C_isalnum(*p)) {
+		if (p == end || !g_unichar_isalnum(*p)) {
 		    it = p;
 		    swap(term, t);
@@ -343,5 +380,5 @@ phrased_term:
 	if (term.empty()) {
 	    while (it != end) {
-		if (!C_isalnum(*it)) {
+		if (!g_unichar_isalnum(*it)) {
 		    // Treat a single embedded '&' as a word character
 		    // (e.g. AT&T).
@@ -349,9 +386,9 @@ phrased_term:
 		    AccentNormalisingItor p = it;
 		    ++p;
-		    if (p == end || !C_isalnum(*p)) break;
+		    if (p == end || !g_unichar_isalnum(*p)) break;
 		}
-		term += *it++;
+		term.append(ubuf, g_unichar_to_utf8(*it++, ubuf));
 	    }
-	    if (it != end && (*it == '#' || C_issign(*it))) {
+	    if (it != end && (*it == '#' || U_issign(*it))) {
 		string suff_term = term;
 		AccentNormalisingItor p = it;
@@ -369,8 +406,8 @@ phrased_term:
 		    // to what combinations are allowed.
 		    do {
-			suff_term += *p++;
-		    } while (p != end && C_issign(*p));
+ 			suff_term += string(ubuf, g_unichar_to_utf8(*p++, ubuf));
+ 		    } while (p != end && U_issign(*p));
 		}
-		if (p == end || !C_isalnum(*p)) {
+ 		if (p == end || !g_unichar_isalnum(*p)) {
 		    // If the suffixed term doesn't exist, check that the
 		    // non-suffixed term does.  This also takes care of
@@ -389,5 +426,5 @@ phrased_term:
 	    // Don't want to interpret A.N.D. or аND as an AND operator.
 	    if (!was_acronym && transliterations == it.transliterations()) {
-		if (prefix.empty() && !term.empty() && C_isalpha(term[0])) {
+ 		if (prefix.empty() && !term.empty() && U_isupper(term[0])) {
 		    if (C_isupper(term[0])) {
 			if (term == "AND") {
@@ -439,5 +476,5 @@ phrased_term:
 	    // e.g. "example.com" should give a phrase search for "exampl"
 	    // and "com", not "example" and "com".
-	    if (p == end || C_isspace(*p)) {
+	    if (p == end || U_isspace(*p)) {
 		it = p;
 		// If topterms added a term with a trailing '.', it will be
@@ -445,5 +482,5 @@ phrased_term:
 		// initial in someone's name, a full stop in pasted text or
 		// something like that.
-		if (!C_isupper(term[0])) {
+		if (!U_isupper(term[0])) {
 		    unstemmed_term = term + '.';
 		    need_to_stem = false;
@@ -455,5 +492,5 @@ phrased_term:
 	term = downcase_term(term);
 	if (need_to_stem) {
-	    if (stem_action == STEM_SOME && C_isupper(unstemmed_term[0]))
+	    if (stem_action == STEM_SOME && U_isupper(unstemmed_term[0]))
 		term = 'R' + term;
 	    else 
@@ -500,5 +537,5 @@ phrased_term:
 	    // Don't generate a phrase unless the phrase generators are
 	    // immediately followed by another term.
-	    if (it != end && C_isalnum(*it)) {
+	    if (it != end && g_unichar_isalnum(*it)) {
 		mode = IN_PHRASED_TERM;
 		goto phrased_term;
 
design & coding: Vladimir Lettiev aka crux © 2004-2005