Группа :: Базы Данных
        
    
Пакет: xapian
    
 Главная   Изменения   Спек   Патчи   Загрузить   Bugs and FR 
Патч: xapian-qp-utf8-0.9.2.patch
diff -x *.rej -x *.orig -x Makefile -x *.lo -x *.o -x *.la -x .libs -x .deps -prNU2 xapian-core-0.9.2-orig/queryparser/Makefile.am xapian-core-0.9.2/queryparser/Makefile.am
--- xapian-core-0.9.2-orig/queryparser/Makefile.am Fri Jul 15 12:21:35 2005
+++ xapian-core-0.9.2/queryparser/Makefile.am Sun Sep 18 17:06:41 2005
@@ -1,5 +1,5 @@
## Process this file with automake to produce Makefile.in
-INCLUDES = -I$(top_srcdir)/include -I${top_builddir}/include -I$(top_srcdir)/common -I$(top_srcdir)/api
+INCLUDES = -I$(top_srcdir)/include -I${top_builddir}/include -I$(top_srcdir)/common -I$(top_srcdir)/api -I/usr/include/glib-2.0 -I/usr/lib/glib-2.0/include
noinst_HEADERS = accentnormalisingitor.h symboltab.h queryparser_internal.h \
@@ -26,2 +26,3 @@ endif
libqueryparser_la_SOURCES = queryparser.cc queryparser_internal.cc
+libqueryparser_la_LIBADD = /usr/lib/libglib-2.0.la
diff -x *.rej -x *.orig -x Makefile -x *.lo -x *.o -x *.la -x .libs -x .deps -prNU2 xapian-core-0.9.2-orig/queryparser/Makefile.in xapian-core-0.9.2/queryparser/Makefile.in
--- xapian-core-0.9.2-orig/queryparser/Makefile.in Fri Jul 15 12:23:02 2005
+++ xapian-core-0.9.2/queryparser/Makefile.in Fri Oct 7 04:39:35 2005
@@ -55,5 +55,5 @@ CONFIG_HEADER = $(top_builddir)/config.h
CONFIG_CLEAN_FILES =
LTLIBRARIES = $(noinst_LTLIBRARIES)
-libqueryparser_la_LIBADD =
+libqueryparser_la_LIBADD = /usr/lib/libglib-2.0.la
am_libqueryparser_la_OBJECTS = queryparser.lo queryparser_internal.lo
libqueryparser_la_OBJECTS = $(am_libqueryparser_la_OBJECTS)
@@ -201,5 +201,5 @@ sharedstatedir = @sharedstatedir@
sysconfdir = @sysconfdir@
target_alias = @target_alias@
-INCLUDES = -I$(top_srcdir)/include -I${top_builddir}/include -I$(top_srcdir)/common -I$(top_srcdir)/api
+INCLUDES = -I$(top_srcdir)/include -I${top_builddir}/include -I$(top_srcdir)/common -I$(top_srcdir)/api -I/usr/include/glib-2.0 -I/usr/lib/glib-2.0/include
noinst_HEADERS = accentnormalisingitor.h symboltab.h queryparser_internal.h \
queryparser_token.h
diff -x *.rej -x *.orig -x Makefile -x *.lo -x *.o -x *.la -x .libs -x .deps -prNU2 xapian-core-0.9.2-orig/queryparser/accentnormalisingitor.h xapian-core-0.9.2/queryparser/accentnormalisingitor.h
--- xapian-core-0.9.2-orig/queryparser/accentnormalisingitor.h Fri Jul 15 12:21:35 2005
+++ xapian-core-0.9.2/queryparser/accentnormalisingitor.h Sun Sep 18 17:06:41 2005
@@ -20,4 +20,5 @@
#include "symboltab.h"
+#include <glib/gunicode.h>
#include <string>
@@ -25,32 +26,41 @@
using std::string;
-/** A wrapper class for a char which returns the char if dereferenced
+typedef gunichar char_type;
+
+/** A wrapper class for a char_type which returns the char_type if dereferenced
* with *. We need this to implement input_iterator semantics.
*/
 class CharWrapper {private:
- char ch;
+ char_type ch;
public:
-	CharWrapper(char ch_) : ch(ch_) { }-	char operator*() const { return ch; }+	CharWrapper(char_type ch_) : ch(ch_) { }+	char_type operator*() const { return ch; }};
 class AccentNormalisingItor {private:
- string::const_iterator itor;
- char queued;
+ /*string::const_iterator*/const gchar * itor;
+ const gchar * end;
+
+ char_type queued;
size_t trans;
public:
AccentNormalisingItor()
-	: itor(), queued(0), trans(0) {}
- AccentNormalisingItor(string::const_iterator itor_)
-	: itor(itor_), queued(0), trans(0) {}
+	: itor(NULL), end(NULL), queued(0), trans(0) {} // end is read by operator* and operator++; never leave it indeterminate
+ explicit AccentNormalisingItor(const char * itor_)
+	: itor(itor_), end(itor_), queued(0), trans(0) {}
+ AccentNormalisingItor(const char * itor_, const char *end_)
+	: itor(itor_), end(end_), queued(0), trans(0) {}
+#if 0
void operator=(string::const_iterator itor_)
     {itor = itor_;
+ end = end_;
queued = 0;
trans = 0;
}
+#endif
     bool operator==(const AccentNormalisingItor &o) const {return queued == o.queued && itor == o.itor;
@@ -59,13 +69,13 @@ class AccentNormalisingItor {return !(*this == o);
}
-    char operator*() const {+    char_type operator*() const {if (queued) return queued;
- unsigned char ch = (unsigned char)*itor;
+ char_type ch = g_utf8_get_char_validated(itor, end - itor);
if (ch >= 160
-#if CHAR_BIT > 8 // Avoid compiler warning.
- && ch < 256
-#endif
- ) return TRANSLIT1[ch - 160];
- return (char)ch;
+//#if CHAR_BIT > 8 // Avoid compiler warning.
+ && ch < 0x240
+//#endif
+ ) ch = /*return*/ char_type(TRANSLIT1[ch - 160]);
+ return /*(char)*/ch;
}
     AccentNormalisingItor & operator++() {@@ -73,9 +83,9 @@ class AccentNormalisingItor {queued = 0;
 	} else {- unsigned char ch = (unsigned char)*itor;
+ char_type ch = g_utf8_get_char_validated(itor, end - itor);
if (ch >= 160
-#if CHAR_BIT > 8 // Avoid compiler warning.
- && ch < 256
-#endif
+//#if CHAR_BIT > 8 // Avoid compiler warning.
+ && ch < 0x240
+//#endif
 				     ) {++trans;
@@ -87,14 +97,20 @@ class AccentNormalisingItor {}
}
- ++itor;
+ // ++itor; becomes:
+ size_t skip = g_utf8_skip[*reinterpret_cast<const guchar *>(itor)];
+	if (size_t(end - itor) < skip) {+ itor = end;
+	} else {+ itor += skip;
+ }
return *this;
}
     CharWrapper operator++(int) {- char tmp = **this;
+ char_type tmp = **this;
operator++();
return CharWrapper(tmp);
}
     size_t transliterations() const { return trans; }-    string::const_iterator raw() const { return itor; }+    //string::const_iterator raw() const { return itor; }/// We implement the semantics of an STL input_iterator.
diff -x *.rej -x *.orig -x Makefile -x *.lo -x *.o -x *.la -x .libs -x .deps -prNU2 xapian-core-0.9.2-orig/queryparser/queryparser.lemony xapian-core-0.9.2/queryparser/queryparser.lemony
--- xapian-core-0.9.2-orig/queryparser/queryparser.lemony Fri Jul 15 12:21:35 2005
+++ xapian-core-0.9.2/queryparser/queryparser.lemony Sun Sep 18 17:06:41 2005
@@ -37,4 +37,39 @@ using namespace std;
using namespace Xapian;
+static inline bool
+U_isupper(gunichar ch) {+ return (ch < 128 && C_isupper(ch));
+}
+
+static inline bool
+U_isspace(gunichar ch) {+ return (ch < 128 && C_isspace(ch));
+}
+
+static inline bool
+U_isnotspace(gunichar ch) {+ return !U_isspace(ch);
+}
+
+static inline bool
+U_isalnum(gunichar ch) {+ return (ch < 128 && C_isalnum(ch));
+}
+
+static inline bool
+U_isnotalnum(gunichar ch) {+ return !U_isalnum(ch);
+}
+
+static inline bool
+U_issign(gunichar ch) {+ return (ch < 128 && C_issign(ch));
+}
+
+static inline bool
+G_unichar_isnotalnum(gunichar ch) {+ return !g_unichar_isalnum(ch);
+}
+
// Disable debug code lemon adds.
#define NDEBUG
@@ -118,25 +153,25 @@ static inline string
downcase_term(const string &term)
 {
- string t;
- t.reserve(term.size());
- AccentNormalisingItor i(term.begin());
- const AccentNormalisingItor end(term.end());
- while (i != end) t += C_tolower(*i++);
+ gchar * r;
+ r = g_utf8_strdown(static_cast<const gchar*>(term.data()),
+ term.length());
+ string t(static_cast<char *>(r));
+ g_free(r); // GLib-allocated buffer: release with g_free(), not free()
return t;
}
static inline bool
-is_phrase_generator(unsigned char ch)
+is_phrase_generator(gunichar ch)
 {// These characters generate a phrase search.
// Ordered mostly by frequency of calls to this function done when
// running queryparsertest.
-    return (ch && strchr(".-/':\\_@", ch) != NULL);+    return (ch && ch < 128 && strchr(".-/':\\_@", ch) != NULL);}
static inline bool
-prefix_needs_colon(const string & prefix, unsigned char ch)
+prefix_needs_colon(const string & prefix, gunichar ch)
 {- if (!C_isupper(ch)) return false;
+ if (!U_isupper(ch)) return false;
string::size_type len = prefix.length();
return (len > 1 && prefix[len - 1] != ':');
@@ -151,4 +186,5 @@ Query
QueryParser::Internal::parse_query(const string &qs, unsigned flags)
 {+ gchar ubuf[6];
#ifndef NDEBUG
// Set the prefix added to Lemon's debug output, if it's enabled.
@@ -161,27 +197,27 @@ QueryParser::Internal::parse_query(const
termpos term_pos = 1;
- AccentNormalisingItor it(qs.begin()), end(qs.end());
+ AccentNormalisingItor it(qs.data(), qs.data() + qs.size()), end(qs.data() + qs.size());
State state(this);
     enum { DEFAULT, IN_QUOTES, IN_PHRASED_TERM } mode = DEFAULT;- unsigned char newprev = ' ';
+ gunichar newprev = ' ';
     while (it != end) {if (mode == IN_PHRASED_TERM) mode = DEFAULT;
-	if (C_isspace(*it)) {+	if (U_isspace(*it)) {newprev = ' ';
++it;
- it = find_if(it, end, C_isnotspace);
+ it = find_if(it, end, U_isnotspace);
if (it == end) break;
}
-	if (!C_isalnum(*it)) {- unsigned char prev = newprev;
- unsigned char ch = *it++;
+	if (!g_unichar_isalnum(*it)) {+ gunichar prev = newprev;
+ gunichar ch = *it++;
if (it != end) newprev = *it;
 	    switch (ch) {case '"':
// Skip whitespace.
- it = find_if(it, end, C_isnotspace);
+ it = find_if(it, end, U_isnotspace);
 		if (mode != IN_QUOTES) { 		    if (it == end) {@@ -216,5 +252,5 @@ QueryParser::Internal::parse_query(const
continue;
}
-		if (C_isspace(*it) || *it == '+' || *it == '-') {+		if (U_isspace(*it) || *it == '+' || *it == '-') {// Ignore + or - followed by a space, or further + or -.
// Postfix + (such as in C++ and H+) is handled as part of
@@ -229,5 +265,5 @@ QueryParser::Internal::parse_query(const
 	      case '(':// Skip whitespace.
- it = find_if(it, end, C_isnotspace);
+ it = find_if(it, end, U_isnotspace);
// Ignore ( at end of query.
if (it == end) goto done;
@@ -264,13 +300,14 @@ QueryParser::Internal::parse_query(const
 	if (mode == DEFAULT && !prefixes.empty()) {// Check for fieldname prefixes (e.g. title:historical).
- AccentNormalisingItor p = find_if(it, end, C_isnotalnum);
+ AccentNormalisingItor p = find_if(it, end, G_unichar_isnotalnum);
 	    if (p != end && *p == ':' && ++p != end) {- unsigned char ch = *p;
- if (C_isalnum(ch) ||
+ gunichar ch = *p;
+ if (g_unichar_isalnum(ch) ||
((flags & FLAG_PHRASE) && ch == '"') ||
 		    ((flags & FLAG_BOOLEAN) && ch == '(')) {string field;
p = it;
- while (*p != ':') field += *p++;
+ while (*p != ':')
+ field += string(ubuf, g_unichar_to_utf8(*p++, ubuf));
map<string, pair<bool, string> >::const_iterator f;
f = prefixes.find(field);
@@ -278,8 +315,8 @@ QueryParser::Internal::parse_query(const
// Can't boolean prefix a subexpression or phrase.
bool boolean_filter = f->second.first;
-			if (!boolean_filter || C_isalnum(ch)) {+			if (!boolean_filter || g_unichar_isalnum(ch)) {it = p;
++it;
-			    if (!C_isalnum(ch)) {+			    if (!g_unichar_isalnum(ch)) {newprev = ch;
++it;
@@ -300,5 +337,5 @@ QueryParser::Internal::parse_query(const
prefix += ':';
while (it != end && *it > ' ' && *it != ')')
- prefix += *it++;
+ prefix += string(ubuf, g_unichar_to_utf8(*it++, ubuf));
Parse(pParser, BOOLEAN_FILTER,
new Term(prefix, 0), &state);
@@ -316,10 +353,10 @@ phrased_term:
// Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E).
// Don't worry if there's a trailing '.' or not.
-	if (C_isupper(*it)) {+	if (U_isupper(*it)) {string t;
AccentNormalisingItor p = it;
 	    do {- t += *p++;
- } while (p != end && *p == '.' && ++p != end && C_isupper(*p));
+ t += string(ubuf, g_unichar_to_utf8(*p++, ubuf));
+ } while (p != end && *p == '.' && ++p != end && U_isupper(*p));
// One letter does not make an acronym! If we handled a single
// uppercase letter here, we wouldn't catch M&S below.
@@ -327,5 +364,5 @@ phrased_term:
// Check there's not a (lower case) letter or digit
// immediately after it.
-		if (p == end || !C_isalnum(*p)) {+		if (p == end || !g_unichar_isalnum(*p)) {it = p;
swap(term, t);
@@ -337,5 +374,5 @@ phrased_term:
 	if (term.empty()) { 	    while (it != end) {-		if (!C_isalnum(*it)) {+		if (!g_unichar_isalnum(*it)) {// Treat a single embedded '&' as a word character
// (e.g. AT&T).
@@ -343,9 +380,9 @@ phrased_term:
AccentNormalisingItor p = it;
++p;
- if (p == end || !C_isalnum(*p)) break;
+ if (p == end || !g_unichar_isalnum(*p)) break;
}
- term += *it++;
+ term.append(ubuf, g_unichar_to_utf8(*it++, ubuf));
}
-	    if (it != end && (*it == '#' || C_issign(*it))) {+	    if (it != end && (*it == '#' || U_issign(*it))) {string suff_term = term;
AccentNormalisingItor p = it;
@@ -363,8 +400,8 @@ phrased_term:
// to what combinations are allowed.
 		    do {- suff_term += *p++;
- } while (p != end && C_issign(*p));
+ suff_term += string(ubuf, g_unichar_to_utf8(*p++, ubuf));
+ } while (p != end && U_issign(*p));
}
-		if (p == end || !C_isalnum(*p)) {+ 		if (p == end || !g_unichar_isalnum(*p)) {// If the suffixed term doesn't exist, check that the
// non-suffixed term does. This also takes care of
@@ -383,5 +420,5 @@ phrased_term:
// Don't want to interpret A.N.D. or аND as an AND operator.
 	    if (!was_acronym && transliterations == it.transliterations()) {-		if (prefix.empty() && !term.empty() && C_isalpha(term[0])) {+ 		if (prefix.empty() && !term.empty() && U_isupper(term[0])) { 		    if (C_isupper(term[0])) { 			if (term == "AND") {@@ -433,5 +470,5 @@ phrased_term:
// e.g. "example.com" should give a phrase search for "exampl"
// and "com", not "example" and "com".
-	    if (p == end || C_isspace(*p)) {+	    if (p == end || U_isspace(*p)) {it = p;
// If topterms added a term with a trailing '.', it will be
@@ -439,5 +476,5 @@ phrased_term:
// initial in someone's name, a full stop in pasted text or
// something like that.
-		if (!C_isupper(term[0])) {+		if (!U_isupper(term[0])) {unstemmed_term = term + '.';
need_to_stem = false;
@@ -449,5 +486,5 @@ phrased_term:
term = downcase_term(term);
 	if (need_to_stem) {- if (stem_action == STEM_SOME && C_isupper(unstemmed_term[0]))
+ if (stem_action == STEM_SOME && U_isupper(unstemmed_term[0]))
term = 'R' + term;
else
@@ -494,5 +531,5 @@ phrased_term:
// Don't generate a phrase unless the phrase generators are
// immediately followed by another term.
-	    if (it != end && C_isalnum(*it)) {+	    if (it != end && g_unichar_isalnum(*it)) {mode = IN_PHRASED_TERM;
goto phrased_term;
diff -x *.rej -x *.orig -x Makefile -x *.lo -x *.o -x *.la -x .libs -x .deps -prNU2 xapian-core-0.9.2-orig/queryparser/queryparser_internal.cc xapian-core-0.9.2/queryparser/queryparser_internal.cc
--- xapian-core-0.9.2-orig/queryparser/queryparser_internal.cc Fri Jul 15 12:27:03 2005
+++ xapian-core-0.9.2/queryparser/queryparser_internal.cc Sun Sep 18 20:34:56 2005
@@ -43,4 +43,39 @@ using namespace std;
using namespace Xapian;
+static inline bool
+U_isupper(gunichar ch) {+ return (ch < 128 && C_isupper(ch));
+}
+
+static inline bool
+U_isspace(gunichar ch) {+ return (ch < 128 && C_isspace(ch));
+}
+
+static inline bool
+U_isnotspace(gunichar ch) {+ return !U_isspace(ch);
+}
+
+static inline bool
+U_isalnum(gunichar ch) {+ return (ch < 128 && C_isalnum(ch));
+}
+
+static inline bool
+U_isnotalnum(gunichar ch) {+ return !U_isalnum(ch);
+}
+
+static inline bool
+U_issign(gunichar ch) {+ return (ch < 128 && C_issign(ch));
+}
+
+static inline bool
+G_unichar_isnotalnum(gunichar ch) {+ return !g_unichar_isalnum(ch);
+}
+
// Disable debug code lemon adds.
#define NDEBUG
@@ -124,25 +159,25 @@ static inline string
downcase_term(const string &term)
 {
- string t;
- t.reserve(term.size());
- AccentNormalisingItor i(term.begin());
- const AccentNormalisingItor end(term.end());
- while (i != end) t += C_tolower(*i++);
+ gchar * r;
+ r = g_utf8_strdown(static_cast<const gchar*>(term.data()),
+ term.length());
+ string t(static_cast<char *>(r));
+ g_free(r); // GLib-allocated buffer: release with g_free(), not free()
return t;
}
static inline bool
-is_phrase_generator(unsigned char ch)
+is_phrase_generator(gunichar ch)
 {// These characters generate a phrase search.
// Ordered mostly by frequency of calls to this function done when
// running queryparsertest.
-    return (ch && strchr(".-/':\\_@", ch) != NULL);+    return (ch && ch < 128 && strchr(".-/':\\_@", ch) != NULL);}
static inline bool
-prefix_needs_colon(const string & prefix, unsigned char ch)
+prefix_needs_colon(const string & prefix, gunichar ch)
 {- if (!C_isupper(ch)) return false;
+ if (!U_isupper(ch)) return false;
string::size_type len = prefix.length();
return (len > 1 && prefix[len - 1] != ':');
@@ -157,4 +192,5 @@ Query
QueryParser::Internal::parse_query(const string &qs, unsigned flags)
 {+ gchar ubuf[6];
#ifndef NDEBUG
// Set the prefix added to Lemon's debug output, if it's enabled.
@@ -167,27 +203,27 @@ QueryParser::Internal::parse_query(const
termpos term_pos = 1;
- AccentNormalisingItor it(qs.begin()), end(qs.end());
+ AccentNormalisingItor it(qs.data(), qs.data() + qs.size()), end(qs.data() + qs.size());
State state(this);
     enum { DEFAULT, IN_QUOTES, IN_PHRASED_TERM } mode = DEFAULT;- unsigned char newprev = ' ';
+ gunichar newprev = ' ';
     while (it != end) {if (mode == IN_PHRASED_TERM) mode = DEFAULT;
-	if (C_isspace(*it)) {+	if (U_isspace(*it)) {newprev = ' ';
++it;
- it = find_if(it, end, C_isnotspace);
+ it = find_if(it, end, U_isnotspace);
if (it == end) break;
}
-	if (!C_isalnum(*it)) {- unsigned char prev = newprev;
- unsigned char ch = *it++;
+	if (!g_unichar_isalnum(*it)) {+ gunichar prev = newprev;
+ gunichar ch = *it++;
if (it != end) newprev = *it;
 	    switch (ch) {case '"':
// Skip whitespace.
- it = find_if(it, end, C_isnotspace);
+ it = find_if(it, end, U_isnotspace);
 		if (mode != IN_QUOTES) { 		    if (it == end) {@@ -222,5 +258,5 @@ QueryParser::Internal::parse_query(const
continue;
}
-		if (C_isspace(*it) || *it == '+' || *it == '-') {+		if (U_isspace(*it) || *it == '+' || *it == '-') {// Ignore + or - followed by a space, or further + or -.
// Postfix + (such as in C++ and H+) is handled as part of
@@ -235,5 +271,5 @@ QueryParser::Internal::parse_query(const
 	      case '(':// Skip whitespace.
- it = find_if(it, end, C_isnotspace);
+ it = find_if(it, end, U_isnotspace);
// Ignore ( at end of query.
if (it == end) goto done;
@@ -270,13 +306,14 @@ QueryParser::Internal::parse_query(const
 	if (mode == DEFAULT && !prefixes.empty()) {// Check for fieldname prefixes (e.g. title:historical).
- AccentNormalisingItor p = find_if(it, end, C_isnotalnum);
+ AccentNormalisingItor p = find_if(it, end, G_unichar_isnotalnum);
 	    if (p != end && *p == ':' && ++p != end) {- unsigned char ch = *p;
- if (C_isalnum(ch) ||
+ gunichar ch = *p;
+ if (g_unichar_isalnum(ch) ||
((flags & FLAG_PHRASE) && ch == '"') ||
 		    ((flags & FLAG_BOOLEAN) && ch == '(')) {string field;
p = it;
- while (*p != ':') field += *p++;
+ while (*p != ':')
+ field += string(ubuf, g_unichar_to_utf8(*p++, ubuf));
map<string, pair<bool, string> >::const_iterator f;
f = prefixes.find(field);
@@ -284,8 +321,8 @@ QueryParser::Internal::parse_query(const
// Can't boolean prefix a subexpression or phrase.
bool boolean_filter = f->second.first;
-			if (!boolean_filter || C_isalnum(ch)) {+			if (!boolean_filter || g_unichar_isalnum(ch)) {it = p;
++it;
-			    if (!C_isalnum(ch)) {+			    if (!g_unichar_isalnum(ch)) {newprev = ch;
++it;
@@ -306,5 +343,5 @@ QueryParser::Internal::parse_query(const
prefix += ':';
while (it != end && *it > ' ' && *it != ')')
- prefix += *it++;
+ prefix += string(ubuf, g_unichar_to_utf8(*it++, ubuf));
Parse(pParser, BOOLEAN_FILTER,
new Term(prefix, 0), &state);
@@ -322,10 +359,10 @@ phrased_term:
// Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E).
// Don't worry if there's a trailing '.' or not.
-	if (C_isupper(*it)) {+	if (U_isupper(*it)) {string t;
AccentNormalisingItor p = it;
 	    do {- t += *p++;
- } while (p != end && *p == '.' && ++p != end && C_isupper(*p));
+ t += string(ubuf, g_unichar_to_utf8(*p++, ubuf));
+ } while (p != end && *p == '.' && ++p != end && U_isupper(*p));
// One letter does not make an acronym! If we handled a single
// uppercase letter here, we wouldn't catch M&S below.
@@ -333,5 +370,5 @@ phrased_term:
// Check there's not a (lower case) letter or digit
// immediately after it.
-		if (p == end || !C_isalnum(*p)) {+		if (p == end || !g_unichar_isalnum(*p)) {it = p;
swap(term, t);
@@ -343,5 +380,5 @@ phrased_term:
 	if (term.empty()) { 	    while (it != end) {-		if (!C_isalnum(*it)) {+		if (!g_unichar_isalnum(*it)) {// Treat a single embedded '&' as a word character
// (e.g. AT&T).
@@ -349,9 +386,9 @@ phrased_term:
AccentNormalisingItor p = it;
++p;
- if (p == end || !C_isalnum(*p)) break;
+ if (p == end || !g_unichar_isalnum(*p)) break;
}
- term += *it++;
+ term.append(ubuf, g_unichar_to_utf8(*it++, ubuf));
}
-	    if (it != end && (*it == '#' || C_issign(*it))) {+	    if (it != end && (*it == '#' || U_issign(*it))) {string suff_term = term;
AccentNormalisingItor p = it;
@@ -369,8 +406,8 @@ phrased_term:
// to what combinations are allowed.
 		    do {- suff_term += *p++;
- } while (p != end && C_issign(*p));
+ suff_term += string(ubuf, g_unichar_to_utf8(*p++, ubuf));
+ } while (p != end && U_issign(*p));
}
-		if (p == end || !C_isalnum(*p)) {+ 		if (p == end || !g_unichar_isalnum(*p)) {// If the suffixed term doesn't exist, check that the
// non-suffixed term does. This also takes care of
@@ -389,5 +426,5 @@ phrased_term:
// Don't want to interpret A.N.D. or аND as an AND operator.
 	    if (!was_acronym && transliterations == it.transliterations()) {-		if (prefix.empty() && !term.empty() && C_isalpha(term[0])) {+ 		if (prefix.empty() && !term.empty() && U_isupper(term[0])) { 		    if (C_isupper(term[0])) { 			if (term == "AND") {@@ -439,5 +476,5 @@ phrased_term:
// e.g. "example.com" should give a phrase search for "exampl"
// and "com", not "example" and "com".
-	    if (p == end || C_isspace(*p)) {+	    if (p == end || U_isspace(*p)) {it = p;
// If topterms added a term with a trailing '.', it will be
@@ -445,5 +482,5 @@ phrased_term:
// initial in someone's name, a full stop in pasted text or
// something like that.
-		if (!C_isupper(term[0])) {+		if (!U_isupper(term[0])) {unstemmed_term = term + '.';
need_to_stem = false;
@@ -455,5 +492,5 @@ phrased_term:
term = downcase_term(term);
 	if (need_to_stem) {- if (stem_action == STEM_SOME && C_isupper(unstemmed_term[0]))
+ if (stem_action == STEM_SOME && U_isupper(unstemmed_term[0]))
term = 'R' + term;
else
@@ -500,5 +537,5 @@ phrased_term:
// Don't generate a phrase unless the phrase generators are
// immediately followed by another term.
-	    if (it != end && C_isalnum(*it)) {+	    if (it != end && g_unichar_isalnum(*it)) {mode = IN_PHRASED_TERM;
goto phrased_term;

 
 
 
