Группа :: Базы Данных
Пакет: xapian
Главная Изменения Спек Патчи Загрузить Bugs and FR
Патч: xapian-qp-utf8-0.9.2.patch
diff -x *.rej -x *.orig -x Makefile -x *.lo -x *.o -x *.la -x .libs -x .deps -prNU2 xapian-core-0.9.2-orig/queryparser/Makefile.am xapian-core-0.9.2/queryparser/Makefile.am
--- xapian-core-0.9.2-orig/queryparser/Makefile.am Fri Jul 15 12:21:35 2005
+++ xapian-core-0.9.2/queryparser/Makefile.am Sun Sep 18 17:06:41 2005
@@ -1,5 +1,5 @@
## Process this file with automake to produce Makefile.in
-INCLUDES = -I$(top_srcdir)/include -I${top_builddir}/include -I$(top_srcdir)/common -I$(top_srcdir)/api
+INCLUDES = -I$(top_srcdir)/include -I${top_builddir}/include -I$(top_srcdir)/common -I$(top_srcdir)/api -I/usr/include/glib-2.0 -I/usr/lib/glib-2.0/include
noinst_HEADERS = accentnormalisingitor.h symboltab.h queryparser_internal.h \
@@ -26,2 +26,3 @@ endif
libqueryparser_la_SOURCES = queryparser.cc queryparser_internal.cc
+libqueryparser_la_LIBADD = /usr/lib/libglib-2.0.la
diff -x *.rej -x *.orig -x Makefile -x *.lo -x *.o -x *.la -x .libs -x .deps -prNU2 xapian-core-0.9.2-orig/queryparser/Makefile.in xapian-core-0.9.2/queryparser/Makefile.in
--- xapian-core-0.9.2-orig/queryparser/Makefile.in Fri Jul 15 12:23:02 2005
+++ xapian-core-0.9.2/queryparser/Makefile.in Fri Oct 7 04:39:35 2005
@@ -55,5 +55,5 @@ CONFIG_HEADER = $(top_builddir)/config.h
CONFIG_CLEAN_FILES =
LTLIBRARIES = $(noinst_LTLIBRARIES)
-libqueryparser_la_LIBADD =
+libqueryparser_la_LIBADD = /usr/lib/libglib-2.0.la
am_libqueryparser_la_OBJECTS = queryparser.lo queryparser_internal.lo
libqueryparser_la_OBJECTS = $(am_libqueryparser_la_OBJECTS)
@@ -201,5 +201,5 @@ sharedstatedir = @sharedstatedir@
sysconfdir = @sysconfdir@
target_alias = @target_alias@
-INCLUDES = -I$(top_srcdir)/include -I${top_builddir}/include -I$(top_srcdir)/common -I$(top_srcdir)/api
+INCLUDES = -I$(top_srcdir)/include -I${top_builddir}/include -I$(top_srcdir)/common -I$(top_srcdir)/api -I/usr/include/glib-2.0 -I/usr/lib/glib-2.0/include
noinst_HEADERS = accentnormalisingitor.h symboltab.h queryparser_internal.h \
queryparser_token.h
diff -x *.rej -x *.orig -x Makefile -x *.lo -x *.o -x *.la -x .libs -x .deps -prNU2 xapian-core-0.9.2-orig/queryparser/accentnormalisingitor.h xapian-core-0.9.2/queryparser/accentnormalisingitor.h
--- xapian-core-0.9.2-orig/queryparser/accentnormalisingitor.h Fri Jul 15 12:21:35 2005
+++ xapian-core-0.9.2/queryparser/accentnormalisingitor.h Sun Sep 18 17:06:41 2005
@@ -20,4 +20,5 @@
#include "symboltab.h"
+#include <glib/gunicode.h>
#include <string>
@@ -25,32 +26,41 @@
using std::string;
-/** A wrapper class for a char which returns the char if dereferenced
+typedef gunichar char_type;
+
+/** A wrapper class for a char_type which returns the char_type if dereferenced
* with *. We need this to implement input_iterator semantics.
*/
class CharWrapper {
private:
- char ch;
+ char_type ch;
public:
- CharWrapper(char ch_) : ch(ch_) { }
- char operator*() const { return ch; }
+ CharWrapper(char_type ch_) : ch(ch_) { }
+ char_type operator*() const { return ch; }
};
class AccentNormalisingItor {
private:
- string::const_iterator itor;
- char queued;
+ /*string::const_iterator*/const gchar * itor;
+ const gchar * end;
+
+ char_type queued;
size_t trans;
public:
AccentNormalisingItor()
- : itor(), queued(0), trans(0) {}
- AccentNormalisingItor(string::const_iterator itor_)
- : itor(itor_), queued(0), trans(0) {}
+ : itor(NULL), end(NULL), queued(0), trans(0) {} // initialise 'end' too: copying an indeterminate pointer is undefined
+ explicit AccentNormalisingItor(const char * itor_) // empty range [itor_, itor_): serves as an end-of-input sentinel
+ : itor(itor_), end(itor_), queued(0), trans(0) {}
+ AccentNormalisingItor(const char * itor_, const char *end_) // iterate the UTF-8 bytes in [itor_, end_)
+ : itor(itor_), end(end_), queued(0), trans(0) {}
+#if 0
void operator=(string::const_iterator itor_)
{
itor = itor_;
+ end = end_;
queued = 0;
trans = 0;
}
+#endif
bool operator==(const AccentNormalisingItor &o) const {
return queued == o.queued && itor == o.itor;
@@ -59,13 +69,13 @@ class AccentNormalisingItor {
return !(*this == o);
}
- char operator*() const {
+ char_type operator*() const {
if (queued) return queued;
- unsigned char ch = (unsigned char)*itor;
+ char_type ch = g_utf8_get_char_validated(itor, end - itor);
if (ch >= 160
-#if CHAR_BIT > 8 // Avoid compiler warning.
- && ch < 256
-#endif
- ) return TRANSLIT1[ch - 160];
- return (char)ch;
+//#if CHAR_BIT > 8 // Avoid compiler warning.
+ && ch < 0x240
+//#endif
+ ) ch = /*return*/ char_type(TRANSLIT1[ch - 160]);
+ return /*(char)*/ch;
}
AccentNormalisingItor & operator++() {
@@ -73,9 +83,9 @@ class AccentNormalisingItor {
queued = 0;
} else {
- unsigned char ch = (unsigned char)*itor;
+ char_type ch = g_utf8_get_char_validated(itor, end - itor);
if (ch >= 160
-#if CHAR_BIT > 8 // Avoid compiler warning.
- && ch < 256
-#endif
+//#if CHAR_BIT > 8 // Avoid compiler warning.
+ && ch < 0x240
+//#endif
) {
++trans;
@@ -87,14 +97,20 @@ class AccentNormalisingItor {
}
}
- ++itor;
+ // ++itor; becomes:
+ size_t skip = g_utf8_skip[*reinterpret_cast<const guchar *>(itor)];
+ if (size_t(end - itor) < skip) {
+ itor = end;
+ } else {
+ itor += skip;
+ }
return *this;
}
CharWrapper operator++(int) {
- char tmp = **this;
+ char_type tmp = **this;
operator++();
return CharWrapper(tmp);
}
size_t transliterations() const { return trans; }
- string::const_iterator raw() const { return itor; }
+ //string::const_iterator raw() const { return itor; }
/// We implement the semantics of an STL input_iterator.
diff -x *.rej -x *.orig -x Makefile -x *.lo -x *.o -x *.la -x .libs -x .deps -prNU2 xapian-core-0.9.2-orig/queryparser/queryparser.lemony xapian-core-0.9.2/queryparser/queryparser.lemony
--- xapian-core-0.9.2-orig/queryparser/queryparser.lemony Fri Jul 15 12:21:35 2005
+++ xapian-core-0.9.2/queryparser/queryparser.lemony Sun Sep 18 17:06:41 2005
@@ -37,4 +37,39 @@ using namespace std;
using namespace Xapian;
+static inline bool
+U_isupper(gunichar ch) {
+ return (ch < 128 && C_isupper(ch));
+}
+
+static inline bool
+U_isspace(gunichar ch) {
+ return (ch < 128 && C_isspace(ch));
+}
+
+static inline bool
+U_isnotspace(gunichar ch) {
+ return !U_isspace(ch);
+}
+
+static inline bool
+U_isalnum(gunichar ch) {
+ return (ch < 128 && C_isalnum(ch));
+}
+
+static inline bool
+U_isnotalnum(gunichar ch) {
+ return !U_isalnum(ch);
+}
+
+static inline bool
+U_issign(gunichar ch) {
+ return (ch < 128 && C_issign(ch));
+}
+
+static inline bool
+G_unichar_isnotalnum(gunichar ch) {
+ return !g_unichar_isalnum(ch);
+}
+
// Disable debug code lemon adds.
#define NDEBUG
@@ -118,25 +153,25 @@ static inline string
downcase_term(const string &term)
{
- string t;
- t.reserve(term.size());
- AccentNormalisingItor i(term.begin());
- const AccentNormalisingItor end(term.end());
- while (i != end) t += C_tolower(*i++);
+ gchar * r; // lower-cased copy of term, allocated by GLib
+ r = g_utf8_strdown(static_cast<const gchar*>(term.data()),
+ term.length());
+ string t(static_cast<char *>(r));
+ g_free(r); // GLib allocations must be released with g_free(), not free()
return t;
}
static inline bool
-is_phrase_generator(unsigned char ch)
+is_phrase_generator(gunichar ch)
{
// These characters generate a phrase search.
// Ordered mostly by frequency of calls to this function done when
// running queryparsertest.
- return (ch && strchr(".-/':\\_@", ch) != NULL);
+ return (ch && ch < 128 && strchr(".-/':\\_@", ch) != NULL);
}
static inline bool
-prefix_needs_colon(const string & prefix, unsigned char ch)
+prefix_needs_colon(const string & prefix, gunichar ch)
{
- if (!C_isupper(ch)) return false;
+ if (!U_isupper(ch)) return false;
string::size_type len = prefix.length();
return (len > 1 && prefix[len - 1] != ':');
@@ -151,4 +186,5 @@ Query
QueryParser::Internal::parse_query(const string &qs, unsigned flags)
{
+ gchar ubuf[6];
#ifndef NDEBUG
// Set the prefix added to Lemon's debug output, if it's enabled.
@@ -161,27 +197,27 @@ QueryParser::Internal::parse_query(const
termpos term_pos = 1;
- AccentNormalisingItor it(qs.begin()), end(qs.end());
+ AccentNormalisingItor it(qs.data(), qs.data() + qs.size()), end(qs.data() + qs.size());
State state(this);
enum { DEFAULT, IN_QUOTES, IN_PHRASED_TERM } mode = DEFAULT;
- unsigned char newprev = ' ';
+ gunichar newprev = ' ';
while (it != end) {
if (mode == IN_PHRASED_TERM) mode = DEFAULT;
- if (C_isspace(*it)) {
+ if (U_isspace(*it)) {
newprev = ' ';
++it;
- it = find_if(it, end, C_isnotspace);
+ it = find_if(it, end, U_isnotspace);
if (it == end) break;
}
- if (!C_isalnum(*it)) {
- unsigned char prev = newprev;
- unsigned char ch = *it++;
+ if (!g_unichar_isalnum(*it)) {
+ gunichar prev = newprev;
+ gunichar ch = *it++;
if (it != end) newprev = *it;
switch (ch) {
case '"':
// Skip whitespace.
- it = find_if(it, end, C_isnotspace);
+ it = find_if(it, end, U_isnotspace);
if (mode != IN_QUOTES) {
if (it == end) {
@@ -216,5 +252,5 @@ QueryParser::Internal::parse_query(const
continue;
}
- if (C_isspace(*it) || *it == '+' || *it == '-') {
+ if (U_isspace(*it) || *it == '+' || *it == '-') {
// Ignore + or - followed by a space, or further + or -.
// Postfix + (such as in C++ and H+) is handled as part of
@@ -229,5 +265,5 @@ QueryParser::Internal::parse_query(const
case '(':
// Skip whitespace.
- it = find_if(it, end, C_isnotspace);
+ it = find_if(it, end, U_isnotspace);
// Ignore ( at end of query.
if (it == end) goto done;
@@ -264,13 +300,14 @@ QueryParser::Internal::parse_query(const
if (mode == DEFAULT && !prefixes.empty()) {
// Check for fieldname prefixes (e.g. title:historical).
- AccentNormalisingItor p = find_if(it, end, C_isnotalnum);
+ AccentNormalisingItor p = find_if(it, end, G_unichar_isnotalnum);
if (p != end && *p == ':' && ++p != end) {
- unsigned char ch = *p;
- if (C_isalnum(ch) ||
+ gunichar ch = *p;
+ if (g_unichar_isalnum(ch) ||
((flags & FLAG_PHRASE) && ch == '"') ||
((flags & FLAG_BOOLEAN) && ch == '(')) {
string field;
p = it;
- while (*p != ':') field += *p++;
+ while (*p != ':')
+ field += string(ubuf, g_unichar_to_utf8(*p++, ubuf));
map<string, pair<bool, string> >::const_iterator f;
f = prefixes.find(field);
@@ -278,8 +315,8 @@ QueryParser::Internal::parse_query(const
// Can't boolean prefix a subexpression or phrase.
bool boolean_filter = f->second.first;
- if (!boolean_filter || C_isalnum(ch)) {
+ if (!boolean_filter || g_unichar_isalnum(ch)) {
it = p;
++it;
- if (!C_isalnum(ch)) {
+ if (!g_unichar_isalnum(ch)) {
newprev = ch;
++it;
@@ -300,5 +337,5 @@ QueryParser::Internal::parse_query(const
prefix += ':';
while (it != end && *it > ' ' && *it != ')')
- prefix += *it++;
+ prefix += string(ubuf, g_unichar_to_utf8(*it++, ubuf));
Parse(pParser, BOOLEAN_FILTER,
new Term(prefix, 0), &state);
@@ -316,10 +353,10 @@ phrased_term:
// Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E).
// Don't worry if there's a trailing '.' or not.
- if (C_isupper(*it)) {
+ if (U_isupper(*it)) {
string t;
AccentNormalisingItor p = it;
do {
- t += *p++;
- } while (p != end && *p == '.' && ++p != end && C_isupper(*p));
+ t += string(ubuf, g_unichar_to_utf8(*p++, ubuf));
+ } while (p != end && *p == '.' && ++p != end && U_isupper(*p));
// One letter does not make an acronym! If we handled a single
// uppercase letter here, we wouldn't catch M&S below.
@@ -327,5 +364,5 @@ phrased_term:
// Check there's not a (lower case) letter or digit
// immediately after it.
- if (p == end || !C_isalnum(*p)) {
+ if (p == end || !g_unichar_isalnum(*p)) {
it = p;
swap(term, t);
@@ -337,5 +374,5 @@ phrased_term:
if (term.empty()) {
while (it != end) {
- if (!C_isalnum(*it)) {
+ if (!g_unichar_isalnum(*it)) {
// Treat a single embedded '&' as a word character
// (e.g. AT&T).
@@ -343,9 +380,9 @@ phrased_term:
AccentNormalisingItor p = it;
++p;
- if (p == end || !C_isalnum(*p)) break;
+ if (p == end || !g_unichar_isalnum(*p)) break;
}
- term += *it++;
+ term.append(ubuf, g_unichar_to_utf8(*it++, ubuf));
}
- if (it != end && (*it == '#' || C_issign(*it))) {
+ if (it != end && (*it == '#' || U_issign(*it))) {
string suff_term = term;
AccentNormalisingItor p = it;
@@ -363,8 +400,8 @@ phrased_term:
// to what combinations are allowed.
do {
- suff_term += *p++;
- } while (p != end && C_issign(*p));
+ suff_term += string(ubuf, g_unichar_to_utf8(*p++, ubuf));
+ } while (p != end && U_issign(*p));
}
- if (p == end || !C_isalnum(*p)) {
+ if (p == end || !g_unichar_isalnum(*p)) {
// If the suffixed term doesn't exist, check that the
// non-suffixed term does. This also takes care of
@@ -383,5 +420,5 @@ phrased_term:
// Don't want to interpret A.N.D. or аND as an AND operator.
if (!was_acronym && transliterations == it.transliterations()) {
- if (prefix.empty() && !term.empty() && C_isalpha(term[0])) {
+ if (prefix.empty() && !term.empty() && U_isupper(term[0])) {
if (C_isupper(term[0])) {
if (term == "AND") {
@@ -433,5 +470,5 @@ phrased_term:
// e.g. "example.com" should give a phrase search for "exampl"
// and "com", not "example" and "com".
- if (p == end || C_isspace(*p)) {
+ if (p == end || U_isspace(*p)) {
it = p;
// If topterms added a term with a trailing '.', it will be
@@ -439,5 +476,5 @@ phrased_term:
// initial in someone's name, a full stop in pasted text or
// something like that.
- if (!C_isupper(term[0])) {
+ if (!U_isupper(term[0])) {
unstemmed_term = term + '.';
need_to_stem = false;
@@ -449,5 +486,5 @@ phrased_term:
term = downcase_term(term);
if (need_to_stem) {
- if (stem_action == STEM_SOME && C_isupper(unstemmed_term[0]))
+ if (stem_action == STEM_SOME && U_isupper(unstemmed_term[0]))
term = 'R' + term;
else
@@ -494,5 +531,5 @@ phrased_term:
// Don't generate a phrase unless the phrase generators are
// immediately followed by another term.
- if (it != end && C_isalnum(*it)) {
+ if (it != end && g_unichar_isalnum(*it)) {
mode = IN_PHRASED_TERM;
goto phrased_term;
diff -x *.rej -x *.orig -x Makefile -x *.lo -x *.o -x *.la -x .libs -x .deps -prNU2 xapian-core-0.9.2-orig/queryparser/queryparser_internal.cc xapian-core-0.9.2/queryparser/queryparser_internal.cc
--- xapian-core-0.9.2-orig/queryparser/queryparser_internal.cc Fri Jul 15 12:27:03 2005
+++ xapian-core-0.9.2/queryparser/queryparser_internal.cc Sun Sep 18 20:34:56 2005
@@ -43,4 +43,39 @@ using namespace std;
using namespace Xapian;
+static inline bool
+U_isupper(gunichar ch) {
+ return (ch < 128 && C_isupper(ch));
+}
+
+static inline bool
+U_isspace(gunichar ch) {
+ return (ch < 128 && C_isspace(ch));
+}
+
+static inline bool
+U_isnotspace(gunichar ch) {
+ return !U_isspace(ch);
+}
+
+static inline bool
+U_isalnum(gunichar ch) {
+ return (ch < 128 && C_isalnum(ch));
+}
+
+static inline bool
+U_isnotalnum(gunichar ch) {
+ return !U_isalnum(ch);
+}
+
+static inline bool
+U_issign(gunichar ch) {
+ return (ch < 128 && C_issign(ch));
+}
+
+static inline bool
+G_unichar_isnotalnum(gunichar ch) {
+ return !g_unichar_isalnum(ch);
+}
+
// Disable debug code lemon adds.
#define NDEBUG
@@ -124,25 +159,25 @@ static inline string
downcase_term(const string &term)
{
- string t;
- t.reserve(term.size());
- AccentNormalisingItor i(term.begin());
- const AccentNormalisingItor end(term.end());
- while (i != end) t += C_tolower(*i++);
+ gchar * r; // lower-cased copy of term, allocated by GLib
+ r = g_utf8_strdown(static_cast<const gchar*>(term.data()),
+ term.length());
+ string t(static_cast<char *>(r));
+ g_free(r); // GLib allocations must be released with g_free(), not free()
return t;
}
static inline bool
-is_phrase_generator(unsigned char ch)
+is_phrase_generator(gunichar ch)
{
// These characters generate a phrase search.
// Ordered mostly by frequency of calls to this function done when
// running queryparsertest.
- return (ch && strchr(".-/':\\_@", ch) != NULL);
+ return (ch && ch < 128 && strchr(".-/':\\_@", ch) != NULL);
}
static inline bool
-prefix_needs_colon(const string & prefix, unsigned char ch)
+prefix_needs_colon(const string & prefix, gunichar ch)
{
- if (!C_isupper(ch)) return false;
+ if (!U_isupper(ch)) return false;
string::size_type len = prefix.length();
return (len > 1 && prefix[len - 1] != ':');
@@ -157,4 +192,5 @@ Query
QueryParser::Internal::parse_query(const string &qs, unsigned flags)
{
+ gchar ubuf[6];
#ifndef NDEBUG
// Set the prefix added to Lemon's debug output, if it's enabled.
@@ -167,27 +203,27 @@ QueryParser::Internal::parse_query(const
termpos term_pos = 1;
- AccentNormalisingItor it(qs.begin()), end(qs.end());
+ AccentNormalisingItor it(qs.data(), qs.data() + qs.size()), end(qs.data() + qs.size());
State state(this);
enum { DEFAULT, IN_QUOTES, IN_PHRASED_TERM } mode = DEFAULT;
- unsigned char newprev = ' ';
+ gunichar newprev = ' ';
while (it != end) {
if (mode == IN_PHRASED_TERM) mode = DEFAULT;
- if (C_isspace(*it)) {
+ if (U_isspace(*it)) {
newprev = ' ';
++it;
- it = find_if(it, end, C_isnotspace);
+ it = find_if(it, end, U_isnotspace);
if (it == end) break;
}
- if (!C_isalnum(*it)) {
- unsigned char prev = newprev;
- unsigned char ch = *it++;
+ if (!g_unichar_isalnum(*it)) {
+ gunichar prev = newprev;
+ gunichar ch = *it++;
if (it != end) newprev = *it;
switch (ch) {
case '"':
// Skip whitespace.
- it = find_if(it, end, C_isnotspace);
+ it = find_if(it, end, U_isnotspace);
if (mode != IN_QUOTES) {
if (it == end) {
@@ -222,5 +258,5 @@ QueryParser::Internal::parse_query(const
continue;
}
- if (C_isspace(*it) || *it == '+' || *it == '-') {
+ if (U_isspace(*it) || *it == '+' || *it == '-') {
// Ignore + or - followed by a space, or further + or -.
// Postfix + (such as in C++ and H+) is handled as part of
@@ -235,5 +271,5 @@ QueryParser::Internal::parse_query(const
case '(':
// Skip whitespace.
- it = find_if(it, end, C_isnotspace);
+ it = find_if(it, end, U_isnotspace);
// Ignore ( at end of query.
if (it == end) goto done;
@@ -270,13 +306,14 @@ QueryParser::Internal::parse_query(const
if (mode == DEFAULT && !prefixes.empty()) {
// Check for fieldname prefixes (e.g. title:historical).
- AccentNormalisingItor p = find_if(it, end, C_isnotalnum);
+ AccentNormalisingItor p = find_if(it, end, G_unichar_isnotalnum);
if (p != end && *p == ':' && ++p != end) {
- unsigned char ch = *p;
- if (C_isalnum(ch) ||
+ gunichar ch = *p;
+ if (g_unichar_isalnum(ch) ||
((flags & FLAG_PHRASE) && ch == '"') ||
((flags & FLAG_BOOLEAN) && ch == '(')) {
string field;
p = it;
- while (*p != ':') field += *p++;
+ while (*p != ':')
+ field += string(ubuf, g_unichar_to_utf8(*p++, ubuf));
map<string, pair<bool, string> >::const_iterator f;
f = prefixes.find(field);
@@ -284,8 +321,8 @@ QueryParser::Internal::parse_query(const
// Can't boolean prefix a subexpression or phrase.
bool boolean_filter = f->second.first;
- if (!boolean_filter || C_isalnum(ch)) {
+ if (!boolean_filter || g_unichar_isalnum(ch)) {
it = p;
++it;
- if (!C_isalnum(ch)) {
+ if (!g_unichar_isalnum(ch)) {
newprev = ch;
++it;
@@ -306,5 +343,5 @@ QueryParser::Internal::parse_query(const
prefix += ':';
while (it != end && *it > ' ' && *it != ')')
- prefix += *it++;
+ prefix += string(ubuf, g_unichar_to_utf8(*it++, ubuf));
Parse(pParser, BOOLEAN_FILTER,
new Term(prefix, 0), &state);
@@ -322,10 +359,10 @@ phrased_term:
// Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E).
// Don't worry if there's a trailing '.' or not.
- if (C_isupper(*it)) {
+ if (U_isupper(*it)) {
string t;
AccentNormalisingItor p = it;
do {
- t += *p++;
- } while (p != end && *p == '.' && ++p != end && C_isupper(*p));
+ t += string(ubuf, g_unichar_to_utf8(*p++, ubuf));
+ } while (p != end && *p == '.' && ++p != end && U_isupper(*p));
// One letter does not make an acronym! If we handled a single
// uppercase letter here, we wouldn't catch M&S below.
@@ -333,5 +370,5 @@ phrased_term:
// Check there's not a (lower case) letter or digit
// immediately after it.
- if (p == end || !C_isalnum(*p)) {
+ if (p == end || !g_unichar_isalnum(*p)) {
it = p;
swap(term, t);
@@ -343,5 +380,5 @@ phrased_term:
if (term.empty()) {
while (it != end) {
- if (!C_isalnum(*it)) {
+ if (!g_unichar_isalnum(*it)) {
// Treat a single embedded '&' as a word character
// (e.g. AT&T).
@@ -349,9 +386,9 @@ phrased_term:
AccentNormalisingItor p = it;
++p;
- if (p == end || !C_isalnum(*p)) break;
+ if (p == end || !g_unichar_isalnum(*p)) break;
}
- term += *it++;
+ term.append(ubuf, g_unichar_to_utf8(*it++, ubuf));
}
- if (it != end && (*it == '#' || C_issign(*it))) {
+ if (it != end && (*it == '#' || U_issign(*it))) {
string suff_term = term;
AccentNormalisingItor p = it;
@@ -369,8 +406,8 @@ phrased_term:
// to what combinations are allowed.
do {
- suff_term += *p++;
- } while (p != end && C_issign(*p));
+ suff_term += string(ubuf, g_unichar_to_utf8(*p++, ubuf));
+ } while (p != end && U_issign(*p));
}
- if (p == end || !C_isalnum(*p)) {
+ if (p == end || !g_unichar_isalnum(*p)) {
// If the suffixed term doesn't exist, check that the
// non-suffixed term does. This also takes care of
@@ -389,5 +426,5 @@ phrased_term:
// Don't want to interpret A.N.D. or аND as an AND operator.
if (!was_acronym && transliterations == it.transliterations()) {
- if (prefix.empty() && !term.empty() && C_isalpha(term[0])) {
+ if (prefix.empty() && !term.empty() && U_isupper(term[0])) {
if (C_isupper(term[0])) {
if (term == "AND") {
@@ -439,5 +476,5 @@ phrased_term:
// e.g. "example.com" should give a phrase search for "exampl"
// and "com", not "example" and "com".
- if (p == end || C_isspace(*p)) {
+ if (p == end || U_isspace(*p)) {
it = p;
// If topterms added a term with a trailing '.', it will be
@@ -445,5 +482,5 @@ phrased_term:
// initial in someone's name, a full stop in pasted text or
// something like that.
- if (!C_isupper(term[0])) {
+ if (!U_isupper(term[0])) {
unstemmed_term = term + '.';
need_to_stem = false;
@@ -455,5 +492,5 @@ phrased_term:
term = downcase_term(term);
if (need_to_stem) {
- if (stem_action == STEM_SOME && C_isupper(unstemmed_term[0]))
+ if (stem_action == STEM_SOME && U_isupper(unstemmed_term[0]))
term = 'R' + term;
else
@@ -500,5 +537,5 @@ phrased_term:
// Don't generate a phrase unless the phrase generators are
// immediately followed by another term.
- if (it != end && C_isalnum(*it)) {
+ if (it != end && g_unichar_isalnum(*it)) {
mode = IN_PHRASED_TERM;
goto phrased_term;