Kouhei Sutou
null+****@clear*****
Mon Jan 21 17:55:05 JST 2013
Kouhei Sutou 2013-01-21 17:55:05 +0900 (Mon, 21 Jan 2013) New Revision: 83b5096d9856051f67d251b46d78099f48db252c https://github.com/groonga/groonga/commit/83b5096d9856051f67d251b46d78099f48db252c Log: Revert "Remove deprecated grn_str_open()" This reverts commit 46c15627aa48627f5f24446006ca30dda4c2dae1. Modified files: lib/str.c plugins/suggest/suggest.c test/unit/util/test-string.c Modified: lib/str.c (+1103 -0) =================================================================== --- lib/str.c 2013-01-21 17:53:14 +0900 (ec87c10) +++ lib/str.c 2013-01-21 17:55:05 +0900 (a26e1b9) @@ -170,6 +170,1109 @@ grn_charlen(grn_ctx *ctx, const char *str, const char *end) return grn_charlen_(ctx, str, end, ctx->encoding); } +static unsigned char symbol[] = { + ',', '.', 0, ':', ';', '?', '!', 0, 0, 0, '`', 0, '^', '~', '_', 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, '-', '-', '/', '\\', 0, 0, '|', 0, 0, 0, '\'', 0, + '"', '(', ')', 0, 0, '[', ']', '{', '}', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + '+', '-', 0, 0, 0, '=', 0, '<', '>', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + '$', 0, 0, '%', '#', '&', '*', '@', 0, 0, 0, 0, 0, 0, 0, 0 +}; + +inline static grn_rc +normalize_euc(grn_ctx *ctx, grn_str *nstr) +{ + static uint16_t hankana[] = { + 0xa1a1, 0xa1a3, 0xa1d6, 0xa1d7, 0xa1a2, 0xa1a6, 0xa5f2, 0xa5a1, 0xa5a3, + 0xa5a5, 0xa5a7, 0xa5a9, 0xa5e3, 0xa5e5, 0xa5e7, 0xa5c3, 0xa1bc, 0xa5a2, + 0xa5a4, 0xa5a6, 0xa5a8, 0xa5aa, 0xa5ab, 0xa5ad, 0xa5af, 0xa5b1, 0xa5b3, + 0xa5b5, 0xa5b7, 0xa5b9, 0xa5bb, 0xa5bd, 0xa5bf, 0xa5c1, 0xa5c4, 0xa5c6, + 0xa5c8, 0xa5ca, 0xa5cb, 0xa5cc, 0xa5cd, 0xa5ce, 0xa5cf, 0xa5d2, 0xa5d5, + 0xa5d8, 0xa5db, 0xa5de, 0xa5df, 0xa5e0, 0xa5e1, 0xa5e2, 0xa5e4, 0xa5e6, + 0xa5e8, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5ef, 0xa5f3, 0xa1ab, + 0xa1eb + }; + static unsigned char dakuten[] = { + 0xf4, 0, 0, 0, 0, 0xac, 0, 0xae, 0, 0xb0, 0, 0xb2, 0, 0xb4, 0, 0xb6, 0, + 0xb8, 0, 0xba, 0, 0xbc, 0, 0xbe, 0, 0xc0, 0, 0xc2, 0, 0, 0xc5, 0, 0xc7, + 0, 0xc9, 0, 0, 0, 0, 0, 0, 0xd0, 0, 0, 0xd3, 0, 0, 0xd6, 0, 0, 
0xd9, 0, + 0, 0xdc + }; + static unsigned char handaku[] = { + 0xd1, 0, 0, 0xd4, 0, 0, 0xd7, 0, 0, 0xda, 0, 0, 0xdd + }; + int16_t *ch; + const unsigned char *s, *s_, *e; + unsigned char *d, *d0, *d_, b; + uint_least8_t *cp, *ctypes, ctype; + size_t size = nstr->orig_blen, length = 0; + int removeblankp = nstr->flags & GRN_STR_REMOVEBLANK; + if (!(nstr->norm = GRN_MALLOC(size * 2 + 1))) { + return GRN_NO_MEMORY_AVAILABLE; + } + d0 = (unsigned char *) nstr->norm; + if (nstr->flags & GRN_STR_WITH_CHECKS) { + if (!(nstr->checks = GRN_MALLOC(size * 2 * sizeof(int16_t) + 1))) { + GRN_FREE(nstr->norm); + nstr->norm = NULL; + return GRN_NO_MEMORY_AVAILABLE; + } + } + ch = nstr->checks; + if (nstr->flags & GRN_STR_WITH_CTYPES) { + if (!(nstr->ctypes = GRN_MALLOC(size + 1))) { + GRN_FREE(nstr->checks); + GRN_FREE(nstr->norm); + nstr->checks = NULL; + nstr->norm = NULL; + return GRN_NO_MEMORY_AVAILABLE; + } + } + cp = ctypes = nstr->ctypes; + e = (unsigned char *)nstr->orig + size; + for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) { + if ((*s & 0x80)) { + if (((s + 1) < e) && (*(s + 1) & 0x80)) { + unsigned char c1 = *s++, c2 = *s, c3 = 0; + switch (c1 >> 4) { + case 0x08 : + if (c1 == 0x8e && 0xa0 <= c2 && c2 <= 0xdf) { + uint16_t c = hankana[c2 - 0xa0]; + switch (c) { + case 0xa1ab : + if (d > d0 + 1 && d[-2] == 0xa5 + && 0xa6 <= d[-1] && d[-1] <= 0xdb && (b = dakuten[d[-1] - 0xa6])) { + *(d - 1) = b; + if (ch) { ch[-1] += 2; s_ += 2; } + continue; + } else { + *d++ = c >> 8; *d = c & 0xff; + } + break; + case 0xa1eb : + if (d > d0 + 1 && d[-2] == 0xa5 + && 0xcf <= d[-1] && d[-1] <= 0xdb && (b = handaku[d[-1] - 0xcf])) { + *(d - 1) = b; + if (ch) { ch[-1] += 2; s_ += 2; } + continue; + } else { + *d++ = c >> 8; *d = c & 0xff; + } + break; + default : + *d++ = c >> 8; *d = c & 0xff; + break; + } + ctype = grn_str_katakana; + } else { + *d++ = c1; *d = c2; + ctype = grn_str_others; + } + break; + case 0x09 : + *d++ = c1; *d = c2; + ctype = 
grn_str_others; + break; + case 0x0a : + switch (c1 & 0x0f) { + case 1 : + switch (c2) { + case 0xbc : + *d++ = c1; *d = c2; + ctype = grn_str_katakana; + break; + case 0xb9 : + *d++ = c1; *d = c2; + ctype = grn_str_kanji; + break; + case 0xa1 : + if (removeblankp) { + if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } + continue; + } else { + *d = ' '; + ctype = GRN_STR_BLANK|grn_str_symbol; + } + break; + default : + if (c2 >= 0xa4 && (c3 = symbol[c2 - 0xa4])) { + *d = c3; + ctype = grn_str_symbol; + } else { + *d++ = c1; *d = c2; + ctype = grn_str_others; + } + break; + } + break; + case 2 : + *d++ = c1; *d = c2; + ctype = grn_str_symbol; + break; + case 3 : + c3 = c2 - 0x80; + if ('a' <= c3 && c3 <= 'z') { + ctype = grn_str_alpha; + *d = c3; + } else if ('A' <= c3 && c3 <= 'Z') { + ctype = grn_str_alpha; + *d = c3 + 0x20; + } else if ('0' <= c3 && c3 <= '9') { + ctype = grn_str_digit; + *d = c3; + } else { + ctype = grn_str_others; + *d++ = c1; *d = c2; + } + break; + case 4 : + *d++ = c1; *d = c2; + ctype = grn_str_hiragana; + break; + case 5 : + *d++ = c1; *d = c2; + ctype = grn_str_katakana; + break; + case 6 : + case 7 : + case 8 : + *d++ = c1; *d = c2; + ctype = grn_str_symbol; + break; + default : + *d++ = c1; *d = c2; + ctype = grn_str_others; + break; + } + break; + default : + *d++ = c1; *d = c2; + ctype = grn_str_kanji; + break; + } + } else { + /* skip invalid character */ + continue; + } + } else { + unsigned char c = *s; + switch (c >> 4) { + case 0 : + case 1 : + /* skip unprintable ascii */ + if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } + continue; + case 2 : + if (c == 0x20) { + if (removeblankp) { + if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } + continue; + } else { + *d = ' '; + ctype = GRN_STR_BLANK|grn_str_symbol; + } + } else { + *d = c; + ctype = grn_str_symbol; + } + break; + case 3 : + *d = c; + ctype = (c <= 0x39) ? grn_str_digit : grn_str_symbol; + break; + case 4 : + *d = ('A' <= c) ? c + 0x20 : c; + ctype = (c == 0x40) ? 
grn_str_symbol : grn_str_alpha; + break; + case 5 : + *d = (c <= 'Z') ? c + 0x20 : c; + ctype = (c <= 0x5a) ? grn_str_alpha : grn_str_symbol; + break; + case 6 : + *d = c; + ctype = (c == 0x60) ? grn_str_symbol : grn_str_alpha; + break; + case 7 : + *d = c; + ctype = (c <= 0x7a) ? grn_str_alpha : (c == 0x7f ? grn_str_others : grn_str_symbol); + break; + default : + *d = c; + ctype = grn_str_others; + break; + } + } + d++; + length++; + if (cp) { *cp++ = ctype; } + if (ch) { + *ch++ = (int16_t)(s + 1 - s_); + s_ = s + 1; + while (++d_ < d) { *ch++ = 0; } + } + } + if (cp) { *cp = grn_str_null; } + *d = '\0'; + nstr->length = length; + nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm); + return GRN_SUCCESS; +} + +#ifdef WITH_NFKC +uint_least8_t grn_nfkc_ctype(const unsigned char *str); +const char *grn_nfkc_map1(const unsigned char *str); +const char *grn_nfkc_map2(const unsigned char *prefix, const unsigned char *suffix); + +inline static grn_rc +normalize_utf8(grn_ctx *ctx, grn_str *nstr) +{ + int16_t *ch; + const unsigned char *s, *s_, *s__ = NULL, *p, *p2, *pe, *e; + unsigned char *d, *d_, *de; + uint_least8_t *cp; + size_t length = 0, ls, lp, size = nstr->orig_blen, ds = size * 3; + int removeblankp = nstr->flags & GRN_STR_REMOVEBLANK; + if (!(nstr->norm = GRN_MALLOC(ds + 1))) { + return GRN_NO_MEMORY_AVAILABLE; + } + if (nstr->flags & GRN_STR_WITH_CHECKS) { + if (!(nstr->checks = GRN_MALLOC(ds * sizeof(int16_t) + 1))) { + GRN_FREE(nstr->norm); nstr->norm = NULL; + return GRN_NO_MEMORY_AVAILABLE; + } + } + ch = nstr->checks; + if (nstr->flags & GRN_STR_WITH_CTYPES) { + if (!(nstr->ctypes = GRN_MALLOC(ds + 1))) { + if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; } + GRN_FREE(nstr->norm); nstr->norm = NULL; + return GRN_NO_MEMORY_AVAILABLE; + } + } + cp = nstr->ctypes; + d = (unsigned char *)nstr->norm; + de = d + ds; + d_ = NULL; + e = (unsigned char *)nstr->orig + size; + for (s = s_ = (unsigned char *)nstr->orig; ; s += ls) { + if 
(!(ls = grn_str_charlen_utf8(ctx, s, e))) { + break; + } + if ((p = (unsigned char *)grn_nfkc_map1(s))) { + pe = p + strlen((char *)p); + } else { + p = s; + pe = p + ls; + } + if (d_ && (p2 = (unsigned char *)grn_nfkc_map2(d_, p))) { + p = p2; + pe = p + strlen((char *)p); + if (cp) { cp--; } + if (ch) { + ch -= (d - d_); + s_ = s__; + } + d = d_; + length--; + } + for (; ; p += lp) { + if (!(lp = grn_str_charlen_utf8(ctx, p, pe))) { + break; + } + if ((*p == ' ' && removeblankp) || *p < 0x20 /* skip unprintable ascii */ ) { + if (cp > nstr->ctypes) { *(cp - 1) |= GRN_STR_BLANK; } + } else { + if (de <= d + lp) { + unsigned char *norm; + ds += (ds >> 1) + lp; + if (!(norm = GRN_REALLOC(nstr->norm, ds + 1))) { + if (nstr->ctypes) { GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; } + if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; } + GRN_FREE(nstr->norm); nstr->norm = NULL; + return GRN_NO_MEMORY_AVAILABLE; + } + de = norm + ds; + d = norm + (d - (unsigned char *)nstr->norm); + nstr->norm = norm; + if (ch) { + int16_t *checks; + if (!(checks = GRN_REALLOC(nstr->checks, ds * sizeof(int16_t)+ 1))) { + if (nstr->ctypes) { GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; } + GRN_FREE(nstr->checks); nstr->checks = NULL; + GRN_FREE(nstr->norm); nstr->norm = NULL; + return GRN_NO_MEMORY_AVAILABLE; + } + ch = checks + (ch - nstr->checks); + nstr->checks = checks; + } + if (cp) { + uint_least8_t *ctypes; + if (!(ctypes = GRN_REALLOC(nstr->ctypes, ds + 1))) { + GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; + if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; } + GRN_FREE(nstr->norm); nstr->norm = NULL; + return GRN_NO_MEMORY_AVAILABLE; + } + cp = ctypes + (cp - nstr->ctypes); + nstr->ctypes = ctypes; + } + } + memcpy(d, p, lp); + d_ = d; + d += lp; + length++; + if (cp) { *cp++ = grn_nfkc_ctype(p); } + if (ch) { + size_t i; + if (s_ == s + ls) { + *ch++ = -1; + } else { + *ch++ = (int16_t)(s + ls - s_); + s__ = s_; + s_ = s + ls; + } + for (i = lp; i > 
1; i--) { *ch++ = 0; } + } + } + } + } + if (cp) { *cp = grn_str_null; } + *d = '\0'; + nstr->length = length; + nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm); + return GRN_SUCCESS; +} +#endif /* WITH_NFKC */ + +inline static grn_rc +normalize_sjis(grn_ctx *ctx, grn_str *nstr) +{ + static uint16_t hankana[] = { + 0x8140, 0x8142, 0x8175, 0x8176, 0x8141, 0x8145, 0x8392, 0x8340, 0x8342, + 0x8344, 0x8346, 0x8348, 0x8383, 0x8385, 0x8387, 0x8362, 0x815b, 0x8341, + 0x8343, 0x8345, 0x8347, 0x8349, 0x834a, 0x834c, 0x834e, 0x8350, 0x8352, + 0x8354, 0x8356, 0x8358, 0x835a, 0x835c, 0x835e, 0x8360, 0x8363, 0x8365, + 0x8367, 0x8369, 0x836a, 0x836b, 0x836c, 0x836d, 0x836e, 0x8371, 0x8374, + 0x8377, 0x837a, 0x837d, 0x837e, 0x8380, 0x8381, 0x8382, 0x8384, 0x8386, + 0x8388, 0x8389, 0x838a, 0x838b, 0x838c, 0x838d, 0x838f, 0x8393, 0x814a, + 0x814b + }; + static unsigned char dakuten[] = { + 0x94, 0, 0, 0, 0, 0x4b, 0, 0x4d, 0, 0x4f, 0, 0x51, 0, 0x53, 0, 0x55, 0, + 0x57, 0, 0x59, 0, 0x5b, 0, 0x5d, 0, 0x5f, 0, 0x61, 0, 0, 0x64, 0, 0x66, + 0, 0x68, 0, 0, 0, 0, 0, 0, 0x6f, 0, 0, 0x72, 0, 0, 0x75, 0, 0, 0x78, 0, + 0, 0x7b + }; + static unsigned char handaku[] = { + 0x70, 0, 0, 0x73, 0, 0, 0x76, 0, 0, 0x79, 0, 0, 0x7c + }; + int16_t *ch; + const unsigned char *s, *s_; + unsigned char *d, *d0, *d_, b, *e; + uint_least8_t *cp, *ctypes, ctype; + size_t size = nstr->orig_blen, length = 0; + int removeblankp = nstr->flags & GRN_STR_REMOVEBLANK; + if (!(nstr->norm = GRN_MALLOC(size * 2 + 1))) { + return GRN_NO_MEMORY_AVAILABLE; + } + d0 = (unsigned char *) nstr->norm; + if (nstr->flags & GRN_STR_WITH_CHECKS) { + if (!(nstr->checks = GRN_MALLOC(size * 2 * sizeof(int16_t) + 1))) { + GRN_FREE(nstr->norm); + nstr->norm = NULL; + return GRN_NO_MEMORY_AVAILABLE; + } + } + ch = nstr->checks; + if (nstr->flags & GRN_STR_WITH_CTYPES) { + if (!(nstr->ctypes = GRN_MALLOC(size + 1))) { + GRN_FREE(nstr->checks); + GRN_FREE(nstr->norm); + nstr->checks = NULL; + nstr->norm = NULL; + return 
GRN_NO_MEMORY_AVAILABLE; + } + } + cp = ctypes = nstr->ctypes; + e = (unsigned char *)nstr->orig + size; + for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) { + if ((*s & 0x80)) { + if (0xa0 <= *s && *s <= 0xdf) { + uint16_t c = hankana[*s - 0xa0]; + switch (c) { + case 0x814a : + if (d > d0 + 1 && d[-2] == 0x83 + && 0x45 <= d[-1] && d[-1] <= 0x7a && (b = dakuten[d[-1] - 0x45])) { + *(d - 1) = b; + if (ch) { ch[-1]++; s_++; } + continue; + } else { + *d++ = c >> 8; *d = c & 0xff; + } + break; + case 0x814b : + if (d > d0 + 1 && d[-2] == 0x83 + && 0x6e <= d[-1] && d[-1] <= 0x7a && (b = handaku[d[-1] - 0x6e])) { + *(d - 1) = b; + if (ch) { ch[-1]++; s_++; } + continue; + } else { + *d++ = c >> 8; *d = c & 0xff; + } + break; + default : + *d++ = c >> 8; *d = c & 0xff; + break; + } + ctype = grn_str_katakana; + } else { + if ((s + 1) < e && 0x40 <= *(s + 1) && *(s + 1) <= 0xfc) { + unsigned char c1 = *s++, c2 = *s, c3 = 0; + if (0x81 <= c1 && c1 <= 0x87) { + switch (c1 & 0x0f) { + case 1 : + switch (c2) { + case 0x5b : + *d++ = c1; *d = c2; + ctype = grn_str_katakana; + break; + case 0x58 : + *d++ = c1; *d = c2; + ctype = grn_str_kanji; + break; + case 0x40 : + if (removeblankp) { + if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } + continue; + } else { + *d = ' '; + ctype = GRN_STR_BLANK|grn_str_symbol; + } + break; + default : + if (0x43 <= c2 && c2 <= 0x7e && (c3 = symbol[c2 - 0x43])) { + *d = c3; + ctype = grn_str_symbol; + } else if (0x7f <= c2 && c2 <= 0x97 && (c3 = symbol[c2 - 0x44])) { + *d = c3; + ctype = grn_str_symbol; + } else { + *d++ = c1; *d = c2; + ctype = grn_str_others; + } + break; + } + break; + case 2 : + c3 = c2 - 0x1f; + if (0x4f <= c2 && c2 <= 0x58) { + ctype = grn_str_digit; + *d = c2 - 0x1f; + } else if (0x60 <= c2 && c2 <= 0x79) { + ctype = grn_str_alpha; + *d = c2 + 0x01; + } else if (0x81 <= c2 && c2 <= 0x9a) { + ctype = grn_str_alpha; + *d = c2 - 0x20; + } else if (0x9f <= c2 && c2 <= 0xf1) { + *d++ = c1; *d = c2; + 
ctype = grn_str_hiragana; + } else { + *d++ = c1; *d = c2; + ctype = grn_str_others; + } + break; + case 3 : + if (0x40 <= c2 && c2 <= 0x96) { + *d++ = c1; *d = c2; + ctype = grn_str_katakana; + } else { + *d++ = c1; *d = c2; + ctype = grn_str_symbol; + } + break; + case 4 : + case 7 : + *d++ = c1; *d = c2; + ctype = grn_str_symbol; + break; + default : + *d++ = c1; *d = c2; + ctype = grn_str_others; + break; + } + } else { + *d++ = c1; *d = c2; + ctype = grn_str_kanji; + } + } else { + /* skip invalid character */ + continue; + } + } + } else { + unsigned char c = *s; + switch (c >> 4) { + case 0 : + case 1 : + /* skip unprintable ascii */ + if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } + continue; + case 2 : + if (c == 0x20) { + if (removeblankp) { + if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } + continue; + } else { + *d = ' '; + ctype = GRN_STR_BLANK|grn_str_symbol; + } + } else { + *d = c; + ctype = grn_str_symbol; + } + break; + case 3 : + *d = c; + ctype = (c <= 0x39) ? grn_str_digit : grn_str_symbol; + break; + case 4 : + *d = ('A' <= c) ? c + 0x20 : c; + ctype = (c == 0x40) ? grn_str_symbol : grn_str_alpha; + break; + case 5 : + *d = (c <= 'Z') ? c + 0x20 : c; + ctype = (c <= 0x5a) ? grn_str_alpha : grn_str_symbol; + break; + case 6 : + *d = c; + ctype = (c == 0x60) ? grn_str_symbol : grn_str_alpha; + break; + case 7 : + *d = c; + ctype = (c <= 0x7a) ? grn_str_alpha : (c == 0x7f ? 
grn_str_others : grn_str_symbol); + break; + default : + *d = c; + ctype = grn_str_others; + break; + } + } + d++; + length++; + if (cp) { *cp++ = ctype; } + if (ch) { + *ch++ = (int16_t)(s + 1 - s_); + s_ = s + 1; + while (++d_ < d) { *ch++ = 0; } + } + } + if (cp) { *cp = grn_str_null; } + *d = '\0'; + nstr->length = length; + nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm); + return GRN_SUCCESS; +} + +inline static grn_rc +normalize_none(grn_ctx *ctx, grn_str *nstr) +{ + int16_t *ch; + const unsigned char *s, *s_, *e; + unsigned char *d, *d0, *d_; + uint_least8_t *cp, *ctypes, ctype; + size_t size = nstr->orig_blen, length = 0; + int removeblankp = nstr->flags & GRN_STR_REMOVEBLANK; + if (!(nstr->norm = GRN_MALLOC(size + 1))) { + return GRN_NO_MEMORY_AVAILABLE; + } + d0 = (unsigned char *) nstr->norm; + if (nstr->flags & GRN_STR_WITH_CHECKS) { + if (!(nstr->checks = GRN_MALLOC(size * sizeof(int16_t) + 1))) { + GRN_FREE(nstr->norm); + nstr->norm = NULL; + return GRN_NO_MEMORY_AVAILABLE; + } + } + ch = nstr->checks; + if (nstr->flags & GRN_STR_WITH_CTYPES) { + if (!(nstr->ctypes = GRN_MALLOC(size + 1))) { + GRN_FREE(nstr->checks); + GRN_FREE(nstr->norm); + nstr->checks = NULL; + nstr->norm = NULL; + return GRN_NO_MEMORY_AVAILABLE; + } + } + cp = ctypes = nstr->ctypes; + e = (unsigned char *)nstr->orig + size; + for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) { + unsigned char c = *s; + switch (c >> 4) { + case 0 : + case 1 : + /* skip unprintable ascii */ + if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } + continue; + case 2 : + if (c == 0x20) { + if (removeblankp) { + if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } + continue; + } else { + *d = ' '; + ctype = GRN_STR_BLANK|grn_str_symbol; + } + } else { + *d = c; + ctype = grn_str_symbol; + } + break; + case 3 : + *d = c; + ctype = (c <= 0x39) ? grn_str_digit : grn_str_symbol; + break; + case 4 : + *d = ('A' <= c) ? c + 0x20 : c; + ctype = (c == 0x40) ? 
grn_str_symbol : grn_str_alpha; + break; + case 5 : + *d = (c <= 'Z') ? c + 0x20 : c; + ctype = (c <= 0x5a) ? grn_str_alpha : grn_str_symbol; + break; + case 6 : + *d = c; + ctype = (c == 0x60) ? grn_str_symbol : grn_str_alpha; + break; + case 7 : + *d = c; + ctype = (c <= 0x7a) ? grn_str_alpha : (c == 0x7f ? grn_str_others : grn_str_symbol); + break; + default : + *d = c; + ctype = grn_str_others; + break; + } + d++; + length++; + if (cp) { *cp++ = ctype; } + if (ch) { + *ch++ = (int16_t)(s + 1 - s_); + s_ = s + 1; + while (++d_ < d) { *ch++ = 0; } + } + } + if (cp) { *cp = grn_str_null; } + *d = '\0'; + nstr->length = length; + nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm); + return GRN_SUCCESS; +} + +/* use cp1252 as latin1 */ +inline static grn_rc +normalize_latin1(grn_ctx *ctx, grn_str *nstr) +{ + int16_t *ch; + const unsigned char *s, *s_, *e; + unsigned char *d, *d0, *d_; + uint_least8_t *cp, *ctypes, ctype; + size_t size = nstr->orig_blen, length = 0; + int removeblankp = nstr->flags & GRN_STR_REMOVEBLANK; + if (!(nstr->norm = GRN_MALLOC(size + 1))) { + return GRN_NO_MEMORY_AVAILABLE; + } + d0 = (unsigned char *) nstr->norm; + if (nstr->flags & GRN_STR_WITH_CHECKS) { + if (!(nstr->checks = GRN_MALLOC(size * sizeof(int16_t) + 1))) { + GRN_FREE(nstr->norm); + nstr->norm = NULL; + return GRN_NO_MEMORY_AVAILABLE; + } + } + ch = nstr->checks; + if (nstr->flags & GRN_STR_WITH_CTYPES) { + if (!(nstr->ctypes = GRN_MALLOC(size + 1))) { + GRN_FREE(nstr->checks); + GRN_FREE(nstr->norm); + nstr->checks = NULL; + nstr->norm = NULL; + return GRN_NO_MEMORY_AVAILABLE; + } + } + cp = ctypes = nstr->ctypes; + e = (unsigned char *)nstr->orig + size; + for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) { + unsigned char c = *s; + switch (c >> 4) { + case 0 : + case 1 : + /* skip unprintable ascii */ + if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } + continue; + case 2 : + if (c == 0x20) { + if (removeblankp) { + if (cp > ctypes) { *(cp - 1) |= 
GRN_STR_BLANK; } + continue; + } else { + *d = ' '; + ctype = GRN_STR_BLANK|grn_str_symbol; + } + } else { + *d = c; + ctype = grn_str_symbol; + } + break; + case 3 : + *d = c; + ctype = (c <= 0x39) ? grn_str_digit : grn_str_symbol; + break; + case 4 : + *d = ('A' <= c) ? c + 0x20 : c; + ctype = (c == 0x40) ? grn_str_symbol : grn_str_alpha; + break; + case 5 : + *d = (c <= 'Z') ? c + 0x20 : c; + ctype = (c <= 0x5a) ? grn_str_alpha : grn_str_symbol; + break; + case 6 : + *d = c; + ctype = (c == 0x60) ? grn_str_symbol : grn_str_alpha; + break; + case 7 : + *d = c; + ctype = (c <= 0x7a) ? grn_str_alpha : (c == 0x7f ? grn_str_others : grn_str_symbol); + break; + case 8 : + if (c == 0x8a || c == 0x8c || c == 0x8e) { + *d = c + 0x10; + ctype = grn_str_alpha; + } else { + *d = c; + ctype = grn_str_symbol; + } + break; + case 9 : + if (c == 0x9a || c == 0x9c || c == 0x9e || c == 0x9f) { + *d = (c == 0x9f) ? c + 0x60 : c; + ctype = grn_str_alpha; + } else { + *d = c; + ctype = grn_str_symbol; + } + break; + case 0x0c : + *d = c + 0x20; + ctype = grn_str_alpha; + break; + case 0x0d : + *d = (c == 0xd7 || c == 0xdf) ? c : c + 0x20; + ctype = (c == 0xd7) ? grn_str_symbol : grn_str_alpha; + break; + case 0x0e : + *d = c; + ctype = grn_str_alpha; + break; + case 0x0f : + *d = c; + ctype = (c == 0xf7) ? 
grn_str_symbol : grn_str_alpha; + break; + default : + *d = c; + ctype = grn_str_others; + break; + } + d++; + length++; + if (cp) { *cp++ = ctype; } + if (ch) { + *ch++ = (int16_t)(s + 1 - s_); + s_ = s + 1; + while (++d_ < d) { *ch++ = 0; } + } + } + if (cp) { *cp = grn_str_null; } + *d = '\0'; + nstr->length = length; + nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm); + return GRN_SUCCESS; +} + +inline static grn_rc +normalize_koi8r(grn_ctx *ctx, grn_str *nstr) +{ + int16_t *ch; + const unsigned char *s, *s_, *e; + unsigned char *d, *d0, *d_; + uint_least8_t *cp, *ctypes, ctype; + size_t size = strlen(nstr->orig), length = 0; + int removeblankp = nstr->flags & GRN_STR_REMOVEBLANK; + if (!(nstr->norm = GRN_MALLOC(size + 1))) { + return GRN_NO_MEMORY_AVAILABLE; + } + d0 = (unsigned char *) nstr->norm; + if (nstr->flags & GRN_STR_WITH_CHECKS) { + if (!(nstr->checks = GRN_MALLOC(size * sizeof(int16_t) + 1))) { + GRN_FREE(nstr->norm); + nstr->norm = NULL; + return GRN_NO_MEMORY_AVAILABLE; + } + } + ch = nstr->checks; + if (nstr->flags & GRN_STR_WITH_CTYPES) { + if (!(nstr->ctypes = GRN_MALLOC(size + 1))) { + GRN_FREE(nstr->checks); + GRN_FREE(nstr->norm); + nstr->checks = NULL; + nstr->norm = NULL; + return GRN_NO_MEMORY_AVAILABLE; + } + } + cp = ctypes = nstr->ctypes; + e = (unsigned char *)nstr->orig + size; + for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) { + unsigned char c = *s; + switch (c >> 4) { + case 0 : + case 1 : + /* skip unprintable ascii */ + if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } + continue; + case 2 : + if (c == 0x20) { + if (removeblankp) { + if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } + continue; + } else { + *d = ' '; + ctype = GRN_STR_BLANK|grn_str_symbol; + } + } else { + *d = c; + ctype = grn_str_symbol; + } + break; + case 3 : + *d = c; + ctype = (c <= 0x39) ? grn_str_digit : grn_str_symbol; + break; + case 4 : + *d = ('A' <= c) ? c + 0x20 : c; + ctype = (c == 0x40) ? 
grn_str_symbol : grn_str_alpha; + break; + case 5 : + *d = (c <= 'Z') ? c + 0x20 : c; + ctype = (c <= 0x5a) ? grn_str_alpha : grn_str_symbol; + break; + case 6 : + *d = c; + ctype = (c == 0x60) ? grn_str_symbol : grn_str_alpha; + break; + case 7 : + *d = c; + ctype = (c <= 0x7a) ? grn_str_alpha : (c == 0x7f ? grn_str_others : grn_str_symbol); + break; + case 0x0a : + *d = c; + ctype = (c == 0xa3) ? grn_str_alpha : grn_str_others; + break; + case 0x0b : + if (c == 0xb3) { + *d = c - 0x10; + ctype = grn_str_alpha; + } else { + *d = c; + ctype = grn_str_others; + } + break; + case 0x0c : + case 0x0d : + *d = c; + ctype = grn_str_alpha; + break; + case 0x0e : + case 0x0f : + *d = c - 0x20; + ctype = grn_str_alpha; + break; + default : + *d = c; + ctype = grn_str_others; + break; + } + d++; + length++; + if (cp) { *cp++ = ctype; } + if (ch) { + *ch++ = (int16_t)(s + 1 - s_); + s_ = s + 1; + while (++d_ < d) { *ch++ = 0; } + } + } + if (cp) { *cp = grn_str_null; } + *d = '\0'; + nstr->length = length; + nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm); + return GRN_SUCCESS; +} + +static grn_str * +grn_fakenstr_open(grn_ctx *ctx, const char *str, size_t str_len, grn_encoding encoding, int flags) +{ + /* TODO: support GRN_STR_REMOVEBLANK flag and ctypes */ + grn_str *nstr; + if (!(nstr = GRN_MALLOC(sizeof(grn_str)))) { + GRN_LOG(ctx, GRN_LOG_ALERT, "memory allocation on grn_fakenstr_open failed !"); + return NULL; + } + if (!(nstr->norm = GRN_MALLOC(str_len + 1))) { + GRN_LOG(ctx, GRN_LOG_ALERT, "memory allocation for keyword on grn_snip_add_cond failed !"); + GRN_FREE(nstr); + return NULL; + } + nstr->orig = str; + nstr->orig_blen = str_len; + memcpy(nstr->norm, str, str_len); + nstr->norm[str_len] = '\0'; + nstr->norm_blen = str_len; + nstr->ctypes = NULL; + nstr->flags = flags; + + if (flags & GRN_STR_WITH_CHECKS) { + int16_t f = 0; + unsigned char c; + size_t i; + if (!(nstr->checks = (int16_t *) GRN_MALLOC(sizeof(int16_t) * str_len))) { + 
GRN_FREE(nstr->norm); + GRN_FREE(nstr); + return NULL; + } + switch (encoding) { + case GRN_ENC_EUC_JP: + for (i = 0; i < str_len; i++) { + if (!f) { + c = (unsigned char) str[i]; + f = ((c >= 0xa1U && c <= 0xfeU) || c == 0x8eU ? 2 : (c == 0x8fU ? 3 : 1) + ); + nstr->checks[i] = f; + } else { + nstr->checks[i] = 0; + } + f--; + } + break; + case GRN_ENC_SJIS: + for (i = 0; i < str_len; i++) { + if (!f) { + c = (unsigned char) str[i]; + f = (c >= 0x81U && ((c <= 0x9fU) || (c >= 0xe0U && c <= 0xfcU)) ? 2 : 1); + nstr->checks[i] = f; + } else { + nstr->checks[i] = 0; + } + f--; + } + break; + case GRN_ENC_UTF8: + for (i = 0; i < str_len; i++) { + if (!f) { + c = (unsigned char) str[i]; + f = (c & 0x80U ? (c & 0x20U ? (c & 0x10U ? 4 : 3) + : 2) + : 1); + nstr->checks[i] = f; + } else { + nstr->checks[i] = 0; + } + f--; + } + break; + default: + for (i = 0; i < str_len; i++) { + nstr->checks[i] = 1; + } + break; + } + } else { + nstr->checks = NULL; + } + return nstr; +} + +grn_str * +grn_str_open_(grn_ctx *ctx, const char *str, unsigned int str_len, int flags, grn_encoding encoding) +{ + grn_rc rc; + grn_str *nstr; + if (!str || !str_len) { return NULL; } + + if (!(flags & GRN_STR_NORMALIZE)) { + return grn_fakenstr_open(ctx, str, str_len, encoding, flags); + } + + if (!(nstr = GRN_MALLOC(sizeof(grn_str)))) { + GRN_LOG(ctx, GRN_LOG_ALERT, "memory allocation on grn_str_open failed !"); + return NULL; + } + nstr->orig = str; + nstr->orig_blen = str_len; + nstr->norm = NULL; + nstr->norm_blen = 0; + nstr->checks = NULL; + nstr->ctypes = NULL; + nstr->encoding = encoding; + nstr->flags = flags; + switch (encoding) { + case GRN_ENC_EUC_JP : + rc = normalize_euc(ctx, nstr); + break; + case GRN_ENC_UTF8 : +#ifdef WITH_NFKC + rc = normalize_utf8(ctx, nstr); +#else /* WITH_NFKC */ + rc = normalize_none(ctx, nstr); +#endif /* WITH_NFKC */ + break; + case GRN_ENC_SJIS : + rc = normalize_sjis(ctx, nstr); + break; + case GRN_ENC_LATIN1 : + rc = normalize_latin1(ctx, nstr); + break; 
+ case GRN_ENC_KOI8R : + rc = normalize_koi8r(ctx, nstr); + break; + default : + rc = normalize_none(ctx, nstr); + break; + } + if (rc) { + grn_str_close(ctx, nstr); + return NULL; + } + return nstr; +} + +grn_str * +grn_str_open(grn_ctx *ctx, const char *str, unsigned int str_len, int flags) +{ + return grn_str_open_(ctx, str, str_len, flags, ctx->encoding); +} + +grn_rc +grn_str_close(grn_ctx *ctx, grn_str *nstr) +{ + if (nstr) { + if (nstr->norm) { GRN_FREE(nstr->norm); } + if (nstr->ctypes) { GRN_FREE(nstr->ctypes); } + if (nstr->checks) { GRN_FREE(nstr->checks); } + GRN_FREE(nstr); + return GRN_SUCCESS; + } else { + return GRN_INVALID_ARGUMENT; + } +} + static const char *grn_enc_string[] = { "default", "none", Modified: plugins/suggest/suggest.c (+6 -17) =================================================================== --- plugins/suggest/suggest.c 2013-01-21 17:53:14 +0900 (b47174c) +++ plugins/suggest/suggest.c 2013-01-21 17:55:05 +0900 (0d84cfc) @@ -304,25 +304,16 @@ complete(grn_ctx *ctx, grn_obj *items, grn_obj *items_boost, grn_obj *col, if ((res = grn_table_create(ctx, NULL, 0, NULL, GRN_TABLE_HASH_KEY|GRN_OBJ_WITH_SUBREC, items, NULL))) { grn_id tid = grn_table_get(ctx, items, TEXT_VALUE_LEN(query)); - grn_obj *string; + grn_str *norm; if (GRN_TEXT_LEN(query) && - (string = grn_string_open(ctx, TEXT_VALUE_LEN(query), - GRN_NORMALIZER_AUTO, 0))) { + (norm = grn_str_open(ctx, TEXT_VALUE_LEN(query), GRN_STR_NORMALIZE))) { grn_table_cursor *cur; /* RK search + prefix search */ grn_obj *index; - const char *normalized; - unsigned int normalized_length_in_bytes; - grn_string_get_normalized(ctx, string, - &normalized, - &normalized_length_in_bytes, - NULL); - /* FIXME: support index selection */ + /* FIXME: support index selection */ if (grn_column_index(ctx, col, GRN_OP_PREFIX, &index, 1, NULL)) { if ((cur = grn_table_cursor_open(ctx, grn_ctx_at(ctx, index->header.domain), - normalized, - normalized_length_in_bytes, - NULL, 0, 0, -1, + norm->norm, 
norm->norm_blen, NULL, 0, 0, -1, GRN_CURSOR_PREFIX|GRN_CURSOR_RK))) { grn_id id; while ((id = grn_table_cursor_next(ctx, cur))) { @@ -351,9 +342,7 @@ complete(grn_ctx *ctx, grn_obj *items, grn_obj *items_boost, grn_obj *col, if (((prefix_search_mode == GRN_SUGGEST_SEARCH_YES) || (prefix_search_mode == GRN_SUGGEST_SEARCH_AUTO && !grn_table_size(ctx, res))) && - (cur = grn_table_cursor_open(ctx, items, - normalized, - normalized_length_in_bytes, + (cur = grn_table_cursor_open(ctx, items, norm->norm, norm->norm_blen, NULL, 0, 0, -1, GRN_CURSOR_PREFIX))) { grn_id id; while ((id = grn_table_cursor_next(ctx, cur))) { @@ -362,7 +351,7 @@ complete(grn_ctx *ctx, grn_obj *items, grn_obj *items_boost, grn_obj *col, } grn_table_cursor_close(ctx, cur); } - grn_obj_close(ctx, string); + grn_str_close(ctx, norm); } output(ctx, items, res, tid, sortby, output_columns, offset, limit); grn_obj_close(ctx, res); Modified: test/unit/util/test-string.c (+14 -34) =================================================================== --- test/unit/util/test-string.c 2013-01-21 17:53:14 +0900 (05fcb67) +++ test/unit/util/test-string.c 2013-01-21 17:55:05 +0900 (b1b617b) @@ -190,36 +190,26 @@ test_normalize(gconstpointer data) { const gchar *utf8_expected, *encoded_expected; const gchar *utf8_input, *encoded_input; - grn_obj *string; + grn_str *string; const gchar *normalized_text; - guint normalized_text_length; - guint normalized_text_n_characters; + guint normalized_text_len; int flags; grn_encoding encoding; encoding = gcut_data_get_int(data, "encoding"); GRN_CTX_SET_ENCODING(&context, encoding); - flags = GRN_STRING_WITH_CHECKS | GRN_STRING_WITH_TYPES; + flags = GRN_STR_NORMALIZE | GRN_STR_WITH_CHECKS | GRN_STR_WITH_CTYPES; utf8_input = gcut_data_get_string(data, "input"); encoded_input = convert_encoding(utf8_input, encoding); - string = grn_string_open(&context, - encoded_input, - strlen(encoded_input), - GRN_NORMALIZER_AUTO, - flags); - grn_string_get_normalized(&context, string, - 
&normalized_text, - &normalized_text_length, - &normalized_text_n_characters); - normalized_text = cut_take_strndup(normalized_text, normalized_text_length); - grn_obj_unlink(&context, string); + string = grn_str_open(&context, encoded_input, strlen(encoded_input), flags); + normalized_text = cut_take_strndup(string->norm, string->norm_blen); + normalized_text_len = string->norm_blen; + grn_test_assert(grn_str_close(&context, string)); utf8_expected = gcut_data_get_string(data, "expected"); encoded_expected = convert_encoding(utf8_expected, encoding); cut_assert_equal_string(encoded_expected, normalized_text); - cut_assert_equal_uint(strlen(encoded_expected), normalized_text_length); - cut_assert_equal_uint(g_utf8_strlen(utf8_expected, -1), - normalized_text_n_characters); + cut_assert_equal_int(strlen(encoded_expected), normalized_text_len); } void @@ -253,13 +243,11 @@ data_normalize_broken(void) void test_normalize_broken(gconstpointer data) { - grn_obj *string; + grn_str *string; const gchar *input, *encoded_input; - const gchar *normalized_text; grn_encoding input_encoding, context_encoding; gint input_length; - guint normalized_text_length, normalized_text_n_characters; - int flags = GRN_STRING_WITH_CHECKS | GRN_STRING_WITH_TYPES; + int flags = GRN_STR_NORMALIZE | GRN_STR_WITH_CHECKS | GRN_STR_WITH_CTYPES; context_encoding = gcut_data_get_int(data, "context-encoding"); GRN_CTX_SET_ENCODING(&context, context_encoding); @@ -271,18 +259,10 @@ test_normalize_broken(gconstpointer data) if (input_length < 0) { input_length = strlen(encoded_input); } - string = grn_string_open(&context, encoded_input, input_length, - GRN_NORMALIZER_AUTO, flags); - grn_string_get_normalized(&context, string, - &normalized_text, - &normalized_text_length, - &normalized_text_n_characters); - normalized_text = cut_take_strndup(normalized_text, normalized_text_length); - grn_obj_unlink(&context, string); - - cut_assert_equal_string("", normalized_text); - cut_assert_equal_int(0, 
normalized_text_length); - cut_assert_equal_int(0, normalized_text_n_characters); + string = grn_str_open(&context, encoded_input, input_length, flags); + cut_assert_equal_string("", string->norm); + cut_assert_equal_int(0, string->norm_blen); + grn_test_assert(grn_str_close(&context, string)); } void -------------- next part -------------- HTMLの添付ファイルを保管しました...Télécharger