[Groonga-commit] groonga/groonga [master] tokenizer ngram: use grn_tokenizer_query

Back to archive index

Kouhei Sutou null+****@clear*****
Fri Dec 21 17:17:12 JST 2012


Kouhei Sutou	2012-12-21 17:17:12 +0900 (Fri, 21 Dec 2012)

  New Revision: e69f8551f713a240118c2318ab6f5127f55bc06e
  https://github.com/groonga/groonga/commit/e69f8551f713a240118c2318ab6f5127f55bc06e

  Log:
    tokenizer ngram: use grn_tokenizer_query

  Modified files:
    lib/token.c

  Modified: lib/token.c (+42 -39)
===================================================================
--- lib/token.c    2012-12-21 17:15:43 +0900 (32079fb)
+++ lib/token.c    2012-12-21 17:17:12 +0900 (57483e2)
@@ -206,7 +206,6 @@ delimit_null_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_d
 typedef struct {
   grn_tokenizer_token token;
   grn_tokenizer_query *query;
-  grn_obj *nstr;
   uint8_t uni_alpha;
   uint8_t uni_digit;
   uint8_t uni_symbol;
@@ -215,7 +214,6 @@ typedef struct {
   uint8_t overlap;
   int32_t pos;
   uint32_t skip;
-  grn_encoding encoding;
   const unsigned char *next;
   const unsigned char *end;
   const uint_least8_t *ctypes;
@@ -224,25 +222,34 @@ typedef struct {
 } grn_ngram_tokenizer;
 
 static grn_obj *
-ngram_init(grn_ctx *ctx, grn_obj *table, grn_user_data *user_data, uint8_t ngram_unit,
+ngram_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data, uint8_t ngram_unit,
            uint8_t uni_alpha, uint8_t uni_digit, uint8_t uni_symbol, uint8_t ignore_blank)
 {
-  grn_obj *str;
-  grn_obj *normalizer = NULL;
-  int nflags =
+  unsigned int normalize_flags =
     GRN_STRING_REMOVE_BLANK |
     GRN_STRING_WITH_TYPES |
     GRN_STRING_REMOVE_TOKENIZED_DELIMITER;
+  grn_tokenizer_query *query;
   const char *normalized;
   unsigned int normalized_length_in_bytes;
   grn_ngram_tokenizer *token;
-  grn_obj_flags table_flags;
-  if (!(str = grn_ctx_pop(ctx))) {
-    ERR(GRN_INVALID_ARGUMENT, "missing argument");
+
+  query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags);
+  if (!query) {
+    return NULL;
+  }
+
+  if (!(token = GRN_MALLOC(sizeof(grn_ngram_tokenizer)))) {
+    grn_tokenizer_query_close(ctx, query);
+    ERR(GRN_NO_MEMORY_AVAILABLE,
+        "[tokenizer][ngram] "
+        "memory allocation to grn_ngram_tokenizer failed");
     return NULL;
   }
-  if (!(token = GRN_MALLOC(sizeof(grn_ngram_tokenizer)))) { return NULL; }
   user_data->ptr = token;
+
+  token->query = query;
+
   token->uni_alpha = uni_alpha;
   token->uni_digit = uni_digit;
   token->uni_symbol = uni_symbol;
@@ -251,64 +258,56 @@ ngram_init(grn_ctx *ctx, grn_obj *table, grn_user_data *user_data, uint8_t ngram
   token->overlap = 0;
   token->pos = 0;
   token->skip = 0;
-  grn_table_get_info(ctx, table, &table_flags, &token->encoding, NULL,
-                     &normalizer);
-  if (!(token->nstr = grn_string_open_(ctx,
-                                       GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str),
-                                       normalizer, nflags, token->encoding))) {
-    GRN_FREE(token);
-    ERR(GRN_TOKENIZER_ERROR, "grn_string_open failed at grn_token_open");
-    return NULL;
-  }
-  grn_string_get_normalized(ctx, token->nstr,
+
+  grn_string_get_normalized(ctx, token->query->normalized_query,
                             &normalized, &normalized_length_in_bytes,
                             &(token->len));
   token->next = (const unsigned char *)normalized;
   token->end = token->next + normalized_length_in_bytes;
-  token->ctypes = grn_string_get_types(ctx, token->nstr);
+  token->ctypes = grn_string_get_types(ctx, token->query->normalized_query);
   grn_tokenizer_token_init(ctx, &(token->token));
   return NULL;
 }
 
 static grn_obj *
 unigram_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
-{ return ngram_init(ctx, args[0], user_data, 1, 1, 1, 1, 0); }
+{ return ngram_init(ctx, nargs, args, user_data, 1, 1, 1, 1, 0); }
 
 static grn_obj *
 bigram_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
-{ return ngram_init(ctx, args[0], user_data, 2, 1, 1, 1, 0); }
+{ return ngram_init(ctx, nargs, args, user_data, 2, 1, 1, 1, 0); }
 
 static grn_obj *
 trigram_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
-{ return ngram_init(ctx, args[0], user_data, 3, 1, 1, 1, 0); }
+{ return ngram_init(ctx, nargs, args, user_data, 3, 1, 1, 1, 0); }
 
 static grn_obj *
 bigrams_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
-{ return ngram_init(ctx, args[0], user_data, 2, 1, 1, 0, 0); }
+{ return ngram_init(ctx, nargs, args, user_data, 2, 1, 1, 0, 0); }
 
 static grn_obj *
 bigramsa_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
-{ return ngram_init(ctx, args[0], user_data, 2, 0, 1, 0, 0); }
+{ return ngram_init(ctx, nargs, args, user_data, 2, 0, 1, 0, 0); }
 
 static grn_obj *
 bigramsad_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
-{ return ngram_init(ctx, args[0], user_data, 2, 0, 0, 0, 0); }
+{ return ngram_init(ctx, nargs, args, user_data, 2, 0, 0, 0, 0); }
 
 static grn_obj *
 bigrami_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
-{ return ngram_init(ctx, args[0], user_data, 2, 1, 1, 1, 1); }
+{ return ngram_init(ctx, nargs, args, user_data, 2, 1, 1, 1, 1); }
 
 static grn_obj *
 bigramis_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
-{ return ngram_init(ctx, args[0], user_data, 2, 1, 1, 0, 1); }
+{ return ngram_init(ctx, nargs, args, user_data, 2, 1, 1, 0, 1); }
 
 static grn_obj *
 bigramisa_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
-{ return ngram_init(ctx, args[0], user_data, 2, 0, 1, 0, 1); }
+{ return ngram_init(ctx, nargs, args, user_data, 2, 0, 1, 0, 1); }
 
 static grn_obj *
 bigramisad_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
-{ return ngram_init(ctx, args[0], user_data, 2, 0, 0, 0, 1); }
+{ return ngram_init(ctx, nargs, args, user_data, 2, 0, 0, 0, 1); }
 
 static grn_obj *
 ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
@@ -319,7 +318,8 @@ ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   int32_t len = 0, pos = token->pos + token->skip, status = 0;
   const uint_least8_t *cp = token->ctypes ? token->ctypes + pos : NULL;
   if (cp && token->uni_alpha && GRN_STR_CTYPE(*cp) == grn_str_alpha) {
-    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e, token->encoding))) {
+    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e,
+                              token->query->encoding))) {
       len++;
       r += cl;
       if (/* !token->ignore_blank && */ GRN_STR_ISBLANK(*cp)) { break; }
@@ -328,7 +328,8 @@ ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
     token->next = r;
     token->overlap = 0;
   } else if (cp && token->uni_digit && GRN_STR_CTYPE(*cp) == grn_str_digit) {
-    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e, token->encoding))) {
+    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e,
+                              token->query->encoding))) {
       len++;
       r += cl;
       if (/* !token->ignore_blank && */ GRN_STR_ISBLANK(*cp)) { break; }
@@ -337,7 +338,8 @@ ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
     token->next = r;
     token->overlap = 0;
   } else if (cp && token->uni_symbol && GRN_STR_CTYPE(*cp) == grn_str_symbol) {
-    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e, token->encoding))) {
+    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e,
+                              token->query->encoding))) {
       len++;
       r += cl;
       if (!token->ignore_blank && GRN_STR_ISBLANK(*cp)) { break; }
@@ -354,21 +356,22 @@ ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
         token->status = GRN_TOKEN_NOT_FOUND;
         return NULL;
       }
-      len = grn_str_len(key, token->encoding, NULL);
+      len = grn_str_len(key, token->query->encoding, NULL);
     }
-    r = p + grn_charlen_(ctx, p, e, token->encoding);
+    r = p + grn_charlen_(ctx, p, e, token->query->encoding);
     if (tid && (len > 1 || r == p)) {
       if (r != p && pos + len - 1 <= token->tail) { continue; }
       p += strlen(key);
       if (!*p && token->mode == GRN_TOKEN_GET) { token->status = GRN_TOKEN_DONE; }
     }
 #endif /* PRE_DEFINED_UNSPLIT_WORDS */
-    if ((cl = grn_charlen_(ctx, (char *)r, (char *)e, token->encoding))) {
+    if ((cl = grn_charlen_(ctx, (char *)r, (char *)e, token->query->encoding))) {
       len++;
       r += cl;
       token->next = r;
       while (len < token->ngram_unit &&
-             (cl = grn_charlen_(ctx, (char *)r, (char *)e, token->encoding))) {
+             (cl = grn_charlen_(ctx, (char *)r, (char *)e,
+                                token->query->encoding))) {
         if (cp) {
           if (!token->ignore_blank && GRN_STR_ISBLANK(*cp)) { break; }
           cp++;
@@ -403,7 +406,7 @@ ngram_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
 {
   grn_ngram_tokenizer *token = user_data->ptr;
   grn_tokenizer_token_fin(ctx, &(token->token));
-  grn_obj_close(ctx, token->nstr);
+  grn_tokenizer_query_close(ctx, token->query);
   GRN_FREE(token);
   return NULL;
 }
-------------- next part --------------
HTMLの添付ファイルを保管しました...
Télécharger 



More information about the Groonga-commit mailing list
Back to archive index