[Groonga-commit] groonga/groonga at 45f8edd [master] table_tokenize: add index_column option

naoa null+****@clear*****
Sun Apr 3 07:34:17 JST 2016


naoa	2016-04-03 07:34:17 +0900 (Sun, 03 Apr 2016)

  New Revision: 45f8edda981e7bce82dce7ccf88f87733b6c7d2e
  https://github.com/groonga/groonga/commit/45f8edda981e7bce82dce7ccf88f87733b6c7d2e

  Merged 3687075: Merge pull request #518 from naoa/add-output-df-to-table-tokenize

  Message:
    table_tokenize: add index_column option

  Added files:
    test/command/suite/table_tokenize/index_column.expected
    test/command/suite/table_tokenize/index_column.test
  Modified files:
    lib/proc/proc_tokenize.c
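
  Usage sketch (based on the test case added below, where the Terms lexicon has
  a COLUMN_INDEX column named "index" on Entries.body): passing --index_column
  makes each output token also carry an "estimate_size" value read from that
  index column of the lexicon.

    table_tokenize Terms "a ruby bindings of Groonga" --mode GET --index_column index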

  Modified: lib/proc/proc_tokenize.c (+57 -9)
===================================================================
--- lib/proc/proc_tokenize.c    2016-04-03 02:04:55 +0900 (0fc98c1)
+++ lib/proc/proc_tokenize.c    2016-04-03 07:34:17 +0900 (2ec1935)
@@ -69,11 +69,19 @@ typedef struct {
 } tokenize_token;
 
 static void
-output_tokens(grn_ctx *ctx, grn_obj *tokens, grn_obj *lexicon)
+output_tokens(grn_ctx *ctx, grn_obj *tokens, grn_obj *lexicon, grn_obj *index_column)
 {
-  int i, n_tokens;
+  int i, n_tokens, n_elements;
+  grn_obj estimate_size;
 
   n_tokens = GRN_BULK_VSIZE(tokens) / sizeof(tokenize_token);
+  if (index_column) {
+    n_elements = 4;
+    GRN_UINT32_INIT(&estimate_size, 0);
+  } else {
+    n_elements = 3;
+  }
+
   grn_ctx_output_array_open(ctx, "TOKENS", n_tokens);
   for (i = 0; i < n_tokens; i++) {
     tokenize_token *token;
@@ -82,7 +90,7 @@ output_tokens(grn_ctx *ctx, grn_obj *tokens, grn_obj *lexicon)
 
     token = ((tokenize_token *)(GRN_BULK_HEAD(tokens))) + i;
 
-    grn_ctx_output_map_open(ctx, "TOKEN", 3);
+    grn_ctx_output_map_open(ctx, "TOKEN", n_elements);
 
     grn_ctx_output_cstr(ctx, "value");
     value_size = grn_table_get_key(ctx, lexicon, token->id,
@@ -95,8 +103,20 @@ output_tokens(grn_ctx *ctx, grn_obj *tokens, grn_obj *lexicon)
     grn_ctx_output_cstr(ctx, "force_prefix");
     grn_ctx_output_bool(ctx, token->force_prefix);
 
+    if (index_column) {
+      GRN_BULK_REWIND(&estimate_size);
+      grn_obj_get_value(ctx, index_column, token->id, &estimate_size);
+      grn_ctx_output_cstr(ctx, "estimate_size");
+      grn_ctx_output_int64(ctx, GRN_UINT32_VALUE(&estimate_size));
+    }
+
     grn_ctx_output_map_close(ctx);
   }
+
+  if (index_column) {
+    grn_obj_unlink(ctx, &estimate_size);
+  }
+
   grn_ctx_output_array_close(ctx);
 }
 
@@ -216,11 +236,13 @@ command_table_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *u
   grn_obj *string;
   grn_obj *flag_names;
   grn_obj *mode_name;
+  grn_obj *index_column_name;
 
   table_name = grn_plugin_proc_get_var(ctx, user_data, "table", -1);
   string = grn_plugin_proc_get_var(ctx, user_data, "string", -1);
   flag_names = grn_plugin_proc_get_var(ctx, user_data, "flags", -1);
   mode_name = grn_plugin_proc_get_var(ctx, user_data, "mode", -1);
+  index_column_name = grn_plugin_proc_get_var(ctx, user_data, "index_column", -1);
 
   if (GRN_TEXT_LEN(table_name) == 0) {
     GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "[table_tokenize] table name is missing");
@@ -235,6 +257,7 @@ command_table_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *u
   {
     unsigned int flags;
     grn_obj *lexicon;
+    grn_obj *index_column = NULL;
 
     flags = parse_tokenize_flags(ctx, flag_names);
     if (ctx->rc != GRN_SUCCESS) {
@@ -251,15 +274,35 @@ command_table_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *u
     (GRN_TEXT_LEN(mode_name) == strlen(name) &&\
      memcmp(GRN_TEXT_VALUE(mode_name), name, strlen(name)) == 0)
 
+    if (GRN_TEXT_LEN(index_column_name) > 0) {
+      index_column = grn_obj_column(ctx, lexicon,
+                                    GRN_TEXT_VALUE(index_column_name),
+                                    GRN_TEXT_LEN(index_column_name));
+      if (!index_column) {
+        GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
+                         "[table_tokenize] nonexistent index column: <%.*s>",
+                         (int)GRN_TEXT_LEN(index_column_name),
+                         GRN_TEXT_VALUE(index_column_name));
+        goto exit;
+      }
+      if (index_column->header.type != GRN_COLUMN_INDEX) {
+        GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
+                         "[table_tokenize] index column must be COLUMN_INDEX: <%.*s>",
+                         (int)GRN_TEXT_LEN(index_column_name),
+                         GRN_TEXT_VALUE(index_column_name));
+        goto exit;
+      }
+    }
+
     {
       grn_obj tokens;
       GRN_VALUE_FIX_SIZE_INIT(&tokens, GRN_OBJ_VECTOR, GRN_ID_NIL);
     if (GRN_TEXT_LEN(mode_name) == 0 || MODE_NAME_EQUAL("GET")) {
       tokenize(ctx, lexicon, string, GRN_TOKEN_GET, flags, &tokens);
-      output_tokens(ctx, &tokens, lexicon);
+      output_tokens(ctx, &tokens, lexicon, index_column);
     } else if (MODE_NAME_EQUAL("ADD")) {
       tokenize(ctx, lexicon, string, GRN_TOKEN_ADD, flags, &tokens);
-      output_tokens(ctx, &tokens, lexicon);
+      output_tokens(ctx, &tokens, lexicon, index_column);
     } else {
       GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                        "[table_tokenize] invalid mode: <%.*s>",
@@ -269,7 +312,11 @@ command_table_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *u
     }
 #undef MODE_NAME_EQUAL
 
+exit:
     grn_obj_unlink(ctx, lexicon);
+    if (index_column) {
+      grn_obj_unlink(ctx, index_column);
+    }
   }
 
   return NULL;
@@ -278,16 +325,17 @@ command_table_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *u
 void
 grn_proc_init_table_tokenize(grn_ctx *ctx)
 {
-  grn_expr_var vars[4];
+  grn_expr_var vars[5];
 
   grn_plugin_expr_var_init(ctx, &(vars[0]), "table", -1);
   grn_plugin_expr_var_init(ctx, &(vars[1]), "string", -1);
   grn_plugin_expr_var_init(ctx, &(vars[2]), "flags", -1);
   grn_plugin_expr_var_init(ctx, &(vars[3]), "mode", -1);
+  grn_plugin_expr_var_init(ctx, &(vars[4]), "index_column", -1);
   grn_plugin_command_create(ctx,
                             "table_tokenize", -1,
                             command_table_tokenize,
-                            4,
+                            5,
                             vars);
 }
 
@@ -343,12 +391,12 @@ command_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_da
       GRN_VALUE_FIX_SIZE_INIT(&tokens, GRN_OBJ_VECTOR, GRN_ID_NIL);
       if (GRN_TEXT_LEN(mode_name) == 0 || MODE_NAME_EQUAL("ADD")) {
         tokenize(ctx, lexicon, string, GRN_TOKEN_ADD, flags, &tokens);
-        output_tokens(ctx, &tokens, lexicon);
+        output_tokens(ctx, &tokens, lexicon, NULL);
       } else if (MODE_NAME_EQUAL("GET")) {
         tokenize(ctx, lexicon, string, GRN_TOKEN_ADD, flags, &tokens);
         GRN_BULK_REWIND(&tokens);
         tokenize(ctx, lexicon, string, GRN_TOKEN_GET, flags, &tokens);
-        output_tokens(ctx, &tokens, lexicon);
+        output_tokens(ctx, &tokens, lexicon, NULL);
       } else {
         GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                          "[tokenize] invalid mode: <%.*s>",

  Added: test/command/suite/table_tokenize/index_column.expected (+55 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/table_tokenize/index_column.expected    2016-04-03 07:34:17 +0900 (61f4663)
@@ -0,0 +1,55 @@
+table_create Entries TABLE_NO_KEY
+[[0,0.0,0.0],true]
+column_create Entries body COLUMN_SCALAR ShortText
+[[0,0.0,0.0],true]
+table_create Terms TABLE_PAT_KEY ShortText   --default_tokenizer TokenDelimit   --normalizer NormalizerAuto
+[[0,0.0,0.0],true]
+load --table Entries
+[
+{"body": "Groonga is a full text search engine"},
+{"body": "Mroonga is a MySQL storage engine based on Groonga"},
+{"body": "Rroonga is a ruby bindings of Groonga"}
+]
+[[0,0.0,0.0],3]
+column_create Terms index COLUMN_INDEX Entries body
+[[0,0.0,0.0],true]
+table_tokenize Terms "a ruby bindings of Groonga" --mode GET --index_column index
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "a",
+      "position": 0,
+      "force_prefix": false,
+      "estimate_size": 5
+    },
+    {
+      "value": "ruby",
+      "position": 1,
+      "force_prefix": false,
+      "estimate_size": 1
+    },
+    {
+      "value": "bindings",
+      "position": 2,
+      "force_prefix": false,
+      "estimate_size": 1
+    },
+    {
+      "value": "of",
+      "position": 3,
+      "force_prefix": false,
+      "estimate_size": 1
+    },
+    {
+      "value": "groonga",
+      "position": 4,
+      "force_prefix": false,
+      "estimate_size": 5
+    }
+  ]
+]

  Added: test/command/suite/table_tokenize/index_column.test (+17 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/table_tokenize/index_column.test    2016-04-03 07:34:17 +0900 (b931843)
@@ -0,0 +1,17 @@
+table_create Entries TABLE_NO_KEY
+column_create Entries body COLUMN_SCALAR ShortText
+
+table_create Terms TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenDelimit \
+  --normalizer NormalizerAuto
+
+load --table Entries
+[
+{"body": "Groonga is a full text search engine"},
+{"body": "Mroonga is a MySQL storage engine based on Groonga"},
+{"body": "Rroonga is a ruby bindings of Groonga"}
+]
+
+column_create Terms index COLUMN_INDEX Entries body
+
+table_tokenize Terms "a ruby bindings of Groonga" --mode GET --index_column index