Kouhei Sutou	2019-02-07 14:48:48 +0900 (Thu, 07 Feb 2019)

  Revision: 81dd354377cd870f74736988ad1f6d5181810bf8
  https://github.com/groonga/groonga/commit/81dd354377cd870f74736988ad1f6d5181810bf8

  Message:
    TokenTable: add a new tokenizer that tokenizes existing keys

  Added files:
    test/command/suite/select/query/match/with_index/token_table/same.expected
    test/command/suite/select/query/match/with_index/token_table/same.test
    test/command/suite/tokenizers/table/match.expected
    test/command/suite/tokenizers/table/match.test
    test/command/suite/tokenizers/table/no_table.expected
    test/command/suite/tokenizers/table/no_table.test
    test/command/suite/tokenizers/table/nonexistent_table.expected
    test/command/suite/tokenizers/table/nonexistent_table.test
  Modified files:
    lib/tokenizers.c
    test/command/suite/schema/plugins.expected
    test/command/suite/schema/tables/columns/compress/lz4.expected
    test/command/suite/schema/tables/columns/compress/zlib.expected
    test/command/suite/schema/tables/columns/compress/zstd.expected
    test/command/suite/schema/tables/columns/type/index_medium.expected
    test/command/suite/schema/tables/columns/type/index_small.expected
    test/command/suite/schema/tables/columns/type/scalar.expected
    test/command/suite/schema/tables/columns/type/vector.expected
    test/command/suite/schema/tables/normalizer.expected
    test/command/suite/schema/tables/normalizer_with_options.expected
    test/command/suite/schema/tables/token_filters.expected
    test/command/suite/schema/tables/token_filters_with_options.expected
    test/command/suite/schema/tables/tokenizer.expected
    test/command/suite/schema/tables/tokenizer_with_options.expected
    test/command/suite/schema/tables/type/array.expected
    test/command/suite/schema/tables/type/hash_table.expected
    test/command/suite/schema/tables/value_type/reference.expected
    test/command/suite/schema/tables/value_type/type.expected
    test/command/suite/tokenizer_list/default.expected

  Modified: lib/tokenizers.c (+250 -0)
===================================================================
--- lib/tokenizers.c    2019-02-07 10:54:48 +0900 (d10118356)
+++ lib/tokenizers.c    2019-02-07 14:48:48 +0900 (41f8ee843)
@@ -1764,6 +1764,249 @@ pattern_fin(grn_ctx *ctx, void *user_data)
   GRN_FREE(tokenizer);
 }
 
+/* table tokenizer */
+
+typedef struct {
+  grn_obj *table;
+} grn_table_options;
+
+typedef struct {
+  grn_tokenizer_token token;
+  grn_tokenizer_query *query;
+  grn_table_options *options;
+  grn_bool have_tokenized_delimiter;
+  grn_encoding encoding;
+  const unsigned char *start;
+  const unsigned char *current;
+  const unsigned char *next;
+  const unsigned char *end;
+  grn_pat_scan_hit hits[1024];
+  int n_hits;
+  int current_hit;
+} grn_table_tokenizer;
+
+static void
+table_options_init(grn_table_options *options)
+{
+  options->table = NULL;
+}
+
+static void *
+table_open_options(grn_ctx *ctx,
+                   grn_obj *tokenizer,
+                   grn_obj *raw_options,
+                   void *user_data)
+{
+  grn_table_options *options;
+
+  options = GRN_MALLOC(sizeof(grn_table_options));
+  if (!options) {
+    ERR(GRN_NO_MEMORY_AVAILABLE,
+        "[tokenizer][table] "
+        "failed to allocate memory for options");
+    return NULL;
+  }
+
+  table_options_init(options);
+  GRN_OPTION_VALUES_EACH_BEGIN(ctx, raw_options, i, name, name_length) {
+    grn_raw_string name_raw;
+    name_raw.value = name;
+    name_raw.length = name_length;
+
+    if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "table")) {
+      const char *name;
+      unsigned int name_length;
+      grn_id domain;
+
+      name_length = grn_vector_get_element(ctx,
+                                           raw_options,
+                                           i,
+                                           &name,
+                                           NULL,
+                                           &domain);
+      if (grn_type_id_is_text_family(ctx, domain) && name_length > 0) {
+        options->table = grn_ctx_get(ctx, name, name_length);
+        if (!options->table) {
+          ERR(GRN_INVALID_ARGUMENT,
+              "[tokenizer][table] nonexistent table: <%.*s>",
+              (int)name_length, name);
+          break;
+        }
+        if (options->table->header.type != GRN_TABLE_PAT_KEY) {
+          grn_obj inspected;
+          GRN_TEXT_INIT(&inspected, 0);
+          grn_inspect(ctx, &inspected, options->table);
+          ERR(GRN_INVALID_ARGUMENT,
+              "[tokenizer][table] table must be a patricia trie table: "
+              "<%.*s>: <%.*s>",
+              (int)name_length,
+              name,
+              (int)GRN_TEXT_LEN(&inspected),
+              GRN_TEXT_VALUE(&inspected));
+          GRN_OBJ_FIN(ctx, &inspected);
+          break;
+        }
+      }
+    }
+  } GRN_OPTION_VALUES_EACH_END();
+
+  if (ctx->rc == GRN_SUCCESS && !options->table) {
+    ERR(GRN_INVALID_ARGUMENT,
+        "[tokenizer][table] table isn't specified");
+  }
+
+  return options;
+}
+
+static void
+table_close_options(grn_ctx *ctx, void *data)
+{
+  grn_table_options *options = data;
+  GRN_FREE(options);
+}
+
+static void *
+table_init(grn_ctx *ctx, grn_tokenizer_query *query)
+{
+  grn_obj *lexicon = grn_tokenizer_query_get_lexicon(ctx, query);
+  grn_table_options *options;
+  grn_table_tokenizer *tokenizer;
+
+  options = grn_table_cache_default_tokenizer_options(ctx,
+                                                      lexicon,
+                                                      table_open_options,
+                                                      table_close_options,
+                                                      NULL);
+  if (ctx->rc != GRN_SUCCESS) {
+    return NULL;
+  }
+
+  if (!(tokenizer = GRN_MALLOC(sizeof(grn_table_tokenizer)))) {
+    ERR(GRN_NO_MEMORY_AVAILABLE,
+        "[tokenizer][table] "
+        "memory allocation to grn_table_tokenizer failed");
+    return NULL;
+  }
+
+  tokenizer->query = query;
+  tokenizer->options = options;
+
+  {
+    const char *raw_string;
+    size_t raw_string_length;
+    grn_encoding encoding;
+
+    raw_string = grn_tokenizer_query_get_raw_string(ctx,
+                                                    tokenizer->query,
+                                                    &raw_string_length);
+    encoding = grn_tokenizer_query_get_encoding(ctx, tokenizer->query);
+    tokenizer->have_tokenized_delimiter =
+      grn_tokenizer_have_tokenized_delimiter(ctx,
+                                             raw_string,
+                                             raw_string_length,
+                                             encoding);
+    tokenizer->encoding = encoding;
+  }
+  {
+    grn_obj *string;
+    const char *normalized;
+    unsigned int normalized_length_in_bytes;
+
+    string = grn_tokenizer_query_get_normalized_string(ctx, tokenizer->query);
+    grn_string_get_normalized(ctx,
+                              string,
+                              &normalized, &normalized_length_in_bytes,
+                              NULL);
+    tokenizer->start = (const unsigned char *)normalized;
+    tokenizer->next = tokenizer->start;
+    tokenizer->current = tokenizer->start;
+    tokenizer->end = tokenizer->start + normalized_length_in_bytes;
+  }
+
+  tokenizer->n_hits = 0;
+  tokenizer->current_hit = -1;
+
+  return tokenizer;
+}
+
+static void
+table_scan(grn_ctx *ctx,
+           grn_table_tokenizer *tokenizer)
+{
+  const char *rest;
+  tokenizer->n_hits = grn_pat_scan(ctx,
+                                   (grn_pat *)(tokenizer->options->table),
+                                   tokenizer->next,
+                                   tokenizer->end - tokenizer->next,
+                                   tokenizer->hits,
+                                   sizeof(tokenizer->hits) /
+                                   sizeof(*(tokenizer->hits)),
+                                   &rest);
+  tokenizer->current = tokenizer->next;
+  tokenizer->next = rest;
+  tokenizer->current_hit = 0;
+}
+
+static void
+table_next(grn_ctx *ctx,
+           grn_tokenizer_query *query,
+           grn_token *token,
+           void *user_data)
+{
+  grn_table_tokenizer *tokenizer = user_data;
+
+  if (tokenizer->have_tokenized_delimiter) {
+    unsigned int rest_length;
+    rest_length = tokenizer->end - tokenizer->next;
+    tokenizer->next =
+      (unsigned char *)grn_tokenizer_next_by_tokenized_delimiter(
+        ctx,
+        token,
+        (const char *)tokenizer->next,
+        rest_length,
+        tokenizer->encoding);
+  } else {
+    if (tokenizer->current_hit == -1) {
+      table_scan(ctx, tokenizer);
+    }
+    if (tokenizer->current_hit < tokenizer->n_hits) {
+      grn_pat_scan_hit *hit = &(tokenizer->hits[tokenizer->current_hit]);
+      grn_token_set_data(ctx,
+                         token,
+                         tokenizer->current + hit->offset,
+                         hit->length);
+      tokenizer->current_hit++;
+      if (tokenizer->current_hit == tokenizer->n_hits) {
+        grn_token_status status = GRN_TOKEN_CONTINUE;
+        tokenizer->current_hit = -1;
+        if (tokenizer->next != tokenizer->end) {
+          table_scan(ctx, tokenizer);
+        }
+        if (tokenizer->next == tokenizer->end) {
+          status = GRN_TOKEN_LAST;
+        }
+        grn_token_set_status(ctx, token, status);
+      } else {
+        grn_token_set_status(ctx, token, GRN_TOKEN_CONTINUE);
+      }
+    } else {
+      grn_token_set_data(ctx, token, NULL, 0);
+      grn_token_set_status(ctx, token, GRN_TOKEN_LAST);
+    }
+  }
+}
+
+static void
+table_fin(grn_ctx *ctx, void *user_data)
+{
+  grn_table_tokenizer *tokenizer = user_data;
+
+  if (!tokenizer) {
+    return;
+  }
+  GRN_FREE(tokenizer);
+}
+
 /* external */
 
 grn_rc
@@ -1958,6 +2201,13 @@ grn_db_init_builtin_tokenizers(grn_ctx *ctx)
     grn_tokenizer_set_next_func(ctx, tokenizer, pattern_next);
     grn_tokenizer_set_fin_func(ctx, tokenizer, pattern_fin);
   }
+  {
+    grn_obj *tokenizer;
+    tokenizer = grn_tokenizer_create(ctx, "TokenTable", -1);
+    grn_tokenizer_set_init_func(ctx, tokenizer, table_init);
+    grn_tokenizer_set_next_func(ctx, tokenizer, table_next);
+    grn_tokenizer_set_fin_func(ctx, tokenizer, table_fin);
+  }
 
   return GRN_SUCCESS;
 }

  Modified: test/command/suite/schema/plugins.expected (+7 -3)
===================================================================
--- test/command/suite/schema/plugins.expected    2019-02-07 10:54:48 +0900 (c492ea356)
+++ test/command/suite/schema/plugins.expected    2019-02-07 14:48:48 +0900 (caad98793)
@@ -191,6 +191,10 @@ schema
         "id": 77,
         "name": "TokenRegexp"
       },
+      "TokenTable": {
+        "id": 80,
+        "name": "TokenTable"
+      },
       "TokenTrigram": {
         "id": 68,
         "name": "TokenTrigram"
@@ -202,15 +206,15 @@ schema
     },
     "normalizers": {
       "NormalizerAuto": {
-        "id": 80,
+        "id": 81,
         "name": "NormalizerAuto"
       },
       "NormalizerNFKC100": {
-        "id": 82,
+        "id": 83,
         "name": "NormalizerNFKC100"
       },
       "NormalizerNFKC51": {
-        "id": 81,
+        "id": 82,
         "name": "NormalizerNFKC51"
       }
     },

  Modified: test/command/suite/schema/tables/columns/compress/lz4.expected (+7 -3)
===================================================================
--- test/command/suite/schema/tables/columns/compress/lz4.expected    2019-02-07 10:54:48 +0900 (486b12a9a)
+++ test/command/suite/schema/tables/columns/compress/lz4.expected    2019-02-07 14:48:48 +0900 (e24b459cf)
@@ -190,6 +190,10 @@ schema
         "id": 77,
         "name": "TokenRegexp"
       },
+      "TokenTable": {
+        "id": 80,
+        "name": "TokenTable"
+      },
       "TokenTrigram": {
         "id": 68,
         "name": "TokenTrigram"
@@ -201,15 +205,15 @@ schema
     },
     "normalizers": {
       "NormalizerAuto": {
-        "id": 80,
+        "id": 81,
         "name": "NormalizerAuto"
       },
       "NormalizerNFKC100": {
-        "id": 82,
+        "id": 83,
         "name": "NormalizerNFKC100"
       },
       "NormalizerNFKC51": {
-        "id": 81,
+        "id": 82,
         "name": "NormalizerNFKC51"
       }
     },

  Modified: test/command/suite/schema/tables/columns/compress/zlib.expected (+7 -3)
===================================================================
--- test/command/suite/schema/tables/columns/compress/zlib.expected    2019-02-07 10:54:48 +0900 (d2e422f81)
+++ test/command/suite/schema/tables/columns/compress/zlib.expected    2019-02-07 14:48:48 +0900 (7d5d1bae9)
@@ -190,6 +190,10 @@ schema
         "id": 77,
         "name": "TokenRegexp"
       },
+      "TokenTable": {
+        "id": 80,
+        "name": "TokenTable"
+      },
       "TokenTrigram": {
         "id": 68,
         "name": "TokenTrigram"
@@ -201,15 +205,15 @@ schema
     },
"normalizers": { "NormalizerAuto": { - "id": 80, + "id": 81, "name": "NormalizerAuto" }, "NormalizerNFKC100": { - "id": 82, + "id": 83, "name": "NormalizerNFKC100" }, "NormalizerNFKC51": { - "id": 81, + "id": 82, "name": "NormalizerNFKC51" } }, Modified: test/command/suite/schema/tables/columns/compress/zstd.expected (+7 -3) =================================================================== --- test/command/suite/schema/tables/columns/compress/zstd.expected 2019-02-07 10:54:48 +0900 (60134b304) +++ test/command/suite/schema/tables/columns/compress/zstd.expected 2019-02-07 14:48:48 +0900 (4ceef10ba) @@ -190,6 +190,10 @@ schema "id": 77, "name": "TokenRegexp" }, + "TokenTable": { + "id": 80, + "name": "TokenTable" + }, "TokenTrigram": { "id": 68, "name": "TokenTrigram" @@ -201,15 +205,15 @@ schema }, "normalizers": { "NormalizerAuto": { - "id": 80, + "id": 81, "name": "NormalizerAuto" }, "NormalizerNFKC100": { - "id": 82, + "id": 83, "name": "NormalizerNFKC100" }, "NormalizerNFKC51": { - "id": 81, + "id": 82, "name": "NormalizerNFKC51" } }, Modified: test/command/suite/schema/tables/columns/type/index_medium.expected (+8 -4) =================================================================== --- test/command/suite/schema/tables/columns/type/index_medium.expected 2019-02-07 10:54:48 +0900 (ffaa17a37) +++ test/command/suite/schema/tables/columns/type/index_medium.expected 2019-02-07 14:48:48 +0900 (3bc08528a) @@ -196,6 +196,10 @@ schema "id": 77, "name": "TokenRegexp" }, + "TokenTable": { + "id": 80, + "name": "TokenTable" + }, "TokenTrigram": { "id": 68, "name": "TokenTrigram" @@ -207,15 +211,15 @@ schema }, "normalizers": { "NormalizerAuto": { - "id": 80, + "id": 81, "name": "NormalizerAuto" }, "NormalizerNFKC100": { - "id": 82, + "id": 83, "name": "NormalizerNFKC100" }, "NormalizerNFKC51": { - "id": 81, + "id": 82, "name": "NormalizerNFKC51" } }, @@ -354,7 +358,7 @@ schema "options": null }, "normalizer": { - "id": 80, + "id": 81, "name": "NormalizerAuto", "options": null }, Modified: test/command/suite/schema/tables/columns/type/index_small.expected (+8 -4) =================================================================== --- test/command/suite/schema/tables/columns/type/index_small.expected 2019-02-07 10:54:48 +0900 (8864b4092) +++ test/command/suite/schema/tables/columns/type/index_small.expected 2019-02-07 14:48:48 +0900 (56432ad5a) @@ -196,6 +196,10 @@ schema "id": 77, "name": "TokenRegexp" }, + "TokenTable": { + "id": 80, + "name": "TokenTable" + }, "TokenTrigram": { "id": 68, "name": "TokenTrigram" @@ -207,15 +211,15 @@ schema }, "normalizers": { "NormalizerAuto": { - "id": 80, + "id": 81, "name": "NormalizerAuto" }, "NormalizerNFKC100": { - "id": 82, + "id": 83, "name": "NormalizerNFKC100" }, "NormalizerNFKC51": { - "id": 81, + "id": 82, "name": "NormalizerNFKC51" } }, @@ -354,7 +358,7 @@ schema "options": null }, "normalizer": { - "id": 80, + "id": 81, "name": "NormalizerAuto", "options": null }, Modified: test/command/suite/schema/tables/columns/type/scalar.expected (+7 -3) =================================================================== --- test/command/suite/schema/tables/columns/type/scalar.expected 2019-02-07 10:54:48 +0900 (2332d720a) +++ test/command/suite/schema/tables/columns/type/scalar.expected 2019-02-07 14:48:48 +0900 (13e3a1b88) @@ -190,6 +190,10 @@ schema "id": 77, "name": "TokenRegexp" }, + "TokenTable": { + "id": 80, + "name": "TokenTable" + }, "TokenTrigram": { "id": 68, "name": "TokenTrigram" @@ -201,15 +205,15 @@ schema }, "normalizers": { 
"NormalizerAuto": { - "id": 80, + "id": 81, "name": "NormalizerAuto" }, "NormalizerNFKC100": { - "id": 82, + "id": 83, "name": "NormalizerNFKC100" }, "NormalizerNFKC51": { - "id": 81, + "id": 82, "name": "NormalizerNFKC51" } }, Modified: test/command/suite/schema/tables/columns/type/vector.expected (+8 -4) =================================================================== --- test/command/suite/schema/tables/columns/type/vector.expected 2019-02-07 10:54:48 +0900 (998a29ac0) +++ test/command/suite/schema/tables/columns/type/vector.expected 2019-02-07 14:48:48 +0900 (e6d59ad6e) @@ -192,6 +192,10 @@ schema "id": 77, "name": "TokenRegexp" }, + "TokenTable": { + "id": 80, + "name": "TokenTable" + }, "TokenTrigram": { "id": 68, "name": "TokenTrigram" @@ -203,15 +207,15 @@ schema }, "normalizers": { "NormalizerAuto": { - "id": 80, + "id": 81, "name": "NormalizerAuto" }, "NormalizerNFKC100": { - "id": 82, + "id": 83, "name": "NormalizerNFKC100" }, "NormalizerNFKC51": { - "id": 81, + "id": 82, "name": "NormalizerNFKC51" } }, @@ -296,7 +300,7 @@ schema "value_type": null, "tokenizer": null, "normalizer": { - "id": 80, + "id": 81, "name": "NormalizerAuto", "options": null }, Modified: test/command/suite/schema/tables/normalizer.expected (+8 -4) =================================================================== --- test/command/suite/schema/tables/normalizer.expected 2019-02-07 10:54:48 +0900 (2e9ff0399) +++ test/command/suite/schema/tables/normalizer.expected 2019-02-07 14:48:48 +0900 (1a0dc0f6a) @@ -188,6 +188,10 @@ schema "id": 77, "name": "TokenRegexp" }, + "TokenTable": { + "id": 80, + "name": "TokenTable" + }, "TokenTrigram": { "id": 68, "name": "TokenTrigram" @@ -199,15 +203,15 @@ schema }, "normalizers": { "NormalizerAuto": { - "id": 80, + "id": 81, "name": "NormalizerAuto" }, "NormalizerNFKC100": { - "id": 82, + "id": 83, "name": "NormalizerNFKC100" }, "NormalizerNFKC51": { - "id": 81, + "id": 82, "name": "NormalizerNFKC51" } }, @@ -230,7 +234,7 @@ schema "value_type": null, "tokenizer": null, "normalizer": { - "id": 80, + "id": 81, "name": "NormalizerAuto", "options": null }, Modified: test/command/suite/schema/tables/normalizer_with_options.expected (+8 -4) =================================================================== --- test/command/suite/schema/tables/normalizer_with_options.expected 2019-02-07 10:54:48 +0900 (9245470bf) +++ test/command/suite/schema/tables/normalizer_with_options.expected 2019-02-07 14:48:48 +0900 (a2f7426f7) @@ -188,6 +188,10 @@ schema "id": 77, "name": "TokenRegexp" }, + "TokenTable": { + "id": 80, + "name": "TokenTable" + }, "TokenTrigram": { "id": 68, "name": "TokenTrigram" @@ -199,15 +203,15 @@ schema }, "normalizers": { "NormalizerAuto": { - "id": 80, + "id": 81, "name": "NormalizerAuto" }, "NormalizerNFKC100": { - "id": 82, + "id": 83, "name": "NormalizerNFKC100" }, "NormalizerNFKC51": { - "id": 81, + "id": 82, "name": "NormalizerNFKC51" } }, @@ -230,7 +234,7 @@ schema "value_type": null, "tokenizer": null, "normalizer": { - "id": 82, + "id": 83, "name": "NormalizerNFKC100", "options": [ "unify_kana", Modified: test/command/suite/schema/tables/token_filters.expected (+7 -3) =================================================================== --- test/command/suite/schema/tables/token_filters.expected 2019-02-07 10:54:48 +0900 (6ff6b581a) +++ test/command/suite/schema/tables/token_filters.expected 2019-02-07 14:48:48 +0900 (73115fd5a) @@ -193,6 +193,10 @@ schema "id": 77, "name": "TokenRegexp" }, + "TokenTable": { + "id": 80, + "name": "TokenTable" + }, 
"TokenTrigram": { "id": 68, "name": "TokenTrigram" @@ -204,15 +208,15 @@ schema }, "normalizers": { "NormalizerAuto": { - "id": 80, + "id": 81, "name": "NormalizerAuto" }, "NormalizerNFKC100": { - "id": 82, + "id": 83, "name": "NormalizerNFKC100" }, "NormalizerNFKC51": { - "id": 81, + "id": 82, "name": "NormalizerNFKC51" } }, Modified: test/command/suite/schema/tables/token_filters_with_options.expected (+7 -3) =================================================================== --- test/command/suite/schema/tables/token_filters_with_options.expected 2019-02-07 10:54:48 +0900 (73242af8c) +++ test/command/suite/schema/tables/token_filters_with_options.expected 2019-02-07 14:48:48 +0900 (35af69113) @@ -193,6 +193,10 @@ schema "id": 77, "name": "TokenRegexp" }, + "TokenTable": { + "id": 80, + "name": "TokenTable" + }, "TokenTrigram": { "id": 68, "name": "TokenTrigram" @@ -204,15 +208,15 @@ schema }, "normalizers": { "NormalizerAuto": { - "id": 80, + "id": 81, "name": "NormalizerAuto" }, "NormalizerNFKC100": { - "id": 82, + "id": 83, "name": "NormalizerNFKC100" }, "NormalizerNFKC51": { - "id": 81, + "id": 82, "name": "NormalizerNFKC51" } }, Modified: test/command/suite/schema/tables/tokenizer.expected (+7 -3) =================================================================== --- test/command/suite/schema/tables/tokenizer.expected 2019-02-07 10:54:48 +0900 (efc624dcf) +++ test/command/suite/schema/tables/tokenizer.expected 2019-02-07 14:48:48 +0900 (3b151e44c) @@ -188,6 +188,10 @@ schema "id": 77, "name": "TokenRegexp" }, + "TokenTable": { + "id": 80, + "name": "TokenTable" + }, "TokenTrigram": { "id": 68, "name": "TokenTrigram" @@ -199,15 +203,15 @@ schema }, "normalizers": { "NormalizerAuto": { - "id": 80, + "id": 81, "name": "NormalizerAuto" }, "NormalizerNFKC100": { - "id": 82, + "id": 83, "name": "NormalizerNFKC100" }, "NormalizerNFKC51": { - "id": 81, + "id": 82, "name": "NormalizerNFKC51" } }, Modified: test/command/suite/schema/tables/tokenizer_with_options.expected (+7 -3) =================================================================== --- test/command/suite/schema/tables/tokenizer_with_options.expected 2019-02-07 10:54:48 +0900 (f494cae1d) +++ test/command/suite/schema/tables/tokenizer_with_options.expected 2019-02-07 14:48:48 +0900 (607f4c54b) @@ -188,6 +188,10 @@ schema "id": 77, "name": "TokenRegexp" }, + "TokenTable": { + "id": 80, + "name": "TokenTable" + }, "TokenTrigram": { "id": 68, "name": "TokenTrigram" @@ -199,15 +203,15 @@ schema }, "normalizers": { "NormalizerAuto": { - "id": 80, + "id": 81, "name": "NormalizerAuto" }, "NormalizerNFKC100": { - "id": 82, + "id": 83, "name": "NormalizerNFKC100" }, "NormalizerNFKC51": { - "id": 81, + "id": 82, "name": "NormalizerNFKC51" } }, Modified: test/command/suite/schema/tables/type/array.expected (+7 -3) =================================================================== --- test/command/suite/schema/tables/type/array.expected 2019-02-07 10:54:48 +0900 (b985517a5) +++ test/command/suite/schema/tables/type/array.expected 2019-02-07 14:48:48 +0900 (d42dd8947) @@ -188,6 +188,10 @@ schema "id": 77, "name": "TokenRegexp" }, + "TokenTable": { + "id": 80, + "name": "TokenTable" + }, "TokenTrigram": { "id": 68, "name": "TokenTrigram" @@ -199,15 +203,15 @@ schema }, "normalizers": { "NormalizerAuto": { - "id": 80, + "id": 81, "name": "NormalizerAuto" }, "NormalizerNFKC100": { - "id": 82, + "id": 83, "name": "NormalizerNFKC100" }, "NormalizerNFKC51": { - "id": 81, + "id": 82, "name": "NormalizerNFKC51" } }, Modified: 
test/command/suite/schema/tables/type/hash_table.expected (+7 -3) =================================================================== --- test/command/suite/schema/tables/type/hash_table.expected 2019-02-07 10:54:48 +0900 (d88b06e5b) +++ test/command/suite/schema/tables/type/hash_table.expected 2019-02-07 14:48:48 +0900 (6e36c1bc2) @@ -188,6 +188,10 @@ schema "id": 77, "name": "TokenRegexp" }, + "TokenTable": { + "id": 80, + "name": "TokenTable" + }, "TokenTrigram": { "id": 68, "name": "TokenTrigram" @@ -199,15 +203,15 @@ schema }, "normalizers": { "NormalizerAuto": { - "id": 80, + "id": 81, "name": "NormalizerAuto" }, "NormalizerNFKC100": { - "id": 82, + "id": 83, "name": "NormalizerNFKC100" }, "NormalizerNFKC51": { - "id": 81, + "id": 82, "name": "NormalizerNFKC51" } }, Modified: test/command/suite/schema/tables/value_type/reference.expected (+7 -3) =================================================================== --- test/command/suite/schema/tables/value_type/reference.expected 2019-02-07 10:54:48 +0900 (4a06ff8e4) +++ test/command/suite/schema/tables/value_type/reference.expected 2019-02-07 14:48:48 +0900 (62893d048) @@ -190,6 +190,10 @@ schema "id": 77, "name": "TokenRegexp" }, + "TokenTable": { + "id": 80, + "name": "TokenTable" + }, "TokenTrigram": { "id": 68, "name": "TokenTrigram" @@ -201,15 +205,15 @@ schema }, "normalizers": { "NormalizerAuto": { - "id": 80, + "id": 81, "name": "NormalizerAuto" }, "NormalizerNFKC100": { - "id": 82, + "id": 83, "name": "NormalizerNFKC100" }, "NormalizerNFKC51": { - "id": 81, + "id": 82, "name": "NormalizerNFKC51" } }, Modified: test/command/suite/schema/tables/value_type/type.expected (+7 -3) =================================================================== --- test/command/suite/schema/tables/value_type/type.expected 2019-02-07 10:54:48 +0900 (cb7c1aba7) +++ test/command/suite/schema/tables/value_type/type.expected 2019-02-07 14:48:48 +0900 (eb4d4833f) @@ -188,6 +188,10 @@ schema "id": 77, "name": "TokenRegexp" }, + "TokenTable": { + "id": 80, + "name": "TokenTable" + }, "TokenTrigram": { "id": 68, "name": "TokenTrigram" @@ -199,15 +203,15 @@ schema }, "normalizers": { "NormalizerAuto": { - "id": 80, + "id": 81, "name": "NormalizerAuto" }, "NormalizerNFKC100": { - "id": 82, + "id": 83, "name": "NormalizerNFKC100" }, "NormalizerNFKC51": { - "id": 81, + "id": 82, "name": "NormalizerNFKC51" } }, Added: test/command/suite/select/query/match/with_index/token_table/same.expected (+20 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/query/match/with_index/token_table/same.expected 2019-02-07 14:48:48 +0900 (b46d2b629) @@ -0,0 +1,20 @@ +table_create Menus TABLE_NO_KEY +[[0,0.0,0.0],true] +column_create Menus name COLUMN_SCALAR Text +[[0,0.0,0.0],true] +table_create Keywords TABLE_PAT_KEY ShortText --normalize NormalizerNFKC100 --default_tokenizer 'TokenTable("table", "Keywords")' +[[0,0.0,0.0],true] +column_create Keywords index COLUMN_INDEX Menus name +[[0,0.0,0.0],true] +load --table Keywords +[ +{"_key": "焼肉"} +] +[[0,0.0,0.0],1] +load --table Menus +[ +{"name": "焼肉定食"} +] +[[0,0.0,0.0],1] +select Menus --match_columns name --query "焼肉弁当" +[[0,0.0,0.0],[[[1],[["_id","UInt32"],["name","Text"]],[1,"焼肉定食"]]]] Added: test/command/suite/select/query/match/with_index/token_table/same.test (+19 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/query/match/with_index/token_table/same.test 2019-02-07 14:48:48 
+0900 (ee722b31b) @@ -0,0 +1,19 @@ +table_create Menus TABLE_NO_KEY +column_create Menus name COLUMN_SCALAR Text + +table_create Keywords TABLE_PAT_KEY ShortText \ + --normalize NormalizerNFKC100 \ + --default_tokenizer 'TokenTable("table", "Keywords")' +column_create Keywords index COLUMN_INDEX Menus name + +load --table Keywords +[ +{"_key": "焼肉"} +] + +load --table Menus +[ +{"name": "焼肉定食"} +] + +select Menus --match_columns name --query "焼肉弁当" Modified: test/command/suite/tokenizer_list/default.expected (+3 -0) =================================================================== --- test/command/suite/tokenizer_list/default.expected 2019-02-07 10:54:48 +0900 (40424396d) +++ test/command/suite/tokenizer_list/default.expected 2019-02-07 14:48:48 +0900 (dc3adbb93) @@ -53,6 +53,9 @@ tokenizer_list }, { "name": "TokenPattern" + }, + { + "name": "TokenTable" } ] ] Added: test/command/suite/tokenizers/table/match.expected (+37 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/table/match.expected 2019-02-07 14:48:48 +0900 (a48159e4f) @@ -0,0 +1,37 @@ +table_create Keywords TABLE_PAT_KEY ShortText --normalizer NormalizerNFKC100 +[[0,0.0,0.0],true] +load --table Keywords +[ +{"_key": "100円"}, +{"_key": "りんご"}, +{"_key": "29円"} +] +[[0,0.0,0.0],3] +tokenize 'TokenTable("table", "Keywords")' "私は100円のりんごを29円で買いました。" +[ + [ + 0, + 0.0, + 0.0 + ], + [ + { + "value": "100円", + "position": 0, + "force_prefix": false, + "force_prefix_search": false + }, + { + "value": "りんご", + "position": 1, + "force_prefix": false, + "force_prefix_search": false + }, + { + "value": "29円", + "position": 2, + "force_prefix": false, + "force_prefix_search": false + } + ] +] Added: test/command/suite/tokenizers/table/match.test (+12 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/table/match.test 2019-02-07 14:48:48 +0900 (ef0ff48bd) @@ -0,0 +1,12 @@ +table_create Keywords TABLE_PAT_KEY ShortText --normalizer NormalizerNFKC100 + +load --table Keywords +[ +{"_key": "100円"}, +{"_key": "りんご"}, +{"_key": "29円"} +] + +tokenize \ + 'TokenTable("table", "Keywords")' \ + "私は100円のりんごを29円で買いました。" Added: test/command/suite/tokenizers/table/no_table.expected (+3 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/table/no_table.expected 2019-02-07 14:48:48 +0900 (8c15a3a0b) @@ -0,0 +1,3 @@ +tokenize 'TokenTable' "This is a pen." +[[[-22,0.0,0.0],"[tokenizer][table] table isn't specified"],[]] +#|e| [tokenizer][table] table isn't specified Added: test/command/suite/tokenizers/table/no_table.test (+3 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/table/no_table.test 2019-02-07 14:48:48 +0900 (47a8cf70c) @@ -0,0 +1,3 @@ +tokenize \ + 'TokenTable' \ + "This is a pen." Added: test/command/suite/tokenizers/table/nonexistent_table.expected (+3 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/table/nonexistent_table.expected 2019-02-07 14:48:48 +0900 (b5250a295) @@ -0,0 +1,3 @@ +tokenize 'TokenTable("table", "Nonexistent")' "This is a pen." 
+[[[-22,0.0,0.0],"[tokenizer][table] nonexistent table: <Nonexistent>"],[]]
+#|e| [tokenizer][table] nonexistent table: <Nonexistent>

  Added: test/command/suite/tokenizers/table/nonexistent_table.test (+3 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/table/nonexistent_table.test    2019-02-07 14:48:48 +0900 (012761bca)
@@ -0,0 +1,3 @@
+tokenize \
+  'TokenTable("table", "Nonexistent")' \
+  "This is a pen."
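
  Note: the no_table and nonexistent_table suites above exercise two of the
  three error paths in table_open_options(); the "table must be a patricia
  trie table" branch added in lib/tokenizers.c has no test in this commit.
  A minimal sketch of that case (the table name "Words" is hypothetical and
  the expected output is inferred from the code, not taken from the commit):

    # Hypothetical: a hash table should be rejected, because table_scan()
    # casts options->table to grn_pat * and calls grn_pat_scan() on it.
    table_create Words TABLE_HASH_KEY ShortText
    tokenize 'TokenTable("table", "Words")' "This is a pen."
    # Expected error per table_open_options():
    #   [tokenizer][table] table must be a patricia trie table: <Words>: <...>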
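
  Note: as the match.expected output above shows, table_next() emits only
  the substrings that grn_pat_scan() matches against the table's keys; the
  surrounding text ("私は", "の", "を", "で買いました。") produces no tokens.
  Because table_init() resolves the scan table from the tokenizer options
  rather than from the lexicon, the lexicon does not appear to need to be
  the key table itself, as it is in same.test. A sketch under that
  assumption (the "Products" and "Terms" names are hypothetical):

    # Hypothetical: keep the keyword dictionary (Keywords) separate from
    # the index lexicon (Terms); TokenTable scans Keywords while the
    # resulting tokens are stored in Terms.
    table_create Keywords TABLE_PAT_KEY ShortText --normalizer NormalizerNFKC100
    load --table Keywords
    [
    {"_key": "焼肉"}
    ]
    table_create Products TABLE_NO_KEY
    column_create Products name COLUMN_SCALAR Text
    table_create Terms TABLE_PAT_KEY ShortText \
      --normalizer NormalizerNFKC100 \
      --default_tokenizer 'TokenTable("table", "Keywords")'
    column_create Terms index COLUMN_INDEX Products name
    load --table Products
    [
    {"name": "焼肉定食"}
    ]
    select Products --match_columns name --query "焼肉弁当"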