[Groonga-commit] groonga/groonga at d2db34d [master] Extract tokenize related commands

naoa null+****@clear*****
Sun Apr 3 02:04:55 JST 2016


naoa	2016-04-03 02:04:55 +0900 (Sun, 03 Apr 2016)

  New Revision: d2db34de899a41aae7a1eac95eb74c2b7eae2106
  https://github.com/groonga/groonga/commit/d2db34de899a41aae7a1eac95eb74c2b7eae2106

  Merged 3687075: Merge pull request #518 from naoa/add-output-df-to-table-tokenize

  Message:
    Extract tokenize related commands

  Added files:
    lib/proc/proc_tokenize.c
  Modified files:
    lib/grn_proc.h
    lib/proc.c
    lib/proc/sources.am

  Modified: lib/grn_proc.h (+2 -0)
===================================================================
--- lib/grn_proc.h    2016-04-03 00:53:15 +0900 (f6abb09)
+++ lib/grn_proc.h    2016-04-03 02:04:55 +0900 (3e9af03)
@@ -63,6 +63,8 @@ void grn_proc_init_table_create(grn_ctx *ctx);
 void grn_proc_init_table_list(grn_ctx *ctx);
 void grn_proc_init_table_remove(grn_ctx *ctx);
 void grn_proc_init_table_rename(grn_ctx *ctx);
+void grn_proc_init_table_tokenize(grn_ctx *ctx);
+void grn_proc_init_tokenize(grn_ctx *ctx);
 
 grn_bool grn_proc_option_value_bool(grn_ctx *ctx,
                                     grn_obj *option,

  Modified: lib/proc.c (+2 -352)
===================================================================
--- lib/proc.c    2016-04-03 00:53:15 +0900 (1cccc3e)
+++ lib/proc.c    2016-04-03 02:04:55 +0900 (2b16518)
@@ -24,7 +24,6 @@
 #include "grn_output.h"
 #include "grn_pat.h"
 #include "grn_geo.h"
-#include "grn_token_cursor.h"
 #include "grn_expr.h"
 
 #include <string.h>
@@ -1660,20 +1659,6 @@ is_normalizer(grn_ctx *ctx, grn_obj *object)
   return GRN_TRUE;
 }
 
-static grn_bool
-is_tokenizer(grn_ctx *ctx, grn_obj *object)
-{
-  if (object->header.type != GRN_PROC) {
-    return GRN_FALSE;
-  }
-
-  if (grn_proc_get_type(ctx, object) != GRN_PROC_TOKENIZER) {
-    return GRN_FALSE;
-  }
-
-  return GRN_TRUE;
-}
-
 static const char *
 char_type_name(grn_char_type type)
 {
@@ -1817,330 +1802,6 @@ proc_normalize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data
   return NULL;
 }
 
-static unsigned int
-parse_tokenize_flags(grn_ctx *ctx, grn_obj *flag_names)
-{
-  unsigned int flags = 0;
-  const char *names, *names_end;
-  int length;
-
-  names = GRN_TEXT_VALUE(flag_names);
-  length = GRN_TEXT_LEN(flag_names);
-  names_end = names + length;
-  while (names < names_end) {
-    if (*names == '|' || *names == ' ') {
-      names += 1;
-      continue;
-    }
-
-#define CHECK_FLAG(name)\
-    if (((names_end - names) >= (sizeof(#name) - 1)) &&\
-        (!memcmp(names, #name, sizeof(#name) - 1))) {\
-      flags |= GRN_TOKEN_CURSOR_ ## name;\
-      names += sizeof(#name) - 1;\
-      continue;\
-    }
-
-    CHECK_FLAG(ENABLE_TOKENIZED_DELIMITER);
-
-#define GRN_TOKEN_CURSOR_NONE 0
-    CHECK_FLAG(NONE);
-#undef GRN_TOKEN_CURSOR_NONE
-
-    ERR(GRN_INVALID_ARGUMENT, "[tokenize] invalid flag: <%.*s>",
-        (int)(names_end - names), names);
-    return 0;
-#undef CHECK_FLAG
-  }
-
-  return flags;
-}
-
-typedef struct {
-  grn_id id;
-  int32_t position;
-  grn_bool force_prefix;
-} tokenize_token;
-
-static void
-output_tokens(grn_ctx *ctx, grn_obj *tokens, grn_obj *lexicon)
-{
-  int i, n_tokens;
-
-  n_tokens = GRN_BULK_VSIZE(tokens) / sizeof(tokenize_token);
-  GRN_OUTPUT_ARRAY_OPEN("TOKENS", n_tokens);
-  for (i = 0; i < n_tokens; i++) {
-    tokenize_token *token;
-    char value[GRN_TABLE_MAX_KEY_SIZE];
-    unsigned int value_size;
-
-    token = ((tokenize_token *)(GRN_BULK_HEAD(tokens))) + i;
-
-    GRN_OUTPUT_MAP_OPEN("TOKEN", 3);
-
-    GRN_OUTPUT_CSTR("value");
-    value_size = grn_table_get_key(ctx, lexicon, token->id,
-                                   value, GRN_TABLE_MAX_KEY_SIZE);
-    GRN_OUTPUT_STR(value, value_size);
-
-    GRN_OUTPUT_CSTR("position");
-    GRN_OUTPUT_INT32(token->position);
-
-    GRN_OUTPUT_CSTR("force_prefix");
-    GRN_OUTPUT_BOOL(token->force_prefix);
-
-    GRN_OUTPUT_MAP_CLOSE();
-  }
-  GRN_OUTPUT_ARRAY_CLOSE();
-}
-
-static grn_obj *
-create_lexicon_for_tokenize(grn_ctx *ctx,
-                            grn_obj *tokenizer_name,
-                            grn_obj *normalizer_name,
-                            grn_obj *token_filter_names)
-{
-  grn_obj *lexicon;
-  grn_obj *tokenizer;
-  grn_obj *normalizer = NULL;
-
-  tokenizer = grn_ctx_get(ctx,
-                          GRN_TEXT_VALUE(tokenizer_name),
-                          GRN_TEXT_LEN(tokenizer_name));
-  if (!tokenizer) {
-    ERR(GRN_INVALID_ARGUMENT,
-        "[tokenize] nonexistent tokenizer: <%.*s>",
-        (int)GRN_TEXT_LEN(tokenizer_name),
-        GRN_TEXT_VALUE(tokenizer_name));
-    return NULL;
-  }
-
-  if (!is_tokenizer(ctx, tokenizer)) {
-    grn_obj inspected;
-    GRN_TEXT_INIT(&inspected, 0);
-    grn_inspect(ctx, &inspected, tokenizer);
-    ERR(GRN_INVALID_ARGUMENT,
-        "[tokenize] not tokenizer: %.*s",
-        (int)GRN_TEXT_LEN(&inspected),
-        GRN_TEXT_VALUE(&inspected));
-    GRN_OBJ_FIN(ctx, &inspected);
-    grn_obj_unlink(ctx, tokenizer);
-    return NULL;
-  }
-
-  if (GRN_TEXT_LEN(normalizer_name) > 0) {
-    normalizer = grn_ctx_get(ctx,
-                             GRN_TEXT_VALUE(normalizer_name),
-                             GRN_TEXT_LEN(normalizer_name));
-    if (!normalizer) {
-      grn_obj_unlink(ctx, tokenizer);
-      ERR(GRN_INVALID_ARGUMENT,
-          "[tokenize] nonexistent normalizer: <%.*s>",
-          (int)GRN_TEXT_LEN(normalizer_name),
-          GRN_TEXT_VALUE(normalizer_name));
-      return NULL;
-    }
-
-    if (!is_normalizer(ctx, normalizer)) {
-      grn_obj inspected;
-      grn_obj_unlink(ctx, tokenizer);
-      GRN_TEXT_INIT(&inspected, 0);
-      grn_inspect(ctx, &inspected, normalizer);
-      ERR(GRN_INVALID_ARGUMENT,
-          "[tokenize] not normalizer: %.*s",
-          (int)GRN_TEXT_LEN(&inspected),
-          GRN_TEXT_VALUE(&inspected));
-      GRN_OBJ_FIN(ctx, &inspected);
-      grn_obj_unlink(ctx, normalizer);
-      return NULL;
-    }
-  }
-
-  lexicon = grn_table_create(ctx, NULL, 0,
-                             NULL,
-                             GRN_OBJ_TABLE_HASH_KEY,
-                             grn_ctx_at(ctx, GRN_DB_SHORT_TEXT),
-                             NULL);
-  grn_obj_set_info(ctx, lexicon,
-                   GRN_INFO_DEFAULT_TOKENIZER, tokenizer);
-  grn_obj_unlink(ctx, tokenizer);
-  if (normalizer) {
-    grn_obj_set_info(ctx, lexicon,
-                     GRN_INFO_NORMALIZER, normalizer);
-    grn_obj_unlink(ctx, normalizer);
-  }
-  grn_proc_table_set_token_filters(ctx, lexicon, token_filter_names);
-
-  return lexicon;
-}
-
-static void
-tokenize(grn_ctx *ctx, grn_obj *lexicon, grn_obj *string, grn_tokenize_mode mode,
-         unsigned int flags, grn_obj *tokens)
-{
-  grn_token_cursor *token_cursor;
-
-  token_cursor =
-    grn_token_cursor_open(ctx, lexicon,
-                          GRN_TEXT_VALUE(string), GRN_TEXT_LEN(string),
-                          mode, flags);
-  if (!token_cursor) {
-    return;
-  }
-
-  while (token_cursor->status == GRN_TOKEN_CURSOR_DOING) {
-    grn_id token_id = grn_token_cursor_next(ctx, token_cursor);
-    tokenize_token *current_token;
-    if (token_id == GRN_ID_NIL) {
-      continue;
-    }
-    grn_bulk_space(ctx, tokens, sizeof(tokenize_token));
-    current_token = ((tokenize_token *)(GRN_BULK_CURR(tokens))) - 1;
-    current_token->id = token_id;
-    current_token->position = token_cursor->pos;
-    current_token->force_prefix = token_cursor->force_prefix;
-  }
-  grn_token_cursor_close(ctx, token_cursor);
-}
-
-static grn_obj *
-proc_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
-{
-  grn_obj *tokenizer_name;
-  grn_obj *string;
-  grn_obj *normalizer_name;
-  grn_obj *flag_names;
-  grn_obj *mode_name;
-  grn_obj *token_filter_names;
-
-  tokenizer_name = VAR(0);
-  string = VAR(1);
-  normalizer_name = VAR(2);
-  flag_names = VAR(3);
-  mode_name = VAR(4);
-  token_filter_names = VAR(5);
-
-  if (GRN_TEXT_LEN(tokenizer_name) == 0) {
-    ERR(GRN_INVALID_ARGUMENT, "[tokenize] tokenizer name is missing");
-    return NULL;
-  }
-
-  if (GRN_TEXT_LEN(string) == 0) {
-    ERR(GRN_INVALID_ARGUMENT, "[tokenize] string is missing");
-    return NULL;
-  }
-
-  {
-    unsigned int flags;
-    grn_obj *lexicon;
-
-    flags = parse_tokenize_flags(ctx, flag_names);
-    if (ctx->rc != GRN_SUCCESS) {
-      return NULL;
-    }
-
-    lexicon = create_lexicon_for_tokenize(ctx,
-                                          tokenizer_name,
-                                          normalizer_name,
-                                          token_filter_names);
-    if (!lexicon) {
-      return NULL;
-    }
-
-#define MODE_NAME_EQUAL(name)\
-    (GRN_TEXT_LEN(mode_name) == strlen(name) &&\
-     memcmp(GRN_TEXT_VALUE(mode_name), name, strlen(name)) == 0)
-
-    {
-      grn_obj tokens;
-      GRN_VALUE_FIX_SIZE_INIT(&tokens, GRN_OBJ_VECTOR, GRN_ID_NIL);
-      if (GRN_TEXT_LEN(mode_name) == 0 || MODE_NAME_EQUAL("ADD")) {
-        tokenize(ctx, lexicon, string, GRN_TOKEN_ADD, flags, &tokens);
-        output_tokens(ctx, &tokens, lexicon);
-      } else if (MODE_NAME_EQUAL("GET")) {
-        tokenize(ctx, lexicon, string, GRN_TOKEN_ADD, flags, &tokens);
-        GRN_BULK_REWIND(&tokens);
-        tokenize(ctx, lexicon, string, GRN_TOKEN_GET, flags, &tokens);
-        output_tokens(ctx, &tokens, lexicon);
-      } else {
-        ERR(GRN_INVALID_ARGUMENT, "[tokenize] invalid mode: <%.*s>",
-            (int)GRN_TEXT_LEN(mode_name), GRN_TEXT_VALUE(mode_name));
-      }
-      GRN_OBJ_FIN(ctx, &tokens);
-    }
-#undef MODE_NAME_EQUAL
-
-    grn_obj_unlink(ctx, lexicon);
-  }
-
-  return NULL;
-}
-
-static grn_obj *
-proc_table_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
-{
-  grn_obj *table_name;
-  grn_obj *string;
-  grn_obj *flag_names;
-  grn_obj *mode_name;
-
-  table_name = VAR(0);
-  string = VAR(1);
-  flag_names = VAR(2);
-  mode_name = VAR(3);
-
-  if (GRN_TEXT_LEN(table_name) == 0) {
-    ERR(GRN_INVALID_ARGUMENT, "[table_tokenize] table name is missing");
-    return NULL;
-  }
-
-  if (GRN_TEXT_LEN(string) == 0) {
-    ERR(GRN_INVALID_ARGUMENT, "[table_tokenize] string is missing");
-    return NULL;
-  }
-
-  {
-    unsigned int flags;
-    grn_obj *lexicon;
-
-    flags = parse_tokenize_flags(ctx, flag_names);
-    if (ctx->rc != GRN_SUCCESS) {
-      return NULL;
-    }
-
-    lexicon = grn_ctx_get(ctx, GRN_TEXT_VALUE(table_name), GRN_TEXT_LEN(table_name));
-
-    if (!lexicon) {
-      return NULL;
-    }
-
-#define MODE_NAME_EQUAL(name)\
-    (GRN_TEXT_LEN(mode_name) == strlen(name) &&\
-     memcmp(GRN_TEXT_VALUE(mode_name), name, strlen(name)) == 0)
-
-    {
-      grn_obj tokens;
-      GRN_VALUE_FIX_SIZE_INIT(&tokens, GRN_OBJ_VECTOR, GRN_ID_NIL);
-    if (GRN_TEXT_LEN(mode_name) == 0 || MODE_NAME_EQUAL("GET")) {
-      tokenize(ctx, lexicon, string, GRN_TOKEN_GET, flags, &tokens);
-      output_tokens(ctx, &tokens, lexicon);
-    } else if (MODE_NAME_EQUAL("ADD")) {
-      tokenize(ctx, lexicon, string, GRN_TOKEN_ADD, flags, &tokens);
-      output_tokens(ctx, &tokens, lexicon);
-    } else {
-      ERR(GRN_INVALID_ARGUMENT, "[table_tokenize] invalid mode: <%.*s>",
-          (int)GRN_TEXT_LEN(mode_name), GRN_TEXT_VALUE(mode_name));
-    }
-      GRN_OBJ_FIN(ctx, &tokens);
-    }
-#undef MODE_NAME_EQUAL
-
-    grn_obj_unlink(ctx, lexicon);
-  }
-
-  return NULL;
-}
-
 static void
 list_proc(grn_ctx *ctx, grn_proc_type target_proc_type,
           const char *name, const char *plural_name)
@@ -4470,19 +4131,8 @@ grn_db_init_builtin_query(grn_ctx *ctx)
   DEF_VAR(vars[2], "flags");
   DEF_COMMAND("normalize", proc_normalize, 3, vars);
 
-  DEF_VAR(vars[0], "tokenizer");
-  DEF_VAR(vars[1], "string");
-  DEF_VAR(vars[2], "normalizer");
-  DEF_VAR(vars[3], "flags");
-  DEF_VAR(vars[4], "mode");
-  DEF_VAR(vars[5], "token_filters");
-  DEF_COMMAND("tokenize", proc_tokenize, 6, vars);
-
-  DEF_VAR(vars[0], "table");
-  DEF_VAR(vars[1], "string");
-  DEF_VAR(vars[2], "flags");
-  DEF_VAR(vars[3], "mode");
-  DEF_COMMAND("table_tokenize", proc_table_tokenize, 4, vars);
+  grn_proc_init_tokenize(ctx);
+  grn_proc_init_table_tokenize(ctx);
 
   DEF_COMMAND("tokenizer_list", proc_tokenizer_list, 0, vars);
 

  Added: lib/proc/proc_tokenize.c (+383 -0) 100644
===================================================================
--- /dev/null
+++ lib/proc/proc_tokenize.c    2016-04-03 02:04:55 +0900 (0fc98c1)
@@ -0,0 +1,383 @@
+/* -*- c-basic-offset: 2 -*- */
+/*
+  Copyright(C) 2009-2016 Brazil
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License version 2.1 as published by the Free Software Foundation.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+
+#include "../grn_proc.h"
+#include "../grn_ctx.h"
+#include "../grn_token_cursor.h"
+
+#include <groonga/plugin.h>
+
+static unsigned int
+parse_tokenize_flags(grn_ctx *ctx, grn_obj *flag_names)
+{
+  unsigned int flags = 0;
+  const char *names, *names_end;
+  int length;
+
+  names = GRN_TEXT_VALUE(flag_names);
+  length = GRN_TEXT_LEN(flag_names);
+  names_end = names + length;
+  while (names < names_end) {
+    if (*names == '|' || *names == ' ') {
+      names += 1;
+      continue;
+    }
+
+#define CHECK_FLAG(name)\
+    if (((names_end - names) >= (sizeof(#name) - 1)) &&\
+        (!memcmp(names, #name, sizeof(#name) - 1))) {\
+      flags |= GRN_TOKEN_CURSOR_ ## name;\
+      names += sizeof(#name) - 1;\
+      continue;\
+    }
+
+    CHECK_FLAG(ENABLE_TOKENIZED_DELIMITER);
+
+#define GRN_TOKEN_CURSOR_NONE 0
+    CHECK_FLAG(NONE);
+#undef GRN_TOKEN_CURSOR_NONE
+
+    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
+                     "[tokenize] invalid flag: <%.*s>",
+                     (int)(names_end - names), names);
+    return 0;
+#undef CHECK_FLAG
+  }
+
+  return flags;
+}
+
+typedef struct {
+  grn_id id;
+  int32_t position;
+  grn_bool force_prefix;
+} tokenize_token;
+
+static void
+output_tokens(grn_ctx *ctx, grn_obj *tokens, grn_obj *lexicon)
+{
+  int i, n_tokens;
+
+  n_tokens = GRN_BULK_VSIZE(tokens) / sizeof(tokenize_token);
+  grn_ctx_output_array_open(ctx, "TOKENS", n_tokens);
+  for (i = 0; i < n_tokens; i++) {
+    tokenize_token *token;
+    char value[GRN_TABLE_MAX_KEY_SIZE];
+    unsigned int value_size;
+
+    token = ((tokenize_token *)(GRN_BULK_HEAD(tokens))) + i;
+
+    grn_ctx_output_map_open(ctx, "TOKEN", 3);
+
+    grn_ctx_output_cstr(ctx, "value");
+    value_size = grn_table_get_key(ctx, lexicon, token->id,
+                                   value, GRN_TABLE_MAX_KEY_SIZE);
+    grn_ctx_output_str(ctx, value, value_size);
+
+    grn_ctx_output_cstr(ctx, "position");
+    grn_ctx_output_int32(ctx, token->position);
+
+    grn_ctx_output_cstr(ctx, "force_prefix");
+    grn_ctx_output_bool(ctx, token->force_prefix);
+
+    grn_ctx_output_map_close(ctx);
+  }
+  grn_ctx_output_array_close(ctx);
+}
+
+static grn_obj *
+create_lexicon_for_tokenize(grn_ctx *ctx,
+                            grn_obj *tokenizer_name,
+                            grn_obj *normalizer_name,
+                            grn_obj *token_filter_names)
+{
+  grn_obj *lexicon;
+  grn_obj *tokenizer;
+  grn_obj *normalizer = NULL;
+
+  tokenizer = grn_ctx_get(ctx,
+                          GRN_TEXT_VALUE(tokenizer_name),
+                          GRN_TEXT_LEN(tokenizer_name));
+  if (!tokenizer) {
+    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
+                     "[tokenize] nonexistent tokenizer: <%.*s>",
+                     (int)GRN_TEXT_LEN(tokenizer_name),
+                     GRN_TEXT_VALUE(tokenizer_name));
+    return NULL;
+  }
+
+  if (!grn_obj_is_tokenizer_proc(ctx, tokenizer)) {
+    grn_obj inspected;
+    GRN_TEXT_INIT(&inspected, 0);
+    grn_inspect(ctx, &inspected, tokenizer);
+    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
+                     "[tokenize] not tokenizer: %.*s",
+                     (int)GRN_TEXT_LEN(&inspected),
+                     GRN_TEXT_VALUE(&inspected));
+    GRN_OBJ_FIN(ctx, &inspected);
+    grn_obj_unlink(ctx, tokenizer);
+    return NULL;
+  }
+
+  if (GRN_TEXT_LEN(normalizer_name) > 0) {
+    normalizer = grn_ctx_get(ctx,
+                             GRN_TEXT_VALUE(normalizer_name),
+                             GRN_TEXT_LEN(normalizer_name));
+    if (!normalizer) {
+      grn_obj_unlink(ctx, tokenizer);
+      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
+                       "[tokenize] nonexistent normalizer: <%.*s>",
+                       (int)GRN_TEXT_LEN(normalizer_name),
+                       GRN_TEXT_VALUE(normalizer_name));
+      return NULL;
+    }
+
+    if (!grn_obj_is_normalizer_proc(ctx, normalizer)) {
+      grn_obj inspected;
+      grn_obj_unlink(ctx, tokenizer);
+      GRN_TEXT_INIT(&inspected, 0);
+      grn_inspect(ctx, &inspected, normalizer);
+      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
+                       "[tokenize] not normalizer: %.*s",
+                       (int)GRN_TEXT_LEN(&inspected),
+                       GRN_TEXT_VALUE(&inspected));
+      GRN_OBJ_FIN(ctx, &inspected);
+      grn_obj_unlink(ctx, normalizer);
+      return NULL;
+    }
+  }
+
+  lexicon = grn_table_create(ctx, NULL, 0,
+                             NULL,
+                             GRN_OBJ_TABLE_HASH_KEY,
+                             grn_ctx_at(ctx, GRN_DB_SHORT_TEXT),
+                             NULL);
+  grn_obj_set_info(ctx, lexicon,
+                   GRN_INFO_DEFAULT_TOKENIZER, tokenizer);
+  grn_obj_unlink(ctx, tokenizer);
+  if (normalizer) {
+    grn_obj_set_info(ctx, lexicon,
+                     GRN_INFO_NORMALIZER, normalizer);
+    grn_obj_unlink(ctx, normalizer);
+  }
+  grn_proc_table_set_token_filters(ctx, lexicon, token_filter_names);
+
+  return lexicon;
+}
+
+static void
+tokenize(grn_ctx *ctx, grn_obj *lexicon, grn_obj *string, grn_tokenize_mode mode,
+         unsigned int flags, grn_obj *tokens)
+{
+  grn_token_cursor *token_cursor;
+
+  token_cursor =
+    grn_token_cursor_open(ctx, lexicon,
+                          GRN_TEXT_VALUE(string), GRN_TEXT_LEN(string),
+                          mode, flags);
+  if (!token_cursor) {
+    return;
+  }
+
+  while (token_cursor->status == GRN_TOKEN_CURSOR_DOING) {
+    grn_id token_id = grn_token_cursor_next(ctx, token_cursor);
+    tokenize_token *current_token;
+    if (token_id == GRN_ID_NIL) {
+      continue;
+    }
+    grn_bulk_space(ctx, tokens, sizeof(tokenize_token));
+    current_token = ((tokenize_token *)(GRN_BULK_CURR(tokens))) - 1;
+    current_token->id = token_id;
+    current_token->position = token_cursor->pos;
+    current_token->force_prefix = token_cursor->force_prefix;
+  }
+  grn_token_cursor_close(ctx, token_cursor);
+}
+
+static grn_obj *
+command_table_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
+{
+  grn_obj *table_name;
+  grn_obj *string;
+  grn_obj *flag_names;
+  grn_obj *mode_name;
+
+  table_name = grn_plugin_proc_get_var(ctx, user_data, "table", -1);
+  string = grn_plugin_proc_get_var(ctx, user_data, "string", -1);
+  flag_names = grn_plugin_proc_get_var(ctx, user_data, "flags", -1);
+  mode_name = grn_plugin_proc_get_var(ctx, user_data, "mode", -1);
+
+  if (GRN_TEXT_LEN(table_name) == 0) {
+    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "[table_tokenize] table name is missing");
+    return NULL;
+  }
+
+  if (GRN_TEXT_LEN(string) == 0) {
+    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "[table_tokenize] string is missing");
+    return NULL;
+  }
+
+  {
+    unsigned int flags;
+    grn_obj *lexicon;
+
+    flags = parse_tokenize_flags(ctx, flag_names);
+    if (ctx->rc != GRN_SUCCESS) {
+      return NULL;
+    }
+
+    lexicon = grn_ctx_get(ctx, GRN_TEXT_VALUE(table_name), GRN_TEXT_LEN(table_name));
+
+    if (!lexicon) {
+      return NULL;
+    }
+
+#define MODE_NAME_EQUAL(name)\
+    (GRN_TEXT_LEN(mode_name) == strlen(name) &&\
+     memcmp(GRN_TEXT_VALUE(mode_name), name, strlen(name)) == 0)
+
+    {
+      grn_obj tokens;
+      GRN_VALUE_FIX_SIZE_INIT(&tokens, GRN_OBJ_VECTOR, GRN_ID_NIL);
+    if (GRN_TEXT_LEN(mode_name) == 0 || MODE_NAME_EQUAL("GET")) {
+      tokenize(ctx, lexicon, string, GRN_TOKEN_GET, flags, &tokens);
+      output_tokens(ctx, &tokens, lexicon);
+    } else if (MODE_NAME_EQUAL("ADD")) {
+      tokenize(ctx, lexicon, string, GRN_TOKEN_ADD, flags, &tokens);
+      output_tokens(ctx, &tokens, lexicon);
+    } else {
+      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
+                       "[table_tokenize] invalid mode: <%.*s>",
+                       (int)GRN_TEXT_LEN(mode_name), GRN_TEXT_VALUE(mode_name));
+    }
+      GRN_OBJ_FIN(ctx, &tokens);
+    }
+#undef MODE_NAME_EQUAL
+
+    grn_obj_unlink(ctx, lexicon);
+  }
+
+  return NULL;
+}
+
+void
+grn_proc_init_table_tokenize(grn_ctx *ctx)
+{
+  grn_expr_var vars[4];
+
+  grn_plugin_expr_var_init(ctx, &(vars[0]), "table", -1);
+  grn_plugin_expr_var_init(ctx, &(vars[1]), "string", -1);
+  grn_plugin_expr_var_init(ctx, &(vars[2]), "flags", -1);
+  grn_plugin_expr_var_init(ctx, &(vars[3]), "mode", -1);
+  grn_plugin_command_create(ctx,
+                            "table_tokenize", -1,
+                            command_table_tokenize,
+                            4,
+                            vars);
+}
+
+static grn_obj *
+command_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
+{
+  grn_obj *tokenizer_name;
+  grn_obj *string;
+  grn_obj *normalizer_name;
+  grn_obj *flag_names;
+  grn_obj *mode_name;
+  grn_obj *token_filter_names;
+
+  tokenizer_name = grn_plugin_proc_get_var(ctx, user_data, "tokenizer", -1);
+  string = grn_plugin_proc_get_var(ctx, user_data, "string", -1);
+  normalizer_name = grn_plugin_proc_get_var(ctx, user_data, "normalizer", -1);
+  flag_names = grn_plugin_proc_get_var(ctx, user_data, "flags", -1);
+  mode_name = grn_plugin_proc_get_var(ctx, user_data, "mode", -1);
+  token_filter_names = grn_plugin_proc_get_var(ctx, user_data, "token_filters", -1);
+
+  if (GRN_TEXT_LEN(tokenizer_name) == 0) {
+    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "[tokenize] tokenizer name is missing");
+    return NULL;
+  }
+
+  if (GRN_TEXT_LEN(string) == 0) {
+    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "[tokenize] string is missing");
+    return NULL;
+  }
+
+  {
+    unsigned int flags;
+    grn_obj *lexicon;
+
+    flags = parse_tokenize_flags(ctx, flag_names);
+    if (ctx->rc != GRN_SUCCESS) {
+      return NULL;
+    }
+
+    lexicon = create_lexicon_for_tokenize(ctx,
+                                          tokenizer_name,
+                                          normalizer_name,
+                                          token_filter_names);
+    if (!lexicon) {
+      return NULL;
+    }
+#define MODE_NAME_EQUAL(name)\
+    (GRN_TEXT_LEN(mode_name) == strlen(name) &&\
+     memcmp(GRN_TEXT_VALUE(mode_name), name, strlen(name)) == 0)
+
+    {
+      grn_obj tokens;
+      GRN_VALUE_FIX_SIZE_INIT(&tokens, GRN_OBJ_VECTOR, GRN_ID_NIL);
+      if (GRN_TEXT_LEN(mode_name) == 0 || MODE_NAME_EQUAL("ADD")) {
+        tokenize(ctx, lexicon, string, GRN_TOKEN_ADD, flags, &tokens);
+        output_tokens(ctx, &tokens, lexicon);
+      } else if (MODE_NAME_EQUAL("GET")) {
+        tokenize(ctx, lexicon, string, GRN_TOKEN_ADD, flags, &tokens);
+        GRN_BULK_REWIND(&tokens);
+        tokenize(ctx, lexicon, string, GRN_TOKEN_GET, flags, &tokens);
+        output_tokens(ctx, &tokens, lexicon);
+      } else {
+        GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
+                         "[tokenize] invalid mode: <%.*s>",
+                         (int)GRN_TEXT_LEN(mode_name), GRN_TEXT_VALUE(mode_name));
+      }
+      GRN_OBJ_FIN(ctx, &tokens);
+    }
+#undef MODE_NAME_EQUAL
+
+    grn_obj_unlink(ctx, lexicon);
+  }
+
+  return NULL;
+}
+
+void
+grn_proc_init_tokenize(grn_ctx *ctx)
+{
+  grn_expr_var vars[6];
+
+  grn_plugin_expr_var_init(ctx, &(vars[0]), "tokenizer", -1);
+  grn_plugin_expr_var_init(ctx, &(vars[1]), "string", -1);
+  grn_plugin_expr_var_init(ctx, &(vars[2]), "normalizer", -1);
+  grn_plugin_expr_var_init(ctx, &(vars[3]), "flags", -1);
+  grn_plugin_expr_var_init(ctx, &(vars[4]), "mode", -1);
+  grn_plugin_expr_var_init(ctx, &(vars[5]), "token_filters", -1);
+  grn_plugin_command_create(ctx,
+                            "tokenize", -1,
+                            command_tokenize,
+                            6,
+                            vars);
+}
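
  parse_tokenize_flags() above accepts the flag name "NONE" through a
  small preprocessor trick: the token cursor header apparently defines
  no GRN_TOKEN_CURSOR_NONE constant, so the function defines it as 0
  just long enough for CHECK_FLAG's token pasting
  (GRN_TOKEN_CURSOR_ ## name) to compile. Roughly what CHECK_FLAG(NONE)
  expands to (an illustrative expansion, not commit code):

    if (((names_end - names) >= (sizeof("NONE") - 1)) &&
        (!memcmp(names, "NONE", sizeof("NONE") - 1))) {
      flags |= 0;                    /* "NONE" parses as a no-op */
      names += sizeof("NONE") - 1;
      continue;
    }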

  Modified: lib/proc/sources.am (+2 -1)
===================================================================
--- lib/proc/sources.am    2016-04-03 00:53:15 +0900 (0c0d174)
+++ lib/proc/sources.am    2016-04-03 02:04:55 +0900 (f0f88b1)
@@ -9,4 +9,5 @@ libgrnproc_la_SOURCES =				\
 	proc_select.c				\
 	proc_snippet.c				\
 	proc_highlight.c			\
-	proc_table.c
+	proc_table.c				\
+	proc_tokenize.c
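
  For context, a minimal sketch (not part of the commit) of exercising
  the extracted command through Groonga's public C API. It assumes
  grn_init() has been called, ctx has a database attached, and the
  built-in TokenBigram tokenizer and NormalizerAuto normalizer are
  available; the per-token keys ("value", "position", "force_prefix")
  match output_tokens() above.

    #include <groonga.h>
    #include <stdio.h>
    #include <string.h>

    /* Sketch: send the extracted "tokenize" command over an open
       grn_ctx and print the raw response. */
    static void
    run_tokenize_example(grn_ctx *ctx)
    {
      const char *command =
        "tokenize TokenBigram \"Fulltext Search\" NormalizerAuto";
      char *result;
      unsigned int result_size;
      int recv_flags;

      grn_ctx_send(ctx, command, (unsigned int)strlen(command), 0);
      grn_ctx_recv(ctx, &result, &result_size, &recv_flags);
      /* Each element of the response is a map with "value",
         "position" and "force_prefix", as produced by
         output_tokens() above. */
      printf("%.*s\n", (int)result_size, result);
    }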