naoa
null+****@clear*****
Sun Apr 3 02:04:55 JST 2016
naoa 2016-04-03 02:04:55 +0900 (Sun, 03 Apr 2016)

  New Revision: d2db34de899a41aae7a1eac95eb74c2b7eae2106
  https://github.com/groonga/groonga/commit/d2db34de899a41aae7a1eac95eb74c2b7eae2106

  Merged 3687075: Merge pull request #518 from naoa/add-output-df-to-table-tokenize

  Message:
    Extract tokenize related commands

  Added files:
    lib/proc/proc_tokenize.c
  Modified files:
    lib/grn_proc.h
    lib/proc.c
    lib/proc/sources.am

  Modified: lib/grn_proc.h (+2 -0)
===================================================================
--- lib/grn_proc.h    2016-04-03 00:53:15 +0900 (f6abb09)
+++ lib/grn_proc.h    2016-04-03 02:04:55 +0900 (3e9af03)
@@ -63,6 +63,8 @@ void grn_proc_init_table_create(grn_ctx *ctx);
 void grn_proc_init_table_list(grn_ctx *ctx);
 void grn_proc_init_table_remove(grn_ctx *ctx);
 void grn_proc_init_table_rename(grn_ctx *ctx);
+void grn_proc_init_table_tokenize(grn_ctx *ctx);
+void grn_proc_init_tokenize(grn_ctx *ctx);
 
 grn_bool grn_proc_option_value_bool(grn_ctx *ctx,
                                     grn_obj *option,

  Modified: lib/proc.c (+2 -352)
===================================================================
--- lib/proc.c    2016-04-03 00:53:15 +0900 (1cccc3e)
+++ lib/proc.c    2016-04-03 02:04:55 +0900 (2b16518)
@@ -24,7 +24,6 @@
 #include "grn_output.h"
 #include "grn_pat.h"
 #include "grn_geo.h"
-#include "grn_token_cursor.h"
 #include "grn_expr.h"
 
 #include <string.h>
@@ -1660,20 +1659,6 @@ is_normalizer(grn_ctx *ctx, grn_obj *object)
   return GRN_TRUE;
 }
 
-static grn_bool
-is_tokenizer(grn_ctx *ctx, grn_obj *object)
-{
-  if (object->header.type != GRN_PROC) {
-    return GRN_FALSE;
-  }
-
-  if (grn_proc_get_type(ctx, object) != GRN_PROC_TOKENIZER) {
-    return GRN_FALSE;
-  }
-
-  return GRN_TRUE;
-}
-
 static const char *
 char_type_name(grn_char_type type)
 {
@@ -1817,330 +1802,6 @@ proc_normalize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data
   return NULL;
 }
 
-static unsigned int
-parse_tokenize_flags(grn_ctx *ctx, grn_obj *flag_names)
-{
-  unsigned int flags = 0;
-  const char *names, *names_end;
-  int length;
-
-  names = GRN_TEXT_VALUE(flag_names);
-  length = GRN_TEXT_LEN(flag_names);
-  names_end = names + length;
-  while (names < names_end) {
-    if (*names == '|' || *names == ' ') {
-      names += 1;
-      continue;
-    }
-
-#define CHECK_FLAG(name)\
-    if (((names_end - names) >= (sizeof(#name) - 1)) &&\
-        (!memcmp(names, #name, sizeof(#name) - 1))) {\
-      flags |= GRN_TOKEN_CURSOR_ ## name;\
-      names += sizeof(#name) - 1;\
-      continue;\
-    }
-
-    CHECK_FLAG(ENABLE_TOKENIZED_DELIMITER);
-
-#define GRN_TOKEN_CURSOR_NONE 0
-    CHECK_FLAG(NONE);
-#undef GRN_TOKEN_CURSOR_NONE
-
-    ERR(GRN_INVALID_ARGUMENT, "[tokenize] invalid flag: <%.*s>",
-        (int)(names_end - names), names);
-    return 0;
-#undef CHECK_FLAG
-  }
-
-  return flags;
-}
-
-typedef struct {
-  grn_id id;
-  int32_t position;
-  grn_bool force_prefix;
-} tokenize_token;
-
-static void
-output_tokens(grn_ctx *ctx, grn_obj *tokens, grn_obj *lexicon)
-{
-  int i, n_tokens;
-
-  n_tokens = GRN_BULK_VSIZE(tokens) / sizeof(tokenize_token);
-  GRN_OUTPUT_ARRAY_OPEN("TOKENS", n_tokens);
-  for (i = 0; i < n_tokens; i++) {
-    tokenize_token *token;
-    char value[GRN_TABLE_MAX_KEY_SIZE];
-    unsigned int value_size;
-
-    token = ((tokenize_token *)(GRN_BULK_HEAD(tokens))) + i;
-
-    GRN_OUTPUT_MAP_OPEN("TOKEN", 3);
-
-    GRN_OUTPUT_CSTR("value");
-    value_size = grn_table_get_key(ctx, lexicon, token->id,
-                                   value, GRN_TABLE_MAX_KEY_SIZE);
-    GRN_OUTPUT_STR(value, value_size);
-
-    GRN_OUTPUT_CSTR("position");
-    GRN_OUTPUT_INT32(token->position);
-
-    GRN_OUTPUT_CSTR("force_prefix");
-    GRN_OUTPUT_BOOL(token->force_prefix);
-
-    GRN_OUTPUT_MAP_CLOSE();
-  }
-  GRN_OUTPUT_ARRAY_CLOSE();
-}
-
-static grn_obj *
-create_lexicon_for_tokenize(grn_ctx *ctx,
-                            grn_obj *tokenizer_name,
-                            grn_obj *normalizer_name,
-                            grn_obj *token_filter_names)
-{
-  grn_obj *lexicon;
-  grn_obj *tokenizer;
-  grn_obj *normalizer = NULL;
-
-  tokenizer = grn_ctx_get(ctx,
-                          GRN_TEXT_VALUE(tokenizer_name),
-                          GRN_TEXT_LEN(tokenizer_name));
-  if (!tokenizer) {
-    ERR(GRN_INVALID_ARGUMENT,
-        "[tokenize] nonexistent tokenizer: <%.*s>",
-        (int)GRN_TEXT_LEN(tokenizer_name),
-        GRN_TEXT_VALUE(tokenizer_name));
-    return NULL;
-  }
-
-  if (!is_tokenizer(ctx, tokenizer)) {
-    grn_obj inspected;
-    GRN_TEXT_INIT(&inspected, 0);
-    grn_inspect(ctx, &inspected, tokenizer);
-    ERR(GRN_INVALID_ARGUMENT,
-        "[tokenize] not tokenizer: %.*s",
-        (int)GRN_TEXT_LEN(&inspected),
-        GRN_TEXT_VALUE(&inspected));
-    GRN_OBJ_FIN(ctx, &inspected);
-    grn_obj_unlink(ctx, tokenizer);
-    return NULL;
-  }
-
-  if (GRN_TEXT_LEN(normalizer_name) > 0) {
-    normalizer = grn_ctx_get(ctx,
-                             GRN_TEXT_VALUE(normalizer_name),
-                             GRN_TEXT_LEN(normalizer_name));
-    if (!normalizer) {
-      grn_obj_unlink(ctx, tokenizer);
-      ERR(GRN_INVALID_ARGUMENT,
-          "[tokenize] nonexistent normalizer: <%.*s>",
-          (int)GRN_TEXT_LEN(normalizer_name),
-          GRN_TEXT_VALUE(normalizer_name));
-      return NULL;
-    }
-
-    if (!is_normalizer(ctx, normalizer)) {
-      grn_obj inspected;
-      grn_obj_unlink(ctx, tokenizer);
-      GRN_TEXT_INIT(&inspected, 0);
-      grn_inspect(ctx, &inspected, normalizer);
-      ERR(GRN_INVALID_ARGUMENT,
-          "[tokenize] not normalizer: %.*s",
-          (int)GRN_TEXT_LEN(&inspected),
-          GRN_TEXT_VALUE(&inspected));
-      GRN_OBJ_FIN(ctx, &inspected);
-      grn_obj_unlink(ctx, normalizer);
-      return NULL;
-    }
-  }
-
-  lexicon = grn_table_create(ctx, NULL, 0,
-                             NULL,
-                             GRN_OBJ_TABLE_HASH_KEY,
-                             grn_ctx_at(ctx, GRN_DB_SHORT_TEXT),
-                             NULL);
-  grn_obj_set_info(ctx, lexicon,
-                   GRN_INFO_DEFAULT_TOKENIZER, tokenizer);
-  grn_obj_unlink(ctx, tokenizer);
-  if (normalizer) {
-    grn_obj_set_info(ctx, lexicon,
-                     GRN_INFO_NORMALIZER, normalizer);
-    grn_obj_unlink(ctx, normalizer);
-  }
-  grn_proc_table_set_token_filters(ctx, lexicon, token_filter_names);
-
-  return lexicon;
-}
-
-static void
-tokenize(grn_ctx *ctx, grn_obj *lexicon, grn_obj *string, grn_tokenize_mode mode,
-         unsigned int flags, grn_obj *tokens)
-{
-  grn_token_cursor *token_cursor;
-
-  token_cursor =
-    grn_token_cursor_open(ctx, lexicon,
-                          GRN_TEXT_VALUE(string), GRN_TEXT_LEN(string),
-                          mode, flags);
-  if (!token_cursor) {
-    return;
-  }
-
-  while (token_cursor->status == GRN_TOKEN_CURSOR_DOING) {
-    grn_id token_id = grn_token_cursor_next(ctx, token_cursor);
-    tokenize_token *current_token;
-    if (token_id == GRN_ID_NIL) {
-      continue;
-    }
-    grn_bulk_space(ctx, tokens, sizeof(tokenize_token));
-    current_token = ((tokenize_token *)(GRN_BULK_CURR(tokens))) - 1;
-    current_token->id = token_id;
-    current_token->position = token_cursor->pos;
-    current_token->force_prefix = token_cursor->force_prefix;
-  }
-  grn_token_cursor_close(ctx, token_cursor);
-}
-
-static grn_obj *
-proc_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
-{
-  grn_obj *tokenizer_name;
-  grn_obj *string;
-  grn_obj *normalizer_name;
-  grn_obj *flag_names;
-  grn_obj *mode_name;
-  grn_obj *token_filter_names;
-
-  tokenizer_name = VAR(0);
-  string = VAR(1);
-  normalizer_name = VAR(2);
-  flag_names = VAR(3);
-  mode_name = VAR(4);
-  token_filter_names = VAR(5);
-
-  if (GRN_TEXT_LEN(tokenizer_name) == 0) {
-    ERR(GRN_INVALID_ARGUMENT, "[tokenize] tokenizer name is missing");
return NULL; - } - - if (GRN_TEXT_LEN(string) == 0) { - ERR(GRN_INVALID_ARGUMENT, "[tokenize] string is missing"); - return NULL; - } - - { - unsigned int flags; - grn_obj *lexicon; - - flags = parse_tokenize_flags(ctx, flag_names); - if (ctx->rc != GRN_SUCCESS) { - return NULL; - } - - lexicon = create_lexicon_for_tokenize(ctx, - tokenizer_name, - normalizer_name, - token_filter_names); - if (!lexicon) { - return NULL; - } - -#define MODE_NAME_EQUAL(name)\ - (GRN_TEXT_LEN(mode_name) == strlen(name) &&\ - memcmp(GRN_TEXT_VALUE(mode_name), name, strlen(name)) == 0) - - { - grn_obj tokens; - GRN_VALUE_FIX_SIZE_INIT(&tokens, GRN_OBJ_VECTOR, GRN_ID_NIL); - if (GRN_TEXT_LEN(mode_name) == 0 || MODE_NAME_EQUAL("ADD")) { - tokenize(ctx, lexicon, string, GRN_TOKEN_ADD, flags, &tokens); - output_tokens(ctx, &tokens, lexicon); - } else if (MODE_NAME_EQUAL("GET")) { - tokenize(ctx, lexicon, string, GRN_TOKEN_ADD, flags, &tokens); - GRN_BULK_REWIND(&tokens); - tokenize(ctx, lexicon, string, GRN_TOKEN_GET, flags, &tokens); - output_tokens(ctx, &tokens, lexicon); - } else { - ERR(GRN_INVALID_ARGUMENT, "[tokenize] invalid mode: <%.*s>", - (int)GRN_TEXT_LEN(mode_name), GRN_TEXT_VALUE(mode_name)); - } - GRN_OBJ_FIN(ctx, &tokens); - } -#undef MODE_NAME_EQUAL - - grn_obj_unlink(ctx, lexicon); - } - - return NULL; -} - -static grn_obj * -proc_table_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) -{ - grn_obj *table_name; - grn_obj *string; - grn_obj *flag_names; - grn_obj *mode_name; - - table_name = VAR(0); - string = VAR(1); - flag_names = VAR(2); - mode_name = VAR(3); - - if (GRN_TEXT_LEN(table_name) == 0) { - ERR(GRN_INVALID_ARGUMENT, "[table_tokenize] table name is missing"); - return NULL; - } - - if (GRN_TEXT_LEN(string) == 0) { - ERR(GRN_INVALID_ARGUMENT, "[table_tokenize] string is missing"); - return NULL; - } - - { - unsigned int flags; - grn_obj *lexicon; - - flags = parse_tokenize_flags(ctx, flag_names); - if (ctx->rc != GRN_SUCCESS) { - return NULL; - } - - lexicon = grn_ctx_get(ctx, GRN_TEXT_VALUE(table_name), GRN_TEXT_LEN(table_name)); - - if (!lexicon) { - return NULL; - } - -#define MODE_NAME_EQUAL(name)\ - (GRN_TEXT_LEN(mode_name) == strlen(name) &&\ - memcmp(GRN_TEXT_VALUE(mode_name), name, strlen(name)) == 0) - - { - grn_obj tokens; - GRN_VALUE_FIX_SIZE_INIT(&tokens, GRN_OBJ_VECTOR, GRN_ID_NIL); - if (GRN_TEXT_LEN(mode_name) == 0 || MODE_NAME_EQUAL("GET")) { - tokenize(ctx, lexicon, string, GRN_TOKEN_GET, flags, &tokens); - output_tokens(ctx, &tokens, lexicon); - } else if (MODE_NAME_EQUAL("ADD")) { - tokenize(ctx, lexicon, string, GRN_TOKEN_ADD, flags, &tokens); - output_tokens(ctx, &tokens, lexicon); - } else { - ERR(GRN_INVALID_ARGUMENT, "[table_tokenize] invalid mode: <%.*s>", - (int)GRN_TEXT_LEN(mode_name), GRN_TEXT_VALUE(mode_name)); - } - GRN_OBJ_FIN(ctx, &tokens); - } -#undef MODE_NAME_EQUAL - - grn_obj_unlink(ctx, lexicon); - } - - return NULL; -} - static void list_proc(grn_ctx *ctx, grn_proc_type target_proc_type, const char *name, const char *plural_name) @@ -4470,19 +4131,8 @@ grn_db_init_builtin_query(grn_ctx *ctx) DEF_VAR(vars[2], "flags"); DEF_COMMAND("normalize", proc_normalize, 3, vars); - DEF_VAR(vars[0], "tokenizer"); - DEF_VAR(vars[1], "string"); - DEF_VAR(vars[2], "normalizer"); - DEF_VAR(vars[3], "flags"); - DEF_VAR(vars[4], "mode"); - DEF_VAR(vars[5], "token_filters"); - DEF_COMMAND("tokenize", proc_tokenize, 6, vars); - - DEF_VAR(vars[0], "table"); - DEF_VAR(vars[1], "string"); - DEF_VAR(vars[2], "flags"); - DEF_VAR(vars[3], "mode"); - 
DEF_COMMAND("table_tokenize", proc_table_tokenize, 4, vars); + grn_proc_init_tokenize(ctx); + grn_proc_init_table_tokenize(ctx); DEF_COMMAND("tokenizer_list", proc_tokenizer_list, 0, vars); Added: lib/proc/proc_tokenize.c (+383 -0) 100644 =================================================================== --- /dev/null +++ lib/proc/proc_tokenize.c 2016-04-03 02:04:55 +0900 (0fc98c1) @@ -0,0 +1,383 @@ +/* -*- c-basic-offset: 2 -*- */ +/* + Copyright(C) 2009-2016 Brazil + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License version 2.1 as published by the Free Software Foundation. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#include "../grn_proc.h" +#include "../grn_ctx.h" +#include "../grn_token_cursor.h" + +#include <groonga/plugin.h> + +static unsigned int +parse_tokenize_flags(grn_ctx *ctx, grn_obj *flag_names) +{ + unsigned int flags = 0; + const char *names, *names_end; + int length; + + names = GRN_TEXT_VALUE(flag_names); + length = GRN_TEXT_LEN(flag_names); + names_end = names + length; + while (names < names_end) { + if (*names == '|' || *names == ' ') { + names += 1; + continue; + } + +#define CHECK_FLAG(name)\ + if (((names_end - names) >= (sizeof(#name) - 1)) &&\ + (!memcmp(names, #name, sizeof(#name) - 1))) {\ + flags |= GRN_TOKEN_CURSOR_ ## name;\ + names += sizeof(#name) - 1;\ + continue;\ + } + + CHECK_FLAG(ENABLE_TOKENIZED_DELIMITER); + +#define GRN_TOKEN_CURSOR_NONE 0 + CHECK_FLAG(NONE); +#undef GRN_TOKEN_CURSOR_NONE + + GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, + "[tokenize] invalid flag: <%.*s>", + (int)(names_end - names), names); + return 0; +#undef CHECK_FLAG + } + + return flags; +} + +typedef struct { + grn_id id; + int32_t position; + grn_bool force_prefix; +} tokenize_token; + +static void +output_tokens(grn_ctx *ctx, grn_obj *tokens, grn_obj *lexicon) +{ + int i, n_tokens; + + n_tokens = GRN_BULK_VSIZE(tokens) / sizeof(tokenize_token); + grn_ctx_output_array_open(ctx, "TOKENS", n_tokens); + for (i = 0; i < n_tokens; i++) { + tokenize_token *token; + char value[GRN_TABLE_MAX_KEY_SIZE]; + unsigned int value_size; + + token = ((tokenize_token *)(GRN_BULK_HEAD(tokens))) + i; + + grn_ctx_output_map_open(ctx, "TOKEN", 3); + + grn_ctx_output_cstr(ctx, "value"); + value_size = grn_table_get_key(ctx, lexicon, token->id, + value, GRN_TABLE_MAX_KEY_SIZE); + grn_ctx_output_str(ctx, value, value_size); + + grn_ctx_output_cstr(ctx, "position"); + grn_ctx_output_int32(ctx, token->position); + + grn_ctx_output_cstr(ctx, "force_prefix"); + grn_ctx_output_bool(ctx, token->force_prefix); + + grn_ctx_output_map_close(ctx); + } + grn_ctx_output_array_close(ctx); +} + +static grn_obj * +create_lexicon_for_tokenize(grn_ctx *ctx, + grn_obj *tokenizer_name, + grn_obj *normalizer_name, + grn_obj *token_filter_names) +{ + grn_obj *lexicon; + grn_obj *tokenizer; + grn_obj *normalizer = NULL; + + tokenizer = grn_ctx_get(ctx, + GRN_TEXT_VALUE(tokenizer_name), + GRN_TEXT_LEN(tokenizer_name)); + if (!tokenizer) { + GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, + "[tokenize] nonexistent 
+                     (int)GRN_TEXT_LEN(tokenizer_name),
+                     GRN_TEXT_VALUE(tokenizer_name));
+    return NULL;
+  }
+
+  if (!grn_obj_is_tokenizer_proc(ctx, tokenizer)) {
+    grn_obj inspected;
+    GRN_TEXT_INIT(&inspected, 0);
+    grn_inspect(ctx, &inspected, tokenizer);
+    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
+                     "[tokenize] not tokenizer: %.*s",
+                     (int)GRN_TEXT_LEN(&inspected),
+                     GRN_TEXT_VALUE(&inspected));
+    GRN_OBJ_FIN(ctx, &inspected);
+    grn_obj_unlink(ctx, tokenizer);
+    return NULL;
+  }
+
+  if (GRN_TEXT_LEN(normalizer_name) > 0) {
+    normalizer = grn_ctx_get(ctx,
+                             GRN_TEXT_VALUE(normalizer_name),
+                             GRN_TEXT_LEN(normalizer_name));
+    if (!normalizer) {
+      grn_obj_unlink(ctx, tokenizer);
+      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
+                       "[tokenize] nonexistent normalizer: <%.*s>",
+                       (int)GRN_TEXT_LEN(normalizer_name),
+                       GRN_TEXT_VALUE(normalizer_name));
+      return NULL;
+    }
+
+    if (!grn_obj_is_normalizer_proc(ctx, normalizer)) {
+      grn_obj inspected;
+      grn_obj_unlink(ctx, tokenizer);
+      GRN_TEXT_INIT(&inspected, 0);
+      grn_inspect(ctx, &inspected, normalizer);
+      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
+                       "[tokenize] not normalizer: %.*s",
+                       (int)GRN_TEXT_LEN(&inspected),
+                       GRN_TEXT_VALUE(&inspected));
+      GRN_OBJ_FIN(ctx, &inspected);
+      grn_obj_unlink(ctx, normalizer);
+      return NULL;
+    }
+  }
+
+  lexicon = grn_table_create(ctx, NULL, 0,
+                             NULL,
+                             GRN_OBJ_TABLE_HASH_KEY,
+                             grn_ctx_at(ctx, GRN_DB_SHORT_TEXT),
+                             NULL);
+  grn_obj_set_info(ctx, lexicon,
+                   GRN_INFO_DEFAULT_TOKENIZER, tokenizer);
+  grn_obj_unlink(ctx, tokenizer);
+  if (normalizer) {
+    grn_obj_set_info(ctx, lexicon,
+                     GRN_INFO_NORMALIZER, normalizer);
+    grn_obj_unlink(ctx, normalizer);
+  }
+  grn_proc_table_set_token_filters(ctx, lexicon, token_filter_names);
+
+  return lexicon;
+}
+
+static void
+tokenize(grn_ctx *ctx, grn_obj *lexicon, grn_obj *string, grn_tokenize_mode mode,
+         unsigned int flags, grn_obj *tokens)
+{
+  grn_token_cursor *token_cursor;
+
+  token_cursor =
+    grn_token_cursor_open(ctx, lexicon,
+                          GRN_TEXT_VALUE(string), GRN_TEXT_LEN(string),
+                          mode, flags);
+  if (!token_cursor) {
+    return;
+  }
+
+  while (token_cursor->status == GRN_TOKEN_CURSOR_DOING) {
+    grn_id token_id = grn_token_cursor_next(ctx, token_cursor);
+    tokenize_token *current_token;
+    if (token_id == GRN_ID_NIL) {
+      continue;
+    }
+    grn_bulk_space(ctx, tokens, sizeof(tokenize_token));
+    current_token = ((tokenize_token *)(GRN_BULK_CURR(tokens))) - 1;
+    current_token->id = token_id;
+    current_token->position = token_cursor->pos;
+    current_token->force_prefix = token_cursor->force_prefix;
+  }
+  grn_token_cursor_close(ctx, token_cursor);
+}
+
+static grn_obj *
+command_table_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
+{
+  grn_obj *table_name;
+  grn_obj *string;
+  grn_obj *flag_names;
+  grn_obj *mode_name;
+
+  table_name = grn_plugin_proc_get_var(ctx, user_data, "table", -1);
+  string = grn_plugin_proc_get_var(ctx, user_data, "string", -1);
+  flag_names = grn_plugin_proc_get_var(ctx, user_data, "flags", -1);
+  mode_name = grn_plugin_proc_get_var(ctx, user_data, "mode", -1);
+
+  if (GRN_TEXT_LEN(table_name) == 0) {
+    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "[table_tokenize] table name is missing");
+    return NULL;
+  }
+
+  if (GRN_TEXT_LEN(string) == 0) {
+    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "[table_tokenize] string is missing");
+    return NULL;
+  }
+
+  {
+    unsigned int flags;
+    grn_obj *lexicon;
+
+    flags = parse_tokenize_flags(ctx, flag_names);
+    if (ctx->rc != GRN_SUCCESS) {
+      return NULL;
+    }
+
+    lexicon = grn_ctx_get(ctx, GRN_TEXT_VALUE(table_name), GRN_TEXT_LEN(table_name));
+
+    if (!lexicon) {
+      return NULL;
+    }
+
+#define MODE_NAME_EQUAL(name)\
+    (GRN_TEXT_LEN(mode_name) == strlen(name) &&\
+     memcmp(GRN_TEXT_VALUE(mode_name), name, strlen(name)) == 0)
+
+    {
+      grn_obj tokens;
+      GRN_VALUE_FIX_SIZE_INIT(&tokens, GRN_OBJ_VECTOR, GRN_ID_NIL);
+      if (GRN_TEXT_LEN(mode_name) == 0 || MODE_NAME_EQUAL("GET")) {
+        tokenize(ctx, lexicon, string, GRN_TOKEN_GET, flags, &tokens);
+        output_tokens(ctx, &tokens, lexicon);
+      } else if (MODE_NAME_EQUAL("ADD")) {
+        tokenize(ctx, lexicon, string, GRN_TOKEN_ADD, flags, &tokens);
+        output_tokens(ctx, &tokens, lexicon);
+      } else {
+        GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
+                         "[table_tokenize] invalid mode: <%.*s>",
+                         (int)GRN_TEXT_LEN(mode_name), GRN_TEXT_VALUE(mode_name));
+      }
+      GRN_OBJ_FIN(ctx, &tokens);
+    }
+#undef MODE_NAME_EQUAL
+
+    grn_obj_unlink(ctx, lexicon);
+  }
+
+  return NULL;
+}
+
+void
+grn_proc_init_table_tokenize(grn_ctx *ctx)
+{
+  grn_expr_var vars[4];
+
+  grn_plugin_expr_var_init(ctx, &(vars[0]), "table", -1);
+  grn_plugin_expr_var_init(ctx, &(vars[1]), "string", -1);
+  grn_plugin_expr_var_init(ctx, &(vars[2]), "flags", -1);
+  grn_plugin_expr_var_init(ctx, &(vars[3]), "mode", -1);
+  grn_plugin_command_create(ctx,
+                            "table_tokenize", -1,
+                            command_table_tokenize,
+                            4,
+                            vars);
+}
+
+static grn_obj *
+command_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
+{
+  grn_obj *tokenizer_name;
+  grn_obj *string;
+  grn_obj *normalizer_name;
+  grn_obj *flag_names;
+  grn_obj *mode_name;
+  grn_obj *token_filter_names;
+
+  tokenizer_name = grn_plugin_proc_get_var(ctx, user_data, "tokenizer", -1);
+  string = grn_plugin_proc_get_var(ctx, user_data, "string", -1);
+  normalizer_name = grn_plugin_proc_get_var(ctx, user_data, "normalizer", -1);
+  flag_names = grn_plugin_proc_get_var(ctx, user_data, "flags", -1);
+  mode_name = grn_plugin_proc_get_var(ctx, user_data, "mode", -1);
+  token_filter_names = grn_plugin_proc_get_var(ctx, user_data, "token_filters", -1);
+
+  if (GRN_TEXT_LEN(tokenizer_name) == 0) {
+    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "[tokenize] tokenizer name is missing");
+    return NULL;
+  }
+
+  if (GRN_TEXT_LEN(string) == 0) {
+    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "[tokenize] string is missing");
+    return NULL;
+  }
+
+  {
+    unsigned int flags;
+    grn_obj *lexicon;
+
+    flags = parse_tokenize_flags(ctx, flag_names);
+    if (ctx->rc != GRN_SUCCESS) {
+      return NULL;
+    }
+
+    lexicon = create_lexicon_for_tokenize(ctx,
+                                          tokenizer_name,
+                                          normalizer_name,
+                                          token_filter_names);
+    if (!lexicon) {
+      return NULL;
+    }
+#define MODE_NAME_EQUAL(name)\
+    (GRN_TEXT_LEN(mode_name) == strlen(name) &&\
+     memcmp(GRN_TEXT_VALUE(mode_name), name, strlen(name)) == 0)
+
+    {
+      grn_obj tokens;
+      GRN_VALUE_FIX_SIZE_INIT(&tokens, GRN_OBJ_VECTOR, GRN_ID_NIL);
+      if (GRN_TEXT_LEN(mode_name) == 0 || MODE_NAME_EQUAL("ADD")) {
+        tokenize(ctx, lexicon, string, GRN_TOKEN_ADD, flags, &tokens);
+        output_tokens(ctx, &tokens, lexicon);
+      } else if (MODE_NAME_EQUAL("GET")) {
+        tokenize(ctx, lexicon, string, GRN_TOKEN_ADD, flags, &tokens);
+        GRN_BULK_REWIND(&tokens);
+        tokenize(ctx, lexicon, string, GRN_TOKEN_GET, flags, &tokens);
+        output_tokens(ctx, &tokens, lexicon);
+      } else {
+        GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
+                         "[tokenize] invalid mode: <%.*s>",
+                         (int)GRN_TEXT_LEN(mode_name), GRN_TEXT_VALUE(mode_name));
+      }
+      GRN_OBJ_FIN(ctx, &tokens);
+    }
+#undef MODE_NAME_EQUAL
+
+    grn_obj_unlink(ctx, lexicon);
+  }
+
+  return NULL;
+}
+
+void
+grn_proc_init_tokenize(grn_ctx *ctx)
+{
+  grn_expr_var vars[6];
+
+  grn_plugin_expr_var_init(ctx, &(vars[0]), "tokenizer", -1);
+  grn_plugin_expr_var_init(ctx, &(vars[1]), "string", -1);
+  grn_plugin_expr_var_init(ctx, &(vars[2]), "normalizer", -1);
+  grn_plugin_expr_var_init(ctx, &(vars[3]), "flags", -1);
+  grn_plugin_expr_var_init(ctx, &(vars[4]), "mode", -1);
+  grn_plugin_expr_var_init(ctx, &(vars[5]), "token_filters", -1);
+  grn_plugin_command_create(ctx,
+                            "tokenize", -1,
+                            command_tokenize,
+                            6,
+                            vars);
+}

  Modified: lib/proc/sources.am (+2 -1)
===================================================================
--- lib/proc/sources.am    2016-04-03 00:53:15 +0900 (0c0d174)
+++ lib/proc/sources.am    2016-04-03 02:04:55 +0900 (f0f88b1)
@@ -9,4 +9,5 @@ libgrnproc_la_SOURCES = \
 	proc_select.c \
 	proc_snippet.c \
 	proc_highlight.c \
-	proc_table.c
+	proc_table.c \
+	proc_tokenize.c
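
For reference, the extracted commands keep their existing interfaces. A minimal
usage sketch, assuming the built-in TokenBigram tokenizer and NormalizerAuto
normalizer and a hypothetical lexicon table named Terms; the output shape
follows output_tokens() above:

  # tokenize builds a temporary lexicon from the given tokenizer/normalizer
  # (positional arguments: tokenizer, string, normalizer, flags, mode,
  # token_filters):
  tokenize TokenBigram "Fulltext Search" NormalizerAuto
  # Each token is output as a map of "value", "position" and "force_prefix",
  # e.g. [{"value":"fu","position":0,"force_prefix":false}, ...]

  # table_tokenize reuses an existing lexicon table; mode defaults to GET:
  table_tokenize Terms "Fulltext Search" --mode GET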