Kouhei Sutou
null+****@clear*****
Fri Oct 10 19:31:43 JST 2014
Kouhei Sutou 2014-10-10 19:31:43 +0900 (Fri, 10 Oct 2014) New Revision: 7b1f7a2f620e3ab698c6e0f09e78a8e6048d00ed https://github.com/groonga/groonga/commit/7b1f7a2f620e3ab698c6e0f09e78a8e6048d00ed Merged 86db854: Merge pull request #214 from groonga/improve-token-filter-api Message: token-filter: use token filter specific callbacks instead of generic callbacks Generic callbacks are difficult to understand how to use API because its signature doesn't show what arguments are passed. Specific callbacks are easy to understand how to use API because its signature shows what arguments are passed. Modified files: include/groonga/token_filter.h include/groonga/tokenizer.h lib/db.h lib/token.c lib/token.h lib/token_filter.c lib/tokenizer.c plugins/token_filters/stop_word.c Modified: include/groonga/token_filter.h (+29 -10) =================================================================== --- include/groonga/token_filter.h 2014-10-05 12:36:13 +0900 (347715a) +++ include/groonga/token_filter.h 2014-10-10 19:31:43 +0900 (2ae553b) @@ -26,24 +26,43 @@ extern "C" { #endif /* __cplusplus */ +typedef void *grn_token_filter_init_func(grn_ctx *ctx, + grn_obj *table, + grn_token_mode mode); + +typedef void grn_token_filter_filter_func(grn_ctx *ctx, + grn_token *current_token, + grn_token *next_token, + void *user_data); + +typedef void grn_token_filter_fin_func(grn_ctx *ctx, + void *user_data); + + /* grn_token_filter_register() registers a plugin to the database which is associated with `ctx'. `plugin_name_ptr' and `plugin_name_length' specify the plugin name. Alphabetic letters ('A'-'Z' and 'a'-'z'), digits ('0'-'9') and - an underscore ('_') are capable characters. `init', `next' and `fin' specify - the plugin functions. `init' is called for initializing a token_filter for a - document or query. `next' is called for extracting tokens one by one. `fin' - is called for finalizing a token_filter. grn_token_filter_register() returns - GRN_SUCCESS on success, an error code on failure. 
See "groonga.h" for more - details of grn_proc_func and grn_user_data, that is used as an argument of - grn_proc_func. + an underscore ('_') are capable characters. + + `init', `filter' and `fin' specify the plugin functions. + + `init' is called for initializing a token_filter for a document or + query. + + `filter' is called for filtering tokens one by one. + + `fin' is called for finalizing a token_filter. + + grn_token_filter_register() returns GRN_SUCCESS on success, an error + code on failure. */ GRN_PLUGIN_EXPORT grn_rc grn_token_filter_register(grn_ctx *ctx, const char *plugin_name_ptr, int plugin_name_length, - grn_proc_func *init, - grn_proc_func *next, - grn_proc_func *fin); + grn_token_filter_init_func *init, + grn_token_filter_filter_func *filter, + grn_token_filter_fin_func *fin); #ifdef __cplusplus } /* extern "C" */ Modified: include/groonga/tokenizer.h (+15 -0) =================================================================== --- include/groonga/tokenizer.h 2014-10-05 12:36:13 +0900 (b57a513) +++ include/groonga/tokenizer.h 2014-10-10 19:31:43 +0900 (9792e01) @@ -193,6 +193,21 @@ typedef unsigned int grn_tokenizer_status; #define GRN_TOKENIZER_CONTINUE GRN_TOKENIZER_TOKEN_CONTINUE #define GRN_TOKENIZER_LAST GRN_TOKENIZER_TOKEN_LAST +typedef struct _grn_token grn_token; + +GRN_PLUGIN_EXPORT grn_obj *grn_token_get_data(grn_ctx *ctx, + grn_token *token); +GRN_PLUGIN_EXPORT grn_rc grn_token_set_data(grn_ctx *ctx, + grn_token *token, + const char *str_ptr, + int str_length); +GRN_PLUGIN_EXPORT grn_tokenizer_status grn_token_get_status(grn_ctx *ctx, + grn_token *token); +GRN_PLUGIN_EXPORT grn_rc grn_token_set_status(grn_ctx *ctx, + grn_token *token, + grn_tokenizer_status status); + + /* grn_tokenizer_token_push() pushes the next token into `token'. Note that grn_tokenizer_token_push() does not make a copy of the given string. 
This Modified: lib/db.h (+12 -0) =================================================================== --- lib/db.h 2014-10-05 12:36:13 +0900 (e9d2278) +++ lib/db.h 2014-10-10 19:31:43 +0900 (f2f9556) @@ -29,6 +29,8 @@ #include "store.h" #endif /* GRN_STORE_H */ +#include <groonga/token_filter.h> + #ifdef __cplusplus extern "C" { #endif @@ -185,6 +187,16 @@ struct _grn_proc { grn_selector_func *selector; + union { + struct { + grn_token_filter_init_func *init; + grn_token_filter_filter_func *filter; + grn_token_filter_fin_func *fin; + } token_filter; + } callbacks; + + void *user_data; + grn_id module; // uint32_t nargs; // uint32_t nresults; Modified: lib/token.c (+40 -61) =================================================================== --- lib/token.c 2014-10-05 12:36:13 +0900 (f0a0ff7) +++ lib/token.c 2014-10-10 19:31:43 +0900 (00b3de2) @@ -501,7 +501,6 @@ grn_token_cursor_open_initialize_token_filters(grn_ctx *ctx, { grn_obj *token_filters = token_cursor->token_filters; unsigned int i, n_token_filters; - grn_obj mode; if (token_filters) { n_token_filters = GRN_BULK_VSIZE(token_filters) / sizeof(grn_obj *); @@ -509,41 +508,15 @@ grn_token_cursor_open_initialize_token_filters(grn_ctx *ctx, n_token_filters = 0; } - if (n_token_filters == 0) { - token_cursor->token_filter_ctxs = NULL; - return; - } - - token_cursor->token_filter_ctxs = - GRN_MALLOC(sizeof(grn_proc_ctx) * n_token_filters); - if (!token_cursor->token_filter_ctxs) { - ERR(GRN_NO_MEMORY_AVAILABLE, - "[token-cursor][open] failed to allocate token filter contexts"); - return; - } - - GRN_UINT32_INIT(&mode, 0); - GRN_UINT32_SET(ctx, &mode, token_cursor->mode); for (i = 0; i < n_token_filters; i++) { - grn_obj *token_filter = GRN_PTR_VALUE_AT(token_filters, i); - grn_proc_ctx *token_filter_ctx = &token_cursor->token_filter_ctxs[i]; - int n_args = 0; - grn_obj *args[2]; - - token_filter_ctx->caller = NULL; - token_filter_ctx->user_data.ptr = NULL; - token_filter_ctx->proc = (grn_proc *)token_filter; - 
token_filter_ctx->hooks = NULL; - token_filter_ctx->currh = NULL; - token_filter_ctx->phase = PROC_INIT; - - args[n_args++] = token_cursor->table; - args[n_args++] = &mode; - ((grn_proc *)token_filter)->funcs[PROC_INIT](ctx, - n_args, args, - &token_filter_ctx->user_data); + grn_obj *token_filter_object = GRN_PTR_VALUE_AT(token_filters, i); + grn_proc *token_filter = (grn_proc *)token_filter_object; + + token_filter->user_data = + token_filter->callbacks.token_filter.init(ctx, + token_cursor->table, + token_cursor->mode); } - GRN_OBJ_FIN(ctx, &mode); } grn_token_cursor * @@ -625,45 +598,58 @@ grn_token_cursor_open(grn_ctx *ctx, grn_obj *table, static int grn_token_cursor_next_apply_token_filters(grn_ctx *ctx, grn_token_cursor *token_cursor, - grn_obj *current_token, + grn_obj *current_token_data, grn_obj *status) { grn_obj *token_filters = token_cursor->token_filters; unsigned int i, n_token_filters; + grn_token current_token; + grn_token next_token; if (token_filters) { n_token_filters = GRN_BULK_VSIZE(token_filters) / sizeof(grn_obj *); } else { n_token_filters = 0; } + + GRN_TEXT_INIT(&(current_token.data), GRN_OBJ_DO_SHALLOW_COPY); + GRN_TEXT_SET(ctx, &(current_token.data), + GRN_TEXT_VALUE(current_token_data), + GRN_TEXT_LEN(current_token_data)); + current_token.status = GRN_INT32_VALUE(status); + GRN_TEXT_INIT(&(next_token.data), GRN_OBJ_DO_SHALLOW_COPY); + GRN_TEXT_SET(ctx, &(next_token.data), + GRN_TEXT_VALUE(&(current_token.data)), + GRN_TEXT_LEN(&(current_token.data))); + next_token.status = current_token.status; + for (i = 0; i < n_token_filters; i++) { - grn_obj *token_filter = GRN_PTR_VALUE_AT(token_filters, i); - grn_proc_ctx *token_filter_ctx = &token_cursor->token_filter_ctxs[i]; - int n_args = 0; - grn_obj *args[2]; + grn_obj *token_filter_object = GRN_PTR_VALUE_AT(token_filters, i); + grn_proc *token_filter = (grn_proc *)token_filter_object; #define SKIP_FLAGS\ (GRN_TOKENIZER_TOKEN_SKIP |\ GRN_TOKENIZER_TOKEN_SKIP_WITH_POSITION) - if 
(GRN_INT32_VALUE(status) & SKIP_FLAGS) { + if (current_token.status & SKIP_FLAGS) { break; } #undef SKIP_FLAGS - args[n_args++] = current_token; - args[n_args++] = status; - ((grn_proc *)token_filter)->funcs[PROC_NEXT](ctx, - n_args, - args, - &token_filter_ctx->user_data); - status = grn_ctx_pop(ctx); - current_token = grn_ctx_pop(ctx); + token_filter->callbacks.token_filter.filter(ctx, + &current_token, + &next_token, + token_filter->user_data); + GRN_TEXT_SET(ctx, &(current_token.data), + GRN_TEXT_VALUE(&(next_token.data)), + GRN_TEXT_LEN(&(next_token.data))); + current_token.status = next_token.status; } - token_cursor->curr = (const unsigned char *)GRN_TEXT_VALUE(current_token); - token_cursor->curr_size = GRN_TEXT_LEN(current_token); + token_cursor->curr = + (const unsigned char *)GRN_TEXT_VALUE(&(current_token.data)); + token_cursor->curr_size = GRN_TEXT_LEN(&(current_token.data)); - return GRN_INT32_VALUE(status); + return current_token.status; } grn_id @@ -811,17 +797,10 @@ grn_token_cursor_close_token_filters(grn_ctx *ctx, n_token_filters = 0; } for (i = 0; i < n_token_filters; i++) { - grn_obj *token_filter = GRN_PTR_VALUE_AT(token_filters, i); - grn_proc_ctx *token_filter_ctx = &token_cursor->token_filter_ctxs[i]; - - ((grn_proc *)token_filter)->funcs[PROC_FIN](ctx, - 1, - &token_cursor->table, - &token_filter_ctx->user_data); - } + grn_obj *token_filter_object = GRN_PTR_VALUE_AT(token_filters, i); + grn_proc *token_filter = (grn_proc *)token_filter_object; - if (token_cursor->token_filter_ctxs) { - GRN_FREE(token_cursor->token_filter_ctxs); + token_filter->callbacks.token_filter.fin(ctx, token_filter->user_data); } } Modified: lib/token.h (+5 -1) =================================================================== --- lib/token.h 2014-10-05 12:36:13 +0900 (89a2eb0) +++ lib/token.h 2014-10-10 19:31:43 +0900 (8689307) @@ -46,6 +46,11 @@ typedef enum { GRN_TOKEN_NOT_FOUND } grn_token_status; +struct _grn_token { + grn_obj data; + grn_tokenizer_status status; +};
+ typedef struct { grn_obj *table; const unsigned char *orig; @@ -61,7 +66,6 @@ typedef struct { grn_obj *tokenizer; grn_proc_ctx pctx; grn_obj *token_filters; - grn_proc_ctx *token_filter_ctxs; uint32_t variant; grn_obj *nstr; } grn_token_cursor; Modified: lib/token_filter.c (+22 -19) =================================================================== --- lib/token_filter.c 2014-10-05 12:36:13 +0900 (470f9f1) +++ lib/token_filter.c 2014-10-10 19:31:43 +0900 (910bb9a) @@ -19,38 +19,41 @@ #include <string.h> #include "groonga_in.h" +#include "db.h" #include <groonga/token_filter.h> grn_rc grn_token_filter_register(grn_ctx *ctx, const char *plugin_name_ptr, int plugin_name_length, - grn_proc_func *init, - grn_proc_func *next, - grn_proc_func *fin) + grn_token_filter_init_func *init, + grn_token_filter_filter_func *filter, + grn_token_filter_fin_func *fin) { - grn_expr_var vars[] = { - { NULL, 0 }, - { NULL, 0 }, - { NULL, 0 } - }; - GRN_TEXT_INIT(&vars[0].value, 0); - GRN_TEXT_INIT(&vars[1].value, 0); - GRN_UINT32_INIT(&vars[2].value, 0); - if (plugin_name_length == -1) { plugin_name_length = strlen(plugin_name_ptr); } + { - grn_obj * const obj = grn_proc_create(ctx, - plugin_name_ptr, - plugin_name_length, - GRN_PROC_TOKENIZER, - init, next, fin, 3, vars); - if (obj == NULL) { - GRN_PLUGIN_ERROR(ctx, GRN_TOKEN_FILTER_ERROR, "grn_proc_create() failed"); + grn_obj *token_filter_object = grn_proc_create(ctx, + plugin_name_ptr, + plugin_name_length, + GRN_PROC_TOKENIZER, + NULL, NULL, NULL, 0, NULL); + if (token_filter_object == NULL) { + GRN_PLUGIN_ERROR(ctx, GRN_TOKEN_FILTER_ERROR, + "[token-filter][%.*s] failed to grn_proc_create()", + plugin_name_length, plugin_name_ptr); return ctx->rc; } + + { + grn_proc *token_filter = (grn_proc *)token_filter_object; + token_filter->callbacks.token_filter.init = init; + token_filter->callbacks.token_filter.filter = filter; + token_filter->callbacks.token_filter.fin = fin; + } } + return GRN_SUCCESS; } Modified: lib/tokenizer.c 
(+57 -1) =================================================================== --- lib/tokenizer.c 2014-10-05 12:36:13 +0900 (fc3495d) +++ lib/tokenizer.c 2014-10-10 19:31:43 +0900 (ac4628b) @@ -1,6 +1,6 @@ /* -*- c-basic-offset: 2 -*- */ /* - Copyright(C) 2012 Brazil + Copyright(C) 2012-2014 Brazil This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public @@ -318,3 +318,59 @@ grn_tokenizer_register(grn_ctx *ctx, const char *plugin_name_ptr, } return GRN_SUCCESS; } + +grn_obj * +grn_token_get_data(grn_ctx *ctx, grn_token *token) +{ + GRN_API_ENTER; + if (!token) { + ERR(GRN_INVALID_ARGUMENT, "token must not be NULL"); + GRN_API_RETURN(NULL); + } + GRN_API_RETURN(&(token->data)); +} + +grn_rc +grn_token_set_data(grn_ctx *ctx, + grn_token *token, + const char *str_ptr, + int str_length) +{ + GRN_API_ENTER; + if (!token) { + ERR(GRN_INVALID_ARGUMENT, "token must not be NULL"); + goto exit; + } + if (str_length == -1) { + str_length = strlen(str_ptr); + } + GRN_TEXT_SET(ctx, &(token->data), str_ptr, str_length); +exit: + GRN_API_RETURN(ctx->rc); +} + +grn_tokenizer_status +grn_token_get_status(grn_ctx *ctx, grn_token *token) +{ + GRN_API_ENTER; + if (!token) { + ERR(GRN_INVALID_ARGUMENT, "token must not be NULL"); + GRN_API_RETURN(GRN_TOKENIZER_TOKEN_CONTINUE); + } + GRN_API_RETURN(token->status); +} + +grn_rc +grn_token_set_status(grn_ctx *ctx, + grn_token *token, + grn_tokenizer_status status) +{ + GRN_API_ENTER; + if (!token) { + ERR(GRN_INVALID_ARGUMENT, "token must not be NULL"); + goto exit; + } + token->status = status; +exit: + GRN_API_RETURN(ctx->rc); +} Modified: plugins/token_filters/stop_word.c (+25 -28) =================================================================== --- plugins/token_filters/stop_word.c 2014-10-05 12:36:13 +0900 (6bb6311) +++ plugins/token_filters/stop_word.c 2014-10-10 19:31:43 +0900 (21451f0) @@ -33,8 +33,8 @@ typedef struct { grn_tokenizer_token token; } 
grn_stop_word_token_filter; -static grn_obj * -stop_word_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) +static void * +stop_word_init(grn_ctx *ctx, grn_obj *table, grn_token_mode mode) { grn_stop_word_token_filter *token_filter; @@ -46,8 +46,8 @@ stop_word_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data return NULL; } - token_filter->table = args[0]; - token_filter->mode = GRN_UINT32_VALUE(args[1]); + token_filter->table = table; + token_filter->mode = mode; token_filter->column = grn_obj_column(ctx, token_filter->table, COLUMN_NAME, @@ -70,27 +70,29 @@ stop_word_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data return NULL; } - user_data->ptr = token_filter; - GRN_BOOL_INIT(&(token_filter->value), 0); grn_tokenizer_token_init(ctx, &(token_filter->token)); - return NULL; + return token_filter; } -static grn_obj * -stop_word_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) +static void +stop_word_filter(grn_ctx *ctx, + grn_token *current_token, + grn_token *next_token, + void *user_data) { - grn_stop_word_token_filter *token_filter = user_data->ptr; - grn_obj *current_token = args[0]; - int status = GRN_INT32_VALUE(args[1]); + grn_stop_word_token_filter *token_filter = user_data; if (token_filter->mode == GRN_TOKEN_GET) { grn_id id; + grn_obj *data; + + data = grn_token_get_data(ctx, current_token); id = grn_table_get(ctx, token_filter->table, - GRN_TEXT_VALUE(current_token), - GRN_TEXT_LEN(current_token)); + GRN_TEXT_VALUE(data), + GRN_TEXT_LEN(data)); if (id != GRN_ID_NIL) { GRN_BULK_REWIND(&(token_filter->value)); grn_obj_get_value(ctx, @@ -98,32 +100,27 @@ stop_word_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data id, &(token_filter->value)); if (GRN_BOOL_VALUE(&(token_filter->value))) { + grn_tokenizer_status status; + status = grn_token_get_status(ctx, current_token); status |= GRN_TOKENIZER_TOKEN_SKIP; + grn_token_set_status(ctx, next_token, 
status); } } } - - grn_tokenizer_token_push(ctx, - &(token_filter->token), - GRN_TEXT_VALUE(current_token), - GRN_TEXT_LEN(current_token), - status); - - return NULL; } -static grn_obj * -stop_word_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) +static void +stop_word_fin(grn_ctx *ctx, void *user_data) { - grn_stop_word_token_filter *token_filter = user_data->ptr; + grn_stop_word_token_filter *token_filter = user_data; if (!token_filter) { - return NULL; + return; } + grn_tokenizer_token_fin(ctx, &(token_filter->token)); grn_obj_unlink(ctx, token_filter->column); grn_obj_unlink(ctx, &(token_filter->value)); GRN_PLUGIN_FREE(ctx, token_filter); - return NULL; } grn_rc @@ -140,7 +137,7 @@ GRN_PLUGIN_REGISTER(grn_ctx *ctx) rc = grn_token_filter_register(ctx, "TokenFilterStopWord", -1, stop_word_init, - stop_word_next, + stop_word_filter, stop_word_fin); return rc; -------------- next part -------------- HTMLの添付ファイルを保管しました...Télécharger