Kouhei Sutou	2018-11-13 11:05:43 +0900 (Tue, 13 Nov 2018)

  Revision: 2033d06c6aabb61e7136ff279477835f3827c00e
  https://github.com/groonga/groonga/commit/2033d06c6aabb61e7136ff279477835f3827c00e

  Message:
    Add grn_tokenizer_next_by_tokenized_delimiter()

    It's for the new tokenizer next API, grn_tokenizer_next_func.

  Modified files:
    include/groonga/tokenizer.h
    lib/tokenizer.c
    lib/tokenizers.c
    plugins/tokenizers/mecab.c

  Modified: include/groonga/tokenizer.h (+20 -1)
===================================================================
--- include/groonga/tokenizer.h    2018-11-13 10:53:33 +0900 (61468ef59)
+++ include/groonga/tokenizer.h    2018-11-13 11:05:43 +0900 (693d8686a)
@@ -254,6 +254,10 @@ GRN_PLUGIN_EXPORT void grn_tokenizer_token_push(grn_ctx *ctx, grn_tokenizer_toke
   the next token into `token'. It returns the string after the next
   token. The returned string may be `NULL' when all tokens are
   extracted.
+
+  @deprecated since 8.0.9. It's for old tokenizer next API. Use
+  grn_tokenizer_next_by_tokenized_delimiter() for new tokenizer next
+  API (grn_tokenizer_next_func).
  */
 GRN_PLUGIN_EXPORT const char *grn_tokenizer_tokenized_delimiter_next(grn_ctx *ctx,
                                                                      grn_tokenizer_token *token,
@@ -262,6 +266,21 @@ GRN_PLUGIN_EXPORT const char *grn_tokenizer_tokenized_delimiter_next(grn_ctx *ct
                                                                      grn_encoding encoding);
 
 /*
+  Extract the next token by delimiting by
+  GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8.
+
+  This is for grn_tokenizer_next_func.
+
+  @since 8.0.9.
+ */
+GRN_PLUGIN_EXPORT const char *
+grn_tokenizer_next_by_tokenized_delimiter(grn_ctx *ctx,
+                                          grn_token *token,
+                                          const char *str_ptr,
+                                          unsigned int str_length,
+                                          grn_encoding encoding);
+
+/*
   grn_tokenizer_register() registers a plugin to the database which is
   associated with `ctx'. `plugin_name_ptr' and `plugin_name_length' specify the
   plugin name. Alphabetic letters ('A'-'Z' and 'a'-'z'), digits ('0'-'9') and
@@ -273,7 +292,7 @@ GRN_PLUGIN_EXPORT const char *grn_tokenizer_tokenized_delimiter_next(grn_ctx *ct
   details of grn_proc_func and grn_user_data, that is used as an argument of
   grn_proc_func.
 
-  Deprecated since 8.0.2. Use grn_tokenizer_create() and
+  @deprecated since 8.0.2. Use grn_tokenizer_create() and
   grn_tokenizer_set_*_func().
  */
 GRN_PLUGIN_EXPORT grn_rc
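
The header above declares the new entry point. As a usage sketch (not part of
this commit), a grn_tokenizer_next_func callback could delegate to it as
follows. The my_tokenizer struct, its fields, and my_next() are hypothetical
names; the cursor bookkeeping simply mirrors what delimit_next() and
mecab_next() do in the diffs below.

    #include <groonga/tokenizer.h>

    /* Hypothetical per-tokenizer state: `next' and `end' delimit the
       not-yet-consumed part of the input. */
    typedef struct {
      const char *next;
      const char *end;
      grn_encoding encoding;
    } my_tokenizer;

    /* A grn_tokenizer_next_func implementation: each call emits one
       token and remembers where the following call must resume. */
    static void
    my_next(grn_ctx *ctx,
            grn_tokenizer_query *query,
            grn_token *token,
            void *user_data)
    {
      my_tokenizer *tokenizer = user_data;
      (void)query; /* unused in this sketch */
      /* The helper fills `token' (data plus GRN_TOKEN_CONTINUE or
         GRN_TOKEN_LAST status) and returns where the next token
         starts, or NULL after the last one. */
      tokenizer->next =
        grn_tokenizer_next_by_tokenized_delimiter(
          ctx,
          token,
          tokenizer->next,
          (unsigned int)(tokenizer->end - tokenizer->next),
          tokenizer->encoding);
    }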

  Modified: lib/tokenizer.c (+35 -0)
===================================================================
--- lib/tokenizer.c    2018-11-13 10:53:33 +0900 (3cefaa4a5)
+++ lib/tokenizer.c    2018-11-13 11:05:43 +0900 (e4198b9e3)
@@ -510,6 +510,41 @@ grn_tokenizer_tokenized_delimiter_next(grn_ctx *ctx,
   return next_start;
 }
 
+const char *
+grn_tokenizer_next_by_tokenized_delimiter(grn_ctx *ctx,
+                                          grn_token *token,
+                                          const char *str_ptr,
+                                          unsigned int str_length,
+                                          grn_encoding encoding)
+{
+  size_t char_length = 0;
+  const char *start = str_ptr;
+  const char *current;
+  const char *end = str_ptr + str_length;
+  const char *next_start = NULL;
+
+  for (current = start; current < end; current += char_length) {
+    char_length = grn_charlen_(ctx, current, end, encoding);
+    if (char_length == 0) {
+      break;
+    }
+    if (grn_tokenizer_is_tokenized_delimiter(ctx, current, char_length,
+                                             encoding)) {
+      next_start = str_ptr + (current - start + char_length);
+      break;
+    }
+  }
+
+  grn_token_set_data(ctx, token, start, current - start);
+  if (current == end) {
+    grn_token_set_status(ctx, token, GRN_TOKEN_LAST);
+  } else {
+    grn_token_set_status(ctx, token, GRN_TOKEN_CONTINUE);
+  }
+
+  return next_start;
+}
+
 grn_rc
 grn_tokenizer_register(grn_ctx *ctx, const char *plugin_name_ptr,
                        unsigned int plugin_name_length,

  Modified: lib/tokenizers.c (+2 -14)
===================================================================
--- lib/tokenizers.c    2018-11-13 10:53:33 +0900 (2f599c3e9)
+++ lib/tokenizers.c    2018-11-13 11:05:43 +0900 (3b3ad3307)
@@ -110,7 +110,6 @@ typedef struct {
 } grn_delimit_options_default;
 
 typedef struct {
-  grn_tokenizer_token token;
   grn_tokenizer_query *query;
   grn_delimit_options *options;
   grn_bool have_tokenized_delimiter;
@@ -207,7 +206,6 @@ delimit_init_raw(grn_ctx *ctx,
     return NULL;
   }
 
-  grn_tokenizer_token_init(ctx, &(tokenizer->token));
   tokenizer->query = query;
   tokenizer->options = options;
 
@@ -276,23 +274,14 @@ delimit_next(grn_ctx *ctx,
 
   if (tokenizer->have_tokenized_delimiter) {
     unsigned int rest_length;
-    grn_obj *status;
-    grn_obj *data;
     rest_length = tokenizer->end - tokenizer->next;
     tokenizer->next =
-      (unsigned char *)grn_tokenizer_tokenized_delimiter_next(
+      (unsigned char *)grn_tokenizer_next_by_tokenized_delimiter(
         ctx,
-        &(tokenizer->token),
+        token,
         (const char *)tokenizer->next,
         rest_length,
         tokenizer->encoding);
-    status = grn_ctx_pop(ctx);
-    data = grn_ctx_pop(ctx);
-    grn_token_set_data(ctx,
-                       token,
-                       GRN_TEXT_VALUE(data),
-                       GRN_TEXT_LEN(data));
-    grn_token_set_status(ctx, token, GRN_UINT32_VALUE(status));
   } else {
     size_t cl;
     const unsigned char *p = tokenizer->next, *r;
@@ -359,7 +348,6 @@ delimit_fin(grn_ctx *ctx, void *user_data)
   if (!tokenizer) {
     return;
   }
-  grn_tokenizer_token_fin(ctx, &(tokenizer->token));
   GRN_FREE(tokenizer);
 }
 
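
The implementation above scans character by character with grn_charlen_() and
cuts the token at the first GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8 (U+FFFE).
As a walk-through (not from the commit; it assumes a valid grn_ctx *ctx and
grn_token *token supplied by the surrounding tokenizer machinery), driving the
helper in a loop over a pre-tokenized string looks like this:

    #include <string.h>
    #include <groonga/tokenizer.h>

    /* "Hello" U+FFFE "World": two pre-tokenized tokens in one string. */
    const char *input =
      "Hello" GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8 "World";
    const char *end = input + strlen(input);
    const char *rest = input;

    while (rest) {
      /* 1st call: token = "Hello", status = GRN_TOKEN_CONTINUE.
         2nd call: token = "World", status = GRN_TOKEN_LAST and the
         return value is NULL, which ends the loop. */
      rest = grn_tokenizer_next_by_tokenized_delimiter(
        ctx,
        token,
        rest,
        (unsigned int)(end - rest),
        GRN_ENC_UTF8);
    }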

  Modified: plugins/tokenizers/mecab.c (+5 -16)
===================================================================
--- plugins/tokenizers/mecab.c    2018-11-13 10:53:33 +0900 (91a080fbf)
+++ plugins/tokenizers/mecab.c    2018-11-13 11:05:43 +0900 (9d41ebf15)
@@ -1159,23 +1159,12 @@ mecab_next(grn_ctx *ctx,
 
   if (grn_tokenizer_query_have_tokenized_delimiter(ctx, tokenizer->query)) {
     grn_encoding encoding = tokenizer->query->encoding;
-    grn_tokenizer_token tokenizer_token;
-    grn_tokenizer_token_init(ctx, &tokenizer_token);
-    /* TODO: Need grn_token version. */
     tokenizer->next =
-      grn_tokenizer_tokenized_delimiter_next(ctx,
-                                             &tokenizer_token,
-                                             tokenizer->next,
-                                             tokenizer->end - tokenizer->next,
-                                             encoding);
-    grn_token_set_data(ctx,
-                       token,
-                       GRN_TEXT_VALUE(&(tokenizer_token.str)),
-                       GRN_TEXT_LEN(&(tokenizer_token.str)));
-    grn_token_set_status(ctx,
-                         token,
-                         GRN_UINT32_VALUE(&(tokenizer_token.status)));
-    grn_tokenizer_token_fin(ctx, &tokenizer_token);
+      grn_tokenizer_next_by_tokenized_delimiter(ctx,
+                                                token,
+                                                tokenizer->next,
+                                                tokenizer->end - tokenizer->next,
+                                                encoding);
   } else if (mecab_tokenizer_options_need_default_output(ctx, tokenizer->options)) {
     mecab_next_default_format(ctx, tokenizer, token);
   } else {
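
For plugin authors following the deprecation note in tokenizer.h, registering
a tokenizer through the newer API would look roughly like the sketch below.
This is illustrative only: "MyTokenizer" and the my_* callbacks (my_next() as
sketched earlier) are made-up names, and the exact signatures of
grn_tokenizer_create() and grn_tokenizer_set_*_func() should be checked
against groonga/tokenizer.h.

    #include <groonga/tokenizer.h>

    /* my_init()/my_next()/my_fin() stand for the hypothetical
       callbacks sketched above; my_next() delegates to
       grn_tokenizer_next_by_tokenized_delimiter(). */
    grn_rc
    GRN_PLUGIN_REGISTER(grn_ctx *ctx)
    {
      grn_obj *tokenizer = grn_tokenizer_create(ctx, "MyTokenizer", -1);
      if (!tokenizer) {
        return ctx->rc;
      }
      grn_tokenizer_set_init_func(ctx, tokenizer, my_init);
      grn_tokenizer_set_next_func(ctx, tokenizer, my_next);
      grn_tokenizer_set_fin_func(ctx, tokenizer, my_fin);
      return ctx->rc;
    }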