Kouhei Sutou	2018-11-26 10:15:29 +0900 (Mon, 26 Nov 2018)

  Revision: addd89465dc508a52f6f8a2e171124d0cf21351b
  https://github.com/groonga/groonga/commit/addd89465dc508a52f6f8a2e171124d0cf21351b

  Message:
    TokenDelimit: add pattern option

  Added files:
    test/command/suite/tokenizers/delimit/options/pattern/no_match.expected
    test/command/suite/tokenizers/delimit/options/pattern/no_match.test
    test/command/suite/tokenizers/delimit/options/pattern/sentences.expected
    test/command/suite/tokenizers/delimit/options/pattern/sentences.test
  Modified files:
    lib/tokenizers.c

  Modified: lib/tokenizers.c (+70 -2)
===================================================================
--- lib/tokenizers.c    2018-11-22 18:42:17 +0900 (3b3ad3307)
+++ lib/tokenizers.c    2018-11-26 10:15:29 +0900 (cb4478d44)
@@ -22,6 +22,7 @@
 #include "grn_token_cursor.h"
 #include "grn_string.h"
 #include "grn_plugin.h"
+#include "grn_onigmo.h"
 
 grn_obj *grn_tokenizer_uvector = NULL;
 
@@ -102,6 +103,9 @@ uvector_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
 
 typedef struct {
   grn_obj delimiters;
+#ifdef GRN_SUPPORT_REGEXP
+  OnigRegex regex;
+#endif /* GRN_SUPPORT_REGEXP */
 } grn_delimit_options;
 
 typedef struct {
@@ -114,6 +118,7 @@ typedef struct {
   grn_delimit_options *options;
   grn_bool have_tokenized_delimiter;
   grn_encoding encoding;
+  const unsigned char *start;
   const unsigned char *next;
   const unsigned char *end;
 } grn_delimit_tokenizer;
@@ -122,6 +127,9 @@ static void
 delimit_options_init(grn_delimit_options *options)
 {
   GRN_TEXT_INIT(&(options->delimiters), GRN_OBJ_VECTOR);
+#ifdef GRN_SUPPORT_REGEXP
+  options->regex = NULL;
+#endif /* GRN_SUPPORT_REGEXP */
 }
 
 static void *
@@ -169,6 +177,30 @@ delimit_open_options(grn_ctx *ctx,
                                0,
                                GRN_DB_TEXT);
       }
+    } else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "pattern")) {
+#ifdef GRN_SUPPORT_REGEXP
+      const char *pattern;
+      unsigned int pattern_length;
+      grn_id domain;
+
+      pattern_length = grn_vector_get_element(ctx,
+                                              raw_options,
+                                              i,
+                                              &pattern,
+                                              NULL,
+                                              &domain);
+      if (grn_type_id_is_text_family(ctx, domain) && pattern_length > 0) {
+        if (options->regex) {
+          onig_free(options->regex);
+        }
+        options->regex = grn_onigmo_new(ctx,
+                                        pattern,
+                                        pattern_length,
+                                        GRN_ONIGMO_OPTION_DEFAULT,
+                                        GRN_ONIGMO_SYNTAX_DEFAULT,
+                                        "[tokenizer][delimit]");
+      }
+#endif /* GRN_SUPPORT_REGEXP */
     }
   }
   GRN_OPTION_VALUES_EACH_END();
@@ -188,7 +220,13 @@ static void
 delimit_close_options(grn_ctx *ctx, void *data)
 {
   grn_delimit_options *options = data;
+
   GRN_OBJ_FIN(ctx, &(options->delimiters));
+#ifdef GRN_SUPPORT_REGEXP
+  if (options->regex) {
+    onig_free(options->regex);
+  }
+#endif /* GRN_SUPPORT_REGEXP */
   GRN_FREE(options);
 }
 
@@ -235,8 +273,9 @@ delimit_init_raw(grn_ctx *ctx,
                                     string,
                                     &normalized, &normalized_length_in_bytes,
                                     NULL);
-    tokenizer->next = (const unsigned char *)normalized;
-    tokenizer->end = tokenizer->next + normalized_length_in_bytes;
+    tokenizer->start = (const unsigned char *)normalized;
+    tokenizer->next = tokenizer->start;
+    tokenizer->end = tokenizer->start + normalized_length_in_bytes;
   }
 
   return tokenizer;
@@ -282,6 +321,35 @@ delimit_next(grn_ctx *ctx,
                        (const char *)tokenizer->next,
                        rest_length,
                        tokenizer->encoding);
+#ifdef GRN_SUPPORT_REGEXP
+  } else if (tokenizer->options->regex) {
+    OnigPosition position;
+    OnigRegion region;
+
+    onig_region_init(&region);
+    position = onig_search(tokenizer->options->regex,
+                           tokenizer->start,
+                           tokenizer->end,
+                           tokenizer->next,
+                           tokenizer->end,
+                           &region,
+                           ONIG_OPTION_NONE);
+    if (position == ONIG_MISMATCH) {
+      grn_token_set_data(ctx,
+                         token,
+                         NULL,
+                         0);
+      grn_token_set_status(ctx, token, GRN_TOKEN_LAST);
+    } else {
+      grn_token_set_data(ctx,
+                         token,
+                         tokenizer->start + region.beg[0],
+                         region.end[0] - region.beg[0]);
+      grn_token_set_status(ctx, token, GRN_TOKEN_CONTINUE);
+      tokenizer->next = tokenizer->start + region.end[0];
+      onig_region_free(&region, 0);
+    }
+#endif /* GRN_SUPPORT_REGEXP */
   } else {
     size_t cl;
     const unsigned char *p = tokenizer->next, *r;

  Added: test/command/suite/tokenizers/delimit/options/pattern/no_match.expected (+2 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/delimit/options/pattern/no_match.expected    2018-11-26 10:15:29 +0900 (70e6dbf63)
@@ -0,0 +1,2 @@
+tokenize 'TokenDelimit("pattern", "nonexistent")' "Hello"
+[[0,0.0,0.0],[]]

  Added: test/command/suite/tokenizers/delimit/options/pattern/no_match.test (+3 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/delimit/options/pattern/no_match.test    2018-11-26 10:15:29 +0900 (ffe5281d7)
@@ -0,0 +1,3 @@
+tokenize \
+  'TokenDelimit("pattern", "nonexistent")' \
+  "Hello"

  Added: test/command/suite/tokenizers/delimit/options/pattern/sentences.expected (+40 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/delimit/options/pattern/sentences.expected    2018-11-26 10:15:29 +0900 (08ac7c822)
@@ -0,0 +1,40 @@
+tokenize 'TokenDelimit("pattern", "[^\\\\s].*?[.。]")' "りんごです。ペンです。This is an apple. Mr. X."
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "りんごです。",
+      "position": 0,
+      "force_prefix": false,
+      "force_prefix_search": false
+    },
+    {
+      "value": "ペンです。",
+      "position": 1,
+      "force_prefix": false,
+      "force_prefix_search": false
+    },
+    {
+      "value": "This is an apple.",
+      "position": 2,
+      "force_prefix": false,
+      "force_prefix_search": false
+    },
+    {
+      "value": "Mr.",
+      "position": 3,
+      "force_prefix": false,
+      "force_prefix_search": false
+    },
+    {
+      "value": "X.",
+      "position": 4,
+      "force_prefix": false,
+      "force_prefix_search": false
+    }
+  ]
+]

  Added: test/command/suite/tokenizers/delimit/options/pattern/sentences.test (+3 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/delimit/options/pattern/sentences.test    2018-11-26 10:15:29 +0900 (8396e3284)
@@ -0,0 +1,3 @@
+tokenize \
+  'TokenDelimit("pattern", "[^\\\\s].*?[.。]")' \
+  "りんごです。ペンです。This is an apple. Mr. X."
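A note on the new code path: delimit_next() now scans the normalized text by
calling onig_search() repeatedly, advancing tokenizer->next past each match,
and every match becomes one token. The following standalone sketch reproduces
that loop with plain Oniguruma/Onigmo calls. It is not part of this commit:
the pattern, the sample text, the use of onig_new() instead of Groonga's
grn_onigmo_new() wrapper, and the build line are all illustrative assumptions.

/* A minimal sketch (not Groonga code) of the onig_search() loop that
 * delimit_next() uses above.  Assumes an installed Onigmo/Oniguruma;
 * build with something like: cc -o sketch sketch.c -lonig */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <oniguruma.h>

int
main(void)
{
  /* Hypothetical pattern and input, mirroring the sentences.test case. */
  const OnigUChar *pattern = (const OnigUChar *)"[^\\s].*?\\.";
  const OnigUChar *text = (const OnigUChar *)"This is an apple. Mr. X.";
  const OnigUChar *end = text + strlen((const char *)text);
  const OnigUChar *next = text;  /* plays the role of tokenizer->next */
  regex_t *regex = NULL;
  OnigErrorInfo error_info;
  OnigRegion *region;

  if (onig_new(&regex,
               pattern,
               pattern + strlen((const char *)pattern),
               ONIG_OPTION_DEFAULT,
               ONIG_ENCODING_UTF8,
               ONIG_SYNTAX_DEFAULT,
               &error_info) != ONIG_NORMAL) {
    return EXIT_FAILURE;
  }

  region = onig_region_new();
  for (;;) {
    /* Search in [next, end) but pass the true string start so that
     * anchors and look-behind still see the whole text. */
    OnigPosition position = onig_search(regex,
                                        text, end,   /* whole string */
                                        next, end,   /* search window */
                                        region,
                                        ONIG_OPTION_NONE);
    if (position == ONIG_MISMATCH) {
      break;  /* the tokenizer reports GRN_TOKEN_LAST here */
    }
    /* region->beg[0]/end[0] are byte offsets of the whole match. */
    printf("token: <%.*s>\n",
           (int)(region->end[0] - region->beg[0]),
           text + region->beg[0]);
    next = text + region->end[0];  /* resume right after the match */
  }

  onig_region_free(region, 1);
  onig_free(regex);
  onig_end();
  return EXIT_SUCCESS;
}

Passing tokenizer->start (here, text) as onig_search()'s string origin while
starting the search window at tokenizer->next appears to be why the commit
adds the start member: the engine can then still evaluate anchors and
look-behind against the full text even after tokenization has consumed a
prefix of it.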