Kouhei Sutou
null+****@clear*****
Mon Sep 10 17:22:12 JST 2018
Kouhei Sutou 2018-09-10 17:22:12 +0900 (Mon, 10 Sep 2018) Revision: 818b0f9c2250b18ff20a29092b4ddc40d8586f19 https://github.com/groonga/groonga/commit/818b0f9c2250b18ff20a29092b4ddc40d8586f19 Message: TokenMecab: add include_form option It adds base form, inflected form and inflected type to tokens. Added files: test/command/suite/tokenizers/mecab/options/include_form.expected test/command/suite/tokenizers/mecab/options/include_form.test Modified files: plugins/tokenizers/mecab.c Modified: plugins/tokenizers/mecab.c (+22 -0) =================================================================== --- plugins/tokenizers/mecab.c 2018-09-10 16:51:44 +0900 (0b51370e8) +++ plugins/tokenizers/mecab.c 2018-09-10 17:22:12 +0900 (42e0c5177) @@ -51,6 +51,7 @@ typedef struct { int32_t chunk_size_threshold; grn_bool include_class; grn_bool include_reading; + grn_bool include_form; } grn_mecab_tokenizer_options; typedef struct { @@ -144,6 +145,7 @@ mecab_tokenizer_options_init(grn_mecab_tokenizer_options *options) options->chunk_size_threshold = grn_mecab_chunk_size_threshold; options->include_class = GRN_FALSE; options->include_reading = GRN_FALSE; + options->include_form = GRN_FALSE; } static grn_bool @@ -161,6 +163,10 @@ mecab_tokenizer_options_need_default_output(grn_mecab_tokenizer_options *options return GRN_TRUE; } + if (options->include_form) { + return GRN_TRUE; + } + return GRN_FALSE; } @@ -212,6 +218,12 @@ mecab_tokenizer_options_open(grn_ctx *ctx, raw_options, i, options->include_reading); + } else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "include_form")) { + options->include_form = + grn_vector_get_element_bool(ctx, + raw_options, + i, + options->include_form); } } GRN_OPTION_VALUES_EACH_END(); @@ -825,6 +837,16 @@ mecab_next_default_format(grn_ctx *ctx, data.ignore_asterisk_value = GRN_FALSE; mecab_next_default_format_add_feature(ctx, &data, "reading", 7); } + if (tokenizer->options->include_form) { + add_feature_data data; + data.token = token; + data.features = 
&features; + data.ignore_empty_value = GRN_TRUE; + data.ignore_asterisk_value = GRN_TRUE; + mecab_next_default_format_add_feature(ctx, &data, "inflected_type", 4); + mecab_next_default_format_add_feature(ctx, &data, "inflected_form", 5); + mecab_next_default_format_add_feature(ctx, &data, "base_form", 6); + } GRN_OBJ_FIN(ctx, &features); } Added: test/command/suite/tokenizers/mecab/options/include_form.expected (+40 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/mecab/options/include_form.expected 2018-09-10 17:22:12 +0900 (36432399d) @@ -0,0 +1,40 @@ +tokenize 'TokenMecab("include_form", true)' '行きました' +[ + [ + 0, + 0.0, + 0.0 + ], + [ + { + "value": "行き", + "position": 0, + "force_prefix": false, + "metadata": { + "inflected_type": "五段・カ行促音便", + "inflected_form": "連用形", + "base_form": "行く" + } + }, + { + "value": "まし", + "position": 1, + "force_prefix": false, + "metadata": { + "inflected_type": "特殊・マス", + "inflected_form": "連用形", + "base_form": "ます" + } + }, + { + "value": "た", + "position": 2, + "force_prefix": false, + "metadata": { + "inflected_type": "特殊・タ", + "inflected_form": "基本形", + "base_form": "た" + } + } + ] +] Added: test/command/suite/tokenizers/mecab/options/include_form.test (+5 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/mecab/options/include_form.test 2018-09-10 17:22:12 +0900 (c954e9889) @@ -0,0 +1,5 @@ +#@on-error omit +tokenize \ + 'TokenMecab("include_form", true)' \ + '行きました' +#@on-error default -------------- next part -------------- HTMLの添付ファイルを保管しました... URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180910/deb56804/attachment-0001.htm