[Groonga-commit] groonga/groonga at 818b0f9 [master] TokenMecab: add include_form option

Back to archive index

Kouhei Sutou null+****@clear*****
Mon Sep 10 17:22:12 JST 2018


Kouhei Sutou	2018-09-10 17:22:12 +0900 (Mon, 10 Sep 2018)

  Revision: 818b0f9c2250b18ff20a29092b4ddc40d8586f19
  https://github.com/groonga/groonga/commit/818b0f9c2250b18ff20a29092b4ddc40d8586f19

  Message:
    TokenMecab: add include_form option
    
    It adds base form, inflected form and inflected type to tokens.

  Added files:
    test/command/suite/tokenizers/mecab/options/include_form.expected
    test/command/suite/tokenizers/mecab/options/include_form.test
  Modified files:
    plugins/tokenizers/mecab.c

  Modified: plugins/tokenizers/mecab.c (+22 -0)
===================================================================
--- plugins/tokenizers/mecab.c    2018-09-10 16:51:44 +0900 (0b51370e8)
+++ plugins/tokenizers/mecab.c    2018-09-10 17:22:12 +0900 (42e0c5177)
@@ -51,6 +51,7 @@ typedef struct {
   int32_t chunk_size_threshold;
   grn_bool include_class;
   grn_bool include_reading;
+  grn_bool include_form;
 } grn_mecab_tokenizer_options;
 
 typedef struct {
@@ -144,6 +145,7 @@ mecab_tokenizer_options_init(grn_mecab_tokenizer_options *options)
   options->chunk_size_threshold = grn_mecab_chunk_size_threshold;
   options->include_class = GRN_FALSE;
   options->include_reading = GRN_FALSE;
+  options->include_form = GRN_FALSE;
 }
 
 static grn_bool
@@ -161,6 +163,10 @@ mecab_tokenizer_options_need_default_output(grn_mecab_tokenizer_options *options
     return GRN_TRUE;
   }
 
+  if (options->include_form) {
+    return GRN_TRUE;
+  }
+
   return GRN_FALSE;
 }
 
@@ -212,6 +218,12 @@ mecab_tokenizer_options_open(grn_ctx *ctx,
                                     raw_options,
                                     i,
                                     options->include_reading);
+    } else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "include_form")) {
+      options->include_form =
+        grn_vector_get_element_bool(ctx,
+                                    raw_options,
+                                    i,
+                                    options->include_form);
     }
   } GRN_OPTION_VALUES_EACH_END();
 
@@ -825,6 +837,16 @@ mecab_next_default_format(grn_ctx *ctx,
     data.ignore_asterisk_value = GRN_FALSE;
     mecab_next_default_format_add_feature(ctx, &data, "reading", 7);
   }
+  if (tokenizer->options->include_form) {
+    add_feature_data data;
+    data.token = token;
+    data.features = &features;
+    data.ignore_empty_value = GRN_TRUE;
+    data.ignore_asterisk_value = GRN_TRUE;
+    mecab_next_default_format_add_feature(ctx, &data, "inflected_type", 4);
+    mecab_next_default_format_add_feature(ctx, &data, "inflected_form", 5);
+    mecab_next_default_format_add_feature(ctx, &data, "base_form", 6);
+  }
   GRN_OBJ_FIN(ctx, &features);
 }
 

  Added: test/command/suite/tokenizers/mecab/options/include_form.expected (+40 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/mecab/options/include_form.expected    2018-09-10 17:22:12 +0900 (36432399d)
@@ -0,0 +1,40 @@
+tokenize   'TokenMecab("include_form", true)'   '行きました'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "行き",
+      "position": 0,
+      "force_prefix": false,
+      "metadata": {
+        "inflected_type": "五段・カ行促音便",
+        "inflected_form": "連用形",
+        "base_form": "行く"
+      }
+    },
+    {
+      "value": "まし",
+      "position": 1,
+      "force_prefix": false,
+      "metadata": {
+        "inflected_type": "特殊・マス",
+        "inflected_form": "連用形",
+        "base_form": "ます"
+      }
+    },
+    {
+      "value": "た",
+      "position": 2,
+      "force_prefix": false,
+      "metadata": {
+        "inflected_type": "特殊・タ",
+        "inflected_form": "基本形",
+        "base_form": "た"
+      }
+    }
+  ]
+]

  Added: test/command/suite/tokenizers/mecab/options/include_form.test (+5 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/mecab/options/include_form.test    2018-09-10 17:22:12 +0900 (c954e9889)
@@ -0,0 +1,5 @@
+#@on-error omit
+tokenize \
+  'TokenMecab("include_form", true)' \
+  '行きました'
+#@on-error default
-------------- next part --------------
HTMLの添付ファイルを保管しました...
URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180910/deb56804/attachment-0001.htm 



More information about the Groonga-commit mailing list
Back to archive index