[Groonga-commit] groonga/groonga at 8b87c84 [master] TokenPattern: add support for multiple "pattern"s

Kouhei Sutou null+****@clear*****
Fri Feb 1 14:46:38 JST 2019


Kouhei Sutou	2019-02-01 14:46:38 +0900 (Fri, 01 Feb 2019)

  Revision: 8b87c840b6a8f1c8e1faa5f50c8314628ad0085b
  https://github.com/groonga/groonga/commit/8b87c840b6a8f1c8e1faa5f50c8314628ad0085b

  Message:
    TokenPattern: add support for multiple "pattern"s

  Added files:
    test/command/suite/tokenizers/pattern/multiple.expected
    test/command/suite/tokenizers/pattern/multiple.test
  Modified files:
    lib/tokenizers.c

  Modified: lib/tokenizers.c (+17 -8)
===================================================================
--- lib/tokenizers.c    2019-02-01 14:31:23 +0900 (920141519)
+++ lib/tokenizers.c    2019-02-01 14:46:38 +0900 (d10118356)
@@ -1576,6 +1576,7 @@ pattern_open_options(grn_ctx *ctx,
                      void *user_data)
 {
   grn_pattern_options *options;
+  grn_obj all_patterns;
 
   options = GRN_MALLOC(sizeof(grn_pattern_options));
   if (!options) {
@@ -1586,6 +1587,7 @@ pattern_open_options(grn_ctx *ctx,
   }
 
   pattern_options_init(options);
+  GRN_TEXT_INIT(&all_patterns, 0);
   GRN_OPTION_VALUES_EACH_BEGIN(ctx, raw_options, i, name, name_length) {
     grn_raw_string name_raw;
     name_raw.value = name;
@@ -1604,20 +1606,27 @@ pattern_open_options(grn_ctx *ctx,
                                               NULL,
                                               &domain);
       if (grn_type_id_is_text_family(ctx, domain) && pattern_length > 0) {
-        if (options->regex) {
-          onig_free(options->regex);
+        if (GRN_TEXT_LEN(&all_patterns) > 0) {
+          GRN_TEXT_PUTS(ctx, &all_patterns, "|");
         }
-        options->regex = grn_onigmo_new(ctx,
-                                        pattern,
-                                        pattern_length,
-                                        GRN_ONIGMO_OPTION_DEFAULT,
-                                        GRN_ONIGMO_SYNTAX_DEFAULT,
-                                        "[tokenizer][delimit]");
+        GRN_TEXT_PUTS(ctx, &all_patterns, "(?:");
+        GRN_TEXT_PUT(ctx, &all_patterns, pattern, pattern_length);
+        GRN_TEXT_PUTS(ctx, &all_patterns, ")");
       }
 #endif /* GRN_SUPPORT_REGEXP */
     }
   } GRN_OPTION_VALUES_EACH_END();
 
+  if (GRN_TEXT_LEN(&all_patterns) > 0) {
+    options->regex = grn_onigmo_new(ctx,
+                                    GRN_TEXT_VALUE(&all_patterns),
+                                    GRN_TEXT_LEN(&all_patterns),
+                                    GRN_ONIGMO_OPTION_DEFAULT,
+                                    GRN_ONIGMO_SYNTAX_DEFAULT,
+                                    "[tokenizer][pattern]");
+  }
+  GRN_OBJ_FIN(ctx, &all_patterns);
+
   return options;
 }
 

  Added: test/command/suite/tokenizers/pattern/multiple.expected (+40 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/pattern/multiple.expected    2019-02-01 14:46:38 +0900 (a36f8d4ac)
@@ -0,0 +1,40 @@
+tokenize   'TokenPattern("pattern", "\\\\d+円",                 "pattern", "りんご|みかん")'   "私は100円のりんごと50円のみかんを129円で買いました。"
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "100円",
+      "position": 0,
+      "force_prefix": false,
+      "force_prefix_search": false
+    },
+    {
+      "value": "りんご",
+      "position": 1,
+      "force_prefix": false,
+      "force_prefix_search": false
+    },
+    {
+      "value": "50円",
+      "position": 2,
+      "force_prefix": false,
+      "force_prefix_search": false
+    },
+    {
+      "value": "みかん",
+      "position": 3,
+      "force_prefix": false,
+      "force_prefix_search": false
+    },
+    {
+      "value": "129円",
+      "position": 4,
+      "force_prefix": false,
+      "force_prefix_search": false
+    }
+  ]
+]

  Added: test/command/suite/tokenizers/pattern/multiple.test (+4 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/pattern/multiple.test    2019-02-01 14:46:38 +0900 (b2469fb15)
@@ -0,0 +1,4 @@
+tokenize \
+  'TokenPattern("pattern", "\\\\d+円", \
+                "pattern", "りんご|みかん")' \
+  "私は100円のりんごと50円のみかんを129円で買いました。"