[Groonga-commit] groonga/groonga at 11457ea [master] NormalizerNFKC100: Add unify_katakana_bu_sound option

Back to archive index

Kouhei Sutou null+****@clear*****
Mon May 14 17:21:47 JST 2018


Kouhei Sutou	2018-05-14 17:21:47 +0900 (Mon, 14 May 2018)

  New Revision: 11457ea74e4fba1521f826bffd97db04ec0018d9
  https://github.com/groonga/groonga/commit/11457ea74e4fba1521f826bffd97db04ec0018d9

  Message:
    NormalizerNFKC100: Add unify_katakana_bu_sound option

  Added files:
    test/command/suite/normalizers/nfkc100/unify_katakana_bu_sound.expected
    test/command/suite/normalizers/nfkc100/unify_katakana_bu_sound.test
  Modified files:
    lib/normalizer.c

  Modified: lib/normalizer.c (+58 -0)
===================================================================
--- lib/normalizer.c    2018-05-14 16:42:51 +0900 (625c3608a)
+++ lib/normalizer.c    2018-05-14 17:21:47 +0900 (c9555446c)
@@ -628,6 +628,7 @@ typedef struct {
   grn_bool unify_hyphen_and_prolonged_sound_mark;
   grn_bool unify_middle_dot;
   grn_bool unify_katakana_v_sounds;
+  grn_bool unify_katakana_bu_sound;
 } grn_utf8_normalize_options;
 
 static void
@@ -647,6 +648,7 @@ utf8_normalize_options_init(grn_utf8_normalize_options *options,
   options->unify_hyphen_and_prolonged_sound_mark = GRN_FALSE;
   options->unify_middle_dot = GRN_FALSE;
   options->unify_katakana_v_sounds = GRN_FALSE;
+  options->unify_katakana_bu_sound = GRN_FALSE;
 }
 
 grn_inline static const unsigned char *
@@ -1034,6 +1036,47 @@ utf8_normalize_unify_katakana_v_sounds(const unsigned char *utf8_char,
   return GRN_FALSE;
 }
 
+grn_inline static grn_bool
+utf8_normalize_unify_katakana_bu_sound(const unsigned char *utf8_char,
+                                       size_t length,
+                                       unsigned char *previous_normalized,
+                                       unsigned char *normalized)
+{
+  if (!previous_normalized) {
+    return GRN_FALSE;
+  }
+
+  {
+    size_t previous_length = normalized - previous_normalized;
+
+    /* U+30F4 KATAKANA LETTER VU */
+    if (previous_length == 3 &&
+        previous_normalized[0] == 0xe3 &&
+        previous_normalized[1] == 0x83 &&
+        previous_normalized[2] == 0xb4) {
+      /* U+30D6 KATAKANA LETTER BU */
+      previous_normalized[2] = 0x96;
+      if (length == 3 &&
+          utf8_char[0] == 0xe3 &&
+          utf8_char[1] == 0x82 &&
+          /* U+30A1 KATAKANA LETTER SMALL A */
+          /* U+30A3 KATAKANA LETTER SMALL I */
+          /* U+30A5 KATAKANA LETTER SMALL U */
+          /* U+30A7 KATAKANA LETTER SMALL E */
+          /* U+30A9 KATAKANA LETTER SMALL O */
+          (utf8_char[2] == 0xa1 ||
+           utf8_char[2] == 0xa3 ||
+           utf8_char[2] == 0xa5 ||
+           utf8_char[2] == 0xa7 ||
+           utf8_char[2] == 0xa9)) {
+        return GRN_TRUE;
+      }
+    }
+  }
+
+  return GRN_FALSE;
+}
+
 grn_inline static grn_obj *
 utf8_normalize(grn_ctx *ctx,
                grn_string *nstr,
@@ -1253,6 +1296,12 @@ utf8_normalize(grn_ctx *ctx,
             }
           }
 
+          if (options->unify_katakana_bu_sound) {
+            if (utf8_normalize_unify_katakana_bu_sound(p, lp, d_, d)) {
+              lp = 0;
+            }
+          }
+
           grn_memcpy(d, p, lp);
           p = p_original;
         }
@@ -1281,6 +1330,9 @@ utf8_normalize(grn_ctx *ctx,
   if (options->unify_katakana_v_sounds) {
     utf8_normalize_unify_katakana_v_sounds(NULL, 0, d_, d);
   }
+  if (options->unify_katakana_bu_sound) {
+    utf8_normalize_unify_katakana_bu_sound(NULL, 0, d_, d);
+  }
   *d = '\0';
   nstr->n_characters = length;
   nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized);
@@ -1777,6 +1829,12 @@ nfkc100_open_options(grn_ctx *ctx,
                                     raw_options,
                                     i,
                                     options->unify_katakana_v_sounds);
+    } else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "unify_katakana_bu_sound")) {
+      options->unify_katakana_bu_sound =
+        grn_vector_get_element_bool(ctx,
+                                    raw_options,
+                                    i,
+                                    options->unify_katakana_bu_sound);
     }
   } GRN_OPTION_VALUES_EACH_END();
 

  Added: test/command/suite/normalizers/nfkc100/unify_katakana_bu_sound.expected (+22 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/normalizers/nfkc100/unify_katakana_bu_sound.expected    2018-05-14 17:21:47 +0900 (ac3a7f38a)
@@ -0,0 +1,22 @@
+normalize   'NormalizerNFKC100("unify_katakana_bu_sound", true)'   "ヴァヴィヴヴェヴォヴ"   WITH_TYPES
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  {
+    "normalized": "ブブブブブブ",
+    "types": [
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana"
+    ],
+    "checks": [
+
+    ]
+  }
+]

  Added: test/command/suite/normalizers/nfkc100/unify_katakana_bu_sound.test (+4 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/normalizers/nfkc100/unify_katakana_bu_sound.test    2018-05-14 17:21:47 +0900 (dcab4ee28)
@@ -0,0 +1,4 @@
+normalize \
+  'NormalizerNFKC100("unify_katakana_bu_sound", true)' \
+  "ヴァヴィヴヴェヴォヴ" \
+  WITH_TYPES
-------------- next part --------------
HTMLの添付ファイルを保管しました...
URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180514/48f58f28/attachment-0001.htm 



More information about the Groonga-commit mailing list
Back to archive index