[Groonga-commit] groonga/groonga at 2033d06 [master] Add grn_tokenizer_next_by_tokenized_delimiter()

Back to archive index
Kouhei Sutou null+****@clear*****
Tue Nov 13 11:05:43 JST 2018


Kouhei Sutou	2018-11-13 11:05:43 +0900 (Tue, 13 Nov 2018)

  Revision: 2033d06c6aabb61e7136ff279477835f3827c00e
  https://github.com/groonga/groonga/commit/2033d06c6aabb61e7136ff279477835f3827c00e

  Message:
    Add grn_tokenizer_next_by_tokenized_delimiter()
    
    It's for the new tokenizer next API, grn_tokenizer_next_func.

  Modified files:
    include/groonga/tokenizer.h
    lib/tokenizer.c
    lib/tokenizers.c
    plugins/tokenizers/mecab.c

  Modified: include/groonga/tokenizer.h (+20 -1)
===================================================================
--- include/groonga/tokenizer.h    2018-11-13 10:53:33 +0900 (61468ef59)
+++ include/groonga/tokenizer.h    2018-11-13 11:05:43 +0900 (693d8686a)
@@ -254,6 +254,10 @@ GRN_PLUGIN_EXPORT void grn_tokenizer_token_push(grn_ctx *ctx, grn_tokenizer_toke
   the next token into `token'. It returns the string after the next
   token. The returned string may be `NULL' when all tokens are
   extracted.
+
+  @deprecated since 8.0.9. It's for old tokenizer next API. Use
+  grn_tokenizer_next_by_tokenized_delimiter() for new tokenizer next
+  API (grn_tokenizer_next_func).
  */
 GRN_PLUGIN_EXPORT const char *grn_tokenizer_tokenized_delimiter_next(grn_ctx *ctx,
                                                                      grn_tokenizer_token *token,
@@ -262,6 +266,21 @@ GRN_PLUGIN_EXPORT const char *grn_tokenizer_tokenized_delimiter_next(grn_ctx *ct
                                                                      grn_encoding encoding);
 
 /*
+  Extract the next token by delimiting by
+  GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8.
+
+  This is for grn_tokenizer_next_func.
+
+  @since 8.0.9.
+ */
+GRN_PLUGIN_EXPORT const char *
+grn_tokenizer_next_by_tokenized_delimiter(grn_ctx *ctx,
+                                          grn_token *token,
+                                          const char *str_ptr,
+                                          unsigned int str_length,
+                                          grn_encoding encoding);
+
+/*
   grn_tokenizer_register() registers a plugin to the database which is
   associated with `ctx'. `plugin_name_ptr' and `plugin_name_length' specify the
   plugin name. Alphabetic letters ('A'-'Z' and 'a'-'z'), digits ('0'-'9') and
@@ -273,7 +292,7 @@ GRN_PLUGIN_EXPORT const char *grn_tokenizer_tokenized_delimiter_next(grn_ctx *ct
   details of grn_proc_func and grn_user_data, that is used as an argument of
   grn_proc_func.
 
-  Deprecated since 8.0.2. Use grn_tokenizer_create() and
+  @deprecated since 8.0.2. Use grn_tokenizer_create() and
   grn_tokenizer_set_*_func().
  */
 GRN_PLUGIN_EXPORT grn_rc

  Modified: lib/tokenizer.c (+35 -0)
===================================================================
--- lib/tokenizer.c    2018-11-13 10:53:33 +0900 (3cefaa4a5)
+++ lib/tokenizer.c    2018-11-13 11:05:43 +0900 (e4198b9e3)
@@ -510,6 +510,41 @@ grn_tokenizer_tokenized_delimiter_next(grn_ctx *ctx,
   return next_start;
 }
 
+const char *
+grn_tokenizer_next_by_tokenized_delimiter(grn_ctx *ctx,
+                                          grn_token *token,
+                                          const char *str_ptr,
+                                          unsigned int str_length,
+                                          grn_encoding encoding)
+{
+  size_t char_length = 0;
+  const char *start = str_ptr;
+  const char *current;
+  const char *end = str_ptr + str_length;
+  const char *next_start = NULL;
+
+  for (current = start; current < end; current += char_length) {
+    char_length = grn_charlen_(ctx, current, end, encoding);
+    if (char_length == 0) {
+      break;
+    }
+    if (grn_tokenizer_is_tokenized_delimiter(ctx, current, char_length,
+                                             encoding)) {
+      next_start = str_ptr + (current - start + char_length);
+      break;
+    }
+  }
+
+  grn_token_set_data(ctx, token, start, current - start);
+  if (current == end) {
+    grn_token_set_status(ctx, token, GRN_TOKEN_LAST);
+  } else {
+    grn_token_set_status(ctx, token, GRN_TOKEN_CONTINUE);
+  }
+
+  return next_start;
+}
+
 grn_rc
 grn_tokenizer_register(grn_ctx *ctx, const char *plugin_name_ptr,
                        unsigned int plugin_name_length,

  Modified: lib/tokenizers.c (+2 -14)
===================================================================
--- lib/tokenizers.c    2018-11-13 10:53:33 +0900 (2f599c3e9)
+++ lib/tokenizers.c    2018-11-13 11:05:43 +0900 (3b3ad3307)
@@ -110,7 +110,6 @@ typedef struct {
 } grn_delimit_options_default;
 
 typedef struct {
-  grn_tokenizer_token token;
   grn_tokenizer_query *query;
   grn_delimit_options *options;
   grn_bool have_tokenized_delimiter;
@@ -207,7 +206,6 @@ delimit_init_raw(grn_ctx *ctx,
     return NULL;
   }
 
-  grn_tokenizer_token_init(ctx, &(tokenizer->token));
   tokenizer->query = query;
   tokenizer->options = options;
 
@@ -276,23 +274,14 @@ delimit_next(grn_ctx *ctx,
 
   if (tokenizer->have_tokenized_delimiter) {
     unsigned int rest_length;
-    grn_obj *status;
-    grn_obj *data;
     rest_length = tokenizer->end - tokenizer->next;
     tokenizer->next =
-      (unsigned char *)grn_tokenizer_tokenized_delimiter_next(
+      (unsigned char *)grn_tokenizer_next_by_tokenized_delimiter(
         ctx,
-        &(tokenizer->token),
+        token,
         (const char *)tokenizer->next,
         rest_length,
         tokenizer->encoding);
-    status = grn_ctx_pop(ctx);
-    data = grn_ctx_pop(ctx);
-    grn_token_set_data(ctx,
-                       token,
-                       GRN_TEXT_VALUE(data),
-                       GRN_TEXT_LEN(data));
-    grn_token_set_status(ctx, token, GRN_UINT32_VALUE(status));
   } else {
     size_t cl;
     const unsigned char *p = tokenizer->next, *r;
@@ -359,7 +348,6 @@ delimit_fin(grn_ctx *ctx, void *user_data)
   if (!tokenizer) {
     return;
   }
-  grn_tokenizer_token_fin(ctx, &(tokenizer->token));
   GRN_FREE(tokenizer);
 }
 

  Modified: plugins/tokenizers/mecab.c (+5 -16)
===================================================================
--- plugins/tokenizers/mecab.c    2018-11-13 10:53:33 +0900 (91a080fbf)
+++ plugins/tokenizers/mecab.c    2018-11-13 11:05:43 +0900 (9d41ebf15)
@@ -1159,23 +1159,12 @@ mecab_next(grn_ctx *ctx,
 
   if (grn_tokenizer_query_have_tokenized_delimiter(ctx, tokenizer->query)) {
     grn_encoding encoding = tokenizer->query->encoding;
-    grn_tokenizer_token tokenizer_token;
-    grn_tokenizer_token_init(ctx, &tokenizer_token);
-    /* TODO: Need grn_token version. */
     tokenizer->next =
-      grn_tokenizer_tokenized_delimiter_next(ctx,
-                                             &tokenizer_token,
-                                             tokenizer->next,
-                                             tokenizer->end - tokenizer->next,
-                                             encoding);
-    grn_token_set_data(ctx,
-                       token,
-                       GRN_TEXT_VALUE(&(tokenizer_token.str)),
-                       GRN_TEXT_LEN(&(tokenizer_token.str)));
-    grn_token_set_status(ctx,
-                         token,
-                         GRN_UINT32_VALUE(&(tokenizer_token.status)));
-    grn_tokenizer_token_fin(ctx, &tokenizer_token);
+      grn_tokenizer_next_by_tokenized_delimiter(ctx,
+                                                token,
+                                                tokenizer->next,
+                                                tokenizer->end - tokenizer->next,
+                                                encoding);
   } else if (mecab_tokenizer_options_need_default_output(ctx, tokenizer->options)) {
     mecab_next_default_format(ctx, tokenizer, token);
   } else {
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20181113/f33b1d65/attachment-0001.html>


More information about the Groonga-commit mailing list
Back to archive index