[Groonga-commit] groonga/groonga at 8a84411 [master] ii: use the previous buffer allocation algorithm again

Back to archive index

Kouhei Sutou null+****@clear*****
Tue Feb 28 13:33:47 JST 2017


Kouhei Sutou	2017-02-28 13:33:47 +0900 (Tue, 28 Feb 2017)

  New Revision: 8a8441144be5c4284c78db2c4561ccd6c5a84449
  https://github.com/groonga/groonga/commit/8a8441144be5c4284c78db2c4561ccd6c5a84449

  Message:
    ii: use the previous buffer allocation algorithm again
    
    The previous algorithm was used until commit 7dd19103de5df2f8c0af7ac47b9149d421a0aa5d.
    
    The previous algorithm is designed for natural language. It's suitable
    for natural language, but it doesn't perform well for some
    non-natural-language text data. That's why we introduced the new buffer
    allocation algorithm.
    
    But the new buffer allocation algorithm increases the index size for
    natural language. So we use the previous buffer allocation algorithm
    again, just for natural language text. We assume that the target text is
    natural language if the lexicon has a tokenizer.

  Modified files:
    lib/ii.c

  Modified: lib/ii.c (+59 -32)
===================================================================
--- lib/ii.c    2017-02-28 13:07:37 +0900 (eca9b0c)
+++ lib/ii.c    2017-02-28 13:33:47 +0900 (2495b22)
@@ -4045,53 +4045,80 @@ buffer_new_lexicon_pat(grn_ctx *ctx,
   key_size = grn_table_get_key(ctx, ii->lexicon, id, key,
                                GRN_TABLE_MAX_KEY_SIZE);
   if (ii->lexicon->header.flags & GRN_OBJ_KEY_VAR_SIZE) {
-    int target_key_size = key_size;
-    int reduced_key_size = 0;
-
-    while (*lseg == NOT_ASSIGNED && target_key_size > 0) {
-      grn_id tid;
+    grn_obj *tokenizer = NULL;
 
+    grn_table_get_info(ctx, ii->lexicon, NULL, NULL, &tokenizer, NULL, NULL);
+    if (tokenizer) {
+      /* For natural language */
       cursor = grn_pat_cursor_open(ctx,
                                    (grn_pat *)(ii->lexicon),
-                                   key, target_key_size,
-                                   NULL, 0, 0, -1,
-                                   GRN_CURSOR_PREFIX);
-      if (!cursor) {
-        break;
-      }
-
-      if (reduced_key_size == 0) {
+                                   key,
+                                   key_size,
+                                   NULL,
+                                   0,
+                                   0,
+                                   -1,
+                                   GRN_CURSOR_ASCENDING|GRN_CURSOR_GT);
+      if (cursor) {
+        grn_id tid;
         while (ctx->rc == GRN_SUCCESS &&
                *lseg == NOT_ASSIGNED &&
                (tid = grn_pat_cursor_next(ctx, cursor))) {
           buffer_new_find_segment(ctx, ii, size, tid, h, b, lseg, pseg);
         }
-      } else {
-        while (ctx->rc == GRN_SUCCESS &&
-               *lseg == NOT_ASSIGNED &&
-               (tid = grn_pat_cursor_next(ctx, cursor))) {
-          void *current_key;
-          int current_key_size;
+        grn_pat_cursor_close(ctx, cursor);
+      }
+    } else {
+      /* For text data */
+      int target_key_size = key_size;
+      int reduced_key_size = 0;
 
-          current_key_size = grn_pat_cursor_get_key(ctx, cursor, &current_key);
-          if (memcmp(((char *)current_key) + target_key_size,
-                     key + target_key_size,
-                     reduced_key_size) == 0) {
-            continue;
+      while (*lseg == NOT_ASSIGNED && target_key_size > 0) {
+        grn_id tid;
+
+        cursor = grn_pat_cursor_open(ctx,
+                                     (grn_pat *)(ii->lexicon),
+                                     key, target_key_size,
+                                     NULL, 0, 0, -1,
+                                     GRN_CURSOR_PREFIX);
+        if (!cursor) {
+          break;
+        }
+
+        if (reduced_key_size == 0) {
+          while (ctx->rc == GRN_SUCCESS &&
+                 *lseg == NOT_ASSIGNED &&
+                 (tid = grn_pat_cursor_next(ctx, cursor))) {
+            buffer_new_find_segment(ctx, ii, size, tid, h, b, lseg, pseg);
+          }
+        } else {
+          while (ctx->rc == GRN_SUCCESS &&
+                 *lseg == NOT_ASSIGNED &&
+                 (tid = grn_pat_cursor_next(ctx, cursor))) {
+            void *current_key;
+            int current_key_size;
+
+            current_key_size = grn_pat_cursor_get_key(ctx, cursor, &current_key);
+            if (memcmp(((char *)current_key) + target_key_size,
+                       key + target_key_size,
+                       reduced_key_size) == 0) {
+              continue;
+            }
+            buffer_new_find_segment(ctx, ii, size, tid, h, b, lseg, pseg);
           }
-          buffer_new_find_segment(ctx, ii, size, tid, h, b, lseg, pseg);
         }
-      }
-      grn_pat_cursor_close(ctx, cursor);
+        grn_pat_cursor_close(ctx, cursor);
 
-      if (reduced_key_size == 0) {
-        reduced_key_size = 1;
-      } else {
-        reduced_key_size *= 2;
+        if (reduced_key_size == 0) {
+          reduced_key_size = 1;
+        } else {
+          reduced_key_size *= 2;
+        }
+        target_key_size -= reduced_key_size;
       }
-      target_key_size -= reduced_key_size;
     }
   } else {
+    /* For other data */
     cursor = grn_pat_cursor_open(ctx,
                                  (grn_pat *)(ii->lexicon),
                                  NULL, 0, key, key_size, 0, -1,
-------------- next part --------------
HTMLの添付ファイルを保管しました...
Télécharger 



More information about the Groonga-commit mailing list
Back to archive index