groonga/groonga at 09df0e6 [master] index_column_diff: improve performance (Groonga-commit) - Groonga - fulltext search engine.

Kouhei Sutou	2019-05-15 18:40:41 +0900 (Wed, 15 May 2019)

  Revision: 09df0e6c947900b9f97fb76ea92b5cbfc69bfadc
  https://github.com/groonga/groonga/commit/09df0e6c947900b9f97fb76ea92b5cbfc69bfadc

  Message:
    index_column_diff: improve performance

  Modified files:
    lib/index_column.c

  Modified: lib/index_column.c (+308 -294)
===================================================================

--- lib/index_column.c    2019-05-13 18:02:06 +0900 (b3f9ba9ae)
+++ lib/index_column.c    2019-05-15 18:40:41 +0900 (ad432dbfa)
@@ -26,7 +26,6 @@
 
 static uint64_t grn_index_sparsity = 10;
 static grn_bool grn_index_chunk_split_enable = GRN_TRUE;
-static size_t grn_index_column_diff_cache_size_max = 256;
 
 void
 grn_index_column_init_from_env(void)
@@ -57,21 +56,6 @@ grn_index_column_init_from_env(void)
       grn_index_chunk_split_enable = GRN_TRUE;
     }
   }
-
-  {
-    char grn_index_column_diff_cache_size_max_env[GRN_ENV_BUFFER_SIZE];
-    grn_getenv("GRN_INDEX_COLUMN_DIFF_CACHE_SIZE_MAX",
-               grn_index_column_diff_cache_size_max_env,
-               GRN_ENV_BUFFER_SIZE);
-    if (grn_index_column_diff_cache_size_max_env[0]) {
-      uint64_t size;
-      errno = 0;
-      size = strtoull(grn_index_column_diff_cache_size_max_env, NULL, 0);
-      if (errno == 0 && size > 0) {
-        grn_index_column_diff_cache_size_max = size;
-      }
-    }
-  }
 }
 
 grn_inline static void
@@ -260,16 +244,16 @@ grn_index_column_rebuild(grn_ctx *ctx, grn_obj *index_column)
   GRN_API_RETURN(ctx->rc);
 }
 
-static const char *postings_column_name = "postings";
 static const char *remains_column_name = "remains";
 static const char *missings_column_name = "missings";
 
 typedef struct {
   grn_id token_id;
-  grn_id diff_id;
-  grn_obj postings;
-  size_t offset;
-} grn_index_column_diff_cache;
+  grn_ii_cursor *cursor;
+  bool need_cursor_next;
+  grn_obj look_ahead;
+  size_t look_ahead_offset;
+} grn_index_column_diff_posting_list;
 
 typedef struct {
   grn_obj *lexicon;
@@ -285,7 +269,7 @@ typedef struct {
   grn_obj *source_table;
   grn_obj source_columns;
   grn_obj *diff;
-  grn_obj *postings;
+  grn_hash *posting_lists;
   grn_obj *remains;
   grn_obj *missings;
   struct {
@@ -307,8 +291,6 @@ typedef struct {
     grn_timeval start_time;
     grn_timeval previous_time;
   } progress;
-  grn_index_column_diff_cache *cache;
-  size_t cache_size;
 } grn_index_column_diff_data;
 
 static void
@@ -318,45 +300,6 @@ grn_index_column_diff_data_init(grn_ctx *ctx,
   GRN_PTR_INIT(&(data->source_columns), GRN_OBJ_VECTOR, GRN_ID_NIL);
   GRN_VOID_INIT(&(data->buffers.value));
   GRN_UINT32_INIT(&(data->buffers.postings), GRN_OBJ_VECTOR);
-
-  data->cache = NULL;
-  data->cache_size = 0;
-}
-
-static void
-grn_index_column_diff_data_init_cache(grn_ctx *ctx,
-                                      grn_index_column_diff_data *data)
-{
-  size_t cache_size;
-  const size_t n_tokens = grn_table_size(ctx, data->lexicon);
-  if (n_tokens == 0) {
-    cache_size = 1;
-  } else if (n_tokens < grn_index_column_diff_cache_size_max) {
-    cache_size = n_tokens;
-  } else {
-    cache_size = n_tokens * 0.01;
-    if (cache_size >= grn_index_column_diff_cache_size_max) {
-      cache_size = grn_index_column_diff_cache_size_max;
-    }
-  }
-
-  data->cache = GRN_MALLOCN(grn_index_column_diff_cache, cache_size);
-  if (!data->cache) {
-    char message[GRN_CTX_MSGSIZE];
-    grn_strcpy(message, GRN_CTX_MSGSIZE, ctx->errbuf);
-    ERR(GRN_NO_MEMORY_AVAILABLE,
-        "[index-column][diff] failed to allocate cache: <%" GRN_FMT_SIZE ">: %s",
-        cache_size,
-        message);
-    return;
-  }
-  for (size_t i = 0; i < cache_size; i++) {
-    data->cache[i].token_id = GRN_ID_NIL;
-    data->cache[i].diff_id = GRN_ID_NIL;
-    GRN_UINT32_INIT(&(data->cache[i].postings), GRN_OBJ_VECTOR);
-    data->cache[i].offset = 0;
-  }
-  data->cache_size = cache_size;
 }
 
 static void
@@ -376,11 +319,6 @@ grn_index_column_diff_data_fin(grn_ctx *ctx,
 
   GRN_OBJ_FIN(ctx, &(data->buffers.value));
   GRN_OBJ_FIN(ctx, &(data->buffers.postings));
-
-  for (size_t i = 0; i < data->cache_size; i++) {
-    GRN_OBJ_FIN(ctx, &(data->cache[i].postings));
-  }
-  GRN_FREE(data->cache);
 }
 
 static void
@@ -498,171 +436,72 @@ grn_index_column_diff_progress(grn_ctx *ctx,
   }
 }
 
-static size_t
-grn_index_column_diff_cache_compute_key(grn_ctx *ctx,
-                                        grn_index_column_diff_data *data,
-                                        grn_id token_id)
-{
-  return token_id % data->cache_size;
-}
-
 static void
-grn_index_column_diff_cache_flush(grn_ctx *ctx,
-                                  grn_index_column_diff_data *data,
-                                  size_t cache_key)
+grn_index_column_diff_append_postings(grn_ctx *ctx,
+                                      grn_index_column_diff_data *data,
+                                      grn_id token_id,
+                                      grn_obj *column,
+                                      grn_obj *postings)
 {
-  const grn_id cached_token_id = data->cache[cache_key].token_id;
-  if (cached_token_id == GRN_ID_NIL) {
+  if (GRN_BULK_VSIZE(postings) == 0) {
     return;
   }
 
-  grn_obj *new_postings = &(data->buffers.postings);
-  GRN_BULK_REWIND(new_postings);
-  grn_obj *postings = &(data->cache[cache_key].postings);
-  const size_t n_posting_elements = data->n_posting_elements;
-  const size_t offset = data->cache[cache_key].offset;
-  const uint32_t *postings_raw =
-    ((uint32_t *)GRN_BULK_HEAD(postings)) +
-    (n_posting_elements * offset);
-  const size_t postings_raw_size =
-    sizeof(uint32_t) *
-    (GRN_UINT32_VECTOR_SIZE(postings) - (n_posting_elements * offset));
-  grn_bulk_write(ctx,
-                 new_postings,
-                 (const char *)postings_raw,
-                 postings_raw_size);
+  const grn_id diff_id = grn_table_add(ctx,
+                                       data->diff,
+                                       &token_id, sizeof(grn_id),
+                                       NULL);
+  if (diff_id == GRN_ID_NIL) {
+    return;
+  }
   grn_obj_set_value(ctx,
-                    data->postings,
-                    data->cache[cache_key].diff_id,
-                    new_postings,
-                    GRN_OBJ_SET);
-
-  data->cache[cache_key].token_id = GRN_ID_NIL;
-  data->cache[cache_key].diff_id = GRN_ID_NIL;
-  data->cache[cache_key].offset = 0;
-  GRN_BULK_REWIND(&(data->cache[cache_key].postings));
+                    column,
+                    diff_id,
+                    postings,
+                    GRN_OBJ_APPEND);
 }
 
 static void
-grn_index_column_diff_cache_flush_all(grn_ctx *ctx,
-                                      grn_index_column_diff_data *data)
+grn_index_column_diff_append_posting(grn_ctx *ctx,
+                                     grn_index_column_diff_data *data,
+                                     grn_id token_id,
+                                     grn_obj *column,
+                                     grn_posting *posting)
 {
-  for (size_t i = 0; i < data->cache_size; i++) {
-    grn_index_column_diff_cache_flush(ctx, data, i);
+  grn_obj *postings = &(data->buffers.postings);
+  GRN_BULK_REWIND(postings);
+  GRN_UINT32_PUT(ctx, postings, posting->rid);
+  if (data->index.with_section) {
+    GRN_UINT32_PUT(ctx, postings, posting->sid);
   }
-}
-
-static grn_obj *
-grn_index_column_diff_cache_allocate(grn_ctx *ctx,
-                                     grn_index_column_diff_data *data)
-{
-  const grn_id token_id = data->current.token_id;
-  const size_t cache_key =
-    grn_index_column_diff_cache_compute_key(ctx, data, token_id);
-  grn_index_column_diff_cache_flush(ctx, data, cache_key);
-  data->cache[cache_key].token_id = token_id;
-  data->cache[cache_key].diff_id = data->current.diff_id;
-  return &(data->cache[cache_key].postings);
-}
-
-static const uint32_t *
-grn_index_column_diff_cache_get(grn_ctx *ctx,
-                                grn_index_column_diff_data *data,
-                                size_t *n_postings)
-{
-  const grn_id token_id = data->current.token_id;
-  const size_t cache_key
-    = grn_index_column_diff_cache_compute_key(ctx, data, token_id);
-  const grn_id cached_token_id = data->cache[cache_key].token_id;
-  grn_obj *postings;
-  if (cached_token_id == token_id) {
-    postings = &(data->cache[cache_key].postings);
-  } else {
-    postings = grn_index_column_diff_cache_allocate(ctx, data);
-    const grn_id diff_id = data->current.diff_id;
-    grn_obj_get_value(ctx, data->postings, diff_id, postings);
+  if (data->index.with_position) {
+    GRN_UINT32_PUT(ctx, postings, posting->pos);
   }
-  const size_t n_posting_elements = data->n_posting_elements;
-  const size_t offset = data->cache[cache_key].offset;
-  *n_postings =
-    (GRN_UINT32_VECTOR_SIZE(postings) / n_posting_elements) -
-    offset;
-  return ((uint32_t *)GRN_BULK_HEAD(postings)) + (n_posting_elements * offset);
+  grn_index_column_diff_append_postings(ctx, data, token_id, column, postings);
 }
 
 static void
-grn_index_column_diff_cache_remove_posting(grn_ctx *ctx,
-                                           grn_index_column_diff_data *data,
-                                           size_t nth_posting)
+grn_index_column_diff_append_missing(grn_ctx *ctx,
+                                     grn_index_column_diff_data *data,
+                                     grn_posting *posting)
 {
-  const grn_id token_id = data->current.token_id;
-  const size_t cache_key =
-    grn_index_column_diff_cache_compute_key(ctx, data, token_id);
-  const size_t n_posting_elements = data->n_posting_elements;
-  size_t n_postings = 0;
-  const uint32_t *postings =
-    grn_index_column_diff_cache_get(ctx, data, &n_postings);
-
-  if (nth_posting > 0) {
-    grn_obj *remains = &(data->buffers.postings);
-    GRN_BULK_REWIND(remains);
-    const size_t posting_size = sizeof(uint32_t) * n_posting_elements;
-    grn_bulk_write(ctx,
-                   remains,
-                   (const char *)postings,
-                   posting_size * nth_posting);
-    const grn_id diff_id = data->current.diff_id;
-    grn_obj_set_value(ctx, data->remains, diff_id, remains, GRN_OBJ_APPEND);
-  }
-  data->cache[cache_key].offset += nth_posting + 1;
+  grn_index_column_diff_append_posting(ctx,
+                                       data,
+                                       data->current.token_id,
+                                       data->missings,
+                                       posting);
 }
 
-static const uint32_t *
-grn_index_column_diff_get_postings(grn_ctx *ctx,
-                                   grn_index_column_diff_data *data,
-                                   size_t *n_postings)
+static void
+grn_index_column_diff_append_remain(grn_ctx *ctx,
+                                    grn_index_column_diff_data *data,
+                                    grn_posting *posting)
 {
-  if (!data->current.is_new_diff) {
-    return grn_index_column_diff_cache_get(ctx, data, n_postings);
-  }
-
-  grn_obj *postings = grn_index_column_diff_cache_allocate(ctx, data);
-  const grn_id token_id = data->current.token_id;
-  const unsigned int ii_cursor_flags = 0;
-  grn_ii_cursor *ii_cursor = grn_ii_cursor_open(ctx,
-                                                data->ii,
-                                                token_id,
-                                                GRN_ID_NIL,
-                                                GRN_ID_MAX,
-                                                data->index.n_elements,
-                                                ii_cursor_flags);
-  if (ii_cursor) {
-    const grn_bool with_section = data->index.with_section;
-    const grn_bool with_position = data->index.with_position;
-    if (with_position) {
-      while (grn_ii_cursor_next(ctx, ii_cursor)) {
-        grn_posting *posting;
-        while ((posting = grn_ii_cursor_next_pos(ctx, ii_cursor))) {
-          GRN_UINT32_PUT(ctx, postings, posting->rid);
-          if (with_section) {
-            GRN_UINT32_PUT(ctx, postings, posting->sid);
-          }
-          GRN_UINT32_PUT(ctx, postings, posting->pos);
-        }
-      }
-    } else {
-      grn_posting *posting;
-      while ((posting = grn_ii_cursor_next(ctx, ii_cursor))) {
-        GRN_UINT32_PUT(ctx, postings, posting->rid);
-        if (with_section) {
-          GRN_UINT32_PUT(ctx, postings, posting->sid);
-        }
-      }
-    }
-    grn_ii_cursor_close(ctx, ii_cursor);
-  }
-  *n_postings = GRN_UINT32_VECTOR_SIZE(postings) / data->n_posting_elements;
-  return (const uint32_t *)(GRN_BULK_HEAD(postings));
+  grn_index_column_diff_append_posting(ctx,
+                                       data,
+                                       data->current.token_id,
+                                       data->remains,
+                                       posting);
 }
 
 static int
@@ -705,64 +544,236 @@ grn_index_column_diff_compare_posting(grn_ctx *ctx,
   return 0;
 }
 
-static int64_t
-grn_index_column_diff_find_posting(grn_ctx *ctx,
-                                   grn_index_column_diff_data *data)
+static void
+grn_index_column_diff_posting_list_init(grn_ctx *ctx,
+                                        grn_index_column_diff_data *data,
+                                        grn_index_column_diff_posting_list *posting_list)
 {
-  size_t n_postings = 0;
-  const uint32_t *postings =
-    grn_index_column_diff_get_postings(ctx, data, &n_postings);
-  const size_t n_posting_elements = data->n_posting_elements;
-  for (size_t i = 0; i < n_postings; i++) {
-    const uint32_t *posting = postings + (n_posting_elements * i);
-    const int compared =
-      grn_index_column_diff_compare_posting(ctx, data, posting);
-    if (compared == 0) {
-      return i;
-    } else if (compared > 0) {
-      return -1;
-    }
+  posting_list->token_id = data->current.token_id;
+  const unsigned int ii_cursor_flags = 0;
+  posting_list->cursor = grn_ii_cursor_open(ctx,
+                                            data->ii,
+                                            posting_list->token_id,
+                                            GRN_ID_NIL,
+                                            GRN_ID_MAX,
+                                            data->index.n_elements,
+                                            ii_cursor_flags);
+  if (!posting_list->cursor) {
+    posting_list->token_id = GRN_ID_NIL;
+    return;
   }
-  return -1;
+  posting_list->need_cursor_next = true;
+  GRN_UINT32_INIT(&(posting_list->look_ahead), GRN_OBJ_VECTOR);
+  posting_list->look_ahead_offset = 0;
 }
 
 static void
-grn_index_column_diff_process_token_id(grn_ctx *ctx,
-                                       grn_index_column_diff_data *data)
+grn_index_column_diff_posting_list_fin(grn_ctx *ctx,
+                                       grn_index_column_diff_data *data,
+                                       grn_index_column_diff_posting_list *posting_list)
 {
-  {
-    int added = 0;
-    data->current.diff_id =
-      grn_table_add(ctx,
-                    data->diff,
-                    &(data->current.token_id), sizeof(grn_id),
-                    &added);
-    data->current.is_new_diff = (added != 0);
+  if (posting_list->token_id == GRN_ID_NIL) {
+    return;
   }
 
-  const int64_t nth_posting =
-    grn_index_column_diff_find_posting(ctx, data);
-  if (nth_posting >= 0) {
-    grn_index_column_diff_cache_remove_posting(ctx,
-                                               data,
-                                               (size_t)nth_posting);
+  const grn_id token_id = posting_list->token_id;
+  posting_list->token_id = GRN_ID_NIL;
+
+  grn_obj *look_ahead = &(posting_list->look_ahead);
+  grn_obj *remains;
+  if (posting_list->look_ahead_offset == 0) {
+    remains = look_ahead;
   } else {
+    const size_t offset = posting_list->look_ahead_offset * sizeof(uint32_t);
     grn_obj *postings = &(data->buffers.postings);
     GRN_BULK_REWIND(postings);
-    const grn_posting *posting = &(data->current.posting);
-    GRN_UINT32_PUT(ctx, postings, posting->rid);
-    if (data->index.with_section) {
-      GRN_UINT32_PUT(ctx, postings, posting->sid);
+    grn_bulk_write(ctx,
+                   postings,
+                   GRN_BULK_HEAD(look_ahead) + offset,
+                   GRN_BULK_VSIZE(look_ahead) - offset);
+    remains = postings;
+  }
+
+  grn_ii_cursor *cursor = posting_list->cursor;
+  if (cursor) {
+    const grn_bool with_section = data->index.with_section;
+    const grn_bool with_position = data->index.with_position;
+    if (with_position) {
+      while (!posting_list->need_cursor_next ||
+             grn_ii_cursor_next(ctx, cursor)) {
+        grn_posting *posting;
+        while ((posting = grn_ii_cursor_next_pos(ctx, cursor))) {
+          posting_list->need_cursor_next = false;
+          GRN_UINT32_PUT(ctx, remains, posting->rid);
+          if (with_section) {
+            GRN_UINT32_PUT(ctx, remains, posting->sid);
+          }
+          GRN_UINT32_PUT(ctx, remains, posting->pos);
+        }
+        posting_list->need_cursor_next = true;
+      }
+    } else {
+      grn_posting *posting;
+      while ((posting = grn_ii_cursor_next(ctx, cursor))) {
+        GRN_UINT32_PUT(ctx, remains, posting->rid);
+        if (with_section) {
+          GRN_UINT32_PUT(ctx, remains, posting->sid);
+        }
+      }
+    }
+    grn_ii_cursor_close(ctx, cursor);
+    posting_list->cursor = NULL;
+  }
+
+  grn_index_column_diff_append_postings(ctx,
+                                        data,
+                                        token_id,
+                                        data->remains,
+                                        remains);
+
+  GRN_OBJ_FIN(ctx, look_ahead);
+}
+
+static void
+grn_index_column_diff_process_token_id(grn_ctx *ctx,
+                                       grn_index_column_diff_data *data)
+{
+  void *value = NULL;
+  int added = 0;
+  const grn_id posting_list_id =
+    grn_hash_add(ctx,
+                 data->posting_lists,
+                 &(data->current.token_id), sizeof(grn_id),
+                 &value,
+                 &added);
+  if (posting_list_id == GRN_ID_NIL) {
+    grn_rc rc = ctx->rc;
+    if (rc == GRN_SUCCESS) {
+      rc = GRN_UNKNOWN_ERROR;
+    }
+    char message[GRN_CTX_MSGSIZE];
+    grn_strcpy(message, GRN_CTX_MSGSIZE, ctx->errbuf);
+    ERR(rc,
+        "[index-column][diff] failed to add a record to keep a posting list: "
+        "%u: %s",
+        data->current.token_id,
+        message);
+    return;
+  }
+
+  grn_index_column_diff_posting_list *posting_list = value;
+  if (added) {
+    grn_index_column_diff_posting_list_init(ctx, data, posting_list);
+  }
+
+  if (posting_list->token_id == GRN_ID_NIL) {
+    grn_index_column_diff_append_missing(ctx, data, &(data->current.posting));
+    return;
+  }
+
+  const grn_bool with_section = data->index.with_section;
+  const grn_bool with_position = data->index.with_position;
+
+  grn_obj *look_ahead = &(posting_list->look_ahead);
+  const size_t look_ahead_offset = posting_list->look_ahead_offset;
+
+  const size_t n_posting_elements = data->n_posting_elements;
+  const size_t n_postings =
+    GRN_UINT32_VECTOR_SIZE(look_ahead) /
+    n_posting_elements;
+  for (size_t i = look_ahead_offset; i < n_postings; i++) {
+    const size_t offset = i * data->n_posting_elements;
+    const uint32_t *raw_posting =
+      ((const uint32_t *)GRN_BULK_HEAD(look_ahead)) + offset;
+    const int compared =
+      grn_index_column_diff_compare_posting(ctx, data, raw_posting);
+    if (compared == 0) {
+      posting_list->look_ahead_offset += n_posting_elements;
+      if (posting_list->look_ahead_offset ==
+          GRN_UINT32_VECTOR_SIZE(look_ahead)) {
+        GRN_BULK_REWIND(look_ahead);
+        posting_list->look_ahead_offset = 0;
+      }
+      return;
+    } else if (compared > 0) {
+      grn_index_column_diff_append_missing(ctx, data, &(data->current.posting));
+      return;
+    } else {
+      grn_posting posting;
+      size_t j = 0;
+      posting.rid = raw_posting[j];
+      if (data->index.with_section) {
+        j++;
+        posting.sid = raw_posting[j];
+      }
+      if (data->index.with_position) {
+        j++;
+        posting.pos = raw_posting[j];
+      }
+      grn_index_column_diff_append_remain(ctx, data, &posting);
+    }
+  }
+
+  grn_ii_cursor *cursor = posting_list->cursor;
+  if (with_position) {
+    while (!posting_list->need_cursor_next ||
+           grn_ii_cursor_next(ctx, cursor)) {
+      grn_posting *posting;
+      while ((posting = grn_ii_cursor_next_pos(ctx, cursor))) {
+        posting_list->need_cursor_next = false;
+
+        uint32_t raw_posting[3];
+        size_t i = 0;
+        raw_posting[i] = posting->rid;
+        if (with_section) {
+          i++;
+          raw_posting[i] = posting->sid;
+        }
+        i++;
+        raw_posting[i] = posting->pos;
+        const int compared =
+          grn_index_column_diff_compare_posting(ctx, data, raw_posting);
+        if (compared == 0) {
+          return;
+        } else if (compared > 0) {
+          for (size_t j = 0; j <= i; j++) {
+            GRN_UINT32_PUT(ctx, look_ahead, raw_posting[j]);
+          }
+          grn_index_column_diff_append_missing(ctx, data, &(data->current.posting));
+          return;
+        } else {
+          grn_index_column_diff_append_remain(ctx, data, posting);
+        }
+      }
+      posting_list->need_cursor_next = true;
     }
-    if (data->index.with_position) {
-      GRN_UINT32_PUT(ctx, postings, posting->pos);
+  } else {
+    grn_posting *posting;
+    while ((posting = grn_ii_cursor_next(ctx, cursor))) {
+      uint32_t raw_posting[2];
+      size_t i = 0;
+      raw_posting[i] = posting->rid;
+      if (with_section) {
+        i++;
+        raw_posting[i] = posting->sid;
+      }
+      const int compared =
+        grn_index_column_diff_compare_posting(ctx, data, raw_posting);
+      if (compared == 0) {
+        return;
+      } else if (compared > 0) {
+        for (size_t j = 0; j <= i; j++) {
+          GRN_UINT32_PUT(ctx, look_ahead, raw_posting[j]);
+        }
+        grn_index_column_diff_append_missing(ctx, data, &(data->current.posting));
+        return;
+      } else {
+        grn_index_column_diff_append_remain(ctx, data, posting);
+      }
     }
-    grn_obj_set_value(ctx,
-                      data->missings,
-                      data->current.diff_id,
-                      postings,
-                      GRN_OBJ_APPEND);
   }
+  grn_index_column_diff_append_missing(ctx, data, &(data->current.posting));
+  grn_index_column_diff_posting_list_fin(ctx, data, posting_list);
 }
 
 static void
@@ -801,6 +812,9 @@ grn_index_column_diff_process_token(grn_ctx *ctx,
     }
 
     grn_index_column_diff_process_token_id(ctx, data);
+    if (ctx->rc != GRN_SUCCESS) {
+      break;
+    }
   }
   grn_token_cursor_close(ctx, token_cursor);
 }
@@ -812,7 +826,6 @@ grn_index_column_diff_compute(grn_ctx *ctx,
   grn_obj *source_columns = &(data->source_columns);
   const size_t n_source_columns = GRN_PTR_VECTOR_SIZE(source_columns);
   grn_obj *value = &(data->buffers.value);
-  grn_obj *postings = &(data->buffers.postings);
 
   grn_index_column_diff_init_progress(ctx, data);
 
@@ -831,6 +844,9 @@ grn_index_column_diff_compute(grn_ctx *ctx,
 
       GRN_BULK_REWIND(value);
       grn_obj_get_value(ctx, source, id, value);
+      if (ctx->rc != GRN_SUCCESS) {
+        break;
+      }
 
       switch (value->header.type) {
       case GRN_VECTOR :
@@ -845,6 +861,9 @@ grn_index_column_diff_compute(grn_ctx *ctx,
                                                 data,
                                                 element,
                                                 element_size);
+            if (ctx->rc != GRN_SUCCESS) {
+              break;
+            }
           }
         }
         break;
@@ -857,6 +876,9 @@ grn_index_column_diff_compute(grn_ctx *ctx,
             data->current.token_id = element;
             data->current.posting.pos = 0;
             grn_index_column_diff_process_token_id(ctx, data);
+            if (ctx->rc != GRN_SUCCESS) {
+              break;
+            }
           }
         } else {
           const size_t n_elements = grn_uvector_size(ctx, value);
@@ -870,6 +892,9 @@ grn_index_column_diff_compute(grn_ctx *ctx,
             }
             data->current.posting.pos = 0;
             grn_index_column_diff_process_token_id(ctx, data);
+            if (ctx->rc != GRN_SUCCESS) {
+              break;
+            }
           }
         }
         break;
@@ -890,34 +915,23 @@ grn_index_column_diff_compute(grn_ctx *ctx,
       default :
         break;
       }
-    }
-  } GRN_TABLE_EACH_END(ctx, cursor);
-
-  grn_index_column_diff_cache_flush_all(ctx, data);
 
-  {
-    GRN_TABLE_EACH_BEGIN(ctx, data->diff, cursor, id) {
-      GRN_BULK_REWIND(postings);
-      grn_obj_get_value(ctx, data->postings, id, postings);
-      if (GRN_UINT32_VECTOR_SIZE(postings) > 0) {
-        grn_obj_set_value(ctx, data->remains, id, postings, GRN_OBJ_APPEND);
-        continue;
-      }
-      GRN_BULK_REWIND(postings);
-      grn_obj_get_value(ctx, data->remains, id, postings);
-      if (GRN_UINT32_VECTOR_SIZE(postings) > 0) {
-        continue;
-      }
-      GRN_BULK_REWIND(postings);
-      grn_obj_get_value(ctx, data->missings, id, postings);
-      if (GRN_UINT32_VECTOR_SIZE(postings) > 0) {
-        continue;
+      if (ctx->rc != GRN_SUCCESS) {
+        break;
       }
-      grn_table_cursor_delete(ctx, cursor);
-    } GRN_TABLE_EACH_END(ctx, cursor);
-  }
+    }
+    if (ctx->rc != GRN_SUCCESS) {
+      break;
+    }
+  } GRN_TABLE_EACH_END(ctx, cursor);
 
-  grn_obj_close(ctx, data->postings);
+  GRN_HASH_EACH_BEGIN(ctx, data->posting_lists, cursor, id) {
+    void *value;
+    grn_hash_cursor_get_value(ctx, cursor, &value);
+    grn_index_column_diff_posting_list *posting_list = value;
+    grn_index_column_diff_posting_list_fin(ctx, data, posting_list);
+  } GRN_HASH_EACH_END(ctx, cursor);
+  grn_hash_close(ctx, data->posting_lists);
 }
 
 grn_rc
@@ -990,7 +1004,6 @@ grn_index_column_diff(grn_ctx *ctx,
   }
 
   data.lexicon = grn_ctx_at(ctx, index_column->header.domain);
-  grn_index_column_diff_data_init_cache(ctx, &data);
   if (ctx->rc != GRN_SUCCESS) {
     goto exit;
   }
@@ -1011,23 +1024,24 @@ grn_index_column_diff(grn_ctx *ctx,
         message);
     goto exit;
   }
-  data.postings = grn_column_create(ctx,
-                                    data.diff,
-                                    postings_column_name,
-                                    strlen(postings_column_name),
-                                    NULL,
-                                    GRN_OBJ_COLUMN_VECTOR,
-                                    grn_ctx_at(ctx, GRN_DB_UINT32));
-  if (!data.postings) {
+  data.posting_lists =
+    grn_hash_create(ctx,
+                    NULL,
+                    sizeof(grn_id),
+                    sizeof(grn_index_column_diff_posting_list),
+                    GRN_OBJ_TABLE_HASH_KEY);
+  if (!data.posting_lists) {
     char message[GRN_CTX_MSGSIZE];
     grn_strcpy(message, GRN_CTX_MSGSIZE, ctx->errbuf);
     ERR(GRN_INVALID_ARGUMENT,
-        "[index-column][diff] failed to create postings column: <%.*s>: %s",
+        "[index-column][diff] failed to create hash table for posting lists: "
+        "<%.*s>: %s",
         data.index.name_size,
         data.index.name,
         message);
     goto exit;
   }
+  DB_OBJ(data.posting_lists)->header.domain = grn_obj_id(ctx, data.diff);
   data.remains = grn_column_create(ctx,
                                    data.diff,
                                    remains_column_name,
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20190515/2058ceda/attachment-0001.html>


Groonga - fulltext search engine.

[Groonga-commit] groonga/groonga at 09df0e6 [master] index_column_diff: improve performance