[Groonga-commit] groonga/groonga at d40820d [master] Optimize sequential scan with simple regexp expression

Back to archive index

Kouhei Sutou null+****@clear*****
Sun Dec 27 20:09:24 JST 2015


Kouhei Sutou	2015-12-27 20:09:24 +0900 (Sun, 27 Dec 2015)

  New Revision: d40820ddb6a760eb3373f119d27ebad5ff08b748
  https://github.com/groonga/groonga/commit/d40820ddb6a760eb3373f119d27ebad5ff08b748

  Message:
    Optimize sequential scan with simple regexp expression
    
    Simple regexp expression is "GET_VALUE_FROM_COLUMN @~ CONSTANT_PATTERN".

  Modified files:
    lib/expr.c

  Modified: lib/expr.c (+286 -16)
===================================================================
--- lib/expr.c    2015-12-26 23:45:10 +0900 (0e00466)
+++ lib/expr.c    2015-12-27 20:09:24 +0900 (71aea9d)
@@ -29,6 +29,15 @@
 #include "grn_mrb.h"
 #include "mrb/mrb_expr.h"
 
+#ifdef GRN_WITH_ONIGMO
+# define GRN_SUPPORT_REGEXP
+#endif
+
+#ifdef GRN_SUPPORT_REGEXP
+# include "grn_normalizer.h"
+# include <oniguruma.h>
+#endif
+
 grn_obj *
 grn_expr_alloc(grn_ctx *ctx, grn_obj *expr, grn_id domain, grn_obj_flags flags)
 {
@@ -4740,6 +4749,259 @@ exec_result_to_score(grn_ctx *ctx, grn_obj *result, grn_obj *score_buffer)
   }
 }
 
+typedef union {
+  struct {
+    grn_obj *expr;
+    grn_obj *variable;
+  } common;
+  struct {
+    grn_obj *expr;
+    grn_obj *variable;
+    grn_obj score_buffer;
+  } general;
+  struct {
+    grn_obj *expr;
+    grn_obj *variable;
+#ifdef GRN_SUPPORT_REGEXP
+    OnigRegex regex;
+    grn_obj value_buffer;
+    grn_obj *normalizer;
+#endif /* GRN_SUPPORT_REGEXP */
+  } simple_regexp;
+} grn_table_select_sequential_data;
+
+typedef void (*grn_table_select_sequential_init_func)(grn_ctx *ctx,
+                                                      grn_table_select_sequential_data *data);
+typedef int32_t (*grn_table_select_sequential_exec_func)(grn_ctx *ctx,
+                                                         grn_id id,
+                                                         grn_table_select_sequential_data *data);
+typedef void (*grn_table_select_sequential_fin_func)(grn_ctx *ctx,
+                                                     grn_table_select_sequential_data *data);
+
+static void
+grn_table_select_sequential_init_general(grn_ctx *ctx,
+                                         grn_table_select_sequential_data *data)
+{
+  GRN_INT32_INIT(&(data->general.score_buffer), 0);
+}
+
+static int32_t
+grn_table_select_sequential_exec_general(grn_ctx *ctx,
+                                         grn_id id,
+                                         grn_table_select_sequential_data *data)
+{
+  grn_obj *result;
+  GRN_RECORD_SET(ctx, data->general.variable, id);
+  result = grn_expr_exec(ctx, data->general.expr, 0);
+  if (ctx->rc) {
+    return -1;
+  }
+  return exec_result_to_score(ctx, result, &(data->general.score_buffer));
+}
+
+static void
+grn_table_select_sequential_fin_general(grn_ctx *ctx,
+                                        grn_table_select_sequential_data *data)
+{
+  GRN_OBJ_FIN(ctx, &(data->general.score_buffer));
+}
+
+#ifdef GRN_SUPPORT_REGEXP
+static grn_bool
+grn_table_select_sequential_is_simple_regexp(grn_ctx *ctx, grn_obj *expr)
+{
+  grn_expr *e = (grn_expr *)expr;
+  grn_expr_code *target;
+  grn_expr_code *pattern;
+  grn_expr_code *operator;
+
+  if (e->codes_curr != 3) {
+    return GRN_FALSE;
+  }
+
+  target = &(e->codes[0]);
+  pattern = &(e->codes[1]);
+  operator = &(e->codes[2]);
+
+  if (operator->op != GRN_OP_REGEXP) {
+    return GRN_FALSE;
+  }
+  if (operator->nargs != 2) {
+    return GRN_FALSE;
+  }
+
+  if (target->op != GRN_OP_GET_VALUE) {
+    return GRN_FALSE;
+  }
+  if (target->nargs != 1) {
+    return GRN_FALSE;
+  }
+  if (!target->value) {
+    return GRN_FALSE;
+  }
+  if (target->value->header.type != GRN_COLUMN_VAR_SIZE) {
+    return GRN_FALSE;
+  }
+  if ((target->value->header.flags & GRN_OBJ_COLUMN_TYPE_MASK) !=
+      GRN_OBJ_COLUMN_SCALAR) {
+    return GRN_FALSE;
+  }
+  switch (grn_obj_get_range(ctx, target->value)) {
+  case GRN_DB_SHORT_TEXT :
+  case GRN_DB_TEXT :
+  case GRN_DB_LONG_TEXT :
+    break;
+  default :
+    return GRN_FALSE;
+  }
+
+  if (pattern->op != GRN_OP_PUSH) {
+    return GRN_FALSE;
+  }
+  if (pattern->nargs != 1) {
+    return GRN_FALSE;
+  }
+  if (!pattern->value) {
+    return GRN_FALSE;
+  }
+  if (pattern->value->header.type != GRN_BULK) {
+    return GRN_FALSE;
+  }
+  switch (pattern->value->header.domain) {
+  case GRN_DB_SHORT_TEXT :
+  case GRN_DB_TEXT :
+  case GRN_DB_LONG_TEXT :
+    break;
+  default :
+    return GRN_FALSE;
+  }
+
+  return GRN_TRUE;
+}
+
+static void
+grn_table_select_sequential_init_simple_regexp(grn_ctx *ctx,
+                                               grn_table_select_sequential_data *data)
+{
+  grn_expr *e = (grn_expr *)(data->simple_regexp.expr);
+  OnigEncoding onig_encoding;
+  int onig_result;
+  OnigErrorInfo onig_error_info;
+  grn_obj *pattern;
+
+  if (ctx->encoding == GRN_ENC_NONE) {
+    data->simple_regexp.regex = NULL;
+    return;
+  }
+
+  switch (ctx->encoding) {
+  case GRN_ENC_EUC_JP :
+    onig_encoding = ONIG_ENCODING_EUC_JP;
+    break;
+  case GRN_ENC_UTF8 :
+    onig_encoding = ONIG_ENCODING_UTF8;
+    break;
+  case GRN_ENC_SJIS :
+    onig_encoding = ONIG_ENCODING_CP932;
+    break;
+  case GRN_ENC_LATIN1 :
+    onig_encoding = ONIG_ENCODING_ISO_8859_1;
+    break;
+  case GRN_ENC_KOI8R :
+    onig_encoding = ONIG_ENCODING_KOI8_R;
+    break;
+  default :
+    data->simple_regexp.regex = NULL;
+    return;
+  }
+
+  pattern = e->codes[1].value;
+  onig_result = onig_new(&(data->simple_regexp.regex),
+                         GRN_TEXT_VALUE(pattern),
+                         GRN_TEXT_VALUE(pattern) + GRN_TEXT_LEN(pattern),
+                         ONIG_OPTION_ASCII_RANGE |
+                         ONIG_OPTION_MULTILINE,
+                         onig_encoding,
+                         ONIG_SYNTAX_RUBY,
+                         &onig_error_info);
+  if (onig_result != ONIG_NORMAL) {
+    char message[ONIG_MAX_ERROR_MESSAGE_LEN];
+    onig_error_code_to_str(message, onig_result, onig_error_info);
+    ERR(GRN_INVALID_ARGUMENT,
+        "[table][select][sequential][regexp] "
+        "failed to create regular expression object: <%.*s>: %s",
+        (int)GRN_TEXT_LEN(pattern), GRN_TEXT_VALUE(pattern),
+        message);
+    return;
+  }
+
+  GRN_VOID_INIT(&(data->simple_regexp.value_buffer));
+
+  data->simple_regexp.normalizer =
+    grn_ctx_get(ctx, GRN_NORMALIZER_AUTO_NAME, -1);
+}
+
+static int32_t
+grn_table_select_sequential_exec_simple_regexp(grn_ctx *ctx,
+                                               grn_id id,
+                                               grn_table_select_sequential_data *data)
+{
+  grn_expr *e = (grn_expr *)(data->simple_regexp.expr);
+  grn_obj *value_buffer = &(data->simple_regexp.value_buffer);
+
+  if (ctx->rc) {
+    return -1;
+  }
+
+  grn_obj_reinit_for(ctx, value_buffer, e->codes[0].value);
+  grn_obj_get_value(ctx, e->codes[0].value, id, value_buffer);
+  {
+    grn_obj *norm_target;
+    const char *norm_target_raw;
+    unsigned int norm_target_raw_length_in_bytes;
+
+    norm_target = grn_string_open(ctx,
+                                  GRN_TEXT_VALUE(value_buffer),
+                                  GRN_TEXT_LEN(value_buffer),
+                                  data->simple_regexp.normalizer,
+                                  0);
+    grn_string_get_normalized(ctx, norm_target,
+                              &norm_target_raw,
+                              &norm_target_raw_length_in_bytes,
+                              NULL);
+
+    {
+      OnigPosition position;
+      position = onig_search(data->simple_regexp.regex,
+                             norm_target_raw,
+                             norm_target_raw + norm_target_raw_length_in_bytes,
+                             norm_target_raw,
+                             norm_target_raw + norm_target_raw_length_in_bytes,
+                             NULL,
+                             ONIG_OPTION_NONE);
+      grn_obj_close(ctx, norm_target);
+      if (position == ONIG_MISMATCH) {
+        return -1;
+      } else {
+        return 1;
+      }
+    }
+  }
+}
+
+static void
+grn_table_select_sequential_fin_simple_regexp(grn_ctx *ctx,
+                                        grn_table_select_sequential_data *data)
+{
+  if (!data->simple_regexp.regex) {
+    return;
+  }
+
+  onig_free(data->simple_regexp.regex);
+  GRN_OBJ_FIN(ctx, &(data->simple_regexp.value_buffer));
+}
+#endif /* GRN_SUPPORT_REGEXP */
+
 static void
 grn_table_select_sequential(grn_ctx *ctx, grn_obj *table, grn_obj *expr,
                             grn_obj *v, grn_obj *res, grn_operator op)
@@ -4749,20 +5011,33 @@ grn_table_select_sequential(grn_ctx *ctx, grn_obj *table, grn_obj *expr,
   grn_table_cursor *tc;
   grn_hash_cursor *hc;
   grn_hash *s = (grn_hash *)res;
-  grn_obj *r;
-  grn_obj score_buffer;
-  GRN_RECORD_INIT(v, 0, grn_obj_id(ctx, table));
-  GRN_INT32_INIT(&score_buffer, 0);
+  grn_table_select_sequential_data data;
+  grn_table_select_sequential_init_func init;
+  grn_table_select_sequential_exec_func exec;
+  grn_table_select_sequential_fin_func fin;
+
+  data.common.expr = expr;
+  data.common.variable = v;
+  init = grn_table_select_sequential_init_general;
+  exec = grn_table_select_sequential_exec_general;
+  fin = grn_table_select_sequential_fin_general;
+#ifdef GRN_SUPPORT_REGEXP
+  if (grn_table_select_sequential_is_simple_regexp(ctx, expr)) {
+    init = grn_table_select_sequential_init_simple_regexp;
+    exec = grn_table_select_sequential_exec_simple_regexp;
+    fin = grn_table_select_sequential_fin_simple_regexp;
+  }
+#endif /* GRN_SUPPORT_REGEXP */
+
+  init(ctx, &data);
   switch (op) {
   case GRN_OP_OR :
     if ((tc = grn_table_cursor_open(ctx, table, NULL, 0, NULL, 0, 0, -1, 0))) {
       while ((id = grn_table_cursor_next(ctx, tc))) {
-        GRN_RECORD_SET(ctx, v, id);
-        r = grn_expr_exec(ctx, expr, 0);
+        score = exec(ctx, id, &data);
         if (ctx->rc) {
           break;
         }
-        score = exec_result_to_score(ctx, r, &score_buffer);
         if (score > 0) {
           grn_rset_recinfo *ri;
           if (grn_hash_add(ctx, s, &id, s->key_size, (void **)&ri, NULL)) {
@@ -4777,12 +5052,10 @@ grn_table_select_sequential(grn_ctx *ctx, grn_obj *table, grn_obj *expr,
     if ((hc = grn_hash_cursor_open(ctx, s, NULL, 0, NULL, 0, 0, -1, 0))) {
       while (grn_hash_cursor_next(ctx, hc)) {
         grn_hash_cursor_get_key(ctx, hc, (void **) &idp);
-        GRN_RECORD_SET(ctx, v, *idp);
-        r = grn_expr_exec(ctx, expr, 0);
+        score = exec(ctx, *idp, &data);
         if (ctx->rc) {
           break;
         }
-        score = exec_result_to_score(ctx, r, &score_buffer);
         if (score > 0) {
           grn_rset_recinfo *ri;
           grn_hash_cursor_get_value(ctx, hc, (void **) &ri);
@@ -4799,11 +5072,10 @@ grn_table_select_sequential(grn_ctx *ctx, grn_obj *table, grn_obj *expr,
       while (grn_hash_cursor_next(ctx, hc)) {
         grn_hash_cursor_get_key(ctx, hc, (void **) &idp);
         GRN_RECORD_SET(ctx, v, *idp);
-        r = grn_expr_exec(ctx, expr, 0);
+        score = exec(ctx, *idp, &data);
         if (ctx->rc) {
           break;
         }
-        score = exec_result_to_score(ctx, r, &score_buffer);
         if (score > 0) {
           grn_hash_cursor_delete(ctx, hc, NULL);
         }
@@ -4815,12 +5087,10 @@ grn_table_select_sequential(grn_ctx *ctx, grn_obj *table, grn_obj *expr,
     if ((hc = grn_hash_cursor_open(ctx, s, NULL, 0, NULL, 0, 0, -1, 0))) {
       while (grn_hash_cursor_next(ctx, hc)) {
         grn_hash_cursor_get_key(ctx, hc, (void **) &idp);
-        GRN_RECORD_SET(ctx, v, *idp);
-        r = grn_expr_exec(ctx, expr, 0);
+        score = exec(ctx, *idp, &data);
         if (ctx->rc) {
           break;
         }
-        score = exec_result_to_score(ctx, r, &score_buffer);
         if (score > 0) {
           grn_rset_recinfo *ri;
           grn_hash_cursor_get_value(ctx, hc, (void **) &ri);
@@ -4833,7 +5103,7 @@ grn_table_select_sequential(grn_ctx *ctx, grn_obj *table, grn_obj *expr,
   default :
     break;
   }
-  GRN_OBJ_FIN(ctx, &score_buffer);
+  fin(ctx, &data);
 }
 
 static inline void
-------------- next part --------------
HTML����������������������������...
Télécharger 



More information about the Groonga-commit mailing list
Back to archive index