Kouhei Sutou
null+****@clear*****
Sun Dec 27 20:09:24 JST 2015
Kouhei Sutou 2015-12-27 20:09:24 +0900 (Sun, 27 Dec 2015) New Revision: d40820ddb6a760eb3373f119d27ebad5ff08b748 https://github.com/groonga/groonga/commit/d40820ddb6a760eb3373f119d27ebad5ff08b748 Message: Optimize sequential scan with simple regexp expression Simple regexp expression is "GET_VALUE_FROM_COLUMN @~ CONSTANT_PATTERN". Modified files: lib/expr.c Modified: lib/expr.c (+286 -16) =================================================================== --- lib/expr.c 2015-12-26 23:45:10 +0900 (0e00466) +++ lib/expr.c 2015-12-27 20:09:24 +0900 (71aea9d) @@ -29,6 +29,15 @@ #include "grn_mrb.h" #include "mrb/mrb_expr.h" +#ifdef GRN_WITH_ONIGMO +# define GRN_SUPPORT_REGEXP +#endif + +#ifdef GRN_SUPPORT_REGEXP +# include "grn_normalizer.h" +# include <oniguruma.h> +#endif + grn_obj * grn_expr_alloc(grn_ctx *ctx, grn_obj *expr, grn_id domain, grn_obj_flags flags) { @@ -4740,6 +4749,259 @@ exec_result_to_score(grn_ctx *ctx, grn_obj *result, grn_obj *score_buffer) } } +typedef union { + struct { + grn_obj *expr; + grn_obj *variable; + } common; + struct { + grn_obj *expr; + grn_obj *variable; + grn_obj score_buffer; + } general; + struct { + grn_obj *expr; + grn_obj *variable; +#ifdef GRN_SUPPORT_REGEXP + OnigRegex regex; + grn_obj value_buffer; + grn_obj *normalizer; +#endif /* GRN_SUPPORT_REGEXP */ + } simple_regexp; +} grn_table_select_sequential_data; + +typedef void (*grn_table_select_sequential_init_func)(grn_ctx *ctx, + grn_table_select_sequential_data *data); +typedef int32_t (*grn_table_select_sequential_exec_func)(grn_ctx *ctx, + grn_id id, + grn_table_select_sequential_data *data); +typedef void (*grn_table_select_sequential_fin_func)(grn_ctx *ctx, + grn_table_select_sequential_data *data); + +static void +grn_table_select_sequential_init_general(grn_ctx *ctx, + grn_table_select_sequential_data *data) +{ + GRN_INT32_INIT(&(data->general.score_buffer), 0); +} + +static int32_t +grn_table_select_sequential_exec_general(grn_ctx *ctx, + grn_id id, + grn_table_select_sequential_data *data) +{ + grn_obj *result; + GRN_RECORD_SET(ctx, data->general.variable, id); + result = grn_expr_exec(ctx, data->general.expr, 0); + if (ctx->rc) { + return -1; + } + return exec_result_to_score(ctx, result, &(data->general.score_buffer)); +} + +static void +grn_table_select_sequential_fin_general(grn_ctx *ctx, + grn_table_select_sequential_data *data) +{ + GRN_OBJ_FIN(ctx, &(data->general.score_buffer)); +} + +#ifdef GRN_SUPPORT_REGEXP +static grn_bool +grn_table_select_sequential_is_simple_regexp(grn_ctx *ctx, grn_obj *expr) +{ + grn_expr *e = (grn_expr *)expr; + grn_expr_code *target; + grn_expr_code *pattern; + grn_expr_code *operator; + + if (e->codes_curr != 3) { + return GRN_FALSE; + } + + target = &(e->codes[0]); + pattern = &(e->codes[1]); + operator = &(e->codes[2]); + + if (operator->op != GRN_OP_REGEXP) { + return GRN_FALSE; + } + if (operator->nargs != 2) { + return GRN_FALSE; + } + + if (target->op != GRN_OP_GET_VALUE) { + return GRN_FALSE; + } + if (target->nargs != 1) { + return GRN_FALSE; + } + if (!target->value) { + return GRN_FALSE; + } + if (target->value->header.type != GRN_COLUMN_VAR_SIZE) { + return GRN_FALSE; + } + if ((target->value->header.flags & GRN_OBJ_COLUMN_TYPE_MASK) != + GRN_OBJ_COLUMN_SCALAR) { + return GRN_FALSE; + } + switch (grn_obj_get_range(ctx, target->value)) { + case GRN_DB_SHORT_TEXT : + case GRN_DB_TEXT : + case GRN_DB_LONG_TEXT : + break; + default : + return GRN_FALSE; + } + + if (pattern->op != GRN_OP_PUSH) { + return GRN_FALSE; + } + if (pattern->nargs != 1) { + return GRN_FALSE; + } + if (!pattern->value) { + return GRN_FALSE; + } + if (pattern->value->header.type != GRN_BULK) { + return GRN_FALSE; + } + switch (pattern->value->header.domain) { + case GRN_DB_SHORT_TEXT : + case GRN_DB_TEXT : + case GRN_DB_LONG_TEXT : + break; + default : + return GRN_FALSE; + } + + return GRN_TRUE; +} + +static void +grn_table_select_sequential_init_simple_regexp(grn_ctx *ctx, + grn_table_select_sequential_data *data) +{ + grn_expr *e = (grn_expr *)(data->simple_regexp.expr); + OnigEncoding onig_encoding; + int onig_result; + OnigErrorInfo onig_error_info; + grn_obj *pattern; + + if (ctx->encoding == GRN_ENC_NONE) { + data->simple_regexp.regex = NULL; + return; + } + + switch (ctx->encoding) { + case GRN_ENC_EUC_JP : + onig_encoding = ONIG_ENCODING_EUC_JP; + break; + case GRN_ENC_UTF8 : + onig_encoding = ONIG_ENCODING_UTF8; + break; + case GRN_ENC_SJIS : + onig_encoding = ONIG_ENCODING_CP932; + break; + case GRN_ENC_LATIN1 : + onig_encoding = ONIG_ENCODING_ISO_8859_1; + break; + case GRN_ENC_KOI8R : + onig_encoding = ONIG_ENCODING_KOI8_R; + break; + default : + data->simple_regexp.regex = NULL; + return; + } + + pattern = e->codes[1].value; + onig_result = onig_new(&(data->simple_regexp.regex), + GRN_TEXT_VALUE(pattern), + GRN_TEXT_VALUE(pattern) + GRN_TEXT_LEN(pattern), + ONIG_OPTION_ASCII_RANGE | + ONIG_OPTION_MULTILINE, + onig_encoding, + ONIG_SYNTAX_RUBY, + &onig_error_info); + if (onig_result != ONIG_NORMAL) { + char message[ONIG_MAX_ERROR_MESSAGE_LEN]; + onig_error_code_to_str(message, onig_result, onig_error_info); + ERR(GRN_INVALID_ARGUMENT, + "[table][select][sequential][regexp] " + "failed to create regular expression object: <%.*s>: %s", + (int)GRN_TEXT_LEN(pattern), GRN_TEXT_VALUE(pattern), + message); + return; + } + + GRN_VOID_INIT(&(data->simple_regexp.value_buffer)); + + data->simple_regexp.normalizer = + grn_ctx_get(ctx, GRN_NORMALIZER_AUTO_NAME, -1); +} + +static int32_t +grn_table_select_sequential_exec_simple_regexp(grn_ctx *ctx, + grn_id id, + grn_table_select_sequential_data *data) +{ + grn_expr *e = (grn_expr *)(data->simple_regexp.expr); + grn_obj *value_buffer = &(data->simple_regexp.value_buffer); + + if (ctx->rc) { + return -1; + } + + grn_obj_reinit_for(ctx, value_buffer, e->codes[0].value); + grn_obj_get_value(ctx, e->codes[0].value, id, value_buffer); + { + grn_obj *norm_target; + const char *norm_target_raw; + unsigned int norm_target_raw_length_in_bytes; + + norm_target = grn_string_open(ctx, + GRN_TEXT_VALUE(value_buffer), + GRN_TEXT_LEN(value_buffer), + data->simple_regexp.normalizer, + 0); + grn_string_get_normalized(ctx, norm_target, + &norm_target_raw, + &norm_target_raw_length_in_bytes, + NULL); + + { + OnigPosition position; + position = onig_search(data->simple_regexp.regex, + norm_target_raw, + norm_target_raw + norm_target_raw_length_in_bytes, + norm_target_raw, + norm_target_raw + norm_target_raw_length_in_bytes, + NULL, + ONIG_OPTION_NONE); + grn_obj_close(ctx, norm_target); + if (position == ONIG_MISMATCH) { + return -1; + } else { + return 1; + } + } + } +} + +static void +grn_table_select_sequential_fin_simple_regexp(grn_ctx *ctx, + grn_table_select_sequential_data *data) +{ + if (!data->simple_regexp.regex) { + return; + } + + onig_free(data->simple_regexp.regex); + GRN_OBJ_FIN(ctx, &(data->simple_regexp.value_buffer)); +} +#endif /* GRN_SUPPORT_REGEXP */ + static void grn_table_select_sequential(grn_ctx *ctx, grn_obj *table, grn_obj *expr, grn_obj *v, grn_obj *res, grn_operator op) @@ -4749,20 +5011,33 @@ grn_table_select_sequential(grn_ctx *ctx, grn_obj *table, grn_obj *expr, grn_table_cursor *tc; grn_hash_cursor *hc; grn_hash *s = (grn_hash *)res; - grn_obj *r; - grn_obj score_buffer; - GRN_RECORD_INIT(v, 0, grn_obj_id(ctx, table)); - GRN_INT32_INIT(&score_buffer, 0); + grn_table_select_sequential_data data; + grn_table_select_sequential_init_func init; + grn_table_select_sequential_exec_func exec; + grn_table_select_sequential_fin_func fin; + + data.common.expr = expr; + data.common.variable = v; + init = grn_table_select_sequential_init_general; + exec = grn_table_select_sequential_exec_general; + fin = grn_table_select_sequential_fin_general; +#ifdef GRN_SUPPORT_REGEXP + if (grn_table_select_sequential_is_simple_regexp(ctx, expr)) { + init = grn_table_select_sequential_init_simple_regexp; + exec = grn_table_select_sequential_exec_simple_regexp; + fin = grn_table_select_sequential_fin_simple_regexp; + } +#endif /* GRN_SUPPORT_REGEXP */ + + init(ctx, &data); switch (op) { case GRN_OP_OR : if ((tc = grn_table_cursor_open(ctx, table, NULL, 0, NULL, 0, 0, -1, 0))) { while ((id = grn_table_cursor_next(ctx, tc))) { - GRN_RECORD_SET(ctx, v, id); - r = grn_expr_exec(ctx, expr, 0); + score = exec(ctx, id, &data); if (ctx->rc) { break; } - score = exec_result_to_score(ctx, r, &score_buffer); if (score > 0) { grn_rset_recinfo *ri; if (grn_hash_add(ctx, s, &id, s->key_size, (void **)&ri, NULL)) { @@ -4777,12 +5052,10 @@ grn_table_select_sequential(grn_ctx *ctx, grn_obj *table, grn_obj *expr, if ((hc = grn_hash_cursor_open(ctx, s, NULL, 0, NULL, 0, 0, -1, 0))) { while (grn_hash_cursor_next(ctx, hc)) { grn_hash_cursor_get_key(ctx, hc, (void **) &idp); - GRN_RECORD_SET(ctx, v, *idp); - r = grn_expr_exec(ctx, expr, 0); + score = exec(ctx, *idp, &data); if (ctx->rc) { break; } - score = exec_result_to_score(ctx, r, &score_buffer); if (score > 0) { grn_rset_recinfo *ri; grn_hash_cursor_get_value(ctx, hc, (void **) &ri); @@ -4799,11 +5072,10 @@ grn_table_select_sequential(grn_ctx *ctx, grn_obj *table, grn_obj *expr, while (grn_hash_cursor_next(ctx, hc)) { grn_hash_cursor_get_key(ctx, hc, (void **) &idp); GRN_RECORD_SET(ctx, v, *idp); - r = grn_expr_exec(ctx, expr, 0); + score = exec(ctx, *idp, &data); if (ctx->rc) { break; } - score = exec_result_to_score(ctx, r, &score_buffer); if (score > 0) { grn_hash_cursor_delete(ctx, hc, NULL); } @@ -4815,12 +5087,10 @@ grn_table_select_sequential(grn_ctx *ctx, grn_obj *table, grn_obj *expr, if ((hc = grn_hash_cursor_open(ctx, s, NULL, 0, NULL, 0, 0, -1, 0))) { while (grn_hash_cursor_next(ctx, hc)) { grn_hash_cursor_get_key(ctx, hc, (void **) &idp); - GRN_RECORD_SET(ctx, v, *idp); - r = grn_expr_exec(ctx, expr, 0); + score = exec(ctx, *idp, &data); if (ctx->rc) { break; } - score = exec_result_to_score(ctx, r, &score_buffer); if (score > 0) { grn_rset_recinfo *ri; grn_hash_cursor_get_value(ctx, hc, (void **) &ri); @@ -4833,7 +5103,7 @@ grn_table_select_sequential(grn_ctx *ctx, grn_obj *table, grn_obj *expr, default : break; } - GRN_OBJ_FIN(ctx, &score_buffer); + fin(ctx, &data); } static inline void -------------- next part -------------- HTML����������������������������...Télécharger