/* * Stack-less Just-In-Time compiler * * Copyright 2009-2010 Zoltan Herczeg (hzmester@freemail.hu). All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are * permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this list of * conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, this list * of conditions and the following disclaimer in the documentation and/or other materials * provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "sljitLir.h" #include "regexJIT.h" #ifdef REGEX_MATCH_VERBOSE #include #endif /* Extra, hidden flags: {id!} where id > 0 found in the code. */ #define REGEX_ID_CHECK 0x100 /* When REGEX_NEWLINE && REGEX_MATCH_BEGIN defined, the pattern turn to a normal search, which starts with [\r\n] character range. */ #define REGEX_FAKE_MATCH_BEGIN 0x200 /* When REGEX_NEWLINE && REGEX_MATCH_END defined, the pattern turn to a normal search, which ends with [\r\n] character range. */ #define REGEX_FAKE_MATCH_END 0x400 /* Check match completition after every (FINISH_TEST + 1) steps. */ #define FINISH_TEST 0x7 /* --------------------------------------------------------------------- */ /* Structures for JIT-ed pattern matching */ /* --------------------------------------------------------------------- */ struct regex_machine { /* flags. */ int flags; /* Number of state descriptors for one term. */ sljit_sw no_states; /* Total size. */ sljit_sw size; union { void *init_match; sljit_sw (SLJIT_CALL *call_init)(void *next, void* match); } u; #if (defined SLJIT_INDIRECT_CALL && SLJIT_INDIRECT_CALL) struct sljit_function_context context; #endif void *continue_match; /* Variable sized array to contain the handler addresses. */ sljit_uw entry_addrs[1]; }; struct regex_match { /* Current and next state array. */ sljit_sw *current; sljit_sw *next; /* Starting. */ sljit_sw head; /* String character index (ever increasing). */ sljit_sw index; /* Best match found so far (members in priority order). */ sljit_sw best_begin; sljit_sw best_end; sljit_sw best_id; /* Bool flags (encoded as word). */ sljit_sw fast_quit; sljit_sw fast_forward; /* Machine. */ struct regex_machine *machine; union { void *continue_match; void (SLJIT_CALL *call_continue)(struct regex_match *match, const regex_char_t *input_string, int length); } u; /* Variable sized array to contain the state arrays. */ sljit_sw states[1]; }; /* State vector ITEM[0] - pointer to the address inside the machine code ITEM[1] - next pointer ITEM[2] - string started from (optional) ITEM[3] - max ID (optional) */ /* Register allocation. */ /* Current state array (loaded & stored: regex_match->current). */ #define R_CURR_STATE SLJIT_S0 /* Next state array (loaded & stored: regex_match->next). */ #define R_NEXT_STATE SLJIT_S1 /* Head (loaded & stored: regex_match->head). */ #define R_NEXT_HEAD SLJIT_S2 /* String fragment pointer. */ #define R_STRING SLJIT_S3 /* String fragment length. */ #define R_LENGTH SLJIT_S4 /* 'struct regex_match*' */ #define R_REGEX_MATCH SLJIT_R0 /* Current character. */ #define R_CURR_CHAR SLJIT_R1 /* Temporary register. */ #define R_TEMP SLJIT_R2 /* Caches the regex_match->best_begin. */ #define R_BEST_BEGIN SLJIT_R3 /* Current character index. */ #define R_CURR_INDEX SLJIT_R4 /* --------------------------------------------------------------------- */ /* Stack management */ /* --------------------------------------------------------------------- */ /* Try to allocate 2^n blocks. */ #define STACK_FRAGMENT_SIZE (((64 * sizeof(struct stack_item)) - (sizeof(struct stack_fragment_data))) / (sizeof(struct stack_item))) struct stack_item { int type; int value; }; struct stack_fragment_data { struct stack_fragment *next; struct stack_fragment *prev; }; struct stack_fragment { struct stack_fragment_data data; struct stack_item items[STACK_FRAGMENT_SIZE]; }; struct stack { struct stack_fragment *first; struct stack_fragment *last; int index; int count; }; #if (defined SLJIT_DEBUG && SLJIT_DEBUG) static void stack_check(struct stack *stack) { struct stack_fragment *curr; int found; if (!stack) return; SLJIT_ASSERT(stack->index >= 0 && stack->index < STACK_FRAGMENT_SIZE); if (stack->first == NULL) { SLJIT_ASSERT(stack->first == NULL && stack->last == NULL); SLJIT_ASSERT(stack->index == STACK_FRAGMENT_SIZE - 1 && stack->count == 0); return; } found = 0; if (stack->last == NULL) { SLJIT_ASSERT(stack->index == STACK_FRAGMENT_SIZE - 1 && stack->count == 0); found = 1; } else SLJIT_ASSERT(stack->index >= 0 && stack->count >= 0); SLJIT_ASSERT(stack->first->data.prev == NULL); curr = stack->first; while (curr) { if (curr == stack->last) found = 1; if (curr->data.next) SLJIT_ASSERT(curr->data.next->data.prev == curr); curr = curr->data.next; } SLJIT_ASSERT(found); } #endif static void stack_init(struct stack *stack) { stack->first = NULL; stack->last = NULL; stack->index = STACK_FRAGMENT_SIZE - 1; stack->count = 0; } static void stack_destroy(struct stack *stack) { struct stack_fragment *curr = stack->first; struct stack_fragment *prev; #if (defined SLJIT_DEBUG && SLJIT_DEBUG) stack_check(stack); #endif while (curr) { prev = curr; curr = curr->data.next; SLJIT_FREE(prev, NULL); } } static SLJIT_INLINE struct stack_item* stack_top(struct stack *stack) { SLJIT_ASSERT(stack->last); return stack->last->items + stack->index; } static int stack_push(struct stack *stack, int type, int value) { if (stack->last) { stack->index++; if (stack->index >= STACK_FRAGMENT_SIZE) { stack->index = 0; if (!stack->last->data.next) { stack->last->data.next = (struct stack_fragment*)SLJIT_MALLOC(sizeof(struct stack_fragment), NULL); if (!stack->last->data.next) return 1; stack->last->data.next->data.next = NULL; stack->last->data.next->data.prev = stack->last; } stack->last = stack->last->data.next; } } else if (!stack->first) { stack->last = (struct stack_fragment*)SLJIT_MALLOC(sizeof(struct stack_fragment), NULL); if (!stack->last) return 1; stack->last->data.prev = NULL; stack->last->data.next = NULL; stack->first = stack->last; stack->index = 0; } else { stack->last = stack->first; stack->index = 0; } stack->last->items[stack->index].type = type; stack->last->items[stack->index].value = value; stack->count++; #if (defined SLJIT_DEBUG && SLJIT_DEBUG) stack_check(stack); #endif return 0; } static struct stack_item* stack_pop(struct stack *stack) { struct stack_item *ret = stack_top(stack); if (stack->index > 0) stack->index--; else { stack->last = stack->last->data.prev; stack->index = STACK_FRAGMENT_SIZE - 1; } stack->count--; #if (defined SLJIT_DEBUG && SLJIT_DEBUG) stack_check(stack); #endif return ret; } static SLJIT_INLINE void stack_clone(struct stack *src, struct stack *dst) { *dst = *src; } static int stack_push_copy(struct stack *stack, int items, int length) { struct stack_fragment *frag1; int ind1; struct stack_fragment *frag2; int ind2; int counter; SLJIT_ASSERT(stack->count >= length && items <= length && items > 0); /* Allocate the necessary elements. */ counter = items; frag1 = stack->last; ind1 = stack->index; while (counter > 0) { if (stack->index + counter >= STACK_FRAGMENT_SIZE) { counter -= STACK_FRAGMENT_SIZE - stack->index - 1 + 1; stack->index = 0; if (!stack->last->data.next) { stack->last->data.next = (struct stack_fragment*)SLJIT_MALLOC(sizeof(struct stack_fragment), NULL); if (!stack->last->data.next) return 1; stack->last->data.next->data.next = NULL; stack->last->data.next->data.prev = stack->last; } stack->last = stack->last->data.next; } else { stack->index += counter; counter = 0; } } frag2 = stack->last; ind2 = stack->index; while (length > 0) { frag2->items[ind2--] = frag1->items[ind1--]; if (ind1 < 0) { ind1 = STACK_FRAGMENT_SIZE - 1; frag1 = frag1->data.prev; } if (ind2 < 0) { ind2 = STACK_FRAGMENT_SIZE - 1; frag2 = frag2->data.prev; } length--; } #if (defined SLJIT_DEBUG && SLJIT_DEBUG) stack_check(stack); #endif stack->count += items; return 0; } /* --------------------------------------------------------------------- */ /* Parser */ /* --------------------------------------------------------------------- */ enum { /* Common. */ type_begin, type_end, type_char, type_newline, type_id, type_rng_start, type_rng_end, type_rng_char, type_rng_left, type_rng_right, /* generator only. */ type_branch, type_jump, /* Parser only. */ type_open_br, type_close_br, type_select, type_asterisk, type_plus_sign, type_qestion_mark }; struct compiler_common { /* Temporary stacks. */ struct stack stack; struct stack depth; /* REGEX_ flags. */ int flags; /* Encoded size of the dfa representation. */ sljit_sw dfa_size; /* Number of terms. */ sljit_sw terms_size; /* Number of state descriptors for one term (same as machine->no_states). */ sljit_sw no_states; /* Number of type_rng_(char|left)-s in the longest character range. */ sljit_sw longest_range_size; /* DFA linear representation (size: dfa_size). */ struct stack_item *dfa_transitions; /* Term id and search state pairs (size: dfa_size). */ struct stack_item *search_states; /* sljit compiler */ struct sljit_compiler *compiler; /* Machine data, which must be kept for later use. */ struct regex_machine *machine; /* Temporary space for jumps (size: longest_range_size). */ struct sljit_jump **range_jump_list; }; static const regex_char_t* decode_number(const regex_char_t *regex_string, int length, int *result) { int value = 0; SLJIT_ASSERT(length > 0); if (*regex_string < '0' || *regex_string > '9') { *result = -1; return regex_string; } while (length > 0 && *regex_string >= '0' && *regex_string <= '9') { value = value * 10 + (*regex_string - '0'); length--; regex_string++; } *result = value; return regex_string; } static int iterate(struct stack *stack, int min, int max) { struct stack it; struct stack_item *item; int count = -1; int len = 0; int depth = 0; stack_clone(stack, &it); /* Calculate size. */ while (count < 0) { item = stack_pop(&it); switch (item->type) { case type_id: case type_rng_end: case type_rng_char: case type_rng_left: case type_rng_right: case type_plus_sign: case type_qestion_mark: len++; break; case type_asterisk: len += 2; break; case type_close_br: depth++; break; case type_open_br: SLJIT_ASSERT(depth > 0); depth--; if (depth == 0) count = it.count; break; case type_select: SLJIT_ASSERT(depth > 0); len += 2; break; default: SLJIT_ASSERT(item->type != type_begin && item->type != type_end); if (depth == 0) count = it.count; len++; break; } } if (min == 0 && max == 0) { /* {0,0} case, not {0,} case: delete subtree. */ stack_clone(&it, stack); /* and put an empty bracket expression instead of it. */ if (stack_push(stack, type_open_br, 0)) return REGEX_MEMORY_ERROR; if (stack_push(stack, type_close_br, 0)) return REGEX_MEMORY_ERROR; return len; } count = stack->count - count; /* Put an open bracket before the sequence. */ if (stack_push_copy(stack, 1, count)) return -1; #if (defined SLJIT_DEBUG && SLJIT_DEBUG) SLJIT_ASSERT(stack_push(&it, type_open_br, 0) == 0); #else stack_push(&it, type_open_br, 0); #endif /* Copy the data. */ if (max > 0) { len = len * (max - 1); max -= min; /* Insert ? operators. */ len += max; if (min > 0) { min--; while (min > 0) { if (stack_push_copy(stack, count, count)) return -1; min--; } if (max > 0) { if (stack_push_copy(stack, count, count)) return -1; if (stack_push(stack, type_qestion_mark, 0)) return REGEX_MEMORY_ERROR; count++; max--; } } else { SLJIT_ASSERT(max > 0); max--; count++; if (stack_push(stack, type_qestion_mark, 0)) return REGEX_MEMORY_ERROR; } while (max > 0) { if (stack_push_copy(stack, count, count)) return -1; max--; } } else { SLJIT_ASSERT(min > 0); min--; /* Insert + operator. */ len = len * min + 1; while (min > 0) { if (stack_push_copy(stack, count, count)) return -1; min--; } if (stack_push(stack, type_plus_sign, 0)) return REGEX_MEMORY_ERROR; } /* Close the opened bracket. */ if (stack_push(stack, type_close_br, 0)) return REGEX_MEMORY_ERROR; return len; } static int parse_iterator(const regex_char_t *regex_string, int length, struct stack *stack, sljit_sw *dfa_size, int begin) { /* We only know that *regex_string == { . */ int val1, val2; const regex_char_t *base_from = regex_string; const regex_char_t *from; length--; regex_string++; /* Decode left value. */ val2 = -1; if (length == 0) return -2; if (*regex_string == ',') { val1 = 0; length--; regex_string++; } else { from = regex_string; regex_string = decode_number(regex_string, length, &val1); if (val1 < 0) return -2; length -= regex_string - from; if (length == 0) return -2; if (*regex_string == '}') { val2 = val1; if (val1 == 0) val1 = -1; } else if (length >= 2 && *regex_string == '!' && regex_string[1] == '}') { /* Non posix extension. */ if (stack_push(stack, type_id, val1)) return -1; (*dfa_size)++; return (regex_string - base_from) + 1; } else { if (*regex_string != ',') return -2; length--; regex_string++; } } if (begin) return -2; /* Decode right value. */ if (val2 == -1) { if (length == 0) return -2; if (*regex_string == '}') val2 = 0; else { from = regex_string; regex_string = decode_number(regex_string, length, &val2); length -= regex_string - from; if (val2 < 0 || length == 0 || *regex_string != '}' || val2 < val1) return -2; if (val2 == 0) { SLJIT_ASSERT(val1 == 0); val1 = -1; } } } /* Fast cases. */ if (val1 > 1 || val2 > 1) { val1 = iterate(stack, val1, val2); if (val1 < 0) return -1; *dfa_size += val1; } else if (val1 == 0 && val2 == 0) { if (stack_push(stack, type_asterisk, 0)) return -1; *dfa_size += 2; } else if (val1 == 1 && val2 == 0) { if (stack_push(stack, type_plus_sign, 0)) return -1; (*dfa_size)++; } else if (val1 == 0 && val2 == 1) { if (stack_push(stack, type_qestion_mark, 0)) return -1; (*dfa_size)++; } else if (val1 == -1) { val1 = iterate(stack, 0, 0); if (val1 < 0) return -1; *dfa_size -= val1; SLJIT_ASSERT(*dfa_size >= 2); } else { /* Ignore. */ SLJIT_ASSERT(val1 == 1 && val2 == 1); } return regex_string - base_from; } static int parse_char_range(const regex_char_t *regex_string, int length, struct compiler_common *compiler_common) { struct stack* stack = &compiler_common->stack; const regex_char_t *base_from = regex_string; regex_char_t left_char, right_char, tmp_char; length--; regex_string++; if (length == 0) return -2; if (*regex_string != '^') { if (stack_push(stack, type_rng_start, 0)) return -1; } else { length--; regex_string++; if (length == 0) return -2; if (stack_push(stack, type_rng_start, 1)) return -1; } /* For both the type_rng_start & type_rng_end. */ compiler_common->dfa_size += 2; /* Range must be at least 1 character. */ if (*regex_string == ']') { length--; regex_string++; if (stack_push(stack, type_rng_char, ']')) return -1; compiler_common->dfa_size++; } while (1) { if (length == 0) return -2; if (*regex_string == ']') break; if (*regex_string != '\\') left_char = *regex_string; else { regex_string++; length--; if (length == 0) return -2; left_char = *regex_string; } regex_string++; length--; /* Is a range here? */ if (length >= 3 && *regex_string == '-' && *(regex_string + 1) != ']') { regex_string++; length--; if (*regex_string != '\\') right_char = *regex_string; else { regex_string++; length--; if (length == 0) return -2; right_char = *regex_string; } regex_string++; length--; if (left_char > right_char) { /* Swap if necessary. */ tmp_char = left_char; left_char = right_char; right_char = tmp_char; } if (stack_push(stack, type_rng_left, left_char)) return -1; if (stack_push(stack, type_rng_right, right_char)) return -1; compiler_common->dfa_size += 2; } else { if (stack_push(stack, type_rng_char, left_char)) return -1; compiler_common->dfa_size++; } } if (stack_push(stack, type_rng_end, 0)) return -1; return regex_string - base_from; } static int parse(const regex_char_t *regex_string, int length, struct compiler_common *compiler_common) { /* Depth of bracketed expressions. */ int depth = 0; /* Have we already found a term? '1' if not yet. */ int begin = 1; /* Cache stack pointer. */ struct stack* stack = &compiler_common->stack; int tmp; /* Type_begin and type_end. */ compiler_common->dfa_size = 2; stack_init(stack); if (stack_push(stack, type_begin, 0)) return REGEX_MEMORY_ERROR; if (length > 0 && *regex_string == '^') { compiler_common->flags |= REGEX_MATCH_BEGIN; length--; regex_string++; } if ((compiler_common->flags & (REGEX_MATCH_BEGIN | REGEX_NEWLINE)) == (REGEX_MATCH_BEGIN | REGEX_NEWLINE)) { /* Replace REGEX_MATCH_BEGIN flag to REGEX_FAKE_MATCH_BEGIN */ compiler_common->flags &= ~REGEX_MATCH_BEGIN; compiler_common->flags |= REGEX_FAKE_MATCH_BEGIN; /* and append a new-line search. */ if (stack_push(stack, type_newline, 0)) return REGEX_MEMORY_ERROR; compiler_common->dfa_size++; /* Begin intentionally kept as 1. */ } while (length > 0) { switch (*regex_string) { case '\\' : length--; regex_string++; if (length == 0) return REGEX_INVALID_REGEX; if (stack_push(stack, type_char, *regex_string)) return REGEX_MEMORY_ERROR; begin = 0; compiler_common->dfa_size++; break; case '.' : if (stack_push(stack, type_rng_start, 1)) return REGEX_MEMORY_ERROR; if (compiler_common->flags & REGEX_NEWLINE) { if (stack_push(stack, type_rng_char, '\n')) return REGEX_MEMORY_ERROR; if (stack_push(stack, type_rng_char, '\r')) return REGEX_MEMORY_ERROR; compiler_common->dfa_size += 2; } if (stack_push(stack, type_rng_end, 1)) return REGEX_MEMORY_ERROR; begin = 0; compiler_common->dfa_size += 2; break; case '(' : depth++; if (stack_push(stack, type_open_br, 0)) return REGEX_MEMORY_ERROR; begin = 1; break; case ')' : if (depth == 0) return REGEX_INVALID_REGEX; depth--; if (stack_push(stack, type_close_br, 0)) return REGEX_MEMORY_ERROR; begin = 0; break; case '|' : if (stack_push(stack, type_select, 0)) return REGEX_MEMORY_ERROR; begin = 1; compiler_common->dfa_size += 2; break; case '*' : if (begin) return REGEX_INVALID_REGEX; if (stack_push(stack, type_asterisk, 0)) return REGEX_MEMORY_ERROR; compiler_common->dfa_size += 2; break; case '?' : case '+' : if (begin) return REGEX_INVALID_REGEX; if (stack_push(stack, (*regex_string == '+') ? type_plus_sign : type_qestion_mark, 0)) return REGEX_MEMORY_ERROR; compiler_common->dfa_size++; break; case '{' : tmp = parse_iterator(regex_string, length, stack, &compiler_common->dfa_size, begin); if (tmp >= 0) { length -= tmp; regex_string += tmp; } else if (tmp == -1) return REGEX_MEMORY_ERROR; else { /* Not a valid range expression. */ SLJIT_ASSERT(tmp == -2); if (stack_push(stack, type_char, '{')) return REGEX_MEMORY_ERROR; compiler_common->dfa_size++; } break; case '[' : tmp = parse_char_range(regex_string, length, compiler_common); if (tmp >= 0) { length -= tmp; regex_string += tmp; } else if (tmp == -1) return REGEX_MEMORY_ERROR; else { SLJIT_ASSERT(tmp == -2); return REGEX_INVALID_REGEX; } begin = 0; break; default: if (length == 1 && *regex_string == '$') { compiler_common->flags |= REGEX_MATCH_END; break; } if (stack_push(stack, type_char, *regex_string)) return REGEX_MEMORY_ERROR; begin = 0; compiler_common->dfa_size++; break; } length--; regex_string++; } if (depth != 0) return REGEX_INVALID_REGEX; if ((compiler_common->flags & (REGEX_MATCH_END | REGEX_NEWLINE)) == (REGEX_MATCH_END | REGEX_NEWLINE)) { /* Replace REGEX_MATCH_END flag to REGEX_FAKE_MATCH_END */ compiler_common->flags &= ~REGEX_MATCH_END; compiler_common->flags |= REGEX_FAKE_MATCH_END; /* and append a new-line search. */ if (stack_push(stack, type_newline, 1)) return REGEX_MEMORY_ERROR; compiler_common->dfa_size++; /* Begin intentionally kept as 1. */ } if (stack_push(stack, type_end, 0)) return REGEX_MEMORY_ERROR; return REGEX_NO_ERROR; } /* --------------------------------------------------------------------- */ /* Generating machine state transitions */ /* --------------------------------------------------------------------- */ #define PUT_TRANSITION(typ, val) \ do { \ --transitions_ptr; \ transitions_ptr->type = typ; \ transitions_ptr->value = val; \ } while (0) static struct stack_item* handle_iteratives(struct stack_item *transitions_ptr, struct stack_item *transitions, struct stack *depth) { struct stack_item *item; while (1) { item = stack_top(depth); switch (item->type) { case type_asterisk: SLJIT_ASSERT(transitions[item->value].type == type_branch); transitions[item->value].value = transitions_ptr - transitions; PUT_TRANSITION(type_branch, item->value + 1); break; case type_plus_sign: SLJIT_ASSERT(transitions[item->value].type == type_branch); transitions[item->value].value = transitions_ptr - transitions; break; case type_qestion_mark: PUT_TRANSITION(type_branch, item->value); break; default: return transitions_ptr; } stack_pop(depth); } } static int generate_transitions(struct compiler_common *compiler_common) { struct stack *stack = &compiler_common->stack; struct stack *depth = &compiler_common->depth; struct stack_item *transitions_ptr; struct stack_item *item; stack_init(depth); compiler_common->dfa_transitions = SLJIT_MALLOC(sizeof(struct stack_item) * compiler_common->dfa_size, NULL); if (!compiler_common->dfa_transitions) return REGEX_MEMORY_ERROR; /* Go through the items of the stack and generate the necessary branches and jumps (edges of DFA). */ transitions_ptr = compiler_common->dfa_transitions + compiler_common->dfa_size; while (stack->count > 0) { item = stack_pop(stack); switch (item->type) { case type_begin: case type_open_br: item = stack_pop(depth); if (item->type == type_select) PUT_TRANSITION(type_branch, item->value + 1); else SLJIT_ASSERT(item->type == type_close_br); if (stack->count == 0) PUT_TRANSITION(type_begin, 0); else transitions_ptr = handle_iteratives(transitions_ptr, compiler_common->dfa_transitions, depth); break; case type_end: case type_close_br: if (item->type == type_end) *--transitions_ptr = *item; if (stack_push(depth, type_close_br, transitions_ptr - compiler_common->dfa_transitions)) return REGEX_MEMORY_ERROR; break; case type_select: item = stack_top(depth); if (item->type == type_select) { SLJIT_ASSERT(compiler_common->dfa_transitions[item->value].type == type_jump); PUT_TRANSITION(type_branch, item->value + 1); PUT_TRANSITION(type_jump, item->value); item->value = transitions_ptr - compiler_common->dfa_transitions; } else { SLJIT_ASSERT(item->type == type_close_br); item->type = type_select; PUT_TRANSITION(type_jump, item->value); item->value = transitions_ptr - compiler_common->dfa_transitions; } break; case type_asterisk: case type_plus_sign: case type_qestion_mark: if (item->type != type_qestion_mark) PUT_TRANSITION(type_branch, 0); if (stack_push(depth, item->type, transitions_ptr - compiler_common->dfa_transitions)) return REGEX_MEMORY_ERROR; break; case type_char: case type_newline: case type_rng_start: /* Requires handle_iteratives. */ *--transitions_ptr = *item; transitions_ptr = handle_iteratives(transitions_ptr, compiler_common->dfa_transitions, depth); break; default: *--transitions_ptr = *item; break; } } SLJIT_ASSERT(compiler_common->dfa_transitions == transitions_ptr); SLJIT_ASSERT(depth->count == 0); return REGEX_NO_ERROR; } #undef PUT_TRANSITION #ifdef REGEX_MATCH_VERBOSE static void verbose_transitions(struct compiler_common *compiler_common) { struct stack_item *transitions_ptr = compiler_common->dfa_transitions; struct stack_item *transitions_end = transitions_ptr + compiler_common->dfa_size; struct stack_item *search_states_ptr = compiler_common->search_states; int pos; printf("-----------------\nTransitions\n-----------------\n"); pos = 0; while (transitions_ptr < transitions_end) { printf("[%3d] ", pos++); if (search_states_ptr->type >= 0) printf("(%3d) ", search_states_ptr->type); switch (transitions_ptr->type) { case type_begin: printf("type_begin\n"); break; case type_end: printf("type_end\n"); break; case type_char: if (transitions_ptr->value >= ' ') printf("type_char '%c'\n", transitions_ptr->value); else printf("type_char 0x%x\n", transitions_ptr->value); break; case type_newline: printf("type_newline %s\n", transitions_ptr->value ? "(end)" : "(begin)"); break; case type_id: printf("type_id %d\n", transitions_ptr->value); break; case type_rng_start: printf("type_rng_start %s\n", transitions_ptr->value ? "(invert)" : "(normal)"); break; case type_rng_end: printf("type_rng_end\n"); break; case type_rng_char: if (transitions_ptr->value >= ' ') printf("type_rng_char '%c'\n", transitions_ptr->value); else printf("type_rng_char 0x%x\n", transitions_ptr->value); break; case type_rng_left: if (transitions_ptr->value >= ' ') printf("type_rng_left '%c'\n", transitions_ptr->value); else printf("type_rng_left 0x%x\n", transitions_ptr->value); break; case type_rng_right: if (transitions_ptr->value >= ' ') printf("type_rng_right '%c'\n", transitions_ptr->value); else printf("type_rng_right 0x%x\n", transitions_ptr->value); break; case type_branch: printf("type_branch -> %d\n", transitions_ptr->value); break; case type_jump: printf("type_jump -> %d\n", transitions_ptr->value); break; default: printf("UNEXPECTED TYPE\n"); break; } transitions_ptr++; search_states_ptr++; } printf("flags: "); if (!(compiler_common->flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END | REGEX_NEWLINE | REGEX_ID_CHECK | REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END))) printf("none "); if (compiler_common->flags & REGEX_MATCH_BEGIN) printf("REGEX_MATCH_BEGIN "); if (compiler_common->flags & REGEX_MATCH_END) printf("REGEX_MATCH_END "); if (compiler_common->flags & REGEX_NEWLINE) printf("REGEX_NEWLINE "); if (compiler_common->flags & REGEX_ID_CHECK) printf("REGEX_ID_CHECK "); if (compiler_common->flags & REGEX_FAKE_MATCH_BEGIN) printf("REGEX_FAKE_MATCH_BEGIN "); if (compiler_common->flags & REGEX_FAKE_MATCH_END) printf("REGEX_FAKE_MATCH_END "); if (compiler_common->longest_range_size > 0) printf("(longest range: %ld) ", (long)compiler_common->longest_range_size); printf("\n"); } #endif /* --------------------------------------------------------------------- */ /* Utilities */ /* --------------------------------------------------------------------- */ static int generate_search_states(struct compiler_common *compiler_common) { struct stack_item *transitions_ptr = compiler_common->dfa_transitions; struct stack_item *transitions_end = transitions_ptr + compiler_common->dfa_size; struct stack_item *search_states_ptr; struct stack_item *rng_start = NULL; compiler_common->terms_size = !(compiler_common->flags & REGEX_FAKE_MATCH_END) ? 1 : 2; compiler_common->longest_range_size = 0; compiler_common->search_states = SLJIT_MALLOC(sizeof(struct stack_item) * compiler_common->dfa_size, NULL); if (!compiler_common->search_states) return REGEX_MEMORY_ERROR; search_states_ptr = compiler_common->search_states; while (transitions_ptr < transitions_end) { switch (transitions_ptr->type) { case type_begin: case type_end: search_states_ptr->type = 0; break; case type_char: search_states_ptr->type = compiler_common->terms_size++; break; case type_newline: if (transitions_ptr->value) search_states_ptr->type = 1; else search_states_ptr->type = compiler_common->terms_size++; SLJIT_ASSERT(search_states_ptr->type == 1 || search_states_ptr->type == 2); break; case type_id: if (transitions_ptr->value > 0) compiler_common->flags |= REGEX_ID_CHECK; search_states_ptr->type = -1; break; case type_rng_start: search_states_ptr->type = compiler_common->terms_size; rng_start = search_states_ptr; break; case type_rng_end: search_states_ptr->type = compiler_common->terms_size++; /* Ok, this is a blunt over estimation :) */ if (compiler_common->longest_range_size < search_states_ptr - rng_start) compiler_common->longest_range_size = search_states_ptr - rng_start; break; default: search_states_ptr->type = -1; break; } search_states_ptr->value = -1; search_states_ptr++; transitions_ptr++; } return REGEX_NO_ERROR; } static int trace_transitions(int from, struct compiler_common *compiler_common) { int id = 0; struct stack *stack = &compiler_common->stack; struct stack *depth = &compiler_common->depth; struct stack_item *dfa_transitions = compiler_common->dfa_transitions; struct stack_item *search_states = compiler_common->search_states; SLJIT_ASSERT(search_states[from].type >= 0); from++; /* Be prepared for any paths (loops, etc). */ while (1) { if (dfa_transitions[from].type == type_id) if (id < dfa_transitions[from].value) id = dfa_transitions[from].value; if (search_states[from].value < id) { /* Forward step. */ if (search_states[from].value == -1) if (stack_push(stack, 0, from)) return REGEX_MEMORY_ERROR; search_states[from].value = id; if (dfa_transitions[from].type == type_branch) { if (stack_push(depth, id, from)) return REGEX_MEMORY_ERROR; from++; continue; } else if (dfa_transitions[from].type == type_jump) { from = dfa_transitions[from].value; continue; } else if (search_states[from].type < 0) { from++; continue; } } /* Back tracking. */ if (depth->count > 0) { id = stack_top(depth)->type; from = dfa_transitions[stack_pop(depth)->value].value; continue; } return 0; } } /* --------------------------------------------------------------------- */ /* Code generator */ /* --------------------------------------------------------------------- */ #define TERM_OFFSET_OF(index, offs) (((index) * no_states + (offs)) * sizeof(sljit_sw)) #define TERM_REL_OFFSET_OF(base, offs) ((base) + ((offs) * sizeof(sljit_sw))) #define EMIT_OP1(type, arg1, arg2, arg3, arg4) \ CHECK(sljit_emit_op1(compiler, type, arg1, arg2, arg3, arg4)) #define EMIT_OP2(type, arg1, arg2, arg3, arg4, arg5, arg6) \ CHECK(sljit_emit_op2(compiler, type, arg1, arg2, arg3, arg4, arg5, arg6)) #define EMIT_LABEL(label) \ label = sljit_emit_label(compiler); \ CHECK(!label) #define EMIT_JUMP(jump, type) \ jump = sljit_emit_jump(compiler, type); \ CHECK(!jump) #define EMIT_CMP(jump, type, arg1, arg2, arg3, arg4) \ jump = sljit_emit_cmp(compiler, type, arg1, arg2, arg3, arg4); \ CHECK(!jump) /* CHECK depends on the use case. */ #define CHECK(exp) \ if (SLJIT_UNLIKELY(exp)) \ return REGEX_MEMORY_ERROR static int compile_uncond_tran(struct compiler_common *compiler_common, int reg) { struct sljit_compiler *compiler = compiler_common->compiler; struct stack *stack = &compiler_common->stack; struct stack_item *search_states = compiler_common->search_states; int flags = compiler_common->flags; sljit_sw no_states = compiler_common->no_states; sljit_uw head = 0; sljit_sw offset, value; if (reg != R_CURR_STATE || !(compiler_common->flags & REGEX_FAKE_MATCH_BEGIN)) { CHECK(trace_transitions(0, compiler_common)); } else { CHECK(trace_transitions(1, compiler_common)); } while (stack->count > 0) { value = stack_pop(stack)->value; if (search_states[value].type >= 0) { offset = TERM_OFFSET_OF(search_states[value].type, 0); EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 1), SLJIT_IMM, head); if (offset > 0) head = offset; if (!(flags & REGEX_MATCH_BEGIN)) { EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 2), R_TEMP, 0); if (flags & REGEX_ID_CHECK) { EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 3), SLJIT_IMM, search_states[value].value); } } else if (flags & REGEX_ID_CHECK) { EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 2), SLJIT_IMM, search_states[value].value); } } search_states[value].value = -1; } if (reg == R_NEXT_STATE) { EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_NEXT_HEAD, 0); } else if (flags & REGEX_FAKE_MATCH_BEGIN) { SLJIT_ASSERT(compiler_common->dfa_transitions[1].type == type_newline && !compiler_common->dfa_transitions[1].value); offset = TERM_OFFSET_OF(search_states[1].type, 0); SLJIT_ASSERT(!(flags & REGEX_MATCH_BEGIN)); EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 1), SLJIT_IMM, head); head = offset; EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 2), SLJIT_IMM, 1); if (flags & REGEX_ID_CHECK) { EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 3), SLJIT_IMM, 0); } } EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, head); return REGEX_NO_ERROR; } static int compile_cond_tran(struct compiler_common *compiler_common, sljit_sw curr_index) { struct sljit_compiler *compiler = compiler_common->compiler; struct stack *stack = &compiler_common->stack; struct stack_item *search_states = compiler_common->search_states; sljit_sw offset; int flags; sljit_sw no_states; sljit_sw value; struct sljit_jump *jump1; struct sljit_jump *jump2; struct sljit_jump *jump3; struct sljit_jump *jump4; struct sljit_jump *jump5; struct sljit_label *label1; flags = compiler_common->flags; no_states = compiler_common->no_states; EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 0); if (!(flags & (REGEX_ID_CHECK | REGEX_MATCH_BEGIN))) { EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 2)); } while (stack->count > 0) { value = stack_pop(stack)->value; if (search_states[value].type >= 0) { #ifdef REGEX_MATCH_VERBOSE if (flags & REGEX_MATCH_VERBOSE) printf("-> (%3d:%3d) ", search_states[value].type, search_states[value].value); #endif offset = TERM_OFFSET_OF(search_states[value].type, 0); if (!(flags & REGEX_ID_CHECK)) { if (!(flags & REGEX_MATCH_BEGIN)) { /* Check whether item is inserted. */ EMIT_CMP(jump1, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), SLJIT_IMM, -1); EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), R_NEXT_HEAD, 0); if (offset > 0) { EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset); } EMIT_JUMP(jump2, SLJIT_JUMP); /* Check whether old index <= index. */ EMIT_LABEL(label1); sljit_set_label(jump1, label1); EMIT_CMP(jump1, SLJIT_LESS_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0); EMIT_LABEL(label1); sljit_set_label(jump2, label1); EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0); EMIT_LABEL(label1); sljit_set_label(jump1, label1); } else { /* Check whether item is inserted. */ EMIT_CMP(jump1, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), SLJIT_IMM, -1); EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), R_NEXT_HEAD, 0); if (offset > 0) { EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset); } EMIT_LABEL(label1); sljit_set_label(jump1, label1); } } else { if (!(flags & REGEX_MATCH_BEGIN)) { EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 2)); /* Check whether item is inserted. */ EMIT_CMP(jump1, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), SLJIT_IMM, -1); EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), R_NEXT_HEAD, 0); if (offset > 0) { EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset); } EMIT_JUMP(jump2, SLJIT_JUMP); /* Check whether old index != index. */ EMIT_LABEL(label1); sljit_set_label(jump1, label1); EMIT_OP2(SLJIT_SUB | SLJIT_SET_U, SLJIT_UNUSED, 0, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0); EMIT_JUMP(jump1, SLJIT_LESS); EMIT_JUMP(jump3, SLJIT_GREATER); /* Old index == index. */ EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 3)); if (search_states[value].value > 0) { EMIT_CMP(jump4, SLJIT_GREATER, R_TEMP, 0, SLJIT_IMM, search_states[value].value); EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, search_states[value].value); EMIT_LABEL(label1); sljit_set_label(jump4, label1); } EMIT_OP2(SLJIT_SUB | SLJIT_SET_U, SLJIT_UNUSED, 0, SLJIT_MEM1(R_NEXT_STATE), offset + 3 * sizeof(sljit_sw), R_TEMP, 0); EMIT_JUMP(jump4, SLJIT_GREATER_EQUAL); EMIT_JUMP(jump5, SLJIT_JUMP); /* Overwrite index & id. */ EMIT_LABEL(label1); sljit_set_label(jump3, label1); sljit_set_label(jump2, label1); EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0); EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 3)); if (search_states[value].value > 0) { EMIT_CMP(jump3, SLJIT_GREATER, R_TEMP, 0, SLJIT_IMM, search_states[value].value); EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, search_states[value].value); EMIT_LABEL(label1); sljit_set_label(jump3, label1); } EMIT_LABEL(label1); sljit_set_label(jump5, label1); EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 3 * sizeof(sljit_sw), R_TEMP, 0); /* Exit. */ EMIT_LABEL(label1); sljit_set_label(jump1, label1); sljit_set_label(jump4, label1); } else { EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 2)); if (search_states[value].value > 0) { EMIT_CMP(jump1, SLJIT_GREATER, R_TEMP, 0, SLJIT_IMM, search_states[value].value); EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, search_states[value].value); EMIT_LABEL(label1); sljit_set_label(jump1, label1); } /* Check whether item is inserted. */ EMIT_CMP(jump1, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), SLJIT_IMM, -1); EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), R_NEXT_HEAD, 0); if (offset > 0) { EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset); } EMIT_JUMP(jump2, SLJIT_JUMP); /* Check whether old id >= id. */ EMIT_LABEL(label1); sljit_set_label(jump1, label1); EMIT_CMP(jump1, SLJIT_GREATER_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0); EMIT_LABEL(label1); sljit_set_label(jump2, label1); EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0); EMIT_LABEL(label1); sljit_set_label(jump1, label1); } } } search_states[value].value = -1; } #ifdef REGEX_MATCH_VERBOSE if (flags & REGEX_MATCH_VERBOSE) printf("\n"); #endif return REGEX_NO_ERROR; } static int compile_end_check(struct compiler_common *compiler_common, struct sljit_label *end_check_label) { struct sljit_compiler *compiler = compiler_common->compiler; struct sljit_jump *jump; struct sljit_jump *clear_states_jump; struct sljit_label *label; struct sljit_label *leave_label; struct sljit_label *begin_loop_label; /* Priority order: best_begin > best_end > best_id. In other words: if (new best_begin > old test_begin) do nothing otherwise we know that new_end > old_end, since R_CURR_INDEX ever increasing therefore we must overwrite all best_* variables (new_id also contains the highest id for this turn). */ /* Both R_CURR_CHAR and R_BEST_BEGIN used as temporary registers. */ if (!(compiler_common->flags & REGEX_MATCH_BEGIN)) { EMIT_OP1(SLJIT_MOV, R_CURR_CHAR, 0, SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 2)); EMIT_CMP(jump, !(compiler_common->flags & REGEX_MATCH_NON_GREEDY) ? SLJIT_LESS : SLJIT_LESS_EQUAL, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), R_CURR_CHAR, 0); sljit_set_label(jump, end_check_label); EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), R_CURR_CHAR, 0); if (!(compiler_common->flags & (REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END))) { EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0); } else { if ((compiler_common->flags & (REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END)) == (REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END)) { EMIT_OP2(SLJIT_SUB, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0, SLJIT_IMM, 2); } else { EMIT_OP2(SLJIT_SUB, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0, SLJIT_IMM, 1); } } if (compiler_common->flags & REGEX_ID_CHECK) { EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 3)); } EMIT_CMP(clear_states_jump, SLJIT_LESS, R_CURR_CHAR, 0, R_BEST_BEGIN, 0); EMIT_LABEL(leave_label); EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, R_CURR_CHAR, 0); EMIT_JUMP(jump, SLJIT_JUMP); sljit_set_label(jump, end_check_label); /* A loop to clear all states, which are > (or >=) than R_CURR_CHAR. */ EMIT_LABEL(label); sljit_set_label(clear_states_jump, label); EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_NEXT_HEAD, 0); EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, 0); /* Begin of the loop. */ EMIT_LABEL(begin_loop_label); EMIT_CMP(jump, SLJIT_EQUAL, R_TEMP, 0, SLJIT_IMM, 0); sljit_set_label(jump, leave_label); EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, R_CURR_STATE, 0); EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, SLJIT_MEM1(R_TEMP), sizeof(sljit_sw)); EMIT_CMP(clear_states_jump, !(compiler_common->flags & REGEX_MATCH_NON_GREEDY) ? SLJIT_GREATER : SLJIT_GREATER_EQUAL, SLJIT_MEM1(R_TEMP), 2 * sizeof(sljit_sw), R_CURR_CHAR, 0); /* Case 1: keep this case. */ EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_TEMP), sizeof(sljit_sw), R_NEXT_HEAD, 0); EMIT_OP2(SLJIT_SUB, R_NEXT_HEAD, 0, R_TEMP, 0, R_CURR_STATE, 0); EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_BEST_BEGIN, 0); EMIT_JUMP(jump, SLJIT_JUMP); sljit_set_label(jump, begin_loop_label); /* Case 2: remove this case. */ EMIT_LABEL(label); sljit_set_label(clear_states_jump, label); EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_TEMP), sizeof(sljit_sw), SLJIT_IMM, -1); EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_BEST_BEGIN, 0); EMIT_JUMP(jump, SLJIT_JUMP); sljit_set_label(jump, begin_loop_label); } else { EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, SLJIT_IMM, 0); EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, 0); EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0); if (compiler_common->flags & REGEX_ID_CHECK) { EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 2)); } EMIT_JUMP(jump, SLJIT_JUMP); sljit_set_label(jump, end_check_label); } return REGEX_NO_ERROR; } static int compile_leave_fast_forward(struct compiler_common *compiler_common, struct sljit_label *fast_forward_label) { struct sljit_compiler *compiler = compiler_common->compiler; struct stack *stack = &compiler_common->stack; struct stack_item *dfa_transitions = compiler_common->dfa_transitions; struct stack_item *search_states = compiler_common->search_states; int ind; struct sljit_jump *jump; int init_range = 1, prev_value = 0; while (stack->count > 0) { ind = stack_pop(stack)->value; search_states[ind].value = -1; if (search_states[ind].type >= 0) { if (dfa_transitions[ind].type == type_char) { EMIT_CMP(jump, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, dfa_transitions[ind].value); sljit_set_label(jump, fast_forward_label); } else if (dfa_transitions[ind].type == type_rng_start) { SLJIT_ASSERT(!dfa_transitions[ind].value); ind++; while (dfa_transitions[ind].type != type_rng_end) { if (dfa_transitions[ind].type == type_rng_char) { EMIT_CMP(jump, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, dfa_transitions[ind].value); sljit_set_label(jump, fast_forward_label); } else { SLJIT_ASSERT(dfa_transitions[ind].type == type_rng_left); if (init_range) { EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_CHAR, 0); init_range = 0; } if (dfa_transitions[ind].value != prev_value) { /* Best compatibility to all archs. */ prev_value -= dfa_transitions[ind].value; if (prev_value < 0) { EMIT_OP2(SLJIT_SUB, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, -prev_value); } else { EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, prev_value); } prev_value = dfa_transitions[ind].value; } EMIT_CMP(jump, SLJIT_LESS_EQUAL, R_TEMP, 0, SLJIT_IMM, dfa_transitions[ind + 1].value - dfa_transitions[ind].value); sljit_set_label(jump, fast_forward_label); ind++; } ind++; } } else { SLJIT_ASSERT(dfa_transitions[ind].type == type_newline); EMIT_CMP(jump, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\n'); sljit_set_label(jump, fast_forward_label); EMIT_CMP(jump, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\r'); sljit_set_label(jump, fast_forward_label); } } } return REGEX_NO_ERROR; } static int compile_newline_check(struct compiler_common *compiler_common, sljit_sw ind) { struct sljit_compiler *compiler = compiler_common->compiler; struct sljit_jump *jump1; struct sljit_jump *jump2; struct sljit_label *label; sljit_sw no_states; sljit_sw offset; /* Check whether a new-line character is found. */ EMIT_CMP(jump1, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\n'); EMIT_CMP(jump2, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\r'); no_states = compiler_common->no_states; offset = TERM_OFFSET_OF(compiler_common->search_states[ind].type, 1); EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), offset); EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), offset, SLJIT_IMM, -1); CHECK(sljit_emit_ijump(compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0)); EMIT_LABEL(label); sljit_set_label(jump1, label); sljit_set_label(jump2, label); return REGEX_NO_ERROR; } #undef CHECK #define CHECK(exp) \ if (SLJIT_UNLIKELY(exp)) \ return 0 static SLJIT_INLINE void range_set_label(struct sljit_jump **range_jump_list, struct sljit_label *label) { while (*range_jump_list) { sljit_set_label(*range_jump_list, label); range_jump_list++; } } static sljit_sw compile_range_check(struct compiler_common *compiler_common, sljit_sw ind) { struct sljit_compiler *compiler = compiler_common->compiler; struct stack_item *dfa_transitions = compiler_common->dfa_transitions; struct sljit_jump **range_jump_list = compiler_common->range_jump_list; int invert = dfa_transitions[ind].value; struct sljit_label *label; sljit_sw no_states; sljit_sw offset; int init_range = 1, prev_value = 0; ind++; while (dfa_transitions[ind].type != type_rng_end) { if (dfa_transitions[ind].type == type_rng_char) { EMIT_CMP(*range_jump_list, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, dfa_transitions[ind].value); range_jump_list++; } else { SLJIT_ASSERT(dfa_transitions[ind].type == type_rng_left); if (init_range) { EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_CHAR, 0); init_range = 0; } if (dfa_transitions[ind].value != prev_value) { /* Best compatibility to all archs. */ prev_value -= dfa_transitions[ind].value; if (prev_value < 0) { EMIT_OP2(SLJIT_SUB, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, -prev_value); } else { EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, prev_value); } prev_value = dfa_transitions[ind].value; } EMIT_CMP(*range_jump_list, SLJIT_LESS_EQUAL, R_TEMP, 0, SLJIT_IMM, dfa_transitions[ind + 1].value - dfa_transitions[ind].value); range_jump_list++; ind++; } ind++; } *range_jump_list = NULL; if (!invert) { no_states = compiler_common->no_states; offset = TERM_OFFSET_OF(compiler_common->search_states[ind].type, 1); EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), offset); EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), offset, SLJIT_IMM, -1); CHECK(sljit_emit_ijump(compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0)); EMIT_LABEL(label); range_set_label(compiler_common->range_jump_list, label); /* Clears the jump list. */ *compiler_common->range_jump_list = NULL; } return ind; } #undef TERM_OFFSET_OF #undef EMIT_OP1 #undef EMIT_OP2 #undef EMIT_LABEL #undef EMIT_JUMP #undef EMIT_CMP #undef CHECK /* --------------------------------------------------------------------- */ /* Main compiler */ /* --------------------------------------------------------------------- */ #define TERM_OFFSET_OF(ind, offs) (((ind) * compiler_common.no_states + (offs)) * sizeof(sljit_sw)) #define EMIT_OP1(type, arg1, arg2, arg3, arg4) \ CHECK(sljit_emit_op1(compiler_common.compiler, type, arg1, arg2, arg3, arg4)) #define EMIT_OP2(type, arg1, arg2, arg3, arg4, arg5, arg6) \ CHECK(sljit_emit_op2(compiler_common.compiler, type, arg1, arg2, arg3, arg4, arg5, arg6)) #define EMIT_LABEL(label) \ label = sljit_emit_label(compiler_common.compiler); \ CHECK(!label) #define EMIT_JUMP(jump, type) \ jump = sljit_emit_jump(compiler_common.compiler, type); \ CHECK(!jump) #define EMIT_CMP(jump, type, arg1, arg2, arg3, arg4) \ jump = sljit_emit_cmp(compiler_common.compiler, type, arg1, arg2, arg3, arg4); \ CHECK(!jump) /* A do {} while(0) expression helps to avoid goto statements. */ #define BEGIN_GUARD \ do { #define END_GUARD \ } while(0); #define CHECK(exp) \ if (SLJIT_UNLIKELY(exp)) \ break; struct regex_machine* regex_compile(const regex_char_t *regex_string, int length, int re_flags, int *error) { struct compiler_common compiler_common; sljit_sw ind; int error_code, done, suggest_fast_forward; /* ID of an empty match (-1 if not reachable). */ int empty_match_id; struct sljit_jump *jump; struct sljit_jump *best_match_found_jump; struct sljit_jump *fast_forward_jump = NULL; struct sljit_jump *length_is_zero_jump; struct sljit_jump *end_check_jump = NULL; struct sljit_jump *best_match_check_jump = NULL; struct sljit_jump *non_greedy_end_jump = NULL; struct sljit_label *label; struct sljit_label *end_check_label = NULL; struct sljit_label *start_label; struct sljit_label *fast_forward_label; struct sljit_label *fast_forward_return_label; if (error) *error = REGEX_NO_ERROR; #ifdef REGEX_MATCH_VERBOSE compiler_common.flags = re_flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END | REGEX_MATCH_NON_GREEDY | REGEX_NEWLINE | REGEX_MATCH_VERBOSE); #else compiler_common.flags = re_flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END | REGEX_MATCH_NON_GREEDY | REGEX_NEWLINE); #endif /* Step 1: parsing (Left->Right). Syntax check and AST generator. */ error_code = parse(regex_string, length, &compiler_common); if (error_code) { stack_destroy(&compiler_common.stack); if (error) *error = error_code; return NULL; } /* Step 2: generating branches (Right->Left). */ error_code = generate_transitions(&compiler_common); stack_destroy(&compiler_common.stack); stack_destroy(&compiler_common.depth); if (error_code) { if (compiler_common.dfa_transitions) SLJIT_FREE(compiler_common.dfa_transitions, NULL); if (error) *error = error_code; return NULL; } /* Step 3: Generate necessary data for depth-first search (Left->Right). */ error_code = generate_search_states(&compiler_common); if (error_code) { SLJIT_FREE(compiler_common.dfa_transitions, NULL); if (error) *error = error_code; return NULL; } #ifdef REGEX_MATCH_VERBOSE if (compiler_common.flags & REGEX_MATCH_VERBOSE) verbose_transitions(&compiler_common); #endif /* Step 4: Left->Right generate code. */ stack_init(&compiler_common.stack); stack_init(&compiler_common.depth); done = 0; compiler_common.machine = NULL; compiler_common.compiler = NULL; compiler_common.range_jump_list = NULL; BEGIN_GUARD compiler_common.machine = (struct regex_machine*)SLJIT_MALLOC(sizeof(struct regex_machine) + (compiler_common.terms_size - 1) * sizeof(sljit_uw), NULL); CHECK(!compiler_common.machine); compiler_common.compiler = sljit_create_compiler(NULL); CHECK(!compiler_common.compiler); if (compiler_common.longest_range_size > 0) { compiler_common.range_jump_list = (struct sljit_jump**)SLJIT_MALLOC(sizeof(struct sljit_jump*) * compiler_common.longest_range_size, NULL); CHECK(!compiler_common.range_jump_list); } if ((compiler_common.flags & REGEX_ID_CHECK) && !(compiler_common.flags & REGEX_MATCH_BEGIN)) compiler_common.no_states = 4; else if (!(compiler_common.flags & REGEX_ID_CHECK) && (compiler_common.flags & REGEX_MATCH_BEGIN)) compiler_common.no_states = 2; else compiler_common.no_states = 3; compiler_common.machine->flags = compiler_common.flags; compiler_common.machine->no_states = compiler_common.no_states; compiler_common.machine->size = compiler_common.machine->no_states * compiler_common.terms_size; /* Study the regular expression. */ empty_match_id = -1; suggest_fast_forward = 1; if (!(compiler_common.flags & REGEX_FAKE_MATCH_BEGIN)) { CHECK(trace_transitions(0, &compiler_common)); while (compiler_common.stack.count > 0) { ind = stack_pop(&compiler_common.stack)->value; if (compiler_common.search_states[ind].type == 0) { SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_end); suggest_fast_forward = 0; empty_match_id = compiler_common.search_states[ind].value; } else if (compiler_common.search_states[ind].type > 0) { SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type != type_end); if (compiler_common.dfa_transitions[ind].type == type_rng_start && compiler_common.dfa_transitions[ind].value) suggest_fast_forward = 0; } compiler_common.search_states[ind].value = -1; } } else { SLJIT_ASSERT(compiler_common.dfa_transitions[1].type == type_newline); CHECK(trace_transitions(1, &compiler_common)); while (compiler_common.stack.count > 0) { ind = stack_pop(&compiler_common.stack)->value; if (compiler_common.search_states[ind].type == 0) { SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_end); suggest_fast_forward = 0; empty_match_id = compiler_common.search_states[ind].value; } compiler_common.search_states[ind].value = -1; } } /* Step 4.1: Generate entry. */ CHECK(sljit_emit_enter(compiler_common.compiler, 0, 3, 5, 5, 0, 0, 0)); /* Copy arguments to their place. */ EMIT_OP1(SLJIT_MOV, R_REGEX_MATCH, 0, SLJIT_S0, 0); EMIT_OP1(SLJIT_MOV, R_STRING, 0, SLJIT_S1, 0); EMIT_OP2(SLJIT_ADD, R_LENGTH, 0, SLJIT_S2, 0, SLJIT_IMM, 1); /* Init global registers. */ EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current)); EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next)); EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head)); EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin)); EMIT_OP1(SLJIT_MOV, R_CURR_INDEX, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, index)); /* Check whether the best match has already found in a previous frame. */ EMIT_CMP(jump, SLJIT_EQUAL, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_quit), SLJIT_IMM, 0); EMIT_JUMP(best_match_found_jump, SLJIT_JUMP); #ifdef REGEX_MATCH_VERBOSE if (compiler_common.flags & REGEX_MATCH_VERBOSE) printf("\n-----------------\nTrace\n-----------------\n"); #endif /* Step 4.2: Generate code for state 0. */ EMIT_LABEL(label); compiler_common.machine->entry_addrs[0] = (sljit_uw)label; /* Swapping current and next. */ EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_STATE, 0); EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, R_NEXT_STATE, 0); EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, R_TEMP, 0); /* Checking whether the best case needs to be updated. */ if (!(compiler_common.flags & REGEX_MATCH_END)) { EMIT_CMP(end_check_jump, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 1), SLJIT_IMM, -1); EMIT_LABEL(end_check_label); } EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), TERM_REL_OFFSET_OF(0, 1), SLJIT_IMM, -1); EMIT_OP2(SLJIT_ADD, R_CURR_INDEX, 0, R_CURR_INDEX, 0, SLJIT_IMM, 1); /* Checking whether best case has already found. */ if (!(compiler_common.flags & REGEX_MATCH_END) || (compiler_common.flags & REGEX_MATCH_BEGIN)) { if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) { /* We can bail out if no more active states remain and R_BEST_BEGIN != -1. */ EMIT_CMP(best_match_check_jump, SLJIT_NOT_EQUAL, R_BEST_BEGIN, 0, SLJIT_IMM, -1); } else { /* We can bail out if no more active states remain (regardless of R_BEST_BEGIN). */ EMIT_CMP(best_match_check_jump, SLJIT_EQUAL, R_NEXT_HEAD, 0, SLJIT_IMM, 0); } } EMIT_LABEL(start_label); sljit_set_label(jump, start_label); if (!(compiler_common.flags & REGEX_MATCH_BEGIN) && suggest_fast_forward) { EMIT_CMP(fast_forward_jump, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 0); } /* Loading the next character. */ EMIT_OP2(SLJIT_SUB | SLJIT_SET_E, R_LENGTH, 0, R_LENGTH, 0, SLJIT_IMM, 1); EMIT_JUMP(length_is_zero_jump, SLJIT_EQUAL); EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_STRING, 0); #ifdef REGEX_USE_8BIT_CHARS EMIT_OP1(SLJIT_MOV_U8, R_CURR_CHAR, 0, SLJIT_MEM1(R_TEMP), 0); EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, 1); #else EMIT_OP1(SLJIT_MOV_UH, R_CURR_CHAR, 0, SLJIT_MEM1(R_TEMP), 0); EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, 2); #endif EMIT_OP1(SLJIT_MOV, R_STRING, 0, R_TEMP, 0); #ifdef REGEX_MATCH_VERBOSE if (compiler_common.flags & REGEX_MATCH_VERBOSE) { printf("(%3d): ", 0); CHECK(trace_transitions(0, &compiler_common)); while (compiler_common.stack.count > 0) { ind = stack_pop(&compiler_common.stack)->value; if (compiler_common.search_states[ind].type >= 0) printf("-> (%3d:%3d) ", compiler_common.search_states[ind].type, compiler_common.search_states[ind].value); compiler_common.search_states[ind].value = -1; } printf("\n"); } #endif EMIT_LABEL(fast_forward_return_label); if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) { EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 1); if (!(compiler_common.flags & REGEX_MATCH_END)) { EMIT_CMP(jump, SLJIT_NOT_EQUAL, R_BEST_BEGIN, 0, SLJIT_IMM, -1); } EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_INDEX, 0); CHECK(compile_uncond_tran(&compiler_common, R_NEXT_STATE)); /* And branching to the first state. */ CHECK(sljit_emit_ijump(compiler_common.compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0)); if (!(compiler_common.flags & REGEX_MATCH_END)) { EMIT_LABEL(label); sljit_set_label(jump, label); } } /* This is the case where we only have to reset the R_NEXT_HEAD. */ EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_NEXT_HEAD, 0); EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, 0); CHECK(sljit_emit_ijump(compiler_common.compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0)); /* Fast-forward loop. */ if (fast_forward_jump) { /* Quit from fast-forward loop. */ EMIT_LABEL(fast_forward_label); EMIT_OP2(SLJIT_SUB, R_TEMP, 0, R_NEXT_HEAD, 0, SLJIT_IMM, 1); EMIT_OP1(SLJIT_MOV, R_LENGTH, 0, R_NEXT_STATE, 0); EMIT_OP1(SLJIT_MOV, R_STRING, 0, R_CURR_STATE, 0); EMIT_OP1(SLJIT_MOV, R_CURR_INDEX, 0, R_NEXT_HEAD, 0); EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next)); EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current)); EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head)); /* Update the start field of the locations. */ CHECK(trace_transitions(0, &compiler_common)); while (compiler_common.stack.count > 0) { ind = stack_pop(&compiler_common.stack)->value; if (compiler_common.search_states[ind].type >= 0) { EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(compiler_common.search_states[ind].type, 2), R_TEMP, 0); } compiler_common.search_states[ind].value = -1; } EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 0); EMIT_JUMP(jump, SLJIT_JUMP); sljit_set_label(jump, fast_forward_return_label); /* Start fast-forward. */ EMIT_LABEL(label); sljit_set_label(fast_forward_jump, label); /* Moving everything to registers. */ EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next), R_NEXT_STATE, 0); EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current), R_CURR_STATE, 0); EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head), R_NEXT_HEAD, 0); EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, R_LENGTH, 0); EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, R_STRING, 0); EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, R_CURR_INDEX, 0); /* Fast forward mainloop. */ EMIT_LABEL(label); EMIT_OP2(SLJIT_SUB | SLJIT_SET_E, R_NEXT_STATE, 0, R_NEXT_STATE, 0, SLJIT_IMM, 1); EMIT_JUMP(fast_forward_jump, SLJIT_EQUAL); #ifdef REGEX_USE_8BIT_CHARS EMIT_OP1(SLJIT_MOV_U8, R_CURR_CHAR, 0, SLJIT_MEM1(R_CURR_STATE), 0); EMIT_OP2(SLJIT_ADD, R_CURR_STATE, 0, R_CURR_STATE, 0, SLJIT_IMM, 1); #else EMIT_OP1(SLJIT_MOV_UH, R_CURR_CHAR, 0, SLJIT_MEM1(R_CURR_STATE), 0); EMIT_OP2(SLJIT_ADD, R_CURR_STATE, 0, R_CURR_STATE, 0, SLJIT_IMM, 2); #endif CHECK(trace_transitions(0, &compiler_common)); CHECK(compile_leave_fast_forward(&compiler_common, fast_forward_label)); EMIT_OP2(SLJIT_ADD, R_NEXT_HEAD, 0, R_NEXT_HEAD, 0, SLJIT_IMM, 1); EMIT_JUMP(jump, SLJIT_JUMP); sljit_set_label(jump, label); /* String is finished. */ EMIT_LABEL(label); sljit_set_label(fast_forward_jump, label); EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, index), R_NEXT_HEAD, 0); EMIT_JUMP(fast_forward_jump, SLJIT_JUMP); } /* End check. */ if (end_check_jump) { EMIT_LABEL(label); sljit_set_label(end_check_jump, label); if (!(compiler_common.flags & REGEX_MATCH_NON_GREEDY) || !(compiler_common.flags & REGEX_MATCH_BEGIN)) { CHECK(compile_end_check(&compiler_common, end_check_label)); } else { /* Since we leave, we do not need to update the R_BEST_BEGIN. */ EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, 0); EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0); if (compiler_common.flags & REGEX_ID_CHECK) { EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 2)); } EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_quit), SLJIT_IMM, 1); EMIT_JUMP(non_greedy_end_jump, SLJIT_JUMP); } } /* Finish check. */ if (best_match_check_jump) { EMIT_LABEL(label); sljit_set_label(best_match_check_jump, label); if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) { EMIT_CMP(jump, SLJIT_NOT_EQUAL, R_NEXT_HEAD, 0, SLJIT_IMM, 0); sljit_set_label(jump, start_label); } EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_quit), SLJIT_IMM, 1); } /* Leaving matching and storing the necessary values. */ EMIT_LABEL(label); sljit_set_label(length_is_zero_jump, label); if (non_greedy_end_jump) sljit_set_label(non_greedy_end_jump, label); EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, index), R_CURR_INDEX, 0); EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head), R_NEXT_HEAD, 0); EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next), R_NEXT_STATE, 0); EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current), R_CURR_STATE, 0); /* Exit from JIT. */ EMIT_LABEL(label); sljit_set_label(best_match_found_jump, label); if (fast_forward_jump) sljit_set_label(fast_forward_jump, label); CHECK(sljit_emit_return(compiler_common.compiler, SLJIT_UNUSED, 0, 0)); for (ind = 1; ind < compiler_common.dfa_size - 1; ind++) { if (compiler_common.search_states[ind].type >= 0) { SLJIT_ASSERT(compiler_common.search_states[ind].type < compiler_common.terms_size); EMIT_LABEL(label); compiler_common.machine->entry_addrs[compiler_common.search_states[ind].type] = (sljit_uw)label; if (compiler_common.dfa_transitions[ind].type == type_char) { EMIT_CMP(jump, SLJIT_NOT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, compiler_common.dfa_transitions[ind].value); } else if (compiler_common.dfa_transitions[ind].type == type_rng_start) { ind = compile_range_check(&compiler_common, ind); CHECK(!ind); } else { SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_newline); CHECK(compile_newline_check(&compiler_common, ind)); } CHECK(trace_transitions(ind, &compiler_common)); #ifdef REGEX_MATCH_VERBOSE if (compiler_common.flags & REGEX_MATCH_VERBOSE) printf("(%3d): ", compiler_common.search_states[ind].type); #endif CHECK(compile_cond_tran(&compiler_common, compiler_common.search_states[ind].type)); if (compiler_common.dfa_transitions[ind].type == type_char) { EMIT_LABEL(label); sljit_set_label(jump, label); } else if (compiler_common.dfa_transitions[ind].type == type_rng_end) { EMIT_LABEL(label); range_set_label(compiler_common.range_jump_list, label); } else { SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_newline); } /* Branch to the next item in the list. */ EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(compiler_common.search_states[ind].type, 1)); EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(compiler_common.search_states[ind].type, 1), SLJIT_IMM, -1); CHECK(sljit_emit_ijump(compiler_common.compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0)); } } if (ind == compiler_common.dfa_size - 1) { /* Generate an init stub function. */ EMIT_LABEL(label); CHECK(sljit_emit_enter(compiler_common.compiler, 0, 2, 3, 3, 0, 0, 0)); if (empty_match_id == -1) { EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, -1); EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_IMM, 0); } else { EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, 0); EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_IMM, empty_match_id); } EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, index), SLJIT_IMM, !(compiler_common.flags & REGEX_FAKE_MATCH_BEGIN) ? 1 : 2); if (!(compiler_common.flags & REGEX_MATCH_NON_GREEDY) || empty_match_id == -1) { /* The else is a really rare event, so we still generate an empty function instead of a runtime pointer check. */ SLJIT_ASSERT(R_CURR_STATE == SLJIT_S0); if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) { /* R_CURR_INDEX (put to R_TEMP) is zero. */ EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, 0); } CHECK(compile_uncond_tran(&compiler_common, R_CURR_STATE)); } else { EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, 0); } CHECK(sljit_emit_return(compiler_common.compiler, SLJIT_MOV, R_NEXT_HEAD, 0)); compiler_common.machine->continue_match = sljit_generate_code(compiler_common.compiler); #ifndef SLJIT_INDIRECT_CALL compiler_common.machine->u.init_match = (void*)(sljit_sw)sljit_get_label_addr(label); #else sljit_set_function_context(&compiler_common.machine->u.init_match, &compiler_common.machine->context, sljit_get_label_addr(label), regex_compile); #endif #ifdef REGEX_MATCH_VERBOSE if (compiler_common.flags & REGEX_MATCH_VERBOSE) printf("Continue match: %p Init match: %p\n\n", compiler_common.machine->continue_match, compiler_common.machine->u.init_match); #endif if (compiler_common.machine->continue_match) { for (ind = 0; ind < compiler_common.terms_size; ++ind) compiler_common.machine->entry_addrs[ind] = sljit_get_label_addr((struct sljit_label*)compiler_common.machine->entry_addrs[ind]); done = 1; } } END_GUARD stack_destroy(&compiler_common.stack); stack_destroy(&compiler_common.depth); SLJIT_FREE(compiler_common.dfa_transitions, NULL); SLJIT_FREE(compiler_common.search_states, NULL); if (compiler_common.range_jump_list) SLJIT_FREE(compiler_common.range_jump_list, NULL); if (compiler_common.compiler) sljit_free_compiler(compiler_common.compiler); if (done) return compiler_common.machine; if (compiler_common.machine) { SLJIT_FREE(compiler_common.machine, NULL); } if (error) *error = REGEX_MEMORY_ERROR; return NULL; } #undef TERM_OFFSET_OF #undef EMIT_OP1 #undef EMIT_OP2 #undef EMIT_LABEL #undef EMIT_JUMP #undef EMIT_CMP #undef BEGIN_GUARD #undef END_GUARD #undef CHECK void regex_free_machine(struct regex_machine *machine) { sljit_free_code(machine->continue_match); SLJIT_FREE(machine, NULL); } const char* regex_get_platform_name(void) { return sljit_get_platform_name(); } /* --------------------------------------------------------------------- */ /* Mathching utilities */ /* --------------------------------------------------------------------- */ struct regex_match* regex_begin_match(struct regex_machine *machine) { sljit_sw *ptr1; sljit_sw *ptr2; sljit_sw *end; sljit_sw *entry_addrs; struct regex_match *match = (struct regex_match*)SLJIT_MALLOC(sizeof(struct regex_match) + (machine->size * 2 - 1) * sizeof(sljit_sw), NULL); if (!match) return NULL; ptr1 = match->states; ptr2 = match->states + machine->size; end = ptr2; entry_addrs = (sljit_sw*)machine->entry_addrs; match->current = ptr1; match->next = ptr2; match->head = 0; match->machine = machine; /* Init machine states. */ switch (machine->no_states) { case 2: while (ptr1 < end) { *ptr1++ = *entry_addrs; *ptr2++ = *entry_addrs++; *ptr1++ = -1; *ptr2++ = -1; } break; case 3: while (ptr1 < end) { *ptr1++ = *entry_addrs; *ptr2++ = *entry_addrs++; *ptr1++ = -1; *ptr2++ = -1; *ptr1++ = 0; *ptr2++ = 0; } break; case 4: while (ptr1 < end) { *ptr1++ = *entry_addrs; *ptr2++ = *entry_addrs++; *ptr1++ = -1; *ptr2++ = -1; *ptr1++ = 0; *ptr2++ = 0; *ptr1++ = 0; *ptr2++ = 0; } break; default: SLJIT_ASSERT_STOP(); break; } SLJIT_ASSERT(ptr1 == end); match->u.continue_match = machine->continue_match; regex_reset_match(match); return match; } void regex_reset_match(struct regex_match *match) { struct regex_machine *machine = match->machine; sljit_sw current, ind; sljit_sw *current_ptr; match->best_end = 0; match->fast_quit = 0; match->fast_forward = 0; if (match->head != 0) { /* Clear the current state. */ current = match->head; current_ptr = match->current; do { ind = (current / sizeof(sljit_sw)) + 1; current = current_ptr[ind]; current_ptr[ind] = -1; } while (current != 0); } match->head = machine->u.call_init(match->current, match); } void regex_free_match(struct regex_match *match) { SLJIT_FREE(match, NULL); } void regex_continue_match(struct regex_match *match, const regex_char_t *input_string, int length) { match->u.call_continue(match, input_string, length); } int regex_get_result(struct regex_match *match, int *end, int *id) { int flags = match->machine->flags; sljit_sw no_states; *end = match->best_end; *id = match->best_id; if (!(flags & (REGEX_MATCH_END | REGEX_FAKE_MATCH_END))) return match->best_begin; if (flags & REGEX_FAKE_MATCH_END) { SLJIT_ASSERT(!(flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END))); if (match->best_begin != -1) return match->best_begin; no_states = match->machine->no_states; if (match->current[no_states + 1] == -1) return -1; if (flags & REGEX_ID_CHECK) *id = match->current[no_states + 3]; if (!(flags & REGEX_FAKE_MATCH_BEGIN)) *end = match->index - 1; else *end = match->index - 2; return match->current[no_states + 2]; } else { /* Check the status of the last code. */ if (!(flags & REGEX_MATCH_BEGIN)) { /* No shortcut in this case. */ if (!(flags & REGEX_ID_CHECK)) { if (match->current[1] == -1) return -1; *end = match->index - 1; return match->current[2]; } if (match->current[1] == -1) return -1; *end = match->index - 1; *id = match->current[3]; return match->current[2]; } /* Shortcut is possible in this case. */ if (!(flags & REGEX_ID_CHECK)) { if (match->current[1] == -1 || match->head == -1) return -1; *end = match->index - 1; return 0; } if (match->current[1] == -1 || match->head == -1) return -1; *end = match->index - 1; *id = match->current[2]; return 0; } } int regex_is_match_finished(struct regex_match *match) { return match->fast_quit; } #ifdef REGEX_MATCH_VERBOSE void regex_continue_match_debug(struct regex_match *match, const regex_char_t *input_string, int length) { sljit_sw *ptr; sljit_sw *end; sljit_sw count; #if (defined SLJIT_DEBUG && SLJIT_DEBUG) sljit_sw current; #endif sljit_sw no_states = match->machine->no_states; sljit_sw len = match->machine->size; while (length > 0) { match->u.call_continue(match, input_string, 1); if (match->fast_forward) { if (match->machine->flags & REGEX_MATCH_VERBOSE) printf("fast forward\n"); } /* Verbose (first). */ if (match->machine->flags & REGEX_MATCH_VERBOSE) { ptr = match->current; end = ptr + len; count = 0; printf("'%c' (%3ld->%3ld [%3ld]) ", *input_string, (long)match->best_begin, (long)match->best_end, (long)match->best_id); while (ptr < end) { printf("[%3ld:", (long)count++); switch (no_states) { case 2: if (ptr[1] != -1) printf("+] "); else printf(" ] "); break; case 3: if (ptr[1] != -1) printf("+,%3ld] ", (long)ptr[2]); else printf(" ,XXX] "); break; case 4: if (ptr[1] != -1) printf("+,%3ld,%3ld] ", (long)ptr[2], (long)ptr[3]); else printf(" ,XXX,XXX] "); break; } ptr += no_states; } printf("\n"); } #if (defined SLJIT_DEBUG && SLJIT_DEBUG) /* Sanity check (later). */ ptr = match->next; end = ptr + len; while (ptr < end) { SLJIT_ASSERT(ptr[1] == -1); ptr += no_states; } /* Check number of active elements. */ ptr = match->current + no_states; end = ptr + len - no_states; count = 0; while (ptr < end) { if (ptr[1] != -1) count++; ptr += no_states; } /* Check chain list. */ current = match->head; ptr = match->current; while (current != 0) { SLJIT_ASSERT(current >= 0 && current < len * sizeof(sljit_sw)); SLJIT_ASSERT((current % (no_states * sizeof(sljit_sw))) == 0); SLJIT_ASSERT(count > 0); current = ptr[(current / sizeof(sljit_sw)) + 1]; count--; } SLJIT_ASSERT(count == 0); #endif if (match->fast_quit) { /* the machine has stopped working. */ if (match->machine->flags & REGEX_MATCH_VERBOSE) printf("Best match has found\n"); break; } input_string++; length--; } } #endif