From 8349192503450d645bad6a2b30a72c67fd97f7c8 Mon Sep 17 00:00:00 2001 From: Thomas Vigouroux Date: Fri, 15 May 2020 12:23:26 +0200 Subject: [PATCH] treesitter: update runtime Since tree-sitter PR 615, predicates are not parsed the same. "Old" way of writing predicates is still supported. --- src/tree_sitter/language.c | 4 +- src/tree_sitter/language.h | 2 +- src/tree_sitter/parser.c | 28 +- src/tree_sitter/parser.h | 50 +- src/tree_sitter/query.c | 734 +++++++++++++++--------- src/tree_sitter/subtree.c | 12 +- test/functional/lua/treesitter_spec.lua | 6 +- 7 files changed, 524 insertions(+), 312 deletions(-) diff --git a/src/tree_sitter/language.c b/src/tree_sitter/language.c index a396b4b0b6..c00c49e3c0 100644 --- a/src/tree_sitter/language.c +++ b/src/tree_sitter/language.c @@ -33,8 +33,8 @@ void ts_language_table_entry( assert(symbol < self->token_count); uint32_t action_index = ts_language_lookup(self, state, symbol); const TSParseActionEntry *entry = &self->parse_actions[action_index]; - result->action_count = entry->count; - result->is_reusable = entry->reusable; + result->action_count = entry->entry.count; + result->is_reusable = entry->entry.reusable; result->actions = (const TSParseAction *)(entry + 1); } } diff --git a/src/tree_sitter/language.h b/src/tree_sitter/language.h index f908b4593a..341f0f85af 100644 --- a/src/tree_sitter/language.h +++ b/src/tree_sitter/language.h @@ -93,7 +93,7 @@ static inline TSStateId ts_language_next_state(const TSLanguage *self, if (count > 0) { TSParseAction action = actions[count - 1]; if (action.type == TSParseActionTypeShift) { - return action.params.extra ? state : action.params.state; + return action.params.shift.extra ? state : action.params.shift.state; } } return 0; diff --git a/src/tree_sitter/parser.c b/src/tree_sitter/parser.c index d4b227308b..19add152f1 100644 --- a/src/tree_sitter/parser.c +++ b/src/tree_sitter/parser.c @@ -951,15 +951,15 @@ static bool ts_parser__do_all_potential_reductions( switch (action.type) { case TSParseActionTypeShift: case TSParseActionTypeRecover: - if (!action.params.extra && !action.params.repetition) has_shift_action = true; + if (!action.params.shift.extra && !action.params.shift.repetition) has_shift_action = true; break; case TSParseActionTypeReduce: - if (action.params.child_count > 0) + if (action.params.reduce.child_count > 0) ts_reduce_action_set_add(&self->reduce_actions, (ReduceAction){ - .symbol = action.params.symbol, - .count = action.params.child_count, - .dynamic_precedence = action.params.dynamic_precedence, - .production_id = action.params.production_id, + .symbol = action.params.reduce.symbol, + .count = action.params.reduce.child_count, + .dynamic_precedence = action.params.reduce.dynamic_precedence, + .production_id = action.params.reduce.production_id, }); default: break; @@ -1250,7 +1250,7 @@ static void ts_parser__recover( // be counted in error cost calculations. unsigned n; const TSParseAction *actions = ts_language_actions(self->language, 1, ts_subtree_symbol(lookahead), &n); - if (n > 0 && actions[n - 1].type == TSParseActionTypeShift && actions[n - 1].params.extra) { + if (n > 0 && actions[n - 1].type == TSParseActionTypeShift && actions[n - 1].params.shift.extra) { MutableSubtree mutable_lookahead = ts_subtree_make_mut(&self->tree_pool, lookahead); ts_subtree_set_extra(&mutable_lookahead); lookahead = ts_subtree_from_mut(mutable_lookahead); @@ -1379,9 +1379,9 @@ static bool ts_parser__advance( switch (action.type) { case TSParseActionTypeShift: { - if (action.params.repetition) break; + if (action.params.shift.repetition) break; TSStateId next_state; - if (action.params.extra) { + if (action.params.shift.extra) { // TODO: remove when TREE_SITTER_LANGUAGE_VERSION 9 is out. if (state == ERROR_STATE) continue; @@ -1389,7 +1389,7 @@ static bool ts_parser__advance( next_state = state; LOG("shift_extra"); } else { - next_state = action.params.state; + next_state = action.params.shift.state; LOG("shift state:%u", next_state); } @@ -1398,7 +1398,7 @@ static bool ts_parser__advance( next_state = ts_language_next_state(self->language, state, ts_subtree_symbol(lookahead)); } - ts_parser__shift(self, version, next_state, lookahead, action.params.extra); + ts_parser__shift(self, version, next_state, lookahead, action.params.shift.extra); if (did_reuse) reusable_node_advance(&self->reusable_node); return true; } @@ -1406,10 +1406,10 @@ static bool ts_parser__advance( case TSParseActionTypeReduce: { bool is_fragile = table_entry.action_count > 1; bool is_extra = lookahead.ptr == NULL; - LOG("reduce sym:%s, child_count:%u", SYM_NAME(action.params.symbol), action.params.child_count); + LOG("reduce sym:%s, child_count:%u", SYM_NAME(action.params.reduce.symbol), action.params.reduce.child_count); StackVersion reduction_version = ts_parser__reduce( - self, version, action.params.symbol, action.params.child_count, - action.params.dynamic_precedence, action.params.production_id, + self, version, action.params.reduce.symbol, action.params.reduce.child_count, + action.params.reduce.dynamic_precedence, action.params.reduce.production_id, is_fragile, is_extra ); if (reduction_version != STACK_VERSION_NONE) { diff --git a/src/tree_sitter/parser.h b/src/tree_sitter/parser.h index 9df91f8c3c..11bf4fc42a 100644 --- a/src/tree_sitter/parser.h +++ b/src/tree_sitter/parser.h @@ -62,13 +62,13 @@ typedef struct { TSStateId state; bool extra : 1; bool repetition : 1; - }; + } shift; struct { TSSymbol symbol; int16_t dynamic_precedence; uint8_t child_count; uint8_t production_id; - }; + } reduce; } params; TSParseActionType type : 4; } TSParseAction; @@ -83,7 +83,7 @@ typedef union { struct { uint8_t count; bool reusable : 1; - }; + } entry; } TSParseActionEntry; struct TSLanguage { @@ -167,22 +167,28 @@ struct TSLanguage { #define ACTIONS(id) id -#define SHIFT(state_value) \ - { \ - { \ - .type = TSParseActionTypeShift, \ - .params = {.state = state_value}, \ - } \ +#define SHIFT(state_value) \ + { \ + { \ + .params = { \ + .shift = { \ + .state = state_value \ + } \ + }, \ + .type = TSParseActionTypeShift \ + } \ } #define SHIFT_REPEAT(state_value) \ { \ { \ - .type = TSParseActionTypeShift, \ .params = { \ - .state = state_value, \ - .repetition = true \ + .shift = { \ + .state = state_value, \ + .repetition = true \ + } \ }, \ + .type = TSParseActionTypeShift \ } \ } @@ -194,20 +200,26 @@ struct TSLanguage { #define SHIFT_EXTRA() \ { \ { \ - .type = TSParseActionTypeShift, \ - .params = {.extra = true} \ + .params = { \ + .shift = { \ + .extra = true \ + } \ + }, \ + .type = TSParseActionTypeShift \ } \ } #define REDUCE(symbol_val, child_count_val, ...) \ { \ { \ - .type = TSParseActionTypeReduce, \ .params = { \ - .symbol = symbol_val, \ - .child_count = child_count_val, \ - __VA_ARGS__ \ - } \ + .reduce = { \ + .symbol = symbol_val, \ + .child_count = child_count_val, \ + __VA_ARGS__ \ + }, \ + }, \ + .type = TSParseActionTypeReduce \ } \ } diff --git a/src/tree_sitter/query.c b/src/tree_sitter/query.c index 92a8006179..d5fb90075d 100644 --- a/src/tree_sitter/query.c +++ b/src/tree_sitter/query.c @@ -27,7 +27,7 @@ typedef struct { * represented as a sequence of these steps. Fields: * * - `symbol` - The grammar symbol to match. A zero value represents the - * wildcard symbol, '*'. + * wildcard symbol, '_'. * - `field` - The field name to match. A zero value means that a field name * was not specified. * - `capture_id` - An integer representing the name of the capture associated @@ -35,21 +35,21 @@ typedef struct { * captured in this pattern. * - `depth` - The depth where this node occurs in the pattern. The root node * of the pattern has depth zero. - * - `repeat_step_index` - If this step is part of a repetition, the index of - * the beginning of the repetition. A `NONE` value means this step is not - * part of a repetition. + * - `alternative_index` - The index of a different query step that serves as + * an alternative to this step. */ typedef struct { TSSymbol symbol; TSFieldId field; uint16_t capture_ids[MAX_STEP_CAPTURE_COUNT]; - uint16_t repeat_step_index; - uint16_t depth: 11; + uint16_t alternative_index; + uint8_t depth; bool contains_captures: 1; bool is_pattern_start: 1; bool is_immediate: 1; bool is_last: 1; - bool is_repeated: 1; + bool is_placeholder: 1; + bool alternative_is_immediate: 1; } QueryStep; /* @@ -88,7 +88,25 @@ typedef struct { * QueryState - The state of an in-progress match of a particular pattern * in a query. While executing, a `TSQueryCursor` must keep track of a number * of possible in-progress matches. Each of those possible matches is - * represented as one of these states. + * represented as one of these states. Fields: + * - `id` - A numeric id that is exposed to the public API. This allows the + * caller to remove a given match, preventing any more of its captures + * from being returned. + * - `start_depth` - The depth in the tree where the first step of the state's + * pattern was matched. + * - `pattern_index` - The pattern that the state is matching. + * - `consumed_capture_count` - The number of captures from this match that + * have already been returned. + * - `capture_list_id` - A numeric id that can be used to retrieve the state's + * list of captures from the `CaptureListPool`. + * - `seeking_immediate_match` - A flag that indicates that the state's next + * step must be matched by the very next sibling. This is used when + * processing repetitions. + * - `has_in_progress_alternatives` - A flag that indicates that there is are + * other states that have the same captures as this state, but are at + * different steps in their pattern. This means that in order to obey the + * 'longest-match' rule, this state should not be returned as a match until + * it is clear that there can be no longer match. */ typedef struct { uint32_t id; @@ -96,10 +114,9 @@ typedef struct { uint16_t pattern_index; uint16_t step_index; uint16_t consumed_capture_count; - uint16_t repeat_match_count; - uint16_t step_index_on_failure; uint8_t capture_list_id; - bool seeking_non_match; + bool seeking_immediate_match: 1; + bool has_in_progress_alternatives: 1; } QueryState; typedef Array(TSQueryCapture) CaptureList; @@ -417,12 +434,13 @@ static QueryStep query_step__new( .depth = depth, .field = 0, .capture_ids = {NONE, NONE, NONE, NONE}, + .alternative_index = NONE, .contains_captures = false, - .is_repeated = false, .is_last = false, .is_pattern_start = false, + .is_placeholder = false, .is_immediate = is_immediate, - .repeat_step_index = NONE, + .alternative_is_immediate = false, }; } @@ -511,13 +529,14 @@ static inline bool ts_query__pattern_map_search( static inline void ts_query__pattern_map_insert( TSQuery *self, TSSymbol symbol, - uint32_t start_step_index + uint32_t start_step_index, + uint32_t pattern_index ) { uint32_t index; ts_query__pattern_map_search(self, symbol, &index); array_insert(&self->pattern_map, index, ((PatternEntry) { .step_index = start_step_index, - .pattern_index = self->pattern_map.size, + .pattern_index = pattern_index, })); } @@ -548,12 +567,22 @@ static TSQueryError ts_query__parse_predicate( TSQuery *self, Stream *stream ) { - if (stream->next == ')') return PARENT_DONE; - if (stream->next != '(') return TSQueryErrorSyntax; - stream_advance(stream); + if (!stream_is_ident_start(stream)) return TSQueryErrorSyntax; + const char *predicate_name = stream->input; + stream_scan_identifier(stream); + uint32_t length = stream->input - predicate_name; + uint16_t id = symbol_table_insert_name( + &self->predicate_values, + predicate_name, + length + ); + array_back(&self->predicates_by_pattern)->length++; + array_push(&self->predicate_steps, ((TSQueryPredicateStep) { + .type = TSQueryPredicateStepTypeString, + .value_id = id, + })); stream_skip_whitespace(stream); - unsigned step_count = 0; for (;;) { if (stream->next == ')') { stream_advance(stream); @@ -658,7 +687,6 @@ static TSQueryError ts_query__parse_predicate( return TSQueryErrorSyntax; } - step_count++; stream_skip_whitespace(stream); } @@ -684,93 +712,141 @@ static TSQueryError ts_query__parse_pattern( return PARENT_DONE; } - // Parse a parenthesized node expression + // An open parenthesis can be the start of three possible constructs: + // * A grouped sequence + // * A predicate + // * A named node else if (stream->next == '(') { stream_advance(stream); stream_skip_whitespace(stream); - // Parse a nested list, which represents a pattern followed by - // zero-or-more predicates. - if (stream->next == '(' && depth == 0) { - TSQueryError e = ts_query__parse_pattern(self, stream, 0, capture_count, is_immediate); - if (e) return e; - - // Parse the predicates. - stream_skip_whitespace(stream); + // If this parenthesis is followed by a node, then it represents a grouped sequence. + if (stream->next == '(' || stream->next == '"') { + bool child_is_immediate = false; for (;;) { - TSQueryError e = ts_query__parse_predicate(self, stream); - if (e == PARENT_DONE) { + if (stream->next == '.') { + child_is_immediate = true; stream_advance(stream); stream_skip_whitespace(stream); - return 0; + } + TSQueryError e = ts_query__parse_pattern( + self, + stream, + depth, + capture_count, + child_is_immediate + ); + if (e == PARENT_DONE) { + stream_advance(stream); + break; } else if (e) { return e; } + + child_is_immediate = false; } } - TSSymbol symbol; - - // Parse the wildcard symbol - if (stream->next == '*') { - symbol = depth > 0 ? NAMED_WILDCARD_SYMBOL : WILDCARD_SYMBOL; + // A pound character indicates the start of a predicate. + else if (stream->next == '#') { stream_advance(stream); + return ts_query__parse_predicate(self, stream); } - // Parse a normal node name - else if (stream_is_ident_start(stream)) { - const char *node_name = stream->input; - stream_scan_identifier(stream); - uint32_t length = stream->input - node_name; - symbol = ts_language_symbol_for_name( - self->language, - node_name, - length, - true - ); - if (!symbol) { - stream_reset(stream, node_name); - return TSQueryErrorNodeType; - } - } else { - return TSQueryErrorSyntax; - } + // Otherwise, this parenthesis is the start of a named node. + else { + TSSymbol symbol; - // Add a step for the node. - array_push(&self->steps, query_step__new(symbol, depth, is_immediate)); + // Parse the wildcard symbol + if ( + stream->next == '_' || - // Parse the child patterns - stream_skip_whitespace(stream); - bool child_is_immediate = false; - uint16_t child_start_step_index = self->steps.size; - for (;;) { - if (stream->next == '.') { - child_is_immediate = true; + // TODO - remove. + // For temporary backward compatibility, handle '*' as a wildcard. + stream->next == '*' + ) { + symbol = depth > 0 ? NAMED_WILDCARD_SYMBOL : WILDCARD_SYMBOL; stream_advance(stream); - stream_skip_whitespace(stream); } - TSQueryError e = ts_query__parse_pattern( - self, - stream, - depth + 1, - capture_count, - child_is_immediate - ); - if (e == PARENT_DONE) { - if (child_is_immediate) { - self->steps.contents[child_start_step_index].is_last = true; + // Parse a normal node name + else if (stream_is_ident_start(stream)) { + const char *node_name = stream->input; + stream_scan_identifier(stream); + uint32_t length = stream->input - node_name; + + // TODO - remove. + // For temporary backward compatibility, handle predicates without the leading '#' sign. + if (length > 0 && (node_name[length - 1] == '!' || node_name[length - 1] == '?')) { + stream_reset(stream, node_name); + return ts_query__parse_predicate(self, stream); } - stream_advance(stream); - break; - } else if (e) { - return e; + + symbol = ts_language_symbol_for_name( + self->language, + node_name, + length, + true + ); + if (!symbol) { + stream_reset(stream, node_name); + return TSQueryErrorNodeType; + } + } else { + return TSQueryErrorSyntax; } - child_is_immediate = false; + // Add a step for the node. + array_push(&self->steps, query_step__new(symbol, depth, is_immediate)); + + // Parse the child patterns + stream_skip_whitespace(stream); + bool child_is_immediate = false; + uint16_t child_start_step_index = self->steps.size; + for (;;) { + if (stream->next == '.') { + child_is_immediate = true; + stream_advance(stream); + stream_skip_whitespace(stream); + } + + TSQueryError e = ts_query__parse_pattern( + self, + stream, + depth + 1, + capture_count, + child_is_immediate + ); + if (e == PARENT_DONE) { + if (child_is_immediate) { + self->steps.contents[child_start_step_index].is_last = true; + } + stream_advance(stream); + break; + } else if (e) { + return e; + } + + child_is_immediate = false; + } } } + // Parse a wildcard pattern + else if ( + stream->next == '_' || + + // TODO remove. + // For temporary backward compatibility, handle '*' as a wildcard. + stream->next == '*' + ) { + stream_advance(stream); + stream_skip_whitespace(stream); + + // Add a step that matches any kind of node + array_push(&self->steps, query_step__new(WILDCARD_SYMBOL, depth, is_immediate)); + } + // Parse a double-quoted anonymous leaf node expression else if (stream->next == '"') { stream_advance(stream); @@ -842,15 +918,6 @@ static TSQueryError ts_query__parse_pattern( self->steps.contents[step_index].field = field_id; } - // Parse a wildcard pattern - else if (stream->next == '*') { - stream_advance(stream); - stream_skip_whitespace(stream); - - // Add a step that matches any kind of node - array_push(&self->steps, query_step__new(WILDCARD_SYMBOL, depth, is_immediate)); - } - else { return TSQueryErrorSyntax; } @@ -863,9 +930,29 @@ static TSQueryError ts_query__parse_pattern( if (stream->next == '+') { stream_advance(stream); - step->is_repeated = true; - array_back(&self->steps)->repeat_step_index = starting_step_index; stream_skip_whitespace(stream); + QueryStep repeat_step = query_step__new(WILDCARD_SYMBOL, depth, false); + repeat_step.alternative_index = starting_step_index; + repeat_step.is_placeholder = true; + repeat_step.alternative_is_immediate = true; + array_push(&self->steps, repeat_step); + } + + else if (stream->next == '?') { + stream_advance(stream); + stream_skip_whitespace(stream); + step->alternative_index = self->steps.size; + } + + else if (stream->next == '*') { + stream_advance(stream); + stream_skip_whitespace(stream); + QueryStep repeat_step = query_step__new(WILDCARD_SYMBOL, depth, false); + repeat_step.alternative_index = starting_step_index; + repeat_step.is_placeholder = true; + repeat_step.alternative_is_immediate = true; + array_push(&self->steps, repeat_step); + step->alternative_index = self->steps.size; } // Parse an '@'-prefixed capture pattern @@ -950,6 +1037,7 @@ TSQuery *ts_query_new( Stream stream = stream_new(source, source_len); stream_skip_whitespace(&stream); while (stream.input < stream.end) { + uint32_t pattern_index = self->predicates_by_pattern.size; uint32_t start_step_index = self->steps.size; uint32_t capture_count = 0; array_push(&self->start_bytes_by_pattern, stream.input - source); @@ -980,14 +1068,18 @@ TSQuery *ts_query_new( } // Maintain a map that can look up patterns for a given root symbol. - self->steps.contents[start_step_index].is_pattern_start = true; - ts_query__pattern_map_insert( - self, - self->steps.contents[start_step_index].symbol, - start_step_index - ); - if (self->steps.contents[start_step_index].symbol == WILDCARD_SYMBOL) { - self->wildcard_root_pattern_count++; + for (;;) { + QueryStep *step = &self->steps.contents[start_step_index]; + step->is_pattern_start = true; + ts_query__pattern_map_insert(self, step->symbol, start_step_index, pattern_index); + if (step->symbol == WILDCARD_SYMBOL) { + self->wildcard_root_pattern_count++; + } + if (step->alternative_index != NONE) { + start_step_index = step->alternative_index; + } else { + break; + } } } @@ -1191,23 +1283,85 @@ static bool ts_query_cursor__first_in_progress_capture( return result; } -static bool ts_query__cursor_add_state( +// Determine which node is first in a depth-first traversal +int ts_query_cursor__compare_nodes(TSNode left, TSNode right) { + if (left.id != right.id) { + uint32_t left_start = ts_node_start_byte(left); + uint32_t right_start = ts_node_start_byte(right); + if (left_start < right_start) return -1; + if (left_start > right_start) return 1; + uint32_t left_node_count = ts_node_end_byte(left); + uint32_t right_node_count = ts_node_end_byte(right); + if (left_node_count > right_node_count) return -1; + if (left_node_count < right_node_count) return 1; + } + return 0; +} + +// Determine if either state contains a superset of the other state's captures. +void ts_query_cursor__compare_captures( + TSQueryCursor *self, + QueryState *left_state, + QueryState *right_state, + bool *left_contains_right, + bool *right_contains_left +) { + CaptureList *left_captures = capture_list_pool_get( + &self->capture_list_pool, + left_state->capture_list_id + ); + CaptureList *right_captures = capture_list_pool_get( + &self->capture_list_pool, + right_state->capture_list_id + ); + *left_contains_right = true; + *right_contains_left = true; + unsigned i = 0, j = 0; + for (;;) { + if (i < left_captures->size) { + if (j < right_captures->size) { + TSQueryCapture *left = &left_captures->contents[i]; + TSQueryCapture *right = &right_captures->contents[j]; + if (left->node.id == right->node.id && left->index == right->index) { + i++; + j++; + } else { + switch (ts_query_cursor__compare_nodes(left->node, right->node)) { + case -1: + *right_contains_left = false; + i++; + break; + case 1: + *left_contains_right = false; + j++; + break; + default: + *right_contains_left = false; + *left_contains_right = false; + i++; + j++; + break; + } + } + } else { + *right_contains_left = false; + break; + } + } else { + if (j < right_captures->size) { + *left_contains_right = false; + } + break; + } + } +} + +static bool ts_query_cursor__add_state( TSQueryCursor *self, const PatternEntry *pattern ) { QueryStep *step = &self->query->steps.contents[pattern->step_index]; - // If this pattern begins with a repetition, then avoid creating - // new states after already matching the repetition one or more times. - // The query should only one match for the repetition - the one that - // started the earliest. - if (step->is_repeated) { - for (unsigned i = 0; i < self->states.size; i++) { - QueryState *state = &self->states.contents[i]; - if (state->step_index == pattern->step_index) return true; - } - } - uint32_t list_id = capture_list_pool_acquire(&self->capture_list_pool); // If there are no capture lists left in the pool, then terminate whichever @@ -1244,21 +1398,23 @@ static bool ts_query__cursor_add_state( .pattern_index = pattern->pattern_index, .start_depth = self->depth - step->depth, .consumed_capture_count = 0, - .repeat_match_count = 0, - .step_index_on_failure = NONE, - .seeking_non_match = false, + .seeking_immediate_match = false, })); return true; } +// Duplicate the given state and insert the newly-created state immediately after +// the given state in the `states` array. static QueryState *ts_query__cursor_copy_state( TSQueryCursor *self, const QueryState *state ) { uint32_t new_list_id = capture_list_pool_acquire(&self->capture_list_pool); if (new_list_id == NONE) return NULL; - array_push(&self->states, *state); - QueryState *new_state = array_back(&self->states); + uint32_t index = (state - self->states.contents) + 1; + QueryState copy = *state; + array_insert(&self->states, index, copy); + QueryState *new_state = &self->states.contents[index]; new_state->capture_list_id = new_list_id; CaptureList *old_captures = capture_list_pool_get( &self->capture_list_pool, @@ -1281,56 +1437,62 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { if (self->ascending) { LOG("leave node. type:%s\n", ts_node_type(ts_tree_cursor_current_node(&self->cursor))); - // When leaving a node, remove any unfinished states whose next step - // needed to match something within that node. - uint32_t deleted_count = 0; - for (unsigned i = 0, n = self->states.size; i < n; i++) { - QueryState *state = &self->states.contents[i]; - QueryStep *step = &self->query->steps.contents[state->step_index]; - - if ((uint32_t)state->start_depth + (uint32_t)step->depth > self->depth) { - LOG( - " failed to match. pattern:%u, step:%u\n", - state->pattern_index, - state->step_index - ); - - capture_list_pool_release( - &self->capture_list_pool, - state->capture_list_id - ); - deleted_count++; - } else if (deleted_count > 0) { - self->states.contents[i - deleted_count] = *state; - } - } - - self->states.size -= deleted_count; - + // Leave this node by stepping to its next sibling or to its parent. + bool did_move = true; if (ts_tree_cursor_goto_next_sibling(&self->cursor)) { self->ascending = false; } else if (ts_tree_cursor_goto_parent(&self->cursor)) { self->depth--; } else { + did_move = false; + } + + // After leaving a node, remove any states that cannot make further progress. + uint32_t deleted_count = 0; + for (unsigned i = 0, n = self->states.size; i < n; i++) { + QueryState *state = &self->states.contents[i]; + QueryStep *step = &self->query->steps.contents[state->step_index]; + + // If a state completed its pattern inside of this node, but was deferred from finishing + // in order to search for longer matches, mark it as finished. + if (step->depth == PATTERN_DONE_MARKER) { + if (state->start_depth > self->depth || !did_move) { + LOG(" finish pattern %u\n", state->pattern_index); + state->id = self->next_state_id++; + array_push(&self->finished_states, *state); + deleted_count++; + continue; + } + } + + // If a state needed to match something within this node, then remove that state + // as it has failed to match. + else if ((uint32_t)state->start_depth + (uint32_t)step->depth > self->depth) { + LOG( + " failed to match. pattern:%u, step:%u\n", + state->pattern_index, + state->step_index + ); + capture_list_pool_release( + &self->capture_list_pool, + state->capture_list_id + ); + deleted_count++; + continue; + } + + if (deleted_count > 0) { + self->states.contents[i - deleted_count] = *state; + } + } + self->states.size -= deleted_count; + + if (!did_move) { return self->finished_states.size > 0; } } else { - bool has_later_siblings; - bool can_have_later_siblings_with_this_field; - TSFieldId field_id = ts_tree_cursor_current_status( - &self->cursor, - &has_later_siblings, - &can_have_later_siblings_with_this_field - ); + // If this node is before the selected range, then avoid descending into it. TSNode node = ts_tree_cursor_current_node(&self->cursor); - TSSymbol symbol = ts_node_symbol(node); - bool is_named = ts_node_is_named(node); - if (symbol != ts_builtin_sym_error && self->query->symbol_map) { - symbol = self->query->symbol_map[symbol]; - } - - // If this node is before the selected range, then avoid descending - // into it. if ( ts_node_end_byte(node) <= self->start_byte || point_lte(ts_node_end_point(node), self->start_point) @@ -1347,18 +1509,26 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { point_lte(self->end_point, ts_node_start_point(node)) ) return false; + // Get the properties of the current node. + TSSymbol symbol = ts_node_symbol(node); + bool is_named = ts_node_is_named(node); + if (symbol != ts_builtin_sym_error && self->query->symbol_map) { + symbol = self->query->symbol_map[symbol]; + } + bool can_have_later_siblings; + bool can_have_later_siblings_with_this_field; + TSFieldId field_id = ts_tree_cursor_current_status( + &self->cursor, + &can_have_later_siblings, + &can_have_later_siblings_with_this_field + ); LOG( - "enter node. " - "type:%s, field:%s, row:%u state_count:%u, " - "finished_state_count:%u, has_later_siblings:%d, " - "can_have_later_siblings_with_this_field:%d\n", + "enter node. type:%s, field:%s, row:%u state_count:%u, finished_state_count:%u\n", ts_node_type(node), ts_language_field_name_for_id(self->query->language, field_id), ts_node_start_point(node).row, self->states.size, - self->finished_states.size, - has_later_siblings, - can_have_later_siblings_with_this_field + self->finished_states.size ); // Add new states for any patterns whose root node is a wildcard. @@ -1369,7 +1539,7 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { // If this node matches the first step of the pattern, then add a new // state at the start of this pattern. if (step->field && field_id != step->field) continue; - if (!ts_query__cursor_add_state(self, pattern)) break; + if (!ts_query_cursor__add_state(self, pattern)) break; } // Add new states for any patterns whose root node matches this node. @@ -1381,7 +1551,7 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { // If this node matches the first step of the pattern, then add a new // state at the start of this pattern. if (step->field && field_id != step->field) continue; - if (!ts_query__cursor_add_state(self, pattern)) break; + if (!ts_query_cursor__add_state(self, pattern)) break; // Advance to the next pattern whose root node matches this node. i++; @@ -1392,9 +1562,10 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { } // Update all of the in-progress states with current node. - for (unsigned i = 0, n = self->states.size; i < n; i++) { + for (unsigned i = 0; i < self->states.size; i++) { QueryState *state = &self->states.contents[i]; QueryStep *step = &self->query->steps.contents[state->step_index]; + state->has_in_progress_alternatives = false; // Check that the node matches all of the criteria for the next // step of the pattern. @@ -1407,11 +1578,11 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { step->symbol == symbol || step->symbol == WILDCARD_SYMBOL || (step->symbol == NAMED_WILDCARD_SYMBOL && is_named); - bool later_sibling_can_match = has_later_siblings; - if (step->is_immediate && is_named) { + bool later_sibling_can_match = can_have_later_siblings; + if ((step->is_immediate && is_named) || state->seeking_immediate_match) { later_sibling_can_match = false; } - if (step->is_last && has_later_siblings) { + if (step->is_last && can_have_later_siblings) { node_does_match = false; } if (step->field) { @@ -1424,25 +1595,8 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { } } + // Remove states immediately if it is ever clear that they cannot match. if (!node_does_match) { - // If this QueryState has processed a repeating sequence, and that repeating - // sequence has ended, move on to the *next* step of this state's pattern. - if ( - state->step_index_on_failure != NONE && - (!later_sibling_can_match || step->is_repeated) - ) { - LOG( - " finish repetition state. pattern:%u, step:%u\n", - state->pattern_index, - state->step_index - ); - state->step_index = state->step_index_on_failure; - state->step_index_on_failure = NONE; - state->repeat_match_count = 0; - i--; - continue; - } - if (!later_sibling_can_match) { LOG( " discard state. pattern:%u, step:%u\n", @@ -1455,114 +1609,158 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { ); array_erase(&self->states, i); i--; - n--; } - - state->seeking_non_match = false; continue; } - // The `seeking_non_match` flag indicates that a previous QueryState - // has already begun processing this repeating sequence, so that *this* - // QueryState should not begin matching until a separate repeating sequence - // is found. - if (state->seeking_non_match) continue; - - // Some patterns can match their root node in multiple ways, - // capturing different children. If this pattern step could match - // later children within the same parent, then this query state - // cannot simply be updated in place. It must be split into two - // states: one that matches this node, and one which skips over - // this node, to preserve the possibility of matching later - // siblings. - QueryState *next_state = state; + // Some patterns can match their root node in multiple ways, capturing different + // children. If this pattern step could match later children within the same + // parent, then this query state cannot simply be updated in place. It must be + // split into two states: one that matches this node, and one which skips over + // this node, to preserve the possibility of matching later siblings. if ( - !step->is_pattern_start && - step->contains_captures && later_sibling_can_match && - state->repeat_match_count == 0 + !step->is_pattern_start && + step->contains_captures ) { - QueryState *copy = ts_query__cursor_copy_state(self, state); - - // The QueryState that matched this node has begun matching a repeating - // sequence. The QueryState that *skipped* this node should not start - // matching later elements of the same repeating sequence. - if (step->is_repeated) { - state->seeking_non_match = true; - } - - if (copy) { + if (ts_query__cursor_copy_state(self, state)) { LOG( - " split state. pattern:%u, step:%u\n", - copy->pattern_index, - copy->step_index + " split state for capture. pattern:%u, step:%u\n", + state->pattern_index, + state->step_index ); - next_state = copy; - } else { - LOG(" cannot split state.\n"); + i++; } } - // If the current node is captured in this pattern, add it to the - // capture list. + // If the current node is captured in this pattern, add it to the capture list. + CaptureList *capture_list = capture_list_pool_get( + &self->capture_list_pool, + state->capture_list_id + ); for (unsigned j = 0; j < MAX_STEP_CAPTURE_COUNT; j++) { uint16_t capture_id = step->capture_ids[j]; if (step->capture_ids[j] == NONE) break; - CaptureList *capture_list = capture_list_pool_get( - &self->capture_list_pool, - next_state->capture_list_id - ); - array_push(capture_list, ((TSQueryCapture) { - node, - capture_id - })); + array_push(capture_list, ((TSQueryCapture) { node, capture_id })); LOG( " capture node. pattern:%u, capture_id:%u, capture_count:%u\n", - next_state->pattern_index, + state->pattern_index, capture_id, capture_list->size ); } - // If this is the end of a repetition, then jump back to the beginning - // of that repetition. - if (step->repeat_step_index != NONE) { - next_state->step_index_on_failure = next_state->step_index + 1; - next_state->step_index = step->repeat_step_index; - next_state->repeat_match_count++; - LOG( - " continue repeat. pattern:%u, match_count:%u\n", - next_state->pattern_index, - next_state->repeat_match_count - ); - } else { - next_state->step_index++; - LOG( - " advance state. pattern:%u, step:%u\n", - next_state->pattern_index, - next_state->step_index - ); + // Advance this state to the next step of its pattern. + state->step_index++; + state->seeking_immediate_match = false; + LOG( + " advance state. pattern:%u, step:%u\n", + state->pattern_index, + state->step_index + ); - QueryStep *next_step = step + 1; - - // If the pattern is now done, then remove it from the list of - // in-progress states, and add it to the list of finished states. - if (next_step->depth == PATTERN_DONE_MARKER) { - LOG(" finish pattern %u\n", next_state->pattern_index); - - next_state->id = self->next_state_id++; - array_push(&self->finished_states, *next_state); - if (next_state == state) { - array_erase(&self->states, i); - i--; - n--; - } else { - self->states.size--; + // If this state's next step has an 'alternative' step (the step is either optional, + // or is the end of a repetition), then copy the state in order to pursue both + // alternatives. The alternative step itself may have an alternative, so this is + // an interative process. + unsigned start_index = state - self->states.contents; + unsigned end_index = start_index + 1; + for (unsigned j = start_index; j < end_index; j++) { + QueryState *state = &self->states.contents[j]; + QueryStep *next_step = &self->query->steps.contents[state->step_index]; + if (next_step->alternative_index != NONE) { + QueryState *copy = ts_query__cursor_copy_state(self, state); + if (next_step->is_placeholder) { + state->step_index++; + j--; + } + if (copy) { + i++; + end_index++; + copy->step_index = next_step->alternative_index; + if (next_step->alternative_is_immediate) { + copy->seeking_immediate_match = true; + } + LOG( + " split state for branch. pattern:%u, step:%u, step:%u\n", + copy->pattern_index, + state->step_index, + copy->step_index + ); } } } } + for (unsigned i = 0; i < self->states.size; i++) { + QueryState *state = &self->states.contents[i]; + bool did_remove = false; + + // Enfore the longest-match criteria. When a query pattern contains optional or + // repeated nodes, this is necesssary to avoid multiple redundant states, where + // one state has a strict subset of another state's captures. + for (unsigned j = i + 1; j < self->states.size; j++) { + QueryState *other_state = &self->states.contents[j]; + if ( + state->pattern_index == other_state->pattern_index && + state->start_depth == other_state->start_depth + ) { + bool left_contains_right, right_contains_left; + ts_query_cursor__compare_captures( + self, + state, + other_state, + &left_contains_right, + &right_contains_left + ); + if (left_contains_right) { + if (state->step_index == other_state->step_index) { + LOG( + " drop shorter state. pattern: %u, step_index: %u\n", + state->pattern_index, + state->step_index + ); + capture_list_pool_release(&self->capture_list_pool, other_state->capture_list_id); + array_erase(&self->states, j); + j--; + continue; + } + other_state->has_in_progress_alternatives = true; + } + if (right_contains_left) { + if (state->step_index == other_state->step_index) { + LOG( + " drop shorter state. pattern: %u, step_index: %u\n", + state->pattern_index, + state->step_index + ); + capture_list_pool_release(&self->capture_list_pool, state->capture_list_id); + array_erase(&self->states, i); + did_remove = true; + break; + } + state->has_in_progress_alternatives = true; + } + } + } + + // If there the state is at the end of its pattern, remove it from the list + // of in-progress states and add it to the list of finished states. + if (!did_remove) { + QueryStep *next_step = &self->query->steps.contents[state->step_index]; + if (next_step->depth == PATTERN_DONE_MARKER) { + if (state->has_in_progress_alternatives) { + LOG(" defer finishing pattern %u\n", state->pattern_index); + } else { + LOG(" finish pattern %u\n", state->pattern_index); + state->id = self->next_state_id++; + array_push(&self->finished_states, *state); + array_erase(&self->states, state - self->states.contents); + i--; + } + } + } + } // Continue descending if possible. if (ts_tree_cursor_goto_first_child(&self->cursor)) { diff --git a/src/tree_sitter/subtree.c b/src/tree_sitter/subtree.c index b98f172339..ef92a32fe4 100644 --- a/src/tree_sitter/subtree.c +++ b/src/tree_sitter/subtree.c @@ -21,7 +21,7 @@ typedef struct { #define TS_MAX_INLINE_TREE_LENGTH UINT8_MAX #define TS_MAX_TREE_POOL_SIZE 32 -static const ExternalScannerState empty_state = {.length = 0, .short_data = {0}}; +static const ExternalScannerState empty_state = {{.short_data = {0}}, .length = 0}; // ExternalScannerState @@ -208,7 +208,7 @@ Subtree ts_subtree_new_leaf( .has_external_tokens = has_external_tokens, .is_missing = false, .is_keyword = is_keyword, - .first_leaf = {.symbol = 0, .parse_state = 0}, + {{.first_leaf = {.symbol = 0, .parse_state = 0}}} }; return (Subtree) {.ptr = data}; } @@ -464,15 +464,17 @@ MutableSubtree ts_subtree_new_node(SubtreePool *pool, TSSymbol symbol, *data = (SubtreeHeapData) { .ref_count = 1, .symbol = symbol, - .production_id = production_id, .visible = metadata.visible, .named = metadata.named, .has_changes = false, .fragile_left = fragile, .fragile_right = fragile, .is_keyword = false, - .node_count = 0, - .first_leaf = {.symbol = 0, .parse_state = 0}, + {{ + .node_count = 0, + .production_id = production_id, + .first_leaf = {.symbol = 0, .parse_state = 0}, + }} }; MutableSubtree result = {.ptr = data}; ts_subtree_set_children(result, children->contents, children->size, language); diff --git a/test/functional/lua/treesitter_spec.lua b/test/functional/lua/treesitter_spec.lua index f93185d1f6..cb0e46b9eb 100644 --- a/test/functional/lua/treesitter_spec.lua +++ b/test/functional/lua/treesitter_spec.lua @@ -240,15 +240,15 @@ static int nlua_schedule(lua_State *const lstate) ; TODO(bfredl): overlapping matches are unreliable, ; we need a proper priority mechanism ;(type_identifier) @type -((type_identifier) @Special (eq? @Special "LuaRef")) +((type_identifier) @Special (#eq? @Special "LuaRef")) (primitive_type) @type (sized_type_specifier) @type ; defaults to very magic syntax, for best compatibility -((identifier) @Identifier (match? @Identifier "^l(u)a_")) +((identifier) @Identifier (#match? @Identifier "^l(u)a_")) ; still support \M etc prefixes -((identifier) @Constant (match? @Constant "\M^\[A-Z_]\+$")) +((identifier) @Constant (#match? @Constant "\M^\[A-Z_]\+$")) ((binary_expression left: (identifier) @WarningMsg.left right: (identifier) @WarningMsg.right) (eq? @WarningMsg.left @WarningMsg.right))