From 05f856f3142698a6d34f9879897849a5d3a9f329 Mon Sep 17 00:00:00 2001 From: Edouard CHIN Date: Wed, 25 Mar 2026 13:28:57 +0100 Subject: [PATCH 01/12] [ruby/rubygems] This new suite isn't running on windows: - This new suite needs to be added to a group in order to be picked up. I saw the warning on CI ./spec/bundler/errors_spec.rb is not assigned to any Windows runner group. see spec/support/windows_tag_group.rb for details. https://github.com/ruby/rubygems/commit/ed87214b37 --- spec/bundler/support/windows_tag_group.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/spec/bundler/support/windows_tag_group.rb b/spec/bundler/support/windows_tag_group.rb index bd6acb9d55ca2a..f1a78f23e8d37e 100644 --- a/spec/bundler/support/windows_tag_group.rb +++ b/spec/bundler/support/windows_tag_group.rb @@ -33,6 +33,7 @@ module WindowsTagGroup "spec/bundler/source/git/git_proxy_spec.rb", "spec/bundler/source_list_spec.rb", "spec/bundler/plugin/installer_spec.rb", + "spec/bundler/errors_spec.rb", "spec/bundler/friendly_errors_spec.rb", "spec/resolver/platform_spec.rb", "spec/bundler/fetcher/downloader_spec.rb", From d88aca8aca2be9da86da4f42c4b5d5e2bbb5363a Mon Sep 17 00:00:00 2001 From: Earlopain <14981592+Earlopain@users.noreply.github.com> Date: Wed, 25 Mar 2026 09:35:49 +0100 Subject: [PATCH 02/12] [ruby/prism] Add `keyword_loc` to `ForwardingSuperNode` They can take a block and so the keyword location is not always the entirety of the node. You can get it by constructing a location that is just the first 5 characters but why not have it be provided by prism? 
https://github.com/ruby/prism/commit/878d79eb8c --- prism/config.yml | 8 ++++++++ prism/prism.c | 1 + 2 files changed, 9 insertions(+) diff --git a/prism/config.yml b/prism/config.yml index ef970eba9d356d..7c283741d3b52c 100644 --- a/prism/config.yml +++ b/prism/config.yml @@ -2627,6 +2627,14 @@ nodes: end - name: ForwardingSuperNode fields: + - name: keyword_loc + type: location + comment: | + super + ^^^^^ + + super { 123 } + ^^^^^ - name: block type: node? kind: BlockNode diff --git a/prism/prism.c b/prism/prism.c index d68c83f1e34c18..3b475657808805 100644 --- a/prism/prism.c +++ b/prism/prism.c @@ -4379,6 +4379,7 @@ pm_forwarding_super_node_create(pm_parser_t *parser, const pm_token_t *token, pm ++parser->node_id, 0, (block == NULL) ? PM_LOCATION_INIT_TOKEN(parser, token) : PM_LOCATION_INIT_TOKEN_NODE(parser, token, block), + PM_LOCATION_INIT_TOKEN(parser, token), block ); } From 3a5536aaf7aa511cda485ccbd45b2373c1403b49 Mon Sep 17 00:00:00 2001 From: Kevin Newton Date: Tue, 24 Mar 2026 11:53:12 -0400 Subject: [PATCH 03/12] [ruby/prism] Pull out parse_def from parse_expression_prefix https://github.com/ruby/prism/commit/f77dc15864 --- prism/prism.c | 735 +++++++++++++++++++++++++------------------------- 1 file changed, 368 insertions(+), 367 deletions(-) diff --git a/prism/prism.c b/prism/prism.c index 3b475657808805..743683f1ec29b6 100644 --- a/prism/prism.c +++ b/prism/prism.c @@ -17769,6 +17769,372 @@ pm_block_call_p(const pm_node_t *node) { return false; } +/** + * Parse a method definition expression (the `def` keyword). 
+ */ +static pm_node_t * +parse_def(pm_parser_t *parser, pm_binding_power_t binding_power, uint8_t flags, uint16_t depth) { + pm_node_list_t current_block_exits = { 0 }; + pm_node_list_t *previous_block_exits = push_block_exits(parser, &current_block_exits); + + pm_token_t def_keyword = parser->current; + size_t opening_newline_index = token_newline_index(parser); + + pm_node_t *receiver = NULL; + pm_token_t operator = { 0 }; + pm_token_t name; + + /* This context is necessary for lexing `...` in a bare params correctly. It + * must be pushed before lexing the first param, so it is here. */ + context_push(parser, PM_CONTEXT_DEF_PARAMS); + parser_lex(parser); + + /* This will be false if the method name is not a valid identifier but could + * be followed by an operator. */ + bool valid_name = true; + + switch (parser->current.type) { + case PM_CASE_OPERATOR: + pm_parser_scope_push(parser, true); + lex_state_set(parser, PM_LEX_STATE_ENDFN); + parser_lex(parser); + + name = parser->previous; + break; + case PM_TOKEN_IDENTIFIER: { + parser_lex(parser); + + if (match2(parser, PM_TOKEN_DOT, PM_TOKEN_COLON_COLON)) { + receiver = parse_variable_call(parser); + + pm_parser_scope_push(parser, true); + lex_state_set(parser, PM_LEX_STATE_FNAME); + parser_lex(parser); + + operator = parser->previous; + name = parse_method_definition_name(parser); + } else { + pm_refute_numbered_parameter(parser, PM_TOKEN_START(parser, &parser->previous), PM_TOKEN_LENGTH(&parser->previous)); + pm_parser_scope_push(parser, true); + + name = parser->previous; + } + + break; + } + case PM_TOKEN_INSTANCE_VARIABLE: + case PM_TOKEN_CLASS_VARIABLE: + case PM_TOKEN_GLOBAL_VARIABLE: + valid_name = false; + PRISM_FALLTHROUGH + case PM_TOKEN_CONSTANT: + case PM_TOKEN_KEYWORD_NIL: + case PM_TOKEN_KEYWORD_SELF: + case PM_TOKEN_KEYWORD_TRUE: + case PM_TOKEN_KEYWORD_FALSE: + case PM_TOKEN_KEYWORD___FILE__: + case PM_TOKEN_KEYWORD___LINE__: + case PM_TOKEN_KEYWORD___ENCODING__: { + pm_parser_scope_push(parser, true); 
+ parser_lex(parser); + + pm_token_t identifier = parser->previous; + + if (match2(parser, PM_TOKEN_DOT, PM_TOKEN_COLON_COLON)) { + lex_state_set(parser, PM_LEX_STATE_FNAME); + parser_lex(parser); + operator = parser->previous; + + switch (identifier.type) { + case PM_TOKEN_CONSTANT: + receiver = UP(pm_constant_read_node_create(parser, &identifier)); + break; + case PM_TOKEN_INSTANCE_VARIABLE: + receiver = UP(pm_instance_variable_read_node_create(parser, &identifier)); + break; + case PM_TOKEN_CLASS_VARIABLE: + receiver = UP(pm_class_variable_read_node_create(parser, &identifier)); + break; + case PM_TOKEN_GLOBAL_VARIABLE: + receiver = UP(pm_global_variable_read_node_create(parser, &identifier)); + break; + case PM_TOKEN_KEYWORD_NIL: + receiver = UP(pm_nil_node_create(parser, &identifier)); + break; + case PM_TOKEN_KEYWORD_SELF: + receiver = UP(pm_self_node_create(parser, &identifier)); + break; + case PM_TOKEN_KEYWORD_TRUE: + receiver = UP(pm_true_node_create(parser, &identifier)); + break; + case PM_TOKEN_KEYWORD_FALSE: + receiver = UP(pm_false_node_create(parser, &identifier)); + break; + case PM_TOKEN_KEYWORD___FILE__: + receiver = UP(pm_source_file_node_create(parser, &identifier)); + break; + case PM_TOKEN_KEYWORD___LINE__: + receiver = UP(pm_source_line_node_create(parser, &identifier)); + break; + case PM_TOKEN_KEYWORD___ENCODING__: + receiver = UP(pm_source_encoding_node_create(parser, &identifier)); + break; + default: + break; + } + + name = parse_method_definition_name(parser); + } else { + if (!valid_name) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, &identifier, PM_ERR_DEF_NAME, pm_token_str(identifier.type)); + } + + name = identifier; + } + break; + } + case PM_TOKEN_PARENTHESIS_LEFT: { + /* The current context is `PM_CONTEXT_DEF_PARAMS`, however the inner + * expression of this parenthesis should not be processed under this + * context. Thus, the context is popped here. 
*/ + context_pop(parser); + parser_lex(parser); + + pm_token_t lparen = parser->previous; + pm_node_t *expression = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, (flags & PM_PARSE_ACCEPTS_DO_BLOCK) | PM_PARSE_ACCEPTS_COMMAND_CALL, PM_ERR_DEF_RECEIVER, (uint16_t) (depth + 1)); + + accept1(parser, PM_TOKEN_NEWLINE); + expect1(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_ERR_EXPECT_RPAREN); + pm_token_t rparen = parser->previous; + + lex_state_set(parser, PM_LEX_STATE_FNAME); + expect2(parser, PM_TOKEN_DOT, PM_TOKEN_COLON_COLON, PM_ERR_DEF_RECEIVER_TERM); + + operator = parser->previous; + receiver = UP(pm_parentheses_node_create(parser, &lparen, expression, &rparen, 0)); + + /* To push `PM_CONTEXT_DEF_PARAMS` again is for the same reason as + * described the above. */ + pm_parser_scope_push(parser, true); + context_push(parser, PM_CONTEXT_DEF_PARAMS); + name = parse_method_definition_name(parser); + break; + } + default: + pm_parser_scope_push(parser, true); + name = parse_method_definition_name(parser); + break; + } + + pm_token_t lparen = { 0 }; + pm_token_t rparen = { 0 }; + pm_parameters_node_t *params; + + bool accept_endless_def = true; + switch (parser->current.type) { + case PM_TOKEN_PARENTHESIS_LEFT: { + parser_lex(parser); + lparen = parser->previous; + + if (match1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) { + params = NULL; + } else { + /* https://bugs.ruby-lang.org/issues/19107 */ + bool allow_trailing_comma = parser->version >= PM_OPTIONS_VERSION_CRUBY_4_1; + params = parse_parameters( + parser, + PM_BINDING_POWER_DEFINED, + true, + allow_trailing_comma, + true, + true, + false, + PM_ERR_ARGUMENT_NO_FORWARDING_ELLIPSES, + (uint16_t) (depth + 1) + ); + } + + lex_state_set(parser, PM_LEX_STATE_BEG); + parser->command_start = true; + + context_pop(parser); + if (!accept1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_DEF_PARAMS_TERM_PAREN, pm_token_str(parser->current.type)); + 
parser->previous.start = parser->previous.end; + parser->previous.type = 0; + } + + rparen = parser->previous; + break; + } + case PM_CASE_PARAMETER: { + /* If we're about to lex a label, we need to add the label state to + * make sure the next newline is ignored. */ + if (parser->current.type == PM_TOKEN_LABEL) { + lex_state_set(parser, parser->lex_state | PM_LEX_STATE_LABEL); + } + + params = parse_parameters( + parser, + PM_BINDING_POWER_DEFINED, + false, + false, + true, + true, + false, + PM_ERR_ARGUMENT_NO_FORWARDING_ELLIPSES, + (uint16_t) (depth + 1) + ); + + /* Reject `def * = 1` and similar. We have to specifically check for + * them because they create ambiguity with optional arguments. */ + accept_endless_def = false; + + context_pop(parser); + break; + } + default: { + params = NULL; + context_pop(parser); + break; + } + } + + pm_node_t *statements = NULL; + pm_token_t equal = { 0 }; + pm_token_t end_keyword = { 0 }; + + if (accept1(parser, PM_TOKEN_EQUAL)) { + if (token_is_setter_name(&name)) { + pm_parser_err_token(parser, &name, PM_ERR_DEF_ENDLESS_SETTER); + } + if (!accept_endless_def) { + pm_parser_err_previous(parser, PM_ERR_DEF_ENDLESS_PARAMETERS); + } + if ( + parser->current_context->context == PM_CONTEXT_DEFAULT_PARAMS && + parser->current_context->prev->context == PM_CONTEXT_BLOCK_PARAMETERS + ) { + PM_PARSER_ERR_FORMAT(parser, PM_TOKEN_START(parser, &def_keyword), PM_TOKENS_LENGTH(&def_keyword, &parser->previous), PM_ERR_UNEXPECTED_PARAMETER_DEFAULT_VALUE, "endless method definition"); + } + equal = parser->previous; + + context_push(parser, PM_CONTEXT_DEF); + pm_do_loop_stack_push(parser, false); + statements = UP(pm_statements_node_create(parser)); + + uint8_t allow_flags; + if (parser->version >= PM_OPTIONS_VERSION_CRUBY_4_0) { + allow_flags = flags & PM_PARSE_ACCEPTS_COMMAND_CALL; + } else { + /* Allow `def foo = puts "Hello"` but not + * `private def foo = puts "Hello"` */ + allow_flags = (binding_power == PM_BINDING_POWER_ASSIGNMENT || 
binding_power < PM_BINDING_POWER_COMPOSITION) ? PM_PARSE_ACCEPTS_COMMAND_CALL : 0; + } + + /* Inside a def body, we push true onto the accepts_block_stack so that + * `do` is lexed as PM_TOKEN_KEYWORD_DO (which can only start a block + * for primary-level constructs, not commands). During command argument + * parsing, the stack is pushed to false, causing `do` to be lexed as + * PM_TOKEN_KEYWORD_DO_BLOCK, which is not consumed inside the endless + * def body and instead left for the outer context. */ + pm_accepts_block_stack_push(parser, true); + pm_node_t *statement = parse_expression(parser, PM_BINDING_POWER_DEFINED + 1, allow_flags | PM_PARSE_IN_ENDLESS_DEF, PM_ERR_DEF_ENDLESS, (uint16_t) (depth + 1)); + pm_accepts_block_stack_pop(parser); + + /* If an unconsumed PM_TOKEN_KEYWORD_DO follows the body, it is an error + * (e.g., `def f = 1 do end`). PM_TOKEN_KEYWORD_DO_BLOCK is + * intentionally not caught here — it should bubble up to the outer + * context (e.g., `private def f = puts "Hello" do end` where the block + * attaches to `private`). */ + if (accept1(parser, PM_TOKEN_KEYWORD_DO)) { + pm_block_node_t *block = parse_block(parser, (uint16_t) (depth + 1)); + pm_parser_err_node(parser, UP(block), PM_ERR_DEF_ENDLESS_DO_BLOCK); + } + + if (accept1(parser, PM_TOKEN_KEYWORD_RESCUE_MODIFIER)) { + context_push(parser, PM_CONTEXT_RESCUE_MODIFIER); + + pm_token_t rescue_keyword = parser->previous; + + /* In the Ruby grammar, the rescue value of an endless method + * command excludes and/or and in/=>. */ + pm_node_t *value = parse_expression(parser, PM_BINDING_POWER_MATCH + 1, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_RESCUE_MODIFIER_VALUE, (uint16_t) (depth + 1)); + context_pop(parser); + + statement = UP(pm_rescue_modifier_node_create(parser, statement, &rescue_keyword, value)); + } + + /* A nested endless def whose body is a command call (e.g., + * `def f = def g = foo bar`) is a command assignment and cannot appear + * as a def body. 
*/ + if (PM_NODE_TYPE_P(statement, PM_DEF_NODE) && pm_command_call_value_p(statement)) { + PM_PARSER_ERR_NODE_FORMAT(parser, statement, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_str(parser->current.type)); + } + + pm_statements_node_body_append(parser, (pm_statements_node_t *) statements, statement, false); + pm_do_loop_stack_pop(parser); + context_pop(parser); + } else { + if (lparen.start == NULL) { + lex_state_set(parser, PM_LEX_STATE_BEG); + parser->command_start = true; + expect2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON, PM_ERR_DEF_PARAMS_TERM); + } else { + accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON); + } + + pm_accepts_block_stack_push(parser, true); + pm_do_loop_stack_push(parser, false); + + if (!match4(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE, PM_TOKEN_KEYWORD_ELSE, PM_TOKEN_KEYWORD_END)) { + pm_accepts_block_stack_push(parser, true); + statements = UP(parse_statements(parser, PM_CONTEXT_DEF, (uint16_t) (depth + 1))); + pm_accepts_block_stack_pop(parser); + } + + if (match3(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE, PM_TOKEN_KEYWORD_ELSE)) { + assert(statements == NULL || PM_NODE_TYPE_P(statements, PM_STATEMENTS_NODE)); + statements = UP(parse_rescues_implicit_begin(parser, opening_newline_index, &def_keyword, def_keyword.start, (pm_statements_node_t *) statements, PM_RESCUES_DEF, (uint16_t) (depth + 1))); + } else { + parser_warn_indentation_mismatch(parser, opening_newline_index, &def_keyword, false, false); + } + + pm_accepts_block_stack_pop(parser); + pm_do_loop_stack_pop(parser); + + expect1_opening(parser, PM_TOKEN_KEYWORD_END, PM_ERR_DEF_TERM, &def_keyword); + end_keyword = parser->previous; + } + + pm_constant_id_list_t locals; + pm_locals_order(parser, &parser->current_scope->locals, &locals, false); + pm_parser_scope_pop(parser); + + /* If the final character is `@` as is the case when defining methods to + * override the unary operators, we should ignore the @ in the same way we + * do for symbols. 
*/ + pm_constant_id_t name_id = pm_parser_constant_id_raw(parser, name.start, parse_operator_symbol_name(&name)); + + flush_block_exits(parser, previous_block_exits); + + return UP(pm_def_node_create( + parser, + name_id, + &name, + receiver, + params, + statements, + &locals, + &def_keyword, + NTOK2PTR(operator), + NTOK2PTR(lparen), + NTOK2PTR(rparen), + NTOK2PTR(equal), + NTOK2PTR(end_keyword) + )); +} + /** * Parse an expression that begins with the previous node that we just lexed. */ @@ -18988,373 +19354,8 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, u pop_block_exits(parser, previous_block_exits); return UP(pm_class_node_create(parser, &locals, &class_keyword, constant_path, &name, NTOK2PTR(inheritance_operator), superclass, statements, &parser->previous)); } - case PM_TOKEN_KEYWORD_DEF: { - pm_node_list_t current_block_exits = { 0 }; - pm_node_list_t *previous_block_exits = push_block_exits(parser, &current_block_exits); - - pm_token_t def_keyword = parser->current; - size_t opening_newline_index = token_newline_index(parser); - - pm_node_t *receiver = NULL; - pm_token_t operator = { 0 }; - pm_token_t name; - - // This context is necessary for lexing `...` in a bare params - // correctly. It must be pushed before lexing the first param, so it - // is here. - context_push(parser, PM_CONTEXT_DEF_PARAMS); - parser_lex(parser); - - // This will be false if the method name is not a valid identifier - // but could be followed by an operator. 
- bool valid_name = true; - - switch (parser->current.type) { - case PM_CASE_OPERATOR: - pm_parser_scope_push(parser, true); - lex_state_set(parser, PM_LEX_STATE_ENDFN); - parser_lex(parser); - - name = parser->previous; - break; - case PM_TOKEN_IDENTIFIER: { - parser_lex(parser); - - if (match2(parser, PM_TOKEN_DOT, PM_TOKEN_COLON_COLON)) { - receiver = parse_variable_call(parser); - - pm_parser_scope_push(parser, true); - lex_state_set(parser, PM_LEX_STATE_FNAME); - parser_lex(parser); - - operator = parser->previous; - name = parse_method_definition_name(parser); - } else { - pm_refute_numbered_parameter(parser, PM_TOKEN_START(parser, &parser->previous), PM_TOKEN_LENGTH(&parser->previous)); - pm_parser_scope_push(parser, true); - - name = parser->previous; - } - - break; - } - case PM_TOKEN_INSTANCE_VARIABLE: - case PM_TOKEN_CLASS_VARIABLE: - case PM_TOKEN_GLOBAL_VARIABLE: - valid_name = false; - PRISM_FALLTHROUGH - case PM_TOKEN_CONSTANT: - case PM_TOKEN_KEYWORD_NIL: - case PM_TOKEN_KEYWORD_SELF: - case PM_TOKEN_KEYWORD_TRUE: - case PM_TOKEN_KEYWORD_FALSE: - case PM_TOKEN_KEYWORD___FILE__: - case PM_TOKEN_KEYWORD___LINE__: - case PM_TOKEN_KEYWORD___ENCODING__: { - pm_parser_scope_push(parser, true); - parser_lex(parser); - - pm_token_t identifier = parser->previous; - - if (match2(parser, PM_TOKEN_DOT, PM_TOKEN_COLON_COLON)) { - lex_state_set(parser, PM_LEX_STATE_FNAME); - parser_lex(parser); - operator = parser->previous; - - switch (identifier.type) { - case PM_TOKEN_CONSTANT: - receiver = UP(pm_constant_read_node_create(parser, &identifier)); - break; - case PM_TOKEN_INSTANCE_VARIABLE: - receiver = UP(pm_instance_variable_read_node_create(parser, &identifier)); - break; - case PM_TOKEN_CLASS_VARIABLE: - receiver = UP(pm_class_variable_read_node_create(parser, &identifier)); - break; - case PM_TOKEN_GLOBAL_VARIABLE: - receiver = UP(pm_global_variable_read_node_create(parser, &identifier)); - break; - case PM_TOKEN_KEYWORD_NIL: - receiver = 
UP(pm_nil_node_create(parser, &identifier)); - break; - case PM_TOKEN_KEYWORD_SELF: - receiver = UP(pm_self_node_create(parser, &identifier)); - break; - case PM_TOKEN_KEYWORD_TRUE: - receiver = UP(pm_true_node_create(parser, &identifier)); - break; - case PM_TOKEN_KEYWORD_FALSE: - receiver = UP(pm_false_node_create(parser, &identifier)); - break; - case PM_TOKEN_KEYWORD___FILE__: - receiver = UP(pm_source_file_node_create(parser, &identifier)); - break; - case PM_TOKEN_KEYWORD___LINE__: - receiver = UP(pm_source_line_node_create(parser, &identifier)); - break; - case PM_TOKEN_KEYWORD___ENCODING__: - receiver = UP(pm_source_encoding_node_create(parser, &identifier)); - break; - default: - break; - } - - name = parse_method_definition_name(parser); - } else { - if (!valid_name) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, &identifier, PM_ERR_DEF_NAME, pm_token_str(identifier.type)); - } - - name = identifier; - } - break; - } - case PM_TOKEN_PARENTHESIS_LEFT: { - // The current context is `PM_CONTEXT_DEF_PARAMS`, however - // the inner expression of this parenthesis should not be - // processed under this context. Thus, the context is popped - // here. - context_pop(parser); - parser_lex(parser); - - pm_token_t lparen = parser->previous; - pm_node_t *expression = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, (flags & PM_PARSE_ACCEPTS_DO_BLOCK) | PM_PARSE_ACCEPTS_COMMAND_CALL, PM_ERR_DEF_RECEIVER, (uint16_t) (depth + 1)); - - accept1(parser, PM_TOKEN_NEWLINE); - expect1(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_ERR_EXPECT_RPAREN); - pm_token_t rparen = parser->previous; - - lex_state_set(parser, PM_LEX_STATE_FNAME); - expect2(parser, PM_TOKEN_DOT, PM_TOKEN_COLON_COLON, PM_ERR_DEF_RECEIVER_TERM); - - operator = parser->previous; - receiver = UP(pm_parentheses_node_create(parser, &lparen, expression, &rparen, 0)); - - // To push `PM_CONTEXT_DEF_PARAMS` again is for the same - // reason as described the above. 
- pm_parser_scope_push(parser, true); - context_push(parser, PM_CONTEXT_DEF_PARAMS); - name = parse_method_definition_name(parser); - break; - } - default: - pm_parser_scope_push(parser, true); - name = parse_method_definition_name(parser); - break; - } - - pm_token_t lparen = { 0 }; - pm_token_t rparen = { 0 }; - pm_parameters_node_t *params; - - bool accept_endless_def = true; - switch (parser->current.type) { - case PM_TOKEN_PARENTHESIS_LEFT: { - parser_lex(parser); - lparen = parser->previous; - - if (match1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) { - params = NULL; - } else { - // https://bugs.ruby-lang.org/issues/19107 - bool allow_trailing_comma = parser->version >= PM_OPTIONS_VERSION_CRUBY_4_1; - params = parse_parameters( - parser, - PM_BINDING_POWER_DEFINED, - true, - allow_trailing_comma, - true, - true, - false, - PM_ERR_ARGUMENT_NO_FORWARDING_ELLIPSES, - (uint16_t) (depth + 1) - ); - } - - lex_state_set(parser, PM_LEX_STATE_BEG); - parser->command_start = true; - - context_pop(parser); - if (!accept1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_DEF_PARAMS_TERM_PAREN, pm_token_str(parser->current.type)); - parser->previous.start = parser->previous.end; - parser->previous.type = 0; - } - - rparen = parser->previous; - break; - } - case PM_CASE_PARAMETER: { - // If we're about to lex a label, we need to add the label - // state to make sure the next newline is ignored. - if (parser->current.type == PM_TOKEN_LABEL) { - lex_state_set(parser, parser->lex_state | PM_LEX_STATE_LABEL); - } - - params = parse_parameters( - parser, - PM_BINDING_POWER_DEFINED, - false, - false, - true, - true, - false, - PM_ERR_ARGUMENT_NO_FORWARDING_ELLIPSES, - (uint16_t) (depth + 1) - ); - - // Reject `def * = 1` and similar. We have to specifically check - // for them because they create ambiguity with optional arguments. 
- accept_endless_def = false; - - context_pop(parser); - break; - } - default: { - params = NULL; - context_pop(parser); - break; - } - } - - pm_node_t *statements = NULL; - pm_token_t equal = { 0 }; - pm_token_t end_keyword = { 0 }; - - if (accept1(parser, PM_TOKEN_EQUAL)) { - if (token_is_setter_name(&name)) { - pm_parser_err_token(parser, &name, PM_ERR_DEF_ENDLESS_SETTER); - } - if (!accept_endless_def) { - pm_parser_err_previous(parser, PM_ERR_DEF_ENDLESS_PARAMETERS); - } - if ( - parser->current_context->context == PM_CONTEXT_DEFAULT_PARAMS && - parser->current_context->prev->context == PM_CONTEXT_BLOCK_PARAMETERS - ) { - PM_PARSER_ERR_FORMAT(parser, PM_TOKEN_START(parser, &def_keyword), PM_TOKENS_LENGTH(&def_keyword, &parser->previous), PM_ERR_UNEXPECTED_PARAMETER_DEFAULT_VALUE, "endless method definition"); - } - equal = parser->previous; - - context_push(parser, PM_CONTEXT_DEF); - pm_do_loop_stack_push(parser, false); - statements = UP(pm_statements_node_create(parser)); - - uint8_t allow_flags; - if (parser->version >= PM_OPTIONS_VERSION_CRUBY_4_0) { - allow_flags = flags & PM_PARSE_ACCEPTS_COMMAND_CALL; - } else { - // Allow `def foo = puts "Hello"` but not `private def foo = puts "Hello"` - allow_flags = (binding_power == PM_BINDING_POWER_ASSIGNMENT || binding_power < PM_BINDING_POWER_COMPOSITION) ? PM_PARSE_ACCEPTS_COMMAND_CALL : 0; - } - - // Inside a def body, we push true onto the - // accepts_block_stack so that `do` is lexed as - // PM_TOKEN_KEYWORD_DO (which can only start a block for - // primary-level constructs, not commands). During command - // argument parsing, the stack is pushed to false, causing - // `do` to be lexed as PM_TOKEN_KEYWORD_DO_BLOCK, which - // is not consumed inside the endless def body and instead - // left for the outer context. 
- pm_accepts_block_stack_push(parser, true); - pm_node_t *statement = parse_expression(parser, PM_BINDING_POWER_DEFINED + 1, allow_flags | PM_PARSE_IN_ENDLESS_DEF, PM_ERR_DEF_ENDLESS, (uint16_t) (depth + 1)); - pm_accepts_block_stack_pop(parser); - - // If an unconsumed PM_TOKEN_KEYWORD_DO follows the body, - // it is an error (e.g., `def f = 1 do end`). - // PM_TOKEN_KEYWORD_DO_BLOCK is intentionally not caught - // here — it should bubble up to the outer context (e.g., - // `private def f = puts "Hello" do end` where the block - // attaches to `private`). - if (accept1(parser, PM_TOKEN_KEYWORD_DO)) { - pm_block_node_t *block = parse_block(parser, (uint16_t) (depth + 1)); - pm_parser_err_node(parser, UP(block), PM_ERR_DEF_ENDLESS_DO_BLOCK); - } - - if (accept1(parser, PM_TOKEN_KEYWORD_RESCUE_MODIFIER)) { - context_push(parser, PM_CONTEXT_RESCUE_MODIFIER); - - pm_token_t rescue_keyword = parser->previous; - - // In the Ruby grammar, the rescue value of an endless - // method command excludes and/or and in/=>. - pm_node_t *value = parse_expression(parser, PM_BINDING_POWER_MATCH + 1, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_RESCUE_MODIFIER_VALUE, (uint16_t) (depth + 1)); - context_pop(parser); - - statement = UP(pm_rescue_modifier_node_create(parser, statement, &rescue_keyword, value)); - } - - // A nested endless def whose body is a command call (e.g., - // `def f = def g = foo bar`) is a command assignment and - // cannot appear as a def body. 
- if (PM_NODE_TYPE_P(statement, PM_DEF_NODE) && pm_command_call_value_p(statement)) { - PM_PARSER_ERR_NODE_FORMAT(parser, statement, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_str(parser->current.type)); - } - - pm_statements_node_body_append(parser, (pm_statements_node_t *) statements, statement, false); - pm_do_loop_stack_pop(parser); - context_pop(parser); - } else { - if (lparen.start == NULL) { - lex_state_set(parser, PM_LEX_STATE_BEG); - parser->command_start = true; - expect2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON, PM_ERR_DEF_PARAMS_TERM); - } else { - accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON); - } - - pm_accepts_block_stack_push(parser, true); - pm_do_loop_stack_push(parser, false); - - if (!match4(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE, PM_TOKEN_KEYWORD_ELSE, PM_TOKEN_KEYWORD_END)) { - pm_accepts_block_stack_push(parser, true); - statements = UP(parse_statements(parser, PM_CONTEXT_DEF, (uint16_t) (depth + 1))); - pm_accepts_block_stack_pop(parser); - } - - if (match3(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE, PM_TOKEN_KEYWORD_ELSE)) { - assert(statements == NULL || PM_NODE_TYPE_P(statements, PM_STATEMENTS_NODE)); - statements = UP(parse_rescues_implicit_begin(parser, opening_newline_index, &def_keyword, def_keyword.start, (pm_statements_node_t *) statements, PM_RESCUES_DEF, (uint16_t) (depth + 1))); - } else { - parser_warn_indentation_mismatch(parser, opening_newline_index, &def_keyword, false, false); - } - - pm_accepts_block_stack_pop(parser); - pm_do_loop_stack_pop(parser); - - expect1_opening(parser, PM_TOKEN_KEYWORD_END, PM_ERR_DEF_TERM, &def_keyword); - end_keyword = parser->previous; - } - - pm_constant_id_list_t locals; - pm_locals_order(parser, &parser->current_scope->locals, &locals, false); - pm_parser_scope_pop(parser); - - /** - * If the final character is `@` as is the case when defining - * methods to override the unary operators, we should ignore - * the @ in the same way we do for 
symbols. - */ - pm_constant_id_t name_id = pm_parser_constant_id_raw(parser, name.start, parse_operator_symbol_name(&name)); - - flush_block_exits(parser, previous_block_exits); - - return UP(pm_def_node_create( - parser, - name_id, - &name, - receiver, - params, - statements, - &locals, - &def_keyword, - NTOK2PTR(operator), - NTOK2PTR(lparen), - NTOK2PTR(rparen), - NTOK2PTR(equal), - NTOK2PTR(end_keyword) - )); - } + case PM_TOKEN_KEYWORD_DEF: + return parse_def(parser, binding_power, flags, depth); case PM_TOKEN_KEYWORD_DEFINED: { parser_lex(parser); From 5fcdaff8a986ccbc81c9a84870e08be34f6abf0a Mon Sep 17 00:00:00 2001 From: Kevin Newton Date: Tue, 24 Mar 2026 11:57:30 -0400 Subject: [PATCH 04/12] [ruby/prism] Pull out parse_case from parse_expression_prefix https://github.com/ruby/prism/commit/410a623cdf --- prism/prism.c | 433 +++++++++++++++++++++++++------------------------- 1 file changed, 220 insertions(+), 213 deletions(-) diff --git a/prism/prism.c b/prism/prism.c index 743683f1ec29b6..3016d704ca4f8f 100644 --- a/prism/prism.c +++ b/prism/prism.c @@ -17769,6 +17769,224 @@ pm_block_call_p(const pm_node_t *node) { return false; } +/** + * Parse a case expression (the `case` keyword). This handles both case-when and + * case-in (pattern matching) forms. 
+ */ +static pm_node_t * +parse_case(pm_parser_t *parser, uint8_t flags, uint16_t depth) { + size_t opening_newline_index = token_newline_index(parser); + parser_lex(parser); + + pm_token_t case_keyword = parser->previous; + pm_node_t *predicate = NULL; + + pm_node_list_t current_block_exits = { 0 }; + pm_node_list_t *previous_block_exits = push_block_exits(parser, &current_block_exits); + + if (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)) { + while (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)); + predicate = NULL; + } else if (match3(parser, PM_TOKEN_KEYWORD_WHEN, PM_TOKEN_KEYWORD_IN, PM_TOKEN_KEYWORD_END)) { + predicate = NULL; + } else if (!token_begins_expression_p(parser->current.type)) { + predicate = NULL; + } else { + predicate = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, (flags & PM_PARSE_ACCEPTS_DO_BLOCK) | PM_PARSE_ACCEPTS_COMMAND_CALL, PM_ERR_CASE_EXPRESSION_AFTER_CASE, (uint16_t) (depth + 1)); + while (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)); + } + + if (match1(parser, PM_TOKEN_KEYWORD_END)) { + parser_warn_indentation_mismatch(parser, opening_newline_index, &case_keyword, false, false); + parser_lex(parser); + pop_block_exits(parser, previous_block_exits); + pm_parser_err_token(parser, &case_keyword, PM_ERR_CASE_MISSING_CONDITIONS); + return UP(pm_case_node_create(parser, &case_keyword, predicate, &parser->previous)); + } + + /* At this point we can create a case node, though we don't yet know if it + * is a case-in or case-when node. */ + pm_node_t *node; + + if (match1(parser, PM_TOKEN_KEYWORD_WHEN)) { + pm_case_node_t *case_node = pm_case_node_create(parser, &case_keyword, predicate, NULL); + pm_static_literals_t literals = { 0 }; + + /* At this point we've seen a when keyword, so we know this is a + * case-when node. We will continue to parse the when nodes until we hit + * the end of the list. 
*/ + while (match1(parser, PM_TOKEN_KEYWORD_WHEN)) { + parser_warn_indentation_mismatch(parser, opening_newline_index, &case_keyword, false, true); + parser_lex(parser); + + pm_token_t when_keyword = parser->previous; + pm_when_node_t *when_node = pm_when_node_create(parser, &when_keyword); + + do { + if (accept1(parser, PM_TOKEN_USTAR)) { + pm_token_t operator = parser->previous; + pm_node_t *expression = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_EXPECT_EXPRESSION_AFTER_STAR, (uint16_t) (depth + 1)); + + pm_splat_node_t *splat_node = pm_splat_node_create(parser, &operator, expression); + pm_when_node_conditions_append(parser->arena, when_node, UP(splat_node)); + + if (PM_NODE_TYPE_P(expression, PM_ERROR_RECOVERY_NODE)) break; + } else { + pm_node_t *condition = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_CASE_EXPRESSION_AFTER_WHEN, (uint16_t) (depth + 1)); + pm_when_node_conditions_append(parser->arena, when_node, condition); + + /* If we found a missing node, then this is a syntax error + * and we should stop looping. */ + if (PM_NODE_TYPE_P(condition, PM_ERROR_RECOVERY_NODE)) break; + + /* If this is a string node, then we need to mark it as + * frozen because when clause strings are frozen. 
*/ + if (PM_NODE_TYPE_P(condition, PM_STRING_NODE)) { + pm_node_flag_set(condition, PM_STRING_FLAGS_FROZEN | PM_NODE_FLAG_STATIC_LITERAL); + } else if (PM_NODE_TYPE_P(condition, PM_SOURCE_FILE_NODE)) { + pm_node_flag_set(condition, PM_NODE_FLAG_STATIC_LITERAL); + } + + pm_when_clause_static_literals_add(parser, &literals, condition); + } + } while (accept1(parser, PM_TOKEN_COMMA)); + + if (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)) { + if (accept1(parser, PM_TOKEN_KEYWORD_THEN)) { + pm_when_node_then_keyword_loc_set(parser, when_node, &parser->previous); + } + } else { + expect1(parser, PM_TOKEN_KEYWORD_THEN, PM_ERR_EXPECT_WHEN_DELIMITER); + pm_when_node_then_keyword_loc_set(parser, when_node, &parser->previous); + } + + if (!match3(parser, PM_TOKEN_KEYWORD_WHEN, PM_TOKEN_KEYWORD_ELSE, PM_TOKEN_KEYWORD_END)) { + pm_statements_node_t *statements = parse_statements(parser, PM_CONTEXT_CASE_WHEN, (uint16_t) (depth + 1)); + if (statements != NULL) { + pm_when_node_statements_set(when_node, statements); + } + } + + pm_case_node_condition_append(parser->arena, case_node, UP(when_node)); + } + + /* If we didn't parse any conditions (in or when) then we need to + * indicate that we have an error. */ + if (case_node->conditions.size == 0) { + pm_parser_err_token(parser, &case_keyword, PM_ERR_CASE_MISSING_CONDITIONS); + } + + pm_static_literals_free(&literals); + node = UP(case_node); + } else { + pm_case_match_node_t *case_node = pm_case_match_node_create(parser, &case_keyword, predicate); + + /* If this is a case-match node (i.e., it is a pattern matching case + * statement) then we must have a predicate. */ + if (predicate == NULL) { + pm_parser_err_token(parser, &case_keyword, PM_ERR_CASE_MATCH_MISSING_PREDICATE); + } + + /* At this point we expect that we're parsing a case-in node. We will + * continue to parse the in nodes until we hit the end of the list. 
*/ + while (match1(parser, PM_TOKEN_KEYWORD_IN)) { + parser_warn_indentation_mismatch(parser, opening_newline_index, &case_keyword, false, true); + + bool previous_pattern_matching_newlines = parser->pattern_matching_newlines; + parser->pattern_matching_newlines = true; + + lex_state_set(parser, PM_LEX_STATE_BEG | PM_LEX_STATE_LABEL); + parser->command_start = false; + parser_lex(parser); + + pm_token_t in_keyword = parser->previous; + + pm_constant_id_list_t captures = { 0 }; + pm_node_t *pattern = parse_pattern(parser, &captures, PM_PARSE_PATTERN_TOP | PM_PARSE_PATTERN_MULTI, PM_ERR_PATTERN_EXPRESSION_AFTER_IN, (uint16_t) (depth + 1)); + + parser->pattern_matching_newlines = previous_pattern_matching_newlines; + + /* Since we're in the top-level of the case-in node we need to + * check for guard clauses in the form of `if` or `unless` + * statements. */ + if (accept1(parser, PM_TOKEN_KEYWORD_IF_MODIFIER)) { + pm_token_t keyword = parser->previous; + pm_node_t *predicate = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, (flags & PM_PARSE_ACCEPTS_DO_BLOCK) | PM_PARSE_ACCEPTS_COMMAND_CALL, PM_ERR_CONDITIONAL_IF_PREDICATE, (uint16_t) (depth + 1)); + pattern = UP(pm_if_node_modifier_create(parser, pattern, &keyword, predicate)); + } else if (accept1(parser, PM_TOKEN_KEYWORD_UNLESS_MODIFIER)) { + pm_token_t keyword = parser->previous; + pm_node_t *predicate = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, (flags & PM_PARSE_ACCEPTS_DO_BLOCK) | PM_PARSE_ACCEPTS_COMMAND_CALL, PM_ERR_CONDITIONAL_UNLESS_PREDICATE, (uint16_t) (depth + 1)); + pattern = UP(pm_unless_node_modifier_create(parser, pattern, &keyword, predicate)); + } + + /* Now we need to check for the terminator of the in node's pattern. + * It can be a newline or semicolon optionally followed by a `then` + * keyword. 
*/ + pm_token_t then_keyword = { 0 }; + if (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)) { + if (accept1(parser, PM_TOKEN_KEYWORD_THEN)) { + then_keyword = parser->previous; + } + } else { + expect1(parser, PM_TOKEN_KEYWORD_THEN, PM_ERR_EXPECT_IN_DELIMITER); + then_keyword = parser->previous; + } + + /* Now we can actually parse the statements associated with the in + * node. */ + pm_statements_node_t *statements; + if (match3(parser, PM_TOKEN_KEYWORD_IN, PM_TOKEN_KEYWORD_ELSE, PM_TOKEN_KEYWORD_END)) { + statements = NULL; + } else { + statements = parse_statements(parser, PM_CONTEXT_CASE_IN, (uint16_t) (depth + 1)); + } + + /* Now that we have the full pattern and statements, we can create + * the node and attach it to the case node. */ + pm_node_t *condition = UP(pm_in_node_create(parser, pattern, statements, &in_keyword, NTOK2PTR(then_keyword))); + pm_case_match_node_condition_append(parser->arena, case_node, condition); + } + + /* If we didn't parse any conditions (in or when) then we need to + * indicate that we have an error. 
*/ + if (case_node->conditions.size == 0) { + pm_parser_err_token(parser, &case_keyword, PM_ERR_CASE_MISSING_CONDITIONS); + } + + node = UP(case_node); + } + + accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON); + if (accept1(parser, PM_TOKEN_KEYWORD_ELSE)) { + pm_token_t else_keyword = parser->previous; + pm_else_node_t *else_node; + + if (!match1(parser, PM_TOKEN_KEYWORD_END)) { + else_node = pm_else_node_create(parser, &else_keyword, parse_statements(parser, PM_CONTEXT_ELSE, (uint16_t) (depth + 1)), &parser->current); + } else { + else_node = pm_else_node_create(parser, &else_keyword, NULL, &parser->current); + } + + if (PM_NODE_TYPE_P(node, PM_CASE_NODE)) { + pm_case_node_else_clause_set((pm_case_node_t *) node, else_node); + } else { + pm_case_match_node_else_clause_set((pm_case_match_node_t *) node, else_node); + } + } + + parser_warn_indentation_mismatch(parser, opening_newline_index, &case_keyword, false, false); + expect1_opening(parser, PM_TOKEN_KEYWORD_END, PM_ERR_CASE_TERM, &case_keyword); + + if (PM_NODE_TYPE_P(node, PM_CASE_NODE)) { + pm_case_node_end_keyword_loc_set(parser, (pm_case_node_t *) node, &parser->previous); + } else { + pm_case_match_node_end_keyword_loc_set(parser, (pm_case_match_node_t *) node, &parser->previous); + } + + pop_block_exits(parser, previous_block_exits); + return node; +} + /** * Parse a method definition expression (the `def` keyword). 
*/ @@ -18895,219 +19113,8 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, u return UP(pm_alias_method_node_create(parser, &keyword, new_name, old_name)); } } - case PM_TOKEN_KEYWORD_CASE: { - size_t opening_newline_index = token_newline_index(parser); - parser_lex(parser); - - pm_token_t case_keyword = parser->previous; - pm_node_t *predicate = NULL; - - pm_node_list_t current_block_exits = { 0 }; - pm_node_list_t *previous_block_exits = push_block_exits(parser, &current_block_exits); - - if (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)) { - while (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)); - predicate = NULL; - } else if (match3(parser, PM_TOKEN_KEYWORD_WHEN, PM_TOKEN_KEYWORD_IN, PM_TOKEN_KEYWORD_END)) { - predicate = NULL; - } else if (!token_begins_expression_p(parser->current.type)) { - predicate = NULL; - } else { - predicate = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, (flags & PM_PARSE_ACCEPTS_DO_BLOCK) | PM_PARSE_ACCEPTS_COMMAND_CALL, PM_ERR_CASE_EXPRESSION_AFTER_CASE, (uint16_t) (depth + 1)); - while (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)); - } - - if (match1(parser, PM_TOKEN_KEYWORD_END)) { - parser_warn_indentation_mismatch(parser, opening_newline_index, &case_keyword, false, false); - parser_lex(parser); - pop_block_exits(parser, previous_block_exits); - pm_parser_err_token(parser, &case_keyword, PM_ERR_CASE_MISSING_CONDITIONS); - return UP(pm_case_node_create(parser, &case_keyword, predicate, &parser->previous)); - } - - // At this point we can create a case node, though we don't yet know - // if it is a case-in or case-when node. - pm_node_t *node; - - if (match1(parser, PM_TOKEN_KEYWORD_WHEN)) { - pm_case_node_t *case_node = pm_case_node_create(parser, &case_keyword, predicate, NULL); - pm_static_literals_t literals = { 0 }; - - // At this point we've seen a when keyword, so we know this is a - // case-when node.
We will continue to parse the when nodes - // until we hit the end of the list. - while (match1(parser, PM_TOKEN_KEYWORD_WHEN)) { - parser_warn_indentation_mismatch(parser, opening_newline_index, &case_keyword, false, true); - parser_lex(parser); - - pm_token_t when_keyword = parser->previous; - pm_when_node_t *when_node = pm_when_node_create(parser, &when_keyword); - - do { - if (accept1(parser, PM_TOKEN_USTAR)) { - pm_token_t operator = parser->previous; - pm_node_t *expression = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_EXPECT_EXPRESSION_AFTER_STAR, (uint16_t) (depth + 1)); - - pm_splat_node_t *splat_node = pm_splat_node_create(parser, &operator, expression); - pm_when_node_conditions_append(parser->arena, when_node, UP(splat_node)); - - if (PM_NODE_TYPE_P(expression, PM_ERROR_RECOVERY_NODE)) break; - } else { - pm_node_t *condition = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_CASE_EXPRESSION_AFTER_WHEN, (uint16_t) (depth + 1)); - pm_when_node_conditions_append(parser->arena, when_node, condition); - - // If we found a missing node, then this is a syntax - // error and we should stop looping. - if (PM_NODE_TYPE_P(condition, PM_ERROR_RECOVERY_NODE)) break; - - // If this is a string node, then we need to mark it - // as frozen because when clause strings are frozen. 
- if (PM_NODE_TYPE_P(condition, PM_STRING_NODE)) { - pm_node_flag_set(condition, PM_STRING_FLAGS_FROZEN | PM_NODE_FLAG_STATIC_LITERAL); - } else if (PM_NODE_TYPE_P(condition, PM_SOURCE_FILE_NODE)) { - pm_node_flag_set(condition, PM_NODE_FLAG_STATIC_LITERAL); - } - - pm_when_clause_static_literals_add(parser, &literals, condition); - } - } while (accept1(parser, PM_TOKEN_COMMA)); - - if (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)) { - if (accept1(parser, PM_TOKEN_KEYWORD_THEN)) { - pm_when_node_then_keyword_loc_set(parser, when_node, &parser->previous); - } - } else { - expect1(parser, PM_TOKEN_KEYWORD_THEN, PM_ERR_EXPECT_WHEN_DELIMITER); - pm_when_node_then_keyword_loc_set(parser, when_node, &parser->previous); - } - - if (!match3(parser, PM_TOKEN_KEYWORD_WHEN, PM_TOKEN_KEYWORD_ELSE, PM_TOKEN_KEYWORD_END)) { - pm_statements_node_t *statements = parse_statements(parser, PM_CONTEXT_CASE_WHEN, (uint16_t) (depth + 1)); - if (statements != NULL) { - pm_when_node_statements_set(when_node, statements); - } - } - - pm_case_node_condition_append(parser->arena, case_node, UP(when_node)); - } - - // If we didn't parse any conditions (in or when) then we need - // to indicate that we have an error. - if (case_node->conditions.size == 0) { - pm_parser_err_token(parser, &case_keyword, PM_ERR_CASE_MISSING_CONDITIONS); - } - - pm_static_literals_free(&literals); - node = UP(case_node); - } else { - pm_case_match_node_t *case_node = pm_case_match_node_create(parser, &case_keyword, predicate); - - // If this is a case-match node (i.e., it is a pattern matching - // case statement) then we must have a predicate. - if (predicate == NULL) { - pm_parser_err_token(parser, &case_keyword, PM_ERR_CASE_MATCH_MISSING_PREDICATE); - } - - // At this point we expect that we're parsing a case-in node. We - // will continue to parse the in nodes until we hit the end of - // the list. 
- while (match1(parser, PM_TOKEN_KEYWORD_IN)) { - parser_warn_indentation_mismatch(parser, opening_newline_index, &case_keyword, false, true); - - bool previous_pattern_matching_newlines = parser->pattern_matching_newlines; - parser->pattern_matching_newlines = true; - - lex_state_set(parser, PM_LEX_STATE_BEG | PM_LEX_STATE_LABEL); - parser->command_start = false; - parser_lex(parser); - - pm_token_t in_keyword = parser->previous; - - pm_constant_id_list_t captures = { 0 }; - pm_node_t *pattern = parse_pattern(parser, &captures, PM_PARSE_PATTERN_TOP | PM_PARSE_PATTERN_MULTI, PM_ERR_PATTERN_EXPRESSION_AFTER_IN, (uint16_t) (depth + 1)); - - parser->pattern_matching_newlines = previous_pattern_matching_newlines; - - // Since we're in the top-level of the case-in node we need - // to check for guard clauses in the form of `if` or - // `unless` statements. - if (accept1(parser, PM_TOKEN_KEYWORD_IF_MODIFIER)) { - pm_token_t keyword = parser->previous; - pm_node_t *predicate = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, (flags & PM_PARSE_ACCEPTS_DO_BLOCK) | PM_PARSE_ACCEPTS_COMMAND_CALL, PM_ERR_CONDITIONAL_IF_PREDICATE, (uint16_t) (depth + 1)); - pattern = UP(pm_if_node_modifier_create(parser, pattern, &keyword, predicate)); - } else if (accept1(parser, PM_TOKEN_KEYWORD_UNLESS_MODIFIER)) { - pm_token_t keyword = parser->previous; - pm_node_t *predicate = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, (flags & PM_PARSE_ACCEPTS_DO_BLOCK) | PM_PARSE_ACCEPTS_COMMAND_CALL, PM_ERR_CONDITIONAL_UNLESS_PREDICATE, (uint16_t) (depth + 1)); - pattern = UP(pm_unless_node_modifier_create(parser, pattern, &keyword, predicate)); - } - - // Now we need to check for the terminator of the in node's - // pattern. It can be a newline or semicolon optionally - // followed by a `then` keyword. 
- pm_token_t then_keyword = { 0 }; - if (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)) { - if (accept1(parser, PM_TOKEN_KEYWORD_THEN)) { - then_keyword = parser->previous; - } - } else { - expect1(parser, PM_TOKEN_KEYWORD_THEN, PM_ERR_EXPECT_IN_DELIMITER); - then_keyword = parser->previous; - } - - // Now we can actually parse the statements associated with - // the in node. - pm_statements_node_t *statements; - if (match3(parser, PM_TOKEN_KEYWORD_IN, PM_TOKEN_KEYWORD_ELSE, PM_TOKEN_KEYWORD_END)) { - statements = NULL; - } else { - statements = parse_statements(parser, PM_CONTEXT_CASE_IN, (uint16_t) (depth + 1)); - } - - // Now that we have the full pattern and statements, we can - // create the node and attach it to the case node. - pm_node_t *condition = UP(pm_in_node_create(parser, pattern, statements, &in_keyword, NTOK2PTR(then_keyword))); - pm_case_match_node_condition_append(parser->arena, case_node, condition); - } - - // If we didn't parse any conditions (in or when) then we need - // to indicate that we have an error. 
- if (case_node->conditions.size == 0) { - pm_parser_err_token(parser, &case_keyword, PM_ERR_CASE_MISSING_CONDITIONS); - } - - node = UP(case_node); - } - - accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON); - if (accept1(parser, PM_TOKEN_KEYWORD_ELSE)) { - pm_token_t else_keyword = parser->previous; - pm_else_node_t *else_node; - - if (!match1(parser, PM_TOKEN_KEYWORD_END)) { - else_node = pm_else_node_create(parser, &else_keyword, parse_statements(parser, PM_CONTEXT_ELSE, (uint16_t) (depth + 1)), &parser->current); - } else { - else_node = pm_else_node_create(parser, &else_keyword, NULL, &parser->current); - } - - if (PM_NODE_TYPE_P(node, PM_CASE_NODE)) { - pm_case_node_else_clause_set((pm_case_node_t *) node, else_node); - } else { - pm_case_match_node_else_clause_set((pm_case_match_node_t *) node, else_node); - } - } - - parser_warn_indentation_mismatch(parser, opening_newline_index, &case_keyword, false, false); - expect1_opening(parser, PM_TOKEN_KEYWORD_END, PM_ERR_CASE_TERM, &case_keyword); - - if (PM_NODE_TYPE_P(node, PM_CASE_NODE)) { - pm_case_node_end_keyword_loc_set(parser, (pm_case_node_t *) node, &parser->previous); - } else { - pm_case_match_node_end_keyword_loc_set(parser, (pm_case_match_node_t *) node, &parser->previous); - } - - pop_block_exits(parser, previous_block_exits); - return node; - } + case PM_TOKEN_KEYWORD_CASE: + return parse_case(parser, flags, depth); case PM_TOKEN_KEYWORD_BEGIN: { size_t opening_newline_index = token_newline_index(parser); parser_lex(parser); From 2c01a495147f708aef9e99b62935290c3a53f317 Mon Sep 17 00:00:00 2001 From: Kevin Newton Date: Tue, 24 Mar 2026 12:04:13 -0400 Subject: [PATCH 05/12] [ruby/prism] Pull out parse_parentheses from parse_expression_prefix https://github.com/ruby/prism/commit/3349087895 --- prism/prism.c | 409 +++++++++++++++++++++++++------------------------- 1 file changed, 207 insertions(+), 202 deletions(-) diff --git a/prism/prism.c b/prism/prism.c index 3016d704ca4f8f..ac1f7415b2517e 
100644 --- a/prism/prism.c +++ b/prism/prism.c @@ -18353,6 +18353,211 @@ parse_def(pm_parser_t *parser, pm_binding_power_t binding_power, uint8_t flags, )); } +/** + * Parse a parenthesized expression, which could be a grouping, a multi-target + * assignment, or a set of statements. + */ +static pm_node_t * +parse_parentheses(pm_parser_t *parser, pm_binding_power_t binding_power, uint16_t depth) { + pm_token_t opening = parser->current; + pm_node_flags_t paren_flags = 0; + + pm_node_list_t current_block_exits = { 0 }; + pm_node_list_t *previous_block_exits = push_block_exits(parser, &current_block_exits); + + parser_lex(parser); + while (true) { + if (accept1(parser, PM_TOKEN_SEMICOLON)) { + paren_flags |= PM_PARENTHESES_NODE_FLAGS_MULTIPLE_STATEMENTS; + } else if (!accept1(parser, PM_TOKEN_NEWLINE)) { + break; + } + } + + /* If this is the end of the file or we match a right parenthesis, then we + * have an empty parentheses node, and we can immediately return. */ + if (match2(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_TOKEN_EOF)) { + expect1(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_ERR_EXPECT_RPAREN); + pop_block_exits(parser, previous_block_exits); + return UP(pm_parentheses_node_create(parser, &opening, NULL, &parser->previous, paren_flags)); + } + + /* Otherwise, we're going to parse the first statement in the list of + * statements within the parentheses. */ + pm_accepts_block_stack_push(parser, true); + context_push(parser, PM_CONTEXT_PARENS); + pm_node_t *statement = parse_expression(parser, PM_BINDING_POWER_STATEMENT, PM_PARSE_ACCEPTS_COMMAND_CALL | PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_CANNOT_PARSE_EXPRESSION, (uint16_t) (depth + 1)); + context_pop(parser); + + /* Determine if this statement is followed by a terminator. In the case of a + * single statement, this is fine. But in the case of multiple statements + * it's required.
*/ + bool terminator_found = false; + + if (accept1(parser, PM_TOKEN_SEMICOLON)) { + terminator_found = true; + paren_flags |= PM_PARENTHESES_NODE_FLAGS_MULTIPLE_STATEMENTS; + } else if (accept1(parser, PM_TOKEN_NEWLINE)) { + terminator_found = true; + } + + if (terminator_found) { + while (true) { + if (accept1(parser, PM_TOKEN_SEMICOLON)) { + paren_flags |= PM_PARENTHESES_NODE_FLAGS_MULTIPLE_STATEMENTS; + } else if (!accept1(parser, PM_TOKEN_NEWLINE)) { + break; + } + } + } + + /* If we hit a right parenthesis, then we're done parsing the parentheses + * node, and we can check which kind of node we should return. */ + if (match1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) { + if (opening.type == PM_TOKEN_PARENTHESIS_LEFT_PARENTHESES) { + lex_state_set(parser, PM_LEX_STATE_ENDARG); + } + + parser_lex(parser); + pm_accepts_block_stack_pop(parser); + pop_block_exits(parser, previous_block_exits); + + if (PM_NODE_TYPE_P(statement, PM_MULTI_TARGET_NODE) || PM_NODE_TYPE_P(statement, PM_SPLAT_NODE)) { + /* If we have a single statement and are ending on a right + * parenthesis, then we need to check if this is possibly a multiple + * target node. 
*/ + pm_multi_target_node_t *multi_target; + + if (PM_NODE_TYPE_P(statement, PM_MULTI_TARGET_NODE) && ((pm_multi_target_node_t *) statement)->lparen_loc.length == 0) { + multi_target = (pm_multi_target_node_t *) statement; + } else { + multi_target = pm_multi_target_node_create(parser); + pm_multi_target_node_targets_append(parser, multi_target, statement); + } + + multi_target->lparen_loc = TOK2LOC(parser, &opening); + multi_target->rparen_loc = TOK2LOC(parser, &parser->previous); + PM_NODE_START_SET_TOKEN(parser, multi_target, &opening); + PM_NODE_LENGTH_SET_TOKEN(parser, multi_target, &parser->previous); + + pm_node_t *result; + if (match1(parser, PM_TOKEN_COMMA) && (binding_power == PM_BINDING_POWER_STATEMENT)) { + result = parse_targets(parser, UP(multi_target), PM_BINDING_POWER_INDEX, (uint16_t) (depth + 1)); + accept1(parser, PM_TOKEN_NEWLINE); + } else { + result = UP(multi_target); + } + + if (context_p(parser, PM_CONTEXT_MULTI_TARGET)) { + /* All set, this is explicitly allowed by the parent context. */ + } else if (context_p(parser, PM_CONTEXT_FOR_INDEX) && match1(parser, PM_TOKEN_KEYWORD_IN)) { + /* All set, we're inside a for loop and we're parsing multiple + * targets. */ + } else if (binding_power != PM_BINDING_POWER_STATEMENT) { + /* Multi targets are not allowed when it's not a statement + * level. */ + pm_parser_err_node(parser, result, PM_ERR_WRITE_TARGET_UNEXPECTED); + } else if (!match2(parser, PM_TOKEN_EQUAL, PM_TOKEN_PARENTHESIS_RIGHT)) { + /* Multi targets must be followed by an equal sign in order to + * be valid (or a right parenthesis if they are nested). */ + pm_parser_err_node(parser, result, PM_ERR_WRITE_TARGET_UNEXPECTED); + } + + return result; + } + + /* If we have a single statement and are ending on a right parenthesis + * and we didn't return a multiple assignment node, then we can return a + * regular parentheses node now. 
*/ + pm_statements_node_t *statements = pm_statements_node_create(parser); + pm_statements_node_body_append(parser, statements, statement, true); + + return UP(pm_parentheses_node_create(parser, &opening, UP(statements), &parser->previous, paren_flags)); + } + + /* If we have more than one statement in the set of parentheses, then we are + * going to parse all of them as a list of statements. We'll do that here. + */ + context_push(parser, PM_CONTEXT_PARENS); + paren_flags |= PM_PARENTHESES_NODE_FLAGS_MULTIPLE_STATEMENTS; + + pm_statements_node_t *statements = pm_statements_node_create(parser); + pm_statements_node_body_append(parser, statements, statement, true); + + /* If we didn't find a terminator and we didn't find a right parenthesis, + * then this is a syntax error. */ + if (!terminator_found && !match1(parser, PM_TOKEN_EOF)) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_str(parser->current.type)); + } + + /* Parse each statement within the parentheses. */ + while (true) { + pm_node_t *node = parse_expression(parser, PM_BINDING_POWER_STATEMENT, PM_PARSE_ACCEPTS_COMMAND_CALL | PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_CANNOT_PARSE_EXPRESSION, (uint16_t) (depth + 1)); + pm_statements_node_body_append(parser, statements, node, true); + + /* If we're recovering from a syntax error, then we need to stop parsing + * the statements now. */ + if (parser->recovering) { + /* If this is the level of context where the recovery has happened, + * then we can mark the parser as done recovering. */ + if (match1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) parser->recovering = false; + break; + } + + /* If we couldn't parse an expression at all, then we need to bail out + * of the loop. */ + if (PM_NODE_TYPE_P(node, PM_ERROR_RECOVERY_NODE)) break; + + /* If we successfully parsed a statement, then we are going to need a + * terminator to delimit them. 
*/ + if (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)) { + while (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)); + if (match1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) break; + } else if (match1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) { + break; + } else if (!match1(parser, PM_TOKEN_EOF)) { + /* If we're at the end of the file, then we're going to add an error + * after this for the ) anyway. */ + PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_str(parser->current.type)); + } + } + + context_pop(parser); + pm_accepts_block_stack_pop(parser); + expect1(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_ERR_EXPECT_RPAREN); + + /* When we're parsing multi targets, we allow them to be followed by a right + * parenthesis if they are at the statement level. This is only possible if + * they are the final statement in a parentheses. We need to explicitly + * reject that here. */ + { + pm_node_t *statement = statements->body.nodes[statements->body.size - 1]; + + if (PM_NODE_TYPE_P(statement, PM_SPLAT_NODE)) { + pm_multi_target_node_t *multi_target = pm_multi_target_node_create(parser); + pm_multi_target_node_targets_append(parser, multi_target, statement); + + statement = UP(multi_target); + statements->body.nodes[statements->body.size - 1] = statement; + } + + if (PM_NODE_TYPE_P(statement, PM_MULTI_TARGET_NODE)) { + const uint8_t *offset = parser->start + PM_NODE_END(statement); + pm_token_t operator = { .type = PM_TOKEN_EQUAL, .start = offset, .end = offset }; + pm_node_t *value = UP(pm_error_recovery_node_create(parser, PM_NODE_END(statement), 0)); + + statement = UP(pm_multi_write_node_create(parser, (pm_multi_target_node_t *) statement, &operator, value)); + statements->body.nodes[statements->body.size - 1] = statement; + + pm_parser_err_node(parser, statement, PM_ERR_WRITE_TARGET_UNEXPECTED); + } + } + + pop_block_exits(parser, previous_block_exits); + pm_void_statements_check(parser, statements, true); + return 
UP(pm_parentheses_node_create(parser, &opening, UP(statements), &parser->previous, paren_flags)); +} + /** * Parse an expression that begins with the previous node that we just lexed. */ @@ -18473,208 +18678,8 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, u return UP(array); } case PM_TOKEN_PARENTHESIS_LEFT: - case PM_TOKEN_PARENTHESIS_LEFT_PARENTHESES: { - pm_token_t opening = parser->current; - pm_node_flags_t paren_flags = 0; - - pm_node_list_t current_block_exits = { 0 }; - pm_node_list_t *previous_block_exits = push_block_exits(parser, &current_block_exits); - - parser_lex(parser); - while (true) { - if (accept1(parser, PM_TOKEN_SEMICOLON)) { - paren_flags |= PM_PARENTHESES_NODE_FLAGS_MULTIPLE_STATEMENTS; - } else if (!accept1(parser, PM_TOKEN_NEWLINE)) { - break; - } - } - - // If this is the end of the file or we match a right parenthesis, then - // we have an empty parentheses node, and we can immediately return. - if (match2(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_TOKEN_EOF)) { - expect1(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_ERR_EXPECT_RPAREN); - pop_block_exits(parser, previous_block_exits); - return UP(pm_parentheses_node_create(parser, &opening, NULL, &parser->previous, paren_flags)); - } - - // Otherwise, we're going to parse the first statement in the list - // of statements within the parentheses. - pm_accepts_block_stack_push(parser, true); - context_push(parser, PM_CONTEXT_PARENS); - pm_node_t *statement = parse_expression(parser, PM_BINDING_POWER_STATEMENT, PM_PARSE_ACCEPTS_COMMAND_CALL | PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_CANNOT_PARSE_EXPRESSION, (uint16_t) (depth + 1)); - context_pop(parser); - - // Determine if this statement is followed by a terminator. In the - // case of a single statement, this is fine. But in the case of - // multiple statements it's required.
- bool terminator_found = false; - - if (accept1(parser, PM_TOKEN_SEMICOLON)) { - terminator_found = true; - paren_flags |= PM_PARENTHESES_NODE_FLAGS_MULTIPLE_STATEMENTS; - } else if (accept1(parser, PM_TOKEN_NEWLINE)) { - terminator_found = true; - } - - if (terminator_found) { - while (true) { - if (accept1(parser, PM_TOKEN_SEMICOLON)) { - paren_flags |= PM_PARENTHESES_NODE_FLAGS_MULTIPLE_STATEMENTS; - } else if (!accept1(parser, PM_TOKEN_NEWLINE)) { - break; - } - } - } - - // If we hit a right parenthesis, then we're done parsing the - // parentheses node, and we can check which kind of node we should - // return. - if (match1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) { - if (opening.type == PM_TOKEN_PARENTHESIS_LEFT_PARENTHESES) { - lex_state_set(parser, PM_LEX_STATE_ENDARG); - } - - parser_lex(parser); - pm_accepts_block_stack_pop(parser); - pop_block_exits(parser, previous_block_exits); - - if (PM_NODE_TYPE_P(statement, PM_MULTI_TARGET_NODE) || PM_NODE_TYPE_P(statement, PM_SPLAT_NODE)) { - // If we have a single statement and are ending on a right - // parenthesis, then we need to check if this is possibly a - // multiple target node. 
- pm_multi_target_node_t *multi_target; - - if (PM_NODE_TYPE_P(statement, PM_MULTI_TARGET_NODE) && ((pm_multi_target_node_t *) statement)->lparen_loc.length == 0) { - multi_target = (pm_multi_target_node_t *) statement; - } else { - multi_target = pm_multi_target_node_create(parser); - pm_multi_target_node_targets_append(parser, multi_target, statement); - } - - multi_target->lparen_loc = TOK2LOC(parser, &opening); - multi_target->rparen_loc = TOK2LOC(parser, &parser->previous); - PM_NODE_START_SET_TOKEN(parser, multi_target, &opening); - PM_NODE_LENGTH_SET_TOKEN(parser, multi_target, &parser->previous); - - pm_node_t *result; - if (match1(parser, PM_TOKEN_COMMA) && (binding_power == PM_BINDING_POWER_STATEMENT)) { - result = parse_targets(parser, UP(multi_target), PM_BINDING_POWER_INDEX, (uint16_t) (depth + 1)); - accept1(parser, PM_TOKEN_NEWLINE); - } else { - result = UP(multi_target); - } - - if (context_p(parser, PM_CONTEXT_MULTI_TARGET)) { - // All set, this is explicitly allowed by the parent - // context. - } else if (context_p(parser, PM_CONTEXT_FOR_INDEX) && match1(parser, PM_TOKEN_KEYWORD_IN)) { - // All set, we're inside a for loop and we're parsing - // multiple targets. - } else if (binding_power != PM_BINDING_POWER_STATEMENT) { - // Multi targets are not allowed when it's not a - // statement level. - pm_parser_err_node(parser, result, PM_ERR_WRITE_TARGET_UNEXPECTED); - } else if (!match2(parser, PM_TOKEN_EQUAL, PM_TOKEN_PARENTHESIS_RIGHT)) { - // Multi targets must be followed by an equal sign in - // order to be valid (or a right parenthesis if they are - // nested). - pm_parser_err_node(parser, result, PM_ERR_WRITE_TARGET_UNEXPECTED); - } - - return result; - } - - // If we have a single statement and are ending on a right parenthesis - // and we didn't return a multiple assignment node, then we can return a - // regular parentheses node now. 
- pm_statements_node_t *statements = pm_statements_node_create(parser); - pm_statements_node_body_append(parser, statements, statement, true); - - return UP(pm_parentheses_node_create(parser, &opening, UP(statements), &parser->previous, paren_flags)); - } - - // If we have more than one statement in the set of parentheses, - // then we are going to parse all of them as a list of statements. - // We'll do that here. - context_push(parser, PM_CONTEXT_PARENS); - paren_flags |= PM_PARENTHESES_NODE_FLAGS_MULTIPLE_STATEMENTS; - - pm_statements_node_t *statements = pm_statements_node_create(parser); - pm_statements_node_body_append(parser, statements, statement, true); - - // If we didn't find a terminator and we didn't find a right - // parenthesis, then this is a syntax error. - if (!terminator_found && !match1(parser, PM_TOKEN_EOF)) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_str(parser->current.type)); - } - - // Parse each statement within the parentheses. - while (true) { - pm_node_t *node = parse_expression(parser, PM_BINDING_POWER_STATEMENT, PM_PARSE_ACCEPTS_COMMAND_CALL | PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_CANNOT_PARSE_EXPRESSION, (uint16_t) (depth + 1)); - pm_statements_node_body_append(parser, statements, node, true); - - // If we're recovering from a syntax error, then we need to stop - // parsing the statements now. - if (parser->recovering) { - // If this is the level of context where the recovery has - // happened, then we can mark the parser as done recovering. - if (match1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) parser->recovering = false; - break; - } - - // If we couldn't parse an expression at all, then we need to - // bail out of the loop. - if (PM_NODE_TYPE_P(node, PM_ERROR_RECOVERY_NODE)) break; - - // If we successfully parsed a statement, then we are going to - // need terminator to delimit them. 
- if (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)) { - while (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)); - if (match1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) break; - } else if (match1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) { - break; - } else if (!match1(parser, PM_TOKEN_EOF)) { - // If we're at the end of the file, then we're going to add - // an error after this for the ) anyway. - PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_str(parser->current.type)); - } - } - - context_pop(parser); - pm_accepts_block_stack_pop(parser); - expect1(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_ERR_EXPECT_RPAREN); - - // When we're parsing multi targets, we allow them to be followed by - // a right parenthesis if they are at the statement level. This is - // only possible if they are the final statement in a parentheses. - // We need to explicitly reject that here. - { - pm_node_t *statement = statements->body.nodes[statements->body.size - 1]; - - if (PM_NODE_TYPE_P(statement, PM_SPLAT_NODE)) { - pm_multi_target_node_t *multi_target = pm_multi_target_node_create(parser); - pm_multi_target_node_targets_append(parser, multi_target, statement); - - statement = UP(multi_target); - statements->body.nodes[statements->body.size - 1] = statement; - } - - if (PM_NODE_TYPE_P(statement, PM_MULTI_TARGET_NODE)) { - const uint8_t *offset = parser->start + PM_NODE_END(statement); - pm_token_t operator = { .type = PM_TOKEN_EQUAL, .start = offset, .end = offset }; - pm_node_t *value = UP(pm_error_recovery_node_create(parser, PM_NODE_END(statement), 0)); - - statement = UP(pm_multi_write_node_create(parser, (pm_multi_target_node_t *) statement, &operator, value)); - statements->body.nodes[statements->body.size - 1] = statement; - - pm_parser_err_node(parser, statement, PM_ERR_WRITE_TARGET_UNEXPECTED); - } - } - - pop_block_exits(parser, previous_block_exits); - pm_void_statements_check(parser, statements, true); - return 
UP(pm_parentheses_node_create(parser, &opening, UP(statements), &parser->previous, paren_flags)); - } + case PM_TOKEN_PARENTHESIS_LEFT_PARENTHESES: + return parse_parentheses(parser, binding_power, depth); case PM_TOKEN_BRACE_LEFT: { // If we were passed a current_hash_keys via the parser, then that // means we're already parsing a hash and we want to share the set From 9a24716e51e80a871dd8057de623c6217221835b Mon Sep 17 00:00:00 2001 From: Kevin Newton Date: Tue, 24 Mar 2026 12:28:34 -0400 Subject: [PATCH 06/12] [ruby/prism] Pull out parse_*_array from parse_expression_prefix https://github.com/ruby/prism/commit/80220a9d6a --- prism/prism.c | 593 ++++++++++++++++++++++++++------------------------ 1 file changed, 304 insertions(+), 289 deletions(-) diff --git a/prism/prism.c b/prism/prism.c index ac1f7415b2517e..0fa6f1009e706f 100644 --- a/prism/prism.c +++ b/prism/prism.c @@ -18353,6 +18353,306 @@ parse_def(pm_parser_t *parser, pm_binding_power_t binding_power, uint8_t flags, )); } +/** + * Parse an interpolated word array literal (`%W[...]`). + */ +static pm_node_t * +parse_string_array(pm_parser_t *parser, uint16_t depth) { + parser_lex(parser); + pm_token_t opening = parser->previous; + pm_array_node_t *array = pm_array_node_create(parser, &opening); + + /* This is the current node that we are parsing that will be added to the + * list of elements. */ + pm_node_t *current = NULL; + + while (!match2(parser, PM_TOKEN_STRING_END, PM_TOKEN_EOF)) { + switch (parser->current.type) { + case PM_TOKEN_WORDS_SEP: { + /* Reset the explicit encoding if we hit a separator since each + * element can have its own encoding. */ + parser->explicit_encoding = NULL; + + if (current == NULL) { + /* If we hit a separator before we have any content, then we + * don't need to do anything. */ + } else { + /* If we hit a separator after we've hit content, then we + * need to append that content to the list and reset the + * current node. 
*/ + pm_array_node_elements_append(parser->arena, array, current); + current = NULL; + } + + parser_lex(parser); + break; + } + case PM_TOKEN_STRING_CONTENT: { + pm_node_t *string = UP(pm_string_node_create_current_string(parser, NULL, &parser->current, NULL)); + pm_node_flag_set(string, parse_unescaped_encoding(parser)); + parser_lex(parser); + + if (current == NULL) { + /* If we hit content and the current node is NULL, then this + * is the first string content we've seen. In that case + * we're going to create a new string node and set that to + * the current. */ + current = string; + } else if (PM_NODE_TYPE_P(current, PM_INTERPOLATED_STRING_NODE)) { + /* If we hit string content and the current node is an + * interpolated string, then we need to append the string + * content to the list of child nodes. */ + pm_interpolated_string_node_append(parser, (pm_interpolated_string_node_t *) current, string); + } else if (PM_NODE_TYPE_P(current, PM_STRING_NODE)) { + /* If we hit string content and the current node is a string + * node, then we need to convert the current node into an + * interpolated string and add the string content to the + * list of child nodes. */ + pm_interpolated_string_node_t *interpolated = pm_interpolated_string_node_create(parser, NULL, NULL, NULL); + pm_interpolated_string_node_append(parser, interpolated, current); + pm_interpolated_string_node_append(parser, interpolated, string); + current = UP(interpolated); + } else { + assert(false && "unreachable"); + } + + break; + } + case PM_TOKEN_EMBVAR: { + if (current == NULL) { + /* If we hit an embedded variable and the current node is + * NULL, then this is the start of a new string. We'll set + * the current node to a new interpolated string. 
*/ + current = UP(pm_interpolated_string_node_create(parser, NULL, NULL, NULL)); + } else if (PM_NODE_TYPE_P(current, PM_STRING_NODE)) { + /* If we hit an embedded variable and the current node is a + * string node, then we'll convert the current into an + * interpolated string and add the string node to the list + * of parts. */ + pm_interpolated_string_node_t *interpolated = pm_interpolated_string_node_create(parser, NULL, NULL, NULL); + pm_interpolated_string_node_append(parser, interpolated, current); + current = UP(interpolated); + } else { + /* If we hit an embedded variable and the current node is an + * interpolated string, then we'll just add the embedded + * variable. */ + } + + pm_node_t *part = parse_string_part(parser, (uint16_t) (depth + 1)); + pm_interpolated_string_node_append(parser, (pm_interpolated_string_node_t *) current, part); + break; + } + case PM_TOKEN_EMBEXPR_BEGIN: { + if (current == NULL) { + /* If we hit an embedded expression and the current node is + * NULL, then this is the start of a new string. We'll set + * the current node to a new interpolated string. */ + current = UP(pm_interpolated_string_node_create(parser, NULL, NULL, NULL)); + } else if (PM_NODE_TYPE_P(current, PM_STRING_NODE)) { + /* If we hit an embedded expression and the current node is + * a string node, then we'll convert the current into an + * interpolated string and add the string node to the list + * of parts. */ + pm_interpolated_string_node_t *interpolated = pm_interpolated_string_node_create(parser, NULL, NULL, NULL); + pm_interpolated_string_node_append(parser, interpolated, current); + current = UP(interpolated); + } else if (PM_NODE_TYPE_P(current, PM_INTERPOLATED_STRING_NODE)) { + /* If we hit an embedded expression and the current node is + * an interpolated string, then we'll just continue on. 
*/ + } else { + assert(false && "unreachable"); + } + + pm_node_t *part = parse_string_part(parser, (uint16_t) (depth + 1)); + pm_interpolated_string_node_append(parser, (pm_interpolated_string_node_t *) current, part); + break; + } + default: + expect1(parser, PM_TOKEN_STRING_CONTENT, PM_ERR_LIST_W_UPPER_ELEMENT); + parser_lex(parser); + break; + } + } + + /* If we have a current node, then we need to append it to the list. */ + if (current) { + pm_array_node_elements_append(parser->arena, array, current); + } + + pm_token_t closing = parser->current; + if (match1(parser, PM_TOKEN_EOF)) { + pm_parser_err_token(parser, &opening, PM_ERR_LIST_W_UPPER_TERM); + closing = (pm_token_t) { .type = 0, .start = parser->previous.end, .end = parser->previous.end }; + } else { + expect1(parser, PM_TOKEN_STRING_END, PM_ERR_LIST_W_UPPER_TERM); + } + + pm_array_node_close_set(parser, array, &closing); + return UP(array); +} + +/** + * Parse an interpolated symbol array literal (`%I[...]`). + */ +static pm_node_t * +parse_symbol_array(pm_parser_t *parser, uint16_t depth) { + parser_lex(parser); + pm_token_t opening = parser->previous; + pm_array_node_t *array = pm_array_node_create(parser, &opening); + + /* This is the current node that we are parsing that will be added to the + * list of elements. */ + pm_node_t *current = NULL; + + while (!match2(parser, PM_TOKEN_STRING_END, PM_TOKEN_EOF)) { + switch (parser->current.type) { + case PM_TOKEN_WORDS_SEP: { + if (current == NULL) { + /* If we hit a separator before we have any content, then we + * don't need to do anything. */ + } else { + /* If we hit a separator after we've hit content, then we + * need to append that content to the list and reset the + * current node. 
*/ + pm_array_node_elements_append(parser->arena, array, current); + current = NULL; + } + + parser_lex(parser); + break; + } + case PM_TOKEN_STRING_CONTENT: { + if (current == NULL) { + /* If we hit content and the current node is NULL, then this + * is the first string content we've seen. In that case + * we're going to create a new string node and set that to + * the current. */ + current = UP(pm_symbol_node_create_current_string(parser, NULL, &parser->current, NULL)); + parser_lex(parser); + } else if (PM_NODE_TYPE_P(current, PM_INTERPOLATED_SYMBOL_NODE)) { + /* If we hit string content and the current node is an + * interpolated string, then we need to append the string + * content to the list of child nodes. */ + pm_node_t *string = UP(pm_string_node_create_current_string(parser, NULL, &parser->current, NULL)); + parser_lex(parser); + + pm_interpolated_symbol_node_append(parser->arena, (pm_interpolated_symbol_node_t *) current, string); + } else if (PM_NODE_TYPE_P(current, PM_SYMBOL_NODE)) { + /* If we hit string content and the current node is a symbol + * node, then we need to convert the current node into an + * interpolated string and add the string content to the + * list of child nodes. 
*/ + pm_symbol_node_t *cast = (pm_symbol_node_t *) current; + pm_token_t content = { + .type = PM_TOKEN_STRING_CONTENT, + .start = parser->start + cast->value_loc.start, + .end = parser->start + cast->value_loc.start + cast->value_loc.length + }; + + pm_node_t *first_string = UP(pm_string_node_create_unescaped(parser, NULL, &content, NULL, &cast->unescaped)); + pm_node_t *second_string = UP(pm_string_node_create_current_string(parser, NULL, &parser->previous, NULL)); + parser_lex(parser); + + pm_interpolated_symbol_node_t *interpolated = pm_interpolated_symbol_node_create(parser, NULL, NULL, NULL); + pm_interpolated_symbol_node_append(parser->arena, interpolated, first_string); + pm_interpolated_symbol_node_append(parser->arena, interpolated, second_string); + + current = UP(interpolated); + } else { + assert(false && "unreachable"); + } + + break; + } + case PM_TOKEN_EMBVAR: { + bool start_location_set = false; + if (current == NULL) { + /* If we hit an embedded variable and the current node is + * NULL, then this is the start of a new string. We'll set + * the current node to a new interpolated string. */ + current = UP(pm_interpolated_symbol_node_create(parser, NULL, NULL, NULL)); + } else if (PM_NODE_TYPE_P(current, PM_SYMBOL_NODE)) { + /* If we hit an embedded variable and the current node is a + * string node, then we'll convert the current into an + * interpolated string and add the string node to the list + * of parts. */ + pm_interpolated_symbol_node_t *interpolated = pm_interpolated_symbol_node_create(parser, NULL, NULL, NULL); + + current = UP(pm_symbol_node_to_string_node(parser, (pm_symbol_node_t *) current)); + pm_interpolated_symbol_node_append(parser->arena, interpolated, current); + PM_NODE_START_SET_NODE(interpolated, current); + start_location_set = true; + current = UP(interpolated); + } else { + /* If we hit an embedded variable and the current node is an + * interpolated string, then we'll just add the embedded + * variable. 
*/ + } + + pm_node_t *part = parse_string_part(parser, (uint16_t) (depth + 1)); + pm_interpolated_symbol_node_append(parser->arena, (pm_interpolated_symbol_node_t *) current, part); + if (!start_location_set) { + PM_NODE_START_SET_NODE(current, part); + } + break; + } + case PM_TOKEN_EMBEXPR_BEGIN: { + bool start_location_set = false; + if (current == NULL) { + /* If we hit an embedded expression and the current node is + * NULL, then this is the start of a new string. We'll set + * the current node to a new interpolated string. */ + current = UP(pm_interpolated_symbol_node_create(parser, NULL, NULL, NULL)); + } else if (PM_NODE_TYPE_P(current, PM_SYMBOL_NODE)) { + /* If we hit an embedded expression and the current node is + * a string node, then we'll convert the current into an + * interpolated string and add the string node to the list + * of parts. */ + pm_interpolated_symbol_node_t *interpolated = pm_interpolated_symbol_node_create(parser, NULL, NULL, NULL); + + current = UP(pm_symbol_node_to_string_node(parser, (pm_symbol_node_t *) current)); + pm_interpolated_symbol_node_append(parser->arena, interpolated, current); + PM_NODE_START_SET_NODE(interpolated, current); + start_location_set = true; + current = UP(interpolated); + } else if (PM_NODE_TYPE_P(current, PM_INTERPOLATED_SYMBOL_NODE)) { + /* If we hit an embedded expression and the current node is + * an interpolated string, then we'll just continue on. */ + } else { + assert(false && "unreachable"); + } + + pm_node_t *part = parse_string_part(parser, (uint16_t) (depth + 1)); + pm_interpolated_symbol_node_append(parser->arena, (pm_interpolated_symbol_node_t *) current, part); + if (!start_location_set) { + PM_NODE_START_SET_NODE(current, part); + } + break; + } + default: + expect1(parser, PM_TOKEN_STRING_CONTENT, PM_ERR_LIST_I_UPPER_ELEMENT); + parser_lex(parser); + break; + } + } + + /* If we have a current node, then we need to append it to the list. 
*/ + if (current) { + pm_array_node_elements_append(parser->arena, array, current); + } + + pm_token_t closing = parser->current; + if (match1(parser, PM_TOKEN_EOF)) { + pm_parser_err_token(parser, &opening, PM_ERR_LIST_I_UPPER_TERM); + closing = (pm_token_t) { .type = 0, .start = parser->previous.end, .end = parser->previous.end }; + } else { + expect1(parser, PM_TOKEN_STRING_END, PM_ERR_LIST_I_UPPER_TERM); + } + pm_array_node_close_set(parser, array, &closing); + + return UP(array); +} + /** * Parse a parenthesized expression, which could be a grouping, a multi-target * assignment, or a set of statements. @@ -19801,159 +20101,8 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, u return UP(array); } - case PM_TOKEN_PERCENT_UPPER_I: { - parser_lex(parser); - pm_token_t opening = parser->previous; - pm_array_node_t *array = pm_array_node_create(parser, &opening); - - // This is the current node that we are parsing that will be added to the - // list of elements. - pm_node_t *current = NULL; - - while (!match2(parser, PM_TOKEN_STRING_END, PM_TOKEN_EOF)) { - switch (parser->current.type) { - case PM_TOKEN_WORDS_SEP: { - if (current == NULL) { - // If we hit a separator before we have any content, then we don't - // need to do anything. - } else { - // If we hit a separator after we've hit content, then we need to - // append that content to the list and reset the current node. - pm_array_node_elements_append(parser->arena, array, current); - current = NULL; - } - - parser_lex(parser); - break; - } - case PM_TOKEN_STRING_CONTENT: { - if (current == NULL) { - // If we hit content and the current node is NULL, then this is - // the first string content we've seen. In that case we're going - // to create a new string node and set that to the current. 
- current = UP(pm_symbol_node_create_current_string(parser, NULL, &parser->current, NULL)); - parser_lex(parser); - } else if (PM_NODE_TYPE_P(current, PM_INTERPOLATED_SYMBOL_NODE)) { - // If we hit string content and the current node is an - // interpolated string, then we need to append the string content - // to the list of child nodes. - pm_node_t *string = UP(pm_string_node_create_current_string(parser, NULL, &parser->current, NULL)); - parser_lex(parser); - - pm_interpolated_symbol_node_append(parser->arena, (pm_interpolated_symbol_node_t *) current, string); - } else if (PM_NODE_TYPE_P(current, PM_SYMBOL_NODE)) { - // If we hit string content and the current node is a symbol node, - // then we need to convert the current node into an interpolated - // string and add the string content to the list of child nodes. - pm_symbol_node_t *cast = (pm_symbol_node_t *) current; - pm_token_t content = { - .type = PM_TOKEN_STRING_CONTENT, - .start = parser->start + cast->value_loc.start, - .end = parser->start + cast->value_loc.start + cast->value_loc.length - }; - - pm_node_t *first_string = UP(pm_string_node_create_unescaped(parser, NULL, &content, NULL, &cast->unescaped)); - pm_node_t *second_string = UP(pm_string_node_create_current_string(parser, NULL, &parser->previous, NULL)); - parser_lex(parser); - - pm_interpolated_symbol_node_t *interpolated = pm_interpolated_symbol_node_create(parser, NULL, NULL, NULL); - pm_interpolated_symbol_node_append(parser->arena, interpolated, first_string); - pm_interpolated_symbol_node_append(parser->arena, interpolated, second_string); - - // current is arena-allocated so no explicit free is needed. - current = UP(interpolated); - } else { - assert(false && "unreachable"); - } - - break; - } - case PM_TOKEN_EMBVAR: { - bool start_location_set = false; - if (current == NULL) { - // If we hit an embedded variable and the current node is NULL, - // then this is the start of a new string. 
We'll set the current - // node to a new interpolated string. - current = UP(pm_interpolated_symbol_node_create(parser, NULL, NULL, NULL)); - } else if (PM_NODE_TYPE_P(current, PM_SYMBOL_NODE)) { - // If we hit an embedded variable and the current node is a string - // node, then we'll convert the current into an interpolated - // string and add the string node to the list of parts. - pm_interpolated_symbol_node_t *interpolated = pm_interpolated_symbol_node_create(parser, NULL, NULL, NULL); - - current = UP(pm_symbol_node_to_string_node(parser, (pm_symbol_node_t *) current)); - pm_interpolated_symbol_node_append(parser->arena, interpolated, current); - PM_NODE_START_SET_NODE(interpolated, current); - start_location_set = true; - current = UP(interpolated); - } else { - // If we hit an embedded variable and the current node is an - // interpolated string, then we'll just add the embedded variable. - } - - pm_node_t *part = parse_string_part(parser, (uint16_t) (depth + 1)); - pm_interpolated_symbol_node_append(parser->arena, (pm_interpolated_symbol_node_t *) current, part); - if (!start_location_set) { - PM_NODE_START_SET_NODE(current, part); - } - break; - } - case PM_TOKEN_EMBEXPR_BEGIN: { - bool start_location_set = false; - if (current == NULL) { - // If we hit an embedded expression and the current node is NULL, - // then this is the start of a new string. We'll set the current - // node to a new interpolated string. - current = UP(pm_interpolated_symbol_node_create(parser, NULL, NULL, NULL)); - } else if (PM_NODE_TYPE_P(current, PM_SYMBOL_NODE)) { - // If we hit an embedded expression and the current node is a - // string node, then we'll convert the current into an - // interpolated string and add the string node to the list of - // parts. 
- pm_interpolated_symbol_node_t *interpolated = pm_interpolated_symbol_node_create(parser, NULL, NULL, NULL); - - current = UP(pm_symbol_node_to_string_node(parser, (pm_symbol_node_t *) current)); - pm_interpolated_symbol_node_append(parser->arena, interpolated, current); - PM_NODE_START_SET_NODE(interpolated, current); - start_location_set = true; - current = UP(interpolated); - } else if (PM_NODE_TYPE_P(current, PM_INTERPOLATED_SYMBOL_NODE)) { - // If we hit an embedded expression and the current node is an - // interpolated string, then we'll just continue on. - } else { - assert(false && "unreachable"); - } - - pm_node_t *part = parse_string_part(parser, (uint16_t) (depth + 1)); - pm_interpolated_symbol_node_append(parser->arena, (pm_interpolated_symbol_node_t *) current, part); - if (!start_location_set) { - PM_NODE_START_SET_NODE(current, part); - } - break; - } - default: - expect1(parser, PM_TOKEN_STRING_CONTENT, PM_ERR_LIST_I_UPPER_ELEMENT); - parser_lex(parser); - break; - } - } - - // If we have a current node, then we need to append it to the list. 
- if (current) { - pm_array_node_elements_append(parser->arena, array, current); - } - - pm_token_t closing = parser->current; - if (match1(parser, PM_TOKEN_EOF)) { - pm_parser_err_token(parser, &opening, PM_ERR_LIST_I_UPPER_TERM); - closing = (pm_token_t) { .type = 0, .start = parser->previous.end, .end = parser->previous.end }; - } else { - expect1(parser, PM_TOKEN_STRING_END, PM_ERR_LIST_I_UPPER_TERM); - } - pm_array_node_close_set(parser, array, &closing); - - return UP(array); - } + case PM_TOKEN_PERCENT_UPPER_I: + return parse_symbol_array(parser, depth); case PM_TOKEN_PERCENT_LOWER_W: { parser_lex(parser); pm_token_t opening = parser->previous; @@ -20004,142 +20153,8 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, u pm_array_node_close_set(parser, array, &closing); return UP(array); } - case PM_TOKEN_PERCENT_UPPER_W: { - parser_lex(parser); - pm_token_t opening = parser->previous; - pm_array_node_t *array = pm_array_node_create(parser, &opening); - - // This is the current node that we are parsing that will be added - // to the list of elements. - pm_node_t *current = NULL; - - while (!match2(parser, PM_TOKEN_STRING_END, PM_TOKEN_EOF)) { - switch (parser->current.type) { - case PM_TOKEN_WORDS_SEP: { - // Reset the explicit encoding if we hit a separator - // since each element can have its own encoding. - parser->explicit_encoding = NULL; - - if (current == NULL) { - // If we hit a separator before we have any content, - // then we don't need to do anything. - } else { - // If we hit a separator after we've hit content, - // then we need to append that content to the list - // and reset the current node. 
- pm_array_node_elements_append(parser->arena, array, current); - current = NULL; - } - - parser_lex(parser); - break; - } - case PM_TOKEN_STRING_CONTENT: { - pm_node_t *string = UP(pm_string_node_create_current_string(parser, NULL, &parser->current, NULL)); - pm_node_flag_set(string, parse_unescaped_encoding(parser)); - parser_lex(parser); - - if (current == NULL) { - // If we hit content and the current node is NULL, - // then this is the first string content we've seen. - // In that case we're going to create a new string - // node and set that to the current. - current = string; - } else if (PM_NODE_TYPE_P(current, PM_INTERPOLATED_STRING_NODE)) { - // If we hit string content and the current node is - // an interpolated string, then we need to append - // the string content to the list of child nodes. - pm_interpolated_string_node_append(parser, (pm_interpolated_string_node_t *) current, string); - } else if (PM_NODE_TYPE_P(current, PM_STRING_NODE)) { - // If we hit string content and the current node is - // a string node, then we need to convert the - // current node into an interpolated string and add - // the string content to the list of child nodes. - pm_interpolated_string_node_t *interpolated = pm_interpolated_string_node_create(parser, NULL, NULL, NULL); - pm_interpolated_string_node_append(parser, interpolated, current); - pm_interpolated_string_node_append(parser, interpolated, string); - current = UP(interpolated); - } else { - assert(false && "unreachable"); - } - - break; - } - case PM_TOKEN_EMBVAR: { - if (current == NULL) { - // If we hit an embedded variable and the current - // node is NULL, then this is the start of a new - // string. We'll set the current node to a new - // interpolated string. 
- current = UP(pm_interpolated_string_node_create(parser, NULL, NULL, NULL)); - } else if (PM_NODE_TYPE_P(current, PM_STRING_NODE)) { - // If we hit an embedded variable and the current - // node is a string node, then we'll convert the - // current into an interpolated string and add the - // string node to the list of parts. - pm_interpolated_string_node_t *interpolated = pm_interpolated_string_node_create(parser, NULL, NULL, NULL); - pm_interpolated_string_node_append(parser, interpolated, current); - current = UP(interpolated); - } else { - // If we hit an embedded variable and the current - // node is an interpolated string, then we'll just - // add the embedded variable. - } - - pm_node_t *part = parse_string_part(parser, (uint16_t) (depth + 1)); - pm_interpolated_string_node_append(parser, (pm_interpolated_string_node_t *) current, part); - break; - } - case PM_TOKEN_EMBEXPR_BEGIN: { - if (current == NULL) { - // If we hit an embedded expression and the current - // node is NULL, then this is the start of a new - // string. We'll set the current node to a new - // interpolated string. - current = UP(pm_interpolated_string_node_create(parser, NULL, NULL, NULL)); - } else if (PM_NODE_TYPE_P(current, PM_STRING_NODE)) { - // If we hit an embedded expression and the current - // node is a string node, then we'll convert the - // current into an interpolated string and add the - // string node to the list of parts. - pm_interpolated_string_node_t *interpolated = pm_interpolated_string_node_create(parser, NULL, NULL, NULL); - pm_interpolated_string_node_append(parser, interpolated, current); - current = UP(interpolated); - } else if (PM_NODE_TYPE_P(current, PM_INTERPOLATED_STRING_NODE)) { - // If we hit an embedded expression and the current - // node is an interpolated string, then we'll just - // continue on. 
- } else { - assert(false && "unreachable"); - } - - pm_node_t *part = parse_string_part(parser, (uint16_t) (depth + 1)); - pm_interpolated_string_node_append(parser, (pm_interpolated_string_node_t *) current, part); - break; - } - default: - expect1(parser, PM_TOKEN_STRING_CONTENT, PM_ERR_LIST_W_UPPER_ELEMENT); - parser_lex(parser); - break; - } - } - - // If we have a current node, then we need to append it to the list. - if (current) { - pm_array_node_elements_append(parser->arena, array, current); - } - - pm_token_t closing = parser->current; - if (match1(parser, PM_TOKEN_EOF)) { - pm_parser_err_token(parser, &opening, PM_ERR_LIST_W_UPPER_TERM); - closing = (pm_token_t) { .type = 0, .start = parser->previous.end, .end = parser->previous.end }; - } else { - expect1(parser, PM_TOKEN_STRING_END, PM_ERR_LIST_W_UPPER_TERM); - } - - pm_array_node_close_set(parser, array, &closing); - return UP(array); - } + case PM_TOKEN_PERCENT_UPPER_W: + return parse_string_array(parser, depth); case PM_TOKEN_REGEXP_BEGIN: { pm_token_t opening = parser->current; parser_lex(parser); From a3eca0c9b87c99447b62ca5faff945cdadd535de Mon Sep 17 00:00:00 2001 From: Kevin Newton Date: Tue, 24 Mar 2026 21:32:03 -0400 Subject: [PATCH 07/12] [ruby/prism] Pull out parse_class and parse_module https://github.com/ruby/prism/commit/781ecf4338 --- prism/prism.c | 375 ++++++++++++++++++++++++++------------------------ 1 file changed, 195 insertions(+), 180 deletions(-) diff --git a/prism/prism.c b/prism/prism.c index 0fa6f1009e706f..0e798fdce88305 100644 --- a/prism/prism.c +++ b/prism/prism.c @@ -17987,6 +17987,122 @@ parse_case(pm_parser_t *parser, uint8_t flags, uint16_t depth) { return node; } +/** + * Parse a class definition expression (the `class` keyword). This handles both + * regular class definitions and singleton class definitions (`class << expr`). 
+ */ +static pm_node_t * +parse_class(pm_parser_t *parser, uint8_t flags, uint16_t depth) { + size_t opening_newline_index = token_newline_index(parser); + parser_lex(parser); + + pm_token_t class_keyword = parser->previous; + pm_do_loop_stack_push(parser, false); + + pm_node_list_t current_block_exits = { 0 }; + pm_node_list_t *previous_block_exits = push_block_exits(parser, &current_block_exits); + + if (accept1(parser, PM_TOKEN_LESS_LESS)) { + pm_token_t operator = parser->previous; + pm_node_t *expression = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, (flags & PM_PARSE_ACCEPTS_DO_BLOCK) | PM_PARSE_ACCEPTS_COMMAND_CALL, PM_ERR_EXPECT_EXPRESSION_AFTER_LESS_LESS, (uint16_t) (depth + 1)); + + pm_parser_scope_push(parser, true); + if (!match2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_EXPECT_SINGLETON_CLASS_DELIMITER, pm_token_str(parser->current.type)); + } + + pm_node_t *statements = NULL; + if (!match4(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE, PM_TOKEN_KEYWORD_ELSE, PM_TOKEN_KEYWORD_END)) { + pm_accepts_block_stack_push(parser, true); + statements = UP(parse_statements(parser, PM_CONTEXT_SCLASS, (uint16_t) (depth + 1))); + pm_accepts_block_stack_pop(parser); + } + + if (match2(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE)) { + assert(statements == NULL || PM_NODE_TYPE_P(statements, PM_STATEMENTS_NODE)); + statements = UP(parse_rescues_implicit_begin(parser, opening_newline_index, &class_keyword, class_keyword.start, (pm_statements_node_t *) statements, PM_RESCUES_SCLASS, (uint16_t) (depth + 1))); + } else { + parser_warn_indentation_mismatch(parser, opening_newline_index, &class_keyword, false, false); + } + + expect1_opening(parser, PM_TOKEN_KEYWORD_END, PM_ERR_CLASS_TERM, &class_keyword); + + pm_constant_id_list_t locals; + pm_locals_order(parser, &parser->current_scope->locals, &locals, false); + + pm_parser_scope_pop(parser); +
pm_do_loop_stack_pop(parser); + + flush_block_exits(parser, previous_block_exits); + return UP(pm_singleton_class_node_create(parser, &locals, &class_keyword, &operator, expression, statements, &parser->previous)); + } + + pm_node_t *constant_path = parse_expression(parser, PM_BINDING_POWER_INDEX, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_CLASS_NAME, (uint16_t) (depth + 1)); + pm_token_t name = parser->previous; + if (name.type != PM_TOKEN_CONSTANT) { + pm_parser_err_token(parser, &name, PM_ERR_CLASS_NAME); + } + + pm_token_t inheritance_operator = { 0 }; + pm_node_t *superclass; + + if (match1(parser, PM_TOKEN_LESS)) { + inheritance_operator = parser->current; + lex_state_set(parser, PM_LEX_STATE_BEG); + + parser->command_start = true; + parser_lex(parser); + + superclass = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, (flags & PM_PARSE_ACCEPTS_DO_BLOCK) | PM_PARSE_ACCEPTS_COMMAND_CALL, PM_ERR_CLASS_SUPERCLASS, (uint16_t) (depth + 1)); + } else { + superclass = NULL; + } + + pm_parser_scope_push(parser, true); + + if (inheritance_operator.start != NULL) { + expect2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON, PM_ERR_CLASS_UNEXPECTED_END); + } else { + accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON); + } + pm_node_t *statements = NULL; + + if (!match4(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE, PM_TOKEN_KEYWORD_ELSE, PM_TOKEN_KEYWORD_END)) { + pm_accepts_block_stack_push(parser, true); + statements = UP(parse_statements(parser, PM_CONTEXT_CLASS, (uint16_t) (depth + 1))); + pm_accepts_block_stack_pop(parser); + } + + if (match2(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE)) { + assert(statements == NULL || PM_NODE_TYPE_P(statements, PM_STATEMENTS_NODE)); + statements = UP(parse_rescues_implicit_begin(parser, opening_newline_index, &class_keyword, class_keyword.start, (pm_statements_node_t *) statements, PM_RESCUES_CLASS, (uint16_t) (depth + 1))); + } else { + parser_warn_indentation_mismatch(parser, 
opening_newline_index, &class_keyword, false, false); + } + + expect1_opening(parser, PM_TOKEN_KEYWORD_END, PM_ERR_CLASS_TERM, &class_keyword); + + if (context_def_p(parser)) { + pm_parser_err_token(parser, &class_keyword, PM_ERR_CLASS_IN_METHOD); + } + + pm_constant_id_list_t locals; + pm_locals_order(parser, &parser->current_scope->locals, &locals, false); + + pm_parser_scope_pop(parser); + pm_do_loop_stack_pop(parser); + + if (!PM_NODE_TYPE_P(constant_path, PM_CONSTANT_PATH_NODE) && !(PM_NODE_TYPE_P(constant_path, PM_CONSTANT_READ_NODE))) { + pm_parser_err_node(parser, constant_path, PM_ERR_CLASS_NAME); + if (!PM_NODE_TYPE_P(constant_path, PM_ERROR_RECOVERY_NODE)) { + constant_path = UP(pm_error_recovery_node_create_unexpected(parser, constant_path)); + } + } + + pop_block_exits(parser, previous_block_exits); + return UP(pm_class_node_create(parser, &locals, &class_keyword, constant_path, &name, NTOK2PTR(inheritance_operator), superclass, statements, &parser->previous)); +} + /** * Parse a method definition expression (the `def` keyword). */ @@ -18353,6 +18469,81 @@ parse_def(pm_parser_t *parser, pm_binding_power_t binding_power, uint8_t flags, )); } +/** + * Parse a module definition expression (the `module` keyword). + */ +static pm_node_t * +parse_module(pm_parser_t *parser, uint8_t flags, uint16_t depth) { + pm_node_list_t current_block_exits = { 0 }; + pm_node_list_t *previous_block_exits = push_block_exits(parser, &current_block_exits); + + size_t opening_newline_index = token_newline_index(parser); + parser_lex(parser); + pm_token_t module_keyword = parser->previous; + + pm_node_t *constant_path = parse_expression(parser, PM_BINDING_POWER_INDEX, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_MODULE_NAME, (uint16_t) (depth + 1)); + pm_token_t name; + + /* If we can recover from a syntax error that occurred while parsing the + * name of the module, then we'll handle that here.
*/ + if (PM_NODE_TYPE_P(constant_path, PM_ERROR_RECOVERY_NODE)) { + pop_block_exits(parser, previous_block_exits); + + pm_token_t missing = (pm_token_t) { .type = 0, .start = parser->previous.end, .end = parser->previous.end }; + return UP(pm_module_node_create(parser, NULL, &module_keyword, constant_path, &missing, NULL, &missing)); + } + + while (accept1(parser, PM_TOKEN_COLON_COLON)) { + pm_token_t double_colon = parser->previous; + + expect1(parser, PM_TOKEN_CONSTANT, PM_ERR_CONSTANT_PATH_COLON_COLON_CONSTANT); + constant_path = UP(pm_constant_path_node_create(parser, constant_path, &double_colon, &parser->previous)); + } + + /* Here we retrieve the name of the module. If it wasn't a constant, then + * it's possible that `module foo` was passed, which is a syntax error. We + * handle that here as well. */ + name = parser->previous; + if (name.type != PM_TOKEN_CONSTANT) { + pm_parser_err_token(parser, &name, PM_ERR_MODULE_NAME); + } + + if (!PM_NODE_TYPE_P(constant_path, PM_CONSTANT_READ_NODE) && !PM_NODE_TYPE_P(constant_path, PM_CONSTANT_PATH_NODE) && !PM_NODE_TYPE_P(constant_path, PM_ERROR_RECOVERY_NODE)) { + constant_path = UP(pm_error_recovery_node_create_unexpected(parser, constant_path)); + } + + pm_parser_scope_push(parser, true); + accept2(parser, PM_TOKEN_SEMICOLON, PM_TOKEN_NEWLINE); + pm_node_t *statements = NULL; + + if (!match4(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE, PM_TOKEN_KEYWORD_ELSE, PM_TOKEN_KEYWORD_END)) { + pm_accepts_block_stack_push(parser, true); + statements = UP(parse_statements(parser, PM_CONTEXT_MODULE, (uint16_t) (depth + 1))); + pm_accepts_block_stack_pop(parser); + } + + if (match3(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE, PM_TOKEN_KEYWORD_ELSE)) { + assert(statements == NULL || PM_NODE_TYPE_P(statements, PM_STATEMENTS_NODE)); + statements = UP(parse_rescues_implicit_begin(parser, opening_newline_index, &module_keyword, module_keyword.start, (pm_statements_node_t *) statements, PM_RESCUES_MODULE, 
(uint16_t) (depth + 1))); + } else { + parser_warn_indentation_mismatch(parser, opening_newline_index, &module_keyword, false, false); + } + + pm_constant_id_list_t locals; + pm_locals_order(parser, &parser->current_scope->locals, &locals, false); + + pm_parser_scope_pop(parser); + expect1_opening(parser, PM_TOKEN_KEYWORD_END, PM_ERR_MODULE_TERM, &module_keyword); + + if (context_def_p(parser)) { + pm_parser_err_token(parser, &module_keyword, PM_ERR_MODULE_IN_METHOD); + } + + pop_block_exits(parser, previous_block_exits); + + return UP(pm_module_node_create(parser, &locals, &module_keyword, constant_path, &name, statements, &parser->previous)); +} + /** * Parse an interpolated word array literal (`%W[...]`). */ @@ -19556,116 +19747,8 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, u return node; } - case PM_TOKEN_KEYWORD_CLASS: { - size_t opening_newline_index = token_newline_index(parser); - parser_lex(parser); - - pm_token_t class_keyword = parser->previous; - pm_do_loop_stack_push(parser, false); - - pm_node_list_t current_block_exits = { 0 }; - pm_node_list_t *previous_block_exits = push_block_exits(parser, &current_block_exits); - - if (accept1(parser, PM_TOKEN_LESS_LESS)) { - pm_token_t operator = parser->previous; - pm_node_t *expression = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, (flags & PM_PARSE_ACCEPTS_DO_BLOCK) | PM_PARSE_ACCEPTS_COMMAND_CALL, PM_ERR_EXPECT_EXPRESSION_AFTER_LESS_LESS, (uint16_t) (depth + 1)); - - pm_parser_scope_push(parser, true); - if (!match2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_EXPECT_SINGLETON_CLASS_DELIMITER, pm_token_str(parser->current.type)); - } - - pm_node_t *statements = NULL; - if (!match4(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE, PM_TOKEN_KEYWORD_ELSE, PM_TOKEN_KEYWORD_END)) { - pm_accepts_block_stack_push(parser, true); - statements = UP(parse_statements(parser, PM_CONTEXT_SCLASS, 
(uint16_t) (depth + 1))); - pm_accepts_block_stack_pop(parser); - } - - if (match2(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE)) { - assert(statements == NULL || PM_NODE_TYPE_P(statements, PM_STATEMENTS_NODE)); - statements = UP(parse_rescues_implicit_begin(parser, opening_newline_index, &class_keyword, class_keyword.start, (pm_statements_node_t *) statements, PM_RESCUES_SCLASS, (uint16_t) (depth + 1))); - } else { - parser_warn_indentation_mismatch(parser, opening_newline_index, &class_keyword, false, false); - } - - expect1_opening(parser, PM_TOKEN_KEYWORD_END, PM_ERR_CLASS_TERM, &class_keyword); - - pm_constant_id_list_t locals; - pm_locals_order(parser, &parser->current_scope->locals, &locals, false); - - pm_parser_scope_pop(parser); - pm_do_loop_stack_pop(parser); - - flush_block_exits(parser, previous_block_exits); - return UP(pm_singleton_class_node_create(parser, &locals, &class_keyword, &operator, expression, statements, &parser->previous)); - } - - pm_node_t *constant_path = parse_expression(parser, PM_BINDING_POWER_INDEX, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_CLASS_NAME, (uint16_t) (depth + 1)); - pm_token_t name = parser->previous; - if (name.type != PM_TOKEN_CONSTANT) { - pm_parser_err_token(parser, &name, PM_ERR_CLASS_NAME); - } - - pm_token_t inheritance_operator = { 0 }; - pm_node_t *superclass; - - if (match1(parser, PM_TOKEN_LESS)) { - inheritance_operator = parser->current; - lex_state_set(parser, PM_LEX_STATE_BEG); - - parser->command_start = true; - parser_lex(parser); - - superclass = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, (flags & PM_PARSE_ACCEPTS_DO_BLOCK) | PM_PARSE_ACCEPTS_COMMAND_CALL, PM_ERR_CLASS_SUPERCLASS, (uint16_t) (depth + 1)); - } else { - superclass = NULL; - } - - pm_parser_scope_push(parser, true); - - if (inheritance_operator.start != NULL) { - expect2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON, PM_ERR_CLASS_UNEXPECTED_END); - } else { - accept2(parser, PM_TOKEN_NEWLINE, 
PM_TOKEN_SEMICOLON); - } - pm_node_t *statements = NULL; - - if (!match4(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE, PM_TOKEN_KEYWORD_ELSE, PM_TOKEN_KEYWORD_END)) { - pm_accepts_block_stack_push(parser, true); - statements = UP(parse_statements(parser, PM_CONTEXT_CLASS, (uint16_t) (depth + 1))); - pm_accepts_block_stack_pop(parser); - } - - if (match2(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE)) { - assert(statements == NULL || PM_NODE_TYPE_P(statements, PM_STATEMENTS_NODE)); - statements = UP(parse_rescues_implicit_begin(parser, opening_newline_index, &class_keyword, class_keyword.start, (pm_statements_node_t *) statements, PM_RESCUES_CLASS, (uint16_t) (depth + 1))); - } else { - parser_warn_indentation_mismatch(parser, opening_newline_index, &class_keyword, false, false); - } - - expect1_opening(parser, PM_TOKEN_KEYWORD_END, PM_ERR_CLASS_TERM, &class_keyword); - - if (context_def_p(parser)) { - pm_parser_err_token(parser, &class_keyword, PM_ERR_CLASS_IN_METHOD); - } - - pm_constant_id_list_t locals; - pm_locals_order(parser, &parser->current_scope->locals, &locals, false); - - pm_parser_scope_pop(parser); - pm_do_loop_stack_pop(parser); - - if (!PM_NODE_TYPE_P(constant_path, PM_CONSTANT_PATH_NODE) && !(PM_NODE_TYPE_P(constant_path, PM_CONSTANT_READ_NODE))) { - pm_parser_err_node(parser, constant_path, PM_ERR_CLASS_NAME); - if (!PM_NODE_TYPE_P(constant_path, PM_ERROR_RECOVERY_NODE)) { - constant_path = UP(pm_error_recovery_node_create_unexpected(parser, constant_path)); - } - } - - pop_block_exits(parser, previous_block_exits); - return UP(pm_class_node_create(parser, &locals, &class_keyword, constant_path, &name, NTOK2PTR(inheritance_operator), superclass, statements, &parser->previous)); - } + case PM_TOKEN_KEYWORD_CLASS: + return parse_class(parser, flags, depth); case PM_TOKEN_KEYWORD_DEF: return parse_def(parser, binding_power, flags, depth); case PM_TOKEN_KEYWORD_DEFINED: { @@ -19882,76 +19965,8 @@ 
parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, u return parse_conditional(parser, PM_CONTEXT_UNLESS, opening_newline_index, false, (uint16_t) (depth + 1)); } - case PM_TOKEN_KEYWORD_MODULE: { - pm_node_list_t current_block_exits = { 0 }; - pm_node_list_t *previous_block_exits = push_block_exits(parser, &current_block_exits); - - size_t opening_newline_index = token_newline_index(parser); - parser_lex(parser); - pm_token_t module_keyword = parser->previous; - - pm_node_t *constant_path = parse_expression(parser, PM_BINDING_POWER_INDEX, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_MODULE_NAME, (uint16_t) (depth + 1)); - pm_token_t name; - - // If we can recover from a syntax error that occurred while parsing - // the name of the module, then we'll handle that here. - if (PM_NODE_TYPE_P(constant_path, PM_ERROR_RECOVERY_NODE)) { - pop_block_exits(parser, previous_block_exits); - - pm_token_t missing = (pm_token_t) { .type = 0, .start = parser->previous.end, .end = parser->previous.end }; - return UP(pm_module_node_create(parser, NULL, &module_keyword, constant_path, &missing, NULL, &missing)); - } - - while (accept1(parser, PM_TOKEN_COLON_COLON)) { - pm_token_t double_colon = parser->previous; - - expect1(parser, PM_TOKEN_CONSTANT, PM_ERR_CONSTANT_PATH_COLON_COLON_CONSTANT); - constant_path = UP(pm_constant_path_node_create(parser, constant_path, &double_colon, &parser->previous)); - } - - // Here we retrieve the name of the module. If it wasn't a constant, - // then it's possible that `module foo` was passed, which is a - // syntax error. We handle that here as well. 
- name = parser->previous; - if (name.type != PM_TOKEN_CONSTANT) { - pm_parser_err_token(parser, &name, PM_ERR_MODULE_NAME); - } - - if (!PM_NODE_TYPE_P(constant_path, PM_CONSTANT_READ_NODE) && !PM_NODE_TYPE_P(constant_path, PM_CONSTANT_PATH_NODE) && !PM_NODE_TYPE_P(constant_path, PM_ERROR_RECOVERY_NODE)) { - constant_path = UP(pm_error_recovery_node_create_unexpected(parser, constant_path)); - } - - pm_parser_scope_push(parser, true); - accept2(parser, PM_TOKEN_SEMICOLON, PM_TOKEN_NEWLINE); - pm_node_t *statements = NULL; - - if (!match4(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE, PM_TOKEN_KEYWORD_ELSE, PM_TOKEN_KEYWORD_END)) { - pm_accepts_block_stack_push(parser, true); - statements = UP(parse_statements(parser, PM_CONTEXT_MODULE, (uint16_t) (depth + 1))); - pm_accepts_block_stack_pop(parser); - } - - if (match3(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE, PM_TOKEN_KEYWORD_ELSE)) { - assert(statements == NULL || PM_NODE_TYPE_P(statements, PM_STATEMENTS_NODE)); - statements = UP(parse_rescues_implicit_begin(parser, opening_newline_index, &module_keyword, module_keyword.start, (pm_statements_node_t *) statements, PM_RESCUES_MODULE, (uint16_t) (depth + 1))); - } else { - parser_warn_indentation_mismatch(parser, opening_newline_index, &module_keyword, false, false); - } - - pm_constant_id_list_t locals; - pm_locals_order(parser, &parser->current_scope->locals, &locals, false); - - pm_parser_scope_pop(parser); - expect1_opening(parser, PM_TOKEN_KEYWORD_END, PM_ERR_MODULE_TERM, &module_keyword); - - if (context_def_p(parser)) { - pm_parser_err_token(parser, &module_keyword, PM_ERR_MODULE_IN_METHOD); - } - - pop_block_exits(parser, previous_block_exits); - - return UP(pm_module_node_create(parser, &locals, &module_keyword, constant_path, &name, statements, &parser->previous)); - } + case PM_TOKEN_KEYWORD_MODULE: + return parse_module(parser, flags, depth); case PM_TOKEN_KEYWORD_NIL: parser_lex(parser); return UP(pm_nil_node_create(parser, 
&parser->previous)); From ff89d56b56932940225ebad7a4d1e9aa04c3ae47 Mon Sep 17 00:00:00 2001 From: Takashi Kokubun Date: Wed, 25 Mar 2026 09:44:01 -0700 Subject: [PATCH 08/12] Pass reg_cfp instead of iseq to vm_search_method and related functions (#16541) Change vm_search_method_fastpath, vm_search_method, vm_method_cfunc_is, opt_equality, vm_opt_neq, vm_opt_nil_p, vm_opt_not, and vm_objtostring to take a CFP instead of an iseq. The iseq is now read lazily from reg_cfp->iseq only when the slowpath is actually needed. This is a preparatory refactoring for lightweight JIT frames where the iseq may not always be written eagerly to the CFP. ZJIT's rb_zjit_vm_search_method and rb_vm_method_cfunc_is, which are called with a compile-time iseq rather than a live CFP, now call the slowpath directly. --- insns.def | 12 ++++----- vm_insnhelper.c | 63 ++++++++++++++++++++++++++------------------- zjit/src/codegen.rs | 4 +-- zjit/src/cruby.rs | 2 +- 4 files changed, 45 insertions(+), 36 deletions(-) diff --git a/insns.def b/insns.def index 92ae7c181628b9..df4147efdeacf8 100644 --- a/insns.def +++ b/insns.def @@ -926,7 +926,7 @@ opt_new // The bookkeeping slot should be empty. 
RUBY_ASSERT(TOPN(argc + 1) == Qnil); - if (vm_method_cfunc_is(GET_ISEQ(), cd, val, rb_class_new_instance_pass_kw)) { + if (vm_method_cfunc_is(GET_CFP(), cd, val, rb_class_new_instance_pass_kw)) { RB_DEBUG_COUNTER_INC(opt_new_hit); val = rb_obj_alloc(val); TOPN(argc) = val; @@ -947,7 +947,7 @@ objtostring // attr bool leaf = false; // attr bool zjit_profile = true; { - val = vm_objtostring(GET_ISEQ(), recv, cd); + val = vm_objtostring(GET_CFP(), recv, cd); if (UNDEF_P(val)) { CALL_SIMPLE_METHOD(); @@ -1006,7 +1006,7 @@ opt_nil_p (VALUE val) // attr bool zjit_profile = true; { - val = vm_opt_nil_p(GET_ISEQ(), cd, recv); + val = vm_opt_nil_p(GET_CFP(), cd, recv); if (UNDEF_P(val)) { CALL_SIMPLE_METHOD(); @@ -1435,7 +1435,7 @@ opt_eq (VALUE val) // attr bool zjit_profile = true; { - val = opt_equality(GET_ISEQ(), recv, obj, cd); + val = opt_equality(GET_CFP(), recv, obj, cd); if (UNDEF_P(val)) { CALL_SIMPLE_METHOD(); @@ -1450,7 +1450,7 @@ opt_neq (VALUE val) // attr bool zjit_profile = true; { - val = vm_opt_neq(GET_ISEQ(), cd, cd_eq, recv, obj); + val = vm_opt_neq(GET_CFP(), cd, cd_eq, recv, obj); if (UNDEF_P(val)) { CALL_SIMPLE_METHOD(); @@ -1672,7 +1672,7 @@ opt_not (VALUE val) // attr bool zjit_profile = true; { - val = vm_opt_not(GET_ISEQ(), cd, recv); + val = vm_opt_not(GET_CFP(), cd, recv); if (UNDEF_P(val)) { CALL_SIMPLE_METHOD(); diff --git a/vm_insnhelper.c b/vm_insnhelper.c index 4e4ec36eb6729b..7a4f0cf54a6e2c 100644 --- a/vm_insnhelper.c +++ b/vm_insnhelper.c @@ -2352,9 +2352,9 @@ vm_search_method_slowpath0(VALUE cd_owner, struct rb_call_data *cd, VALUE klass) return cc; } -ALWAYS_INLINE(static const struct rb_callcache *vm_search_method_fastpath(VALUE cd_owner, struct rb_call_data *cd, VALUE klass)); +ALWAYS_INLINE(static const struct rb_callcache *vm_search_method_fastpath(const struct rb_control_frame_struct *reg_cfp, struct rb_call_data *cd, VALUE klass)); static const struct rb_callcache * -vm_search_method_fastpath(VALUE cd_owner, struct rb_call_data 
*cd, VALUE klass) +vm_search_method_fastpath(const struct rb_control_frame_struct *reg_cfp, struct rb_call_data *cd, VALUE klass) { const struct rb_callcache *cc = cd->cc; @@ -2376,24 +2376,28 @@ vm_search_method_fastpath(VALUE cd_owner, struct rb_call_data *cd, VALUE klass) } #endif - return vm_search_method_slowpath0(cd_owner, cd, klass); + return vm_search_method_slowpath0((VALUE)reg_cfp->iseq, cd, klass); } static const struct rb_callable_method_entry_struct * -vm_search_method(VALUE cd_owner, struct rb_call_data *cd, VALUE recv) +vm_search_method(struct rb_control_frame_struct *reg_cfp, struct rb_call_data *cd, VALUE recv) { VALUE klass = CLASS_OF(recv); VM_ASSERT(klass != Qfalse); VM_ASSERT(RBASIC_CLASS(klass) == 0 || rb_obj_is_kind_of(klass, rb_cClass)); - const struct rb_callcache *cc = vm_search_method_fastpath(cd_owner, cd, klass); + const struct rb_callcache *cc = vm_search_method_fastpath(reg_cfp, cd, klass); return vm_cc_cme(cc); } const struct rb_callable_method_entry_struct * rb_zjit_vm_search_method(VALUE cd_owner, struct rb_call_data *cd, VALUE recv) { - return vm_search_method(cd_owner, cd, recv); + // Called from ZJIT with the compile-time iseq, which may differ from + // the iseq on the current CFP. Use the slowpath to avoid stale caches. 
+ VALUE klass = CLASS_OF(recv); + const struct rb_callcache *cc = vm_search_method_slowpath0(cd_owner, cd, klass); + return vm_cc_cme(cc); } #if __has_attribute(transparent_union) @@ -2453,10 +2457,10 @@ check_method_basic_definition(const rb_callable_method_entry_t *me) } static inline int -vm_method_cfunc_is(const rb_iseq_t *iseq, CALL_DATA cd, VALUE recv, cfunc_type func) +vm_method_cfunc_is(struct rb_control_frame_struct *reg_cfp, CALL_DATA cd, VALUE recv, cfunc_type func) { - VM_ASSERT(iseq != NULL); - const struct rb_callable_method_entry_struct *cme = vm_search_method((VALUE)iseq, cd, recv); + VM_ASSERT(reg_cfp != NULL); + const struct rb_callable_method_entry_struct *cme = vm_search_method(reg_cfp, cd, recv); return check_cfunc(cme, func); } @@ -2469,11 +2473,16 @@ rb_zjit_cme_is_cfunc(const rb_callable_method_entry_t *me, const cfunc_type func int rb_vm_method_cfunc_is(const rb_iseq_t *iseq, CALL_DATA cd, VALUE recv, cfunc_type func) { - return vm_method_cfunc_is(iseq, cd, recv, func); + // Called from ZJIT with the compile-time iseq, which may differ from + // the iseq on the current CFP. Use the slowpath to avoid stale caches. 
+ VALUE klass = CLASS_OF(recv); + const struct rb_callcache *cc = vm_search_method_slowpath0((VALUE)iseq, cd, klass); + const struct rb_callable_method_entry_struct *cme = vm_cc_cme(cc); + return check_cfunc(cme, func); } #define check_cfunc(me, func) check_cfunc(me, make_cfunc_type(func)) -#define vm_method_cfunc_is(iseq, cd, recv, func) vm_method_cfunc_is(iseq, cd, recv, make_cfunc_type(func)) +#define vm_method_cfunc_is(reg_cfp, cd, recv, func) vm_method_cfunc_is(reg_cfp, cd, recv, make_cfunc_type(func)) #define EQ_UNREDEFINED_P(t) BASIC_OP_UNREDEFINED_P(BOP_EQ, t##_REDEFINED_OP_FLAG) @@ -2542,14 +2551,14 @@ opt_equality_specialized(VALUE recv, VALUE obj) } static VALUE -opt_equality(const rb_iseq_t *cd_owner, VALUE recv, VALUE obj, CALL_DATA cd) +opt_equality(struct rb_control_frame_struct *reg_cfp, VALUE recv, VALUE obj, CALL_DATA cd) { - VM_ASSERT(cd_owner != NULL); + VM_ASSERT(reg_cfp != NULL); VALUE val = opt_equality_specialized(recv, obj); if (!UNDEF_P(val)) return val; - if (!vm_method_cfunc_is(cd_owner, cd, recv, rb_obj_equal)) { + if (!vm_method_cfunc_is(reg_cfp, cd, recv, rb_obj_equal)) { return Qundef; } else { @@ -5171,7 +5180,7 @@ vm_search_super_method(const rb_control_frame_t *reg_cfp, struct rb_call_data *c RB_OBJ_WRITE(reg_cfp->iseq, &cd->cc, cc); } else { - cc = vm_search_method_fastpath((VALUE)reg_cfp->iseq, cd, klass); + cc = vm_search_method_fastpath(reg_cfp, cd, klass); const rb_callable_method_entry_t *cached_cme = vm_cc_cme(cc); // define_method can cache for different method id @@ -6123,7 +6132,7 @@ vm_sendish( switch (method_explorer) { case mexp_search_method: - calling.cc = cc = vm_search_method_fastpath((VALUE)reg_cfp->iseq, cd, CLASS_OF(recv)); + calling.cc = cc = vm_search_method_fastpath(reg_cfp, cd, CLASS_OF(recv)); val = vm_cc_call(cc)(ec, GET_CFP(), &calling); break; case mexp_search_super: @@ -6230,14 +6239,14 @@ VALUE rb_mod_to_s(VALUE); VALUE rb_mod_name(VALUE); static VALUE -vm_objtostring(const rb_iseq_t *iseq, VALUE 
recv, CALL_DATA cd) +vm_objtostring(struct rb_control_frame_struct *reg_cfp, VALUE recv, CALL_DATA cd) { int type = TYPE(recv); if (type == T_STRING) { return recv; } - const struct rb_callable_method_entry_struct *cme = vm_search_method((VALUE)iseq, cd, recv); + const struct rb_callable_method_entry_struct *cme = vm_search_method(reg_cfp, cd, recv); switch (type) { case T_SYMBOL: @@ -6288,9 +6297,9 @@ vm_objtostring(const rb_iseq_t *iseq, VALUE recv, CALL_DATA cd) // ZJIT implementation is using the C function // and needs to call a non-static function VALUE -rb_vm_objtostring(const rb_iseq_t *iseq, VALUE recv, CALL_DATA cd) +rb_vm_objtostring(struct rb_control_frame_struct *reg_cfp, VALUE recv, CALL_DATA cd) { - return vm_objtostring(iseq, recv, cd); + return vm_objtostring(reg_cfp, recv, cd); } static VALUE @@ -6841,10 +6850,10 @@ vm_opt_mod(VALUE recv, VALUE obj) } static VALUE -vm_opt_neq(const rb_iseq_t *iseq, CALL_DATA cd, CALL_DATA cd_eq, VALUE recv, VALUE obj) +vm_opt_neq(struct rb_control_frame_struct *reg_cfp, CALL_DATA cd, CALL_DATA cd_eq, VALUE recv, VALUE obj) { - if (vm_method_cfunc_is(iseq, cd, recv, rb_obj_not_equal)) { - VALUE val = opt_equality(iseq, recv, obj, cd_eq); + if (vm_method_cfunc_is(reg_cfp, cd, recv, rb_obj_not_equal)) { + VALUE val = opt_equality(reg_cfp, recv, obj, cd_eq); if (!UNDEF_P(val)) { return RBOOL(!RTEST(val)); @@ -7096,13 +7105,13 @@ vm_opt_empty_p(VALUE recv) VALUE rb_false(VALUE obj); static VALUE -vm_opt_nil_p(const rb_iseq_t *iseq, CALL_DATA cd, VALUE recv) +vm_opt_nil_p(struct rb_control_frame_struct *reg_cfp, CALL_DATA cd, VALUE recv) { if (NIL_P(recv) && BASIC_OP_UNREDEFINED_P(BOP_NIL_P, NIL_REDEFINED_OP_FLAG)) { return Qtrue; } - else if (vm_method_cfunc_is(iseq, cd, recv, rb_false)) { + else if (vm_method_cfunc_is(reg_cfp, cd, recv, rb_false)) { return Qfalse; } else { @@ -7158,9 +7167,9 @@ vm_opt_succ(VALUE recv) } static VALUE -vm_opt_not(const rb_iseq_t *iseq, CALL_DATA cd, VALUE recv) +vm_opt_not(struct 
rb_control_frame_struct *reg_cfp, CALL_DATA cd, VALUE recv) { - if (vm_method_cfunc_is(iseq, cd, recv, rb_obj_not)) { + if (vm_method_cfunc_is(reg_cfp, cd, recv, rb_obj_not)) { return RBOOL(!RTEST(recv)); } else { diff --git a/zjit/src/codegen.rs b/zjit/src/codegen.rs index b473bc69a630c1..d2d5c6abd6b1b8 100644 --- a/zjit/src/codegen.rs +++ b/zjit/src/codegen.rs @@ -742,8 +742,8 @@ fn gen_get_ep(asm: &mut Assembler, level: u32) -> Opnd { fn gen_objtostring(jit: &mut JITState, asm: &mut Assembler, val: Opnd, cd: *const rb_call_data, state: &FrameState) -> Opnd { gen_prepare_non_leaf_call(jit, asm, state); // TODO: Specialize for immediate types - // Call rb_vm_objtostring(iseq, recv, cd) - let ret = asm_ccall!(asm, rb_vm_objtostring, VALUE::from(jit.iseq).into(), val, Opnd::const_ptr(cd)); + // Call rb_vm_objtostring(cfp, recv, cd) + let ret = asm_ccall!(asm, rb_vm_objtostring, CFP, val, Opnd::const_ptr(cd)); // TODO: Call `to_s` on the receiver if rb_vm_objtostring returns Qundef // Need to replicate what CALL_SIMPLE_METHOD does diff --git a/zjit/src/cruby.rs b/zjit/src/cruby.rs index e1343f9b40dfdf..b92df55d48dbfe 100644 --- a/zjit/src/cruby.rs +++ b/zjit/src/cruby.rs @@ -165,7 +165,7 @@ unsafe extern "C" { pub fn rb_vm_stack_canary() -> VALUE; pub fn rb_vm_push_cfunc_frame(cme: *const rb_callable_method_entry_t, recv_idx: c_int); pub fn rb_obj_class(klass: VALUE) -> VALUE; - pub fn rb_vm_objtostring(iseq: IseqPtr, recv: VALUE, cd: *const rb_call_data) -> VALUE; + pub fn rb_vm_objtostring(reg_cfp: CfpPtr, recv: VALUE, cd: *const rb_call_data) -> VALUE; } // Renames From 2ca2865aa194cf832b205f530b4fb440166aea68 Mon Sep 17 00:00:00 2001 From: Takashi Kokubun Date: Wed, 25 Mar 2026 09:44:30 -0700 Subject: [PATCH 09/12] ZJIT: Remove eager nil-fill of locals in JIT-to-JIT calls (#16544) The caller in gen_send_iseq_direct was eagerly writing Qnil to all non-parameter local slots of the callee's frame before every JIT-to-JIT call. 
This is unnecessary because compile_jit_entry_state already initializes non-parameter locals to Const(Qnil) in the JIT entry block, and these values are propagated to the target block via branch edges. Before any non-leaf call (including eval/binding), gen_spill_locals writes these nil values from the FrameState to the stack, ensuring that eval can correctly read uninitialized locals as nil. The nil-fill in function_stub_hit's prepare_for_exit is kept because that path handles compilation failures where JIT code never runs. --- zjit/src/codegen.rs | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/zjit/src/codegen.rs b/zjit/src/codegen.rs index d2d5c6abd6b1b8..178b122ee6e5e6 100644 --- a/zjit/src/codegen.rs +++ b/zjit/src/codegen.rs @@ -1619,16 +1619,6 @@ fn gen_send_iseq_direct( 0 }; - // Fill non-parameter locals with nil (they may be read by eval before being written) - let num_params = params.size.to_usize(); - if local_size > num_params { - asm_comment!(asm, "initialize non-parameter locals to nil"); - for local_idx in num_params..local_size { - let offset = local_size_and_idx_to_bp_offset(local_size, local_idx); - asm.store(Opnd::mem(64, SP, -offset * SIZEOF_VALUE_I32), Qnil.into()); - } - } - // Make a method call. The target address will be rewritten once compiled. let iseq_call = IseqCall::new(iseq, num_optionals_passed); let dummy_ptr = cb.get_write_ptr().raw_ptr(cb); @@ -2876,7 +2866,7 @@ c_callable! { let pc = unsafe { rb_iseq_pc_at_idx(iseq, entry_insn_idxs[iseq_call.jit_entry_idx.to_usize()]) }; unsafe { rb_set_cfp_pc(cfp, pc) }; - // Successful JIT-to-JIT calls fill nils to non-parameter locals in generated code. + // JIT-to-JIT calls don't eagerly fill nils to non-parameter locals. // If we side-exit from function_stub_hit (before JIT code runs), we need to set them here. 
fn prepare_for_exit(iseq: IseqPtr, cfp: CfpPtr, sp: *mut VALUE, compile_error: &CompileError) { unsafe { From 17cd9bffd37710dbcd746a62b3d49ea7213095ed Mon Sep 17 00:00:00 2001 From: Max Bernstein Date: Wed, 25 Mar 2026 09:35:56 -0400 Subject: [PATCH 10/12] ZJIT: Support negative array indices Previously we would side-exit if the index was negative. Instead, adjust the index to be in-bounds. --- zjit/src/codegen.rs | 9 +++ zjit/src/cruby_methods.rs | 1 + zjit/src/hir.rs | 27 ++++++++ zjit/src/hir/opt_tests.rs | 126 ++++++++++++++++++++++---------------- 4 files changed, 109 insertions(+), 54 deletions(-) diff --git a/zjit/src/codegen.rs b/zjit/src/codegen.rs index 178b122ee6e5e6..af6e881bd02510 100644 --- a/zjit/src/codegen.rs +++ b/zjit/src/codegen.rs @@ -561,6 +561,7 @@ fn gen_insn(cb: &mut CodeBlock, jit: &mut JITState, asm: &mut Assembler, functio Insn::NewRange { low, high, flag, state } => gen_new_range(jit, asm, opnd!(low), opnd!(high), *flag, &function.frame_state(*state)), Insn::NewRangeFixnum { low, high, flag, state } => gen_new_range_fixnum(asm, opnd!(low), opnd!(high), *flag, &function.frame_state(*state)), Insn::ArrayDup { val, state } => gen_array_dup(asm, opnd!(val), &function.frame_state(*state)), + Insn::AdjustBounds { index, length } => gen_adjust_bounds(asm, opnd!(index), opnd!(length)), Insn::ArrayAref { array, index, .. } => gen_array_aref(asm, opnd!(array), opnd!(index)), Insn::ArrayAset { array, index, val } => { no_output!(gen_array_aset(asm, opnd!(array), opnd!(index), opnd!(val))) @@ -1781,6 +1782,14 @@ fn gen_new_array( } } +/// Adjust potentially-negative index by the given length, returning the adjusted index. If still negative, +/// return a negative number, which indicates the index is still out-of-bounds. 
+fn gen_adjust_bounds(asm: &mut Assembler, index: Opnd, length: Opnd) -> lir::Opnd { + let adjusted = asm.add(index, length); + asm.test(index, index); + asm.csel_l(adjusted, index) +} + /// Compile array access (`array[index]`) fn gen_array_aref( asm: &mut Assembler, diff --git a/zjit/src/cruby_methods.rs b/zjit/src/cruby_methods.rs index d39f38028743c5..bd4310e0845bd7 100644 --- a/zjit/src/cruby_methods.rs +++ b/zjit/src/cruby_methods.rs @@ -354,6 +354,7 @@ fn inline_array_aref(fun: &mut hir::Function, block: hir::BlockId, recv: hir::In let index = fun.push_insn(block, hir::Insn::UnboxFixnum { val: index }); let length = fun.push_insn(block, hir::Insn::ArrayLength { array: recv }); let index = fun.push_insn(block, hir::Insn::GuardLess { left: index, right: length, state }); + let index = fun.push_insn(block, hir::Insn::AdjustBounds { index, length }); let zero = fun.push_insn(block, hir::Insn::Const { val: hir::Const::CInt64(0) }); use crate::hir::SideExitReason; let index = fun.push_insn(block, hir::Insn::GuardGreaterEq { left: index, right: zero, reason: SideExitReason::GuardGreaterEq, state }); diff --git a/zjit/src/hir.rs b/zjit/src/hir.rs index b14298c935e981..f2e02181175a26 100644 --- a/zjit/src/hir.rs +++ b/zjit/src/hir.rs @@ -806,6 +806,9 @@ pub enum Insn { ArrayPop { array: InsnId, state: InsnId }, /// Return the length of the array as a C `long` ([`types::CInt64`]) ArrayLength { array: InsnId }, + /// Adjust potentially-negative index by the given length, returning the adjusted index. If + /// still negative, return a negative number, which indicates the index is still out-of-bounds. + AdjustBounds { index: InsnId, length: InsnId }, HashAref { hash: InsnId, key: InsnId, state: InsnId }, HashAset { hash: InsnId, key: InsnId, val: InsnId, state: InsnId }, @@ -1270,6 +1273,10 @@ macro_rules! 
for_each_operand_impl { Insn::ArrayLength { array } => { $visit_one!(array); } + Insn::AdjustBounds { index, length } => { + $visit_one!(index); + $visit_one!(length); + } Insn::HashAref { hash, key, state } => { $visit_one!(hash); $visit_one!(key); @@ -1472,6 +1479,7 @@ impl Insn { Insn::ArrayAset { .. } => effects::Any, Insn::ArrayPop { .. } => effects::Any, Insn::ArrayLength { .. } => Effect::write(abstract_heaps::Empty), + Insn::AdjustBounds { .. } => effects::Empty, Insn::HashAref { .. } => effects::Any, Insn::HashAset { .. } => effects::Any, Insn::HashDup { .. } => allocates, @@ -1713,6 +1721,9 @@ impl<'a> std::fmt::Display for InsnPrinter<'a> { Insn::ArrayLength { array } => { write!(f, "ArrayLength {array}") } + Insn::AdjustBounds { index, length } => { + write!(f, "AdjustBounds {index}, {length}") + } Insn::NewHash { elements, .. } => { write!(f, "NewHash")?; let mut prefix = " "; @@ -2806,6 +2817,7 @@ impl Function { &ArrayAset { array, index, val } => ArrayAset { array: find!(array), index: find!(index), val: find!(val) }, &ArrayPop { array, state } => ArrayPop { array: find!(array), state: find!(state) }, &ArrayLength { array } => ArrayLength { array: find!(array) }, + &AdjustBounds { index, length } => AdjustBounds { index: find!(index), length: find!(length) }, &ArrayMax { ref elements, state } => ArrayMax { elements: find_vec!(elements), state: find!(state) }, &ArrayInclude { ref elements, target, state } => ArrayInclude { elements: find_vec!(elements), target: find!(target), state: find!(state) }, &ArrayPackBuffer { ref elements, fmt, buffer, state } => ArrayPackBuffer { elements: find_vec!(elements), fmt: find!(fmt), buffer: find!(buffer), state: find!(state) }, @@ -2923,6 +2935,7 @@ impl Function { Insn::ArrayAref { .. } => types::BasicObject, Insn::ArrayPop { .. } => types::BasicObject, Insn::ArrayLength { .. } => types::CInt64, + Insn::AdjustBounds { .. } => types::CInt64, Insn::HashAref { .. } => types::BasicObject, Insn::NewHash { .. 
} => types::HashExact, Insn::HashDup { .. } => types::HashExact, @@ -5281,6 +5294,16 @@ impl Function { _ => insn_id, } } + Insn::AdjustBounds { index, .. } => { + // If index is known nonnegative, then we don't need to adjust bounds. + if self.type_of(index).cint64_value().filter(|&i| i >= 0).is_some() { + self.make_equal_to(insn_id, index); + // Don't bother re-inferring the type of index; we already know it. + continue; + } else { + insn_id + } + } Insn::Test { val } if self.type_of(val).is_known_falsy() => { self.new_insn(Insn::Const { val: Const::CBool(false) }) } @@ -6063,6 +6086,10 @@ impl Function { self.assert_subtype(insn_id, array, types::ArrayExact)?; self.assert_subtype(insn_id, index, types::CInt64) } + Insn::AdjustBounds { index, length } => { + self.assert_subtype(insn_id, index, types::CInt64)?; + self.assert_subtype(insn_id, length, types::CInt64) + } // Instructions with Hash operands Insn::HashAref { hash, .. } | Insn::HashAset { hash, .. } => self.assert_subtype(insn_id, hash, types::HashExact), diff --git a/zjit/src/hir/opt_tests.rs b/zjit/src/hir/opt_tests.rs index fb9e92b5bf0c45..db3889449f149a 100644 --- a/zjit/src/hir/opt_tests.rs +++ b/zjit/src/hir/opt_tests.rs @@ -1014,12 +1014,12 @@ mod hir_opt_tests { PatchPoint NoSingletonClass(Array@0x1008) PatchPoint MethodRedefined(Array@0x1008, []@0x1010, cme:0x1018) v27:ArrayExact = GuardType v10, ArrayExact - v34:CInt64[0] = Const CInt64(0) + v35:CInt64[0] = Const CInt64(0) v29:CInt64 = ArrayLength v27 - v30:CInt64[0] = GuardLess v34, v29 - v33:BasicObject = ArrayAref v27, v30 + v30:CInt64[0] = GuardLess v35, v29 + v34:BasicObject = ArrayAref v27, v30 CheckInterrupts - Return v33 + Return v34 "); } @@ -1047,12 +1047,12 @@ mod hir_opt_tests { PatchPoint NoSingletonClass(Array@0x1008) PatchPoint MethodRedefined(Array@0x1008, []@0x1010, cme:0x1018) v27:ArrayExact = GuardType v10, ArrayExact - v34:CInt64[0] = Const CInt64(0) + v35:CInt64[0] = Const CInt64(0) v29:CInt64 = ArrayLength v27 - 
v30:CInt64[0] = GuardLess v34, v29 - v33:BasicObject = ArrayAref v27, v30 + v30:CInt64[0] = GuardLess v35, v29 + v34:BasicObject = ArrayAref v27, v30 CheckInterrupts - Return v33 + Return v34 "); } @@ -1077,10 +1077,15 @@ mod hir_opt_tests { v13:Fixnum[-10] = Const Value(-10) PatchPoint NoSingletonClass(Array@0x1008) PatchPoint MethodRedefined(Array@0x1008, []@0x1010, cme:0x1018) - v31:CInt64[-10] = Const CInt64(-10) + v32:CInt64[-10] = Const CInt64(-10) v26:CInt64 = ArrayLength v11 - v27:CInt64[-10] = GuardLess v31, v26 - SideExit GuardGreaterEq + v27:CInt64[-10] = GuardLess v32, v26 + v28:CInt64 = AdjustBounds v27, v26 + v29:CInt64[0] = Const CInt64(0) + v30:CInt64 = GuardGreaterEq v28, v29 + v31:BasicObject = ArrayAref v11, v30 + CheckInterrupts + Return v31 "); } @@ -2343,12 +2348,12 @@ mod hir_opt_tests { PatchPoint NoSingletonClass(Array@0x1008) PatchPoint MethodRedefined(Array@0x1008, []@0x1010, cme:0x1018) v27:ArrayExact = GuardType v10, ArrayExact - v34:CInt64[0] = Const CInt64(0) + v35:CInt64[0] = Const CInt64(0) v29:CInt64 = ArrayLength v27 - v30:CInt64[0] = GuardLess v34, v29 - v33:BasicObject = ArrayAref v27, v30 + v30:CInt64[0] = GuardLess v35, v29 + v34:BasicObject = ArrayAref v27, v30 CheckInterrupts - Return v33 + Return v34 "); assert_snapshot!(inspect("test [1,2,3]"), @"1"); } @@ -6055,12 +6060,12 @@ mod hir_opt_tests { v12:Fixnum[0] = Const Value(0) PatchPoint NoSingletonClass(Array@0x1010) PatchPoint MethodRedefined(Array@0x1010, []@0x1018, cme:0x1020) - v33:CInt64[0] = Const CInt64(0) + v34:CInt64[0] = Const CInt64(0) v28:CInt64 = ArrayLength v23 - v29:CInt64[0] = GuardLess v33, v28 - v32:BasicObject = ArrayAref v23, v29 + v29:CInt64[0] = GuardLess v34, v28 + v33:BasicObject = ArrayAref v23, v29 CheckInterrupts - Return v32 + Return v33 "); // TODO(max): Check the result of `S[0] = 5; test` using `inspect` to make sure that we // actually do the load at run-time. 
@@ -6087,12 +6092,12 @@ mod hir_opt_tests { v13:Fixnum[1] = Const Value(1) PatchPoint NoSingletonClass(Array@0x1008) PatchPoint MethodRedefined(Array@0x1008, []@0x1010, cme:0x1018) - v31:CInt64[1] = Const CInt64(1) + v32:CInt64[1] = Const CInt64(1) v26:CInt64 = ArrayLength v11 - v27:CInt64[1] = GuardLess v31, v26 - v32:Fixnum[5] = Const Value(5) + v27:CInt64[1] = GuardLess v32, v26 + v33:Fixnum[5] = Const Value(5) CheckInterrupts - Return v32 + Return v33 "); } @@ -6117,10 +6122,15 @@ mod hir_opt_tests { v13:Fixnum[-3] = Const Value(-3) PatchPoint NoSingletonClass(Array@0x1008) PatchPoint MethodRedefined(Array@0x1008, []@0x1010, cme:0x1018) - v31:CInt64[-3] = Const CInt64(-3) + v32:CInt64[-3] = Const CInt64(-3) v26:CInt64 = ArrayLength v11 - v27:CInt64[-3] = GuardLess v31, v26 - SideExit GuardGreaterEq + v27:CInt64[-3] = GuardLess v32, v26 + v28:CInt64 = AdjustBounds v27, v26 + v29:CInt64[0] = Const CInt64(0) + v30:CInt64 = GuardGreaterEq v28, v29 + v31:BasicObject = ArrayAref v11, v30 + CheckInterrupts + Return v31 "); } @@ -6145,10 +6155,15 @@ mod hir_opt_tests { v13:Fixnum[-10] = Const Value(-10) PatchPoint NoSingletonClass(Array@0x1008) PatchPoint MethodRedefined(Array@0x1008, []@0x1010, cme:0x1018) - v31:CInt64[-10] = Const CInt64(-10) + v32:CInt64[-10] = Const CInt64(-10) v26:CInt64 = ArrayLength v11 - v27:CInt64[-10] = GuardLess v31, v26 - SideExit GuardGreaterEq + v27:CInt64[-10] = GuardLess v32, v26 + v28:CInt64 = AdjustBounds v27, v26 + v29:CInt64[0] = Const CInt64(0) + v30:CInt64 = GuardGreaterEq v28, v29 + v31:BasicObject = ArrayAref v11, v30 + CheckInterrupts + Return v31 "); } @@ -6173,12 +6188,12 @@ mod hir_opt_tests { v13:Fixnum[10] = Const Value(10) PatchPoint NoSingletonClass(Array@0x1008) PatchPoint MethodRedefined(Array@0x1008, []@0x1010, cme:0x1018) - v31:CInt64[10] = Const CInt64(10) + v32:CInt64[10] = Const CInt64(10) v26:CInt64 = ArrayLength v11 - v27:CInt64[10] = GuardLess v31, v26 - v32:NilClass = Const Value(nil) + v27:CInt64[10] = 
GuardLess v32, v26 + v33:NilClass = Const Value(nil) CheckInterrupts - Return v32 + Return v33 "); } @@ -8665,12 +8680,12 @@ mod hir_opt_tests { v19:Fixnum[0] = Const Value(0) PatchPoint NoSingletonClass(Array@0x1008) PatchPoint MethodRedefined(Array@0x1008, []@0x1010, cme:0x1018) - v37:CInt64[0] = Const CInt64(0) + v38:CInt64[0] = Const CInt64(0) v32:CInt64 = ArrayLength v14 - v33:CInt64[0] = GuardLess v37, v32 - v36:BasicObject = ArrayAref v14, v33 + v33:CInt64[0] = GuardLess v38, v32 + v37:BasicObject = ArrayAref v14, v33 CheckInterrupts - Return v36 + Return v37 "); } @@ -8705,11 +8720,12 @@ mod hir_opt_tests { v31:CInt64 = UnboxFixnum v30 v32:CInt64 = ArrayLength v29 v33:CInt64 = GuardLess v31, v32 - v34:CInt64[0] = Const CInt64(0) - v35:CInt64 = GuardGreaterEq v33, v34 - v36:BasicObject = ArrayAref v29, v35 + v34:CInt64 = AdjustBounds v33, v32 + v35:CInt64[0] = Const CInt64(0) + v36:CInt64 = GuardGreaterEq v34, v35 + v37:BasicObject = ArrayAref v29, v36 CheckInterrupts - Return v36 + Return v37 "); } @@ -8745,11 +8761,12 @@ mod hir_opt_tests { v31:CInt64 = UnboxFixnum v30 v32:CInt64 = ArrayLength v29 v33:CInt64 = GuardLess v31, v32 - v34:CInt64[0] = Const CInt64(0) - v35:CInt64 = GuardGreaterEq v33, v34 - v36:BasicObject = ArrayAref v29, v35 + v34:CInt64 = AdjustBounds v33, v32 + v35:CInt64[0] = Const CInt64(0) + v36:CInt64 = GuardGreaterEq v34, v35 + v37:BasicObject = ArrayAref v29, v36 CheckInterrupts - Return v36 + Return v37 "); } @@ -9340,21 +9357,22 @@ mod hir_opt_tests { v25:CallableMethodEntry[VALUE(0x1048)] = GuardBitEquals v24, Value(VALUE(0x1048)) v26:RubyValue = LoadField v23, :_ep_specval@0x1050 v27:FalseClass = GuardBitEquals v26, Value(false) - v37:CPtr = GetEP 0 - v38:RubyValue = LoadField v37, :_ep_method_entry@0x1040 - v39:CallableMethodEntry[VALUE(0x1048)] = GuardBitEquals v38, Value(VALUE(0x1048)) - v40:RubyValue = LoadField v37, :_ep_specval@0x1050 - v41:FalseClass = GuardBitEquals v40, Value(false) + v38:CPtr = GetEP 0 + v39:RubyValue = 
LoadField v38, :_ep_method_entry@0x1040 + v40:CallableMethodEntry[VALUE(0x1048)] = GuardBitEquals v39, Value(VALUE(0x1048)) + v41:RubyValue = LoadField v38, :_ep_specval@0x1050 + v42:FalseClass = GuardBitEquals v41, Value(false) v28:Array = GuardType v9, Array v29:Fixnum = GuardType v10, Fixnum v30:CInt64 = UnboxFixnum v29 v31:CInt64 = ArrayLength v28 v32:CInt64 = GuardLess v30, v31 - v33:CInt64[0] = Const CInt64(0) - v34:CInt64 = GuardGreaterEq v32, v33 - v35:BasicObject = ArrayAref v28, v34 + v33:CInt64 = AdjustBounds v32, v31 + v34:CInt64[0] = Const CInt64(0) + v35:CInt64 = GuardGreaterEq v33, v34 + v36:BasicObject = ArrayAref v28, v35 CheckInterrupts - Return v35 + Return v36 "); } From be783db2c2bb73456f808291aa4f72fa02861641 Mon Sep 17 00:00:00 2001 From: Max Bernstein Date: Wed, 25 Mar 2026 09:59:07 -0400 Subject: [PATCH 11/12] ZJIT: Support negative indices in more places --- zjit/src/cruby_methods.rs | 3 +++ zjit/src/hir/opt_tests.rs | 47 ++++++++++++++++++++++----------------- 2 files changed, 29 insertions(+), 21 deletions(-) diff --git a/zjit/src/cruby_methods.rs b/zjit/src/cruby_methods.rs index bd4310e0845bd7..767f6499e80606 100644 --- a/zjit/src/cruby_methods.rs +++ b/zjit/src/cruby_methods.rs @@ -379,6 +379,7 @@ fn inline_array_aset(fun: &mut hir::Function, block: hir::BlockId, recv: hir::In let index = fun.push_insn(block, hir::Insn::UnboxFixnum { val: index }); let length = fun.push_insn(block, hir::Insn::ArrayLength { array: recv }); let index = fun.push_insn(block, hir::Insn::GuardLess { left: index, right: length, state }); + let index = fun.push_insn(block, hir::Insn::AdjustBounds { index, length }); let zero = fun.push_insn(block, hir::Insn::Const { val: hir::Const::CInt64(0) }); use crate::hir::SideExitReason; let index = fun.push_insn(block, hir::Insn::GuardGreaterEq { left: index, right: zero, reason: SideExitReason::GuardGreaterEq, state }); @@ -476,6 +477,7 @@ fn inline_string_getbyte(fun: &mut hir::Function, block: hir::BlockId, recv: 
hir // // This is unlike most other guards. let unboxed_index = fun.push_insn(block, hir::Insn::GuardLess { left: unboxed_index, right: len, state }); + let unboxed_index = fun.push_insn(block, hir::Insn::AdjustBounds { index: unboxed_index, length: len }); let zero = fun.push_insn(block, hir::Insn::Const { val: hir::Const::CInt64(0) }); use crate::hir::SideExitReason; let _ = fun.push_insn(block, hir::Insn::GuardGreaterEq { left: unboxed_index, right: zero, reason: SideExitReason::GuardGreaterEq, state }); @@ -499,6 +501,7 @@ fn inline_string_setbyte(fun: &mut hir::Function, block: hir::BlockId, recv: hir return_type: types::CInt64, }); let unboxed_index = fun.push_insn(block, hir::Insn::GuardLess { left: unboxed_index, right: len, state }); + let unboxed_index = fun.push_insn(block, hir::Insn::AdjustBounds { index: unboxed_index, length: len }); let zero = fun.push_insn(block, hir::Insn::Const { val: hir::Const::CInt64(0) }); use crate::hir::SideExitReason; let _ = fun.push_insn(block, hir::Insn::GuardGreaterEq { left: unboxed_index, right: zero, reason: SideExitReason::GuardGreaterEq, state }); diff --git a/zjit/src/hir/opt_tests.rs b/zjit/src/hir/opt_tests.rs index db3889449f149a..af438c361b8af9 100644 --- a/zjit/src/hir/opt_tests.rs +++ b/zjit/src/hir/opt_tests.rs @@ -9062,9 +9062,9 @@ mod hir_opt_tests { v34:CUInt64 = LoadField v33, :_rbasic_flags@0x1040 v35:CUInt64 = GuardNoBitsSet v34, RUBY_FL_FREEZE=CUInt64(2048) v37:CUInt64 = GuardNoBitsSet v34, RUBY_ELTS_SHARED=CUInt64(4096) - v45:CInt64[1] = Const CInt64(1) + v46:CInt64[1] = Const CInt64(1) v39:CInt64 = ArrayLength v33 - v40:CInt64[1] = GuardLess v45, v39 + v40:CInt64[1] = GuardLess v46, v39 ArrayAset v33, v40, v19 WriteBarrier v33, v19 CheckInterrupts @@ -9108,9 +9108,10 @@ mod hir_opt_tests { v43:CInt64 = UnboxFixnum v38 v44:CInt64 = ArrayLength v37 v45:CInt64 = GuardLess v43, v44 - v46:CInt64[0] = Const CInt64(0) - v47:CInt64 = GuardGreaterEq v45, v46 - ArrayAset v37, v47, v16 + v46:CInt64 = 
AdjustBounds v45, v44 + v47:CInt64[0] = Const CInt64(0) + v48:CInt64 = GuardGreaterEq v46, v47 + ArrayAset v37, v48, v16 WriteBarrier v37, v16 CheckInterrupts Return v16 @@ -9533,11 +9534,12 @@ mod hir_opt_tests { v30:CInt64 = UnboxFixnum v29 v31:CInt64 = LoadField v28, :len@0x1040 v32:CInt64 = GuardLess v30, v31 - v33:CInt64[0] = Const CInt64(0) - v34:CInt64 = GuardGreaterEq v32, v33 - v35:Fixnum = StringGetbyte v28, v32 + v33:CInt64 = AdjustBounds v32, v31 + v34:CInt64[0] = Const CInt64(0) + v35:CInt64 = GuardGreaterEq v33, v34 + v36:Fixnum = StringGetbyte v28, v33 CheckInterrupts - Return v35 + Return v36 "); } @@ -9573,8 +9575,9 @@ mod hir_opt_tests { v34:CInt64 = UnboxFixnum v33 v35:CInt64 = LoadField v32, :len@0x1040 v36:CInt64 = GuardLess v34, v35 - v37:CInt64[0] = Const CInt64(0) - v38:CInt64 = GuardGreaterEq v36, v37 + v37:CInt64 = AdjustBounds v36, v35 + v38:CInt64[0] = Const CInt64(0) + v39:CInt64 = GuardGreaterEq v37, v38 v23:Fixnum[5] = Const Value(5) CheckInterrupts Return v23 @@ -9615,11 +9618,12 @@ mod hir_opt_tests { v35:CInt64 = UnboxFixnum v33 v36:CInt64 = LoadField v32, :len@0x1040 v37:CInt64 = GuardLess v35, v36 - v38:CInt64[0] = Const CInt64(0) - v39:CInt64 = GuardGreaterEq v37, v38 - v40:CUInt64 = LoadField v32, :_rbasic_flags@0x1041 - v41:CUInt64 = GuardNoBitsSet v40, RUBY_FL_FREEZE=CUInt64(2048) - v42:Fixnum = StringSetbyteFixnum v32, v33, v34 + v38:CInt64 = AdjustBounds v37, v36 + v39:CInt64[0] = Const CInt64(0) + v40:CInt64 = GuardGreaterEq v38, v39 + v41:CUInt64 = LoadField v32, :_rbasic_flags@0x1041 + v42:CUInt64 = GuardNoBitsSet v41, RUBY_FL_FREEZE=CUInt64(2048) + v43:Fixnum = StringSetbyteFixnum v32, v33, v34 CheckInterrupts Return v34 "); @@ -9661,11 +9665,12 @@ mod hir_opt_tests { v35:CInt64 = UnboxFixnum v33 v36:CInt64 = LoadField v32, :len@0x1040 v37:CInt64 = GuardLess v35, v36 - v38:CInt64[0] = Const CInt64(0) - v39:CInt64 = GuardGreaterEq v37, v38 - v40:CUInt64 = LoadField v32, :_rbasic_flags@0x1041 - v41:CUInt64 = 
GuardNoBitsSet v40, RUBY_FL_FREEZE=CUInt64(2048) - v42:Fixnum = StringSetbyteFixnum v32, v33, v34 + v38:CInt64 = AdjustBounds v37, v36 + v39:CInt64[0] = Const CInt64(0) + v40:CInt64 = GuardGreaterEq v38, v39 + v41:CUInt64 = LoadField v32, :_rbasic_flags@0x1041 + v42:CUInt64 = GuardNoBitsSet v41, RUBY_FL_FREEZE=CUInt64(2048) + v43:Fixnum = StringSetbyteFixnum v32, v33, v34 CheckInterrupts Return v34 "); From 8514166d54eec6ecb5cb79d56d4ea6cf74d4a390 Mon Sep 17 00:00:00 2001 From: Max Bernstein Date: Wed, 25 Mar 2026 09:59:31 -0400 Subject: [PATCH 12/12] ZJIT: Adjust Type API for checking signedness --- zjit/src/hir.rs | 2 +- zjit/src/hir_type/mod.rs | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/zjit/src/hir.rs b/zjit/src/hir.rs index f2e02181175a26..0891a59fa2c2b6 100644 --- a/zjit/src/hir.rs +++ b/zjit/src/hir.rs @@ -5296,7 +5296,7 @@ impl Function { } Insn::AdjustBounds { index, .. } => { // If index is known nonnegative, then we don't need to adjust bounds. - if self.type_of(index).cint64_value().filter(|&i| i >= 0).is_some() { + if self.type_of(index).known_nonnegative() { self.make_equal_to(insn_id, index); // Don't bother re-inferring the type of index; we already know it. continue; diff --git a/zjit/src/hir_type/mod.rs b/zjit/src/hir_type/mod.rs index e1e2c1a8104d51..1e6c0d2df7f8b8 100644 --- a/zjit/src/hir_type/mod.rs +++ b/zjit/src/hir_type/mod.rs @@ -411,6 +411,19 @@ impl Type { } } + fn int_spec_signed(&self) -> Option { + assert!(self.is_subtype(types::CSigned), "int_spec_signed() only makes sense for signed integer types"); + match self.spec { + Specialization::Int(val) => Some(val as i64), + _ => None, + } + } + + pub fn known_nonnegative(&self) -> bool { + assert!(self.is_subtype(types::CSigned), "nonnegative() only makes sense for signed integer types"); + self.int_spec_signed().map_or(false, |val| val >= 0) + } + /// Return true if the Type has object specialization and false otherwise. 
pub fn ruby_object_known(&self) -> bool { matches!(self.spec, Specialization::Object(_))