Import ruby/prism@0cc6110

eregon · eregon · commit c87fe8fbcc68 · 2024-01-30T17:31:58.000+01:00
diff --git a/src/main/c/yarp/include/prism.h b/src/main/c/yarp/include/prism.h
@@ -168,7 +168,15 @@ PRISM_EXPORTED_FUNCTION bool pm_parse_success_p(const uint8_t *source, size_t si
  * @param token_type The token type to convert to a string.
  * @return A string representation of the given token type.
  */
-PRISM_EXPORTED_FUNCTION const char * pm_token_type_to_str(pm_token_type_t token_type);
+PRISM_EXPORTED_FUNCTION const char * pm_token_type_name(pm_token_type_t token_type);
+
+/**
+ * Returns the human-readable name of the given token type (for diagnostics).
+ *
+ * @param token_type The token type to convert to a human-readable name.
+ * @return The human-readable name of the given token type.
+ */
+const char * pm_token_type_human(pm_token_type_t token_type);
 
 /**
  * Format the errors on the parser into the given buffer.
diff --git a/src/main/c/yarp/include/prism/diagnostic.h b/src/main/c/yarp/include/prism/diagnostic.h
@@ -66,6 +66,11 @@ typedef struct {
  * of errors between the parser and the user.
  */
 typedef enum {
+    // This is the generic fallback error. parse_expression_prefix may replace
+    // it with a more specific unexpected-token error; see that function.
+    PM_ERR_CANNOT_PARSE_EXPRESSION,
+
+    // These are the error codes.
     PM_ERR_ALIAS_ARGUMENT,
     PM_ERR_AMPAMPEQ_MULTI_ASSIGN,
     PM_ERR_ARGUMENT_AFTER_BLOCK,
@@ -100,7 +105,6 @@ typedef enum {
     PM_ERR_BLOCK_PARAM_PIPE_TERM,
     PM_ERR_BLOCK_TERM_BRACE,
     PM_ERR_BLOCK_TERM_END,
-    PM_ERR_CANNOT_PARSE_EXPRESSION,
     PM_ERR_CANNOT_PARSE_STRING_PART,
     PM_ERR_CASE_EXPRESSION_AFTER_CASE,
     PM_ERR_CASE_EXPRESSION_AFTER_WHEN,
@@ -272,6 +276,8 @@ typedef enum {
     PM_ERR_UNARY_RECEIVER_MINUS,
     PM_ERR_UNARY_RECEIVER_PLUS,
     PM_ERR_UNARY_RECEIVER_TILDE,
+    PM_ERR_UNEXPECTED_TOKEN_CLOSE_CONTEXT,
+    PM_ERR_UNEXPECTED_TOKEN_IGNORE,
     PM_ERR_UNDEF_ARGUMENT,
     PM_ERR_UNTIL_TERM,
     PM_ERR_VOID_EXPRESSION,
@@ -280,13 +286,15 @@ typedef enum {
     PM_ERR_WRITE_TARGET_READONLY,
     PM_ERR_WRITE_TARGET_UNEXPECTED,
     PM_ERR_XSTRING_TERM,
+
+    // These are the warning codes.
     PM_WARN_AMBIGUOUS_FIRST_ARGUMENT_MINUS,
     PM_WARN_AMBIGUOUS_FIRST_ARGUMENT_PLUS,
     PM_WARN_AMBIGUOUS_PREFIX_STAR,
     PM_WARN_AMBIGUOUS_SLASH,
     PM_WARN_END_IN_METHOD,
 
-    /* This must be the last member. */
+    // This is the number of diagnostic codes.
     PM_DIAGNOSTIC_ID_LEN,
 } pm_diagnostic_id_t;
 
diff --git a/src/main/c/yarp/include/prism/parser.h b/src/main/c/yarp/include/prism/parser.h
@@ -259,6 +259,9 @@ typedef struct pm_parser pm_parser_t;
  * token that is understood by a parent context but not by the current context.
  */
 typedef enum {
+    /** a null context, used for returning a value from a function */
+    PM_CONTEXT_NONE = 0,
+
     /** a begin statement */
     PM_CONTEXT_BEGIN,
 
diff --git a/src/main/c/yarp/src/diagnostic.c b/src/main/c/yarp/src/diagnostic.c
@@ -71,6 +71,8 @@ typedef struct {
  * * `PM_WARNING_LEVEL_VERBOSE` - Warnings that appear with `-w`, as in `ruby -w -c -e 'code'`.
  */
 static const pm_diagnostic_data_t diagnostic_messages[PM_DIAGNOSTIC_ID_LEN] = {
+    [PM_ERR_CANNOT_PARSE_EXPRESSION]            = { "cannot parse the expression", PM_ERROR_LEVEL_FATAL },
+
     // Errors
     [PM_ERR_ALIAS_ARGUMENT]                     = { "invalid argument being passed to `alias`; expected a bare word, symbol, constant, or global variable", PM_ERROR_LEVEL_FATAL },
     [PM_ERR_AMPAMPEQ_MULTI_ASSIGN]              = { "unexpected `&&=` in a multiple assignment", PM_ERROR_LEVEL_FATAL },
@@ -106,7 +108,6 @@ static const pm_diagnostic_data_t diagnostic_messages[PM_DIAGNOSTIC_ID_LEN] = {
     [PM_ERR_BLOCK_PARAM_PIPE_TERM]              = { "expected the block parameters to end with `|`", PM_ERROR_LEVEL_FATAL },
     [PM_ERR_BLOCK_TERM_BRACE]                   = { "expected a block beginning with `{` to end with `}`", PM_ERROR_LEVEL_FATAL },
     [PM_ERR_BLOCK_TERM_END]                     = { "expected a block beginning with `do` to end with `end`", PM_ERROR_LEVEL_FATAL },
-    [PM_ERR_CANNOT_PARSE_EXPRESSION]            = { "cannot parse the expression", PM_ERROR_LEVEL_FATAL },
     [PM_ERR_CANNOT_PARSE_STRING_PART]           = { "cannot parse the string part", PM_ERROR_LEVEL_FATAL },
     [PM_ERR_CASE_EXPRESSION_AFTER_CASE]         = { "expected an expression after `case`", PM_ERROR_LEVEL_FATAL },
     [PM_ERR_CASE_EXPRESSION_AFTER_WHEN]         = { "expected an expression after `when`", PM_ERROR_LEVEL_FATAL },
@@ -277,6 +278,8 @@ static const pm_diagnostic_data_t diagnostic_messages[PM_DIAGNOSTIC_ID_LEN] = {
     [PM_ERR_UNARY_RECEIVER_BANG]                = { "expected a receiver for unary `!`", PM_ERROR_LEVEL_FATAL },
     [PM_ERR_UNARY_RECEIVER_MINUS]               = { "expected a receiver for unary `-`", PM_ERROR_LEVEL_FATAL },
     [PM_ERR_UNARY_RECEIVER_PLUS]                = { "expected a receiver for unary `+`", PM_ERROR_LEVEL_FATAL },
+    [PM_ERR_UNEXPECTED_TOKEN_CLOSE_CONTEXT]     = { "unexpected %s, assuming it is closing the parent %s", PM_ERROR_LEVEL_FATAL },
+    [PM_ERR_UNEXPECTED_TOKEN_IGNORE]            = { "unexpected %s, ignoring it", PM_ERROR_LEVEL_FATAL },
     [PM_ERR_UNARY_RECEIVER_TILDE]               = { "expected a receiver for unary `~`", PM_ERROR_LEVEL_FATAL },
     [PM_ERR_UNTIL_TERM]                         = { "expected an `end` to close the `until` statement", PM_ERROR_LEVEL_FATAL },
     [PM_ERR_VOID_EXPRESSION]                    = { "unexpected void value expression", PM_ERROR_LEVEL_FATAL },
diff --git a/src/main/c/yarp/src/prism.c b/src/main/c/yarp/src/prism.c
@@ -164,7 +164,7 @@ debug_state(pm_parser_t *parser) {
 
 PRISM_ATTRIBUTE_UNUSED static void
 debug_token(pm_token_t * token) {
-    fprintf(stderr, "%s: \"%.*s\"\n", pm_token_type_to_str(token->type), (int) (token->end - token->start), token->start);
+    fprintf(stderr, "%s: \"%.*s\"\n", pm_token_type_human(token->type), (int) (token->end - token->start), token->start);
 }
 
 #endif
@@ -6719,21 +6719,27 @@ context_terminator(pm_context_t context, pm_token_t *token) {
             return token->type == PM_TOKEN_BRACE_RIGHT;
         case PM_CONTEXT_PREDICATE:
             return token->type == PM_TOKEN_KEYWORD_THEN || token->type == PM_TOKEN_NEWLINE || token->type == PM_TOKEN_SEMICOLON;
+        case PM_CONTEXT_NONE:
+            return false;
     }
 
     return false;
 }
 
-static bool
-context_recoverable(pm_parser_t *parser, pm_token_t *token) {
+/**
+ * Returns the first context, walking from the current context back through its
+ * predecessors, that the given token terminates, or PM_CONTEXT_NONE if none.
+ */
+static pm_context_t
+context_recoverable(const pm_parser_t *parser, pm_token_t *token) {
     pm_context_node_t *context_node = parser->current_context;
 
     while (context_node != NULL) {
-        if (context_terminator(context_node->context, token)) return true;
+        if (context_terminator(context_node->context, token)) return context_node->context;
         context_node = context_node->prev;
     }
 
-    return false;
+    return PM_CONTEXT_NONE; // no context in the chain is terminated by this token
 }
 
 static bool
@@ -6761,7 +6767,7 @@ context_pop(pm_parser_t *parser) {
 }
 
 static bool
-context_p(pm_parser_t *parser, pm_context_t context) {
+context_p(const pm_parser_t *parser, pm_context_t context) {
     pm_context_node_t *context_node = parser->current_context;
 
     while (context_node != NULL) {
@@ -6773,7 +6779,7 @@ context_p(pm_parser_t *parser, pm_context_t context) {
 }
 
 static bool
-context_def_p(pm_parser_t *parser) {
+context_def_p(const pm_parser_t *parser) {
     pm_context_node_t *context_node = parser->current_context;
 
     while (context_node != NULL) {
@@ -6796,6 +6802,55 @@ context_def_p(pm_parser_t *parser) {
     return false;
 }
 
+/**
+ * Returns a human-readable description of the given context. It fills the
+ * "parent %s" slot of the PM_ERR_UNEXPECTED_TOKEN_CLOSE_CONTEXT error message.
+ */
+static const char *
+context_human(pm_context_t context) {
+    switch (context) {
+        case PM_CONTEXT_NONE: // not a real context; the caller checks for it first
+            assert(false && "unreachable");
+            return ""; // only reached when assertions are compiled out
+        case PM_CONTEXT_BEGIN: return "begin statement";
+        case PM_CONTEXT_BLOCK_BRACES: return "'{'..'}' block";
+        case PM_CONTEXT_BLOCK_KEYWORDS: return "'do'..'end' block";
+        case PM_CONTEXT_CASE_WHEN: return "'when' clause";
+        case PM_CONTEXT_CASE_IN: return "'in' clause";
+        case PM_CONTEXT_CLASS: return "class definition";
+        case PM_CONTEXT_DEF: return "method definition";
+        case PM_CONTEXT_DEF_PARAMS: return "method parameters";
+        case PM_CONTEXT_DEFAULT_PARAMS: return "parameter default value";
+        case PM_CONTEXT_ELSE: return "'else' clause";
+        case PM_CONTEXT_ELSIF: return "'elsif' clause";
+        case PM_CONTEXT_EMBEXPR: return "embedded expression";
+        case PM_CONTEXT_ENSURE: return "'ensure' clause";
+        case PM_CONTEXT_ENSURE_DEF: return "'ensure' clause"; // same wording as PM_CONTEXT_ENSURE
+        case PM_CONTEXT_FOR: return "for loop";
+        case PM_CONTEXT_FOR_INDEX: return "for loop index";
+        case PM_CONTEXT_IF: return "if statement";
+        case PM_CONTEXT_LAMBDA_BRACES: return "'{'..'}' lambda block";
+        case PM_CONTEXT_LAMBDA_DO_END: return "'do'..'end' lambda block";
+        case PM_CONTEXT_MAIN: return "top level context";
+        case PM_CONTEXT_MODULE: return "module definition";
+        case PM_CONTEXT_PARENS: return "parentheses";
+        case PM_CONTEXT_POSTEXE: return "'END' block";
+        case PM_CONTEXT_PREDICATE: return "predicate";
+        case PM_CONTEXT_PREEXE: return "'BEGIN' block";
+        case PM_CONTEXT_RESCUE_ELSE: return "'else' clause"; // same wording as PM_CONTEXT_ELSE
+        case PM_CONTEXT_RESCUE_ELSE_DEF: return "'else' clause"; // same wording as PM_CONTEXT_ELSE
+        case PM_CONTEXT_RESCUE: return "'rescue' clause";
+        case PM_CONTEXT_RESCUE_DEF: return "'rescue' clause"; // same wording as PM_CONTEXT_RESCUE
+        case PM_CONTEXT_SCLASS: return "singleton class definition";
+        case PM_CONTEXT_UNLESS: return "unless statement";
+        case PM_CONTEXT_UNTIL: return "until statement";
+        case PM_CONTEXT_WHILE: return "while statement";
+    }
+
+    // Reached only for a value that none of the cases above matched.
+    assert(false && "unreachable");
+    return "";
+}
+
 /******************************************************************************/
 /* Specific token lexers                                                      */
 /******************************************************************************/
@@ -10385,8 +10440,8 @@ parser_lex(pm_parser_t *parser) {
 typedef enum {
     PM_BINDING_POWER_UNSET =             0, // used to indicate this token cannot be used as an infix operator
     PM_BINDING_POWER_STATEMENT =         2,
-    PM_BINDING_POWER_MODIFIER =          4, // if unless until while
-    PM_BINDING_POWER_MODIFIER_RESCUE =   6, // rescue
+    PM_BINDING_POWER_MODIFIER_RESCUE =   4, // rescue
+    PM_BINDING_POWER_MODIFIER =          6, // if unless until while
     PM_BINDING_POWER_COMPOSITION =       8, // and or
     PM_BINDING_POWER_NOT =              10, // not
     PM_BINDING_POWER_MATCH =            12, // => in
@@ -10440,15 +10495,15 @@ typedef struct {
 #define RIGHT_ASSOCIATIVE_UNARY(precedence) { precedence, precedence, false, false }
 
 pm_binding_powers_t pm_binding_powers[PM_TOKEN_MAXIMUM] = {
+    // rescue
+    [PM_TOKEN_KEYWORD_RESCUE_MODIFIER] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_MODIFIER_RESCUE),
+
     // if unless until while
     [PM_TOKEN_KEYWORD_IF_MODIFIER] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_MODIFIER),
     [PM_TOKEN_KEYWORD_UNLESS_MODIFIER] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_MODIFIER),
     [PM_TOKEN_KEYWORD_UNTIL_MODIFIER] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_MODIFIER),
     [PM_TOKEN_KEYWORD_WHILE_MODIFIER] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_MODIFIER),
 
-    // rescue
-    [PM_TOKEN_KEYWORD_RESCUE_MODIFIER] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_MODIFIER_RESCUE),
-
     // and or
     [PM_TOKEN_KEYWORD_AND] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_COMPOSITION),
     [PM_TOKEN_KEYWORD_OR] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_COMPOSITION),
@@ -14177,7 +14232,7 @@ parse_strings(pm_parser_t *parser, pm_node_t *current) {
  * Parse an expression that begins with the previous node that we just lexed.
  */
 static inline pm_node_t *
-parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, bool accepts_command_call) {
+parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, bool accepts_command_call, pm_diagnostic_id_t diag_id) {
     switch (parser->current.type) {
         case PM_TOKEN_BRACKET_LEFT_ARRAY: {
             parser_lex(parser);
@@ -14595,30 +14650,30 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
 
             if ((binding_power == PM_BINDING_POWER_STATEMENT) && match1(parser, PM_TOKEN_COMMA)) {
                 node = parse_targets_validate(parser, node, PM_BINDING_POWER_INDEX);
-            }
-            else {
+            } else {
                 // Check if `it` is not going to be assigned.
                 switch (parser->current.type) {
-                  case PM_TOKEN_AMPERSAND_AMPERSAND_EQUAL:
-                  case PM_TOKEN_AMPERSAND_EQUAL:
-                  case PM_TOKEN_CARET_EQUAL:
-                  case PM_TOKEN_EQUAL:
-                  case PM_TOKEN_GREATER_GREATER_EQUAL:
-                  case PM_TOKEN_LESS_LESS_EQUAL:
-                  case PM_TOKEN_MINUS_EQUAL:
-                  case PM_TOKEN_PARENTHESIS_RIGHT:
-                  case PM_TOKEN_PERCENT_EQUAL:
-                  case PM_TOKEN_PIPE_EQUAL:
-                  case PM_TOKEN_PIPE_PIPE_EQUAL:
-                  case PM_TOKEN_PLUS_EQUAL:
-                  case PM_TOKEN_SLASH_EQUAL:
-                  case PM_TOKEN_STAR_EQUAL:
-                  case PM_TOKEN_STAR_STAR_EQUAL:
-                    break;
-                  default:
-                    // Once we know it's neither a method call nor an assignment,
-                    // we can finally create `it` default parameter.
-                    node = pm_node_check_it(parser, node);
+                    case PM_TOKEN_AMPERSAND_AMPERSAND_EQUAL:
+                    case PM_TOKEN_AMPERSAND_EQUAL:
+                    case PM_TOKEN_CARET_EQUAL:
+                    case PM_TOKEN_EQUAL:
+                    case PM_TOKEN_GREATER_GREATER_EQUAL:
+                    case PM_TOKEN_LESS_LESS_EQUAL:
+                    case PM_TOKEN_MINUS_EQUAL:
+                    case PM_TOKEN_PARENTHESIS_RIGHT:
+                    case PM_TOKEN_PERCENT_EQUAL:
+                    case PM_TOKEN_PIPE_EQUAL:
+                    case PM_TOKEN_PIPE_PIPE_EQUAL:
+                    case PM_TOKEN_PLUS_EQUAL:
+                    case PM_TOKEN_SLASH_EQUAL:
+                    case PM_TOKEN_STAR_EQUAL:
+                    case PM_TOKEN_STAR_STAR_EQUAL:
+                        break;
+                    default:
+                        // Once we know it's neither a method call nor an
+                        // assignment, we can finally create the `it` default
+                        // parameter.
+                        node = pm_node_check_it(parser, node);
                 }
             }
 
@@ -14656,6 +14711,9 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
                 // If we get here, then we tried to find something in the
                 // heredoc but couldn't actually parse anything, so we'll just
                 // return a missing node.
+                //
+                // parse_string_part handles its own errors, so there is no need
+                // for us to add one here.
                 node = (pm_node_t *) pm_missing_node_create(parser, parser->previous.start, parser->previous.end);
             } else if (PM_NODE_TYPE_P(part, PM_STRING_NODE) && match2(parser, PM_TOKEN_HEREDOC_END, PM_TOKEN_EOF)) {
                 // If we get here, then the part that we parsed was plain string
@@ -16301,6 +16359,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
             // context of a multiple assignment. We enforce that here. We'll
             // still lex past it though and create a missing node place.
             if (binding_power != PM_BINDING_POWER_STATEMENT) {
+                pm_parser_err_previous(parser, diag_id);
                 return (pm_node_t *) pm_missing_node_create(parser, parser->previous.start, parser->previous.end);
             }
 
@@ -16487,12 +16546,34 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
 
             return parse_symbol(parser, &lex_mode, PM_LEX_STATE_END);
         }
-        default:
-            if (context_recoverable(parser, &parser->current)) {
+        default: {
+            pm_context_t recoverable = context_recoverable(parser, &parser->current);
+
+            if (recoverable != PM_CONTEXT_NONE) {
                 parser->recovering = true;
+
+                // If the given error is not the generic one, then we'll add it
+                // here because it will provide more context in addition to the
+                // recoverable error that we will also add.
+                if (diag_id != PM_ERR_CANNOT_PARSE_EXPRESSION) {
+                    pm_parser_err_previous(parser, diag_id);
+                }
+
+                // If we get here, then we are assuming this token is closing a
+                // parent context, so we'll indicate that to the user so that
+                // they know how we behaved.
+                PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_UNEXPECTED_TOKEN_CLOSE_CONTEXT, pm_token_type_human(parser->current.type), context_human(recoverable));
+            } else if (diag_id == PM_ERR_CANNOT_PARSE_EXPRESSION) {
+                // We're going to make a special case here, because "cannot
+                // parse expression" is pretty generic, and we know here that we
+                // have an unexpected token.
+                PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_UNEXPECTED_TOKEN_IGNORE, pm_token_type_human(parser->current.type));
+            } else {
+                pm_parser_err_previous(parser, diag_id);
             }
 
             return (pm_node_t *) pm_missing_node_create(parser, parser->previous.start, parser->previous.end);
+        }
     }
 }
 
@@ -17455,15 +17536,12 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
  */
 static pm_node_t *
 parse_expression(pm_parser_t *parser, pm_binding_power_t binding_power, bool accepts_command_call, pm_diagnostic_id_t diag_id) {
-    pm_token_t recovery = parser->previous;
-    pm_node_t *node = parse_expression_prefix(parser, binding_power, accepts_command_call);
+    pm_node_t *node = parse_expression_prefix(parser, binding_power, accepts_command_call, diag_id);
 
     switch (PM_NODE_TYPE(node)) {
         case PM_MISSING_NODE:
             // If we found a syntax error, then the type of node returned by
-            // parse_expression_prefix is going to be a missing node. In that
-            // case we need to add the error message to the parser's error list.
-            pm_parser_err(parser, recovery.end, recovery.end, diag_id);
+            // parse_expression_prefix is going to be a missing node.
             return node;
         case PM_PRE_EXECUTION_NODE:
         case PM_POST_EXECUTION_NODE:
@@ -17472,7 +17550,7 @@ parse_expression(pm_parser_t *parser, pm_binding_power_t binding_power, bool acc
         case PM_UNDEF_NODE:
             // These expressions are statements, and cannot be followed by
             // operators (except modifiers).
-            if (pm_binding_powers[parser->current.type].left > PM_BINDING_POWER_MODIFIER_RESCUE) {
+            if (pm_binding_powers[parser->current.type].left > PM_BINDING_POWER_MODIFIER) {
                 return node;
             }
             break;
diff --git a/src/main/c/yarp/src/token_type.c b/src/main/c/yarp/src/token_type.c