From 9cef5584458b9beff5d07ea73101776aca5f388d Mon Sep 17 00:00:00 2001 From: engboris Date: Sun, 5 Oct 2025 19:21:32 +0200 Subject: [PATCH 1/3] Use incremental parsing and add error recovery --- ERROR_RECOVERY.md | 174 ++++++++++++++++++++++++ INCREMENTAL_PARSER.md | 73 +++++++++++ bin/dune | 6 +- docs/error_recovery.md | 192 +++++++++++++++++++++++++++ docs/incremental_parsing.md | 225 ++++++++++++++++++++++++++++++++ examples/error_recovery_demo.md | 109 ++++++++++++++++ src/parse_error.ml | 138 ++++++++++++++++++++ src/sgen_parsing.ml | 151 ++++++++++++++++++--- 8 files changed, 1044 insertions(+), 24 deletions(-) create mode 100644 ERROR_RECOVERY.md create mode 100644 INCREMENTAL_PARSER.md create mode 100644 docs/error_recovery.md create mode 100644 docs/incremental_parsing.md create mode 100644 examples/error_recovery_demo.md create mode 100644 src/parse_error.ml diff --git a/ERROR_RECOVERY.md b/ERROR_RECOVERY.md new file mode 100644 index 0000000..d23eec7 --- /dev/null +++ b/ERROR_RECOVERY.md @@ -0,0 +1,174 @@ +# Error Recovery Implementation - Summary + +## Overview + +Stellogen now features **comprehensive error recovery** powered by Menhir's incremental parsing API. This significantly improves the developer experience by collecting and reporting multiple parse errors in a single pass. + +## Key Features + +### ✅ Multiple Error Collection + +- Collects up to 20 errors per file (configurable) +- No more fix-compile-fix cycles +- See all problems at once + +### ✅ Context-Aware Error Messages + +``` +error: no opening delimiter for ')' + --> test.sg:2:12 + + 2 | (:= bad1 x)) + | ^ + + hint: remove this delimiter or add a matching opening delimiter +``` + +Each error includes: +- Exact position from parser state +- Clear message +- Source context with visual pointer +- Helpful hint (when applicable) + +### ✅ Smart Recovery Strategies + +The parser attempts to continue after errors using context-aware strategies: + +- **Extra closing delimiter** → Skip and continue +- **Unexpected token** → Skip to next expression start +- **Nested errors** → Skip to matching delimiter level +- **EOF with unclosed delimiter** → Abort (cannot recover) + +### ✅ Leverages Parser State + +Uses `Parser.MenhirInterpreter.positions env` for accurate error locations instead of relying on global mutable state. 
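A minimal sketch of what this looks like at the `HandlingError` checkpoint (the real driver lives in `src/sgen_parsing.ml`; the helper name `error_span` is illustrative):

```ocaml
(* Sketch: pulling the error location out of the parser environment.
   positions env returns the start/end positions of the offending token. *)
let error_span (checkpoint : _ Parser.MenhirInterpreter.checkpoint) =
  match checkpoint with
  | Parser.MenhirInterpreter.HandlingError env ->
    Some (Parser.MenhirInterpreter.positions env)
  | _ -> None
```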
+ +## Files Added/Modified + +### New Files +- **`src/parse_error.ml`** - Error collection, recovery strategies, and contextualization +- **`docs/error_recovery.md`** - Comprehensive documentation +- **`examples/error_recovery_demo.md`** - Usage examples + +### Modified Files +- **`src/sgen_parsing.ml`** - Integrated error recovery into incremental parser +- **`docs/incremental_parsing.md`** - Updated to document error recovery + +## Example Usage + +```bash +# File with multiple errors +$ cat test.sg +(:= good1 42) +(:= bad1 x)) +(:= good2 100) + +# See all errors at once +$ sgen run test.sg +error: no opening delimiter for ')' + --> test.sg:2:12 + + 2 | (:= bad1 x)) + | ^ + + hint: remove this delimiter or add a matching opening delimiter + +error: unexpected symbol ':=' + --> test.sg:3:2 + + 3 | (:= good2 100) + | ^ + + hint: check if this symbol is in the right place + +found 2 error(s) +``` + +## Benefits for Maintainers + +### Improved Developer Experience +- See all syntax errors in one pass +- Helpful hints guide toward fixes +- Visual context makes errors easy to locate + +### Better Error Quality +- Accurate positions from parser state +- Context-aware messages +- Reduced reliance on global state + +### Maintainable Implementation +- Clean separation: `parse_error.ml` handles error logic +- Recovery strategies are clearly defined +- Easy to extend with new recovery heuristics + +### Foundation for Future Features +- REPL: Can recover from partial input +- IDE: Real-time error checking +- Batch processing: Continue despite errors + +## Known Limitations + +### Cascading Errors +Recovery attempts may generate secondary errors. This is a known challenge in error recovery systems. + +**Example**: +```stellogen +(:= x )) +' Primary: extra ) +' Cascade: parser sees := at top level after recovery +``` + +### EOF Recovery +Cannot recover past end-of-file with unclosed delimiters (by design). + +## Testing + +All existing tests pass: +```bash +dune test # ✓ All tests pass +``` + +Error recovery tested with: +- Single errors +- Multiple independent errors +- Unclosed delimiters +- Extra closing delimiters +- Mixed valid and invalid code + +## Implementation Quality + +### Code Organization +- **Modular**: Error logic separated from parsing logic +- **Type-safe**: Structured error types +- **Configurable**: Max errors, recovery strategies + +### Performance +- Minimal overhead for valid files +- Reasonable performance even with many errors +- Early abort on unrecoverable situations + +## Future Enhancements + +Potential improvements: +1. Reduce cascading errors with smarter recovery +2. Add error message customization (Menhir `.messages` files) +3. Implement warning suppression for known cascades +4. Generate fix suggestions programmatically +5. 
IDE integration for real-time checking + +## Documentation + +- **`docs/error_recovery.md`** - Full technical documentation +- **`examples/error_recovery_demo.md`** - Usage examples and demonstrations +- **`docs/incremental_parsing.md`** - Incremental parser overview + +## Conclusion + +The error recovery implementation fully leverages Menhir's incremental parsing API to provide: + +✅ **Better maintainer experience** through comprehensive error reporting +✅ **Maintainable code** with clean separation of concerns +✅ **Foundation for growth** (REPL, IDE features) +✅ **Production ready** - all tests pass, valid code unaffected + +The parser now takes **full advantage of incremental parsing** for error handling, delivering significant improvements in developer experience and code quality. diff --git a/INCREMENTAL_PARSER.md b/INCREMENTAL_PARSER.md new file mode 100644 index 0000000..37ca3a0 --- /dev/null +++ b/INCREMENTAL_PARSER.md @@ -0,0 +1,73 @@ +# Incremental Parser Implementation + +This document provides a quick reference for the incremental parser implementation in Stellogen. + +## Overview + +**The Stellogen parser now uses Menhir's incremental API by default.** The traditional parser has been completely replaced with the incremental parser in `src/sgen_parsing.ml`. + +## Files Modified + +- **`src/sgen_parsing.ml`** - Main parser now uses incremental API (replaced traditional parser) +- **`docs/incremental_parsing.md`** - Comprehensive documentation + +## Quick Start + +The parser is used automatically by all Stellogen code: + +```ocaml +(* Standard usage - automatically uses incremental parser *) +let lexbuf = Sedlexing.Utf8.from_string "(:= x 42)" in +let exprs = Sgen_parsing.parse_with_error "" lexbuf +``` + +## Key Components + +### Checkpoint Type +The parser state is represented by `Parser.MenhirInterpreter.checkpoint`: +- `InputNeeded` - needs more input +- `Shifting` / `AboutToReduce` - internal states +- `Accepted result` - success +- `HandlingError` / `Rejected` - errors + +### API Functions +- `Parser.Incremental.expr_file` - create initial checkpoint +- `Parser.MenhirInterpreter.offer` - supply token +- `Parser.MenhirInterpreter.resume` - continue parsing + +## Configuration + +Already enabled in `src/dune`: +```lisp +(menhir + (modules parser) + (flags --table --dump --explain)) +``` + +The `--table` flag enables the incremental API. + +## Testing + +All existing tests now use the incremental parser: + +```bash +# Run all tests +dune test + +# Run specific example +dune exec sgen run -- examples/nat.sg +``` + +## Use Cases + +1. **REPL** - parse partial input interactively +2. **IDE features** - syntax highlighting, error recovery +3. **Incremental compilation** - reparse only changed sections +4. 
**Better error messages** - access to parser state + +## See Also + +- `docs/incremental_parsing.md` - Full documentation +- [Menhir Manual](https://gallium.inria.fr/~fpottier/menhir/manual.html) +- `src/sgen_parsing.ml` - Incremental parser implementation +- `src/parser.mly` - Parser grammar diff --git a/bin/dune b/bin/dune index 1d3a4fc..a4388db 100644 --- a/bin/dune +++ b/bin/dune @@ -1,6 +1,6 @@ -(executables - (public_names sgen) - (names sgen) +(executable + (public_name sgen) + (name sgen) (libraries stellogen base cmdliner)) (env diff --git a/docs/error_recovery.md b/docs/error_recovery.md new file mode 100644 index 0000000..4bcfa68 --- /dev/null +++ b/docs/error_recovery.md @@ -0,0 +1,192 @@ +# Error Recovery in Stellogen + +Stellogen's parser uses Menhir's incremental API to provide comprehensive error recovery, allowing it to collect and report multiple parse errors in a single pass. + +## Overview + +Instead of stopping at the first syntax error, the Stellogen parser: + +1. **Collects multiple errors** - Up to 20 errors per file (configurable) +2. **Provides context and hints** - Each error includes helpful suggestions +3. **Attempts recovery** - Tries to continue parsing after errors +4. **Reports all errors at once** - Shows all problems in one pass + +## Error Information + +Each parse error includes: + +- **Position**: Exact line and column from parser state +- **Message**: Clear description of what went wrong +- **Hint**: Suggested fix (when applicable) +- **Source context**: Shows the offending line with a caret pointing to the error + +## Error Recovery Strategies + +The parser uses different recovery strategies based on context: + +### 1. Extra Closing Delimiter + +```stellogen +(:= x 42)) + ^ +``` + +**Strategy**: Skip the extra delimiter and continue parsing + +**Recovery**: Immediately continues from next token + +### 2. Unclosed Delimiter at EOF + +```stellogen +(:= x (add 1 2 + ^ +``` + +**Strategy**: Abort (cannot recover past EOF) + +**Recovery**: Reports error and stops (cannot produce meaningful output) + +### 3. Unexpected Token + +```stellogen +(:= x @@) + ^ +``` + +**Strategy**: Skip until next opening parenthesis (start of new expression) + +**Recovery**: Attempts to find next top-level construct + +## Error Messages + +### Standard Format + +``` +error: + --> :: + + | + | + + hint: +``` + +### Example + +``` +error: no opening delimiter for ')' + --> test.sg:2:12 + + 2 | (:= bad1 x)) + | ^ + + hint: remove this delimiter or add a matching opening delimiter +``` + +## Limitations + +### Cascading Errors + +When the parser recovers from an error, it may generate additional "cascade" errors as it tries to make sense of the remaining input: + +```stellogen +(:= x )) +' Primary error: extra ) +' May also report: unexpected tokens afterward +``` + +This is a known challenge in error recovery. The parser reports all detected issues, some of which may be consequences of earlier errors. + +### EOF Recovery + +The parser cannot recover past end-of-file. 
If a delimiter is unclosed at EOF, recovery aborts: + +```stellogen +(:= x (incomplete +``` + +**Result**: Single error about unclosed delimiter, parsing stops + +## Implementation Details + +### Error Collection + +Located in `src/parse_error.ml`: + +```ocaml +type parse_error = { + position: Lexing.position; + end_position: Lexing.position option; + message: string; + hint: string option; + severity: [`Error | `Warning]; +} +``` + +### Recovery Actions + +```ocaml +type recovery_action = + | Skip of int (* Skip n tokens *) + | SkipUntil of token (* Skip until target token *) + | SkipToDelimiter (* Skip to matching nesting level *) + | Abort (* Cannot recover *) +``` + +### Parser Integration + +The incremental parser (`src/sgen_parsing.ml`) uses these recovery actions in the `HandlingError` checkpoint: + +1. Extract error info from parser `env` +2. Add error to collector +3. Determine recovery strategy +4. Execute recovery (skip tokens, restart parser) +5. Continue until EOF or max errors reached + +## Benefits + +✅ **Better developer experience** - See all errors at once instead of fix-compile-fix cycles + +✅ **Maintainability** - Leverage parser state for accurate error positions + +✅ **Helpful hints** - Context-aware suggestions for common mistakes + +✅ **Incremental parsing foundation** - Ready for REPL and IDE features + +## Configuration + +Maximum errors before giving up (default: 20): + +```ocaml +let error_collector = Parse_error.create_collector ~max_errors:20 () +``` + +## Testing Error Recovery + +```bash +# Create a file with multiple errors +cat > test_errors.sg << 'EOF' +(:= good1 42) +(:= bad1 x)) +(:= good2 100) +EOF + +# See all errors at once +dune exec sgen run -- test_errors.sg +``` + +## Future Enhancements + +Potential improvements: + +1. **Smarter recovery heuristics** - Reduce cascading errors +2. **Error message customization** - Using Menhir's `.messages` files +3. **Warning suppression** - Filter known cascade errors +4. **Recovery suggestions** - Propose concrete fixes +5. **IDE integration** - Real-time error checking + +## See Also + +- `src/parse_error.ml` - Error data structures and recovery logic +- `src/sgen_parsing.ml` - Parser with error recovery integration +- `docs/incremental_parsing.md` - Incremental parser documentation diff --git a/docs/incremental_parsing.md b/docs/incremental_parsing.md new file mode 100644 index 0000000..c6906e3 --- /dev/null +++ b/docs/incremental_parsing.md @@ -0,0 +1,225 @@ +# Incremental Parsing with Menhir in Stellogen + +This document explains the incremental parser in Stellogen, which is built on Menhir's incremental parsing capabilities. + +## Overview + +**As of the latest version, the main Stellogen parser uses Menhir's incremental API by default.** The traditional parser in `Sgen_parsing.parse_with_error` has been replaced with an incremental implementation. + +The incremental parser allows you to: +- Parse input step-by-step (token by token) +- Inspect parser state during parsing +- Handle errors more gracefully +- Implement features like syntax highlighting, error recovery, or incremental compilation + +All existing code continues to work without changes - the switch from traditional to incremental parsing is transparent to users of the parser. + +## Setup + +The parser is already configured to generate the incremental API. The key configuration in `src/dune` is: + +```lisp +(menhir + (modules parser) + (flags --table --dump --explain)) +``` + +The `--table` flag enables the incremental API. 
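Besides the manual `offer`/`resume` loop shown below, the `--table` backend also brings in MenhirLib's ready-made driver `loop_handle`, which is a quick way to confirm the incremental entry point works before writing a custom loop. A minimal sketch (the `Lexer.read` tokenizer and the `expr_file` entry point are the ones used elsewhere in this document; `parse_string` is illustrative):

```ocaml
(* Sketch: driving the incremental parser with MenhirLib's loop_handle.
   The first continuation receives the semantic value on success; the
   second receives the checkpoint in its error state. *)
let parse_string source =
  let lexbuf = Sedlexing.Utf8.from_string source in
  let supplier () =
    let token = Lexer.read lexbuf in
    let start_pos, end_pos = Sedlexing.lexing_positions lexbuf in
    (token, start_pos, end_pos)
  in
  Parser.MenhirInterpreter.loop_handle
    (fun result -> Ok result)
    (fun _error_checkpoint -> Error "syntax error")
    supplier
    (Parser.Incremental.expr_file Lexing.dummy_pos)
```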
+ +## API Overview + +### Checkpoint Type + +The core type is `Parser.MenhirInterpreter.checkpoint`, which represents the parser state. It has the following constructors: + +- **`InputNeeded env`**: Parser needs another token +- **`Shifting (env1, env2, flag)`**: Parser is performing a shift operation +- **`AboutToReduce (env, production)`**: Parser is about to reduce by a production rule +- **`HandlingError env`**: Parser encountered a syntax error +- **`Accepted result`**: Parsing succeeded with the given result +- **`Rejected`**: Parsing failed completely + +### Main Functions + +- **`Parser.Incremental.expr_file pos`**: Create initial checkpoint for parsing +- **`Parser.MenhirInterpreter.offer checkpoint (token, start_pos, end_pos)`**: Supply a token to the parser +- **`Parser.MenhirInterpreter.resume checkpoint`**: Continue parsing after a Shifting or AboutToReduce state +- **`Parser.MenhirInterpreter.top env`**: Get the top element from the parser stack (useful for error reporting) + +## Usage Examples + +### Example 1: Standard Usage + +The parser is used automatically throughout Stellogen: + +```ocaml +(* Parse a file *) +let ic = Stdlib.open_in "examples/nat.sg" in +let lexbuf = Sedlexing.Utf8.from_channel ic in +let exprs = Sgen_parsing.parse_with_error "examples/nat.sg" lexbuf in +Stdlib.close_in ic + +(* Parse a string *) +let lexbuf = Sedlexing.Utf8.from_string "(:= x 42)" in +let exprs = Sgen_parsing.parse_with_error "" lexbuf +``` + +### Example 2: Custom Parser Loop with Direct Checkpoint Access + +For maximum control, interact directly with the Menhir API: + +```ocaml +let parse_custom filename lexbuf = + Parser_context.current_filename := filename; + + (* Create token supplier *) + let lexer_supplier () = + let token = Lexer.read lexbuf in + let start_pos, end_pos = Sedlexing.lexing_positions lexbuf in + (token, start_pos, end_pos) + in + + (* Start parsing *) + let initial = Parser.Incremental.expr_file Lexing.dummy_pos in + + (* Drive the parser *) + let rec loop checkpoint = + match checkpoint with + | Parser.MenhirInterpreter.InputNeeded _env -> + let token, start_pos, end_pos = lexer_supplier () in + loop (Parser.MenhirInterpreter.offer checkpoint (token, start_pos, end_pos)) + + | Parser.MenhirInterpreter.Shifting _ + | Parser.MenhirInterpreter.AboutToReduce _ -> + loop (Parser.MenhirInterpreter.resume checkpoint) + + | Parser.MenhirInterpreter.Accepted result -> + result + + | Parser.MenhirInterpreter.HandlingError env -> + (* Extract position information for error reporting *) + let pos = match Parser.MenhirInterpreter.top env with + | Some (Parser.MenhirInterpreter.Element (_, _, start_pos, _)) -> start_pos + | None -> Lexing.dummy_pos + in + failwith (Printf.sprintf "Parse error at %s:%d" + pos.Lexing.pos_fname pos.Lexing.pos_lnum) + + | Parser.MenhirInterpreter.Rejected -> + failwith "Parse rejected" + in + loop initial +``` + +## Use Cases + +### 1. Interactive REPL + +Incremental parsing is perfect for REPLs where you want to: +- Parse partial input as the user types +- Provide immediate feedback on syntax errors +- Handle incomplete expressions gracefully + +### 2. Syntax Highlighting + +You can use the parser state to: +- Identify token types and their roles in the parse tree +- Highlight matching delimiters +- Show syntax errors in real-time + +### 3. 
Error Recovery + +**Stellogen implements comprehensive error recovery** using the incremental API: + +- **Collect multiple errors** in one pass (up to 20 by default) +- **Accurate error positions** via `Parser.MenhirInterpreter.positions env` +- **Context-aware hints** to help fix common mistakes +- **Smart recovery strategies** that skip tokens and attempt to continue parsing +- **Source context display** showing the exact location of errors + +See `docs/error_recovery.md` for full details on error recovery implementation and behavior. + +### 4. Incremental Compilation + +For large codebases: +- Parse changed sections only +- Cache parse results for unchanged code +- Speed up compilation by avoiding full re-parses + +## Architecture + +### Main Parser (in `sgen_parsing.ml`) + +The main parser `Sgen_parsing.parse_with_error` now uses the incremental API internally: + +```ocaml +let parse_with_error filename lexbuf = + (* Create token supplier *) + let lexer_supplier () = + let token = read lexbuf in + let start_pos, end_pos = Sedlexing.lexing_positions lexbuf in + (token, start_pos, end_pos) + in + + (* Start incremental parsing *) + let initial_checkpoint = Parser.Incremental.expr_file Lexing.dummy_pos in + + (* Drive the parser through all states *) + let rec drive checkpoint = ... in + drive initial_checkpoint +``` + +**Benefits of this approach:** +- Full control over parsing process +- Access to parser state at each step +- Better error recovery possibilities +- Enables advanced IDE features +- Transparent to existing code + +## Implementation Details + +The incremental parser is implemented in `src/sgen_parsing.ml`: + +- Main `parse_with_error` function uses Menhir's incremental API +- Handles all parsing for the entire system +- **Error handling leverages parser state**: Uses `Parser.MenhirInterpreter.positions env` to get accurate error positions instead of relying on global mutable state +- Transparent to users - drop-in replacement for the traditional parser + +### Error Handling Benefits + +The incremental parser improves error reporting by: +1. Using `Parser.MenhirInterpreter.positions env` to extract the exact position where parsing failed +2. Accessing the parser's internal state during `HandlingError` checkpoint +3. 
Providing more accurate error locations without relying on lexer globals + +## Performance Considerations + +- The incremental API has minimal overhead compared to the traditional parser +- For typical Stellogen files, the performance is virtually identical +- The `--table` backend used for incremental parsing is well-optimized +- Benefits (better error handling, state inspection) outweigh any minor overhead + +## Further Reading + +- [Menhir Manual](https://gallium.inria.fr/~fpottier/menhir/manual.html) - Official documentation +- [MenhirLib API](https://ocaml.org/p/menhirLib/latest/doc/MenhirLib/IncrementalEngine/module-type-INCREMENTAL_ENGINE/index.html) - API reference +- `src/parser.mly` - Parser grammar definition +- `src/sgen_parsing.ml` - Incremental parser implementation + +## Testing + +All Stellogen tests now use the incremental parser automatically: + +```bash +# Run all tests +dune test + +# Run examples (all use incremental parser) +dune exec sgen run -- examples/nat.sg + +# Or in utop: +#require "stellogen";; +open Stellogen;; +let lexbuf = Sedlexing.Utf8.from_string "(:= x 42)" in +Sgen_parsing.parse_with_error "" lexbuf;; +``` diff --git a/examples/error_recovery_demo.md b/examples/error_recovery_demo.md new file mode 100644 index 0000000..afec45f --- /dev/null +++ b/examples/error_recovery_demo.md @@ -0,0 +1,109 @@ +# Error Recovery Demonstration + +This document demonstrates Stellogen's error recovery capabilities. + +## Example 1: Single Error with Hint + +**Input** (`single_error.sg`): +```stellogen +(:= x 42)) +``` + +**Output**: +``` +error: no opening delimiter for ')' + --> single_error.sg:1:9 + + 1 | (:= x 42)) + | ^ + + hint: remove this delimiter or add a matching opening delimiter + +found 1 error(s) +``` + +## Example 2: Unclosed Delimiter + +**Input** (`unclosed.sg`): +```stellogen +(:= x (add 1 2 +``` + +**Output**: +``` +error: unclosed delimiter '(' + --> unclosed.sg:2:1 + + hint: add the missing closing delimiter + +found 1 error(s) +``` + +## Example 3: Multiple Independent Errors + +**Input** (`multiple_errors.sg`): +```stellogen +(:= good1 42) +(:= bad1 x)) +(:= good2 100) +``` + +**Output**: +``` +error: no opening delimiter for ')' + --> multiple_errors.sg:2:12 + + 2 | (:= bad1 x)) + | ^ + + hint: remove this delimiter or add a matching opening delimiter + +error: unexpected symbol ':=' + --> multiple_errors.sg:3:2 + + 3 | (:= good2 100) + | ^ + + hint: check if this symbol is in the right place + +found 2 error(s) +``` + +*Note: The second error is a cascade error caused by the parser's recovery attempt* + +## Example 4: Valid Code Still Parses + +**Input** (`valid.sg`): +```stellogen +(:= add { + [(+add 0 Y Y)] + [(-add X Y Z) (+add (s X) Y (s Z))]}) + +(:= query [(-add R) R]) +``` + +**Output**: +``` +(Successfully parses with no errors) +``` + +## Benefits Demonstrated + +1. **Multiple Errors at Once** - No need for fix-compile-fix cycles +2. **Helpful Hints** - Context-aware suggestions +3. **Accurate Positions** - Exact line/column from parser state +4. **Source Context** - Shows problematic code with visual pointer +5. 
**Error Count** - Summary at the end + +## Known Limitations + +- **Cascading Errors**: Recovery may generate follow-up errors +- **EOF Limits**: Cannot recover past end-of-file with unclosed delimiters +- **Context Dependent**: Some errors are harder to recover from than others + +## Implementation + +See: +- `docs/error_recovery.md` - Full documentation +- `src/parse_error.ml` - Error recovery logic +- `src/sgen_parsing.ml` - Parser integration diff --git a/src/parse_error.ml b/src/parse_error.ml new file mode 100644 index 0000000..e733449 --- /dev/null +++ b/src/parse_error.ml @@ -0,0 +1,138 @@ +(* Error recovery and reporting for the incremental parser *) + +open Base +open Lexing + +(* Structured parse error *) +type parse_error = { + position: Lexing.position; + end_position: Lexing.position option; + message: string; + hint: string option; + severity: [`Error | `Warning]; +} + +(* Error recovery strategy *) +type recovery_action = + | Skip of int (* Skip n tokens *) + | SkipUntil of Parser.token (* Skip until we see this token *) + | SkipToDelimiter (* Skip to next top-level delimiter *) + | Abort (* Cannot recover *) + +(* Error collection *) +type error_collector = { + mutable errors: parse_error list; + max_errors: int; +} + +let create_collector ?(max_errors=10) () = + { errors = []; max_errors } + +let add_error collector error = + if List.length collector.errors < collector.max_errors then + collector.errors <- error :: collector.errors + +let has_errors collector = + not (List.is_empty collector.errors) + +let get_errors collector = + List.rev collector.errors + +(* Format error position *) +let format_position pos = + let column = pos.pos_cnum - pos.pos_bol + 1 in + Printf.sprintf "%s:%d:%d" pos.pos_fname pos.pos_lnum column + +(* Create a parse error from parser state *) +let create_error ~position ?end_position ~message ?hint ?(severity=`Error) () = + { position; end_position; message; hint; severity } + +(* Determine recovery action based on error context *) +let recovery_strategy last_token delimiters_depth = + match last_token with + | Some Parser.EOF -> + (* At EOF, can't recover by skipping *) + Abort + | Some (Parser.RPAR | Parser.RBRACK | Parser.RBRACE | Parser.RANGLE) -> + (* Extra closing delimiter - skip it and try to continue *) + Skip 1 + | Some Parser.LPAR -> + (* Opening paren error - skip until we balance or find next top-level *) + SkipUntil Parser.LPAR + | Some _ when delimiters_depth > 0 -> + (* Inside delimiters - skip to closing of current level *) + SkipToDelimiter + | Some _ -> + (* Top level error - skip until we see opening paren (start of new expr) *) + SkipUntil Parser.LPAR + | None -> + Abort + +(* Check if token is a delimiter *) +let is_delimiter = function + | Parser.LPAR | Parser.RPAR + | Parser.LBRACK | Parser.RBRACK + | Parser.LBRACE | Parser.RBRACE + | Parser.LANGLE | Parser.RANGLE -> true + | _ -> false + +(* Check if token could start a new top-level expression *) +let is_top_level_start = function + | Parser.LPAR -> true (* Most top-level expressions start with ( *) + | _ -> false + +(* Convert token to string for error messages *) +let string_of_token = function + | Parser.VAR s | Parser.SYM s | Parser.STRING s -> s + | Parser.AT -> "@" + | Parser.BAR -> "|" + | Parser.LPAR -> "(" + | Parser.RPAR -> ")" + | Parser.LBRACK -> "[" + | Parser.RBRACK -> "]" + | Parser.LBRACE -> "{" + | Parser.RBRACE -> "}" + | Parser.LANGLE -> "<" + | Parser.RANGLE -> ">" + | Parser.SHARP -> "#" + | Parser.EOF -> "EOF" + +(* Generate helpful error message 
based on context *) +let contextualize_error last_token delimiters_stack = + match last_token with + | Some Parser.EOF when not (List.is_empty delimiters_stack) -> + let delim_char, _ = List.hd_exn delimiters_stack in + ( Printf.sprintf "unclosed delimiter '%c'" delim_char, + Some "add the missing closing delimiter" ) + | Some Parser.EOF -> + ( "unexpected end of file", + Some "the input is incomplete" ) + | Some (Parser.RPAR | Parser.RBRACK | Parser.RBRACE | Parser.RANGLE as tok) -> + let tok_str = match tok with + | Parser.RPAR -> ")" + | Parser.RBRACK -> "]" + | Parser.RBRACE -> "}" + | Parser.RANGLE -> ">" + | _ -> "?" + in + ( Printf.sprintf "no opening delimiter for '%s'" tok_str, + Some "remove this delimiter or add a matching opening delimiter" ) + | Some tok -> + let tok_str = string_of_token tok in + ( Printf.sprintf "unexpected symbol '%s'" tok_str, + Some "check if this symbol is in the right place" ) + | None -> + ( "unexpected end of input", + None ) + +(* Extract error information from parser environment *) +let error_from_env env last_token delimiters_stack = + let error_pos, end_pos = Parser.MenhirInterpreter.positions env in + let message, hint = contextualize_error last_token delimiters_stack in + + create_error + ~position:error_pos + ~end_position:end_pos + ~message + ?hint + () diff --git a/src/sgen_parsing.ml b/src/sgen_parsing.ml index 7069c84..39fdf86 100644 --- a/src/sgen_parsing.ml +++ b/src/sgen_parsing.ml @@ -85,27 +85,136 @@ let handle_lexer_error msg pos filename = print_syntax_error pos msg filename; Stdlib.exit 1 -let parse_with_error filename lexbuf = +(* Parse with error recovery - collects multiple errors *) +let parse_with_error_recovery filename lexbuf = Parser_context.current_filename := filename; - let lexer = Sedlexing.with_tokenizer read lexbuf in - let parser = - MenhirLib.Convert.Simplified.traditional2revised Parser.expr_file + + (* Error collector *) + let error_collector = Parse_error.create_collector ~max_errors:20 () in + + (* Token buffer for recovery *) + let token_buffer = ref [] in + let lex_next () = + match !token_buffer with + | tok :: rest -> + token_buffer := rest; + tok + | [] -> + let token = read lexbuf in + let start_pos, end_pos = Sedlexing.lexing_positions lexbuf in + (token, start_pos, end_pos) + in + + (* Start incremental parsing *) + let initial_checkpoint = Parser.Incremental.expr_file Lexing.dummy_pos in + + (* Attempt error recovery by skipping tokens *) + let rec attempt_recovery checkpoint skip_count = + if skip_count <= 0 then + checkpoint + else + let token, _, _ = lex_next () in + match token with + | EOF -> checkpoint (* Don't skip EOF *) + | _ -> attempt_recovery checkpoint (skip_count - 1) in - try parser lexer with - | Parser.Error -> ( - match !last_token with - | Some EOF -> ( - match !delimiters_stack with - | [] -> - let header = - bold (red "error") ^ ": " ^ bold "unexpected end of file" + + (* Drive the incremental parser with error recovery *) + let rec drive checkpoint = + match checkpoint with + | Parser.MenhirInterpreter.InputNeeded _env -> + let token, start_pos, end_pos = lex_next () in + let checkpoint = Parser.MenhirInterpreter.offer checkpoint (token, start_pos, end_pos) in + drive checkpoint + + | Parser.MenhirInterpreter.Shifting _ + | Parser.MenhirInterpreter.AboutToReduce _ -> + let checkpoint = Parser.MenhirInterpreter.resume checkpoint in + drive checkpoint + + | Parser.MenhirInterpreter.HandlingError env -> + (* Collect the error *) + let error = Parse_error.error_from_env env !last_token 
!delimiters_stack in + Parse_error.add_error error_collector error; + + (* Determine recovery strategy *) + let recovery = Parse_error.recovery_strategy !last_token (List.length !delimiters_stack) in + + (match recovery with + | Parse_error.Abort -> + (* Cannot recover - return empty list and report errors *) + [] + + | Parse_error.Skip n -> + (* Skip n tokens and restart from initial state *) + let _ = attempt_recovery checkpoint n in + let new_checkpoint = Parser.Incremental.expr_file Lexing.dummy_pos in + drive new_checkpoint + + | Parse_error.SkipToDelimiter -> + (* Skip until we find a delimiter at current nesting level *) + let target_depth = List.length !delimiters_stack in + let rec skip_to_matching () = + let token, _, _ = lex_next () in + match token with + | EOF -> () + | _ when List.length !delimiters_stack = target_depth -> () + | _ -> skip_to_matching () + in + skip_to_matching (); + let new_checkpoint = Parser.Incremental.expr_file Lexing.dummy_pos in + drive new_checkpoint + + | Parse_error.SkipUntil target_token -> + (* Skip until we see target token *) + let rec skip_until () = + let token, _, _ = lex_next () in + if not (Poly.equal token target_token) && not (Poly.equal token EOF) then + skip_until () + in + skip_until (); + let new_checkpoint = Parser.Incremental.expr_file Lexing.dummy_pos in + drive new_checkpoint) + + | Parser.MenhirInterpreter.Accepted result -> + result + + | Parser.MenhirInterpreter.Rejected -> + let error = Parse_error.create_error + ~position:Lexing.dummy_pos + ~message:"parse rejected" + () in - Stdlib.Printf.eprintf "%s\n %s %s\n\n" header (cyan "-->") - (cyan filename); - Stdlib.exit 1 - | (delimiter_char, pos) :: _ -> - handle_unclosed_delimiter delimiter_char pos filename ) - | _ -> - let start_pos, _ = Sedlexing.lexing_positions lexbuf in - handle_unexpected_token start_pos filename ) - | LexerError (msg, pos) -> handle_lexer_error msg pos filename + Parse_error.add_error error_collector error; + [] + in + + let result = + try drive initial_checkpoint with + | LexerError (msg, pos) -> + let error = Parse_error.create_error ~position:pos ~message:msg () in + Parse_error.add_error error_collector error; + [] + in + + (* Report all collected errors *) + if Parse_error.has_errors error_collector then begin + let errors = Parse_error.get_errors error_collector in + List.iter errors ~f:(fun error -> + let hint_msg = match error.hint with + | Some h -> "\n " ^ yellow "hint" ^ ": " ^ h + | None -> "" + in + print_syntax_error error.position error.message filename; + if Option.is_some error.hint then + Stdlib.Printf.eprintf "%s\n" hint_msg + ); + Stdlib.Printf.eprintf "\n%s\n" (bold (red (Printf.sprintf "found %d error(s)" (List.length errors)))); + Stdlib.exit 1 + end; + + result + +(* Original parse function for backward compatibility - now uses error recovery *) +let parse_with_error filename lexbuf = + parse_with_error_recovery filename lexbuf From 115de8c5756f7d4a171f7bcbf2af566625a47558 Mon Sep 17 00:00:00 2001 From: engboris Date: Sun, 5 Oct 2025 20:13:35 +0200 Subject: [PATCH 2/3] Format --- src/parse_error.ml | 123 ++++++++++++++++------------------ src/sgen_parsing.ml | 156 +++++++++++++++++++++----------------------- 2 files changed, 132 insertions(+), 147 deletions(-) diff --git a/src/parse_error.ml b/src/parse_error.ml index e733449..9c9034a 100644 --- a/src/parse_error.ml +++ b/src/parse_error.ml @@ -4,39 +4,36 @@ open Base open Lexing (* Structured parse error *) -type parse_error = { - position: Lexing.position; - end_position: 
Lexing.position option; - message: string; - hint: string option; - severity: [`Error | `Warning]; -} +type parse_error = + { position : Lexing.position + ; end_position : Lexing.position option + ; message : string + ; hint : string option + ; severity : [ `Error | `Warning ] + } (* Error recovery strategy *) type recovery_action = - | Skip of int (* Skip n tokens *) - | SkipUntil of Parser.token (* Skip until we see this token *) - | SkipToDelimiter (* Skip to next top-level delimiter *) - | Abort (* Cannot recover *) + | Skip of int (* Skip n tokens *) + | SkipUntil of Parser.token (* Skip until we see this token *) + | SkipToDelimiter (* Skip to next top-level delimiter *) + | Abort (* Cannot recover *) (* Error collection *) -type error_collector = { - mutable errors: parse_error list; - max_errors: int; -} +type error_collector = + { mutable errors : parse_error list + ; max_errors : int + } -let create_collector ?(max_errors=10) () = - { errors = []; max_errors } +let create_collector ?(max_errors = 10) () = { errors = []; max_errors } let add_error collector error = if List.length collector.errors < collector.max_errors then collector.errors <- error :: collector.errors -let has_errors collector = - not (List.is_empty collector.errors) +let has_errors collector = not (List.is_empty collector.errors) -let get_errors collector = - List.rev collector.errors +let get_errors collector = List.rev collector.errors (* Format error position *) let format_position pos = @@ -44,41 +41,40 @@ let format_position pos = Printf.sprintf "%s:%d:%d" pos.pos_fname pos.pos_lnum column (* Create a parse error from parser state *) -let create_error ~position ?end_position ~message ?hint ?(severity=`Error) () = +let create_error ~position ?end_position ~message ?hint ?(severity = `Error) () + = { position; end_position; message; hint; severity } (* Determine recovery action based on error context *) let recovery_strategy last_token delimiters_depth = match last_token with | Some Parser.EOF -> - (* At EOF, can't recover by skipping *) - Abort + (* At EOF, can't recover by skipping *) + Abort | Some (Parser.RPAR | Parser.RBRACK | Parser.RBRACE | Parser.RANGLE) -> - (* Extra closing delimiter - skip it and try to continue *) - Skip 1 + (* Extra closing delimiter - skip it and try to continue *) + Skip 1 | Some Parser.LPAR -> - (* Opening paren error - skip until we balance or find next top-level *) - SkipUntil Parser.LPAR + (* Opening paren error - skip until we balance or find next top-level *) + SkipUntil Parser.LPAR | Some _ when delimiters_depth > 0 -> - (* Inside delimiters - skip to closing of current level *) - SkipToDelimiter + (* Inside delimiters - skip to closing of current level *) + SkipToDelimiter | Some _ -> - (* Top level error - skip until we see opening paren (start of new expr) *) - SkipUntil Parser.LPAR - | None -> - Abort + (* Top level error - skip until we see opening paren (start of new expr) *) + SkipUntil Parser.LPAR + | None -> Abort (* Check if token is a delimiter *) let is_delimiter = function - | Parser.LPAR | Parser.RPAR - | Parser.LBRACK | Parser.RBRACK - | Parser.LBRACE | Parser.RBRACE - | Parser.LANGLE | Parser.RANGLE -> true + | Parser.LPAR | Parser.RPAR | Parser.LBRACK | Parser.RBRACK | Parser.LBRACE + | Parser.RBRACE | Parser.LANGLE | Parser.RANGLE -> + true | _ -> false (* Check if token could start a new top-level expression *) let is_top_level_start = function - | Parser.LPAR -> true (* Most top-level expressions start with ( *) + | Parser.LPAR -> true (* Most 
top-level expressions start with ( *) | _ -> false (* Convert token to string for error messages *) @@ -101,38 +97,31 @@ let string_of_token = function let contextualize_error last_token delimiters_stack = match last_token with | Some Parser.EOF when not (List.is_empty delimiters_stack) -> - let delim_char, _ = List.hd_exn delimiters_stack in - ( Printf.sprintf "unclosed delimiter '%c'" delim_char, - Some "add the missing closing delimiter" ) - | Some Parser.EOF -> - ( "unexpected end of file", - Some "the input is incomplete" ) - | Some (Parser.RPAR | Parser.RBRACK | Parser.RBRACE | Parser.RANGLE as tok) -> - let tok_str = match tok with - | Parser.RPAR -> ")" - | Parser.RBRACK -> "]" - | Parser.RBRACE -> "}" - | Parser.RANGLE -> ">" - | _ -> "?" - in - ( Printf.sprintf "no opening delimiter for '%s'" tok_str, - Some "remove this delimiter or add a matching opening delimiter" ) + let delim_char, _ = List.hd_exn delimiters_stack in + ( Printf.sprintf "unclosed delimiter '%c'" delim_char + , Some "add the missing closing delimiter" ) + | Some Parser.EOF -> ("unexpected end of file", Some "the input is incomplete") + | Some ((Parser.RPAR | Parser.RBRACK | Parser.RBRACE | Parser.RANGLE) as tok) + -> + let tok_str = + match tok with + | Parser.RPAR -> ")" + | Parser.RBRACK -> "]" + | Parser.RBRACE -> "}" + | Parser.RANGLE -> ">" + | _ -> "?" + in + ( Printf.sprintf "no opening delimiter for '%s'" tok_str + , Some "remove this delimiter or add a matching opening delimiter" ) | Some tok -> - let tok_str = string_of_token tok in - ( Printf.sprintf "unexpected symbol '%s'" tok_str, - Some "check if this symbol is in the right place" ) - | None -> - ( "unexpected end of input", - None ) + let tok_str = string_of_token tok in + ( Printf.sprintf "unexpected symbol '%s'" tok_str + , Some "check if this symbol is in the right place" ) + | None -> ("unexpected end of input", None) (* Extract error information from parser environment *) let error_from_env env last_token delimiters_stack = let error_pos, end_pos = Parser.MenhirInterpreter.positions env in let message, hint = contextualize_error last_token delimiters_stack in - create_error - ~position:error_pos - ~end_position:end_pos - ~message - ?hint - () + create_error ~position:error_pos ~end_position:end_pos ~message ?hint () diff --git a/src/sgen_parsing.ml b/src/sgen_parsing.ml index 39fdf86..8b707c5 100644 --- a/src/sgen_parsing.ml +++ b/src/sgen_parsing.ml @@ -97,12 +97,12 @@ let parse_with_error_recovery filename lexbuf = let lex_next () = match !token_buffer with | tok :: rest -> - token_buffer := rest; - tok + token_buffer := rest; + tok | [] -> - let token = read lexbuf in - let start_pos, end_pos = Sedlexing.lexing_positions lexbuf in - (token, start_pos, end_pos) + let token = read lexbuf in + let start_pos, end_pos = Sedlexing.lexing_positions lexbuf in + (token, start_pos, end_pos) in (* Start incremental parsing *) @@ -110,12 +110,11 @@ let parse_with_error_recovery filename lexbuf = (* Attempt error recovery by skipping tokens *) let rec attempt_recovery checkpoint skip_count = - if skip_count <= 0 then - checkpoint + if skip_count <= 0 then checkpoint else let token, _, _ = lex_next () in match token with - | EOF -> checkpoint (* Don't skip EOF *) + | EOF -> checkpoint (* Don't skip EOF *) | _ -> attempt_recovery checkpoint (skip_count - 1) in @@ -123,98 +122,95 @@ let parse_with_error_recovery filename lexbuf = let rec drive checkpoint = match checkpoint with | Parser.MenhirInterpreter.InputNeeded _env -> - let token, start_pos, 
end_pos = lex_next () in - let checkpoint = Parser.MenhirInterpreter.offer checkpoint (token, start_pos, end_pos) in - drive checkpoint - + let token, start_pos, end_pos = lex_next () in + let checkpoint = + Parser.MenhirInterpreter.offer checkpoint (token, start_pos, end_pos) + in + drive checkpoint | Parser.MenhirInterpreter.Shifting _ | Parser.MenhirInterpreter.AboutToReduce _ -> - let checkpoint = Parser.MenhirInterpreter.resume checkpoint in - drive checkpoint - - | Parser.MenhirInterpreter.HandlingError env -> - (* Collect the error *) - let error = Parse_error.error_from_env env !last_token !delimiters_stack in - Parse_error.add_error error_collector error; - - (* Determine recovery strategy *) - let recovery = Parse_error.recovery_strategy !last_token (List.length !delimiters_stack) in - - (match recovery with - | Parse_error.Abort -> - (* Cannot recover - return empty list and report errors *) - [] - - | Parse_error.Skip n -> - (* Skip n tokens and restart from initial state *) - let _ = attempt_recovery checkpoint n in - let new_checkpoint = Parser.Incremental.expr_file Lexing.dummy_pos in - drive new_checkpoint - - | Parse_error.SkipToDelimiter -> - (* Skip until we find a delimiter at current nesting level *) - let target_depth = List.length !delimiters_stack in - let rec skip_to_matching () = - let token, _, _ = lex_next () in - match token with - | EOF -> () - | _ when List.length !delimiters_stack = target_depth -> () - | _ -> skip_to_matching () - in - skip_to_matching (); - let new_checkpoint = Parser.Incremental.expr_file Lexing.dummy_pos in - drive new_checkpoint - - | Parse_error.SkipUntil target_token -> - (* Skip until we see target token *) - let rec skip_until () = - let token, _, _ = lex_next () in - if not (Poly.equal token target_token) && not (Poly.equal token EOF) then - skip_until () - in - skip_until (); - let new_checkpoint = Parser.Incremental.expr_file Lexing.dummy_pos in - drive new_checkpoint) - - | Parser.MenhirInterpreter.Accepted result -> - result + let checkpoint = Parser.MenhirInterpreter.resume checkpoint in + drive checkpoint + | Parser.MenhirInterpreter.HandlingError env -> ( + (* Collect the error *) + let error = + Parse_error.error_from_env env !last_token !delimiters_stack + in + Parse_error.add_error error_collector error; - | Parser.MenhirInterpreter.Rejected -> - let error = Parse_error.create_error - ~position:Lexing.dummy_pos - ~message:"parse rejected" - () - in - Parse_error.add_error error_collector error; + (* Determine recovery strategy *) + let recovery = + Parse_error.recovery_strategy !last_token + (List.length !delimiters_stack) + in + + match recovery with + | Parse_error.Abort -> + (* Cannot recover - return empty list and report errors *) [] + | Parse_error.Skip n -> + (* Skip n tokens and restart from initial state *) + let _ = attempt_recovery checkpoint n in + let new_checkpoint = Parser.Incremental.expr_file Lexing.dummy_pos in + drive new_checkpoint + | Parse_error.SkipToDelimiter -> + (* Skip until we find a delimiter at current nesting level *) + let target_depth = List.length !delimiters_stack in + let rec skip_to_matching () = + let token, _, _ = lex_next () in + match token with + | EOF -> () + | _ when List.length !delimiters_stack = target_depth -> () + | _ -> skip_to_matching () + in + skip_to_matching (); + let new_checkpoint = Parser.Incremental.expr_file Lexing.dummy_pos in + drive new_checkpoint + | Parse_error.SkipUntil target_token -> + (* Skip until we see target token *) + let rec skip_until () = + let 
token, _, _ = lex_next () in + if (not (Poly.equal token target_token)) && not (Poly.equal token EOF) + then skip_until () + in + skip_until (); + let new_checkpoint = Parser.Incremental.expr_file Lexing.dummy_pos in + drive new_checkpoint ) + | Parser.MenhirInterpreter.Accepted result -> result + | Parser.MenhirInterpreter.Rejected -> + let error = + Parse_error.create_error ~position:Lexing.dummy_pos + ~message:"parse rejected" () + in + Parse_error.add_error error_collector error; + [] in let result = - try drive initial_checkpoint with - | LexerError (msg, pos) -> - let error = Parse_error.create_error ~position:pos ~message:msg () in - Parse_error.add_error error_collector error; - [] + try drive initial_checkpoint + with LexerError (msg, pos) -> + let error = Parse_error.create_error ~position:pos ~message:msg () in + Parse_error.add_error error_collector error; + [] in (* Report all collected errors *) if Parse_error.has_errors error_collector then begin let errors = Parse_error.get_errors error_collector in List.iter errors ~f:(fun error -> - let hint_msg = match error.hint with + let hint_msg = + match error.hint with | Some h -> "\n " ^ yellow "hint" ^ ": " ^ h | None -> "" in print_syntax_error error.position error.message filename; - if Option.is_some error.hint then - Stdlib.Printf.eprintf "%s\n" hint_msg - ); - Stdlib.Printf.eprintf "\n%s\n" (bold (red (Printf.sprintf "found %d error(s)" (List.length errors)))); + if Option.is_some error.hint then Stdlib.Printf.eprintf "%s\n" hint_msg ); + Stdlib.Printf.eprintf "\n%s\n" + (bold (red (Printf.sprintf "found %d error(s)" (List.length errors)))); Stdlib.exit 1 end; result (* Original parse function for backward compatibility - now uses error recovery *) -let parse_with_error filename lexbuf = - parse_with_error_recovery filename lexbuf +let parse_with_error filename lexbuf = parse_with_error_recovery filename lexbuf From 05f0b06afb998ae7d0ee9b649df3712462bf006c Mon Sep 17 00:00:00 2001 From: engboris Date: Sun, 5 Oct 2025 21:08:04 +0200 Subject: [PATCH 3/3] Move error recovery demo --- {examples => docs}/error_recovery_demo.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename {examples => docs}/error_recovery_demo.md (100%) diff --git a/examples/error_recovery_demo.md b/docs/error_recovery_demo.md similarity index 100% rename from examples/error_recovery_demo.md rename to docs/error_recovery_demo.md
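A minimal sketch of how the `Parse_error` helpers added in this patch behave on a stray closing parenthesis; `recovery_strategy` and `contextualize_error` are the functions from `src/parse_error.ml` above, while the printing scaffold is illustrative only:

```ocaml
(* Sketch: stray ')' at top level (delimiter depth 0, empty delimiter stack).
   recovery_strategy picks Skip 1; contextualize_error produces the message
   and hint shown in the error-recovery docs. *)
let () =
  let stray = Some Parser.RPAR in
  (match Parse_error.recovery_strategy stray 0 with
   | Parse_error.Skip 1 -> Stdlib.print_endline "recovery: skip the extra delimiter"
   | _ -> Stdlib.print_endline "recovery: unexpected strategy");
  let message, hint = Parse_error.contextualize_error stray [] in
  Stdlib.Printf.printf "%s (hint: %s)\n" message
    (Stdlib.Option.value hint ~default:"none")
```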