Skip to content

Commit 82e8f92

Browse files
committed
feat(parser): support distant headers and flexible hash formats
This commit introduces a major enhancement to the markdown parser, enabling it to associate content-related headers (`File:`, `Append File:`, etc.) with code blocks that are not immediately adjacent, and adds more flexibility to the hash-based header format.

The parser is refactored from a two-pass to a three-pass architecture to support this new "distant header" functionality:

- **Pass 1:** Associates code blocks with immediately adjacent, internal, or wrapped headers. It now only marks a code block's range as "processed" if it was successfully associated with a header, leaving unassociated blocks available for the next pass.
- **Pass 2 (New):** Scans for any remaining unassociated content headers. For each one found, it searches forward for the next available (unprocessed) code block and links them. This allows for descriptive paragraphs or other content between a header and its corresponding code block.
- **Pass 3 (Formerly Pass 2):** Finds standalone headers that do not require a code block (`Deleted File:`, `Moved File:`). This pass now correctly ignores headers that appear inside *any* code block discovered in Pass 1, not just those that were associated with an action.

Additionally, the header-matching regular expression has been improved to:

- Support any number of hash symbols (e.g., `###`, `####`).
- Allow for arbitrary descriptive text on the same line before the action keyword (e.g., `### 1. New File: path/to/file.txt`).

Fixes and additions included in this refactor:

- Headers with invalid paths are now correctly marked as processed in Pass 1 to prevent them from being incorrectly re-evaluated by Pass 2.
- A new suite of integration tests (`tests/parser/create_distant.rs`) has been added to validate the new distant header functionality and its edge cases.
1 parent a85138b commit 82e8f92

File tree

13 files changed

+418
-144
lines changed

13 files changed

+418
-144
lines changed

src/parser/mod.rs

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@ mod header_utils;
1010
mod helpers;
1111
mod internal_comment;
1212
mod pass1;
13-
mod pass2;
13+
mod pass2; // Find unassociated content headers and link forward
14+
mod pass3; // Find standalone Delete/Move headers
1415
mod path_utils;
1516
mod regex; // Contains regex definitions
1617

@@ -38,6 +39,7 @@ pub fn parse_markdown(markdown_content: &str) -> Result<Vec<Action>, ParseError>
3839
let mut actions_with_pos: Vec<(usize, Action)> = Vec::new();
3940
let mut processed_header_starts: HashSet<usize> = HashSet::new();
4041
// Store (start, end) byte indices relative to content_to_parse
42+
let mut all_code_block_ranges: HashSet<(usize, usize)> = HashSet::new();
4143
let mut processed_code_block_ranges: HashSet<(usize, usize)> = HashSet::new();
4244

4345
let (content_to_parse, parse_offset) = helpers::preprocess_markdown(markdown_content);
@@ -48,25 +50,39 @@ pub fn parse_markdown(markdown_content: &str) -> Result<Vec<Action>, ParseError>
4850
}
4951

5052
// --- Pass 1: Find Code Blocks and associate actions ---
51-
println!("Step 1: Locating code blocks and associating actions...");
53+
println!(
54+
"Step 1: Locating code blocks and associating with adjacent/internal/wrapped headers..."
55+
);
5256
pass1::run_pass1(
5357
// Now calls the function in the pass1 module
5458
content_to_parse,
5559
parse_offset,
5660
&mut actions_with_pos,
5761
&mut processed_header_starts,
62+
&mut all_code_block_ranges,
5863
&mut processed_code_block_ranges,
5964
)?;
6065

61-
// --- Pass 2: Find standalone Delete headers and orphaned Create ---
62-
println!("\nStep 2: Locating standalone Delete headers and orphaned Create...");
66+
// --- Pass 2: Find unassociated content headers and link to next block ---
67+
println!(
68+
"\nStep 2: Locating unassociated content headers and linking to subsequent code blocks..."
69+
);
6370
pass2::run_pass2(
64-
// Now calls the function in the pass2 module
6571
content_to_parse,
6672
parse_offset,
6773
&mut actions_with_pos,
68-
&processed_header_starts, // Pass as immutable ref
69-
&processed_code_block_ranges, // Pass as immutable ref
74+
&mut processed_header_starts,
75+
&mut processed_code_block_ranges,
76+
)?;
77+
78+
// --- Pass 3: Find standalone Delete/Move headers ---
79+
println!("\nStep 3: Locating standalone Delete/Move headers...");
80+
pass3::run_pass3(
81+
content_to_parse,
82+
parse_offset,
83+
&mut actions_with_pos,
84+
&processed_header_starts, // Pass as immutable ref
85+
&all_code_block_ranges, // Pass as immutable ref
7086
)?;
7187

7288
// --- Sort actions by original position ---

src/parser/pass1/action_adder.rs

Lines changed: 15 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,47 +1,26 @@
1-
//! Handles adding a successfully parsed action to the list or logging skips in Pass 1.
1+
//! Handles adding a successfully parsed action to the list.
22
33
use crate::core_types::Action;
44
use std::collections::HashSet;
55

6-
/// Adds the action if found, sets its original position, and marks the header as processed.
7-
/// Logs a skip message if no action was found and the block wasn't skipped for other reasons.
8-
#[allow(clippy::too_many_arguments)]
9-
pub(crate) fn add_action_or_log_skip(
10-
current_action: Option<Action>,
11-
header_line_start_pos_rel: Option<usize>,
6+
/// Adds the action, sets its final original position, and marks the header as processed.
7+
pub(crate) fn add_action(
8+
mut action: Action,
9+
header_start_rel: usize,
1210
action_source: &str,
1311
parse_offset: usize,
14-
block_content_start: usize,
15-
original_block_start: usize,
1612
actions_with_pos: &mut Vec<(usize, Action)>,
1713
processed_header_starts: &mut HashSet<usize>,
1814
) {
19-
if let (Some(mut action), Some(header_start_rel)) = (current_action, header_line_start_pos_rel)
20-
{
21-
let original_pos = header_start_rel + parse_offset;
22-
// Ensure original_pos wasn't already set by wrapped header logic
23-
if action.original_pos == 0 {
24-
action.original_pos = original_pos;
25-
}
26-
println!(
27-
" -> Adding action from source '{}' with original_pos {}",
28-
action_source, action.original_pos
29-
);
30-
actions_with_pos.push((action.original_pos, action)); // Use final original_pos for sorting
31-
processed_header_starts.insert(original_pos); // Mark header associated with action
32-
} else if action_source == "unknown" {
33-
// Only log skip if no action was attempted from any source.
34-
// Check if the block was skipped because of an *ignored* internal header
35-
// (which would have been marked in processed_header_starts by handle_internal_header).
36-
let first_line_start_original = block_content_start + parse_offset;
37-
if !processed_header_starts.contains(&first_line_start_original) {
38-
println!(
39-
" Code block at original pos {} has no associated action header (checked external, wrapped, internal). Skipping.",
40-
original_block_start
41-
);
42-
}
15+
let original_pos = header_start_rel + parse_offset;
16+
// Ensure original_pos wasn't already set by wrapped header logic
17+
if action.original_pos == 0 {
18+
action.original_pos = original_pos;
4319
}
44-
// If action_source is known but current_action is None, it means a potential header
45-
// was found but ignored (e.g., invalid path, wrapped header without next block),
46-
// and the warning/info message was already printed by the respective handler.
20+
println!(
21+
" -> Adding action from source '{}' with original_pos {}",
22+
action_source, action.original_pos
23+
);
24+
actions_with_pos.push((action.original_pos, action)); // Use final original_pos for sorting
25+
processed_header_starts.insert(original_pos); // Mark header associated with action
4726
}

src/parser/pass1/action_determiner.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ pub(crate) fn determine_block_action(
2828
block_content_start,
2929
block_content_end,
3030
parse_offset,
31+
processed_header_starts,
3132
)? {
3233
return Ok(Some((action, header_pos, "external")));
3334
}

src/parser/pass1/block_processor.rs

Lines changed: 24 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ pub(crate) fn process_single_block(
2121
original_block_start: usize,
2222
actions_with_pos: &mut Vec<(usize, Action)>,
2323
processed_header_starts: &mut HashSet<usize>,
24+
all_code_block_ranges: &mut HashSet<(usize, usize)>,
2425
processed_code_block_ranges: &mut HashSet<(usize, usize)>,
2526
) -> Result<(), ParseError> {
2627
// Determine the action type and source associated with this block
@@ -36,41 +37,32 @@ pub(crate) fn process_single_block(
3637
processed_header_starts,
3738
processed_code_block_ranges,
3839
)?; // Use ? here
40+
let action_found = determination_result.is_some();
3941

40-
// Process the result
41-
match determination_result {
42-
// Destructuring the tuple works the same with the type alias
43-
Some((action, header_pos_rel, action_source)) => {
44-
// Add action if found and log skips
45-
action_adder::add_action_or_log_skip(
46-
Some(action), // Pass action
47-
Some(header_pos_rel),
48-
action_source,
49-
parse_offset,
50-
block_content_start,
51-
original_block_start,
52-
actions_with_pos,
53-
processed_header_starts,
54-
);
55-
}
56-
None => {
57-
// No action associated with this block from any known source
58-
action_adder::add_action_or_log_skip(
59-
None,
60-
None,
61-
"unknown", // Mark source as unknown
62-
parse_offset,
63-
block_content_start,
64-
original_block_start,
65-
actions_with_pos,
66-
processed_header_starts,
67-
);
68-
}
42+
if let Some((action, header_pos_rel, action_source)) = determination_result {
43+
action_adder::add_action(
44+
action,
45+
header_pos_rel,
46+
action_source,
47+
parse_offset,
48+
actions_with_pos,
49+
processed_header_starts,
50+
);
51+
} else {
52+
// The block was unassociated in this pass. Log it.
53+
println!(
54+
" Code block at original pos {} has no associated action header (checked external, wrapped, internal). Leaving for Pass 2.",
55+
original_block_start
56+
);
6957
}
7058

71-
// Always record the block range if we successfully found opening and closing fences,
72-
// unless it was already added by the wrapped header logic pairing it with *this* block.
73-
if !processed_code_block_ranges.contains(&(fence_start_pos, block_outer_end)) {
59+
// Always record that this range is a code block.
60+
all_code_block_ranges.insert((fence_start_pos, block_outer_end));
61+
62+
// ONLY record the block range if an action was found for it in this pass.
63+
// Wrapped actions mark their content blocks inside the determiner, so this
64+
// correctly handles external/internal actions.
65+
if action_found && !processed_code_block_ranges.contains(&(fence_start_pos, block_outer_end)) {
7466
processed_code_block_ranges.insert((fence_start_pos, block_outer_end));
7567
}
7668

src/parser/pass1/external_header.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ use crate::parser::helpers::ensure_trailing_newline;
88
use crate::parser::pass1::external_delete_special;
99
use crate::parser::path_utils::validate_path_format;
1010
use crate::parser::regex::HEADER_REGEX; // Import the new module
11+
use std::collections::HashSet;
1112

1213
/// Checks for and handles an external header preceding a code block.
1314
/// This applies to *any* code block, including ```markdown.
@@ -18,6 +19,7 @@ pub(crate) fn handle_external_header(
1819
block_content_start: usize,
1920
block_content_end: usize,
2021
parse_offset: usize,
22+
processed_header_starts: &mut HashSet<usize>,
2123
) -> Result<Option<(Action, usize)>, ParseError> {
2224
// Returns (Action, header_start_pos_rel)
2325
let fence_line_start_rel = content[..fence_start_pos].rfind('\n').map_or(0, |n| n + 1);
@@ -77,6 +79,7 @@ pub(crate) fn handle_external_header(
7779
"Warning: Invalid path format in external header '{}'. Skipping.",
7880
stripped_prev_line
7981
);
82+
processed_header_starts.insert(prev_line_start_rel + parse_offset);
8083
return Ok(None);
8184
}
8285

src/parser/pass1/mod.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,14 @@ mod action_determiner;
1111
mod block_processor;
1212
mod external_delete_special;
1313
mod external_header;
14-
mod fence_finder;
14+
pub(crate) mod fence_finder;
1515
#[cfg(test)] // Conditionally compile the test module
1616
mod fence_finder_tests; // ADDED test module
1717
mod internal_comment_handler;
1818
mod internal_header;
1919
mod internal_standard_handler;
2020
mod types;
21-
mod utils;
21+
pub(crate) mod utils;
2222
mod wrapped_create_handler;
2323
mod wrapped_header;
2424

@@ -29,6 +29,7 @@ pub(crate) fn run_pass1(
2929
parse_offset: usize,
3030
actions_with_pos: &mut Vec<(usize, Action)>,
3131
processed_header_starts: &mut HashSet<usize>,
32+
all_code_block_ranges: &mut HashSet<(usize, usize)>,
3233
processed_code_block_ranges: &mut HashSet<(usize, usize)>,
3334
) -> Result<(), ParseError> {
3435
let mut current_search_pos = 0;
@@ -90,6 +91,7 @@ pub(crate) fn run_pass1(
9091
original_block_start,
9192
actions_with_pos,
9293
processed_header_starts,
94+
all_code_block_ranges,
9395
processed_code_block_ranges,
9496
)?;
9597

0 commit comments

Comments
 (0)