Skip to content

Commit 004ae66

Browse files
committed
feat(parser): allow optional trailing text in action headers
Enables users to add non-functional descriptive text or comments after the path in various action header formats without affecting path extraction. This improves the self-documenting capability of input markdown files.

Examples of newly supported formats:
- `## File: path/to/file.txt (description)`
- `**Deleted File: old.log** # To be removed`
- `` `config.json` (main config) ``
- ``## `path/in/ticks.css` (style file)``

This was implemented by simplifying the main `HEADER_REGEX` capture groups for non-backticked paths (`**...**`, `## ...`), making them capture more broadly. The responsibility for precisely isolating the path and ignoring trailing text (starting with " (" or " #") is now handled by enhanced logic within the `header_utils::extract_action_path_from_captures` function in Rust. This approach avoids overly complex and brittle regex patterns.

Additionally, this fixes an edge case where a header whose path part consists only of backticks (for example ``## File: ` ``) could previously lead to the backticks themselves being incorrectly extracted as a path. The extractor now correctly identifies and rejects such cases.
1 parent 717bb4e commit 004ae66

File tree

7 files changed

+442
-65
lines changed

7 files changed

+442
-65
lines changed

src/parser/header_utils.rs

Lines changed: 68 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -4,77 +4,90 @@ use crate::constants::{ACTION_DELETED_FILE, ACTION_FILE};
44
use crate::core_types::ActionType;
55
use regex::Captures;
66

7-
/// Extracts action word and path string from HEADER_REGEX captures.
7+
/// Extracts action word and path string from HEADER_REGEX captures, ignoring trailing text.
8+
/// Relies on simplified regex capture groups and performs more parsing here.
89
pub(crate) fn extract_action_path_from_captures(caps: &Captures) -> Option<(String, String)> {
910
let mut action_word: Option<String> = None;
10-
let mut header_path: Option<String> = None;
11-
let mut content_str: Option<&str> = None;
11+
let mut final_path: Option<String> = None;
1212

13-
// Extract based on named capture groups
14-
if let (Some(aw), Some(c)) = (caps.name("action_word_bold"), caps.name("content_bold")) {
15-
action_word = Some(aw.as_str().to_string());
16-
content_str = Some(c.as_str());
17-
} else if let (Some(aw), Some(c)) = (caps.name("action_word_hash"), caps.name("content_hash")) {
18-
action_word = Some(aw.as_str().to_string());
19-
content_str = Some(c.as_str());
13+
// --- Determine Action Word and Raw Content/Path ---
14+
15+
// Check specific backtick path captures first (cleanest case)
16+
if let Some(p) = caps.name("path_hash_backtick") {
17+
action_word = Some(ACTION_FILE.to_string());
18+
final_path = Some(p.as_str().trim().to_string());
2019
} else if let Some(p) = caps.name("path_backtick_only") {
2120
action_word = Some(ACTION_FILE.to_string());
22-
header_path = Some(p.as_str().trim().to_string());
21+
final_path = Some(p.as_str().trim().to_string());
2322
} else if let Some(p) = caps.name("path_numbered_backtick") {
2423
action_word = Some(ACTION_FILE.to_string());
25-
header_path = Some(p.as_str().trim().to_string());
24+
final_path = Some(p.as_str().trim().to_string());
2625
} else if let Some(p) = caps.name("path_bold_backtick") {
2726
action_word = Some(ACTION_FILE.to_string());
28-
header_path = Some(p.as_str().trim().to_string());
29-
} else if let Some(p) = caps.name("path_hash_backtick") {
30-
action_word = Some(ACTION_FILE.to_string());
31-
header_path = Some(p.as_str().trim().to_string());
27+
final_path = Some(p.as_str().trim().to_string());
3228
}
33-
34-
// Process content_str for Bold/Hash Action formats to extract path
35-
if let Some(content) = content_str {
36-
let stripped_content = content.trim();
37-
// Check if the stripped content is *only* backticks (e.g., `` ` `` or ``` `` ```)
38-
// If so, treat it as an empty path.
39-
let is_only_backticks = stripped_content.starts_with('`')
40-
&& stripped_content.ends_with('`')
41-
&& stripped_content
42-
.chars()
43-
.skip(1)
44-
.take(stripped_content.len() - 2)
45-
.all(|c| c == '`');
46-
47-
if is_only_backticks {
48-
header_path = Some("".to_string()); // Treat as empty path explicitly
49-
} else {
50-
// Prefer path inside backticks if present within the content part
51-
header_path = Some(
52-
if stripped_content.len() > 1
53-
&& stripped_content.starts_with('`')
54-
&& stripped_content.ends_with('`')
55-
{
56-
stripped_content[1..stripped_content.len() - 1]
57-
.trim()
58-
.to_string() // Inside backticks
59-
} else {
60-
stripped_content.to_string() // Whole content as path
61-
},
62-
);
63-
}
29+
// Check combined Action: content captures (need parsing)
30+
else if let (Some(aw), Some(c)) = (caps.name("action_word_bold"), caps.name("content_bold")) {
31+
action_word = Some(aw.as_str().to_string());
32+
final_path = parse_content_for_path(c.as_str());
33+
} else if let (Some(aw), Some(c)) = (caps.name("action_word_hash"), caps.name("content_hash")) {
34+
action_word = Some(aw.as_str().to_string());
35+
final_path = parse_content_for_path(c.as_str());
6436
}
6537

66-
// Validate and return
67-
match (action_word, header_path) {
68-
// Ensure the extracted path is not empty AFTER trimming potential backticks and whitespace
69-
(Some(aw), Some(hp)) => {
70-
let final_path = hp.trim(); // Trim whitespace from final path string
71-
if !final_path.is_empty() {
72-
Some((aw, final_path.to_string()))
38+
// --- Validate and Return ---
39+
match (action_word, final_path) {
40+
// Ensure final path is not empty AFTER trimming potential backticks and whitespace
41+
(Some(aw), Some(fp)) => {
42+
let final_trimmed_path = fp.trim();
43+
// Add check: reject if path consists ONLY of backticks after trimming
44+
if !final_trimmed_path.is_empty() && final_trimmed_path.chars().all(|c| c == '`') {
45+
return None;
46+
}
47+
if !final_trimmed_path.is_empty() {
48+
Some((aw, final_trimmed_path.to_string()))
7349
} else {
7450
None
7551
}
7652
}
77-
_ => None,
53+
_ => None, // No action word, or path parsing failed/resulted in empty path
54+
}
55+
}
56+
57+
/// Parses the raw captured content string to extract the path, ignoring trailing text.
58+
fn parse_content_for_path(raw_content: &str) -> Option<String> {
59+
let trimmed_content = raw_content.trim();
60+
61+
// Check for path inside backticks first
62+
if let (Some(start), Some(end)) = (trimmed_content.find('`'), trimmed_content.rfind('`')) {
63+
if start < end {
64+
// Found distinct backticks, extract path from within
65+
let path_between_ticks = trimmed_content[start + 1..end].trim();
66+
// Ensure the content BETWEEN the ticks is not empty after trimming
67+
return if path_between_ticks.is_empty() {
68+
None
69+
} else {
70+
Some(path_between_ticks.to_string())
71+
};
72+
}
73+
// If start == end, `find` and `rfind` hit the same index: there is only a single, unpaired backtick.
74+
// Fall through to treat as non-backticked path.
75+
}
76+
77+
// No valid backticks found, treat as non-backticked path.
78+
// Find the end of the path (before potential trailing text).
79+
// Trailing text starts with " (" or " #".
80+
let path_end_index = trimmed_content
81+
.find(" (")
82+
.or_else(|| trimmed_content.find(" #"))
83+
.unwrap_or(trimmed_content.len()); // If no marker found, path is the whole string
84+
85+
let path = trimmed_content[..path_end_index].trim();
86+
87+
if path.is_empty() {
88+
None
89+
} else {
90+
Some(path.to_string())
7891
}
7992
}
8093

src/parser/header_utils_tests.rs

Lines changed: 62 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,35 @@ fn test_extract_backtick_only() {
6060
}
6161

6262
#[test]
63-
fn test_extract_backtick_only_with_trailing_text() {
63+
fn test_extract_bold_file_with_trailing_text_outside() {
64+
let input = "**File: path/to/file.txt** (description)";
65+
let caps = get_captures(input).expect("Regex failed to capture bold file with trailing text");
66+
let (action, path) = extract_action_path_from_captures(&caps).unwrap();
67+
assert_eq!(action, ACTION_FILE);
68+
assert_eq!(path, "path/to/file.txt"); // Trailing text ignored by extractor
69+
}
70+
71+
#[test]
72+
fn test_extract_hash_file_with_trailing_text() {
73+
let input = "## File: path/to/file.txt # comment";
74+
let caps = get_captures(input).expect("Regex failed to capture hash file with trailing text");
75+
let (action, path) = extract_action_path_from_captures(&caps).unwrap();
76+
assert_eq!(action, ACTION_FILE);
77+
assert_eq!(path, "path/to/file.txt"); // Trailing text ignored by extractor
78+
}
79+
80+
#[test]
81+
fn test_extract_hash_file_with_backticks_and_trailing_text() {
82+
let input = "## File: `path/in/ticks.txt` (description)";
83+
let caps = get_captures(input)
84+
.expect("Regex failed to capture hash file with ticks and trailing text");
85+
let (action, path) = extract_action_path_from_captures(&caps).unwrap();
86+
assert_eq!(action, ACTION_FILE);
87+
assert_eq!(path, "path/in/ticks.txt"); // Backticks stripped, trailing text ignored
88+
}
89+
90+
#[test]
91+
fn test_extract_backtick_only_with_trailing_text_outside() {
6492
let input = "`simple/path.rs` (some comment)";
6593
let caps =
6694
get_captures(input).expect("Regex failed to capture backtick only with trailing text");
@@ -69,6 +97,17 @@ fn test_extract_backtick_only_with_trailing_text() {
6997
assert_eq!(path, "simple/path.rs"); // Trailing text ignored
7098
}
7199

100+
// This test was already correct, but adding a comment for clarity
101+
#[test]
102+
fn test_extract_backtick_only_with_trailing_text() {
103+
let input = "`simple/path.rs` (some comment)";
104+
let caps =
105+
get_captures(input).expect("Regex failed to capture backtick only with trailing text");
106+
let (action, path) = extract_action_path_from_captures(&caps).unwrap();
107+
assert_eq!(action, ACTION_FILE);
108+
assert_eq!(path, "simple/path.rs"); // Trailing text ignored
109+
} // This test case is effectively duplicated by the one above now.
110+
72111
#[test]
73112
fn test_extract_numbered_backtick() {
74113
let input = "12. `numbered/path.py`";
@@ -78,6 +117,16 @@ fn test_extract_numbered_backtick() {
78117
assert_eq!(path, "numbered/path.py");
79118
}
80119

120+
#[test]
121+
fn test_extract_numbered_backtick_with_trailing_text() {
122+
let input = "12. `numbered/path.py` # comment";
123+
let caps =
124+
get_captures(input).expect("Regex failed to capture numbered backtick with trailing text");
125+
let (action, path) = extract_action_path_from_captures(&caps).unwrap();
126+
assert_eq!(action, ACTION_FILE);
127+
assert_eq!(path, "numbered/path.py"); // Trailing text ignored
128+
}
129+
81130
#[test]
82131
fn test_extract_bold_backtick() {
83132
let input = "**`bold/tick.js`**";
@@ -88,14 +137,15 @@ fn test_extract_bold_backtick() {
88137
}
89138

90139
#[test]
140+
// This test was already correct, but adding a comment for clarity
91141
fn test_extract_bold_backtick_with_trailing_text() {
92142
let input = "**`bold/tick.js`** and more";
93143
let caps =
94144
get_captures(input).expect("Regex failed to capture bold backtick with trailing text");
95145
let (action, path) = extract_action_path_from_captures(&caps).unwrap();
96146
assert_eq!(action, ACTION_FILE);
97147
assert_eq!(path, "bold/tick.js"); // Trailing text ignored
98-
}
148+
} // This test case is effectively duplicated by the one above now.
99149

100150
#[test]
101151
fn test_extract_hash_backtick() {
@@ -106,6 +156,16 @@ fn test_extract_hash_backtick() {
106156
assert_eq!(path, "hash/tick.css");
107157
}
108158

159+
#[test]
160+
fn test_extract_hash_backtick_with_trailing_text() {
161+
let input = "## `hash/tick.css` (style file)";
162+
let caps =
163+
get_captures(input).expect("Regex failed to capture hash backtick with trailing text");
164+
let (action, path) = extract_action_path_from_captures(&caps).unwrap();
165+
assert_eq!(action, ACTION_FILE);
166+
assert_eq!(path, "hash/tick.css"); // Trailing text ignored
167+
}
168+
109169
#[test]
110170
fn test_extract_no_match() {
111171
assert!(get_captures("Just text").is_none());

src/parser/regex.rs

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,20 +7,30 @@ use regex::Regex;
77
// Use Lazy from once_cell for thread-safe static initialization of Regex objects.
88

99
// Regex to find file headers anchored to the start of a line.
10-
// Revised to prevent matching across lines and simplify trailing whitespace handling.
10+
// Revised to simplify capture groups for non-backticked paths and rely more on Rust extraction.
1111
pub static HEADER_REGEX: Lazy<Regex> = Lazy::new(|| {
1212
let actions = &*VALID_ACTIONS_REGEX_STR; // Dereference Lazy<String>
1313

14-
// Use a single multi-line raw string literal r#"..."# as the format string
15-
// Put back \s*$ anchor specifically for content_bold and content_hash alternatives.
16-
// Use non-greedy *? for content capture before the \s*$ anchor.
17-
// No final \s*$ at the very end of the whole pattern string.
1814
let pattern = format!(
19-
r#"(?m)^(?:\*\*\s*(?P<action_word_bold>{actions}):\s+(?P<content_bold>[^\n]+?)\s*\*\*|##\s+`(?P<path_hash_backtick>[^`\n]+?)`|##\s+(?P<action_word_hash>{actions}):\s*(?P<content_hash>[^\n]*?)\s*$|`(?P<path_backtick_only>[^`\n]+?)`|(?P<num>\d+)\.\s+`(?P<path_numbered_backtick>[^`\n]+?)`|\*\*\s*`(?P<path_bold_backtick>[^`\n]+?)`\s*\*\*)"#,
20-
// Note: Added \s*$ to content_hash alternative only. content_bold already had \s*\*\* which acts similarly.
21-
// Kept content_hash as *? (non-greedy)
15+
// **Action: content**: Capture content non-greedily (`.+?`), allow optional trailing text after the closing **.
16+
// ## Action: content: Capture content greedily, no optional trailing text needed here (extractor handles).
17+
// Backtick versions remain specific but allow optional trailing text after marker.
18+
r#"(?m)^(?:\*\*\s*(?P<action_word_bold>{actions}):\s*(?P<content_bold>.+?)\s*\*\*(?:[^\n]*)?$|##\s+`(?P<path_hash_backtick>[^`\n]+?)`(?:[^\n]*)?$|##\s+(?P<action_word_hash>{actions}):\s*(?P<content_hash>.*)$|`(?P<path_backtick_only>[^`\n]+?)`(?:[^\n]*)?$|(?P<num>\d+)\.\s+`(?P<path_numbered_backtick>[^`\n]+?)`(?:[^\n]*)?$|\*\*\s*`(?P<path_bold_backtick>[^`\n]+?)`\s*\*\*(?:[^\n]*)?$)"#,
2219
actions = actions // Argument for format!
2320
);
21+
// Explanation of changes:
22+
// - **Bold (`**...**`)**:
23+
// - `action_word_bold` captures the action.
24+
// - `content_bold` captures `.+?` (non-greedy) after `Action: \s*`. <--- Reverted to non-greedy
25+
// - Requires `\s*\*\*` after content.
26+
// - Added `(?:[^\n]*)?$` back to allow optional trailing text AFTER the closing **. <--- FIX
27+
// - **Hash (`## Action: ...`)**:
28+
// - `action_word_hash` captures the action.
29+
// - `content_hash` captures `.*` (greedy, zero or more chars) after `Action: \s*`.
30+
// - Requires `$` at the end. Extraction logic handles parsing `content_hash`.
31+
// - **Backtick paths (`## `path``, `` `path` ``, `1. `path``, `**`path`**`)**:
32+
// - These alternatives remain largely unchanged, capturing the path inside backticks specifically.
33+
// - They still allow optional trailing text `(?:[^\n]*)?$` after the closing backtick/bold marker.
2434
// println!("[REGEX INIT] Revised HEADER_REGEX pattern:\n{}", pattern); // DEBUG (optional)
2535
Regex::new(&pattern).expect("Failed to compile HEADER_REGEX")
2636
});

tests/parser/create_external.rs

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,3 +81,42 @@ fn test_parse_hash_backtick_path_header() {
8181
Some("Some raw content.\n"),
8282
);
8383
}
84+
85+
#[test]
86+
fn test_parse_hash_file_header_with_trailing_comment() {
87+
let md = "\n## File: config.cfg # Main config file\n```\nkey=value\n```\n";
88+
let actions = parse_markdown(md).expect("Parsing failed");
89+
assert_eq!(actions.len(), 1);
90+
assert_action(
91+
actions.first(),
92+
ActionType::Create,
93+
"config.cfg", // Trailing comment ignored
94+
Some("key=value\n"),
95+
);
96+
}
97+
98+
#[test]
99+
fn test_parse_bold_file_header_with_trailing_text_outside() {
100+
let md = "\n**File: data.json** (important data)\n```json\n{}\n```\n";
101+
let actions = parse_markdown(md).expect("Parsing failed");
102+
assert_eq!(actions.len(), 1);
103+
assert_action(
104+
actions.first(),
105+
ActionType::Create,
106+
"data.json", // Trailing text ignored
107+
Some("{}\n"),
108+
);
109+
}
110+
111+
#[test]
112+
fn test_parse_backtick_path_header_with_trailing_text() {
113+
let md = "\n`script.pl` # Perl script\n```perl\n#!/usr/bin/perl\nprint \"Hi\";\n```\n";
114+
let actions = parse_markdown(md).expect("Parsing failed");
115+
assert_eq!(actions.len(), 1);
116+
assert_action(
117+
actions.first(),
118+
ActionType::Create,
119+
"script.pl", // Trailing text ignored
120+
Some("#!/usr/bin/perl\nprint \"Hi\";\n"),
121+
);
122+
}

tests/parser/create_wrapped.rs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,3 +81,16 @@ fn test_parse_markdown_block_not_a_header() {
8181
"Markdown block with non-header line should be ignored"
8282
);
8383
}
84+
85+
#[test]
86+
fn test_parse_wrapped_hash_file_header_with_trailing_text() {
87+
let md = "\n```markdown\n## File: wrapped/config.toml # Main config\n```\n\n```toml\n[settings]\nkey = \"value\"\n```\n";
88+
let actions = parse_markdown(md).expect("Parsing failed");
89+
assert_eq!(actions.len(), 1);
90+
assert_action(
91+
actions.first(),
92+
ActionType::Create,
93+
"wrapped/config.toml", // Trailing text ignored
94+
Some("[settings]\nkey = \"value\"\n"),
95+
);
96+
}

tests/parser/delete.rs

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,3 +76,29 @@ fn test_parse_wrapped_bold_deleted_file_header() {
7676
assert_eq!(actions.len(), 1);
7777
assert_action(actions.first(), ActionType::Delete, "temp.log", None);
7878
}
79+
80+
#[test]
81+
fn test_parse_hash_deleted_file_header_with_trailing_comment() {
82+
let md = "\n## Deleted File: old_cache.dat # Remove this\n";
83+
let actions = parse_markdown(md).expect("Parsing failed");
84+
assert_eq!(actions.len(), 1);
85+
assert_action(
86+
actions.first(),
87+
ActionType::Delete,
88+
"old_cache.dat", // Trailing comment ignored
89+
None,
90+
);
91+
}
92+
93+
#[test]
94+
fn test_parse_bold_deleted_file_header_with_trailing_text_outside() {
95+
let md = "\n**Deleted File: report.pdf** (old version)\n";
96+
let actions = parse_markdown(md).expect("Parsing failed");
97+
assert_eq!(actions.len(), 1);
98+
assert_action(
99+
actions.first(),
100+
ActionType::Delete,
101+
"report.pdf",
102+
None, // Trailing text ignored
103+
);
104+
}

0 commit comments

Comments
 (0)