feat(parser): allow optional trailing text in action headers

Romelium · Romelium · commit 004ae663a831 · 2025-04-13T04:38:20.000+08:00
Enables users to add non-functional descriptive text or comments after the path in various action header formats without affecting path extraction. This improves the self-documenting capability of input markdown files.

Examples of newly supported formats:
- `## File: path/to/file.txt (description)`
- `**Deleted File: old.log** # To be removed`
- `` `config.json` (main config)``
- ``## `path/in/ticks.css` (style file)``

This was implemented by simplifying the main `HEADER_REGEX` capture groups for non-backticked paths (`**...**`, `## ...`), making them capture more broadly. The responsibility for precisely isolating the path and ignoring trailing text (starting with " (" or " #") is now handled by enhanced logic within the `header_utils::extract_action_path_from_captures` function in Rust.

This approach avoids overly complex and brittle regex patterns.

Additionally, this fixes an edge case where headers whose path portion consists only of backticks (e.g. `` ## File: ``` ``) could previously lead to a backtick-only string being incorrectly extracted as a path. The extractor now correctly identifies and rejects such cases.
diff --git a/src/parser/header_utils.rs b/src/parser/header_utils.rs
@@ -4,77 +4,90 @@ use crate::constants::{ACTION_DELETED_FILE, ACTION_FILE};
 use crate::core_types::ActionType;
 use regex::Captures;
 
-/// Extracts action word and path string from HEADER_REGEX captures.
+/// Extracts action word and path string from HEADER_REGEX captures, ignoring trailing text.
+/// Relies on simplified regex capture groups and performs more parsing here.
 pub(crate) fn extract_action_path_from_captures(caps: &Captures) -> Option<(String, String)> {
     let mut action_word: Option<String> = None;
-    let mut header_path: Option<String> = None;
-    let mut content_str: Option<&str> = None;
+    let mut final_path: Option<String> = None;
 
-    // Extract based on named capture groups
-    if let (Some(aw), Some(c)) = (caps.name("action_word_bold"), caps.name("content_bold")) {
-        action_word = Some(aw.as_str().to_string());
-        content_str = Some(c.as_str());
-    } else if let (Some(aw), Some(c)) = (caps.name("action_word_hash"), caps.name("content_hash")) {
-        action_word = Some(aw.as_str().to_string());
-        content_str = Some(c.as_str());
+    // --- Determine Action Word and Raw Content/Path ---
+
+    // Check specific backtick path captures first (cleanest case)
+    if let Some(p) = caps.name("path_hash_backtick") {
+        action_word = Some(ACTION_FILE.to_string());
+        final_path = Some(p.as_str().trim().to_string());
     } else if let Some(p) = caps.name("path_backtick_only") {
         action_word = Some(ACTION_FILE.to_string());
-        header_path = Some(p.as_str().trim().to_string());
+        final_path = Some(p.as_str().trim().to_string());
     } else if let Some(p) = caps.name("path_numbered_backtick") {
         action_word = Some(ACTION_FILE.to_string());
-        header_path = Some(p.as_str().trim().to_string());
+        final_path = Some(p.as_str().trim().to_string());
     } else if let Some(p) = caps.name("path_bold_backtick") {
         action_word = Some(ACTION_FILE.to_string());
-        header_path = Some(p.as_str().trim().to_string());
-    } else if let Some(p) = caps.name("path_hash_backtick") {
-        action_word = Some(ACTION_FILE.to_string());
-        header_path = Some(p.as_str().trim().to_string());
+        final_path = Some(p.as_str().trim().to_string());
     }
-
-    // Process content_str for Bold/Hash Action formats to extract path
-    if let Some(content) = content_str {
-        let stripped_content = content.trim();
-        // Check if the stripped content is *only* backticks (e.g., `` ` `` or ``` `` ```)
-        // If so, treat it as an empty path.
-        let is_only_backticks = stripped_content.starts_with('`')
-            && stripped_content.ends_with('`')
-            && stripped_content
-                .chars()
-                .skip(1)
-                .take(stripped_content.len() - 2)
-                .all(|c| c == '`');
-
-        if is_only_backticks {
-            header_path = Some("".to_string()); // Treat as empty path explicitly
-        } else {
-            // Prefer path inside backticks if present within the content part
-            header_path = Some(
-                if stripped_content.len() > 1
-                    && stripped_content.starts_with('`')
-                    && stripped_content.ends_with('`')
-                {
-                    stripped_content[1..stripped_content.len() - 1]
-                        .trim()
-                        .to_string() // Inside backticks
-                } else {
-                    stripped_content.to_string() // Whole content as path
-                },
-            );
-        }
+    // Check combined Action: content captures (need parsing)
+    else if let (Some(aw), Some(c)) = (caps.name("action_word_bold"), caps.name("content_bold")) {
+        action_word = Some(aw.as_str().to_string());
+        final_path = parse_content_for_path(c.as_str());
+    } else if let (Some(aw), Some(c)) = (caps.name("action_word_hash"), caps.name("content_hash")) {
+        action_word = Some(aw.as_str().to_string());
+        final_path = parse_content_for_path(c.as_str());
     }
 
-    // Validate and return
-    match (action_word, header_path) {
-        // Ensure the extracted path is not empty AFTER trimming potential backticks and whitespace
-        (Some(aw), Some(hp)) => {
-            let final_path = hp.trim(); // Trim whitespace from final path string
-            if !final_path.is_empty() {
-                Some((aw, final_path.to_string()))
+    // --- Validate and Return ---
+    match (action_word, final_path) {
+        // Ensure final path is not empty AFTER trimming potential backticks and whitespace
+        (Some(aw), Some(fp)) => {
+            let final_trimmed_path = fp.trim();
+            // Add check: reject if path consists ONLY of backticks after trimming
+            if !final_trimmed_path.is_empty() && final_trimmed_path.chars().all(|c| c == '`') {
+                return None;
+            }
+            if !final_trimmed_path.is_empty() {
+                Some((aw, final_trimmed_path.to_string()))
             } else {
                 None
             }
         }
-        _ => None,
+        _ => None, // No action word, or path parsing failed/resulted in empty path
+    }
+}
+
+/// Parses the raw captured content string to extract the path, ignoring trailing text.
+fn parse_content_for_path(raw_content: &str) -> Option<String> {
+    let trimmed_content = raw_content.trim();
+
+    // Check for path inside backticks first
+    if let (Some(start), Some(end)) = (trimmed_content.find('`'), trimmed_content.rfind('`')) {
+        if start < end {
+            // Found distinct backticks, extract path from within
+            let path_between_ticks = trimmed_content[start + 1..end].trim();
+            // Ensure the content BETWEEN the ticks is not empty after trimming
+            return if path_between_ticks.is_empty() {
+                None
+            } else {
+                Some(path_between_ticks.to_string())
+            };
+        }
+        // If start >= end, backticks are malformed or nested in a way we don't handle here.
+        // Fall through to treat as non-backticked path.
+    }
+
+    // No valid backticks found, treat as non-backticked path.
+    // Find the end of the path (before potential trailing text).
+    // Trailing text starts with " (" or " #".
+    let path_end_index = trimmed_content
+        .find(" (")
+        .or_else(|| trimmed_content.find(" #"))
+        .unwrap_or(trimmed_content.len()); // If no marker found, path is the whole string
+
+    let path = trimmed_content[..path_end_index].trim();
+
+    if path.is_empty() {
+        None
+    } else {
+        Some(path.to_string())
     }
 }
 
diff --git a/src/parser/header_utils_tests.rs b/src/parser/header_utils_tests.rs
@@ -60,7 +60,35 @@ fn test_extract_backtick_only() {
 }
 
 #[test]
-fn test_extract_backtick_only_with_trailing_text() {
+fn test_extract_bold_file_with_trailing_text_outside() {
+    let input = "**File: path/to/file.txt** (description)";
+    let caps = get_captures(input).expect("Regex failed to capture bold file with trailing text");
+    let (action, path) = extract_action_path_from_captures(&caps).unwrap();
+    assert_eq!(action, ACTION_FILE);
+    assert_eq!(path, "path/to/file.txt"); // Trailing text ignored by extractor
+}
+
+#[test]
+fn test_extract_hash_file_with_trailing_text() {
+    let input = "## File: path/to/file.txt # comment";
+    let caps = get_captures(input).expect("Regex failed to capture hash file with trailing text");
+    let (action, path) = extract_action_path_from_captures(&caps).unwrap();
+    assert_eq!(action, ACTION_FILE);
+    assert_eq!(path, "path/to/file.txt"); // Trailing text ignored by extractor
+}
+
+#[test]
+fn test_extract_hash_file_with_backticks_and_trailing_text() {
+    let input = "## File: `path/in/ticks.txt` (description)";
+    let caps = get_captures(input)
+        .expect("Regex failed to capture hash file with ticks and trailing text");
+    let (action, path) = extract_action_path_from_captures(&caps).unwrap();
+    assert_eq!(action, ACTION_FILE);
+    assert_eq!(path, "path/in/ticks.txt"); // Backticks stripped, trailing text ignored
+}
+
+#[test]
+fn test_extract_backtick_only_with_trailing_text_outside() {
     let input = "`simple/path.rs` (some comment)";
     let caps =
         get_captures(input).expect("Regex failed to capture backtick only with trailing text");
@@ -69,6 +97,17 @@ fn test_extract_backtick_only_with_trailing_text() {
     assert_eq!(path, "simple/path.rs"); // Trailing text ignored
 }
 
+// This test was already correct, but adding a comment for clarity
+#[test]
+fn test_extract_backtick_only_with_trailing_text() {
+    let input = "`simple/path.rs` (some comment)";
+    let caps =
+        get_captures(input).expect("Regex failed to capture backtick only with trailing text");
+    let (action, path) = extract_action_path_from_captures(&caps).unwrap();
+    assert_eq!(action, ACTION_FILE);
+    assert_eq!(path, "simple/path.rs"); // Trailing text ignored
+} // This test case is effectively duplicated by the one above now.
+
 #[test]
 fn test_extract_numbered_backtick() {
     let input = "12. `numbered/path.py`";
@@ -78,6 +117,16 @@ fn test_extract_numbered_backtick() {
     assert_eq!(path, "numbered/path.py");
 }
 
+#[test]
+fn test_extract_numbered_backtick_with_trailing_text() {
+    let input = "12. `numbered/path.py` # comment";
+    let caps =
+        get_captures(input).expect("Regex failed to capture numbered backtick with trailing text");
+    let (action, path) = extract_action_path_from_captures(&caps).unwrap();
+    assert_eq!(action, ACTION_FILE);
+    assert_eq!(path, "numbered/path.py"); // Trailing text ignored
+}
+
 #[test]
 fn test_extract_bold_backtick() {
     let input = "**`bold/tick.js`**";
@@ -88,14 +137,15 @@ fn test_extract_bold_backtick() {
 }
 
 #[test]
+// This test was already correct, but adding a comment for clarity
 fn test_extract_bold_backtick_with_trailing_text() {
     let input = "**`bold/tick.js`** and more";
     let caps =
         get_captures(input).expect("Regex failed to capture bold backtick with trailing text");
     let (action, path) = extract_action_path_from_captures(&caps).unwrap();
     assert_eq!(action, ACTION_FILE);
     assert_eq!(path, "bold/tick.js"); // Trailing text ignored
-}
+} // This test case is effectively duplicated by the one above now.
 
 #[test]
 fn test_extract_hash_backtick() {
@@ -106,6 +156,16 @@ fn test_extract_hash_backtick() {
     assert_eq!(path, "hash/tick.css");
 }
 
+#[test]
+fn test_extract_hash_backtick_with_trailing_text() {
+    let input = "## `hash/tick.css` (style file)";
+    let caps =
+        get_captures(input).expect("Regex failed to capture hash backtick with trailing text");
+    let (action, path) = extract_action_path_from_captures(&caps).unwrap();
+    assert_eq!(action, ACTION_FILE);
+    assert_eq!(path, "hash/tick.css"); // Trailing text ignored
+}
+
 #[test]
 fn test_extract_no_match() {
     assert!(get_captures("Just text").is_none());
diff --git a/src/parser/regex.rs b/src/parser/regex.rs
@@ -7,20 +7,30 @@ use regex::Regex;
 // Use Lazy from once_cell for thread-safe static initialization of Regex objects.
 
 // Regex to find file headers anchored to the start of a line.
-// Revised to prevent matching across lines and simplify trailing whitespace handling.
+// Revised to simplify capture groups for non-backticked paths and rely more on Rust extraction.
 pub static HEADER_REGEX: Lazy<Regex> = Lazy::new(|| {
     let actions = &*VALID_ACTIONS_REGEX_STR; // Dereference Lazy<String>
 
-    // Use a single multi-line raw string literal r#"..."# as the format string
-    // Put back \s*$ anchor specifically for content_bold and content_hash alternatives.
-    // Use non-greedy *? for content capture before the \s*$ anchor.
-    // No final \s*$ at the very end of the whole pattern string.
     let pattern = format!(
-        r#"(?m)^(?:\*\*\s*(?P<action_word_bold>{actions}):\s+(?P<content_bold>[^\n]+?)\s*\*\*|##\s+`(?P<path_hash_backtick>[^`\n]+?)`|##\s+(?P<action_word_hash>{actions}):\s*(?P<content_hash>[^\n]*?)\s*$|`(?P<path_backtick_only>[^`\n]+?)`|(?P<num>\d+)\.\s+`(?P<path_numbered_backtick>[^`\n]+?)`|\*\*\s*`(?P<path_bold_backtick>[^`\n]+?)`\s*\*\*)"#,
-        // Note: Added \s*$ to content_hash alternative only. content_bold already had \s*\*\* which acts similarly.
-        //       Kept content_hash as *? (non-greedy)
+        // **Action: content**: Capture content non-greedily (`.+?`), allow optional trailing text after **.
+        // ## Action: content: Capture content greedily, no optional trailing text needed here (extractor handles).
+        // Backtick versions remain specific but allow optional trailing text after marker.
+        r#"(?m)^(?:\*\*\s*(?P<action_word_bold>{actions}):\s*(?P<content_bold>.+?)\s*\*\*(?:[^\n]*)?$|##\s+`(?P<path_hash_backtick>[^`\n]+?)`(?:[^\n]*)?$|##\s+(?P<action_word_hash>{actions}):\s*(?P<content_hash>.*)$|`(?P<path_backtick_only>[^`\n]+?)`(?:[^\n]*)?$|(?P<num>\d+)\.\s+`(?P<path_numbered_backtick>[^`\n]+?)`(?:[^\n]*)?$|\*\*\s*`(?P<path_bold_backtick>[^`\n]+?)`\s*\*\*(?:[^\n]*)?$)"#,
         actions = actions // Argument for format!
     );
+    // Explanation of changes:
+    // - **Bold (`**...**`)**:
+    //   - `action_word_bold` captures the action.
+    //   - `content_bold` captures `.+?` (non-greedy) after `Action: \s*`. <--- Reverted to non-greedy
+    //   - Requires `\s*\*\*` after content.
+    //   - Added `(?:[^\n]*)?$` back to allow optional trailing text AFTER the closing **. <--- FIX
+    // - **Hash (`## Action: ...`)**:
+    //   - `action_word_hash` captures the action.
+    //   - `content_hash` captures `.*` (greedy, zero or more chars) after `Action: \s*`.
+    //   - Requires `$` at the end. Extraction logic handles parsing `content_hash`.
+    // - **Backtick paths (`## `path``, `` `path` ``, `1. `path``, `**`path`**`)**:
+    //   - These alternatives remain largely unchanged, capturing the path inside backticks specifically.
+    //   - They still allow optional trailing text `(?:[^\n]*)?$` after the closing backtick/bold marker.
     // println!("[REGEX INIT] Revised HEADER_REGEX pattern:\n{}", pattern); // DEBUG (optional)
     Regex::new(&pattern).expect("Failed to compile HEADER_REGEX")
 });
diff --git a/tests/parser/create_external.rs b/tests/parser/create_external.rs
@@ -81,3 +81,42 @@ fn test_parse_hash_backtick_path_header() {
         Some("Some raw content.\n"),
     );
 }
+
+#[test]
+fn test_parse_hash_file_header_with_trailing_comment() {
+    let md = "\n## File: config.cfg # Main config file\n```\nkey=value\n```\n";
+    let actions = parse_markdown(md).expect("Parsing failed");
+    assert_eq!(actions.len(), 1);
+    assert_action(
+        actions.first(),
+        ActionType::Create,
+        "config.cfg", // Trailing comment ignored
+        Some("key=value\n"),
+    );
+}
+
+#[test]
+fn test_parse_bold_file_header_with_trailing_text_outside() {
+    let md = "\n**File: data.json** (important data)\n```json\n{}\n```\n";
+    let actions = parse_markdown(md).expect("Parsing failed");
+    assert_eq!(actions.len(), 1);
+    assert_action(
+        actions.first(),
+        ActionType::Create,
+        "data.json", // Trailing text ignored
+        Some("{}\n"),
+    );
+}
+
+#[test]
+fn test_parse_backtick_path_header_with_trailing_text() {
+    let md = "\n`script.pl` # Perl script\n```perl\n#!/usr/bin/perl\nprint \"Hi\";\n```\n";
+    let actions = parse_markdown(md).expect("Parsing failed");
+    assert_eq!(actions.len(), 1);
+    assert_action(
+        actions.first(),
+        ActionType::Create,
+        "script.pl", // Trailing text ignored
+        Some("#!/usr/bin/perl\nprint \"Hi\";\n"),
+    );
+}
diff --git a/tests/parser/create_wrapped.rs b/tests/parser/create_wrapped.rs
@@ -81,3 +81,16 @@ fn test_parse_markdown_block_not_a_header() {
         "Markdown block with non-header line should be ignored"
     );
 }
+
+#[test]
+fn test_parse_wrapped_hash_file_header_with_trailing_text() {
+    let md = "\n```markdown\n## File: wrapped/config.toml # Main config\n```\n\n```toml\n[settings]\nkey = \"value\"\n```\n";
+    let actions = parse_markdown(md).expect("Parsing failed");
+    assert_eq!(actions.len(), 1);
+    assert_action(
+        actions.first(),
+        ActionType::Create,
+        "wrapped/config.toml", // Trailing text ignored
+        Some("[settings]\nkey = \"value\"\n"),
+    );
+}
diff --git a/tests/parser/delete.rs b/tests/parser/delete.rs
@@ -76,3 +76,29 @@ fn test_parse_wrapped_bold_deleted_file_header() {
     assert_eq!(actions.len(), 1);
     assert_action(actions.first(), ActionType::Delete, "temp.log", None);
 }
+
+#[test]
+fn test_parse_hash_deleted_file_header_with_trailing_comment() {
+    let md = "\n## Deleted File: old_cache.dat # Remove this\n";
+    let actions = parse_markdown(md).expect("Parsing failed");
+    assert_eq!(actions.len(), 1);
+    assert_action(
+        actions.first(),
+        ActionType::Delete,
+        "old_cache.dat", // Trailing comment ignored
+        None,
+    );
+}
+
+#[test]
+fn test_parse_bold_deleted_file_header_with_trailing_text_outside() {
+    let md = "\n**Deleted File: report.pdf** (old version)\n";
+    let actions = parse_markdown(md).expect("Parsing failed");
+    assert_eq!(actions.len(), 1);
+    assert_action(
+        actions.first(),
+        ActionType::Delete,
+        "report.pdf",
+        None, // Trailing text ignored
+    );
+}
diff --git a/tests/parser/header_utils_tests.rs b/tests/parser/header_utils_tests.rs