diff --git a/Cargo.lock b/Cargo.lock index 24ece07..5e0d363 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -146,6 +146,15 @@ version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + [[package]] name = "bstr" version = "1.10.0" @@ -258,7 +267,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -305,29 +314,6 @@ dependencies = [ "windows-sys 0.48.0", ] -[[package]] -name = "comemo" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df6916408a724339aa77b18214233355f3eb04c42eb895e5f8909215bd8a7a91" -dependencies = [ - "comemo-macros", - "once_cell", - "parking_lot", - "siphasher", -] - -[[package]] -name = "comemo-macros" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8936e42f9b4f5bdfaf23700609ac1f11cb03ad4c1ec128a4ee4fd0903e228db" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.79", -] - [[package]] name = "core-foundation" version = "0.9.4" @@ -344,6 +330,15 @@ version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" +[[package]] +name = "cpufeatures" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16b80225097f2e5ae4e7179dd2266824648f3e2f49d9134d584b76389d31c4c3" +dependencies = [ + "libc", +] + [[package]] name = "criterion" version = "0.5.1" @@ -411,12 +406,32 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array", + "typenum", +] + [[package]] name = "difflib" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8" +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", +] + [[package]] name = "doc-comment" version = "0.3.3" @@ -456,7 +471,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -576,7 +591,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -609,6 +624,16 @@ dependencies = [ "slab", ] +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + [[package]] name = "getopts" version = "0.2.21" @@ -677,6 +702,21 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fbf6a919d6cf397374f7dfeeea91d974c7c0a7221d0d0f4f20d859d329e53fcc" +[[package]] +name = "html_parser" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f56db07b6612644f6f7719f8ef944f75fff9d6378fdf3d316fd32194184abd" +dependencies = [ + "doc-comment", + "pest", + "pest_derive", + "serde", + "serde_derive", + "serde_json", + "thiserror 1.0.64", +] + 
[[package]] name = "http" version = "0.2.12" @@ -827,6 +867,7 @@ dependencies = [ "criterion", "enum_dispatch", "futures", + "html_parser", "is-terminal", "lifetime", "predicates", @@ -836,7 +877,7 @@ dependencies = [ "serde_json", "tempfile", "termcolor", - "thiserror", + "thiserror 1.0.64", "tokio", "typst-syntax", ] @@ -889,16 +930,6 @@ version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" -[[package]] -name = "lock_api" -version = "0.4.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" -dependencies = [ - "autocfg", - "scopeguard", -] - [[package]] name = "log" version = "0.4.22" @@ -1014,7 +1045,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -1046,33 +1077,55 @@ dependencies = [ ] [[package]] -name = "parking_lot" -version = "0.12.3" +name = "percent-encoding" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" + +[[package]] +name = "pest" +version = "2.7.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b7cafe60d6cf8e62e1b9b2ea516a089c008945bb5a275416789e7db0bc199dc" +dependencies = [ + "memchr", + "thiserror 2.0.8", + "ucd-trie", +] + +[[package]] +name = "pest_derive" +version = "2.7.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" +checksum = "816518421cfc6887a0d62bf441b6ffb4536fcc926395a69e1a85852d4363f57e" dependencies = [ - "lock_api", - "parking_lot_core", + "pest", + "pest_generator", ] [[package]] -name = "parking_lot_core" -version = "0.9.10" +name = "pest_generator" +version = 
"2.7.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" +checksum = "7d1396fd3a870fc7838768d171b4616d5c91f6cc25e377b673d714567d99377b" dependencies = [ - "cfg-if", - "libc", - "redox_syscall", - "smallvec", - "windows-targets 0.52.6", + "pest", + "pest_meta", + "proc-macro2", + "quote", + "syn 2.0.90", ] [[package]] -name = "percent-encoding" -version = "2.3.1" +name = "pest_meta" +version = "2.7.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" +checksum = "e1e58089ea25d717bfd31fb534e4f3afcc2cc569c70de3e239778991ea3b7dea" +dependencies = [ + "once_cell", + "pest", + "sha2", +] [[package]] name = "pin-project-lite" @@ -1120,6 +1173,12 @@ dependencies = [ "plotters-backend", ] +[[package]] +name = "portable-atomic" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6" + [[package]] name = "predicates" version = "3.1.2" @@ -1152,9 +1211,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.86" +version = "1.0.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" +checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0" dependencies = [ "unicode-ident", ] @@ -1207,15 +1266,6 @@ dependencies = [ "crossbeam-utils", ] -[[package]] -name = "redox_syscall" -version = "0.5.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "355ae415ccd3a04315d3f8246e86d67689ea74d88d915576e1589a351062a13b" -dependencies = [ - "bitflags 2.6.0", -] - [[package]] name = "regex" version = "1.10.6" @@ -1337,12 +1387,6 @@ dependencies = [ "windows-sys 0.59.0", ] -[[package]] -name = "scopeguard" -version = "1.2.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" - [[package]] name = "security-framework" version = "2.11.1" @@ -1383,7 +1427,7 @@ checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -1398,6 +1442,15 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_spanned" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87607cb1398ed59d48732e575a4c28a7a8ebf2454b964fe3f224f2afc07909e1" +dependencies = [ + "serde", +] + [[package]] name = "serde_urlencoded" version = "0.7.1" @@ -1410,6 +1463,17 @@ dependencies = [ "serde", ] +[[package]] +name = "sha2" +version = "0.10.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + [[package]] name = "shlex" version = "1.3.0" @@ -1431,12 +1495,6 @@ dependencies = [ "autocfg", ] -[[package]] -name = "smallvec" -version = "1.13.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" - [[package]] name = "socket2" version = "0.5.7" @@ -1466,9 +1524,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.79" +version = "2.0.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89132cd0bf050864e1d38dc3bbc07a0eb8e7530af26344d3d2bbbef83499f590" +checksum = "919d3b74a5dd0ccd15aeb8f93e7006bd9e14c295087c9896a110f490752bcf31" dependencies = [ "proc-macro2", "quote", @@ -1540,13 +1598,28 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76" +[[package]] +name = "thin-vec" +version = "0.2.13" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "a38c90d48152c236a3ab59271da4f4ae63d678c5d7ad6b7714d7cb9760be5e4b" + [[package]] name = "thiserror" version = "1.0.64" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d50af8abc119fb8bb6dbabcfa89656f46f84aa0ac7688088608076ad2b459a84" dependencies = [ - "thiserror-impl", + "thiserror-impl 1.0.64", +] + +[[package]] +name = "thiserror" +version = "2.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08f5383f3e0071702bf93ab5ee99b52d26936be9dedd9413067cbdcddcb6141a" +dependencies = [ + "thiserror-impl 2.0.8", ] [[package]] @@ -1557,7 +1630,18 @@ checksum = "08904e7672f5eb876eaaf87e0ce17857500934f4981c4a0ab2b4aa98baac7fc3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2f357fcec90b3caef6623a099691be676d033b40a058ac95d2a6ade6fa0c943" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.90", ] [[package]] @@ -1609,7 +1693,7 @@ checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -1635,6 +1719,40 @@ dependencies = [ "tokio", ] +[[package]] +name = "toml" +version = "0.8.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1ed1f98e3fdc28d6d910e6737ae6ab1a93bf1985935a1193e68f93eeb68d24e" +dependencies = [ + "serde", + "serde_spanned", + "toml_datetime", + "toml_edit", +] + +[[package]] +name = "toml_datetime" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0dd7358ecb8fc2f8d014bf86f6f638ce72ba252a2c3a2572f2a795f1d23efb41" +dependencies = [ + "serde", +] + +[[package]] +name = "toml_edit" +version = "0.22.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"4ae48d6208a266e853d946088ed816055e556cc6028c5e8e2b84d9fa5dd7c7f5" +dependencies = [ + "indexmap", + "serde", + "serde_spanned", + "toml_datetime", + "winnow", +] + [[package]] name = "tower-service" version = "0.3.3" @@ -1666,16 +1784,23 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" +[[package]] +name = "typenum" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" + [[package]] name = "typst-syntax" -version = "0.11.1" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3db69f2f41613b1ff6edbec44fd7dc524137f099ee36c46f560cedeaadb40c4" +checksum = "05b7be8b6ed6b2cb39ca495947d548a28d7db0ba244008e44c5a759120327693" dependencies = [ - "comemo", "ecow", "once_cell", "serde", + "toml", + "typst-utils", "unicode-ident", "unicode-math-class", "unicode-script", @@ -1683,6 +1808,25 @@ dependencies = [ "unscanny", ] +[[package]] +name = "typst-utils" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f0305443ed97f0b658471487228f86bf835705e7525fbdcc671cebd864f7a40" +dependencies = [ + "once_cell", + "portable-atomic", + "rayon", + "siphasher", + "thin-vec", +] + +[[package]] +name = "ucd-trie" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2896d95c02a80c6d6a5d6e953d479f5ddf2dfdb6a244441010e373ac0fb88971" + [[package]] name = "unicase" version = "2.7.0" @@ -1828,7 +1972,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", "wasm-bindgen-shared", ] @@ -1862,7 +2006,7 @@ checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ 
-2062,6 +2206,15 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "winnow" +version = "0.6.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36c1fec1a2bb5866f07c25f68c26e565c4c200aebb96d7e55710c19d3e8ac49b" +dependencies = [ + "memchr", +] + [[package]] name = "winreg" version = "0.50.0" diff --git a/Cargo.toml b/Cargo.toml index f58857a..7b0a252 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,6 +13,7 @@ annotate-snippets = {version = "^0.9.1", optional = true} clap = {version = "^4.5.18", features = ["cargo", "derive", "env", "wrap_help"], optional = true} clap_complete = {version = "^4.5.2", optional = true} enum_dispatch = {version = "0.3.13", optional = true} +html_parser = {version = "0.7.0", optional = true} is-terminal = {version = "0.4.3", optional = true} lifetime = {version = "0.1.0", features = ["macros"]} pulldown-cmark = {version = "0.10.2", optional = true} @@ -22,7 +23,7 @@ serde_json = "^1.0" termcolor = {version = "1.2.0", optional = true} thiserror = "^1.0" tokio = {version = "^1.0", features = ["macros", "rt-multi-thread"], optional = true} -typst-syntax = {version = "^0.11.0", optional = true} +typst-syntax = {version = "^0.12.0", optional = true} [dev-dependencies] assert_cmd = "2.0.11" @@ -35,13 +36,14 @@ tokio = {version = "^1.0", features = ["macros"]} [features] annotate = ["dep:annotate-snippets"] -cli = ["annotate", "color", "dep:clap", "dep:enum_dispatch", "dep:is-terminal", "multithreaded"] +cli = ["annotate", "color", "dep:clap", "dep:enum_dispatch", "dep:is-terminal", "multithreaded", "typst", "markdown", "html"] cli-complete = ["cli", "clap_complete"] color = ["annotate-snippets?/color", "dep:termcolor"] default = ["cli", "native-tls"] docker = [] full = ["cli-complete", "docker", "unstable"] -markdown = ["dep:pulldown-cmark"] +html = ["dep:html_parser"] +markdown = 
["dep:pulldown-cmark", "html"] multithreaded = ["dep:tokio"] native-tls = ["reqwest/native-tls"] native-tls-vendored = ["reqwest/native-tls-vendored"] @@ -63,7 +65,7 @@ license = "MIT" name = "languagetool-rust" readme = "README.md" repository = "https://github.com/jeertmans/languagetool-rust" -rust-version = "1.75.0" +rust-version = "1.77.2" version = "2.1.4" [package.metadata.docs.rs] diff --git a/src/cli/check.rs b/src/cli/check.rs index f58773e..d3ea6aa 100644 --- a/src/cli/check.rs +++ b/src/cli/check.rs @@ -20,6 +20,7 @@ use crate::{ server::ServerClient, }, error::{Error, Result}, + parsers::{parse_html, parse_markdown, parse_typst}, }; use super::ExecuteSubcommand; @@ -74,8 +75,12 @@ pub enum FileType { /// Auto. #[default] Auto, + /// Raw text. + Raw, /// Markdown. Markdown, + /// HTML. + Html, /// Typst. Typst, } @@ -119,21 +124,61 @@ impl ExecuteSubcommand for Command { // FILES for filename in self.filenames.iter() { - let text = std::fs::read_to_string(filename)?; - let requests = request - .clone() - .with_text(text) - .split(self.max_length, self.split_pattern.as_str()); - let response = server_client.check_multiple_and_join(requests).await?; + let mut file_type = self.r#type.clone(); + + // If file type is "Auto", guess file type from extension + if matches!(self.r#type, FileType::Auto) { + file_type = match PathBuf::from(filename).extension().and_then(|e| e.to_str()) { + Some(ext) => match ext { + "typ" => FileType::Typst, + "md" | "markdown" | "mdown" | "mdwn" | "mkd" | "mkdn" | "mdx" => { + FileType::Markdown + }, + + "html" | "htm" => FileType::Html, + _ => FileType::Raw, + }, + None => FileType::Raw, + }; + }; + + let file_content = std::fs::read_to_string(filename)?; + let (response, text): (check::Response, String) = match &file_type { + FileType::Auto => unreachable!(), + FileType::Raw => { + let requests = (request.clone().with_text(&file_content)) + .split(self.max_length, self.split_pattern.as_str()); + let response = 
server_client.check_multiple_and_join(requests).await?; + (response.into(), file_content) + }, + FileType::Typst | FileType::Markdown | FileType::Html => { + let data = match file_type { + FileType::Typst => parse_typst(&file_content), + FileType::Html => { + let text = parse_html(&file_content); + Data::from_iter([DataAnnotation::new_text(text)]) + }, + FileType::Markdown => { + let text = parse_markdown(&file_content); + Data::from_iter([DataAnnotation::new_text(text)]) + }, + _ => unreachable!(), + }; + let response = server_client + .check(&request.clone().with_data(data)) + .await?; + (response, file_content) + }, + }; if !self.raw { writeln!( &mut stdout, "{}", - &response.annotate(response.text.as_ref(), filename.to_str(), color) + &response.annotate(&text, filename.to_str(), color) )?; } else { - writeln!(&mut stdout, "{}", serde_json::to_string_pretty(&*response)?)?; + writeln!(&mut stdout, "{}", serde_json::to_string_pretty(&response)?)?; } } diff --git a/src/lib.rs b/src/lib.rs index 4d8f386..a7cde7a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -21,3 +21,4 @@ pub mod api; #[cfg(feature = "cli")] pub mod cli; pub mod error; +pub mod parsers; diff --git a/src/parsers.rs b/src/parsers.rs new file mode 100644 index 0000000..3907c2a --- /dev/null +++ b/src/parsers.rs @@ -0,0 +1,210 @@ +//! Utilities for parsing the contents of different file types into a text +//! representation that can be sent to the LanguageTool API. + +use crate::api::check::{Data, DataAnnotation}; + +/// Parse the contents of an HTML file into a text format to be sent to the +/// LanguageTool API. 
#[cfg(feature = "html")]
///
/// # Conversion rules
///
/// - `head`, `script` and `style` elements are dropped together with their
///   content.
/// - `code` and `a` elements are replaced by the placeholders `_code_` and
///   `_link_`, and `pre` elements by `_pre_` followed by a blank line; their
///   content is not inspected.
/// - `p`, `h1`-`h6`, `li`, `td` and `th` elements are emitted as paragraphs of
///   their own, i.e., surrounded by blank lines.
/// - Any other element only contributes whatever its children produce.
/// - Text is trimmed, but a single leading and/or trailing space is kept when
///   the original text started and/or ended with whitespace.
/// - Nodes that are neither elements nor text contribute nothing.
///
/// Element names are compared ASCII case-insensitively, so `<P>` and `<p>`
/// are handled alike.
///
/// If the input cannot be parsed, the parse error is discarded and the
/// default DOM is used instead (NOTE(review): presumably an empty document,
/// hence an empty string is returned -- confirm against `html_parser`).
pub fn parse_html(file_content: impl AsRef<str>) -> String {
    use html_parser::{Dom, Node};

    /// Remove all trailing whitespace from `txt`, in place.
    fn trim_trailing_whitespace(txt: &mut String) {
        let kept = txt.trim_end().len();
        txt.truncate(kept);
    }

    /// Append the text representation of every node in `nodes` to `txt`.
    fn handle_nodes(txt: &mut String, nodes: &[Node]) {
        for node in nodes {
            handle_node(txt, node);
        }
    }

    /// Append the text representation of `node` (and, recursively, of its
    /// children) to `txt`.
    ///
    /// Nodes are only borrowed, so no subtree is ever cloned.
    fn handle_node(txt: &mut String, node: &Node) {
        if let Some(element) = node.element() {
            // HTML tag names are case-insensitive.
            let name = element.name.to_ascii_lowercase();

            match name.as_str() {
                // Nothing in there is prose.
                "head" | "script" | "style" => {},
                // Placeholders keep the surrounding sentence grammatical
                // without sending code or link text to the checker.
                "code" => txt.push_str("_code_"),
                "a" => txt.push_str("_link_"),
                "pre" => txt.push_str("_pre_\n\n"),
                // Block-level elements become paragraphs of their own.
                "p" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "li" | "td" | "th" => {
                    trim_trailing_whitespace(txt);
                    txt.push_str("\n\n");
                    handle_nodes(txt, &element.children);
                    txt.push_str("\n\n");
                },
                // Inline and unknown elements are transparent.
                _ => handle_nodes(txt, &element.children),
            }
            return;
        }

        let Some(raw) = node.text() else {
            return;
        };

        let trimmed = raw.trim();
        if trimmed.is_empty() {
            return;
        }

        // Maintain leading/trailing white space, but only a single space.
        if raw.starts_with(char::is_whitespace) {
            trim_trailing_whitespace(txt);
            txt.push(' ');
        }
        txt.push_str(trimmed);
        if raw.ends_with(char::is_whitespace) {
            txt.push(' ');
        }
    }

    let dom = Dom::parse(file_content.as_ref()).unwrap_or_default();

    let mut txt = String::new();
    handle_nodes(&mut txt, &dom.children);
    txt
}

/// Parse the contents of a Markdown file into a text format to be sent to the
/// LanguageTool API.
///
/// The Markdown source is first rendered to HTML (with the strikethrough and
/// table extensions enabled), and that HTML is then converted to text by
/// [`parse_html`]. This is why the `markdown` feature enables the `html`
/// feature.
#[cfg(feature = "markdown")]
pub fn parse_markdown(file_content: impl AsRef<str>) -> String {
    use pulldown_cmark::{html, Options, Parser};

    let options = Options::ENABLE_STRIKETHROUGH | Options::ENABLE_TABLES;
    let parser = Parser::new_ext(file_content.as_ref(), options);

    let mut rendered = String::new();
    html::push_html(&mut rendered, parser);

    parse_html(rendered)
}

/// Parse the contents of a Typst file into a text format to be sent to the
/// LanguageTool API.
///
/// The returned [`Data`] is a sequence of annotations, in document order:
///
/// - code-like nodes (set/show/let rules, raw blocks, code, math, references,
///   function calls, ...) become markup interpreted as `_ignore_`;
/// - headings, list and enum items, emphasis, strong text and nested markup
///   become markup interpreted as the concatenation of their text and space
///   leaves;
/// - any other node that has children is traversed;
/// - remaining text, smart quote, comment, space and paragraph break leaves
///   are emitted as plain text.
///
/// Every other leaf is skipped.
#[cfg(feature = "typst")]
pub fn parse_typst(file_content: impl AsRef<str>) -> Data<'static> {
    use typst_syntax::{parse, SyntaxKind, SyntaxNode};

    // This pattern is ignored by LanguageTool, and allows us to avoid
    // whitespace issues. The following sentence would give an error for
    // repeated whitespace otherwise: This has ``` `backticks` ``` in it
    const IGNORED: &str = "_ignore_";

    /// Call `visit`, in document order, on every descendant of `node` whose
    /// text is not empty. Descendants with empty text are descended into
    /// instead.
    fn for_each_text_leaf<'a>(node: &'a SyntaxNode, mut visit: impl FnMut(&'a SyntaxNode)) {
        // Children are pushed in reverse so that popping yields them in order.
        let mut stack: Vec<&'a SyntaxNode> = node.children().rev().collect();
        while let Some(current) = stack.pop() {
            if current.text().is_empty() {
                stack.extend(current.children().rev());
            } else {
                visit(current);
            }
        }
    }

    let mut annotations: Vec<DataAnnotation> = vec![];

    let root = parse(file_content.as_ref());
    // Explicit stack, filled in reverse, for an in-order depth-first walk.
    let mut nodes: Vec<&SyntaxNode> = root.children().rev().collect();

    while let Some(node) = nodes.pop() {
        match node.kind() {
            // Pure markup: nothing in there should be checked.
            SyntaxKind::SetRule
            | SyntaxKind::Ident
            | SyntaxKind::ShowRule
            | SyntaxKind::Raw
            | SyntaxKind::Code
            | SyntaxKind::CodeBlock
            | SyntaxKind::Math
            | SyntaxKind::Equation
            | SyntaxKind::Ref
            | SyntaxKind::LetBinding
            | SyntaxKind::FieldAccess
            | SyntaxKind::FuncCall
            | SyntaxKind::Args => {
                let mut markup = node.text().to_string();
                if markup.is_empty() {
                    // The node has no text of its own: rebuild it from its
                    // descendants.
                    for_each_text_leaf(node, |leaf| markup.push_str(leaf.text().as_str()));
                }

                annotations.push(DataAnnotation::new_interpreted_markup(
                    markup,
                    IGNORED.to_string(),
                ));
            },
            // Markup with valid text interpretations.
            SyntaxKind::Heading
            | SyntaxKind::Markup
            | SyntaxKind::EnumItem
            | SyntaxKind::ListItem
            | SyntaxKind::Emph
            | SyntaxKind::Strong => {
                let mut full_text = String::new();
                let mut interpreted_as = String::new();

                for_each_text_leaf(node, |leaf| {
                    let text = leaf.text().as_str();
                    // Only text and spaces are checked; every leaf counts
                    // as markup.
                    if matches!(leaf.kind(), SyntaxKind::Text | SyntaxKind::Space) {
                        interpreted_as.push_str(text);
                    }
                    full_text.push_str(text);
                });

                annotations.push(DataAnnotation::new_interpreted_markup(
                    full_text,
                    interpreted_as,
                ));
            },
            // Nested nodes: visit the children instead of the node itself.
            _ if node.children().next().is_some() => {
                nodes.extend(node.children().rev());
            },
            // Text.
            SyntaxKind::Text
            | SyntaxKind::SmartQuote
            | SyntaxKind::BlockComment
            | SyntaxKind::LineComment
            | SyntaxKind::Space
            | SyntaxKind::Parbreak => {
                annotations.push(DataAnnotation::new_text(node.text().to_string()));
            },
            // Any other leaf is dropped.
            _ => {},
        }
    }

    Data::from_iter(annotations)
}