jeertmans · jeertmans · Apr 9, 2025 · Mar 2, 2025 · Mar 8, 2025 · Mar 8, 2025
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -10,22 +10,31 @@ required-features = ["cli"]
 
 [dependencies]
 annotate-snippets = {version = "^0.9.1", optional = true}
-clap = {version = "^4.5.18", features = ["cargo", "derive", "env", "wrap_help"], optional = true}
-clap-verbosity-flag = { version = "3.0.2", optional = true }
+clap = {version = "^4.5.18", features = [
+  "cargo",
+  "derive",
+  "env",
+  "wrap_help",
+], optional = true}
+clap-verbosity-flag = {version = "3.0.2", optional = true}
 clap_complete = {version = "^4.5.2", optional = true}
+ego-tree = {version = "0.10.0", optional = true}
 enum_dispatch = {version = "0.3.13", optional = true}
-html_parser = {version = "0.7.0", optional = true}
 is-terminal = {version = "0.4.3", optional = true}
 lifetime = {version = "0.1.0", features = ["macros"]}
-log = { version = "0.4.25", optional = true }
-pretty_env_logger = { version = "0.5.0", optional = true }
+log = {version = "0.4.25", optional = true}
+pretty_env_logger = {version = "0.5.0", optional = true}
 pulldown-cmark = {version = "0.10.2", optional = true}
 reqwest = {version = "^0.11", default-features = false, features = ["json"]}
+scraper = {version = "0.23.1", optional = true}
 serde = {version = "^1.0", features = ["derive"]}
 serde_json = "^1.0"
 termcolor = {version = "1.2.0", optional = true}
 thiserror = "^1.0"
-tokio = {version = "^1.0", features = ["macros", "rt-multi-thread"], optional = true}
+tokio = {version = "^1.0", features = [
+  "macros",
+  "rt-multi-thread",
+], optional = true}
 typst-syntax = {version = "^0.12.0", optional = true}
 
 [dev-dependencies]
@@ -34,29 +43,43 @@ assert_matches = "1.5.0"
 codspeed-criterion-compat = "2.7.0"
 criterion = "0.5"
 futures = "0.3"
+insta = {version = "1.41.1", features = ["yaml"]}
 predicates = "3.0.3"
 tempfile = "3.5.0"
 tokio = {version = "^1.0", features = ["macros"]}
 
 [features]
 annotate = ["dep:annotate-snippets"]
-cli = ["annotate", "color", "dep:clap", "dep:clap-verbosity-flag", "dep:enum_dispatch", "dep:is-terminal", "dep:log", "dep:pretty_env_logger", "multithreaded", "typst", "markdown", "html"]
+clap-verbosity-flag = ["dep:clap-verbosity-flag"]
+cli = [
+  "annotate",
+  "color",
+  "dep:clap",
+  "dep:clap-verbosity-flag",
+  "dep:enum_dispatch",
+  "dep:is-terminal",
+  "dep:log",
+  "dep:pretty_env_logger",
+  "multithreaded",
+  "typst",
+  "markdown",
+  "html",
+]
 cli-complete = ["cli", "clap_complete"]
 color = ["annotate-snippets?/color", "dep:termcolor"]
 default = ["cli", "native-tls"]
 docker = []
 full = ["cli-complete", "docker", "unstable"]
-html = ["dep:html_parser"]
+html = ["dep:ego-tree", "dep:scraper"]
+log = ["dep:log"]
 markdown = ["dep:pulldown-cmark", "html"]
 multithreaded = ["dep:tokio"]
 native-tls = ["reqwest/native-tls"]
 native-tls-vendored = ["reqwest/native-tls-vendored"]
+pretty_env_logger = ["dep:pretty_env_logger"]
 pulldown-cmark = ["dep:pulldown-cmark"]
 typst = ["dep:typst-syntax"]
 unstable = []
-log = ["dep:log"]
-pretty_env_logger = ["dep:pretty_env_logger"]
-clap-verbosity-flag = ["dep:clap-verbosity-flag"]
 
 [lib]
 name = "languagetool_rust"
@@ -72,7 +95,7 @@ license = "MIT"
 name = "languagetool-rust"
 readme = "README.md"
 repository = "https://github.com/jeertmans/languagetool-rust"
-rust-version = "1.77.2"
+rust-version = "1.80.0"
 version = "2.1.4"
 
 [package.metadata.release]

diff --git a/src/api/check.rs b/src/api/check.rs
@@ -239,6 +239,62 @@ pub struct Data<'source> {
     pub annotation: Vec<DataAnnotation<'source>>,
 }
 
+impl Data<'_> {
+    /// Split data into as few fragments as possible, where each fragment
+    /// contains (if possible) a maximum of `n` bytes in its annotations'
+    /// markup and text fields.
+    ///
+    /// Pattern str `pat` is used for splitting: a fragment may only end right
+    /// after an annotation whose text contains `pat`. Lengths are measured
+    /// with [`str::len`], i.e., in bytes.
+    ///
+    /// Every annotation of `self` ends up in exactly one fragment, in its
+    /// original order, and at least one fragment is always returned. A
+    /// fragment exceeds `n` only when no break point allows a shorter one.
+    #[must_use]
+    pub fn split(self, n: usize, pat: &str) -> Vec<Self> {
+        // Candidate break points: the index of every annotation whose text
+        // contains `pat`, paired with the cumulative length of text + markup
+        // up to and including that annotation.
+        let mut break_point_lengths = vec![];
+        let mut len = 0;
+        for (i, ann) in self.annotation.iter().enumerate() {
+            len +=
+                ann.text.as_deref().unwrap_or("").len() + ann.markup.as_deref().unwrap_or("").len();
+            if ann.text.as_ref().is_some_and(|t| t.contains(pat)) {
+                break_point_lengths.push((i, len));
+            }
+        }
+
+        // Greedily extend the current fragment: break at a candidate only if
+        // going on to the next candidate (or to the end of the data, for the
+        // last candidate) would make the fragment longer than `n`.
+        let mut break_points: Vec<usize> = vec![];
+        // Cumulative length at which the current fragment starts.
+        let mut base = 0;
+        let mut candidates = break_point_lengths.into_iter().peekable();
+        while let Some((index, length)) = candidates.next() {
+            let next_length = candidates.peek().map_or(len, |&(_, next)| next);
+            // Cumulative lengths never decrease, so this cannot underflow.
+            if next_length - base > n {
+                break_points.push(index);
+                base = length;
+            }
+        }
+
+        // Split annotations based on calculated break points
+        let mut split = Vec::with_capacity(break_points.len() + 1);
+        let mut iter = self.annotation.into_iter();
+        let mut taken = 0;
+        for break_point in break_points {
+            // Break points are strictly increasing, so `count` is at least 1.
+            let count = break_point + 1 - taken;
+            split.push(Data::from_iter(iter.by_ref().take(count)));
+            taken += count;
+        }
+
+        // Whatever follows the last break point forms the final fragment, so
+        // that no annotation is lost. It is also what makes unsplittable data
+        // come back as a single fragment.
+        let rest: Vec<_> = iter.collect();
+        if !rest.is_empty() || split.is_empty() {
+            split.push(Data::from_iter(rest));
+        }
+
+        split
+    }
+}
+
 impl IntoStatic for Data<'_> {
     type Static = Data<'static>;
     fn into_static(self) -> Self::Static {
@@ -259,6 +315,15 @@ impl<'source, T: Into<DataAnnotation<'source>>> FromIterator<T> for Data<'source
     }
 }
 
+/// Consuming iteration over the annotations stored in a [`Data`].
+impl<'source> IntoIterator for Data<'source> {
+    type IntoIter = std::vec::IntoIter<DataAnnotation<'source>>;
+    type Item = DataAnnotation<'source>;
+
+    /// Consumes `self` and yields its annotations by value, in stored order.
+    fn into_iter(self) -> Self::IntoIter {
+        IntoIterator::into_iter(self.annotation)
+    }
+}
+
 impl Serialize for Data<'_> {
     fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
     where
@@ -601,10 +666,20 @@ impl<'source> Request<'source> {
     ///
     /// # Errors
     ///
-    /// If `self.text` is none.
+    /// If `self.text` is [`None`] and `self.data` is [`None`].
     pub fn try_split(mut self, n: usize, pat: &str) -> Result<Vec<Self>> {
+        // DATA ANNOTATIONS
+        if let Some(data) = mem::take(&mut self.data) {
+            return Ok(data
+                .split(n, pat)
+                .into_iter()
+                .map(|d| self.clone().with_data(d))
+                .collect());
+        }
+
+        // TEXT
         let text = mem::take(&mut self.text)
-            .ok_or_else(|| Error::InvalidRequest("missing text field".to_string()))?;
+            .ok_or_else(|| Error::InvalidRequest("missing text or data field".to_string()))?;
         let string: &str = match &text {
             Cow::Owned(s) => s.as_str(),
             Cow::Borrowed(s) => s,
@@ -944,6 +1019,29 @@ impl Response {
         }
         annotation
     }
+
+    /// Merges the given [`Response`] into the current one and returns the
+    /// result.
+    ///
+    /// The matches of `other` (and, with the `unstable` feature, its sentence
+    /// ranges) are moved, unchanged, after the ones of `self`. All other
+    /// fields of the returned value are the ones of `self`.
+    ///
+    /// This is especially useful when a request was split into multiple
+    /// requests.
+    #[must_use]
+    pub fn append(mut self, mut other: Self) -> Self {
+        #[cfg(feature = "unstable")]
+        if let Some(mut extra_ranges) = other.sentence_ranges.take() {
+            match self.sentence_ranges.as_mut() {
+                // Both sides have ranges: concatenate them.
+                Some(ranges) => ranges.append(&mut extra_ranges),
+                // Only `other` has ranges: adopt them as they are.
+                None => self.sentence_ranges = Some(extra_ranges),
+            }
+        }
+
+        self.matches.append(&mut other.matches);
+
+        self
+    }
 }
 
 /// Check response with additional context.

diff --git a/src/api/server.rs b/src/api/server.rs
@@ -1,14 +1,17 @@
 //! Structure to communicate with some `LanguageTool` server through the API.
 
+#[cfg(feature = "multithreaded")]
+use crate::api::check;
 use crate::{
     api::{
-        check::{self, Request, Response},
+        check::{Request, Response},
         languages, words,
     },
     error::{Error, Result},
 };
 #[cfg(feature = "cli")]
 use clap::Args;
+#[cfg(feature = "multithreaded")]
 use lifetime::IntoStatic;
 use reqwest::{
     header::{HeaderValue, ACCEPT},
@@ -423,15 +426,14 @@ impl ServerClient {
                 "no request; cannot join zero request".to_string(),
             ));
         }
-        let mut tasks = Vec::with_capacity(requests.len());
 
-        requests
+        let tasks = requests
             .into_iter()
             .map(|r| r.into_static())
-            .for_each(|request| {
+            .map(|request| {
                 let server_client = self.clone();
 
-                tasks.push(tokio::spawn(async move {
+                tokio::spawn(async move {
                     let response = server_client.check(&request).await?;
                     let text = request.text.ok_or_else(|| {
                         Error::InvalidRequest(
@@ -440,9 +442,9 @@ impl ServerClient {
                         )
                     })?;
                     Result::<(Cow<'static, str>, Response)>::Ok((text, response))
-                }));
+                })
             });
-
+      
         let mut response_with_context: Option<check::ResponseWithContext> = None;
 
         for task in tasks {
@@ -457,6 +459,45 @@ impl ServerClient {
         Ok(response_with_context.unwrap())
     }
 
+    /// Send multiple check requests and join them into a single response,
+    /// without any context.
+    ///
+    /// Requests are sent one after the other, in the given order, and their
+    /// responses are joined with [`Response::append`].
+    ///
+    /// # Errors
+    ///
+    /// If `requests` is empty, or if checking any of the requests fails.
+    ///
+    /// # Panics
+    ///
+    /// If one of the spawned tasks cannot be joined, e.g., because it
+    /// panicked.
+    #[cfg(feature = "multithreaded")]
+    pub async fn check_multiple_and_join_without_context<'source>(
+        &self,
+        requests: Vec<Request<'source>>,
+    ) -> Result<check::Response> {
+        let mut response: Option<check::Response> = None;
+
+        // This iterator is lazy: each task is only spawned when the loop
+        // below asks for it.
+        let tasks = requests
+            .into_iter()
+            .map(|r| r.into_static())
+            .map(|request| {
+                let server_client = self.clone();
+
+                tokio::spawn(async move {
+                    let response = server_client.check(&request).await?;
+                    Result::<Response>::Ok(response)
+                })
+            });
+
+        // Make requests in sequence
+        for task in tasks {
+            let resp = task.await.unwrap()?;
+
+            response = Some(match response {
+                Some(r) => r.append(resp),
+                None => resp,
+            });
+        }
+
+        // `response` is only `None` when there was no request at all: report
+        // it as an invalid request instead of panicking.
+        response.ok_or_else(|| {
+            Error::InvalidRequest("no request; cannot join zero request".to_string())
+        })
+    }
+
     /// Send a check request to the server, await for the response and annotate
     /// it.
     #[cfg(feature = "annotate")]

diff --git a/src/cli/check.rs b/src/cli/check.rs
@@ -21,7 +21,7 @@ use crate::{
         server::ServerClient,
     },
     error::{Error, Result},
-    parsers::{parse_html, parse_markdown, parse_typst},
+    parsers::{html::parse_html, markdown::parse_markdown, typst::parse_typst},
 };
 
 use super::ExecuteSubcommand;
@@ -175,6 +175,7 @@ impl ExecuteSubcommand for Command {
             };
 
             let file_content = std::fs::read_to_string(filename)?;
+
             let (response, text): (check::Response, String) = match &file_type {
                 FileType::Auto => unreachable!(),
                 FileType::Raw => {
@@ -192,18 +193,14 @@ impl ExecuteSubcommand for Command {
                 FileType::Typst | FileType::Markdown | FileType::Html => {
                     let data = match file_type {
                         FileType::Typst => parse_typst(&file_content),
-                        FileType::Html => {
-                            let text = parse_html(&file_content);
-                            Data::from_iter([DataAnnotation::new_text(text)])
-                        },
-                        FileType::Markdown => {
-                            let text = parse_markdown(&file_content);
-                            Data::from_iter([DataAnnotation::new_text(text)])
-                        },
+                        FileType::Html => parse_html(&file_content),
+                        FileType::Markdown => parse_markdown(&file_content),
                         _ => unreachable!(),
                     };
+                    let requests = (request.clone().with_data(data))
+                        .split(self.max_length, self.split_pattern.as_str());
                     let response = server_client
-                        .check(&request.clone().with_data(data))
+                        .check_multiple_and_join_without_context(requests)
                         .await?;
                     (response, file_content)
                 },