Skip to content

feat: use data annotations for HTML and Markdown files, and implement splitting of data annotation requests #134

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 29 commits into from
Apr 9, 2025
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
f6998de
feat: use data annotations for Markdown files
Rolv-Apneseth Mar 2, 2025
c19b15b
refactor: use shared ignore pattern
Rolv-Apneseth Mar 8, 2025
7d276f1
feat: use data annotations for HTML files
Rolv-Apneseth Mar 8, 2025
433db83
feat: splitting for data annotation requests
Rolv-Apneseth Mar 8, 2025
40c35a9
fix: pass `cargo check` for different features combinations
Rolv-Apneseth Mar 8, 2025
1ec8f82
fix: satisfy toml pre-commit hook
Rolv-Apneseth Mar 8, 2025
e11e289
Merge branch 'v3' into data_annotations
jeertmans Mar 11, 2025
2c51dd4
feat(tests): snapshot tests for file parsers
Rolv-Apneseth Mar 11, 2025
9a319c4
Merge branch 'v3' into data_annotations
jeertmans Mar 12, 2025
11de4b2
remove dup
jeertmans Mar 12, 2025
4fa1244
fix(tests): go for `lazy_static`
jeertmans Mar 12, 2025
44b10d2
fmt
jeertmans Mar 12, 2025
5b95ba4
setup MSRV to minimal
jeertmans Mar 12, 2025
3c560b9
fix(tests): filter out paths from snapshots
Rolv-Apneseth Mar 12, 2025
07235cd
refactor(tests): use util function for building cmd
Rolv-Apneseth Mar 12, 2025
015a5bd
Merge branch 'v3' into data_annotations
Rolv-Apneseth Mar 12, 2025
8349405
fix(tests): adjust snapshots to results from running against docker-h…
Rolv-Apneseth Mar 12, 2025
64078ca
chore(tests): only test snapshots on latest
jeertmans Mar 13, 2025
7209411
chore(ci): fix when CI runs
jeertmans Mar 13, 2025
727ca6a
chore(tests): only test snapshots on latest
jeertmans Mar 13, 2025
d06c92d
Merge remote-tracking branch 'Rolv-Apneseth/data_annotations' into da…
jeertmans Mar 13, 2025
75a6a03
fmt
jeertmans Mar 13, 2025
c0dea8c
fix(ci): add dummy value
jeertmans Mar 13, 2025
d7a81c7
fix(ci): clear artifacts after each run of `cargo hack` to avoid exha…
Rolv-Apneseth Mar 17, 2025
de79ac7
refactor: adjust `cli` feature combination
Rolv-Apneseth Mar 17, 2025
29b0ce8
fix(ci): significantly reduce `cargo hack` runtime by reducing possib…
Rolv-Apneseth Mar 17, 2025
413d99d
refactor: use plain markup instead of interpreted markup
Rolv-Apneseth Mar 18, 2025
b58470f
accept snapshot changes for false positives
Rolv-Apneseth Apr 1, 2025
dc81d5c
fix: remove needless lifetime
Rolv-Apneseth Apr 1, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
474 changes: 359 additions & 115 deletions Cargo.lock

Large diffs are not rendered by default.

47 changes: 35 additions & 12 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,22 +10,31 @@ required-features = ["cli"]

[dependencies]
annotate-snippets = {version = "^0.9.1", optional = true}
clap = {version = "^4.5.18", features = ["cargo", "derive", "env", "wrap_help"], optional = true}
clap-verbosity-flag = { version = "3.0.2", optional = true }
clap = {version = "^4.5.18", features = [
"cargo",
"derive",
"env",
"wrap_help",
], optional = true}
clap-verbosity-flag = {version = "3.0.2", optional = true}
clap_complete = {version = "^4.5.2", optional = true}
ego-tree = {version = "0.10.0", optional = true}
enum_dispatch = {version = "0.3.13", optional = true}
html_parser = {version = "0.7.0", optional = true}
is-terminal = {version = "0.4.3", optional = true}
lifetime = {version = "0.1.0", features = ["macros"]}
log = { version = "0.4.25", optional = true }
pretty_env_logger = { version = "0.5.0", optional = true }
log = {version = "0.4.25", optional = true}
pretty_env_logger = {version = "0.5.0", optional = true}
pulldown-cmark = {version = "0.10.2", optional = true}
reqwest = {version = "^0.11", default-features = false, features = ["json"]}
scraper = {version = "0.23.1", optional = true}
serde = {version = "^1.0", features = ["derive"]}
serde_json = "^1.0"
termcolor = {version = "1.2.0", optional = true}
thiserror = "^1.0"
tokio = {version = "^1.0", features = ["macros", "rt-multi-thread"], optional = true}
tokio = {version = "^1.0", features = [
"macros",
"rt-multi-thread",
], optional = true}
typst-syntax = {version = "^0.12.0", optional = true}

[dev-dependencies]
Expand All @@ -34,29 +43,43 @@ assert_matches = "1.5.0"
codspeed-criterion-compat = "2.7.0"
criterion = "0.5"
futures = "0.3"
insta = {version = "1.41.1", features = ["yaml"]}
predicates = "3.0.3"
tempfile = "3.5.0"
tokio = {version = "^1.0", features = ["macros"]}

[features]
annotate = ["dep:annotate-snippets"]
cli = ["annotate", "color", "dep:clap", "dep:clap-verbosity-flag", "dep:enum_dispatch", "dep:is-terminal", "dep:log", "dep:pretty_env_logger", "multithreaded", "typst", "markdown", "html"]
clap-verbosity-flag = ["dep:clap-verbosity-flag"]
cli = [
"annotate",
"color",
"dep:clap",
"dep:clap-verbosity-flag",
"dep:enum_dispatch",
"dep:is-terminal",
"dep:log",
"dep:pretty_env_logger",
"multithreaded",
"typst",
"markdown",
"html",
]
cli-complete = ["cli", "clap_complete"]
color = ["annotate-snippets?/color", "dep:termcolor"]
default = ["cli", "native-tls"]
docker = []
full = ["cli-complete", "docker", "unstable"]
html = ["dep:html_parser"]
html = ["dep:ego-tree", "dep:scraper"]
log = ["dep:log"]
markdown = ["dep:pulldown-cmark", "html"]
multithreaded = ["dep:tokio"]
native-tls = ["reqwest/native-tls"]
native-tls-vendored = ["reqwest/native-tls-vendored"]
pretty_env_logger = ["dep:pretty_env_logger"]
pulldown-cmark = ["dep:pulldown-cmark"]
typst = ["dep:typst-syntax"]
unstable = []
log = ["dep:log"]
pretty_env_logger = ["dep:pretty_env_logger"]
clap-verbosity-flag = ["dep:clap-verbosity-flag"]

[lib]
name = "languagetool_rust"
Expand All @@ -72,7 +95,7 @@ license = "MIT"
name = "languagetool-rust"
readme = "README.md"
repository = "https://github.com/jeertmans/languagetool-rust"
rust-version = "1.77.2"
rust-version = "1.80.0"
version = "2.1.4"

[package.metadata.release]
Expand Down
102 changes: 100 additions & 2 deletions src/api/check.rs
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,62 @@ pub struct Data<'source> {
pub annotation: Vec<DataAnnotation<'source>>,
}

impl Data<'_> {
    /// Split data into as few fragments as possible, where each fragment
    /// contains (if possible) a maximum of `n` bytes in its annotations'
    /// markup and text fields (lengths are measured with [`str::len`]).
    ///
    /// Pattern str `pat` is used for splitting: a fragment may only end right
    /// after an annotation whose text contains `pat`. When no such annotation
    /// is available, a fragment may therefore be longer than `n`.
    ///
    /// Every annotation of `self` ends up in exactly one fragment and the
    /// original order is preserved, so the returned vector always contains
    /// at least one fragment (the whole data when no split is needed).
    #[must_use]
    pub fn split(self, n: usize, pat: &str) -> Vec<Self> {
        // Build vec of candidate break points: the annotation index together
        // with the cumulative length of text + markup up to and including it.
        let mut break_point_lengths = vec![];
        let mut len = 0;
        for (i, ann) in self.annotation.iter().enumerate() {
            len += ann.text.as_deref().unwrap_or("").len()
                + ann.markup.as_deref().unwrap_or("").len();
            if ann.text.as_ref().is_some_and(|t| t.contains(pat)) {
                break_point_lengths.push((i, len));
            }
        }
        let total_len = len;
        let annotation_count = self.annotation.len();

        // Decide which break points to split at. For each candidate, look at
        // the next boundary (the following candidate, or the end of the data
        // for the last one): if extending the current fragment up to that
        // boundary would exceed `n`, end the fragment at this candidate.
        let next_boundaries = break_point_lengths
            .iter()
            .skip(1)
            .map(|&(_, cumulative)| cumulative)
            .chain(std::iter::once(total_len));
        let mut break_points: Vec<usize> = vec![];
        // Cumulative length at which the current fragment starts.
        let mut base = 0;
        for (&(index, cumulative), next_len) in break_point_lengths.iter().zip(next_boundaries) {
            // Never split after the very last annotation: that would only
            // produce an empty trailing fragment.
            if next_len - base > n && index + 1 < annotation_count {
                break_points.push(index);
                base = cumulative;
            }
        }

        // Split annotations based on calculated break points.
        let mut fragments = Vec::with_capacity(break_points.len() + 1);
        let mut annotations = self.annotation.into_iter();
        let mut taken = 0;
        for break_point in break_points {
            let count = break_point + 1 - taken;
            fragments.push(Data::from_iter(annotations.by_ref().take(count)));
            taken = break_point + 1;
        }
        // Whatever follows the last break point forms the final fragment;
        // dropping it would silently lose part of the input.
        fragments.push(Data::from_iter(annotations));

        fragments
    }
}

impl IntoStatic for Data<'_> {
type Static = Data<'static>;
fn into_static(self) -> Self::Static {
Expand All @@ -259,6 +315,15 @@ impl<'source, T: Into<DataAnnotation<'source>>> FromIterator<T> for Data<'source
}
}

/// Consuming a [`Data`] yields the annotations it owns, front to back.
impl<'source> IntoIterator for Data<'source> {
    type IntoIter = std::vec::IntoIter<Self::Item>;
    type Item = DataAnnotation<'source>;

    fn into_iter(self) -> Self::IntoIter {
        // Move the annotation vector out of `self` and iterate it by value.
        let annotations = self.annotation;
        annotations.into_iter()
    }
}

impl Serialize for Data<'_> {
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
where
Expand Down Expand Up @@ -601,10 +666,20 @@ impl<'source> Request<'source> {
///
/// # Errors
///
/// If `self.text` is none.
/// If `self.text` is [`None`] and `self.data` is [`None`].
pub fn try_split(mut self, n: usize, pat: &str) -> Result<Vec<Self>> {
// DATA ANNOTATIONS
if let Some(data) = mem::take(&mut self.data) {
return Ok(data
.split(n, pat)
.into_iter()
.map(|d| self.clone().with_data(d))
.collect());
}

// TEXT
let text = mem::take(&mut self.text)
.ok_or_else(|| Error::InvalidRequest("missing text field".to_string()))?;
.ok_or_else(|| Error::InvalidRequest("missing text or data field".to_string()))?;
let string: &str = match &text {
Cow::Owned(s) => s.as_str(),
Cow::Borrowed(s) => s,
Expand Down Expand Up @@ -944,6 +1019,29 @@ impl Response {
}
annotation
}

/// Merges the given [`Response`] into the current one and returns the
/// result.
///
/// The matches of `other` are moved to the end of `self.matches`. When the
/// `unstable` feature is enabled, the sentence ranges of `other` (if any)
/// are appended to those of `self`, or adopted as-is when `self` has none.
/// No other field of `other` is read.
///
/// This is especially useful when a request was split into multiple
/// requests.
#[must_use]
pub fn append(mut self, mut other: Self) -> Self {
    #[cfg(feature = "unstable")]
    if let Some(mut theirs) = other.sentence_ranges.take() {
        if let Some(ours) = self.sentence_ranges.as_mut() {
            ours.append(&mut theirs);
        } else {
            // Nothing to extend on our side: take over the other ranges.
            self.sentence_ranges = Some(theirs);
        }
    }

    let incoming = &mut other.matches;
    self.matches.append(incoming);

    self
}
}

/// Check response with additional context.
Expand Down
55 changes: 48 additions & 7 deletions src/api/server.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
//! Structure to communicate with some `LanguageTool` server through the API.

#[cfg(feature = "multithreaded")]
use crate::api::check;
use crate::{
api::{
check::{self, Request, Response},
check::{Request, Response},
languages, words,
},
error::{Error, Result},
};
#[cfg(feature = "cli")]
use clap::Args;
#[cfg(feature = "multithreaded")]
use lifetime::IntoStatic;
use reqwest::{
header::{HeaderValue, ACCEPT},
Expand Down Expand Up @@ -423,15 +426,14 @@ impl ServerClient {
"no request; cannot join zero request".to_string(),
));
}
let mut tasks = Vec::with_capacity(requests.len());

requests
let tasks = requests
.into_iter()
.map(|r| r.into_static())
.for_each(|request| {
.map(|request| {
let server_client = self.clone();

tasks.push(tokio::spawn(async move {
tokio::spawn(async move {
let response = server_client.check(&request).await?;
let text = request.text.ok_or_else(|| {
Error::InvalidRequest(
Expand All @@ -440,9 +442,9 @@ impl ServerClient {
)
})?;
Result::<(Cow<'static, str>, Response)>::Ok((text, response))
}));
})
});

let mut response_with_context: Option<check::ResponseWithContext> = None;

for task in tasks {
Expand All @@ -457,6 +459,45 @@ impl ServerClient {
Ok(response_with_context.unwrap())
}

/// Send multiple check requests and join them into a single response,
/// without any context.
///
/// Each request is spawned as a task only when the previous one has been
/// awaited, so requests are made in sequence and the responses are joined
/// in the order of `requests`.
///
/// # Errors
///
/// If `requests` is empty, or if checking any of the requests fails.
///
/// # Panics
///
/// If one of the spawned tasks cannot be joined.
#[cfg(feature = "multithreaded")]
pub async fn check_multiple_and_join_without_context<'source>(
    &self,
    requests: Vec<Request<'source>>,
) -> Result<check::Response> {
    let mut joined: Option<check::Response> = None;

    // `map` is lazy: a task is only spawned when the loop below asks for it.
    let tasks = requests
        .into_iter()
        .map(|r| r.into_static())
        .map(|request| {
            let server_client = self.clone();

            tokio::spawn(async move {
                let response = server_client.check(&request).await?;
                Result::<Response>::Ok(response)
            })
        });

    // Make requests in sequence
    for task in tasks {
        let resp = task.await.unwrap()?;

        joined = Some(match joined {
            Some(r) => r.append(resp),
            None => resp,
        });
    }

    // `joined` is only `None` when `requests` was empty: report it as an
    // invalid request instead of panicking.
    joined.ok_or_else(|| {
        Error::InvalidRequest("no request; cannot join zero request".to_string())
    })
}

/// Send a check request to the server, await for the response and annotate
/// it.
#[cfg(feature = "annotate")]
Expand Down
17 changes: 7 additions & 10 deletions src/cli/check.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ use crate::{
server::ServerClient,
},
error::{Error, Result},
parsers::{parse_html, parse_markdown, parse_typst},
parsers::{html::parse_html, markdown::parse_markdown, typst::parse_typst},
};

use super::ExecuteSubcommand;
Expand Down Expand Up @@ -175,6 +175,7 @@ impl ExecuteSubcommand for Command {
};

let file_content = std::fs::read_to_string(filename)?;

let (response, text): (check::Response, String) = match &file_type {
FileType::Auto => unreachable!(),
FileType::Raw => {
Expand All @@ -192,18 +193,14 @@ impl ExecuteSubcommand for Command {
FileType::Typst | FileType::Markdown | FileType::Html => {
let data = match file_type {
FileType::Typst => parse_typst(&file_content),
FileType::Html => {
let text = parse_html(&file_content);
Data::from_iter([DataAnnotation::new_text(text)])
},
FileType::Markdown => {
let text = parse_markdown(&file_content);
Data::from_iter([DataAnnotation::new_text(text)])
},
FileType::Html => parse_html(&file_content),
FileType::Markdown => parse_markdown(&file_content),
_ => unreachable!(),
};
let requests = (request.clone().with_data(data))
.split(self.max_length, self.split_pattern.as_str());
let response = server_client
.check(&request.clone().with_data(data))
.check_multiple_and_join_without_context(requests)
.await?;
(response, file_content)
},
Expand Down
Loading
Loading