From 93cf67e0cb6d7bd1cf44ce399c6691e89a353e5b Mon Sep 17 00:00:00 2001 From: c Date: Wed, 14 May 2025 13:09:21 +0200 Subject: [PATCH 01/13] fix wrong attribute from outer to inner --- src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 4c44681..a181672 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,7 +5,7 @@ */ #![doc = include_str!("../README.md")] #![deny(rust_2018_idioms)] -#[forbid(unsafe_code)] +#![forbid(unsafe_code)] pub mod core; pub mod decoders; pub mod mailbox; From f3d14758785778a97aa9c394dcc46437ed6d879e Mon Sep 17 00:00:00 2001 From: c Date: Wed, 14 May 2025 13:16:27 +0200 Subject: [PATCH 02/13] add private module with struct Html to fix wrong html charset --- src/core/html.rs | 85 ++++++++++++++++++++++++++++++++++++++++++++++++ src/core/mod.rs | 3 ++ src/lib.rs | 2 ++ 3 files changed, 90 insertions(+) create mode 100644 src/core/html.rs diff --git a/src/core/html.rs b/src/core/html.rs new file mode 100644 index 0000000..82e2888 --- /dev/null +++ b/src/core/html.rs @@ -0,0 +1,85 @@ +use std::borrow::Cow; + +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; + +#[derive(Debug, PartialEq, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +pub struct Html<'x>(Cow<'x, str>); + +impl<'x> Html<'x> { + pub fn make_owned(self) -> Html<'static> { + Html(self.0.into_owned().into()) + } + pub fn new(html: Cow<'x, str>) -> Html<'x> { + Html(html) + } + /// Access the raw html with a potentially wrong charset. + /// + /// `mail-parser` only returns utf-8 strings, so the only sensible charset for the html is utf-8. Because html can declare its charset in `` tags, in the process of transcoding to utf-8 these may be incorrect. + /// Call [`Html::strip_charset`] before this method if the html will be given to a standard-conforming browser. + pub fn potentially_wrong_charset(&self) -> &Cow<'x, str> { + &self.0 + } + /// Strip charset from html, making it utf-8 by default. + /// + /// Call this method if the result is given to a standard-conforming browser. + pub fn strip_charset(&mut self) { + let mut off = 0; + let mut first = true; + let mut found = None; + 'meta: for part in self.0.split("') else { + return; + }; + for w in between.as_bytes().windows(b"charset".len()) { + if w.eq_ignore_ascii_case(b"charset") { + found = Some((off, off + "".len())); + break 'meta; + } + } + off += " Cow<'_, str> { + let mut html = Html(html.into()); + html.strip_charset(); + html.potentially_wrong_charset().clone() + } + + #[test] + fn strip_charset() { + assert_eq!( + strip(""), + "" + ); + + let stripped = strip(""); + assert_eq!(stripped, ""); + + let stripped = strip(""); + assert_eq!(stripped, ""); + + let stripped = strip(""); + assert_eq!(stripped, ""); + + let stripped = strip(""); + assert_eq!(stripped, ""); + + let stripped = strip(""); + assert_eq!(stripped, ""); + } +} diff --git a/src/core/mod.rs b/src/core/mod.rs index 541addd..c1c7f1d 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -8,6 +8,9 @@ pub mod address; pub mod body; pub mod builder; pub mod header; +mod html; pub mod message; #[cfg(feature = "rkyv")] pub mod rkyv; + +pub use html::Html; diff --git a/src/lib.rs b/src/lib.rs index a181672..c13ec13 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -11,6 +11,8 @@ pub mod decoders; pub mod mailbox; pub mod parsers; + +use core::Html; use parsers::MessageStream; use std::{borrow::Cow, collections::HashMap, hash::Hash, net::IpAddr}; From 4fe30a6f6addf5c33b1f98265f49bbcae9f8696b Mon Sep 17 00:00:00 2001 From: c Date: Wed, 14 May 2025 13:24:54 +0200 Subject: [PATCH 03/13] replace PartType::Html(Cow<'x, str>) with PartType::Html(Html<'x>) --- examples/message_parse.rs | 7 +++++-- src/core/body.rs | 3 ++- src/core/header.rs | 11 +++++++---- src/core/message.rs | 10 +++++----- src/lib.rs | 2 +- src/parsers/message.rs | 6 +++--- src/parsers/preview.rs | 8 ++++++-- tests/integration_test.rs | 7 +++++-- 8 files changed, 34 insertions(+), 20 deletions(-) diff --git a/examples/message_parse.rs b/examples/message_parse.rs index 1b450f6..0baec30 100644 --- a/examples/message_parse.rs +++ b/examples/message_parse.rs @@ -89,7 +89,7 @@ R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7 // HTML and text body parts are returned conforming to RFC8621, Section 4.1.4 assert_eq!( - message.body_html(0).unwrap(), + message.body_html(0).unwrap().potentially_wrong_charset(), concat!( "

I was thinking about quitting the “exporting” to ", "focus just on the “importing”,

but then I thought,", @@ -120,7 +120,10 @@ R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7 "ℌ𝔢𝔩𝔭 𝔪𝔢 𝔢𝔵𝔭𝔬𝔯𝔱 𝔪𝔶 𝔟𝔬𝔬𝔨 𝔭𝔩𝔢𝔞𝔰𝔢!" ); assert_eq!( - nested_message.body_html(0).unwrap(), + nested_message + .body_html(0) + .unwrap() + .potentially_wrong_charset(), "ℌ𝔢𝔩𝔭 𝔪𝔢 𝔢𝔵𝔭𝔬𝔯𝔱 𝔪𝔶 𝔟𝔬𝔬𝔨 𝔭𝔩𝔢𝔞𝔰𝔢!" ); diff --git a/src/core/body.rs b/src/core/body.rs index c1ab74e..c194608 100644 --- a/src/core/body.rs +++ b/src/core/body.rs @@ -10,7 +10,8 @@ impl PartType<'_> { #[allow(clippy::len_without_is_empty)] pub fn len(&self) -> usize { match self { - PartType::Text(v) | PartType::Html(v) => v.len(), + PartType::Text(v) => v.len(), + PartType::Html(v) => v.potentially_wrong_charset().len(), PartType::Binary(v) | PartType::InlineBinary(v) => v.len(), PartType::Message(v) => v.raw_message.len(), PartType::Multipart(_) => 0, diff --git a/src/core/header.rs b/src/core/header.rs index 54e9c49..113fa60 100644 --- a/src/core/header.rs +++ b/src/core/header.rs @@ -649,7 +649,8 @@ impl<'x> MessagePart<'x> { /// Returns the body part's contents as a `u8` slice pub fn contents(&self) -> &[u8] { match &self.body { - PartType::Text(text) | PartType::Html(text) => text.as_bytes(), + PartType::Text(text) => text.as_bytes(), + PartType::Html(text) => text.potentially_wrong_charset().as_bytes(), PartType::Binary(bin) | PartType::InlineBinary(bin) => bin.as_ref(), PartType::Message(message) => message.raw_message(), PartType::Multipart(_) => b"", @@ -659,7 +660,8 @@ impl<'x> MessagePart<'x> { /// Returns the body part's contents as a `str` pub fn text_contents(&self) -> Option<&str> { match &self.body { - PartType::Text(text) | PartType::Html(text) => text.as_ref().into(), + PartType::Text(text) => text.as_ref().into(), + PartType::Html(text) => text.potentially_wrong_charset().as_ref().into(), PartType::Binary(bin) | PartType::InlineBinary(bin) => { std::str::from_utf8(bin.as_ref()).ok() } @@ -689,7 +691,8 @@ impl<'x> MessagePart<'x> { /// Returns the body part's length pub fn len(&self) -> usize { match &self.body { - PartType::Text(text) | PartType::Html(text) => text.len(), + PartType::Text(text) => text.len(), + PartType::Html(text) => text.potentially_wrong_charset().len(), PartType::Binary(bin) | PartType::InlineBinary(bin) => bin.len(), PartType::Message(message) => message.raw_message().len(), PartType::Multipart(_) => 0, @@ -758,7 +761,7 @@ impl<'x> MessagePart<'x> { is_encoding_problem: self.is_encoding_problem, body: match self.body { PartType::Text(v) => PartType::Text(v.into_owned().into()), - PartType::Html(v) => PartType::Html(v.into_owned().into()), + PartType::Html(v) => PartType::Html(v.make_owned()), PartType::Binary(v) => PartType::Binary(v.into_owned().into()), PartType::InlineBinary(v) => PartType::InlineBinary(v.into_owned().into()), PartType::Message(v) => PartType::Message(v.into_owned()), diff --git a/src/core/message.rs b/src/core/message.rs index e839302..553c27e 100644 --- a/src/core/message.rs +++ b/src/core/message.rs @@ -14,7 +14,7 @@ use crate::{ MessageStream, }, Address, AttachmentIterator, BodyPartIterator, DateTime, GetHeader, Header, HeaderForm, - HeaderName, HeaderValue, Message, MessageParser, MessagePart, PartType, Received, + HeaderName, HeaderValue, Html, Message, MessageParser, MessagePart, PartType, Received, }; impl<'x> Message<'x> { @@ -391,11 +391,11 @@ impl<'x> Message<'x> { } /// Returns a message body part as text/plain - pub fn body_html(&'x self, pos: usize) -> Option> { + pub fn body_html(&'x self, pos: usize) -> Option> { let part = self.parts.get(*self.html_body.get(pos)? as usize)?; match &part.body { - PartType::Html(html) => Some(html.as_ref().into()), - PartType::Text(text) => Some(text_to_html(text.as_ref()).into()), + PartType::Html(html) => Some(html.to_owned()), + PartType::Text(text) => Some(Html::new(text_to_html(text.as_ref()).into())), _ => None, } } @@ -405,7 +405,7 @@ impl<'x> Message<'x> { let part = self.parts.get(*self.text_body.get(pos)? as usize)?; match &part.body { PartType::Text(text) => Some(text.as_ref().into()), - PartType::Html(html) => Some(html_to_text(html.as_ref()).into()), + PartType::Html(html) => Some(html_to_text(html.potentially_wrong_charset()).into()), _ => None, } } diff --git a/src/lib.rs b/src/lib.rs index c13ec13..c59a6a0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -134,7 +134,7 @@ pub enum PartType<'x> { Text(#[cfg_attr(feature = "rkyv", rkyv(with = rkyv::with::AsOwned))] Cow<'x, str>), /// A text/html part - Html(#[cfg_attr(feature = "rkyv", rkyv(with = rkyv::with::AsOwned))] Cow<'x, str>), + Html(#[cfg_attr(feature = "rkyv", rkyv(with = rkyv::with::AsOwned))] Html<'x>), /// Any other part type that is not text. Binary(#[cfg_attr(feature = "rkyv", rkyv(with = rkyv::with::AsOwned))] Cow<'x, [u8]>), diff --git a/src/parsers/message.rs b/src/parsers/message.rs index 4ff25f9..d91b013 100644 --- a/src/parsers/message.rs +++ b/src/parsers/message.rs @@ -8,8 +8,8 @@ use std::borrow::Cow; use crate::{ decoders::{charsets::map::charset_decoder, DecodeFnc}, - ContentType, Encoding, GetHeader, HeaderName, HeaderValue, Message, MessageParser, MessagePart, - MessagePartId, PartType, + ContentType, Encoding, GetHeader, HeaderName, HeaderValue, Html, Message, MessageParser, + MessagePart, MessagePartId, PartType, }; use super::MessageStream; @@ -337,7 +337,7 @@ impl MessageParser { } if is_html { - PartType::Html(text) + PartType::Html(Html::new(text)) } else { PartType::Text(text) } diff --git a/src/parsers/preview.rs b/src/parsers/preview.rs index 7941618..a3e6ad9 100644 --- a/src/parsers/preview.rs +++ b/src/parsers/preview.rs @@ -7,9 +7,13 @@ use std::borrow::Cow; use crate::decoders::html::html_to_text; +use crate::Html; -pub fn preview_html<'x>(html: Cow<'_, str>, max_len: usize) -> Cow<'x, str> { - preview_text(html_to_text(html.as_ref()).into(), max_len) +pub fn preview_html<'x>(html: Html<'_>, max_len: usize) -> Cow<'x, str> { + preview_text( + html_to_text(html.potentially_wrong_charset()).into(), + max_len, + ) } pub fn preview_text<'x>(text: Cow<'_, str>, mut max_len: usize) -> Cow<'x, str> { diff --git a/tests/integration_test.rs b/tests/integration_test.rs index e394801..c4744be 100644 --- a/tests/integration_test.rs +++ b/tests/integration_test.rs @@ -108,7 +108,7 @@ R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7 ); assert_eq!( - message.body_html(0).unwrap(), + message.body_html(0).unwrap().potentially_wrong_charset(), concat!( "

I was thinking about quitting the “exporting” to ", "focus just on the “importing”,

but then I thought,", @@ -137,7 +137,10 @@ R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7 ); assert_eq!( - nested_message.body_html(0).unwrap(), + nested_message + .body_html(0) + .unwrap() + .potentially_wrong_charset(), "ℌ𝔢𝔩𝔭 𝔪𝔢 𝔢𝔵𝔭𝔬𝔯𝔱 𝔪𝔶 𝔟𝔬𝔬𝔨 𝔭𝔩𝔢𝔞𝔰𝔢!" ); From 092b6c3bf6b900202ed2f52d96a6781967c73525 Mon Sep 17 00:00:00 2001 From: c Date: Wed, 14 May 2025 13:57:07 +0200 Subject: [PATCH 04/13] make modules private that have no public items --- src/core/mod.rs | 10 +++++----- src/decoders/charsets/mod.rs | 6 +++--- src/decoders/mod.rs | 2 +- src/parsers/fields/mod.rs | 12 ++++++------ src/parsers/mod.rs | 6 +++--- 5 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/core/mod.rs b/src/core/mod.rs index c1c7f1d..4751743 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -4,12 +4,12 @@ * SPDX-License-Identifier: Apache-2.0 OR MIT */ -pub mod address; -pub mod body; -pub mod builder; -pub mod header; +mod address; +mod body; +mod builder; +mod header; mod html; -pub mod message; +mod message; #[cfg(feature = "rkyv")] pub mod rkyv; diff --git a/src/decoders/charsets/mod.rs b/src/decoders/charsets/mod.rs index cd8a61f..3c3f422 100644 --- a/src/decoders/charsets/mod.rs +++ b/src/decoders/charsets/mod.rs @@ -5,9 +5,9 @@ */ pub mod map; -pub mod multi_byte; -pub mod single_byte; -pub mod utf; +mod multi_byte; +mod single_byte; +mod utf; pub type DecoderFnc = fn(&[u8]) -> String; diff --git a/src/decoders/mod.rs b/src/decoders/mod.rs index e5f8a28..8a6b0bd 100644 --- a/src/decoders/mod.rs +++ b/src/decoders/mod.rs @@ -10,7 +10,7 @@ use crate::parsers::MessageStream; pub mod base64; pub mod charsets; -pub mod encoded_word; +mod encoded_word; pub mod hex; pub mod html; pub mod quoted_printable; diff --git a/src/parsers/fields/mod.rs b/src/parsers/fields/mod.rs index 299c059..a663393 100644 --- a/src/parsers/fields/mod.rs +++ b/src/parsers/fields/mod.rs @@ -5,14 +5,14 @@ */ pub mod address; -pub mod content_type; +mod content_type; pub mod date; -pub mod id; -pub mod list; -pub mod raw; -pub mod received; +mod id; +mod list; +mod raw; +mod received; pub mod thread; -pub mod unstructured; +mod unstructured; #[cfg(test)] use serde::{Deserialize, Serialize}; diff --git a/src/parsers/mod.rs b/src/parsers/mod.rs index 0f2574a..6fe0e74 100644 --- a/src/parsers/mod.rs +++ b/src/parsers/mod.rs @@ -7,9 +7,9 @@ use std::{iter::Peekable, ops::Range, slice::Iter}; pub mod fields; -pub mod header; -pub mod message; -pub mod mime; +mod header; +mod message; +mod mime; pub mod preview; pub struct MessageStream<'x> { From 468ff8fe21aff217b1c9180b1d7f1146909accb5 Mon Sep 17 00:00:00 2001 From: c Date: Wed, 14 May 2025 13:57:25 +0200 Subject: [PATCH 05/13] fmt --- src/lib.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index c59a6a0..4201079 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -11,7 +11,6 @@ pub mod decoders; pub mod mailbox; pub mod parsers; - use core::Html; use parsers::MessageStream; use std::{borrow::Cow, collections::HashMap, hash::Hash, net::IpAddr}; From 02189487ace91a7e696f9e73bbb2b23eec270404 Mon Sep 17 00:00:00 2001 From: c Date: Wed, 14 May 2025 15:46:34 +0200 Subject: [PATCH 06/13] dont make more methods pub than necessary --- src/core/html.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/core/html.rs b/src/core/html.rs index 82e2888..917a280 100644 --- a/src/core/html.rs +++ b/src/core/html.rs @@ -8,10 +8,10 @@ use serde::{Deserialize, Serialize}; pub struct Html<'x>(Cow<'x, str>); impl<'x> Html<'x> { - pub fn make_owned(self) -> Html<'static> { + pub(crate) fn make_owned(self) -> Html<'static> { Html(self.0.into_owned().into()) } - pub fn new(html: Cow<'x, str>) -> Html<'x> { + pub(crate) fn new(html: Cow<'x, str>) -> Html<'x> { Html(html) } /// Access the raw html with a potentially wrong charset. From 9fe77ed675d34ed4b384ea6d453a43aeda832737 Mon Sep 17 00:00:00 2001 From: c Date: Wed, 14 May 2025 15:52:07 +0200 Subject: [PATCH 07/13] improve wording --- src/core/html.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/core/html.rs b/src/core/html.rs index 917a280..db47f60 100644 --- a/src/core/html.rs +++ b/src/core/html.rs @@ -14,16 +14,16 @@ impl<'x> Html<'x> { pub(crate) fn new(html: Cow<'x, str>) -> Html<'x> { Html(html) } - /// Access the raw html with a potentially wrong charset. + /// Access the raw html with the original charset. /// - /// `mail-parser` only returns utf-8 strings, so the only sensible charset for the html is utf-8. Because html can declare its charset in `` tags, in the process of transcoding to utf-8 these may be incorrect. - /// Call [`Html::strip_charset`] before this method if the html will be given to a standard-conforming browser. + /// `mail-parser` returns utf-8 strings, so the only correct charset for the html is utf-8. Because html can declare its charset in `` tags, in the process of transcoding these may become incorrect. + /// If the correct charset is needed [`Html::strip_charset`] must be called before accessing the html with this method. pub fn potentially_wrong_charset(&self) -> &Cow<'x, str> { &self.0 } /// Strip charset from html, making it utf-8 by default. /// - /// Call this method if the result is given to a standard-conforming browser. + /// This method should be called if the consumer of the html is a standard-conforming browser. pub fn strip_charset(&mut self) { let mut off = 0; let mut first = true; From 1ee08927a423e8a75344712ef4bcd7b8f00423aa Mon Sep 17 00:00:00 2001 From: c Date: Wed, 14 May 2025 16:13:17 +0200 Subject: [PATCH 08/13] cover rkyv feature by CI --- .github/workflows/rust.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 5c2da0c..1e4cae0 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -17,6 +17,6 @@ jobs: steps: - uses: actions/checkout@v2 - name: Build - run: cargo build --verbose + run: cargo build --verbose --all-features - name: Run tests - run: cargo test --verbose + run: cargo test --verbose --all-features From c9167e9b4e71fc84df46638644e437ca7359690c Mon Sep 17 00:00:00 2001 From: c Date: Thu, 15 May 2025 14:38:15 +0200 Subject: [PATCH 09/13] fix: some html may contain multiple variants of the charset declaration --- src/core/html.rs | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/core/html.rs b/src/core/html.rs index db47f60..3cc789f 100644 --- a/src/core/html.rs +++ b/src/core/html.rs @@ -27,16 +27,15 @@ impl<'x> Html<'x> { pub fn strip_charset(&mut self) { let mut off = 0; let mut first = true; - let mut found = None; - 'meta: for part in self.0.split("') else { return; }; for w in between.as_bytes().windows(b"charset".len()) { if w.eq_ignore_ascii_case(b"charset") { - found = Some((off, off + "".len())); - break 'meta; + found.push((off, off + "".len())); } } off += " Html<'x> { off += part.len(); first = false; } - if let Some((start, end)) = found { - self.0.to_mut().replace_range(start..end, ""); + let mut deleted = 0; + for (start, end) in found { + self.0 + .to_mut() + .replace_range(start - deleted..end - deleted, ""); + deleted += end - start; } } } @@ -81,5 +84,8 @@ mod tests { let stripped = strip(""); assert_eq!(stripped, ""); + + let stripped = strip(""); + assert_eq!(stripped, ""); } } From 58d0a3b1fd86146bc005642b0b0dde76b3a514b5 Mon Sep 17 00:00:00 2001 From: c Date: Thu, 15 May 2025 14:45:01 +0200 Subject: [PATCH 10/13] replace the charset with utf-8 --- src/core/html.rs | 73 ++++++++++++++++++++++++++++++------------------ 1 file changed, 46 insertions(+), 27 deletions(-) diff --git a/src/core/html.rs b/src/core/html.rs index 3cc789f..86a3089 100644 --- a/src/core/html.rs +++ b/src/core/html.rs @@ -16,15 +16,15 @@ impl<'x> Html<'x> { } /// Access the raw html with the original charset. /// - /// `mail-parser` returns utf-8 strings, so the only correct charset for the html is utf-8. Because html can declare its charset in `` tags, in the process of transcoding these may become incorrect. - /// If the correct charset is needed [`Html::strip_charset`] must be called before accessing the html with this method. + /// `mail-parser` returns utf-8 strings, so the only correct charset for the html is utf-8. Because html can declare its charset in `` tags, these may be incorrect after transcoding. + /// If the charset must be correct call [`Html::fix_charset`] before accessing the html with this method. pub fn potentially_wrong_charset(&self) -> &Cow<'x, str> { &self.0 } - /// Strip charset from html, making it utf-8 by default. + /// Replace charset with `utf-8`. /// /// This method should be called if the consumer of the html is a standard-conforming browser. - pub fn strip_charset(&mut self) { + pub fn fix_charset(&mut self) { let mut off = 0; let mut first = true; let mut found = Vec::with_capacity(2); @@ -35,7 +35,10 @@ impl<'x> Html<'x> { }; for w in between.as_bytes().windows(b"charset".len()) { if w.eq_ignore_ascii_case(b"charset") { - found.push((off, off + "".len())); + found.push(( + off as isize, + (off + "".len()) as isize, + )); } } off += " Html<'x> { off += part.len(); first = false; } - let mut deleted = 0; + let mut deleted: isize = 0; + let mut first = true; for (start, end) in found { - self.0 - .to_mut() - .replace_range(start - deleted..end - deleted, ""); - deleted += end - start; + let mut replace = ""; + if first { + replace = ""; + } + self.0.to_mut().replace_range( + (start - deleted) as usize..(end - deleted) as usize, + replace, + ); + deleted += end - start - replace.len() as isize; + first = false; } } } @@ -57,35 +67,44 @@ impl<'x> Html<'x> { mod tests { use super::*; - fn strip(html: &str) -> Cow<'_, str> { + fn fix(html: &str) -> Cow<'_, str> { let mut html = Html(html.into()); - html.strip_charset(); + html.fix_charset(); html.potentially_wrong_charset().clone() } #[test] - fn strip_charset() { + fn fix_charset() { assert_eq!( - strip(""), - "" + fix(""), + "" ); - let stripped = strip(""); - assert_eq!(stripped, ""); + let fixed = fix(""); + assert_eq!(fixed, ""); - let stripped = strip(""); - assert_eq!(stripped, ""); + let fixed = fix(""); + assert_eq!(fixed, ""); - let stripped = strip(""); - assert_eq!(stripped, ""); + let fixed = fix(""); + assert_eq!(fixed, ""); - let stripped = strip(""); - assert_eq!(stripped, ""); + let fixed = fix(""); + assert_eq!( + fixed, + "" + ); - let stripped = strip(""); - assert_eq!(stripped, ""); + let fixed = fix(""); + assert_eq!( + fixed, + "" + ); - let stripped = strip(""); - assert_eq!(stripped, ""); + let fixed = fix(""); + assert_eq!( + fixed, + "" + ); } } From f174c3eb84e683882b5cac88396875aa95f56fbe Mon Sep 17 00:00:00 2001 From: c Date: Thu, 15 May 2025 15:14:25 +0200 Subject: [PATCH 11/13] test that malformed tags are not modified --- src/core/html.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/core/html.rs b/src/core/html.rs index 86a3089..582af05 100644 --- a/src/core/html.rs +++ b/src/core/html.rs @@ -106,5 +106,11 @@ mod tests { fixed, "" ); + + let malformed = fix(""); + assert_eq!( + malformed, + "" + ); } } From 23eab1716be2290491d177d407a2d78946bcabde Mon Sep 17 00:00:00 2001 From: c Date: Thu, 15 May 2025 15:15:46 +0200 Subject: [PATCH 12/13] fix: add space to tag to prevent from being stripped --- src/core/html.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/core/html.rs b/src/core/html.rs index 582af05..377db91 100644 --- a/src/core/html.rs +++ b/src/core/html.rs @@ -28,7 +28,7 @@ impl<'x> Html<'x> { let mut off = 0; let mut first = true; let mut found = Vec::with_capacity(2); - for part in self.0.split("') else { return; @@ -37,11 +37,12 @@ impl<'x> Html<'x> { if w.eq_ignore_ascii_case(b"charset") { found.push(( off as isize, - (off + "".len()) as isize, + (off + "".len()) as isize, )); + break; } } - off += "" ); + + let fixed = fix(""); + assert_eq!(fixed, ""); } } From e9138b2b9504ff044a36adb6b52f137a6bf9da4c Mon Sep 17 00:00:00 2001 From: c Date: Thu, 15 May 2025 15:32:37 +0200 Subject: [PATCH 13/13] fix: rustdoc warnings --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8a7d56a..340ecef 100644 --- a/README.md +++ b/README.md @@ -270,8 +270,8 @@ Supported character sets via the optional dependency [encoding_rs](https://crate Licensed under either of - * Apache License, Version 2.0 ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) - * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) + * Apache License, Version 2.0 ([LICENSE-APACHE](LICENSE-APACHE) or ) + * MIT license ([LICENSE-MIT](LICENSE-MIT) or ) at your option.