From 93cf67e0cb6d7bd1cf44ce399c6691e89a353e5b Mon Sep 17 00:00:00 2001
From: c <c@farsight.net>
Date: Wed, 14 May 2025 13:09:21 +0200
Subject: [PATCH 01/13] fix forbid(unsafe_code): change outer attribute to inner

---
 src/lib.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/lib.rs b/src/lib.rs
index 4c44681..a181672 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -5,7 +5,7 @@
  */
 #![doc = include_str!("../README.md")]
 #![deny(rust_2018_idioms)]
-#[forbid(unsafe_code)]
+#![forbid(unsafe_code)]
 pub mod core;
 pub mod decoders;
 pub mod mailbox;

From f3d14758785778a97aa9c394dcc46437ed6d879e Mon Sep 17 00:00:00 2001
From: c <c@farsight.net>
Date: Wed, 14 May 2025 13:16:27 +0200
Subject: [PATCH 02/13] add private module with struct Html to fix wrong html
 charset

---
 src/core/html.rs | 85 ++++++++++++++++++++++++++++++++++++++++++++++++
 src/core/mod.rs  |  3 ++
 src/lib.rs       |  2 ++
 3 files changed, 90 insertions(+)
 create mode 100644 src/core/html.rs

diff --git a/src/core/html.rs b/src/core/html.rs
new file mode 100644
index 0000000..82e2888
--- /dev/null
+++ b/src/core/html.rs
@@ -0,0 +1,85 @@
+use std::borrow::Cow;
+
+#[cfg(feature = "serde")]
+use serde::{Deserialize, Serialize};
+
+#[derive(Debug, PartialEq, Clone)]
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+pub struct Html<'x>(Cow<'x, str>);
+
+impl<'x> Html<'x> {
+    pub fn make_owned(self) -> Html<'static> {
+        Html(self.0.into_owned().into())
+    }
+    pub fn new(html: Cow<'x, str>) -> Html<'x> {
+        Html(html)
+    }
+    /// Access the raw html with a potentially wrong charset.
+    ///
+    /// `mail-parser` only returns utf-8 strings, so the only sensible charset for the html is utf-8. Because html can declare its charset in `<meta>` tags, in the process of transcoding to utf-8 these may be incorrect.
+    /// Call [`Html::strip_charset`] before this method if the html will be given to a standard-conforming browser.
+    pub fn potentially_wrong_charset(&self) -> &Cow<'x, str> {
+        &self.0
+    }
+    /// Strip charset from html, making it utf-8 by default.
+    ///
+    /// Call this method if the result is given to a standard-conforming browser.
+    pub fn strip_charset(&mut self) {
+        let mut off = 0;
+        let mut first = true;
+        let mut found = None;
+        'meta: for part in self.0.split("<meta") {
+            if !first {
+                let Some((between, _)) = part.split_once('>') else {
+                    return;
+                };
+                for w in between.as_bytes().windows(b"charset".len()) {
+                    if w.eq_ignore_ascii_case(b"charset") {
+                        found = Some((off, off + "<meta".len() + between.len() + ">".len()));
+                        break 'meta;
+                    }
+                }
+                off += "<meta".len();
+            }
+            off += part.len();
+            first = false;
+        }
+        if let Some((start, end)) = found {
+            self.0.to_mut().replace_range(start..end, "");
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn strip(html: &str) -> Cow<'_, str> {
+        let mut html = Html(html.into());
+        html.strip_charset();
+        html.potentially_wrong_charset().clone()
+    }
+
+    #[test]
+    fn strip_charset() {
+        assert_eq!(
+            strip("<head><meta cHarSet=Windows-1252></head>"),
+            "<head></head>"
+        );
+
+        let stripped = strip("<head><meta cHarSet=\"Windows-1252\"></head>");
+        assert_eq!(stripped, "<head></head>");
+
+        let stripped = strip("<head><meta http-equiv=\"Content-Type\" content=\"text/html; cHarSet=Windows-1252\"></head>");
+        assert_eq!(stripped, "<head></head>");
+
+        let stripped = strip("<head><meta http-equiv=\"Content-Type\" content=\"text/html; cHarSet = &quot;Windows-1252&quot;></head>");
+        assert_eq!(stripped, "<head></head>");
+
+        let stripped = strip("<head><meta name=\"xxx\"><meta http-equiv=\"Content-Type\" content=\"text/html; cHarSet = &quot;Windows-1252&quot;></head>");
+        assert_eq!(stripped, "<head><meta name=\"xxx\"></head>");
+
+        let stripped = strip("<head><meta http-equiv=\"Content-Type\" content=\"text/html; cHarSet = &quot;Windows-1252&quot;><meta name=\"xxx\"></head>");
+        assert_eq!(stripped, "<head><meta name=\"xxx\"></head>");
+    }
+}
diff --git a/src/core/mod.rs b/src/core/mod.rs
index 541addd..c1c7f1d 100644
--- a/src/core/mod.rs
+++ b/src/core/mod.rs
@@ -8,6 +8,9 @@ pub mod address;
 pub mod body;
 pub mod builder;
 pub mod header;
+mod html;
 pub mod message;
 #[cfg(feature = "rkyv")]
 pub mod rkyv;
+
+pub use html::Html;
diff --git a/src/lib.rs b/src/lib.rs
index a181672..c13ec13 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -11,6 +11,8 @@ pub mod decoders;
 pub mod mailbox;
 pub mod parsers;
 
+
+use core::Html;
 use parsers::MessageStream;
 use std::{borrow::Cow, collections::HashMap, hash::Hash, net::IpAddr};
 

From 4fe30a6f6addf5c33b1f98265f49bbcae9f8696b Mon Sep 17 00:00:00 2001
From: c <c@farsight.net>
Date: Wed, 14 May 2025 13:24:54 +0200
Subject: [PATCH 03/13] replace PartType::Html(Cow<'x, str>) with
 PartType::Html(Html<'x>)

---
 examples/message_parse.rs |  7 +++++--
 src/core/body.rs          |  3 ++-
 src/core/header.rs        | 11 +++++++----
 src/core/message.rs       | 10 +++++-----
 src/lib.rs                |  2 +-
 src/parsers/message.rs    |  6 +++---
 src/parsers/preview.rs    |  8 ++++++--
 tests/integration_test.rs |  7 +++++--
 8 files changed, 34 insertions(+), 20 deletions(-)

diff --git a/examples/message_parse.rs b/examples/message_parse.rs
index 1b450f6..0baec30 100644
--- a/examples/message_parse.rs
+++ b/examples/message_parse.rs
@@ -89,7 +89,7 @@ R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7
 
     // HTML and text body parts are returned conforming to RFC8621, Section 4.1.4
     assert_eq!(
-        message.body_html(0).unwrap(),
+        message.body_html(0).unwrap().potentially_wrong_charset(),
         concat!(
             "<html><p>I was thinking about quitting the &ldquo;exporting&rdquo; to ",
             "focus just on the &ldquo;importing&rdquo;,</p><p>but then I thought,",
@@ -120,7 +120,10 @@ R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7
         "ℌ𝔢𝔩𝔭 𝔪𝔢 𝔢𝔵𝔭𝔬𝔯𝔱 𝔪𝔶 𝔟𝔬𝔬𝔨 𝔭𝔩𝔢𝔞𝔰𝔢!"
     );
     assert_eq!(
-        nested_message.body_html(0).unwrap(),
+        nested_message
+            .body_html(0)
+            .unwrap()
+            .potentially_wrong_charset(),
         "<html><body>ℌ𝔢𝔩𝔭 𝔪𝔢 𝔢𝔵𝔭𝔬𝔯𝔱 𝔪𝔶 𝔟𝔬𝔬𝔨 𝔭𝔩𝔢𝔞𝔰𝔢!</body></html>"
     );
 
diff --git a/src/core/body.rs b/src/core/body.rs
index c1ab74e..c194608 100644
--- a/src/core/body.rs
+++ b/src/core/body.rs
@@ -10,7 +10,8 @@ impl PartType<'_> {
     #[allow(clippy::len_without_is_empty)]
     pub fn len(&self) -> usize {
         match self {
-            PartType::Text(v) | PartType::Html(v) => v.len(),
+            PartType::Text(v) => v.len(),
+            PartType::Html(v) => v.potentially_wrong_charset().len(),
             PartType::Binary(v) | PartType::InlineBinary(v) => v.len(),
             PartType::Message(v) => v.raw_message.len(),
             PartType::Multipart(_) => 0,
diff --git a/src/core/header.rs b/src/core/header.rs
index 54e9c49..113fa60 100644
--- a/src/core/header.rs
+++ b/src/core/header.rs
@@ -649,7 +649,8 @@ impl<'x> MessagePart<'x> {
     /// Returns the body part's contents as a `u8` slice
     pub fn contents(&self) -> &[u8] {
         match &self.body {
-            PartType::Text(text) | PartType::Html(text) => text.as_bytes(),
+            PartType::Text(text) => text.as_bytes(),
+            PartType::Html(text) => text.potentially_wrong_charset().as_bytes(),
             PartType::Binary(bin) | PartType::InlineBinary(bin) => bin.as_ref(),
             PartType::Message(message) => message.raw_message(),
             PartType::Multipart(_) => b"",
@@ -659,7 +660,8 @@ impl<'x> MessagePart<'x> {
     /// Returns the body part's contents as a `str`
     pub fn text_contents(&self) -> Option<&str> {
         match &self.body {
-            PartType::Text(text) | PartType::Html(text) => text.as_ref().into(),
+            PartType::Text(text) => text.as_ref().into(),
+            PartType::Html(text) => text.potentially_wrong_charset().as_ref().into(),
             PartType::Binary(bin) | PartType::InlineBinary(bin) => {
                 std::str::from_utf8(bin.as_ref()).ok()
             }
@@ -689,7 +691,8 @@ impl<'x> MessagePart<'x> {
     /// Returns the body part's length
     pub fn len(&self) -> usize {
         match &self.body {
-            PartType::Text(text) | PartType::Html(text) => text.len(),
+            PartType::Text(text) => text.len(),
+            PartType::Html(text) => text.potentially_wrong_charset().len(),
             PartType::Binary(bin) | PartType::InlineBinary(bin) => bin.len(),
             PartType::Message(message) => message.raw_message().len(),
             PartType::Multipart(_) => 0,
@@ -758,7 +761,7 @@ impl<'x> MessagePart<'x> {
             is_encoding_problem: self.is_encoding_problem,
             body: match self.body {
                 PartType::Text(v) => PartType::Text(v.into_owned().into()),
-                PartType::Html(v) => PartType::Html(v.into_owned().into()),
+                PartType::Html(v) => PartType::Html(v.make_owned()),
                 PartType::Binary(v) => PartType::Binary(v.into_owned().into()),
                 PartType::InlineBinary(v) => PartType::InlineBinary(v.into_owned().into()),
                 PartType::Message(v) => PartType::Message(v.into_owned()),
diff --git a/src/core/message.rs b/src/core/message.rs
index e839302..553c27e 100644
--- a/src/core/message.rs
+++ b/src/core/message.rs
@@ -14,7 +14,7 @@ use crate::{
         MessageStream,
     },
     Address, AttachmentIterator, BodyPartIterator, DateTime, GetHeader, Header, HeaderForm,
-    HeaderName, HeaderValue, Message, MessageParser, MessagePart, PartType, Received,
+    HeaderName, HeaderValue, Html, Message, MessageParser, MessagePart, PartType, Received,
 };
 
 impl<'x> Message<'x> {
@@ -391,11 +391,11 @@ impl<'x> Message<'x> {
     }
 
     /// Returns a message body part as text/plain
-    pub fn body_html(&'x self, pos: usize) -> Option<Cow<'x, str>> {
+    pub fn body_html(&'x self, pos: usize) -> Option<Html<'x>> {
         let part = self.parts.get(*self.html_body.get(pos)? as usize)?;
         match &part.body {
-            PartType::Html(html) => Some(html.as_ref().into()),
-            PartType::Text(text) => Some(text_to_html(text.as_ref()).into()),
+            PartType::Html(html) => Some(html.to_owned()),
+            PartType::Text(text) => Some(Html::new(text_to_html(text.as_ref()).into())),
             _ => None,
         }
     }
@@ -405,7 +405,7 @@ impl<'x> Message<'x> {
         let part = self.parts.get(*self.text_body.get(pos)? as usize)?;
         match &part.body {
             PartType::Text(text) => Some(text.as_ref().into()),
-            PartType::Html(html) => Some(html_to_text(html.as_ref()).into()),
+            PartType::Html(html) => Some(html_to_text(html.potentially_wrong_charset()).into()),
             _ => None,
         }
     }
diff --git a/src/lib.rs b/src/lib.rs
index c13ec13..c59a6a0 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -134,7 +134,7 @@ pub enum PartType<'x> {
     Text(#[cfg_attr(feature = "rkyv", rkyv(with = rkyv::with::AsOwned))] Cow<'x, str>),
 
     /// A text/html part
-    Html(#[cfg_attr(feature = "rkyv", rkyv(with = rkyv::with::AsOwned))] Cow<'x, str>),
+    Html(#[cfg_attr(feature = "rkyv", rkyv(with = rkyv::with::AsOwned))] Html<'x>),
 
     /// Any other part type that is not text.
     Binary(#[cfg_attr(feature = "rkyv", rkyv(with = rkyv::with::AsOwned))] Cow<'x, [u8]>),
diff --git a/src/parsers/message.rs b/src/parsers/message.rs
index 4ff25f9..d91b013 100644
--- a/src/parsers/message.rs
+++ b/src/parsers/message.rs
@@ -8,8 +8,8 @@ use std::borrow::Cow;
 
 use crate::{
     decoders::{charsets::map::charset_decoder, DecodeFnc},
-    ContentType, Encoding, GetHeader, HeaderName, HeaderValue, Message, MessageParser, MessagePart,
-    MessagePartId, PartType,
+    ContentType, Encoding, GetHeader, HeaderName, HeaderValue, Html, Message, MessageParser,
+    MessagePart, MessagePartId, PartType,
 };
 
 use super::MessageStream;
@@ -337,7 +337,7 @@ impl MessageParser {
                     }
 
                     if is_html {
-                        PartType::Html(text)
+                        PartType::Html(Html::new(text))
                     } else {
                         PartType::Text(text)
                     }
diff --git a/src/parsers/preview.rs b/src/parsers/preview.rs
index 7941618..a3e6ad9 100644
--- a/src/parsers/preview.rs
+++ b/src/parsers/preview.rs
@@ -7,9 +7,13 @@
 use std::borrow::Cow;
 
 use crate::decoders::html::html_to_text;
+use crate::Html;
 
-pub fn preview_html<'x>(html: Cow<'_, str>, max_len: usize) -> Cow<'x, str> {
-    preview_text(html_to_text(html.as_ref()).into(), max_len)
+pub fn preview_html<'x>(html: Html<'_>, max_len: usize) -> Cow<'x, str> {
+    preview_text(
+        html_to_text(html.potentially_wrong_charset()).into(),
+        max_len,
+    )
 }
 
 pub fn preview_text<'x>(text: Cow<'_, str>, mut max_len: usize) -> Cow<'x, str> {
diff --git a/tests/integration_test.rs b/tests/integration_test.rs
index e394801..c4744be 100644
--- a/tests/integration_test.rs
+++ b/tests/integration_test.rs
@@ -108,7 +108,7 @@ R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7
     );
 
     assert_eq!(
-        message.body_html(0).unwrap(),
+        message.body_html(0).unwrap().potentially_wrong_charset(),
         concat!(
             "<html><p>I was thinking about quitting the &ldquo;exporting&rdquo; to ",
             "focus just on the &ldquo;importing&rdquo;,</p><p>but then I thought,",
@@ -137,7 +137,10 @@ R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7
     );
 
     assert_eq!(
-        nested_message.body_html(0).unwrap(),
+        nested_message
+            .body_html(0)
+            .unwrap()
+            .potentially_wrong_charset(),
         "<html><body>ℌ𝔢𝔩𝔭 𝔪𝔢 𝔢𝔵𝔭𝔬𝔯𝔱 𝔪𝔶 𝔟𝔬𝔬𝔨 𝔭𝔩𝔢𝔞𝔰𝔢!</body></html>"
     );
 

From 092b6c3bf6b900202ed2f52d96a6781967c73525 Mon Sep 17 00:00:00 2001
From: c <c@farsight.net>
Date: Wed, 14 May 2025 13:57:07 +0200
Subject: [PATCH 04/13] make modules private that have no public items

---
 src/core/mod.rs              | 10 +++++-----
 src/decoders/charsets/mod.rs |  6 +++---
 src/decoders/mod.rs          |  2 +-
 src/parsers/fields/mod.rs    | 12 ++++++------
 src/parsers/mod.rs           |  6 +++---
 5 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/src/core/mod.rs b/src/core/mod.rs
index c1c7f1d..4751743 100644
--- a/src/core/mod.rs
+++ b/src/core/mod.rs
@@ -4,12 +4,12 @@
  * SPDX-License-Identifier: Apache-2.0 OR MIT
  */
 
-pub mod address;
-pub mod body;
-pub mod builder;
-pub mod header;
+mod address;
+mod body;
+mod builder;
+mod header;
 mod html;
-pub mod message;
+mod message;
 #[cfg(feature = "rkyv")]
 pub mod rkyv;
 
diff --git a/src/decoders/charsets/mod.rs b/src/decoders/charsets/mod.rs
index cd8a61f..3c3f422 100644
--- a/src/decoders/charsets/mod.rs
+++ b/src/decoders/charsets/mod.rs
@@ -5,9 +5,9 @@
  */
 
 pub mod map;
-pub mod multi_byte;
-pub mod single_byte;
-pub mod utf;
+mod multi_byte;
+mod single_byte;
+mod utf;
 
 pub type DecoderFnc = fn(&[u8]) -> String;
 
diff --git a/src/decoders/mod.rs b/src/decoders/mod.rs
index e5f8a28..8a6b0bd 100644
--- a/src/decoders/mod.rs
+++ b/src/decoders/mod.rs
@@ -10,7 +10,7 @@ use crate::parsers::MessageStream;
 
 pub mod base64;
 pub mod charsets;
-pub mod encoded_word;
+mod encoded_word;
 pub mod hex;
 pub mod html;
 pub mod quoted_printable;
diff --git a/src/parsers/fields/mod.rs b/src/parsers/fields/mod.rs
index 299c059..a663393 100644
--- a/src/parsers/fields/mod.rs
+++ b/src/parsers/fields/mod.rs
@@ -5,14 +5,14 @@
  */
 
 pub mod address;
-pub mod content_type;
+mod content_type;
 pub mod date;
-pub mod id;
-pub mod list;
-pub mod raw;
-pub mod received;
+mod id;
+mod list;
+mod raw;
+mod received;
 pub mod thread;
-pub mod unstructured;
+mod unstructured;
 
 #[cfg(test)]
 use serde::{Deserialize, Serialize};
diff --git a/src/parsers/mod.rs b/src/parsers/mod.rs
index 0f2574a..6fe0e74 100644
--- a/src/parsers/mod.rs
+++ b/src/parsers/mod.rs
@@ -7,9 +7,9 @@
 use std::{iter::Peekable, ops::Range, slice::Iter};
 
 pub mod fields;
-pub mod header;
-pub mod message;
-pub mod mime;
+mod header;
+mod message;
+mod mime;
 pub mod preview;
 
 pub struct MessageStream<'x> {

From 468ff8fe21aff217b1c9180b1d7f1146909accb5 Mon Sep 17 00:00:00 2001
From: c <c@farsight.net>
Date: Wed, 14 May 2025 13:57:25 +0200
Subject: [PATCH 05/13] fmt

---
 src/lib.rs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/lib.rs b/src/lib.rs
index c59a6a0..4201079 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -11,7 +11,6 @@ pub mod decoders;
 pub mod mailbox;
 pub mod parsers;
 
-
 use core::Html;
 use parsers::MessageStream;
 use std::{borrow::Cow, collections::HashMap, hash::Hash, net::IpAddr};

From 02189487ace91a7e696f9e73bbb2b23eec270404 Mon Sep 17 00:00:00 2001
From: c <c@farsight.net>
Date: Wed, 14 May 2025 15:46:34 +0200
Subject: [PATCH 06/13] don't make more methods pub than necessary

---
 src/core/html.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/core/html.rs b/src/core/html.rs
index 82e2888..917a280 100644
--- a/src/core/html.rs
+++ b/src/core/html.rs
@@ -8,10 +8,10 @@ use serde::{Deserialize, Serialize};
 pub struct Html<'x>(Cow<'x, str>);
 
 impl<'x> Html<'x> {
-    pub fn make_owned(self) -> Html<'static> {
+    pub(crate) fn make_owned(self) -> Html<'static> {
         Html(self.0.into_owned().into())
     }
-    pub fn new(html: Cow<'x, str>) -> Html<'x> {
+    pub(crate) fn new(html: Cow<'x, str>) -> Html<'x> {
         Html(html)
     }
     /// Access the raw html with a potentially wrong charset.

From 9fe77ed675d34ed4b384ea6d453a43aeda832737 Mon Sep 17 00:00:00 2001
From: c <c@farsight.net>
Date: Wed, 14 May 2025 15:52:07 +0200
Subject: [PATCH 07/13] improve wording

---
 src/core/html.rs | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/core/html.rs b/src/core/html.rs
index 917a280..db47f60 100644
--- a/src/core/html.rs
+++ b/src/core/html.rs
@@ -14,16 +14,16 @@ impl<'x> Html<'x> {
     pub(crate) fn new(html: Cow<'x, str>) -> Html<'x> {
         Html(html)
     }
-    /// Access the raw html with a potentially wrong charset.
+    /// Access the raw html with the original charset.
     ///
-    /// `mail-parser` only returns utf-8 strings, so the only sensible charset for the html is utf-8. Because html can declare its charset in `<meta>` tags, in the process of transcoding to utf-8 these may be incorrect.
-    /// Call [`Html::strip_charset`] before this method if the html will be given to a standard-conforming browser.
+    /// `mail-parser` returns utf-8 strings, so the only correct charset for the html is utf-8. Because html can declare its charset in `<meta>` tags, in the process of transcoding these may become incorrect.
+    /// If the correct charset is needed [`Html::strip_charset`] must be called before accessing the html with this method.
     pub fn potentially_wrong_charset(&self) -> &Cow<'x, str> {
         &self.0
     }
     /// Strip charset from html, making it utf-8 by default.
     ///
-    /// Call this method if the result is given to a standard-conforming browser.
+    /// This method should be called if the consumer of the html is a standard-conforming browser.
     pub fn strip_charset(&mut self) {
         let mut off = 0;
         let mut first = true;

From 1ee08927a423e8a75344712ef4bcd7b8f00423aa Mon Sep 17 00:00:00 2001
From: c <c@farsight.net>
Date: Wed, 14 May 2025 16:13:17 +0200
Subject: [PATCH 08/13] cover rkyv feature by CI

---
 .github/workflows/rust.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 5c2da0c..1e4cae0 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -17,6 +17,6 @@ jobs:
     steps:
     - uses: actions/checkout@v2
     - name: Build
-      run: cargo build --verbose
+      run: cargo build --verbose --all-features
     - name: Run tests
-      run: cargo test --verbose
+      run: cargo test --verbose --all-features

From c9167e9b4e71fc84df46638644e437ca7359690c Mon Sep 17 00:00:00 2001
From: c <c@farsight.net>
Date: Thu, 15 May 2025 14:38:15 +0200
Subject: [PATCH 09/13] fix: some html may contain multiple variants of the
 charset declaration

---
 src/core/html.rs | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/src/core/html.rs b/src/core/html.rs
index db47f60..3cc789f 100644
--- a/src/core/html.rs
+++ b/src/core/html.rs
@@ -27,16 +27,15 @@ impl<'x> Html<'x> {
     pub fn strip_charset(&mut self) {
         let mut off = 0;
         let mut first = true;
-        let mut found = None;
-        'meta: for part in self.0.split("<meta") {
+        let mut found = Vec::with_capacity(2);
+        for part in self.0.split("<meta") {
             if !first {
                 let Some((between, _)) = part.split_once('>') else {
                     return;
                 };
                 for w in between.as_bytes().windows(b"charset".len()) {
                     if w.eq_ignore_ascii_case(b"charset") {
-                        found = Some((off, off + "<meta".len() + between.len() + ">".len()));
-                        break 'meta;
+                        found.push((off, off + "<meta".len() + between.len() + ">".len()));
                     }
                 }
                 off += "<meta".len();
@@ -44,8 +43,12 @@ impl<'x> Html<'x> {
             off += part.len();
             first = false;
         }
-        if let Some((start, end)) = found {
-            self.0.to_mut().replace_range(start..end, "");
+        let mut deleted = 0;
+        for (start, end) in found {
+            self.0
+                .to_mut()
+                .replace_range(start - deleted..end - deleted, "");
+            deleted += end - start;
         }
     }
 }
@@ -81,5 +84,8 @@ mod tests {
 
         let stripped = strip("<head><meta http-equiv=\"Content-Type\" content=\"text/html; cHarSet = &quot;Windows-1252&quot;><meta name=\"xxx\"></head>");
         assert_eq!(stripped, "<head><meta name=\"xxx\"></head>");
+
+        let stripped = strip("<head><meta cHarSet=Windows-1252><meta http-equiv=\"Content-Type\" content=\"text/html; cHarSet = &quot;Windows-1252&quot;><meta name=\"xxx\"></head>");
+        assert_eq!(stripped, "<head><meta name=\"xxx\"></head>");
     }
 }

From 58d0a3b1fd86146bc005642b0b0dde76b3a514b5 Mon Sep 17 00:00:00 2001
From: c <c@farsight.net>
Date: Thu, 15 May 2025 14:45:01 +0200
Subject: [PATCH 10/13] replace the charset with utf-8

---
 src/core/html.rs | 73 ++++++++++++++++++++++++++++++------------------
 1 file changed, 46 insertions(+), 27 deletions(-)

diff --git a/src/core/html.rs b/src/core/html.rs
index 3cc789f..86a3089 100644
--- a/src/core/html.rs
+++ b/src/core/html.rs
@@ -16,15 +16,15 @@ impl<'x> Html<'x> {
     }
     /// Access the raw html with the original charset.
     ///
-    /// `mail-parser` returns utf-8 strings, so the only correct charset for the html is utf-8. Because html can declare its charset in `<meta>` tags, in the process of transcoding these may become incorrect.
-    /// If the correct charset is needed [`Html::strip_charset`] must be called before accessing the html with this method.
+    /// `mail-parser` returns utf-8 strings, so the only correct charset for the html is utf-8. Because html can declare its charset in `<meta>` tags, these may be incorrect after transcoding.
+    /// If the charset must be correct, call [`Html::fix_charset`] before accessing the html with this method.
     pub fn potentially_wrong_charset(&self) -> &Cow<'x, str> {
         &self.0
     }
-    /// Strip charset from html, making it utf-8 by default.
+    /// Replace charset with `utf-8`.
     ///
     /// This method should be called if the consumer of the html is a standard-conforming browser.
-    pub fn strip_charset(&mut self) {
+    pub fn fix_charset(&mut self) {
         let mut off = 0;
         let mut first = true;
         let mut found = Vec::with_capacity(2);
@@ -35,7 +35,10 @@ impl<'x> Html<'x> {
                 };
                 for w in between.as_bytes().windows(b"charset".len()) {
                     if w.eq_ignore_ascii_case(b"charset") {
-                        found.push((off, off + "<meta".len() + between.len() + ">".len()));
+                        found.push((
+                            off as isize,
+                            (off + "<meta".len() + between.len() + ">".len()) as isize,
+                        ));
                     }
                 }
                 off += "<meta".len();
@@ -43,12 +46,19 @@ impl<'x> Html<'x> {
             off += part.len();
             first = false;
         }
-        let mut deleted = 0;
+        let mut deleted: isize = 0;
+        let mut first = true;
         for (start, end) in found {
-            self.0
-                .to_mut()
-                .replace_range(start - deleted..end - deleted, "");
-            deleted += end - start;
+            let mut replace = "";
+            if first {
+                replace = "<meta charset=utf-8>";
+            }
+            self.0.to_mut().replace_range(
+                (start - deleted) as usize..(end - deleted) as usize,
+                replace,
+            );
+            deleted += end - start - replace.len() as isize;
+            first = false;
         }
     }
 }
@@ -57,35 +67,44 @@ impl<'x> Html<'x> {
 mod tests {
     use super::*;
 
-    fn strip(html: &str) -> Cow<'_, str> {
+    fn fix(html: &str) -> Cow<'_, str> {
         let mut html = Html(html.into());
-        html.strip_charset();
+        html.fix_charset();
         html.potentially_wrong_charset().clone()
     }
 
     #[test]
-    fn strip_charset() {
+    fn fix_charset() {
         assert_eq!(
-            strip("<head><meta cHarSet=Windows-1252></head>"),
-            "<head></head>"
+            fix("<head><meta cHarSet=Windows-1252></head>"),
+            "<head><meta charset=utf-8></head>"
         );
 
-        let stripped = strip("<head><meta cHarSet=\"Windows-1252\"></head>");
-        assert_eq!(stripped, "<head></head>");
+        let fixed = fix("<head><meta cHarSet=\"Windows-1252\"></head>");
+        assert_eq!(fixed, "<head><meta charset=utf-8></head>");
 
-        let stripped = strip("<head><meta http-equiv=\"Content-Type\" content=\"text/html; cHarSet=Windows-1252\"></head>");
-        assert_eq!(stripped, "<head></head>");
+        let fixed = fix("<head><meta http-equiv=\"Content-Type\" content=\"text/html; cHarSet=Windows-1252\"></head>");
+        assert_eq!(fixed, "<head><meta charset=utf-8></head>");
 
-        let stripped = strip("<head><meta http-equiv=\"Content-Type\" content=\"text/html; cHarSet = &quot;Windows-1252&quot;></head>");
-        assert_eq!(stripped, "<head></head>");
+        let fixed = fix("<head><meta http-equiv=\"Content-Type\" content=\"text/html; cHarSet = &quot;Windows-1252&quot;></head>");
+        assert_eq!(fixed, "<head><meta charset=utf-8></head>");
 
-        let stripped = strip("<head><meta name=\"xxx\"><meta http-equiv=\"Content-Type\" content=\"text/html; cHarSet = &quot;Windows-1252&quot;></head>");
-        assert_eq!(stripped, "<head><meta name=\"xxx\"></head>");
+        let fixed = fix("<head><meta name=\"xxx\"><meta http-equiv=\"Content-Type\" content=\"text/html; cHarSet = &quot;Windows-1252&quot;></head>");
+        assert_eq!(
+            fixed,
+            "<head><meta name=\"xxx\"><meta charset=utf-8></head>"
+        );
 
-        let stripped = strip("<head><meta http-equiv=\"Content-Type\" content=\"text/html; cHarSet = &quot;Windows-1252&quot;><meta name=\"xxx\"></head>");
-        assert_eq!(stripped, "<head><meta name=\"xxx\"></head>");
+        let fixed = fix("<head><meta http-equiv=\"Content-Type\" content=\"text/html; cHarSet = &quot;Windows-1252&quot;><meta name=\"xxx\"></head>");
+        assert_eq!(
+            fixed,
+            "<head><meta charset=utf-8><meta name=\"xxx\"></head>"
+        );
 
-        let stripped = strip("<head><meta cHarSet=Windows-1252><meta http-equiv=\"Content-Type\" content=\"text/html; cHarSet = &quot;Windows-1252&quot;><meta name=\"xxx\"></head>");
-        assert_eq!(stripped, "<head><meta name=\"xxx\"></head>");
+        let fixed = fix("<head><meta cHarSet=Windows-1252><meta http-equiv=\"Content-Type\" content=\"text/html; cHarSet = &quot;Windows-1252&quot;><meta name=\"xxx\"></head>");
+        assert_eq!(
+            fixed,
+            "<head><meta charset=utf-8><meta name=\"xxx\"></head>"
+        );
     }
 }

From f174c3eb84e683882b5cac88396875aa95f56fbe Mon Sep 17 00:00:00 2001
From: c <c@farsight.net>
Date: Thu, 15 May 2025 15:14:25 +0200
Subject: [PATCH 11/13] test that malformed tags are not modified

---
 src/core/html.rs | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/core/html.rs b/src/core/html.rs
index 86a3089..582af05 100644
--- a/src/core/html.rs
+++ b/src/core/html.rs
@@ -106,5 +106,11 @@ mod tests {
             fixed,
             "<head><meta charset=utf-8><meta name=\"xxx\"></head>"
         );
+
+        let malformed = fix("<head><meta cHarSet=Windows-1252<meta http-equiv=\"Content-Type\" content=\"text/html; cHarSet = &quot;Windows-1252&quot;><meta name=\"xxx\"></head>");
+        assert_eq!(
+            malformed,
+            "<head><meta cHarSet=Windows-1252<meta http-equiv=\"Content-Type\" content=\"text/html; cHarSet = &quot;Windows-1252&quot;><meta name=\"xxx\"></head>"
+        );
     }
 }

From 23eab1716be2290491d177d407a2d78946bcabde Mon Sep 17 00:00:00 2001
From: c <c@farsight.net>
Date: Thu, 15 May 2025 15:15:46 +0200
Subject: [PATCH 12/13] fix: add space to tag to prevent
 <metacharset></metacharset> from being stripped

---
 src/core/html.rs | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/core/html.rs b/src/core/html.rs
index 582af05..377db91 100644
--- a/src/core/html.rs
+++ b/src/core/html.rs
@@ -28,7 +28,7 @@ impl<'x> Html<'x> {
         let mut off = 0;
         let mut first = true;
         let mut found = Vec::with_capacity(2);
-        for part in self.0.split("<meta") {
+        for part in self.0.split("<meta ") {
             if !first {
                 let Some((between, _)) = part.split_once('>') else {
                     return;
@@ -37,11 +37,12 @@ impl<'x> Html<'x> {
                     if w.eq_ignore_ascii_case(b"charset") {
                         found.push((
                             off as isize,
-                            (off + "<meta".len() + between.len() + ">".len()) as isize,
+                            (off + "<meta ".len() + between.len() + ">".len()) as isize,
                         ));
+                        break;
                     }
                 }
-                off += "<meta".len();
+                off += "<meta ".len();
             }
             off += part.len();
             first = false;
@@ -112,5 +113,8 @@ mod tests {
             malformed,
             "<head><meta cHarSet=Windows-1252<meta http-equiv=\"Content-Type\" content=\"text/html; cHarSet = &quot;Windows-1252&quot;><meta name=\"xxx\"></head>"
         );
+
+        let fixed = fix("<metacharset></metacharset>");
+        assert_eq!(fixed, "<metacharset></metacharset>");
     }
 }

From e9138b2b9504ff044a36adb6b52f137a6bf9da4c Mon Sep 17 00:00:00 2001
From: c <c@farsight.net>
Date: Thu, 15 May 2025 15:32:37 +0200
Subject: [PATCH 13/13] fix: rustdoc warnings

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 8a7d56a..340ecef 100644
--- a/README.md
+++ b/README.md
@@ -270,8 +270,8 @@ Supported character sets via the optional dependency [encoding_rs](https://crate
 
 Licensed under either of
 
- * Apache License, Version 2.0 ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
- * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
+ * Apache License, Version 2.0 ([LICENSE-APACHE](LICENSE-APACHE) or <http://www.apache.org/licenses/LICENSE-2.0>)
+ * MIT license ([LICENSE-MIT](LICENSE-MIT) or <http://opensource.org/licenses/MIT>)
 
 at your option.