Skip to content

Add methods to fixup html charset to utf-8 #110

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,6 @@ jobs:
steps:
- uses: actions/checkout@v2
- name: Build
run: cargo build --verbose
run: cargo build --verbose --all-features
- name: Run tests
run: cargo test --verbose
run: cargo test --verbose --all-features
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -270,8 +270,8 @@ Supported character sets via the optional dependency [encoding_rs](https://crate

Licensed under either of

* Apache License, Version 2.0 ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
* MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
* Apache License, Version 2.0 ([LICENSE-APACHE](LICENSE-APACHE) or <http://www.apache.org/licenses/LICENSE-2.0>)
* MIT license ([LICENSE-MIT](LICENSE-MIT) or <http://opensource.org/licenses/MIT>)

at your option.

Expand Down
7 changes: 5 additions & 2 deletions examples/message_parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7

// HTML and text body parts are returned conforming to RFC8621, Section 4.1.4
assert_eq!(
message.body_html(0).unwrap(),
message.body_html(0).unwrap().potentially_wrong_charset(),
concat!(
"<html><p>I was thinking about quitting the &ldquo;exporting&rdquo; to ",
"focus just on the &ldquo;importing&rdquo;,</p><p>but then I thought,",
Expand Down Expand Up @@ -120,7 +120,10 @@ R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7
"ℌ𝔢𝔩𝔭 𝔪𝔢 𝔢𝔵𝔭𝔬𝔯𝔱 𝔪𝔶 𝔟𝔬𝔬𝔨 𝔭𝔩𝔢𝔞𝔰𝔢!"
);
assert_eq!(
nested_message.body_html(0).unwrap(),
nested_message
.body_html(0)
.unwrap()
.potentially_wrong_charset(),
"<html><body>ℌ𝔢𝔩𝔭 𝔪𝔢 𝔢𝔵𝔭𝔬𝔯𝔱 𝔪𝔶 𝔟𝔬𝔬𝔨 𝔭𝔩𝔢𝔞𝔰𝔢!</body></html>"
);

Expand Down
3 changes: 2 additions & 1 deletion src/core/body.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ impl PartType<'_> {
#[allow(clippy::len_without_is_empty)]
pub fn len(&self) -> usize {
match self {
PartType::Text(v) | PartType::Html(v) => v.len(),
PartType::Text(v) => v.len(),
PartType::Html(v) => v.potentially_wrong_charset().len(),
PartType::Binary(v) | PartType::InlineBinary(v) => v.len(),
PartType::Message(v) => v.raw_message.len(),
PartType::Multipart(_) => 0,
Expand Down
11 changes: 7 additions & 4 deletions src/core/header.rs
Original file line number Diff line number Diff line change
Expand Up @@ -649,7 +649,8 @@ impl<'x> MessagePart<'x> {
/// Returns the body part's contents as a `u8` slice
pub fn contents(&self) -> &[u8] {
match &self.body {
PartType::Text(text) | PartType::Html(text) => text.as_bytes(),
PartType::Text(text) => text.as_bytes(),
PartType::Html(text) => text.potentially_wrong_charset().as_bytes(),
PartType::Binary(bin) | PartType::InlineBinary(bin) => bin.as_ref(),
PartType::Message(message) => message.raw_message(),
PartType::Multipart(_) => b"",
Expand All @@ -659,7 +660,8 @@ impl<'x> MessagePart<'x> {
/// Returns the body part's contents as a `str`
pub fn text_contents(&self) -> Option<&str> {
match &self.body {
PartType::Text(text) | PartType::Html(text) => text.as_ref().into(),
PartType::Text(text) => text.as_ref().into(),
PartType::Html(text) => text.potentially_wrong_charset().as_ref().into(),
PartType::Binary(bin) | PartType::InlineBinary(bin) => {
std::str::from_utf8(bin.as_ref()).ok()
}
Expand Down Expand Up @@ -689,7 +691,8 @@ impl<'x> MessagePart<'x> {
/// Returns the body part's length
pub fn len(&self) -> usize {
match &self.body {
PartType::Text(text) | PartType::Html(text) => text.len(),
PartType::Text(text) => text.len(),
PartType::Html(text) => text.potentially_wrong_charset().len(),
PartType::Binary(bin) | PartType::InlineBinary(bin) => bin.len(),
PartType::Message(message) => message.raw_message().len(),
PartType::Multipart(_) => 0,
Expand Down Expand Up @@ -758,7 +761,7 @@ impl<'x> MessagePart<'x> {
is_encoding_problem: self.is_encoding_problem,
body: match self.body {
PartType::Text(v) => PartType::Text(v.into_owned().into()),
PartType::Html(v) => PartType::Html(v.into_owned().into()),
PartType::Html(v) => PartType::Html(v.make_owned()),
PartType::Binary(v) => PartType::Binary(v.into_owned().into()),
PartType::InlineBinary(v) => PartType::InlineBinary(v.into_owned().into()),
PartType::Message(v) => PartType::Message(v.into_owned()),
Expand Down
120 changes: 120 additions & 0 deletions src/core/html.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
use std::borrow::Cow;

#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

/// The text of an HTML body part.
///
/// The wrapped string is UTF-8, but any charset declared inside the markup
/// (for example in a `<meta>` tag) is left as the sender wrote it and may
/// therefore disagree with the actual encoding. Use [`Html::fix_charset`] to
/// rewrite such declarations.
#[derive(Debug, PartialEq, Clone)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct Html<'x>(Cow<'x, str>);

impl<'x> Html<'x> {
    /// Converts the wrapped text into an owned string so the value no longer
    /// borrows from the parsed input.
    pub(crate) fn make_owned(self) -> Html<'static> {
        Html(self.0.into_owned().into())
    }

    /// Wraps already decoded html text.
    pub(crate) fn new(html: Cow<'x, str>) -> Html<'x> {
        Html(html)
    }

    /// Access the raw html with the original charset declaration.
    ///
    /// `mail-parser` returns utf-8 strings, so the only correct charset for the
    /// html is utf-8. Because html can declare its charset in `<meta>` tags,
    /// that declaration may be incorrect after transcoding.
    /// If the charset must be correct, call [`Html::fix_charset`] before
    /// accessing the html with this method.
    pub fn potentially_wrong_charset(&self) -> &Cow<'x, str> {
        &self.0
    }

    /// Replace the declared charset with `utf-8`.
    ///
    /// Every `<meta …>` tag whose attributes mention `charset` is removed; the
    /// first of them is replaced by `<meta charset=utf-8>`. The tag name is
    /// matched case-insensitively and may be followed by any ASCII whitespace,
    /// so `<META HTTP-EQUIV=…>` and `<meta\ncharset=…>` are handled as well.
    ///
    /// The html is left completely untouched when
    /// * no `<meta>` tag mentions a charset (a borrowed value stays borrowed), or
    /// * any `<meta` opening is not closed by `>` before the next `<meta`
    ///   opening or the end of the text (the markup is considered malformed).
    ///
    /// Note that the first `>` after the opening ends the tag, even if it sits
    /// inside a quoted attribute value.
    ///
    /// This method should be called if the consumer of the html is a
    /// standard-conforming browser.
    pub fn fix_charset(&mut self) {
        const OPEN: &[u8] = b"<meta";
        const NEEDLE: &[u8] = b"charset";
        const REPLACEMENT: &str = "<meta charset=utf-8>";
        // `<meta` plus the single whitespace byte that must follow it.
        const OPEN_LEN: usize = OPEN.len() + 1;

        let bytes = self.0.as_bytes();

        // Byte offsets of every `<meta` + whitespace. Matches cannot overlap
        // because a match contains `<` only at its first byte.
        let starts: Vec<usize> = bytes
            .windows(OPEN_LEN)
            .enumerate()
            .filter(|(_, w)| {
                w[..OPEN.len()].eq_ignore_ascii_case(OPEN) && w[OPEN.len()].is_ascii_whitespace()
            })
            .map(|(idx, _)| idx)
            .collect();

        // Byte ranges of the complete tags that declare a charset.
        let mut found = Vec::with_capacity(2);
        for (n, &start) in starts.iter().enumerate() {
            let attrs_start = start + OPEN_LEN;
            // A tag has to be closed before the next `<meta` opening.
            let limit = starts.get(n + 1).copied().unwrap_or(bytes.len());
            let segment = &bytes[attrs_start..limit];
            let Some(close) = segment.iter().position(|&b| b == b'>') else {
                // Malformed markup: do not attempt a partial fix.
                return;
            };
            let declares_charset = segment[..close]
                .windows(NEEDLE.len())
                .any(|w| w.eq_ignore_ascii_case(NEEDLE));
            if declares_charset {
                found.push(start..attrs_start + close + 1);
            }
        }

        if found.is_empty() {
            return;
        }

        // Rebuild the text in one pass. All range bounds sit next to ASCII
        // bytes (`<` / `>`), so they are valid `str` char boundaries.
        let src: &str = &self.0;
        let mut fixed = String::with_capacity(src.len() + REPLACEMENT.len());
        let mut cursor = 0;
        for (n, tag) in found.iter().enumerate() {
            fixed.push_str(&src[cursor..tag.start]);
            if n == 0 {
                fixed.push_str(REPLACEMENT);
            }
            cursor = tag.end;
        }
        fixed.push_str(&src[cursor..]);
        self.0 = Cow::Owned(fixed);
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `fix_charset` on `input` and hands back the resulting text.
    fn fix(input: &str) -> Cow<'_, str> {
        let mut doc = Html::new(Cow::Borrowed(input));
        doc.fix_charset();
        doc.0
    }

    /// Expected result for inputs whose only `<meta>` tag declares a charset.
    const ONLY_CHARSET: &str = "<head><meta charset=utf-8></head>";

    /// `(input, expected output)` pairs checked by `fix_charset`.
    const CASES: &[(&str, &str)] = &[
        // Bare and quoted `charset` attributes.
        ("<head><meta cHarSet=Windows-1252></head>", ONLY_CHARSET),
        ("<head><meta cHarSet=\"Windows-1252\"></head>", ONLY_CHARSET),
        // `http-equiv` style declarations.
        (
            "<head><meta http-equiv=\"Content-Type\" content=\"text/html; cHarSet=Windows-1252\"></head>",
            ONLY_CHARSET,
        ),
        (
            "<head><meta http-equiv=\"Content-Type\" content=\"text/html; cHarSet = &quot;Windows-1252&quot;></head>",
            ONLY_CHARSET,
        ),
        // Unrelated meta tags before / after the declaration are preserved.
        (
            "<head><meta name=\"xxx\"><meta http-equiv=\"Content-Type\" content=\"text/html; cHarSet = &quot;Windows-1252&quot;></head>",
            "<head><meta name=\"xxx\"><meta charset=utf-8></head>",
        ),
        (
            "<head><meta http-equiv=\"Content-Type\" content=\"text/html; cHarSet = &quot;Windows-1252&quot;><meta name=\"xxx\"></head>",
            "<head><meta charset=utf-8><meta name=\"xxx\"></head>",
        ),
        // Several declarations collapse into a single one.
        (
            "<head><meta cHarSet=Windows-1252><meta http-equiv=\"Content-Type\" content=\"text/html; cHarSet = &quot;Windows-1252&quot;><meta name=\"xxx\"></head>",
            "<head><meta charset=utf-8><meta name=\"xxx\"></head>",
        ),
        // Malformed markup (unclosed first tag) is returned unchanged.
        (
            "<head><meta cHarSet=Windows-1252<meta http-equiv=\"Content-Type\" content=\"text/html; cHarSet = &quot;Windows-1252&quot;><meta name=\"xxx\"></head>",
            "<head><meta cHarSet=Windows-1252<meta http-equiv=\"Content-Type\" content=\"text/html; cHarSet = &quot;Windows-1252&quot;><meta name=\"xxx\"></head>",
        ),
        // Elements that merely start with `meta` are not meta tags.
        ("<metacharset></metacharset>", "<metacharset></metacharset>"),
    ];

    #[test]
    fn fix_charset() {
        for (input, expected) in CASES {
            assert_eq!(fix(input), *expected, "input: {}", input);
        }
    }
}
10 changes: 5 additions & 5 deletions src/core/message.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ use crate::{
MessageStream,
},
Address, AttachmentIterator, BodyPartIterator, DateTime, GetHeader, Header, HeaderForm,
HeaderName, HeaderValue, Message, MessageParser, MessagePart, PartType, Received,
HeaderName, HeaderValue, Html, Message, MessageParser, MessagePart, PartType, Received,
};

impl<'x> Message<'x> {
Expand Down Expand Up @@ -391,11 +391,11 @@ impl<'x> Message<'x> {
}

/// Returns a message body part as text/html
pub fn body_html(&'x self, pos: usize) -> Option<Cow<'x, str>> {
pub fn body_html(&'x self, pos: usize) -> Option<Html<'x>> {
let part = self.parts.get(*self.html_body.get(pos)? as usize)?;
match &part.body {
PartType::Html(html) => Some(html.as_ref().into()),
PartType::Text(text) => Some(text_to_html(text.as_ref()).into()),
PartType::Html(html) => Some(html.to_owned()),
PartType::Text(text) => Some(Html::new(text_to_html(text.as_ref()).into())),
_ => None,
}
}
Expand All @@ -405,7 +405,7 @@ impl<'x> Message<'x> {
let part = self.parts.get(*self.text_body.get(pos)? as usize)?;
match &part.body {
PartType::Text(text) => Some(text.as_ref().into()),
PartType::Html(html) => Some(html_to_text(html.as_ref()).into()),
PartType::Html(html) => Some(html_to_text(html.potentially_wrong_charset()).into()),
_ => None,
}
}
Expand Down
13 changes: 8 additions & 5 deletions src/core/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,13 @@
* SPDX-License-Identifier: Apache-2.0 OR MIT
*/

pub mod address;
pub mod body;
pub mod builder;
pub mod header;
pub mod message;
mod address;
mod body;
mod builder;
mod header;
mod html;
mod message;
#[cfg(feature = "rkyv")]
pub mod rkyv;

pub use html::Html;
6 changes: 3 additions & 3 deletions src/decoders/charsets/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@
*/

pub mod map;
pub mod multi_byte;
pub mod single_byte;
pub mod utf;
mod multi_byte;
mod single_byte;
mod utf;

pub type DecoderFnc = fn(&[u8]) -> String;

Expand Down
2 changes: 1 addition & 1 deletion src/decoders/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use crate::parsers::MessageStream;

pub mod base64;
pub mod charsets;
pub mod encoded_word;
mod encoded_word;
pub mod hex;
pub mod html;
pub mod quoted_printable;
Expand Down
5 changes: 3 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,13 @@
*/
#![doc = include_str!("../README.md")]
#![deny(rust_2018_idioms)]
#[forbid(unsafe_code)]
#![forbid(unsafe_code)]
pub mod core;
pub mod decoders;
pub mod mailbox;
pub mod parsers;

use core::Html;
use parsers::MessageStream;
use std::{borrow::Cow, collections::HashMap, hash::Hash, net::IpAddr};

Expand Down Expand Up @@ -132,7 +133,7 @@ pub enum PartType<'x> {
Text(#[cfg_attr(feature = "rkyv", rkyv(with = rkyv::with::AsOwned))] Cow<'x, str>),

/// A text/html part
Html(#[cfg_attr(feature = "rkyv", rkyv(with = rkyv::with::AsOwned))] Cow<'x, str>),
Html(#[cfg_attr(feature = "rkyv", rkyv(with = rkyv::with::AsOwned))] Html<'x>),

/// Any other part type that is not text.
Binary(#[cfg_attr(feature = "rkyv", rkyv(with = rkyv::with::AsOwned))] Cow<'x, [u8]>),
Expand Down
12 changes: 6 additions & 6 deletions src/parsers/fields/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,14 @@
*/

pub mod address;
pub mod content_type;
mod content_type;
pub mod date;
pub mod id;
pub mod list;
pub mod raw;
pub mod received;
mod id;
mod list;
mod raw;
mod received;
pub mod thread;
pub mod unstructured;
mod unstructured;

#[cfg(test)]
use serde::{Deserialize, Serialize};
Expand Down
6 changes: 3 additions & 3 deletions src/parsers/message.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ use std::borrow::Cow;

use crate::{
decoders::{charsets::map::charset_decoder, DecodeFnc},
ContentType, Encoding, GetHeader, HeaderName, HeaderValue, Message, MessageParser, MessagePart,
MessagePartId, PartType,
ContentType, Encoding, GetHeader, HeaderName, HeaderValue, Html, Message, MessageParser,
MessagePart, MessagePartId, PartType,
};

use super::MessageStream;
Expand Down Expand Up @@ -337,7 +337,7 @@ impl MessageParser {
}

if is_html {
PartType::Html(text)
PartType::Html(Html::new(text))
} else {
PartType::Text(text)
}
Expand Down
6 changes: 3 additions & 3 deletions src/parsers/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@
use std::{iter::Peekable, ops::Range, slice::Iter};

pub mod fields;
pub mod header;
pub mod message;
pub mod mime;
mod header;
mod message;
mod mime;
pub mod preview;

pub struct MessageStream<'x> {
Expand Down
8 changes: 6 additions & 2 deletions src/parsers/preview.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,13 @@
use std::borrow::Cow;

use crate::decoders::html::html_to_text;
use crate::Html;

pub fn preview_html<'x>(html: Cow<'_, str>, max_len: usize) -> Cow<'x, str> {
preview_text(html_to_text(html.as_ref()).into(), max_len)
pub fn preview_html<'x>(html: Html<'_>, max_len: usize) -> Cow<'x, str> {
preview_text(
html_to_text(html.potentially_wrong_charset()).into(),
max_len,
)
}

pub fn preview_text<'x>(text: Cow<'_, str>, mut max_len: usize) -> Cow<'x, str> {
Expand Down
7 changes: 5 additions & 2 deletions tests/integration_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7
);

assert_eq!(
message.body_html(0).unwrap(),
message.body_html(0).unwrap().potentially_wrong_charset(),
concat!(
"<html><p>I was thinking about quitting the &ldquo;exporting&rdquo; to ",
"focus just on the &ldquo;importing&rdquo;,</p><p>but then I thought,",
Expand Down Expand Up @@ -137,7 +137,10 @@ R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7
);

assert_eq!(
nested_message.body_html(0).unwrap(),
nested_message
.body_html(0)
.unwrap()
.potentially_wrong_charset(),
"<html><body>ℌ𝔢𝔩𝔭 𝔪𝔢 𝔢𝔵𝔭𝔬𝔯𝔱 𝔪𝔶 𝔟𝔬𝔬𝔨 𝔭𝔩𝔢𝔞𝔰𝔢!</body></html>"
);

Expand Down