ID3v2: Properly handle multi-value UTF-16 encoded frames

Serial-ATA · Serial-ATA · commit 23c334e1de92 · 2023-10-27T10:47:06.000-04:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -21,7 +21,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
     parse it as another atom definition. As the specification is broad, there is no way for us to say *with certainty*
     that an identifier is invalid. Now we unfortunately have to guess the validity based on the commonly known atoms.
     For this, we follow [TagLib]'s [checks](https://github.com/taglib/taglib/blob/b40b834b1bdbd74593c5619e969e793d4d4886d9/taglib/mp4/mp4atom.cpp#L89).
-- **ID3v2**: No longer error on inputs shorter than 128 bytes (the length of an ID3v1 tag). ([PR](https://github.com/Serial-ATA/lofty-rs/pull/270))
+- **ID3v1**: No longer error on inputs shorter than 128 bytes (the length of an ID3v1 tag). ([PR](https://github.com/Serial-ATA/lofty-rs/pull/270))
+- **ID3v2**: No longer error on multi-value UTF-16 encoded text frames ([issue](https://github.com/Serial-ATA/lofty-rs/issues/265)) ([PR](https://github.com/Serial-ATA/lofty-rs/pull/284))
 
 ### Removed
 - **MP4**: `Ilst::{track_total, disc_number, disc_total}` ([PR](https://github.com/Serial-ATA/lofty-rs/pull/269))
diff --git a/src/id3/v2/items/extended_text_frame.rs b/src/id3/v2/items/extended_text_frame.rs
@@ -1,9 +1,7 @@
 use crate::error::{Id3v2Error, Id3v2ErrorKind, LoftyError, Result};
 use crate::id3::v2::frame::content::verify_encoding;
 use crate::id3::v2::header::Id3v2Version;
-use crate::util::text::{
-	decode_text, encode_text, read_to_terminator, utf16_decode_bytes, TextEncoding,
-};
+use crate::util::text::{decode_text, encode_text, utf16_decode_bytes, TextEncoding};
 
 use std::hash::{Hash, Hasher};
 use std::io::Read;
@@ -74,17 +72,19 @@ impl ExtendedTextFrame {
 
 		// It's possible for the description to be the only string with a BOM
 		'utf16: {
-			let bom = description.bom;
-			let Some(raw_text) = read_to_terminator(reader, TextEncoding::UTF16) else {
+			let mut raw_text = Vec::new();
+			reader.read_to_end(&mut raw_text)?;
+
+			if raw_text.is_empty() {
 				// Nothing left to do
 				frame_content = String::new();
 				break 'utf16;
-			};
+			}
 
+			let mut bom = description.bom;
 			if raw_text.starts_with(&[0xFF, 0xFE]) || raw_text.starts_with(&[0xFE, 0xFF]) {
-				frame_content =
-					decode_text(&mut &raw_text[..], TextEncoding::UTF16, false)?.content;
-				break 'utf16;
+				// The text specifies a BOM
+				bom = [raw_text[0], raw_text[1]];
 			}
 
 			let endianness = match bom {
diff --git a/src/util/text.rs b/src/util/text.rs
@@ -207,8 +207,11 @@ pub(crate) fn utf16_decode_bytes(bytes: &[u8], endianness: fn([u8; 2]) -> u16) -
 
 	let unverified: Vec<u16> = bytes
 		.chunks_exact(2)
-		.map_while(|c| match c {
-			[0, 0] => None,
+		// In ID3v2, it is possible to have multiple UTF-16 strings separated by null.
+		// This also makes it possible for us to encounter multiple BOMs in a single string.
+		// We must filter them out.
+		.filter_map(|c| match c {
+			[0xFF, 0xFE] | [0xFE, 0xFF] => None,
 			_ => Some(endianness(c.try_into().unwrap())), // Infallible
 		})
 		.collect();