Serial-ATA · Serial-ATA · Aug 9, 2025 · Aug 9, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -16,6 +16,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - **Vorbis Comments**: Check `ALBUM ARTIST` for `ItemKey::AlbumArtist` conversions
 * **ItemKey**: `ItemKey` is now `Copy` ([PR](https://github.com/Serial-ATA/lofty-rs/pull/526))
 
+### Fixed
+- **ID3v2**: Support parsing UTF-16 `COMM`/`USLT` frames with a single BOM ([issue](https://github.com/Serial-ATA/lofty-rs/issues/532)) ([PR](https://github.com/Serial-ATA/lofty-rs/pull/535))
+  - Some encoders will only write a BOM to the frame's description, rather than to every string in the frame.
+    This was previously only supported in `SYLT` frames, and has been extended to `COMM` and `USLT`.
+
 ### Removed
 
 * **ItemKey**: `ItemKey::Unknown` ([PR](https://github.com/Serial-ATA/lofty-rs/pull/526))

diff --git a/lofty/src/id3/v2/items/language_frame.rs b/lofty/src/id3/v2/items/language_frame.rs
@@ -3,7 +3,10 @@ use crate::id3::v2::frame::content::verify_encoding;
 use crate::id3::v2::header::Id3v2Version;
 use crate::id3::v2::{FrameFlags, FrameHeader, FrameId};
 use crate::tag::items::Lang;
-use crate::util::text::{TextDecodeOptions, TextEncoding, decode_text, encode_text};
+use crate::util::text::{
+	DecodeTextResult, TextDecodeOptions, TextEncoding, decode_text, encode_text,
+	utf16_decode_terminated_maybe_bom,
+};
 
 use std::borrow::Cow;
 use std::hash::{Hash, Hasher};
@@ -35,12 +38,34 @@ impl LanguageFrame {
 		let mut language = [0; 3];
 		reader.read_exact(&mut language)?;
 
-		let description = decode_text(
+		let DecodeTextResult {
+			content: description,
+			bom,
+			..
+		} = decode_text(
 			reader,
 			TextDecodeOptions::new().encoding(encoding).terminated(true),
-		)?
-		.content;
-		let content = decode_text(reader, TextDecodeOptions::new().encoding(encoding))?.content;
+		)?;
+
+		let mut endianness: fn([u8; 2]) -> u16 = u16::from_le_bytes;
+
+		// It's possible for the description to be the only string with a BOM
+		// To be safe, we change the encoding to the concrete variant determined from the description
+		if encoding == TextEncoding::UTF16 {
+			endianness = match bom {
+				[0xFF, 0xFE] => u16::from_le_bytes,
+				[0xFE, 0xFF] => u16::from_be_bytes,
+				// The BOM has to be valid for `decode_text` to succeed
+				_ => unreachable!("Bad BOM {bom:?}"),
+			};
+		}
+
+		let content;
+		if encoding == TextEncoding::UTF16 {
+			(content, _) = utf16_decode_terminated_maybe_bom(reader, endianness)?;
+		} else {
+			content = decode_text(reader, TextDecodeOptions::new().encoding(encoding))?.content;
+		}
 
 		Ok(Some(Self {
 			encoding,

diff --git a/lofty/src/id3/v2/items/sync_text.rs b/lofty/src/id3/v2/items/sync_text.rs
@@ -2,12 +2,12 @@ use crate::error::{ErrorKind, Id3v2Error, Id3v2ErrorKind, LoftyError, Result};
 use crate::id3::v2::{FrameFlags, FrameHeader, FrameId};
 use crate::macros::err;
 use crate::util::text::{
-	TextDecodeOptions, TextEncoding, decode_text, encode_text, read_to_terminator,
-	utf16_decode_bytes,
+	DecodeTextResult, TextDecodeOptions, TextEncoding, decode_text, encode_text,
+	utf16_decode_terminated_maybe_bom,
 };
 
 use std::borrow::Cow;
-use std::io::{Cursor, Read, Seek, SeekFrom, Write};
+use std::io::{Cursor, Seek, Write};
 
 use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
 
@@ -148,23 +148,26 @@ impl SynchronizedTextFrame<'_> {
 			.ok_or_else(|| Id3v2Error::new(Id3v2ErrorKind::BadSyncText))?;
 
 		let mut cursor = Cursor::new(&data[6..]);
-		let description = crate::util::text::decode_text(
+		let DecodeTextResult {
+			content: description,
+			bom,
+			..
+		} = decode_text(
 			&mut cursor,
 			TextDecodeOptions::new().encoding(encoding).terminated(true),
 		)
-		.map_err(|_| Id3v2Error::new(Id3v2ErrorKind::BadSyncText))?
-		.text_or_none();
+		.map_err(|_| Id3v2Error::new(Id3v2ErrorKind::BadSyncText))?;
 
 		let mut endianness: fn([u8; 2]) -> u16 = u16::from_le_bytes;
 
 		// It's possible for the description to be the only string with a BOM
 		// To be safe, we change the encoding to the concrete variant determined from the description
 		if encoding == TextEncoding::UTF16 {
-			endianness = match cursor.get_ref()[..=1] {
+			endianness = match bom {
 				[0xFF, 0xFE] => u16::from_le_bytes,
 				[0xFE, 0xFF] => u16::from_be_bytes,
 				// Since the description was already read, we can assume the BOM was valid
-				_ => unreachable!(),
+				_ => unreachable!("Bad BOM {bom:?}"),
 			};
 		}
 
@@ -174,33 +177,23 @@ impl SynchronizedTextFrame<'_> {
 		let mut content = Vec::new();
 
 		while pos < total {
-			let text = (|| -> Result<String> {
-				if encoding == TextEncoding::UTF16 {
-					// Check for a BOM
-					let mut bom = [0; 2];
-					cursor
-						.read_exact(&mut bom)
+			let text;
+			if encoding == TextEncoding::UTF16 {
+				let (decoded, bytes_read) =
+					utf16_decode_terminated_maybe_bom(&mut cursor, endianness)
 						.map_err(|_| Id3v2Error::new(Id3v2ErrorKind::BadSyncText))?;
-
-					cursor.seek(SeekFrom::Current(-2))?;
-
-					// Encountered text that doesn't include a BOM
-					if bom != [0xFF, 0xFE] && bom != [0xFE, 0xFF] {
-						let (raw_text, _) = read_to_terminator(&mut cursor, TextEncoding::UTF16);
-						return utf16_decode_bytes(&raw_text, endianness)
-							.map_err(|_| Id3v2Error::new(Id3v2ErrorKind::BadSyncText).into());
-					}
-				}
-
+				pos += bytes_read as u64;
+				text = decoded;
+			} else {
 				let decoded_text = decode_text(
 					&mut cursor,
 					TextDecodeOptions::new().encoding(encoding).terminated(true),
 				)
 				.map_err(|_| Id3v2Error::new(Id3v2ErrorKind::BadSyncText))?;
 				pos += decoded_text.bytes_read as u64;
 
-				Ok(decoded_text.content)
-			})()?;
+				text = decoded_text.content;
+			}
 
 			let time = cursor
 				.read_u32::<BigEndian>()
@@ -217,7 +210,11 @@ impl SynchronizedTextFrame<'_> {
 			language,
 			timestamp_format,
 			content_type,
-			description,
+			description: if description.is_empty() {
+				None
+			} else {
+				Some(description)
+			},
 			content,
 		})
 	}

diff --git a/lofty/src/util/text.rs b/lofty/src/util/text.rs
@@ -160,22 +160,15 @@ where
 				err!(TextDecode("UTF-16 string has an odd length"));
 			}
 
-			let bom_to_check;
 			if options.bom == [0, 0] {
-				bom_to_check = [raw_bytes[0], raw_bytes[1]];
+				bom = [raw_bytes[0], raw_bytes[1]];
 			} else {
-				bom_to_check = options.bom;
+				bom = options.bom;
 			}
 
-			match bom_to_check {
-				[0xFE, 0xFF] => {
-					bom = [0xFE, 0xFF];
-					utf16_decode_bytes(&raw_bytes[2..], u16::from_be_bytes)?
-				},
-				[0xFF, 0xFE] => {
-					bom = [0xFF, 0xFE];
-					utf16_decode_bytes(&raw_bytes[2..], u16::from_le_bytes)?
-				},
+			match bom {
+				[0xFE, 0xFF] => utf16_decode_bytes(&raw_bytes[2..], u16::from_be_bytes)?,
+				[0xFF, 0xFE] => utf16_decode_bytes(&raw_bytes[2..], u16::from_le_bytes)?,
 				_ => err!(TextDecode("UTF-16 string has an invalid byte order mark")),
 			}
 		},
@@ -184,10 +177,6 @@ where
 			.map_err(|_| LoftyError::new(ErrorKind::TextDecode("Expected a UTF-8 string")))?,
 	};
 
-	if read_string.is_empty() {
-		return Ok(EMPTY_DECODED_TEXT);
-	}
-
 	Ok(DecodeTextResult {
 		content: read_string,
 		bytes_read,
@@ -278,6 +267,36 @@ pub(crate) fn utf16_decode_bytes(bytes: &[u8], endianness: fn([u8; 2]) -> u16) -
 	utf16_decode(&unverified)
 }
 
+// TODO: Can probably just be merged into an option on `TextDecodeOptions`
+/// Read a null-terminated UTF-16 string that may or may not have a BOM
+///
+/// This is needed for ID3v2, as some encoders will encode *only* the first string in a frame with a BOM,
+/// and the others are assumed to have the same byte order.
+///
+/// This is seen in frames like SYLT, COMM, and USLT, where the description will be the only string
+/// with a BOM.
+///
+/// If no BOM is present, the string will be decoded using `endianness`.
+pub(crate) fn utf16_decode_terminated_maybe_bom<R>(
+	reader: &mut R,
+	endianness: fn([u8; 2]) -> u16,
+) -> Result<(String, usize)>
+where
+	R: Read,
+{
+	let (raw_text, terminator_len) = read_to_terminator(reader, TextEncoding::UTF16);
+
+	let bytes_read = raw_text.len() + terminator_len;
+	let decoded;
+	match &*raw_text {
+		[0xFF, 0xFE, ..] => decoded = utf16_decode_bytes(&raw_text[2..], u16::from_le_bytes),
+		[0xFE, 0xFF, ..] => decoded = utf16_decode_bytes(&raw_text[2..], u16::from_be_bytes),
+		_ => decoded = utf16_decode_bytes(&raw_text, endianness),
+	}
+
+	decoded.map(|d| (d, bytes_read))
+}
+
 pub(crate) fn encode_text(text: &str, text_encoding: TextEncoding, terminated: bool) -> Vec<u8> {
 	match text_encoding {
 		TextEncoding::Latin1 => {