From efff0eb3061bf0270450cc481cc2bc8c58a033fe Mon Sep 17 00:00:00 2001 From: Serial <69764315+Serial-ATA@users.noreply.github.com> Date: Sat, 25 Jun 2022 21:21:47 -0400 Subject: [PATCH] ID3v2: Support decoding UTF-16 T/WXXX frames with missing content BOM closes #53 --- CHANGELOG.md | 1 + src/id3/v2/frame/content.rs | 46 ++++++++++++++++++++---- src/id3/v2/items/sync_text.rs | 2 +- src/id3/v2/tag.rs | 5 +++ tests/tags/assets/id3v2/issue_53.id3v24 | Bin 0 -> 1084 bytes 5 files changed, 47 insertions(+), 7 deletions(-) create mode 100644 tests/tags/assets/id3v2/issue_53.id3v24 diff --git a/CHANGELOG.md b/CHANGELOG.md index 670ee3ad..653e49b6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -37,6 +37,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed - **AIFF**: Fixed division by zero panic during property reading ([issue](https://github.com/Serial-ATA/lofty-rs/issues/56)) +- **ID3v2**: Support decoding UTF-16 T/WXXX frames with missing content BOM ([issue](https://github.com/Serial-ATA/lofty-rs/issues/53)) ## [0.6.3] - 2022-05-18 diff --git a/src/id3/v2/frame/content.rs b/src/id3/v2/frame/content.rs index a27daeb3..b2d8d752 100644 --- a/src/id3/v2/frame/content.rs +++ b/src/id3/v2/frame/content.rs @@ -2,13 +2,15 @@ use crate::error::{ErrorKind, Id3v2Error, Id3v2ErrorKind, LoftyError, Result}; use crate::id3::v2::frame::FrameValue; use crate::id3::v2::items::encoded_text_frame::EncodedTextFrame; use crate::id3::v2::items::language_frame::LanguageFrame; -use crate::id3::v2::util::text_utils::{decode_text, TextEncoding}; +use crate::id3::v2::items::popularimeter::Popularimeter; +use crate::id3::v2::util::text_utils::{ + decode_text, read_to_terminator, utf16_decode, TextEncoding, +}; use crate::id3::v2::Id3v2Version; use crate::picture::Picture; -use std::io::Read; +use std::io::{Cursor, Read}; -use crate::id3::v2::items::popularimeter::Popularimeter; use byteorder::ReadBytesExt; pub(super) fn parse_content( @@ -39,7 +41,7 @@ 
pub(super) fn parse_content( // There are 2 possibilities for the frame's content: text or link. fn parse_user_defined( - content: &mut &[u8], + mut content: &mut &[u8], link: bool, version: Id3v2Version, ) -> Result<Option<FrameValue>> { @@ -49,6 +51,22 @@ fn parse_user_defined( let encoding = verify_encoding(content.read_u8()?, version)?; + let mut endianness: fn([u8; 2]) -> u16 = u16::from_le_bytes; + if encoding == TextEncoding::UTF16 { + let mut cursor = Cursor::new(content); + let mut bom = [0; 2]; + cursor.read_exact(&mut bom)?; + + match [bom[0], bom[1]] { + [0xFF, 0xFE] => endianness = u16::from_le_bytes, + [0xFE, 0xFF] => endianness = u16::from_be_bytes, + // We'll catch an invalid BOM below + _ => {}, + }; + + content = cursor.into_inner(); + } + let description = decode_text(content, encoding, true)?.unwrap_or_default(); Ok(Some(if link { @@ -60,12 +78,28 @@ fn parse_user_defined( content, }) } else { - let content = decode_text(content, encoding, false)?.unwrap_or_default(); + let frame_content; + // It's possible for the description to be the only string with a BOM + if encoding == TextEncoding::UTF16 { + if content.len() >= 2 && (content[..2] == [0xFF, 0xFE] || content[..2] == [0xFE, 0xFF]) + { + frame_content = decode_text(content, encoding, false)?.unwrap_or_default(); + } else { + frame_content = match read_to_terminator(content, TextEncoding::UTF16) { + Some(raw_text) => utf16_decode(&*raw_text, endianness).map_err(|_| { + Into::<LoftyError>::into(Id3v2Error::new(Id3v2ErrorKind::BadSyncText)) + })?, + None => String::new(), + } + } + } else { + frame_content = decode_text(content, encoding, false)?.unwrap_or_default(); + } FrameValue::UserText(EncodedTextFrame { encoding, description, - content, + content: frame_content, }) })) } diff --git a/src/id3/v2/items/sync_text.rs b/src/id3/v2/items/sync_text.rs index 120ebb8a..6fde93ab 100644 --- a/src/id3/v2/items/sync_text.rs +++ b/src/id3/v2/items/sync_text.rs @@ -140,7 +140,7 @@ impl SynchronizedText { .map_err(|_| 
Id3v2Error::new(Id3v2ErrorKind::BadSyncText))?; // Encountered text that doesn't include a BOM - if bom != [0xFF, 0xFE] || bom != [0xFE, 0xFF] { + if bom != [0xFF, 0xFE] && bom != [0xFE, 0xFF] { cursor.seek(SeekFrom::Current(-2))?; if let Some(raw_text) = read_to_terminator(&mut cursor, TextEncoding::UTF16) diff --git a/src/id3/v2/tag.rs b/src/id3/v2/tag.rs index ee163834..fd66204d 100644 --- a/src/id3/v2/tag.rs +++ b/src/id3/v2/tag.rs @@ -1059,4 +1059,9 @@ mod tests { let tag: Id3v2Tag = tag.into(); assert_eq!(tag.artist(), Some("foo/bar/baz")) } + + #[test] + fn utf16_txxx_with_single_bom() { + let _ = read_tag("tests/tags/assets/id3v2/issue_53.id3v24"); + } } diff --git a/tests/tags/assets/id3v2/issue_53.id3v24 b/tests/tags/assets/id3v2/issue_53.id3v24 new file mode 100644 index 0000000000000000000000000000000000000000..eb01d2a2406e37bbbd9d231420e916e1eeaec2c7 GIT binary patch literal 1084 zcmeH@I|_h63`9q;u<;Ih1-*jJ_A6{Fen1cvT@Ww7`tTOQCPgNhmu5^;r=qgT8e1W9 zDS3j|s8gQZI_DVecb>G-g@(`Wb!gTOHgjE}o?vh025M literal 0 HcmV?d00001