From 0e8ad5759c622d8fd8125cf316e6656206c7a4bc Mon Sep 17 00:00:00 2001 From: localthomas <81923579+localthomas@users.noreply.github.com> Date: Fri, 21 Jan 2022 13:55:28 +0100 Subject: [PATCH] added searching for frame sync bits when file type probing When a file contains an ID3 block, the following data might be an MP3 frame. The frame might be prepended by junk bytes, which are skipped with this commit. --- src/mp3/header.rs | 48 +++++++++++++++++++++++ src/probe.rs | 97 ++++++++++++++++++++++++++++++++++++----------- 2 files changed, 123 insertions(+), 22 deletions(-) diff --git a/src/mp3/header.rs b/src/mp3/header.rs index acb83e47..9ee9701d 100644 --- a/src/mp3/header.rs +++ b/src/mp3/header.rs @@ -9,6 +9,36 @@ pub(crate) fn verify_frame_sync(frame_sync: [u8; 2]) -> bool { frame_sync[0] == 0xFF && frame_sync[1] >> 5 == 0b111 } +/// Searches for a frame sync (11 bits with the value 1 like `0b1111_1111_111`) in the input reader. +/// The search starts at the beginning of the reader and returns the index relative to this beginning. +/// Only the first match is returned and on no match, [`None`] is returned instead. +/// +/// Note that the search searches in 8 bit steps, i.e. the first 8 bits need to be byte aligned. +pub(crate) fn search_for_frame_sync(input: &mut dyn Read) -> Result> { + let mut index = 0u64; + let mut iterator = input.bytes(); + let mut buffer = [0u8; 2]; + // Read the first byte, as each iteration expects that buffer 0 was set from a previous iteration. + // This is not the case in the first iteration, which is therefore a special case. + if let Some(byte) = iterator.next() { + buffer[0] = byte?; + } + // create a stream of overlapping 2 byte pairs + // example: [0x01, 0x02, 0x03, 0x04] should be analyzed as + // [0x01, 0x02], [0x02, 0x03], [0x03, 0x04] + while let Some(byte) = iterator.next() { + buffer[1] = byte?; + // check the two bytes in the buffer + if verify_frame_sync(buffer) { + return Ok(Some(index)); + } + // if they do not match, copy the last byte in the buffer to the front for the next iteration + buffer[0] = buffer[1]; + index += 1; + } + Ok(None) +} + #[derive(PartialEq, Copy, Clone, Debug)] #[allow(missing_docs)] /// MPEG Audio version @@ -200,3 +230,21 @@ impl XingHeader { } } } + +#[cfg(test)] +mod tests { + #[test] + fn search_for_frame_sync() { + fn test(data: &[u8], expected_result: Option) { + use super::search_for_frame_sync; + assert_eq!( + search_for_frame_sync(&mut Box::new(data)).unwrap(), + expected_result + ); + } + + test(&[0xFF, 0xFB, 0x00], Some(0)); + test(&[0x00, 0x00, 0x01, 0xFF, 0xFB], Some(3)); + test(&[0x01, 0xFF], None); + } +} diff --git a/src/probe.rs b/src/probe.rs index fd6a016c..97883c20 100644 --- a/src/probe.rs +++ b/src/probe.rs @@ -2,7 +2,7 @@ use crate::ape::ApeFile; use crate::error::{LoftyError, Result}; use crate::iff::aiff::AiffFile; use crate::iff::wav::WavFile; -use crate::mp3::header::verify_frame_sync; +use crate::mp3::header::search_for_frame_sync; use crate::mp3::Mp3File; use crate::mp4::Mp4File; use crate::ogg::flac::FlacFile; @@ -149,41 +149,62 @@ impl Probe { #[allow(clippy::shadow_unrelated)] fn guess_inner(&mut self) -> Result> { + // temporary buffer for storing 36 bytes + // (36 is just a guess as to how long the data for estimating the file type might be) let mut buf = [0; 36]; - let pos = self.inner.seek(SeekFrom::Current(0))?; + // read the first 36 bytes and seek back to the starting position + let starting_position = self.inner.stream_position()?; let buf_len = std::io::copy( - &mut self.inner.by_ref().take(36), + &mut self.inner.by_ref().take(buf.len() as u64), &mut Cursor::new(&mut buf[..]), )? as usize; + self.inner.seek(SeekFrom::Start(starting_position))?; - self.inner.seek(SeekFrom::Start(pos))?; - + // estimate the file type by using these 36 bytes + // note that any error from `from_buffer_inner` are suppressed, as it returns an error on unknown format + // - TODO: why is the case `Err(LoftyError::UnknownFormat)` suppressed, but only for `from_buffer_inner`? + // - What is the special meaning of the return type `Ok(None)` vs `Err(LoftyError::UnknownFormat)`? match FileType::from_buffer_inner(&buf[..buf_len]) { + // the file type was guessed based on these bytes Ok((Some(f_ty), _)) => Ok(Some(f_ty)), + // the first data block is ID3 data; this means other data can follow (e.g. APE or MP3 frames) Ok((None, id3_len)) => { - self.inner + // the position right after the ID3 block is the internal size value (id3_len) + // added to the length of the ID3 header (which is 10 bytes), + // as the size does not include the header itself + let position_after_id3_block = self + .inner .seek(SeekFrom::Current(i64::from(10 + id3_len)))?; - let mut ident = [0; 3]; - let buf_len = std::io::copy( - &mut self.inner.by_ref().take(3), - &mut Cursor::new(&mut ident[..]), - )?; + let file_type_after_id3_block = { + // try to guess the file type after the ID3 block by inspecting the first 3 bytes + let mut ident = [0; 3]; + let buf_len = std::io::copy( + &mut self.inner.by_ref().take(ident.len() as u64), + &mut Cursor::new(&mut ident[..]), + )?; - self.inner.seek(SeekFrom::Start(pos))?; + if buf_len < 3 { + Err(LoftyError::UnknownFormat) + } else if &ident == b"MAC" { + Ok(Some(FileType::APE)) + } else { + // potentially some junk bytes are between the ID3 block and the following MP3 block + // search for any possible sync bits after the ID3 block + self.inner.seek(SeekFrom::Start(position_after_id3_block))?; + if let Some(_) = search_for_frame_sync(&mut self.inner)? { + Ok(Some(FileType::MP3)) + } else { + Err(LoftyError::UnknownFormat) + } + } + }; - if buf_len < 3 { - return Err(LoftyError::UnknownFormat); - } + // before returning any result for a file type, seek back to the front + self.inner.seek(SeekFrom::Start(starting_position))?; - if &ident == b"MAC" { - Ok(Some(FileType::APE)) - } else if verify_frame_sync([ident[0], ident[1]]) { - Ok(Some(FileType::MP3)) - } else { - Err(LoftyError::UnknownFormat) - } + file_type_after_id3_block }, _ => Ok(None), } @@ -247,3 +268,35 @@ where { Probe::open(path)?.read(read_properties) } + +#[cfg(test)] +mod tests { + use crate::Probe; + + #[test] + fn mp3_file_id3v2_3() { + // test data that contains 4 bytes of junk (0x20) between the ID3 portion and the first MP3 frame + let data: [&[u8]; 4] = [ + // ID3v2.3 header (10 bytes) + &[0x49, 0x44, 0x33, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23], + // TALB frame + &[ + 0x54, 0x41, 0x4C, 0x42, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x01, 0xFF, 0xFE, 0x61, + 0x00, 0x61, 0x00, 0x61, 0x00, 0x61, 0x00, 0x61, 0x00, 0x61, 0x00, 0x61, 0x00, 0x61, + 0x00, 0x61, 0x00, 0x61, 0x00, 0x61, 0x00, + ], + // 4 bytes of junk + &[0x20, 0x20, 0x20, 0x20], + // start of MP3 frame (not all bytes are shown in this slice) + &[ + 0xFF, 0xFB, 0x50, 0xC4, 0x00, 0x03, 0xC0, 0x00, 0x01, 0xA4, 0x00, 0x00, 0x00, 0x20, + 0x00, 0x00, 0x34, 0x80, 0x00, 0x00, 0x04, + ], + ]; + let data: Vec = data.into_iter().flatten().cloned().collect(); + let data = std::io::Cursor::new(&data); + let probe = Probe::new(data); + let probe = probe.guess_file_type().unwrap(); + matches!(probe.file_type(), Some(crate::FileType::MP3)); + } +}