added searching for frame sync bits when file type probing

When a file contains an ID3 block, the following data might be an MP3 frame.
The frame might be prepended by junk bytes, which are skipped with this commit.
This commit is contained in:
localthomas 2022-01-21 13:55:28 +01:00
parent c88cad09b8
commit 0e8ad5759c
No known key found for this signature in database
GPG key ID: 2F94960E894832A3
2 changed files with 123 additions and 22 deletions

View file

@ -9,6 +9,36 @@ pub(crate) fn verify_frame_sync(frame_sync: [u8; 2]) -> bool {
frame_sync[0] == 0xFF && frame_sync[1] >> 5 == 0b111 frame_sync[0] == 0xFF && frame_sync[1] >> 5 == 0b111
} }
/// Searches for a frame sync (11 bits with the value 1 like `0b1111_1111_111`) in the input reader.
/// The search starts at the beginning of the reader and returns the index relative to this beginning.
/// Only the first match is returned and on no match, [`None`] is returned instead.
///
/// Note that the search searches in 8 bit steps, i.e. the first 8 bits need to be byte aligned.
pub(crate) fn search_for_frame_sync(input: &mut dyn Read) -> Result<Option<u64>> {
let mut index = 0u64;
let mut iterator = input.bytes();
let mut buffer = [0u8; 2];
// Read the first byte, as each iteration expects that buffer 0 was set from a previous iteration.
// This is not the case in the first iteration, which is therefore a special case.
if let Some(byte) = iterator.next() {
buffer[0] = byte?;
}
// create a stream of overlapping 2 byte pairs
// example: [0x01, 0x02, 0x03, 0x04] should be analyzed as
// [0x01, 0x02], [0x02, 0x03], [0x03, 0x04]
while let Some(byte) = iterator.next() {
buffer[1] = byte?;
// check the two bytes in the buffer
if verify_frame_sync(buffer) {
return Ok(Some(index));
}
// if they do not match, copy the last byte in the buffer to the front for the next iteration
buffer[0] = buffer[1];
index += 1;
}
Ok(None)
}
#[derive(PartialEq, Copy, Clone, Debug)] #[derive(PartialEq, Copy, Clone, Debug)]
#[allow(missing_docs)] #[allow(missing_docs)]
/// MPEG Audio version /// MPEG Audio version
@ -200,3 +230,21 @@ impl XingHeader {
} }
} }
} }
#[cfg(test)]
mod tests {
#[test]
fn search_for_frame_sync() {
fn test(data: &[u8], expected_result: Option<u64>) {
use super::search_for_frame_sync;
assert_eq!(
search_for_frame_sync(&mut Box::new(data)).unwrap(),
expected_result
);
}
test(&[0xFF, 0xFB, 0x00], Some(0));
test(&[0x00, 0x00, 0x01, 0xFF, 0xFB], Some(3));
test(&[0x01, 0xFF], None);
}
}

View file

@ -2,7 +2,7 @@ use crate::ape::ApeFile;
use crate::error::{LoftyError, Result}; use crate::error::{LoftyError, Result};
use crate::iff::aiff::AiffFile; use crate::iff::aiff::AiffFile;
use crate::iff::wav::WavFile; use crate::iff::wav::WavFile;
use crate::mp3::header::verify_frame_sync; use crate::mp3::header::search_for_frame_sync;
use crate::mp3::Mp3File; use crate::mp3::Mp3File;
use crate::mp4::Mp4File; use crate::mp4::Mp4File;
use crate::ogg::flac::FlacFile; use crate::ogg::flac::FlacFile;
@ -149,41 +149,62 @@ impl<R: Read + Seek> Probe<R> {
#[allow(clippy::shadow_unrelated)] #[allow(clippy::shadow_unrelated)]
fn guess_inner(&mut self) -> Result<Option<FileType>> { fn guess_inner(&mut self) -> Result<Option<FileType>> {
// temporary buffer for storing 36 bytes
// (36 is just a guess as to how long the data for estimating the file type might be)
let mut buf = [0; 36]; let mut buf = [0; 36];
let pos = self.inner.seek(SeekFrom::Current(0))?; // read the first 36 bytes and seek back to the starting position
let starting_position = self.inner.stream_position()?;
let buf_len = std::io::copy( let buf_len = std::io::copy(
&mut self.inner.by_ref().take(36), &mut self.inner.by_ref().take(buf.len() as u64),
&mut Cursor::new(&mut buf[..]), &mut Cursor::new(&mut buf[..]),
)? as usize; )? as usize;
self.inner.seek(SeekFrom::Start(starting_position))?;
self.inner.seek(SeekFrom::Start(pos))?; // estimate the file type by using these 36 bytes
// note that any error from `from_buffer_inner` are suppressed, as it returns an error on unknown format
// - TODO: why is the case `Err(LoftyError::UnknownFormat)` suppressed, but only for `from_buffer_inner`?
// - What is the special meaning of the return type `Ok(None)` vs `Err(LoftyError::UnknownFormat)`?
match FileType::from_buffer_inner(&buf[..buf_len]) { match FileType::from_buffer_inner(&buf[..buf_len]) {
// the file type was guessed based on these bytes
Ok((Some(f_ty), _)) => Ok(Some(f_ty)), Ok((Some(f_ty), _)) => Ok(Some(f_ty)),
// the first data block is ID3 data; this means other data can follow (e.g. APE or MP3 frames)
Ok((None, id3_len)) => { Ok((None, id3_len)) => {
self.inner // the position right after the ID3 block is the internal size value (id3_len)
// added to the length of the ID3 header (which is 10 bytes),
// as the size does not include the header itself
let position_after_id3_block = self
.inner
.seek(SeekFrom::Current(i64::from(10 + id3_len)))?; .seek(SeekFrom::Current(i64::from(10 + id3_len)))?;
let file_type_after_id3_block = {
// try to guess the file type after the ID3 block by inspecting the first 3 bytes
let mut ident = [0; 3]; let mut ident = [0; 3];
let buf_len = std::io::copy( let buf_len = std::io::copy(
&mut self.inner.by_ref().take(3), &mut self.inner.by_ref().take(ident.len() as u64),
&mut Cursor::new(&mut ident[..]), &mut Cursor::new(&mut ident[..]),
)?; )?;
self.inner.seek(SeekFrom::Start(pos))?;
if buf_len < 3 { if buf_len < 3 {
return Err(LoftyError::UnknownFormat); Err(LoftyError::UnknownFormat)
} } else if &ident == b"MAC" {
if &ident == b"MAC" {
Ok(Some(FileType::APE)) Ok(Some(FileType::APE))
} else if verify_frame_sync([ident[0], ident[1]]) { } else {
// potentially some junk bytes are between the ID3 block and the following MP3 block
// search for any possible sync bits after the ID3 block
self.inner.seek(SeekFrom::Start(position_after_id3_block))?;
if let Some(_) = search_for_frame_sync(&mut self.inner)? {
Ok(Some(FileType::MP3)) Ok(Some(FileType::MP3))
} else { } else {
Err(LoftyError::UnknownFormat) Err(LoftyError::UnknownFormat)
} }
}
};
// before returning any result for a file type, seek back to the front
self.inner.seek(SeekFrom::Start(starting_position))?;
file_type_after_id3_block
}, },
_ => Ok(None), _ => Ok(None),
} }
@ -247,3 +268,35 @@ where
{ {
Probe::open(path)?.read(read_properties) Probe::open(path)?.read(read_properties)
} }
#[cfg(test)]
mod tests {
use crate::Probe;
#[test]
fn mp3_file_id3v2_3() {
// test data that contains 4 bytes of junk (0x20) between the ID3 portion and the first MP3 frame
let data: [&[u8]; 4] = [
// ID3v2.3 header (10 bytes)
&[0x49, 0x44, 0x33, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23],
// TALB frame
&[
0x54, 0x41, 0x4C, 0x42, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x01, 0xFF, 0xFE, 0x61,
0x00, 0x61, 0x00, 0x61, 0x00, 0x61, 0x00, 0x61, 0x00, 0x61, 0x00, 0x61, 0x00, 0x61,
0x00, 0x61, 0x00, 0x61, 0x00, 0x61, 0x00,
],
// 4 bytes of junk
&[0x20, 0x20, 0x20, 0x20],
// start of MP3 frame (not all bytes are shown in this slice)
&[
0xFF, 0xFB, 0x50, 0xC4, 0x00, 0x03, 0xC0, 0x00, 0x01, 0xA4, 0x00, 0x00, 0x00, 0x20,
0x00, 0x00, 0x34, 0x80, 0x00, 0x00, 0x04,
],
];
let data: Vec<u8> = data.into_iter().flatten().cloned().collect();
let data = std::io::Cursor::new(&data);
let probe = Probe::new(data);
let probe = probe.guess_file_type().unwrap();
matches!(probe.file_type(), Some(crate::FileType::MP3));
}
}