diff --git a/src/api.rs b/src/api.rs
index 629308e..0bf492f 100644
--- a/src/api.rs
+++ b/src/api.rs
@@ -25,7 +25,6 @@ pub fn yaml_parser_new<'r>() -> yaml_parser_t<'r> {
         eof: false,
         buffer: VecDeque::with_capacity(INPUT_BUFFER_SIZE),
         unread: 0,
-        raw_buffer: VecDeque::with_capacity(INPUT_RAW_BUFFER_SIZE),
         encoding: YAML_ANY_ENCODING,
         offset: 0,
         mark: yaml_mark_t::default(),
@@ -49,7 +48,6 @@ pub fn yaml_parser_new<'r>() -> yaml_parser_t<'r> {
 
 /// Destroy a parser.
 pub fn yaml_parser_delete(parser: &mut yaml_parser_t) {
-    parser.raw_buffer.clear();
     parser.buffer.clear();
     parser.tokens.clear();
     parser.indents.clear();
@@ -70,7 +68,10 @@ pub fn yaml_parser_set_input_string<'r>(parser: &mut yaml_parser_t<'r>, input: &
 }
 
 /// Set a generic input handler.
-pub fn yaml_parser_set_input<'r>(parser: &mut yaml_parser_t<'r>, input: &'r mut dyn std::io::Read) {
+pub fn yaml_parser_set_input<'r>(
+    parser: &mut yaml_parser_t<'r>,
+    input: &'r mut dyn std::io::BufRead,
+) {
     assert!((parser.read_handler).is_none());
     parser.read_handler = Some(input);
 }
diff --git a/src/bin/run-parser-test-suite.rs b/src/bin/run-parser-test-suite.rs
index 99c8282..764ec0a 100644
--- a/src/bin/run-parser-test-suite.rs
+++ b/src/bin/run-parser-test-suite.rs
@@ -29,7 +29,8 @@ pub(crate) fn test_main(
 ) -> Result<(), Box<dyn Error>> {
     let mut parser = yaml_parser_new();
 
-    yaml_parser_set_input(&mut parser, stdin);
+    let mut stdin = std::io::BufReader::new(stdin);
+    yaml_parser_set_input(&mut parser, &mut stdin);
 
     loop {
         let event = match yaml_parser_parse(&mut parser) {
diff --git a/src/error.rs b/src/error.rs
index f55310e..cea4cd6 100644
--- a/src/error.rs
+++ b/src/error.rs
@@ -24,6 +24,12 @@ pub enum ReaderError {
         offset: usize,
         value: i32,
     },
+    #[error("input stream produced an invalid byte order marker")]
+    InvalidBom,
+    #[error("invalid UTF-8 byte: {value:x}")]
+    InvalidUtf8 { value: u8 },
+    #[error("invalid UTF-16 unpaired surrogate: {value:x}")]
+    InvalidUtf16 { value: u16 },
     #[error(transparent)]
     Io(#[from] std::io::Error),
 }
diff --git a/src/reader.rs b/src/reader.rs
index 7313b65..4a385f3 100644
--- a/src/reader.rs
+++ b/src/reader.rs
@@ -1,14 +1,13 @@
+use std::io::BufRead;
+
 use alloc::collections::VecDeque;
 
-use crate::api::INPUT_RAW_BUFFER_SIZE;
-use crate::macros::vecdeque_starts_with;
 use crate::{
-    yaml_parser_t, ReaderError, YAML_ANY_ENCODING, YAML_UTF16BE_ENCODING, YAML_UTF16LE_ENCODING,
-    YAML_UTF8_ENCODING,
+    yaml_encoding_t, yaml_parser_t, ReaderError, YAML_ANY_ENCODING, YAML_UTF16BE_ENCODING,
+    YAML_UTF16LE_ENCODING, YAML_UTF8_ENCODING,
 };
 
 fn yaml_parser_set_reader_error(
-    _parser: &mut yaml_parser_t,
     problem: &'static str,
     offset: usize,
     value: i32,
@@ -20,168 +19,276 @@ fn yaml_parser_set_reader_error(
     })
 }
 
-const BOM_UTF8: &[u8] = b"\xEF\xBB\xBF";
-const BOM_UTF16LE: &[u8] = b"\xFF\xFE";
-const BOM_UTF16BE: &[u8] = b"\xFE\xFF";
+const BOM_UTF8: [u8; 3] = [0xef, 0xbb, 0xbf];
+const BOM_UTF16LE: [u8; 2] = [0xff, 0xfe];
+const BOM_UTF16BE: [u8; 2] = [0xfe, 0xff];
 
-fn yaml_parser_determine_encoding(parser: &mut yaml_parser_t) -> Result<(), ReaderError> {
-    while !parser.eof && parser.raw_buffer.len() < 3 {
-        yaml_parser_update_raw_buffer(parser)?;
-    }
-    if vecdeque_starts_with(&parser.raw_buffer, BOM_UTF16LE) {
-        parser.encoding = YAML_UTF16LE_ENCODING;
-        parser.raw_buffer.drain(0..2);
-        parser.offset += 2;
-    } else if vecdeque_starts_with(&parser.raw_buffer, BOM_UTF16BE) {
-        parser.encoding = YAML_UTF16BE_ENCODING;
-        parser.raw_buffer.drain(0..2);
-        parser.offset += 2;
-    } else if vecdeque_starts_with(&parser.raw_buffer, BOM_UTF8) {
-        parser.encoding = YAML_UTF8_ENCODING;
-        parser.raw_buffer.drain(0..3);
-        parser.offset += 3;
-    } else {
-        parser.encoding = YAML_UTF8_ENCODING;
-    }
-    Ok(())
-}
-
-fn yaml_parser_update_raw_buffer(parser: &mut yaml_parser_t) -> Result<(), ReaderError> {
-    if parser.raw_buffer.len() >= INPUT_RAW_BUFFER_SIZE {
-        return Ok(());
-    }
-    if parser.eof {
-        return Ok(());
+/// Detect the input encoding from a leading byte order mark.
+///
+/// Returns `Ok(None)` when the reader is already at end of input. A BOM, if
+/// present, is consumed; input without one is assumed to be UTF-8.
+fn yaml_parser_determine_encoding(
+    reader: &mut dyn BufRead,
+) -> Result<Option<yaml_encoding_t>, ReaderError> {
+    let initial_bytes = reader.fill_buf()?;
+    if initial_bytes.is_empty() {
+        return Ok(None);
     }
 
-    let len_before = parser.raw_buffer.len();
-    debug_assert!(len_before < INPUT_RAW_BUFFER_SIZE);
-    parser.raw_buffer.resize(INPUT_RAW_BUFFER_SIZE, 0);
-    let contiguous = parser.raw_buffer.make_contiguous();
-    let write_to = &mut contiguous[len_before..];
-
-    let size_read = parser
-        .read_handler
-        .as_mut()
-        .expect("non-null read handler")
-        .read(write_to)?;
-
-    let valid_size = len_before + size_read;
-    parser.raw_buffer.truncate(valid_size);
-    if size_read == 0 {
-        parser.eof = true;
-    }
-    Ok(())
-}
-
-fn utf8_char_width_and_initial_value(initial: u8) -> (usize, u32) {
-    let initial = initial as u32;
-    if initial & 0x80 == 0 {
-        (1, initial & 0x7f)
-    } else if initial & 0xE0 == 0xC0 {
-        (2, initial & 0x1f)
-    } else if initial & 0xF0 == 0xE0 {
-        (3, initial & 0x0f)
-    } else if initial & 0xF8 == 0xF0 {
-        (4, initial & 0x07)
-    } else {
-        (0, 0)
-    }
-}
-
-enum Utf8Error {
-    Incomplete,
-    InvalidLeadingOctet,
-    InvalidTrailingOctet(usize),
-    InvalidLength,
-    InvalidUnicode(u32),
-}
-
-fn read_char_utf8(raw: &mut VecDeque<u8>) -> Option<Result<char, Utf8Error>> {
-    let first = raw.front().copied()?;
-    let (width, mut value) = utf8_char_width_and_initial_value(first);
-    if width == 0 {
-        return Some(Err(Utf8Error::InvalidLeadingOctet));
-    }
-    if raw.len() < width {
-        return Some(Err(Utf8Error::Incomplete));
-    }
-    for (i, trailing) in raw.iter().enumerate().take(width).skip(1) {
-        if trailing & 0xc0 != 0x80 {
-            return Some(Err(Utf8Error::InvalidTrailingOctet(i)));
+    match initial_bytes[0] {
+        0xef => {
+            // 0xEF also begins ordinary three-byte UTF-8 characters
+            // (U+F000..=U+FFFF), so only strip it when the whole BOM is there
+            // and otherwise leave the input untouched for the UTF-8 decoder.
+            if initial_bytes.starts_with(&BOM_UTF8) {
+                reader.consume(BOM_UTF8.len());
+            }
+            Ok(Some(YAML_UTF8_ENCODING))
         }
-        value <<= 6;
-        value += *trailing as u32 & 0x3f;
-    }
-    if !(width == 1
-        || width == 2 && value >= 0x80
-        || width == 3 && value >= 0x800
-        || width == 4 && value >= 0x10000)
-    {
-        return Some(Err(Utf8Error::InvalidLength));
-    }
-    if let Some(ch) = char::from_u32(value) {
-        raw.drain(..width);
-        Some(Ok(ch))
-    } else {
-        Some(Err(Utf8Error::InvalidUnicode(value)))
+        0xff | 0xfe => {
+            let mut bom = [0; 2];
+            reader.read_exact(&mut bom)?;
+            if bom == BOM_UTF16LE {
+                Ok(Some(YAML_UTF16LE_ENCODING))
+            } else if bom == BOM_UTF16BE {
+                Ok(Some(YAML_UTF16BE_ENCODING))
+            } else {
+                Err(ReaderError::InvalidBom)
+            }
+        }
+        _ => Ok(Some(YAML_UTF8_ENCODING)),
     }
 }
 
-enum Utf16Error {
-    Incomplete,
-    UnexpectedLowSurrogateArea(u32),
-    ExpectedLowSurrogateArea(u32),
-    InvalidUnicode(u32),
-}
-
-fn read_char_utf16<const BIG_ENDIAN: bool>(
-    raw: &mut VecDeque<u8>,
-) -> Option<Result<char, Utf16Error>> {
-    if raw.is_empty() {
-        return None;
-    }
-    if raw.len() < 2 {
-        return Some(Err(Utf16Error::Incomplete));
-    }
-    let bytes = [raw[0], raw[1]];
-    let mut value = if BIG_ENDIAN {
-        u16::from_be_bytes(bytes) as u32
-    } else {
-        u16::from_le_bytes(bytes) as u32
+/// Decode as much UTF-8 as the reader currently has buffered into `out`.
+///
+/// Returns `Ok(false)` once the reader reports end of input, `Ok(true)`
+/// otherwise. `offset` is advanced by the number of bytes decoded.
+fn read_utf8_buffered(
+    reader: &mut dyn BufRead,
+    out: &mut VecDeque<char>,
+    offset: &mut usize,
+) -> Result<bool, ReaderError> {
+    let available = loop {
+        match reader.fill_buf() {
+            Ok([]) => return Ok(false),
+            Ok(available) => break available,
+            Err(err) if err.kind() == std::io::ErrorKind::Interrupted => continue,
+            Err(err) => return Err(err.into()),
+        }
     };
 
-    if value & 0xfc00 == 0xdc00 {
-        return Some(Err(Utf16Error::UnexpectedLowSurrogateArea(value)));
-    }
-    let width;
-    if value & 0xfc00 == 0xd800 {
-        width = 4;
-        if raw.len() < width {
-            return Some(Err(Utf16Error::Incomplete));
-        }
-        let bytes2 = [raw[2], raw[3]];
-        let value2 = if BIG_ENDIAN {
-            u16::from_be_bytes(bytes2) as u32
-        } else {
-            u16::from_le_bytes(bytes2) as u32
-        };
-        if value2 & 0xfc00 != 0xdc00 {
-            return Some(Err(Utf16Error::ExpectedLowSurrogateArea(value2)));
-        }
-        value = (0x10000 + (value & 0x3ff)) << (10 + (value2 & 0x3ff));
-    } else {
-        width = 2;
-    }
-    if let Some(ch) = char::from_u32(value) {
-        raw.drain(..width);
-        Some(Ok(ch))
-    } else {
-        Some(Err(Utf16Error::InvalidUnicode(value)))
+    match core::str::from_utf8(available) {
+        Ok(valid) => {
+            let used = valid.len();
+            // The entire contents of the input buffer was valid UTF-8.
+            for ch in valid.chars() {
+                push_char(out, ch, *offset)?;
+                *offset += ch.len_utf8();
+            }
+            reader.consume(used);
+            Ok(true)
+        }
+        Err(err) => {
+            let valid_bytes = err.valid_up_to();
+
+            // If some of the buffer contents were valid, append that to the
+            // output.
+            let valid = unsafe {
+                // SAFETY: This is safe because of `valid_up_to()`.
+                core::str::from_utf8_unchecked(&available[..valid_bytes])
+            };
+            for ch in valid.chars() {
+                push_char(out, ch, *offset)?;
+                *offset += ch.len_utf8();
+            }
+
+            match err.error_len() {
+                Some(_invalid_len) => {
+                    return Err(ReaderError::InvalidUtf8 {
+                        value: available[valid_bytes],
+                    });
+                }
+                None => {
+                    if valid_bytes != 0 {
+                        // Some valid UTF-8 characters were present, and the
+                        // tail end of the buffer was an incomplete sequence.
+                        // Leave the incomplete sequence in the buffer.
+                        reader.consume(valid_bytes);
+                        Ok(true)
+                    } else {
+                        // The beginning of the buffer was an incomplete UTF-8
+                        // sequence. Read the whole character unbuffered.
+                        //
+                        // This will return `UnexpectedEof` if the sequence
+                        // cannot be completed. Note that `read_exact()` handles
+                        // interrupt automatically.
+                        let initial = available[0];
+                        read_utf8_char_unbuffered(reader, out, initial, offset)?;
+                        Ok(true)
+                    }
+                }
+            }
+        }
     }
 }
 
-fn push_char(parser: &mut yaml_parser_t, ch: char) -> Result<(), ReaderError> {
+/// Read one UTF-8 character with `read_exact()`, using `initial` (its leading
+/// byte, not yet consumed) to determine how many bytes to read.
+fn read_utf8_char_unbuffered(
+    reader: &mut dyn BufRead,
+    out: &mut VecDeque<char>,
+    initial: u8,
+    offset: &mut usize,
+) -> Result<(), ReaderError> {
+    let width = utf8_char_width(initial);
+    let mut buffer = [0; 4];
+    reader.read_exact(&mut buffer[..width])?;
+    if let Ok(valid) = core::str::from_utf8(&buffer[..width]) {
+        // We read a whole, valid character.
+        let Some(ch) = valid.chars().next() else {
+            unreachable!()
+        };
+        push_char(out, ch, *offset)?;
+        *offset += width;
+        Ok(())
+    } else {
+        // Since we read the exact character width, the only
+        // possible error here is invalid Unicode.
+        Err(ReaderError::InvalidUtf8 { value: buffer[0] })
+    }
+}
+
+/// Decode as much UTF-16 as the reader currently has buffered into `out`.
+///
+/// Returns `Ok(false)` once the reader reports end of input, `Ok(true)`
+/// otherwise. `offset` is advanced by the number of bytes decoded.
+fn read_utf16_buffered<const BIG_ENDIAN: bool>(
+    reader: &mut dyn BufRead,
+    out: &mut VecDeque<char>,
+    offset: &mut usize,
+) -> Result<bool, ReaderError> {
+    let available = loop {
+        match reader.fill_buf() {
+            Ok([]) => return Ok(false),
+            Ok(available) => break available,
+            Err(err) if err.kind() == std::io::ErrorKind::Interrupted => continue,
+            Err(err) => return Err(err.into()),
+        }
+    };
+
+    let chunks = available.chunks_exact(2).map(|chunk| {
+        let [a, b] = chunk else { unreachable!() };
+        if BIG_ENDIAN {
+            u16::from_be_bytes([*a, *b])
+        } else {
+            u16::from_le_bytes([*a, *b])
+        }
+    });
+
+    let mut used = 0;
+    for ch in core::char::decode_utf16(chunks) {
+        match ch {
+            Ok(ch) => {
+                push_char(out, ch, *offset)?;
+                // `len_utf16()` counts 16-bit code units, while `offset` and
+                // `consume()` both count bytes.
+                let n = ch.len_utf16() * 2;
+                *offset += n;
+                used += n;
+            }
+            Err(_) => {
+                // An unpaired surrogate may either be a corrupt stream, but it
+                // can also be that the buffer just happens to contain the first
+                // half of a surrogate pair. Consume all of the valid bytes in
+                // the buffer first, and then handle the unpaired surrogate in
+                // the "slow" path (`read_utf16_char_unbuffered`) the next time
+                // we are called.
+                break;
+            }
+        }
+    }
+
+    if used != 0 {
+        // `offset` was already advanced once per decoded character above.
+        reader.consume(used);
+        Ok(true)
+    } else {
+        // Nothing could be decoded: the buffer holds a single trailing byte
+        // or begins with a surrogate whose partner is not buffered yet.
+        debug_assert!(!available.is_empty());
+        read_utf16_char_unbuffered::<BIG_ENDIAN>(reader, out, offset)?;
+        Ok(true)
+    }
+}
+
+/// Read one UTF-16 character (one or two code units) with `read_exact()`.
+fn read_utf16_char_unbuffered<const BIG_ENDIAN: bool>(
+    reader: &mut dyn BufRead,
+    out: &mut VecDeque<char>,
+    offset: &mut usize,
+) -> Result<(), ReaderError> {
+    let mut buffer = [0; 2];
+    reader.read_exact(&mut buffer)?;
+    let first = if BIG_ENDIAN {
+        u16::from_be_bytes(buffer)
+    } else {
+        u16::from_le_bytes(buffer)
+    };
+
+    if is_utf16_surrogate(first) {
+        reader.read_exact(&mut buffer)?;
+        let second = if BIG_ENDIAN {
+            u16::from_be_bytes(buffer)
+        } else {
+            u16::from_le_bytes(buffer)
+        };
+
+        match core::char::decode_utf16([first, second]).next() {
+            Some(Ok(ch)) => {
+                push_char(out, ch, *offset)?;
+                *offset += 4;
+                Ok(())
+            }
+            Some(Err(err)) => Err(ReaderError::InvalidUtf16 {
+                value: err.unpaired_surrogate(),
+            }),
+            None => unreachable!(),
+        }
+    } else {
+        match core::char::decode_utf16([first]).next() {
+            Some(Ok(ch)) => {
+                push_char(out, ch, *offset)?;
+                *offset += 2;
+                Ok(())
+            }
+            Some(Err(_)) | None => unreachable!(),
+        }
+    }
+}
+
+/// Length in bytes of the UTF-8 sequence introduced by `initial`, or 0 if
+/// `initial` matches none of the four leading-byte patterns.
+fn utf8_char_width(initial: u8) -> usize {
+    if initial & 0x80 == 0 {
+        1
+    } else if initial & 0xE0 == 0xC0 {
+        2
+    } else if initial & 0xF0 == 0xE0 {
+        3
+    } else if initial & 0xF8 == 0xF0 {
+        4
+    } else {
+        0
+    }
+}
+
+/// Whether `value` lies in the UTF-16 surrogate range (high or low).
+fn is_utf16_surrogate(value: u16) -> bool {
+    matches!(value, 0xD800..=0xDFFF)
+}
+
+fn push_char(out: &mut VecDeque<char>, ch: char, offset: usize) -> Result<(), ReaderError> {
     if !(ch == '\x09'
         || ch == '\x0A'
         || ch == '\x0D'
@@ -191,16 +298,9 @@ fn push_char(parser: &mut yaml_parser_t, ch: char) -> Result<(), ReaderError> {
         || ch >= '\u{E000}' && ch <= '\u{FFFD}'
         || ch >= '\u{10000}' && ch <= '\u{10FFFF}')
     {
-        return yaml_parser_set_reader_error(
-            parser,
-            "control characters are not allowed",
-            parser.offset,
-            ch as _,
-        );
+        return yaml_parser_set_reader_error("control characters are not allowed", offset, ch as _);
     }
-    parser.buffer.push_back(ch);
-    parser.offset += ch.len_utf8();
-    parser.unread += 1;
+    out.push_back(ch);
     Ok(())
 }
 
@@ -208,145 +308,49 @@ pub(crate) fn yaml_parser_update_buffer(
     parser: &mut yaml_parser_t,
     length: usize,
 ) -> Result<(), ReaderError> {
-    let mut first = true;
-    assert!((parser.read_handler).is_some());
-    if parser.eof && parser.raw_buffer.is_empty() {
-        return Ok(());
-    }
+    let reader = parser.read_handler.as_deref_mut().expect("no read handler");
     if parser.unread >= length {
         return Ok(());
     }
     if parser.encoding == YAML_ANY_ENCODING {
-        yaml_parser_determine_encoding(parser)?;
+        if let Some(encoding) = yaml_parser_determine_encoding(reader)? {
+            parser.encoding = encoding;
+        } else {
+            parser.eof = true;
+            return Ok(());
+        }
     }
 
     while parser.unread < length {
-        if parser.eof && parser.raw_buffer.is_empty() {
+        if parser.eof {
             return Ok(());
         }
-        if !first || parser.raw_buffer.is_empty() {
-            yaml_parser_update_raw_buffer(parser)?;
-        }
-        first = false;
-        match parser.encoding {
+
+        let tokens_before = parser.buffer.len();
+
+        let not_eof = match parser.encoding {
+            YAML_ANY_ENCODING => unreachable!(),
             YAML_UTF8_ENCODING => {
-                match read_char_utf8(&mut parser.raw_buffer) {
-                    Some(Ok(ch)) => {
-                        push_char(parser, ch)?;
-                    }
-                    Some(Err(Utf8Error::Incomplete)) => {
-                        if parser.eof {
-                            return yaml_parser_set_reader_error(
-                                parser,
-                                "incomplete UTF-8 octet sequence",
-                                parser.offset,
-                                -1,
-                            );
-                        } else {
-                            // Read more
-                        }
-                    }
-                    Some(Err(Utf8Error::InvalidLeadingOctet)) => {
-                        return yaml_parser_set_reader_error(
-                            parser,
-                            "invalid leading UTF-8 octet",
-                            parser.offset,
-                            parser.raw_buffer[0] as _,
-                        );
-                    }
-                    Some(Err(Utf8Error::InvalidTrailingOctet(offset))) => {
-                        return yaml_parser_set_reader_error(
-                            parser,
-                            "invalid trailing UTF-8 octet",
-                            parser.offset + offset,
-                            parser.raw_buffer[offset] as _,
-                        );
-                    }
-                    Some(Err(Utf8Error::InvalidLength)) => {
-                        return yaml_parser_set_reader_error(
-                            parser,
-                            "invalid length of a UTF-8 sequence",
-                            parser.offset,
-                            -1,
-                        );
-                    }
-                    Some(Err(Utf8Error::InvalidUnicode(value))) => {
-                        return yaml_parser_set_reader_error(
-                            parser,
-                            "invalid Unicode character",
-                            parser.offset,
-                            value as _,
-                        );
-                    }
-                    None => (),
-                }
+                read_utf8_buffered(reader, &mut parser.buffer, &mut parser.offset)?
             }
-            YAML_UTF16LE_ENCODING | YAML_UTF16BE_ENCODING => {
-                let is_big_endian = parser.encoding == YAML_UTF16BE_ENCODING;
-                let res = if is_big_endian {
-                    read_char_utf16::<true>(&mut parser.raw_buffer)
-                } else {
-                    read_char_utf16::<false>(&mut parser.raw_buffer)
-                };
-                match res {
-                    Some(Ok(ch)) => {
-                        push_char(parser, ch)?;
-                    }
-                    Some(Err(Utf16Error::Incomplete)) => {
-                        if parser.eof {
-                            return yaml_parser_set_reader_error(
-                                parser,
-                                "incomplete UTF-16 character",
-                                parser.offset,
-                                -1,
-                            );
-                        } else {
-                            // Read more
-                        }
-                    }
-                    Some(Err(Utf16Error::UnexpectedLowSurrogateArea(value))) => {
-                        return yaml_parser_set_reader_error(
-                            parser,
-                            "unexpected low surrogate area",
-                            parser.offset,
-                            value as i32,
-                        );
-                    }
-                    // Some(Err(Utf16Error::IncompleteSurrogatePair)) => {
-                    //     return yaml_parser_set_reader_error(
-                    //         parser,
-                    //         "incomplete UTF-16 surrogate pair",
-                    //         parser.offset,
-                    //         -1,
-                    //     );
-                    // }
-                    Some(Err(Utf16Error::ExpectedLowSurrogateArea(value))) => {
-                        return yaml_parser_set_reader_error(
-                            parser,
-                            "expected low surrogate area",
-                            parser.offset + 2,
-                            value as i32,
-                        );
-                    }
-                    Some(Err(Utf16Error::InvalidUnicode(value))) => {
-                        return yaml_parser_set_reader_error(
-                            parser,
-                            "invalid Unicode character",
-                            parser.offset,
-                            value as i32,
-                        );
-                    }
-                    None => (),
-                }
+            YAML_UTF16LE_ENCODING => {
+                read_utf16_buffered::<false>(reader, &mut parser.buffer, &mut parser.offset)?
             }
-            _ => {
-                panic!("unhandled encoded enum variant")
+            YAML_UTF16BE_ENCODING => {
+                read_utf16_buffered::<true>(reader, &mut parser.buffer, &mut parser.offset)?
             }
+        };
+
+        let num_read = parser.buffer.len() - tokens_before;
+        parser.unread += num_read;
+        if !not_eof {
+            parser.eof = true;
+            return Ok(());
         }
     }
 
     if parser.offset >= (!0_usize).wrapping_div(2_usize) {
-        return yaml_parser_set_reader_error(parser, "input is too long", parser.offset, -1);
+        return yaml_parser_set_reader_error("input is too long", parser.offset, -1);
     }
     Ok(())
 }
diff --git a/src/yaml.rs b/src/yaml.rs
index 2380674..163930f 100644
--- a/src/yaml.rs
+++ b/src/yaml.rs
@@ -628,7 +628,7 @@ pub struct yaml_alias_data_t {
 #[non_exhaustive]
 pub struct yaml_parser_t<'r> {
     /// Read handler.
-    pub(crate) read_handler: Option<&'r mut dyn std::io::Read>,
+    pub(crate) read_handler: Option<&'r mut dyn std::io::BufRead>,
     /// EOF flag
     pub(crate) eof: bool,
     /// The working buffer.
@@ -637,12 +637,6 @@ pub struct yaml_parser_t<'r> {
     pub(crate) buffer: VecDeque<char>,
     /// The number of unread characters in the buffer.
     pub(crate) unread: usize,
-    /// The raw buffer.
-    ///
-    /// This is the raw unchecked input from the read handler (for example, it
-    /// may be UTF-16 encoded).
-    // TODO: Get rid of this and ask users to provide something implementing `BufRead` instead of `Read`.
-    pub(crate) raw_buffer: VecDeque<u8>,
     /// The input encoding.
     pub(crate) encoding: yaml_encoding_t,
     /// The offset of the current position (in bytes).