Parser: Replace internal buffering with std::io::BufRead

This commit is contained in:
Simon Ask Ulsnes 2024-02-03 08:19:12 +01:00
parent e28400ee5f
commit 5bf087e0a3
5 changed files with 281 additions and 297 deletions

View file

@ -25,7 +25,6 @@ pub fn yaml_parser_new<'r>() -> yaml_parser_t<'r> {
eof: false,
buffer: VecDeque::with_capacity(INPUT_BUFFER_SIZE),
unread: 0,
raw_buffer: VecDeque::with_capacity(INPUT_RAW_BUFFER_SIZE),
encoding: YAML_ANY_ENCODING,
offset: 0,
mark: yaml_mark_t::default(),
@ -49,7 +48,6 @@ pub fn yaml_parser_new<'r>() -> yaml_parser_t<'r> {
/// Destroy a parser.
pub fn yaml_parser_delete(parser: &mut yaml_parser_t) {
parser.raw_buffer.clear();
parser.buffer.clear();
parser.tokens.clear();
parser.indents.clear();
@ -70,7 +68,10 @@ pub fn yaml_parser_set_input_string<'r>(parser: &mut yaml_parser_t<'r>, input: &
}
/// Set a generic input handler.
pub fn yaml_parser_set_input<'r>(parser: &mut yaml_parser_t<'r>, input: &'r mut dyn std::io::Read) {
/// Install `input` as the parser's read handler.
///
/// The handler must implement [`std::io::BufRead`]; the parser borrows it
/// mutably for the lifetime `'r`.
///
/// # Panics
///
/// Panics if a read handler has already been installed on `parser`.
pub fn yaml_parser_set_input<'r>(
    parser: &mut yaml_parser_t<'r>,
    input: &'r mut dyn std::io::BufRead,
) {
    let slot = &mut parser.read_handler;
    // A parser may only ever be given a single input source.
    assert!(slot.is_none());
    *slot = Some(input);
}

View file

@ -29,7 +29,8 @@ pub(crate) fn test_main(
) -> Result<(), Box<dyn Error>> {
let mut parser = yaml_parser_new();
yaml_parser_set_input(&mut parser, stdin);
let mut stdin = std::io::BufReader::new(stdin);
yaml_parser_set_input(&mut parser, &mut stdin);
loop {
let event = match yaml_parser_parse(&mut parser) {

View file

@ -24,6 +24,12 @@ pub enum ReaderError {
offset: usize,
value: i32,
},
#[error("input stream produced an invalid byte order marker")]
InvalidBom,
#[error("invalid UTF-8 byte at offset: {value:x}")]
InvalidUtf8 { value: u8 },
#[error("invalid UTF-16 unpaired surrogate: {value:x}")]
InvalidUtf16 { value: u16 },
#[error(transparent)]
Io(#[from] std::io::Error),
}

View file

@ -1,14 +1,13 @@
use std::io::BufRead;
use alloc::collections::VecDeque;
use crate::api::INPUT_RAW_BUFFER_SIZE;
use crate::macros::vecdeque_starts_with;
use crate::{
yaml_parser_t, ReaderError, YAML_ANY_ENCODING, YAML_UTF16BE_ENCODING, YAML_UTF16LE_ENCODING,
YAML_UTF8_ENCODING,
yaml_encoding_t, yaml_parser_t, ReaderError, YAML_ANY_ENCODING, YAML_UTF16BE_ENCODING,
YAML_UTF16LE_ENCODING, YAML_UTF8_ENCODING,
};
fn yaml_parser_set_reader_error<T>(
_parser: &mut yaml_parser_t,
problem: &'static str,
offset: usize,
value: i32,
@ -20,168 +19,254 @@ fn yaml_parser_set_reader_error<T>(
})
}
const BOM_UTF8: &[u8] = b"\xEF\xBB\xBF";
const BOM_UTF16LE: &[u8] = b"\xFF\xFE";
const BOM_UTF16BE: &[u8] = b"\xFE\xFF";
const BOM_UTF8: [u8; 3] = [0xef, 0xbb, 0xbf];
const BOM_UTF16LE: [u8; 2] = [0xff, 0xfe];
const BOM_UTF16BE: [u8; 2] = [0xfe, 0xff];
fn yaml_parser_determine_encoding(parser: &mut yaml_parser_t) -> Result<(), ReaderError> {
while !parser.eof && parser.raw_buffer.len() < 3 {
yaml_parser_update_raw_buffer(parser)?;
}
if vecdeque_starts_with(&parser.raw_buffer, BOM_UTF16LE) {
parser.encoding = YAML_UTF16LE_ENCODING;
parser.raw_buffer.drain(0..2);
parser.offset += 2;
} else if vecdeque_starts_with(&parser.raw_buffer, BOM_UTF16BE) {
parser.encoding = YAML_UTF16BE_ENCODING;
parser.raw_buffer.drain(0..2);
parser.offset += 2;
} else if vecdeque_starts_with(&parser.raw_buffer, BOM_UTF8) {
parser.encoding = YAML_UTF8_ENCODING;
parser.raw_buffer.drain(0..3);
parser.offset += 3;
} else {
parser.encoding = YAML_UTF8_ENCODING;
}
Ok(())
}
fn yaml_parser_update_raw_buffer(parser: &mut yaml_parser_t) -> Result<(), ReaderError> {
if parser.raw_buffer.len() >= INPUT_RAW_BUFFER_SIZE {
return Ok(());
}
if parser.eof {
return Ok(());
fn yaml_parser_determine_encoding(
reader: &mut dyn BufRead,
) -> Result<Option<yaml_encoding_t>, ReaderError> {
let initial_bytes = reader.fill_buf()?;
if initial_bytes.is_empty() {
return Ok(None);
}
let len_before = parser.raw_buffer.len();
debug_assert!(len_before < INPUT_RAW_BUFFER_SIZE);
parser.raw_buffer.resize(INPUT_RAW_BUFFER_SIZE, 0);
let contiguous = parser.raw_buffer.make_contiguous();
let write_to = &mut contiguous[len_before..];
let size_read = parser
.read_handler
.as_mut()
.expect("non-null read handler")
.read(write_to)?;
let valid_size = len_before + size_read;
parser.raw_buffer.truncate(valid_size);
if size_read == 0 {
parser.eof = true;
}
Ok(())
}
fn utf8_char_width_and_initial_value(initial: u8) -> (usize, u32) {
let initial = initial as u32;
if initial & 0x80 == 0 {
(1, initial & 0x7f)
} else if initial & 0xE0 == 0xC0 {
(2, initial & 0x1f)
} else if initial & 0xF0 == 0xE0 {
(3, initial & 0x0f)
} else if initial & 0xF8 == 0xF0 {
(4, initial & 0x07)
} else {
(0, 0)
}
}
enum Utf8Error {
Incomplete,
InvalidLeadingOctet,
InvalidTrailingOctet(usize),
InvalidLength,
InvalidUnicode(u32),
}
fn read_char_utf8(raw: &mut VecDeque<u8>) -> Option<Result<char, Utf8Error>> {
let first = raw.front().copied()?;
let (width, mut value) = utf8_char_width_and_initial_value(first);
if width == 0 {
return Some(Err(Utf8Error::InvalidLeadingOctet));
}
if raw.len() < width {
return Some(Err(Utf8Error::Incomplete));
}
for (i, trailing) in raw.iter().enumerate().take(width).skip(1) {
if trailing & 0xc0 != 0x80 {
return Some(Err(Utf8Error::InvalidTrailingOctet(i)));
match initial_bytes[0] {
0xef => {
let mut bom = [0; 3];
reader.read_exact(&mut bom)?;
if bom == BOM_UTF8 {
Ok(Some(YAML_UTF8_ENCODING))
} else {
Err(ReaderError::InvalidBom)
}
}
value <<= 6;
value += *trailing as u32 & 0x3f;
}
if !(width == 1
|| width == 2 && value >= 0x80
|| width == 3 && value >= 0x800
|| width == 4 && value >= 0x10000)
{
return Some(Err(Utf8Error::InvalidLength));
}
if let Some(ch) = char::from_u32(value) {
raw.drain(..width);
Some(Ok(ch))
} else {
Some(Err(Utf8Error::InvalidUnicode(value)))
0xff | 0xfe => {
let mut bom = [0; 2];
reader.read_exact(&mut bom)?;
if bom == BOM_UTF16LE {
Ok(Some(YAML_UTF16LE_ENCODING))
} else if bom == BOM_UTF16BE {
Ok(Some(YAML_UTF16BE_ENCODING))
} else {
Err(ReaderError::InvalidBom)
}
}
_ => Ok(Some(YAML_UTF8_ENCODING)),
}
}
enum Utf16Error {
Incomplete,
UnexpectedLowSurrogateArea(u32),
ExpectedLowSurrogateArea(u32),
InvalidUnicode(u32),
}
fn read_char_utf16<const BIG_ENDIAN: bool>(
raw: &mut VecDeque<u8>,
) -> Option<Result<char, Utf16Error>> {
if raw.is_empty() {
return None;
}
if raw.len() < 2 {
return Some(Err(Utf16Error::Incomplete));
}
let bytes = [raw[0], raw[1]];
let mut value = if BIG_ENDIAN {
u16::from_be_bytes(bytes) as u32
} else {
u16::from_le_bytes(bytes) as u32
fn read_utf8_buffered(
reader: &mut dyn BufRead,
out: &mut VecDeque<char>,
offset: &mut usize,
) -> Result<bool, ReaderError> {
let available = loop {
match reader.fill_buf() {
Ok([]) => return Ok(false),
Ok(available) => break available,
Err(err) if err.kind() == std::io::ErrorKind::Interrupted => continue,
Err(err) => return Err(err.into()),
}
};
if value & 0xfc00 == 0xdc00 {
return Some(Err(Utf16Error::UnexpectedLowSurrogateArea(value)));
}
let width;
if value & 0xfc00 == 0xd800 {
width = 4;
if raw.len() < width {
return Some(Err(Utf16Error::Incomplete));
}
let bytes2 = [raw[2], raw[3]];
let value2 = if BIG_ENDIAN {
u16::from_be_bytes(bytes2) as u32
} else {
u16::from_le_bytes(bytes2) as u32
};
if value2 & 0xfc00 != 0xdc00 {
return Some(Err(Utf16Error::ExpectedLowSurrogateArea(value2)));
}
value = (0x10000 + (value & 0x3ff)) << (10 + (value2 & 0x3ff));
} else {
width = 2;
}
if let Some(ch) = char::from_u32(value) {
raw.drain(..width);
Some(Ok(ch))
} else {
Some(Err(Utf16Error::InvalidUnicode(value)))
match core::str::from_utf8(available) {
Ok(valid) => {
let used = valid.len();
// The entire contents of the input buffer was valid UTF-8.
for ch in valid.chars() {
push_char(out, ch, *offset)?;
*offset += ch.len_utf8();
}
reader.consume(used);
Ok(true)
}
Err(err) => {
let valid_bytes = err.valid_up_to();
// If some of the buffer contents were valid, append that to the
// output.
let valid = unsafe {
// SAFETY: This is safe because of `valid_up_to()`.
core::str::from_utf8_unchecked(&available[..valid_bytes])
};
for ch in valid.chars() {
push_char(out, ch, *offset)?;
*offset += ch.len_utf8();
}
match err.error_len() {
Some(_invalid_len) => {
return Err(ReaderError::InvalidUtf8 {
value: available[valid_bytes],
});
}
None => {
if valid_bytes != 0 {
// Some valid UTF-8 characters were present, and the
// tail end of the buffer was an incomplete sequence.
// Leave the incomplete sequence in the buffer.
reader.consume(valid_bytes);
Ok(true)
} else {
// The beginning of the buffer was an incomplete UTF-8
// sequence. Read the whole character unbuffered.
//
// This will return `UnexpectedEof` if the sequence
// cannot be completed. Note that `read_exact()` handles
// interrupt automatically.
let initial = available[0];
read_utf8_char_unbuffered(reader, out, initial, offset)?;
Ok(true)
}
}
}
}
}
}
fn push_char(parser: &mut yaml_parser_t, ch: char) -> Result<(), ReaderError> {
/// Read exactly one UTF-8 encoded character straight from `reader` and append
/// it to `out`.
///
/// `initial` is the first byte of the sequence (as previously peeked by the
/// caller); it determines how many bytes are read. On success `*offset` is
/// advanced by the encoded width of the character.
///
/// # Errors
///
/// * `ReaderError::InvalidUtf8` if `initial` cannot start a UTF-8 sequence, or
///   if the bytes read do not form a valid UTF-8 character.
/// * `ReaderError::Io` if the reader fails, including `UnexpectedEof` from
///   `read_exact()` when the input ends in the middle of the sequence.
/// * Any error returned by `push_char`.
fn read_utf8_char_unbuffered(
    reader: &mut dyn BufRead,
    out: &mut VecDeque<char>,
    initial: u8,
    offset: &mut usize,
) -> Result<(), ReaderError> {
    let width = utf8_char_width(initial);
    if width == 0 {
        // Not a valid leading byte. Without this guard the zero-length read
        // below would succeed and decode to an empty string, which has no
        // first character to push.
        return Err(ReaderError::InvalidUtf8 { value: initial });
    }

    let mut buffer = [0u8; 4];
    let encoded = &mut buffer[..width];
    reader.read_exact(encoded)?;

    let decoded = core::str::from_utf8(encoded)
        .ok()
        .and_then(|valid| valid.chars().next());
    match decoded {
        Some(ch) => {
            // We read a whole, valid character.
            push_char(out, ch, *offset)?;
            *offset += width;
            Ok(())
        }
        // We read exactly `width` bytes, so the sequence is not truncated; it
        // is malformed (bad continuation byte, overlong form, surrogate, or
        // out-of-range value).
        None => Err(ReaderError::InvalidUtf8 { value: buffer[0] }),
    }
}
/// Decode as many UTF-16 characters as possible from the reader's internal
/// buffer and append them to `out`.
///
/// `BIG_ENDIAN` selects the byte order of the 16-bit code units. `*offset` is
/// advanced by the number of input bytes that were decoded.
///
/// Returns `Ok(false)` when the reader has no more data (`fill_buf()` yields
/// an empty buffer) and `Ok(true)` otherwise.
///
/// # Errors
///
/// * `ReaderError::Io` for I/O failures other than `Interrupted`, which is
///   retried.
/// * Any error returned by `push_char` or by `read_utf16_char_unbuffered`.
fn read_utf16_buffered<const BIG_ENDIAN: bool>(
    reader: &mut dyn BufRead,
    out: &mut VecDeque<char>,
    offset: &mut usize,
) -> Result<bool, ReaderError> {
    let available = loop {
        match reader.fill_buf() {
            Ok([]) => return Ok(false),
            Ok(available) => break available,
            Err(err) if err.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(err) => return Err(err.into()),
        }
    };

    // View the buffer as 16-bit code units; a trailing odd byte is ignored
    // here and picked up on a later call.
    let units = available.chunks_exact(2).map(|pair| {
        let raw = [pair[0], pair[1]];
        if BIG_ENDIAN {
            u16::from_be_bytes(raw)
        } else {
            u16::from_le_bytes(raw)
        }
    });

    // Number of BYTES decoded from `available`. `BufRead::consume()` and
    // `offset` are both measured in bytes, while `char::len_utf16()` counts
    // 16-bit code units, hence the factor of two below.
    let mut used = 0;
    for decoded in core::char::decode_utf16(units) {
        // An unpaired surrogate is either a corrupt stream or the first half
        // of a pair whose second half has not been buffered yet. Stop here,
        // consume what was valid, and let the slow path
        // (`read_utf16_char_unbuffered`) deal with it on the next call.
        let Ok(ch) = decoded else { break };
        push_char(out, ch, *offset)?;
        let width = ch.len_utf16() * 2;
        *offset += width;
        used += width;
    }

    if used != 0 {
        reader.consume(used);
    } else {
        // Nothing could be decoded: the buffer holds either a single byte or
        // begins with an unpaired surrogate. Read one character unbuffered,
        // which pulls in more input as needed and reports truncated or
        // invalid sequences.
        read_utf16_char_unbuffered::<BIG_ENDIAN>(reader, out, offset)?;
    }
    Ok(true)
}
/// Read exactly one UTF-16 encoded character straight from `reader` and
/// append it to `out`.
///
/// One 16-bit code unit is read in the byte order selected by `BIG_ENDIAN`.
/// If it lies in the surrogate range a second unit is read and the two are
/// decoded as a pair. On success `*offset` is advanced by 2 or 4 bytes.
///
/// # Errors
///
/// * `ReaderError::InvalidUtf16` carrying the offending unit when the two
///   units do not form a valid surrogate pair.
/// * `ReaderError::Io` if `read_exact()` fails (for example on a truncated
///   input).
/// * Any error returned by `push_char`.
fn read_utf16_char_unbuffered<const BIG_ENDIAN: bool>(
    reader: &mut dyn BufRead,
    out: &mut VecDeque<char>,
    offset: &mut usize,
) -> Result<(), ReaderError> {
    /// Read a single 16-bit code unit in the requested byte order.
    fn read_unit<const BE: bool>(reader: &mut dyn BufRead) -> std::io::Result<u16> {
        let mut raw = [0; 2];
        reader.read_exact(&mut raw)?;
        Ok(if BE {
            u16::from_be_bytes(raw)
        } else {
            u16::from_le_bytes(raw)
        })
    }

    let lead = read_unit::<BIG_ENDIAN>(reader)?;

    if !is_utf16_surrogate(lead) {
        // A code unit outside the surrogate range always decodes on its own.
        let Some(Ok(ch)) = core::char::decode_utf16([lead]).next() else {
            unreachable!()
        };
        push_char(out, ch, *offset)?;
        *offset += 2;
        return Ok(());
    }

    let trail = read_unit::<BIG_ENDIAN>(reader)?;
    match core::char::decode_utf16([lead, trail]).next() {
        Some(Ok(ch)) => {
            push_char(out, ch, *offset)?;
            *offset += 4;
            Ok(())
        }
        Some(Err(unpaired)) => Err(ReaderError::InvalidUtf16 {
            value: unpaired.unpaired_surrogate(),
        }),
        None => unreachable!(),
    }
}
/// Return the total length in bytes of the UTF-8 sequence introduced by the
/// leading byte `initial`, or 0 if `initial` cannot start a sequence.
///
/// Only the bit pattern of the leading byte is inspected; the result says
/// nothing about whether the following bytes form a valid character.
fn utf8_char_width(initial: u8) -> usize {
    match initial {
        // 0xxxxxxx
        0x00..=0x7F => 1,
        // 110xxxxx
        0xC0..=0xDF => 2,
        // 1110xxxx
        0xE0..=0xEF => 3,
        // 11110xxx
        0xF0..=0xF7 => 4,
        // 10xxxxxx (continuation byte) and 11111xxx
        _ => 0,
    }
}
/// Report whether `value` is a UTF-16 surrogate code unit (high or low),
/// i.e. lies in `0xD800..=0xDFFF`.
fn is_utf16_surrogate(value: u16) -> bool {
    // Every surrogate shares the top five bits 11011.
    value & 0xF800 == 0xD800
}
fn push_char(out: &mut VecDeque<char>, ch: char, offset: usize) -> Result<(), ReaderError> {
if !(ch == '\x09'
|| ch == '\x0A'
|| ch == '\x0D'
@ -191,16 +276,9 @@ fn push_char(parser: &mut yaml_parser_t, ch: char) -> Result<(), ReaderError> {
|| ch >= '\u{E000}' && ch <= '\u{FFFD}'
|| ch >= '\u{10000}' && ch <= '\u{10FFFF}')
{
return yaml_parser_set_reader_error(
parser,
"control characters are not allowed",
parser.offset,
ch as _,
);
return yaml_parser_set_reader_error("control characters are not allowed", offset, ch as _);
}
parser.buffer.push_back(ch);
parser.offset += ch.len_utf8();
parser.unread += 1;
out.push_back(ch);
Ok(())
}
@ -208,145 +286,49 @@ pub(crate) fn yaml_parser_update_buffer(
parser: &mut yaml_parser_t,
length: usize,
) -> Result<(), ReaderError> {
let mut first = true;
assert!((parser.read_handler).is_some());
if parser.eof && parser.raw_buffer.is_empty() {
return Ok(());
}
let reader = parser.read_handler.as_deref_mut().expect("no read handler");
if parser.unread >= length {
return Ok(());
}
if parser.encoding == YAML_ANY_ENCODING {
yaml_parser_determine_encoding(parser)?;
if let Some(encoding) = yaml_parser_determine_encoding(reader)? {
parser.encoding = encoding;
} else {
parser.eof = true;
return Ok(());
}
}
while parser.unread < length {
if parser.eof && parser.raw_buffer.is_empty() {
if parser.eof {
return Ok(());
}
if !first || parser.raw_buffer.is_empty() {
yaml_parser_update_raw_buffer(parser)?;
}
first = false;
match parser.encoding {
let tokens_before = parser.buffer.len();
let not_eof = match parser.encoding {
YAML_ANY_ENCODING => unreachable!(),
YAML_UTF8_ENCODING => {
match read_char_utf8(&mut parser.raw_buffer) {
Some(Ok(ch)) => {
push_char(parser, ch)?;
}
Some(Err(Utf8Error::Incomplete)) => {
if parser.eof {
return yaml_parser_set_reader_error(
parser,
"incomplete UTF-8 octet sequence",
parser.offset,
-1,
);
} else {
// Read more
}
}
Some(Err(Utf8Error::InvalidLeadingOctet)) => {
return yaml_parser_set_reader_error(
parser,
"invalid leading UTF-8 octet",
parser.offset,
parser.raw_buffer[0] as _,
);
}
Some(Err(Utf8Error::InvalidTrailingOctet(offset))) => {
return yaml_parser_set_reader_error(
parser,
"invalid trailing UTF-8 octet",
parser.offset + offset,
parser.raw_buffer[offset] as _,
);
}
Some(Err(Utf8Error::InvalidLength)) => {
return yaml_parser_set_reader_error(
parser,
"invalid length of a UTF-8 sequence",
parser.offset,
-1,
);
}
Some(Err(Utf8Error::InvalidUnicode(value))) => {
return yaml_parser_set_reader_error(
parser,
"invalid Unicode character",
parser.offset,
value as _,
);
}
None => (),
}
read_utf8_buffered(reader, &mut parser.buffer, &mut parser.offset)?
}
YAML_UTF16LE_ENCODING | YAML_UTF16BE_ENCODING => {
let is_big_endian = parser.encoding == YAML_UTF16BE_ENCODING;
let res = if is_big_endian {
read_char_utf16::<true>(&mut parser.raw_buffer)
} else {
read_char_utf16::<false>(&mut parser.raw_buffer)
};
match res {
Some(Ok(ch)) => {
push_char(parser, ch)?;
}
Some(Err(Utf16Error::Incomplete)) => {
if parser.eof {
return yaml_parser_set_reader_error(
parser,
"incomplete UTF-16 character",
parser.offset,
-1,
);
} else {
// Read more
}
}
Some(Err(Utf16Error::UnexpectedLowSurrogateArea(value))) => {
return yaml_parser_set_reader_error(
parser,
"unexpected low surrogate area",
parser.offset,
value as i32,
);
}
// Some(Err(Utf16Error::IncompleteSurrogatePair)) => {
// return yaml_parser_set_reader_error(
// parser,
// "incomplete UTF-16 surrogate pair",
// parser.offset,
// -1,
// );
// }
Some(Err(Utf16Error::ExpectedLowSurrogateArea(value))) => {
return yaml_parser_set_reader_error(
parser,
"expected low surrogate area",
parser.offset + 2,
value as i32,
);
}
Some(Err(Utf16Error::InvalidUnicode(value))) => {
return yaml_parser_set_reader_error(
parser,
"invalid Unicode character",
parser.offset,
value as i32,
);
}
None => (),
}
YAML_UTF16LE_ENCODING => {
read_utf16_buffered::<false>(reader, &mut parser.buffer, &mut parser.offset)?
}
_ => {
panic!("unhandled encoded enum variant")
YAML_UTF16BE_ENCODING => {
read_utf16_buffered::<true>(reader, &mut parser.buffer, &mut parser.offset)?
}
};
let num_read = parser.buffer.len() - tokens_before;
parser.unread += num_read;
if !not_eof {
parser.eof = true;
return Ok(());
}
}
if parser.offset >= (!0_usize).wrapping_div(2_usize) {
return yaml_parser_set_reader_error(parser, "input is too long", parser.offset, -1);
return yaml_parser_set_reader_error("input is too long", parser.offset, -1);
}
Ok(())
}

View file

@ -628,7 +628,7 @@ pub struct yaml_alias_data_t {
#[non_exhaustive]
pub struct yaml_parser_t<'r> {
/// Read handler.
pub(crate) read_handler: Option<&'r mut dyn std::io::Read>,
pub(crate) read_handler: Option<&'r mut dyn std::io::BufRead>,
/// EOF flag
pub(crate) eof: bool,
/// The working buffer.
@ -637,12 +637,6 @@ pub struct yaml_parser_t<'r> {
pub(crate) buffer: VecDeque<char>,
/// The number of unread characters in the buffer.
pub(crate) unread: usize,
/// The raw buffer.
///
/// This is the raw unchecked input from the read handler (for example, it
/// may be UTF-16 encoded).
// TODO: Get rid of this and ask users to provide something implementing `BufRead` instead of `Read`.
pub(crate) raw_buffer: VecDeque<u8>,
/// The input encoding.
pub(crate) encoding: yaml_encoding_t,
/// The offset of the current position (in bytes).