mirror of
https://github.com/simonask/libyaml-safer
synced 2024-11-26 13:20:24 +00:00
Parser: Replace internal buffering with std::io::BufRead
This commit is contained in:
parent
e28400ee5f
commit
5bf087e0a3
5 changed files with 281 additions and 297 deletions
|
@ -25,7 +25,6 @@ pub fn yaml_parser_new<'r>() -> yaml_parser_t<'r> {
|
||||||
eof: false,
|
eof: false,
|
||||||
buffer: VecDeque::with_capacity(INPUT_BUFFER_SIZE),
|
buffer: VecDeque::with_capacity(INPUT_BUFFER_SIZE),
|
||||||
unread: 0,
|
unread: 0,
|
||||||
raw_buffer: VecDeque::with_capacity(INPUT_RAW_BUFFER_SIZE),
|
|
||||||
encoding: YAML_ANY_ENCODING,
|
encoding: YAML_ANY_ENCODING,
|
||||||
offset: 0,
|
offset: 0,
|
||||||
mark: yaml_mark_t::default(),
|
mark: yaml_mark_t::default(),
|
||||||
|
@ -49,7 +48,6 @@ pub fn yaml_parser_new<'r>() -> yaml_parser_t<'r> {
|
||||||
|
|
||||||
/// Destroy a parser.
|
/// Destroy a parser.
|
||||||
pub fn yaml_parser_delete(parser: &mut yaml_parser_t) {
|
pub fn yaml_parser_delete(parser: &mut yaml_parser_t) {
|
||||||
parser.raw_buffer.clear();
|
|
||||||
parser.buffer.clear();
|
parser.buffer.clear();
|
||||||
parser.tokens.clear();
|
parser.tokens.clear();
|
||||||
parser.indents.clear();
|
parser.indents.clear();
|
||||||
|
@ -70,7 +68,10 @@ pub fn yaml_parser_set_input_string<'r>(parser: &mut yaml_parser_t<'r>, input: &
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Set a generic input handler.
|
/// Set a generic input handler.
|
||||||
pub fn yaml_parser_set_input<'r>(parser: &mut yaml_parser_t<'r>, input: &'r mut dyn std::io::Read) {
|
pub fn yaml_parser_set_input<'r>(
|
||||||
|
parser: &mut yaml_parser_t<'r>,
|
||||||
|
input: &'r mut dyn std::io::BufRead,
|
||||||
|
) {
|
||||||
assert!((parser.read_handler).is_none());
|
assert!((parser.read_handler).is_none());
|
||||||
parser.read_handler = Some(input);
|
parser.read_handler = Some(input);
|
||||||
}
|
}
|
||||||
|
|
|
@ -29,7 +29,8 @@ pub(crate) fn test_main(
|
||||||
) -> Result<(), Box<dyn Error>> {
|
) -> Result<(), Box<dyn Error>> {
|
||||||
let mut parser = yaml_parser_new();
|
let mut parser = yaml_parser_new();
|
||||||
|
|
||||||
yaml_parser_set_input(&mut parser, stdin);
|
let mut stdin = std::io::BufReader::new(stdin);
|
||||||
|
yaml_parser_set_input(&mut parser, &mut stdin);
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
let event = match yaml_parser_parse(&mut parser) {
|
let event = match yaml_parser_parse(&mut parser) {
|
||||||
|
|
|
@ -24,6 +24,12 @@ pub enum ReaderError {
|
||||||
offset: usize,
|
offset: usize,
|
||||||
value: i32,
|
value: i32,
|
||||||
},
|
},
|
||||||
|
#[error("input stream produced an invalid byte order marker")]
|
||||||
|
InvalidBom,
|
||||||
|
#[error("invalid UTF-8 byte at offset: {value:x}")]
|
||||||
|
InvalidUtf8 { value: u8 },
|
||||||
|
#[error("invalid UTF-16 unpaired surrogate: {value:x}")]
|
||||||
|
InvalidUtf16 { value: u16 },
|
||||||
#[error(transparent)]
|
#[error(transparent)]
|
||||||
Io(#[from] std::io::Error),
|
Io(#[from] std::io::Error),
|
||||||
}
|
}
|
||||||
|
|
532
src/reader.rs
532
src/reader.rs
|
@ -1,14 +1,13 @@
|
||||||
|
use std::io::BufRead;
|
||||||
|
|
||||||
use alloc::collections::VecDeque;
|
use alloc::collections::VecDeque;
|
||||||
|
|
||||||
use crate::api::INPUT_RAW_BUFFER_SIZE;
|
|
||||||
use crate::macros::vecdeque_starts_with;
|
|
||||||
use crate::{
|
use crate::{
|
||||||
yaml_parser_t, ReaderError, YAML_ANY_ENCODING, YAML_UTF16BE_ENCODING, YAML_UTF16LE_ENCODING,
|
yaml_encoding_t, yaml_parser_t, ReaderError, YAML_ANY_ENCODING, YAML_UTF16BE_ENCODING,
|
||||||
YAML_UTF8_ENCODING,
|
YAML_UTF16LE_ENCODING, YAML_UTF8_ENCODING,
|
||||||
};
|
};
|
||||||
|
|
||||||
fn yaml_parser_set_reader_error<T>(
|
fn yaml_parser_set_reader_error<T>(
|
||||||
_parser: &mut yaml_parser_t,
|
|
||||||
problem: &'static str,
|
problem: &'static str,
|
||||||
offset: usize,
|
offset: usize,
|
||||||
value: i32,
|
value: i32,
|
||||||
|
@ -20,168 +19,254 @@ fn yaml_parser_set_reader_error<T>(
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
const BOM_UTF8: &[u8] = b"\xEF\xBB\xBF";
|
const BOM_UTF8: [u8; 3] = [0xef, 0xbb, 0xbf];
|
||||||
const BOM_UTF16LE: &[u8] = b"\xFF\xFE";
|
const BOM_UTF16LE: [u8; 2] = [0xff, 0xfe];
|
||||||
const BOM_UTF16BE: &[u8] = b"\xFE\xFF";
|
const BOM_UTF16BE: [u8; 2] = [0xfe, 0xff];
|
||||||
|
|
||||||
fn yaml_parser_determine_encoding(parser: &mut yaml_parser_t) -> Result<(), ReaderError> {
|
fn yaml_parser_determine_encoding(
|
||||||
while !parser.eof && parser.raw_buffer.len() < 3 {
|
reader: &mut dyn BufRead,
|
||||||
yaml_parser_update_raw_buffer(parser)?;
|
) -> Result<Option<yaml_encoding_t>, ReaderError> {
|
||||||
|
let initial_bytes = reader.fill_buf()?;
|
||||||
|
if initial_bytes.is_empty() {
|
||||||
|
return Ok(None);
|
||||||
}
|
}
|
||||||
if vecdeque_starts_with(&parser.raw_buffer, BOM_UTF16LE) {
|
|
||||||
parser.encoding = YAML_UTF16LE_ENCODING;
|
match initial_bytes[0] {
|
||||||
parser.raw_buffer.drain(0..2);
|
0xef => {
|
||||||
parser.offset += 2;
|
let mut bom = [0; 3];
|
||||||
} else if vecdeque_starts_with(&parser.raw_buffer, BOM_UTF16BE) {
|
reader.read_exact(&mut bom)?;
|
||||||
parser.encoding = YAML_UTF16BE_ENCODING;
|
if bom == BOM_UTF8 {
|
||||||
parser.raw_buffer.drain(0..2);
|
Ok(Some(YAML_UTF8_ENCODING))
|
||||||
parser.offset += 2;
|
|
||||||
} else if vecdeque_starts_with(&parser.raw_buffer, BOM_UTF8) {
|
|
||||||
parser.encoding = YAML_UTF8_ENCODING;
|
|
||||||
parser.raw_buffer.drain(0..3);
|
|
||||||
parser.offset += 3;
|
|
||||||
} else {
|
} else {
|
||||||
parser.encoding = YAML_UTF8_ENCODING;
|
Err(ReaderError::InvalidBom)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
0xff | 0xfe => {
|
||||||
|
let mut bom = [0; 2];
|
||||||
|
reader.read_exact(&mut bom)?;
|
||||||
|
if bom == BOM_UTF16LE {
|
||||||
|
Ok(Some(YAML_UTF16LE_ENCODING))
|
||||||
|
} else if bom == BOM_UTF16BE {
|
||||||
|
Ok(Some(YAML_UTF16BE_ENCODING))
|
||||||
|
} else {
|
||||||
|
Err(ReaderError::InvalidBom)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => Ok(Some(YAML_UTF8_ENCODING)),
|
||||||
}
|
}
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn yaml_parser_update_raw_buffer(parser: &mut yaml_parser_t) -> Result<(), ReaderError> {
|
fn read_utf8_buffered(
|
||||||
if parser.raw_buffer.len() >= INPUT_RAW_BUFFER_SIZE {
|
reader: &mut dyn BufRead,
|
||||||
return Ok(());
|
out: &mut VecDeque<char>,
|
||||||
|
offset: &mut usize,
|
||||||
|
) -> Result<bool, ReaderError> {
|
||||||
|
let available = loop {
|
||||||
|
match reader.fill_buf() {
|
||||||
|
Ok([]) => return Ok(false),
|
||||||
|
Ok(available) => break available,
|
||||||
|
Err(err) if err.kind() == std::io::ErrorKind::Interrupted => continue,
|
||||||
|
Err(err) => return Err(err.into()),
|
||||||
}
|
}
|
||||||
if parser.eof {
|
};
|
||||||
return Ok(());
|
|
||||||
|
match core::str::from_utf8(available) {
|
||||||
|
Ok(valid) => {
|
||||||
|
let used = valid.len();
|
||||||
|
// The entire contents of the input buffer was valid UTF-8.
|
||||||
|
for ch in valid.chars() {
|
||||||
|
push_char(out, ch, *offset)?;
|
||||||
|
*offset += ch.len_utf8();
|
||||||
|
}
|
||||||
|
reader.consume(used);
|
||||||
|
Ok(true)
|
||||||
|
}
|
||||||
|
Err(err) => {
|
||||||
|
let valid_bytes = err.valid_up_to();
|
||||||
|
|
||||||
|
// If some of the buffer contents were valid, append that to the
|
||||||
|
// output.
|
||||||
|
let valid = unsafe {
|
||||||
|
// SAFETY: This is safe because of `valid_up_to()`.
|
||||||
|
core::str::from_utf8_unchecked(&available[..valid_bytes])
|
||||||
|
};
|
||||||
|
for ch in valid.chars() {
|
||||||
|
push_char(out, ch, *offset)?;
|
||||||
|
*offset += ch.len_utf8();
|
||||||
}
|
}
|
||||||
|
|
||||||
let len_before = parser.raw_buffer.len();
|
match err.error_len() {
|
||||||
debug_assert!(len_before < INPUT_RAW_BUFFER_SIZE);
|
Some(_invalid_len) => {
|
||||||
parser.raw_buffer.resize(INPUT_RAW_BUFFER_SIZE, 0);
|
return Err(ReaderError::InvalidUtf8 {
|
||||||
let contiguous = parser.raw_buffer.make_contiguous();
|
value: available[valid_bytes],
|
||||||
let write_to = &mut contiguous[len_before..];
|
});
|
||||||
|
}
|
||||||
let size_read = parser
|
None => {
|
||||||
.read_handler
|
if valid_bytes != 0 {
|
||||||
.as_mut()
|
// Some valid UTF-8 characters were present, and the
|
||||||
.expect("non-null read handler")
|
// tail end of the buffer was an incomplete sequence.
|
||||||
.read(write_to)?;
|
// Leave the incomplete sequence in the buffer.
|
||||||
|
reader.consume(valid_bytes);
|
||||||
let valid_size = len_before + size_read;
|
Ok(true)
|
||||||
parser.raw_buffer.truncate(valid_size);
|
} else {
|
||||||
if size_read == 0 {
|
// The beginning of the buffer was an incomplete UTF-8
|
||||||
parser.eof = true;
|
// sequence. Read the whole character unbuffered.
|
||||||
|
//
|
||||||
|
// This will return `UnexpectedEof` if the sequence
|
||||||
|
// cannot be completed. Note that `read_exact()` handles
|
||||||
|
// interrupt automatically.
|
||||||
|
let initial = available[0];
|
||||||
|
read_utf8_char_unbuffered(reader, out, initial, offset)?;
|
||||||
|
Ok(true)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn utf8_char_width_and_initial_value(initial: u8) -> (usize, u32) {
|
fn read_utf8_char_unbuffered(
|
||||||
let initial = initial as u32;
|
reader: &mut dyn BufRead,
|
||||||
|
out: &mut VecDeque<char>,
|
||||||
|
initial: u8,
|
||||||
|
offset: &mut usize,
|
||||||
|
) -> Result<(), ReaderError> {
|
||||||
|
let width = utf8_char_width(initial);
|
||||||
|
let mut buffer = [0; 4];
|
||||||
|
reader.read_exact(&mut buffer[..width])?;
|
||||||
|
if let Ok(valid) = core::str::from_utf8(&buffer[..width]) {
|
||||||
|
// We read a whole, valid character.
|
||||||
|
let Some(ch) = valid.chars().next() else {
|
||||||
|
unreachable!()
|
||||||
|
};
|
||||||
|
push_char(out, ch, *offset)?;
|
||||||
|
*offset += width;
|
||||||
|
Ok(())
|
||||||
|
} else {
|
||||||
|
// Since we read the exact character width, the only
|
||||||
|
// possible error here is invalid Unicode.
|
||||||
|
Err(ReaderError::InvalidUtf8 { value: buffer[0] })
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn read_utf16_buffered<const BIG_ENDIAN: bool>(
|
||||||
|
reader: &mut dyn BufRead,
|
||||||
|
out: &mut VecDeque<char>,
|
||||||
|
offset: &mut usize,
|
||||||
|
) -> Result<bool, ReaderError> {
|
||||||
|
let available = loop {
|
||||||
|
match reader.fill_buf() {
|
||||||
|
Ok([]) => return Ok(false),
|
||||||
|
Ok(available) => break available,
|
||||||
|
Err(err) if err.kind() == std::io::ErrorKind::Interrupted => continue,
|
||||||
|
Err(err) => return Err(err.into()),
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let chunks = available.chunks_exact(2).map(|chunk| {
|
||||||
|
let [a, b] = chunk else { unreachable!() };
|
||||||
|
if BIG_ENDIAN {
|
||||||
|
u16::from_be_bytes([*a, *b])
|
||||||
|
} else {
|
||||||
|
u16::from_le_bytes([*a, *b])
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
let mut used = 0;
|
||||||
|
for ch in core::char::decode_utf16(chunks) {
|
||||||
|
match ch {
|
||||||
|
Ok(ch) => {
|
||||||
|
push_char(out, ch, *offset)?;
|
||||||
|
let n = ch.len_utf16();
|
||||||
|
*offset += n;
|
||||||
|
used += n;
|
||||||
|
}
|
||||||
|
Err(_) => {
|
||||||
|
// An unpaired surrogate may either be a corrupt stream, but it
|
||||||
|
// can also be that the buffer just happens to contain the first
|
||||||
|
// half of a surrogate pair. Consume all of the valid bytes in
|
||||||
|
// the buffer first, and then handle the unpaired surrogate in
|
||||||
|
// the "slow" path (`read_utf16_char_unbuffered`) the next time
|
||||||
|
// we are called.
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if used != 0 {
|
||||||
|
reader.consume(used);
|
||||||
|
*offset += used;
|
||||||
|
Ok(true)
|
||||||
|
} else {
|
||||||
|
debug_assert!(available.len() != 0 && available.len() < 2);
|
||||||
|
read_utf16_char_unbuffered::<BIG_ENDIAN>(reader, out, offset)?;
|
||||||
|
Ok(true)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn read_utf16_char_unbuffered<const BIG_ENDIAN: bool>(
|
||||||
|
reader: &mut dyn BufRead,
|
||||||
|
out: &mut VecDeque<char>,
|
||||||
|
offset: &mut usize,
|
||||||
|
) -> Result<(), ReaderError> {
|
||||||
|
let mut buffer = [0; 2];
|
||||||
|
reader.read_exact(&mut buffer)?;
|
||||||
|
let first = if BIG_ENDIAN {
|
||||||
|
u16::from_be_bytes(buffer)
|
||||||
|
} else {
|
||||||
|
u16::from_le_bytes(buffer)
|
||||||
|
};
|
||||||
|
|
||||||
|
if is_utf16_surrogate(first) {
|
||||||
|
reader.read_exact(&mut buffer)?;
|
||||||
|
let second = if BIG_ENDIAN {
|
||||||
|
u16::from_be_bytes(buffer)
|
||||||
|
} else {
|
||||||
|
u16::from_le_bytes(buffer)
|
||||||
|
};
|
||||||
|
|
||||||
|
match core::char::decode_utf16([first, second]).next() {
|
||||||
|
Some(Ok(ch)) => {
|
||||||
|
push_char(out, ch, *offset)?;
|
||||||
|
*offset += 4;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
Some(Err(err)) => Err(ReaderError::InvalidUtf16 {
|
||||||
|
value: err.unpaired_surrogate(),
|
||||||
|
}),
|
||||||
|
None => unreachable!(),
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
match core::char::decode_utf16([first]).next() {
|
||||||
|
Some(Ok(ch)) => {
|
||||||
|
push_char(out, ch, *offset)?;
|
||||||
|
*offset += 2;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
Some(Err(_)) | None => unreachable!(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn utf8_char_width(initial: u8) -> usize {
|
||||||
if initial & 0x80 == 0 {
|
if initial & 0x80 == 0 {
|
||||||
(1, initial & 0x7f)
|
1
|
||||||
} else if initial & 0xE0 == 0xC0 {
|
} else if initial & 0xE0 == 0xC0 {
|
||||||
(2, initial & 0x1f)
|
2
|
||||||
} else if initial & 0xF0 == 0xE0 {
|
} else if initial & 0xF0 == 0xE0 {
|
||||||
(3, initial & 0x0f)
|
3
|
||||||
} else if initial & 0xF8 == 0xF0 {
|
} else if initial & 0xF8 == 0xF0 {
|
||||||
(4, initial & 0x07)
|
4
|
||||||
} else {
|
} else {
|
||||||
(0, 0)
|
0
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
enum Utf8Error {
|
fn is_utf16_surrogate(value: u16) -> bool {
|
||||||
Incomplete,
|
matches!(value, 0xD800..=0xDFFF)
|
||||||
InvalidLeadingOctet,
|
|
||||||
InvalidTrailingOctet(usize),
|
|
||||||
InvalidLength,
|
|
||||||
InvalidUnicode(u32),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn read_char_utf8(raw: &mut VecDeque<u8>) -> Option<Result<char, Utf8Error>> {
|
fn push_char(out: &mut VecDeque<char>, ch: char, offset: usize) -> Result<(), ReaderError> {
|
||||||
let first = raw.front().copied()?;
|
|
||||||
let (width, mut value) = utf8_char_width_and_initial_value(first);
|
|
||||||
if width == 0 {
|
|
||||||
return Some(Err(Utf8Error::InvalidLeadingOctet));
|
|
||||||
}
|
|
||||||
if raw.len() < width {
|
|
||||||
return Some(Err(Utf8Error::Incomplete));
|
|
||||||
}
|
|
||||||
for (i, trailing) in raw.iter().enumerate().take(width).skip(1) {
|
|
||||||
if trailing & 0xc0 != 0x80 {
|
|
||||||
return Some(Err(Utf8Error::InvalidTrailingOctet(i)));
|
|
||||||
}
|
|
||||||
value <<= 6;
|
|
||||||
value += *trailing as u32 & 0x3f;
|
|
||||||
}
|
|
||||||
if !(width == 1
|
|
||||||
|| width == 2 && value >= 0x80
|
|
||||||
|| width == 3 && value >= 0x800
|
|
||||||
|| width == 4 && value >= 0x10000)
|
|
||||||
{
|
|
||||||
return Some(Err(Utf8Error::InvalidLength));
|
|
||||||
}
|
|
||||||
if let Some(ch) = char::from_u32(value) {
|
|
||||||
raw.drain(..width);
|
|
||||||
Some(Ok(ch))
|
|
||||||
} else {
|
|
||||||
Some(Err(Utf8Error::InvalidUnicode(value)))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
enum Utf16Error {
|
|
||||||
Incomplete,
|
|
||||||
UnexpectedLowSurrogateArea(u32),
|
|
||||||
ExpectedLowSurrogateArea(u32),
|
|
||||||
InvalidUnicode(u32),
|
|
||||||
}
|
|
||||||
|
|
||||||
fn read_char_utf16<const BIG_ENDIAN: bool>(
|
|
||||||
raw: &mut VecDeque<u8>,
|
|
||||||
) -> Option<Result<char, Utf16Error>> {
|
|
||||||
if raw.is_empty() {
|
|
||||||
return None;
|
|
||||||
}
|
|
||||||
if raw.len() < 2 {
|
|
||||||
return Some(Err(Utf16Error::Incomplete));
|
|
||||||
}
|
|
||||||
let bytes = [raw[0], raw[1]];
|
|
||||||
let mut value = if BIG_ENDIAN {
|
|
||||||
u16::from_be_bytes(bytes) as u32
|
|
||||||
} else {
|
|
||||||
u16::from_le_bytes(bytes) as u32
|
|
||||||
};
|
|
||||||
if value & 0xfc00 == 0xdc00 {
|
|
||||||
return Some(Err(Utf16Error::UnexpectedLowSurrogateArea(value)));
|
|
||||||
}
|
|
||||||
let width;
|
|
||||||
if value & 0xfc00 == 0xd800 {
|
|
||||||
width = 4;
|
|
||||||
if raw.len() < width {
|
|
||||||
return Some(Err(Utf16Error::Incomplete));
|
|
||||||
}
|
|
||||||
let bytes2 = [raw[2], raw[3]];
|
|
||||||
let value2 = if BIG_ENDIAN {
|
|
||||||
u16::from_be_bytes(bytes2) as u32
|
|
||||||
} else {
|
|
||||||
u16::from_le_bytes(bytes2) as u32
|
|
||||||
};
|
|
||||||
if value2 & 0xfc00 != 0xdc00 {
|
|
||||||
return Some(Err(Utf16Error::ExpectedLowSurrogateArea(value2)));
|
|
||||||
}
|
|
||||||
value = (0x10000 + (value & 0x3ff)) << (10 + (value2 & 0x3ff));
|
|
||||||
} else {
|
|
||||||
width = 2;
|
|
||||||
}
|
|
||||||
|
|
||||||
if let Some(ch) = char::from_u32(value) {
|
|
||||||
raw.drain(..width);
|
|
||||||
Some(Ok(ch))
|
|
||||||
} else {
|
|
||||||
Some(Err(Utf16Error::InvalidUnicode(value)))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn push_char(parser: &mut yaml_parser_t, ch: char) -> Result<(), ReaderError> {
|
|
||||||
if !(ch == '\x09'
|
if !(ch == '\x09'
|
||||||
|| ch == '\x0A'
|
|| ch == '\x0A'
|
||||||
|| ch == '\x0D'
|
|| ch == '\x0D'
|
||||||
|
@ -191,16 +276,9 @@ fn push_char(parser: &mut yaml_parser_t, ch: char) -> Result<(), ReaderError> {
|
||||||
|| ch >= '\u{E000}' && ch <= '\u{FFFD}'
|
|| ch >= '\u{E000}' && ch <= '\u{FFFD}'
|
||||||
|| ch >= '\u{10000}' && ch <= '\u{10FFFF}')
|
|| ch >= '\u{10000}' && ch <= '\u{10FFFF}')
|
||||||
{
|
{
|
||||||
return yaml_parser_set_reader_error(
|
return yaml_parser_set_reader_error("control characters are not allowed", offset, ch as _);
|
||||||
parser,
|
|
||||||
"control characters are not allowed",
|
|
||||||
parser.offset,
|
|
||||||
ch as _,
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
parser.buffer.push_back(ch);
|
out.push_back(ch);
|
||||||
parser.offset += ch.len_utf8();
|
|
||||||
parser.unread += 1;
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -208,145 +286,49 @@ pub(crate) fn yaml_parser_update_buffer(
|
||||||
parser: &mut yaml_parser_t,
|
parser: &mut yaml_parser_t,
|
||||||
length: usize,
|
length: usize,
|
||||||
) -> Result<(), ReaderError> {
|
) -> Result<(), ReaderError> {
|
||||||
let mut first = true;
|
let reader = parser.read_handler.as_deref_mut().expect("no read handler");
|
||||||
assert!((parser.read_handler).is_some());
|
|
||||||
if parser.eof && parser.raw_buffer.is_empty() {
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
if parser.unread >= length {
|
if parser.unread >= length {
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
if parser.encoding == YAML_ANY_ENCODING {
|
if parser.encoding == YAML_ANY_ENCODING {
|
||||||
yaml_parser_determine_encoding(parser)?;
|
if let Some(encoding) = yaml_parser_determine_encoding(reader)? {
|
||||||
|
parser.encoding = encoding;
|
||||||
|
} else {
|
||||||
|
parser.eof = true;
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
while parser.unread < length {
|
while parser.unread < length {
|
||||||
if parser.eof && parser.raw_buffer.is_empty() {
|
if parser.eof {
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
if !first || parser.raw_buffer.is_empty() {
|
|
||||||
yaml_parser_update_raw_buffer(parser)?;
|
let tokens_before = parser.buffer.len();
|
||||||
}
|
|
||||||
first = false;
|
let not_eof = match parser.encoding {
|
||||||
match parser.encoding {
|
YAML_ANY_ENCODING => unreachable!(),
|
||||||
YAML_UTF8_ENCODING => {
|
YAML_UTF8_ENCODING => {
|
||||||
match read_char_utf8(&mut parser.raw_buffer) {
|
read_utf8_buffered(reader, &mut parser.buffer, &mut parser.offset)?
|
||||||
Some(Ok(ch)) => {
|
|
||||||
push_char(parser, ch)?;
|
|
||||||
}
|
}
|
||||||
Some(Err(Utf8Error::Incomplete)) => {
|
YAML_UTF16LE_ENCODING => {
|
||||||
if parser.eof {
|
read_utf16_buffered::<false>(reader, &mut parser.buffer, &mut parser.offset)?
|
||||||
return yaml_parser_set_reader_error(
|
|
||||||
parser,
|
|
||||||
"incomplete UTF-8 octet sequence",
|
|
||||||
parser.offset,
|
|
||||||
-1,
|
|
||||||
);
|
|
||||||
} else {
|
|
||||||
// Read more
|
|
||||||
}
|
}
|
||||||
|
YAML_UTF16BE_ENCODING => {
|
||||||
|
read_utf16_buffered::<true>(reader, &mut parser.buffer, &mut parser.offset)?
|
||||||
}
|
}
|
||||||
Some(Err(Utf8Error::InvalidLeadingOctet)) => {
|
|
||||||
return yaml_parser_set_reader_error(
|
|
||||||
parser,
|
|
||||||
"invalid leading UTF-8 octet",
|
|
||||||
parser.offset,
|
|
||||||
parser.raw_buffer[0] as _,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
Some(Err(Utf8Error::InvalidTrailingOctet(offset))) => {
|
|
||||||
return yaml_parser_set_reader_error(
|
|
||||||
parser,
|
|
||||||
"invalid trailing UTF-8 octet",
|
|
||||||
parser.offset + offset,
|
|
||||||
parser.raw_buffer[offset] as _,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
Some(Err(Utf8Error::InvalidLength)) => {
|
|
||||||
return yaml_parser_set_reader_error(
|
|
||||||
parser,
|
|
||||||
"invalid length of a UTF-8 sequence",
|
|
||||||
parser.offset,
|
|
||||||
-1,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
Some(Err(Utf8Error::InvalidUnicode(value))) => {
|
|
||||||
return yaml_parser_set_reader_error(
|
|
||||||
parser,
|
|
||||||
"invalid Unicode character",
|
|
||||||
parser.offset,
|
|
||||||
value as _,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
None => (),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
YAML_UTF16LE_ENCODING | YAML_UTF16BE_ENCODING => {
|
|
||||||
let is_big_endian = parser.encoding == YAML_UTF16BE_ENCODING;
|
|
||||||
let res = if is_big_endian {
|
|
||||||
read_char_utf16::<true>(&mut parser.raw_buffer)
|
|
||||||
} else {
|
|
||||||
read_char_utf16::<false>(&mut parser.raw_buffer)
|
|
||||||
};
|
};
|
||||||
match res {
|
|
||||||
Some(Ok(ch)) => {
|
let num_read = parser.buffer.len() - tokens_before;
|
||||||
push_char(parser, ch)?;
|
parser.unread += num_read;
|
||||||
}
|
if !not_eof {
|
||||||
Some(Err(Utf16Error::Incomplete)) => {
|
parser.eof = true;
|
||||||
if parser.eof {
|
return Ok(());
|
||||||
return yaml_parser_set_reader_error(
|
|
||||||
parser,
|
|
||||||
"incomplete UTF-16 character",
|
|
||||||
parser.offset,
|
|
||||||
-1,
|
|
||||||
);
|
|
||||||
} else {
|
|
||||||
// Read more
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Some(Err(Utf16Error::UnexpectedLowSurrogateArea(value))) => {
|
|
||||||
return yaml_parser_set_reader_error(
|
|
||||||
parser,
|
|
||||||
"unexpected low surrogate area",
|
|
||||||
parser.offset,
|
|
||||||
value as i32,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
// Some(Err(Utf16Error::IncompleteSurrogatePair)) => {
|
|
||||||
// return yaml_parser_set_reader_error(
|
|
||||||
// parser,
|
|
||||||
// "incomplete UTF-16 surrogate pair",
|
|
||||||
// parser.offset,
|
|
||||||
// -1,
|
|
||||||
// );
|
|
||||||
// }
|
|
||||||
Some(Err(Utf16Error::ExpectedLowSurrogateArea(value))) => {
|
|
||||||
return yaml_parser_set_reader_error(
|
|
||||||
parser,
|
|
||||||
"expected low surrogate area",
|
|
||||||
parser.offset + 2,
|
|
||||||
value as i32,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
Some(Err(Utf16Error::InvalidUnicode(value))) => {
|
|
||||||
return yaml_parser_set_reader_error(
|
|
||||||
parser,
|
|
||||||
"invalid Unicode character",
|
|
||||||
parser.offset,
|
|
||||||
value as i32,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
None => (),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
_ => {
|
|
||||||
panic!("unhandled encoded enum variant")
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if parser.offset >= (!0_usize).wrapping_div(2_usize) {
|
if parser.offset >= (!0_usize).wrapping_div(2_usize) {
|
||||||
return yaml_parser_set_reader_error(parser, "input is too long", parser.offset, -1);
|
return yaml_parser_set_reader_error("input is too long", parser.offset, -1);
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
|
@ -628,7 +628,7 @@ pub struct yaml_alias_data_t {
|
||||||
#[non_exhaustive]
|
#[non_exhaustive]
|
||||||
pub struct yaml_parser_t<'r> {
|
pub struct yaml_parser_t<'r> {
|
||||||
/// Read handler.
|
/// Read handler.
|
||||||
pub(crate) read_handler: Option<&'r mut dyn std::io::Read>,
|
pub(crate) read_handler: Option<&'r mut dyn std::io::BufRead>,
|
||||||
/// EOF flag
|
/// EOF flag
|
||||||
pub(crate) eof: bool,
|
pub(crate) eof: bool,
|
||||||
/// The working buffer.
|
/// The working buffer.
|
||||||
|
@ -637,12 +637,6 @@ pub struct yaml_parser_t<'r> {
|
||||||
pub(crate) buffer: VecDeque<char>,
|
pub(crate) buffer: VecDeque<char>,
|
||||||
/// The number of unread characters in the buffer.
|
/// The number of unread characters in the buffer.
|
||||||
pub(crate) unread: usize,
|
pub(crate) unread: usize,
|
||||||
/// The raw buffer.
|
|
||||||
///
|
|
||||||
/// This is the raw unchecked input from the read handler (for example, it
|
|
||||||
/// may be UTF-16 encoded).
|
|
||||||
// TODO: Get rid of this and ask users to provide something implementing `BufRead` instead of `Read`.
|
|
||||||
pub(crate) raw_buffer: VecDeque<u8>,
|
|
||||||
/// The input encoding.
|
/// The input encoding.
|
||||||
pub(crate) encoding: yaml_encoding_t,
|
pub(crate) encoding: yaml_encoding_t,
|
||||||
/// The offset of the current position (in bytes).
|
/// The offset of the current position (in bytes).
|
||||||
|
|
Loading…
Reference in a new issue