mirror of
https://github.com/simonask/libyaml-safer
synced 2024-11-26 13:20:24 +00:00
Parser: Replace internal buffering with std::io::BufRead
This commit is contained in:
parent
e28400ee5f
commit
5bf087e0a3
5 changed files with 281 additions and 297 deletions
|
@ -25,7 +25,6 @@ pub fn yaml_parser_new<'r>() -> yaml_parser_t<'r> {
|
|||
eof: false,
|
||||
buffer: VecDeque::with_capacity(INPUT_BUFFER_SIZE),
|
||||
unread: 0,
|
||||
raw_buffer: VecDeque::with_capacity(INPUT_RAW_BUFFER_SIZE),
|
||||
encoding: YAML_ANY_ENCODING,
|
||||
offset: 0,
|
||||
mark: yaml_mark_t::default(),
|
||||
|
@ -49,7 +48,6 @@ pub fn yaml_parser_new<'r>() -> yaml_parser_t<'r> {
|
|||
|
||||
/// Destroy a parser.
|
||||
pub fn yaml_parser_delete(parser: &mut yaml_parser_t) {
|
||||
parser.raw_buffer.clear();
|
||||
parser.buffer.clear();
|
||||
parser.tokens.clear();
|
||||
parser.indents.clear();
|
||||
|
@ -70,7 +68,10 @@ pub fn yaml_parser_set_input_string<'r>(parser: &mut yaml_parser_t<'r>, input: &
|
|||
}
|
||||
|
||||
/// Set a generic input handler.
|
||||
pub fn yaml_parser_set_input<'r>(parser: &mut yaml_parser_t<'r>, input: &'r mut dyn std::io::Read) {
|
||||
pub fn yaml_parser_set_input<'r>(
|
||||
parser: &mut yaml_parser_t<'r>,
|
||||
input: &'r mut dyn std::io::BufRead,
|
||||
) {
|
||||
assert!((parser.read_handler).is_none());
|
||||
parser.read_handler = Some(input);
|
||||
}
|
||||
|
|
|
@ -29,7 +29,8 @@ pub(crate) fn test_main(
|
|||
) -> Result<(), Box<dyn Error>> {
|
||||
let mut parser = yaml_parser_new();
|
||||
|
||||
yaml_parser_set_input(&mut parser, stdin);
|
||||
let mut stdin = std::io::BufReader::new(stdin);
|
||||
yaml_parser_set_input(&mut parser, &mut stdin);
|
||||
|
||||
loop {
|
||||
let event = match yaml_parser_parse(&mut parser) {
|
||||
|
|
|
@ -24,6 +24,12 @@ pub enum ReaderError {
|
|||
offset: usize,
|
||||
value: i32,
|
||||
},
|
||||
#[error("input stream produced an invalid byte order marker")]
|
||||
InvalidBom,
|
||||
#[error("invalid UTF-8 byte at offset: {value:x}")]
|
||||
InvalidUtf8 { value: u8 },
|
||||
#[error("invalid UTF-16 unpaired surrogate: {value:x}")]
|
||||
InvalidUtf16 { value: u16 },
|
||||
#[error(transparent)]
|
||||
Io(#[from] std::io::Error),
|
||||
}
|
||||
|
|
554
src/reader.rs
554
src/reader.rs
|
@ -1,14 +1,13 @@
|
|||
use std::io::BufRead;
|
||||
|
||||
use alloc::collections::VecDeque;
|
||||
|
||||
use crate::api::INPUT_RAW_BUFFER_SIZE;
|
||||
use crate::macros::vecdeque_starts_with;
|
||||
use crate::{
|
||||
yaml_parser_t, ReaderError, YAML_ANY_ENCODING, YAML_UTF16BE_ENCODING, YAML_UTF16LE_ENCODING,
|
||||
YAML_UTF8_ENCODING,
|
||||
yaml_encoding_t, yaml_parser_t, ReaderError, YAML_ANY_ENCODING, YAML_UTF16BE_ENCODING,
|
||||
YAML_UTF16LE_ENCODING, YAML_UTF8_ENCODING,
|
||||
};
|
||||
|
||||
fn yaml_parser_set_reader_error<T>(
|
||||
_parser: &mut yaml_parser_t,
|
||||
problem: &'static str,
|
||||
offset: usize,
|
||||
value: i32,
|
||||
|
@ -20,168 +19,254 @@ fn yaml_parser_set_reader_error<T>(
|
|||
})
|
||||
}
|
||||
|
||||
const BOM_UTF8: &[u8] = b"\xEF\xBB\xBF";
|
||||
const BOM_UTF16LE: &[u8] = b"\xFF\xFE";
|
||||
const BOM_UTF16BE: &[u8] = b"\xFE\xFF";
|
||||
/// Byte order mark that introduces a UTF-8 encoded stream.
const BOM_UTF8: [u8; 3] = *b"\xEF\xBB\xBF";
/// Byte order mark that introduces a little-endian UTF-16 stream.
const BOM_UTF16LE: [u8; 2] = *b"\xFF\xFE";
/// Byte order mark that introduces a big-endian UTF-16 stream.
const BOM_UTF16BE: [u8; 2] = *b"\xFE\xFF";
|
||||
|
||||
fn yaml_parser_determine_encoding(parser: &mut yaml_parser_t) -> Result<(), ReaderError> {
|
||||
while !parser.eof && parser.raw_buffer.len() < 3 {
|
||||
yaml_parser_update_raw_buffer(parser)?;
|
||||
}
|
||||
if vecdeque_starts_with(&parser.raw_buffer, BOM_UTF16LE) {
|
||||
parser.encoding = YAML_UTF16LE_ENCODING;
|
||||
parser.raw_buffer.drain(0..2);
|
||||
parser.offset += 2;
|
||||
} else if vecdeque_starts_with(&parser.raw_buffer, BOM_UTF16BE) {
|
||||
parser.encoding = YAML_UTF16BE_ENCODING;
|
||||
parser.raw_buffer.drain(0..2);
|
||||
parser.offset += 2;
|
||||
} else if vecdeque_starts_with(&parser.raw_buffer, BOM_UTF8) {
|
||||
parser.encoding = YAML_UTF8_ENCODING;
|
||||
parser.raw_buffer.drain(0..3);
|
||||
parser.offset += 3;
|
||||
} else {
|
||||
parser.encoding = YAML_UTF8_ENCODING;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn yaml_parser_update_raw_buffer(parser: &mut yaml_parser_t) -> Result<(), ReaderError> {
|
||||
if parser.raw_buffer.len() >= INPUT_RAW_BUFFER_SIZE {
|
||||
return Ok(());
|
||||
}
|
||||
if parser.eof {
|
||||
return Ok(());
|
||||
fn yaml_parser_determine_encoding(
|
||||
reader: &mut dyn BufRead,
|
||||
) -> Result<Option<yaml_encoding_t>, ReaderError> {
|
||||
let initial_bytes = reader.fill_buf()?;
|
||||
if initial_bytes.is_empty() {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let len_before = parser.raw_buffer.len();
|
||||
debug_assert!(len_before < INPUT_RAW_BUFFER_SIZE);
|
||||
parser.raw_buffer.resize(INPUT_RAW_BUFFER_SIZE, 0);
|
||||
let contiguous = parser.raw_buffer.make_contiguous();
|
||||
let write_to = &mut contiguous[len_before..];
|
||||
|
||||
let size_read = parser
|
||||
.read_handler
|
||||
.as_mut()
|
||||
.expect("non-null read handler")
|
||||
.read(write_to)?;
|
||||
|
||||
let valid_size = len_before + size_read;
|
||||
parser.raw_buffer.truncate(valid_size);
|
||||
if size_read == 0 {
|
||||
parser.eof = true;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn utf8_char_width_and_initial_value(initial: u8) -> (usize, u32) {
|
||||
let initial = initial as u32;
|
||||
if initial & 0x80 == 0 {
|
||||
(1, initial & 0x7f)
|
||||
} else if initial & 0xE0 == 0xC0 {
|
||||
(2, initial & 0x1f)
|
||||
} else if initial & 0xF0 == 0xE0 {
|
||||
(3, initial & 0x0f)
|
||||
} else if initial & 0xF8 == 0xF0 {
|
||||
(4, initial & 0x07)
|
||||
} else {
|
||||
(0, 0)
|
||||
}
|
||||
}
|
||||
|
||||
enum Utf8Error {
|
||||
Incomplete,
|
||||
InvalidLeadingOctet,
|
||||
InvalidTrailingOctet(usize),
|
||||
InvalidLength,
|
||||
InvalidUnicode(u32),
|
||||
}
|
||||
|
||||
fn read_char_utf8(raw: &mut VecDeque<u8>) -> Option<Result<char, Utf8Error>> {
|
||||
let first = raw.front().copied()?;
|
||||
let (width, mut value) = utf8_char_width_and_initial_value(first);
|
||||
if width == 0 {
|
||||
return Some(Err(Utf8Error::InvalidLeadingOctet));
|
||||
}
|
||||
if raw.len() < width {
|
||||
return Some(Err(Utf8Error::Incomplete));
|
||||
}
|
||||
for (i, trailing) in raw.iter().enumerate().take(width).skip(1) {
|
||||
if trailing & 0xc0 != 0x80 {
|
||||
return Some(Err(Utf8Error::InvalidTrailingOctet(i)));
|
||||
match initial_bytes[0] {
|
||||
0xef => {
|
||||
let mut bom = [0; 3];
|
||||
reader.read_exact(&mut bom)?;
|
||||
if bom == BOM_UTF8 {
|
||||
Ok(Some(YAML_UTF8_ENCODING))
|
||||
} else {
|
||||
Err(ReaderError::InvalidBom)
|
||||
}
|
||||
}
|
||||
value <<= 6;
|
||||
value += *trailing as u32 & 0x3f;
|
||||
}
|
||||
if !(width == 1
|
||||
|| width == 2 && value >= 0x80
|
||||
|| width == 3 && value >= 0x800
|
||||
|| width == 4 && value >= 0x10000)
|
||||
{
|
||||
return Some(Err(Utf8Error::InvalidLength));
|
||||
}
|
||||
if let Some(ch) = char::from_u32(value) {
|
||||
raw.drain(..width);
|
||||
Some(Ok(ch))
|
||||
} else {
|
||||
Some(Err(Utf8Error::InvalidUnicode(value)))
|
||||
0xff | 0xfe => {
|
||||
let mut bom = [0; 2];
|
||||
reader.read_exact(&mut bom)?;
|
||||
if bom == BOM_UTF16LE {
|
||||
Ok(Some(YAML_UTF16LE_ENCODING))
|
||||
} else if bom == BOM_UTF16BE {
|
||||
Ok(Some(YAML_UTF16BE_ENCODING))
|
||||
} else {
|
||||
Err(ReaderError::InvalidBom)
|
||||
}
|
||||
}
|
||||
_ => Ok(Some(YAML_UTF8_ENCODING)),
|
||||
}
|
||||
}
|
||||
|
||||
enum Utf16Error {
|
||||
Incomplete,
|
||||
UnexpectedLowSurrogateArea(u32),
|
||||
ExpectedLowSurrogateArea(u32),
|
||||
InvalidUnicode(u32),
|
||||
}
|
||||
|
||||
fn read_char_utf16<const BIG_ENDIAN: bool>(
|
||||
raw: &mut VecDeque<u8>,
|
||||
) -> Option<Result<char, Utf16Error>> {
|
||||
if raw.is_empty() {
|
||||
return None;
|
||||
}
|
||||
if raw.len() < 2 {
|
||||
return Some(Err(Utf16Error::Incomplete));
|
||||
}
|
||||
let bytes = [raw[0], raw[1]];
|
||||
let mut value = if BIG_ENDIAN {
|
||||
u16::from_be_bytes(bytes) as u32
|
||||
} else {
|
||||
u16::from_le_bytes(bytes) as u32
|
||||
fn read_utf8_buffered(
|
||||
reader: &mut dyn BufRead,
|
||||
out: &mut VecDeque<char>,
|
||||
offset: &mut usize,
|
||||
) -> Result<bool, ReaderError> {
|
||||
let available = loop {
|
||||
match reader.fill_buf() {
|
||||
Ok([]) => return Ok(false),
|
||||
Ok(available) => break available,
|
||||
Err(err) if err.kind() == std::io::ErrorKind::Interrupted => continue,
|
||||
Err(err) => return Err(err.into()),
|
||||
}
|
||||
};
|
||||
if value & 0xfc00 == 0xdc00 {
|
||||
return Some(Err(Utf16Error::UnexpectedLowSurrogateArea(value)));
|
||||
}
|
||||
let width;
|
||||
if value & 0xfc00 == 0xd800 {
|
||||
width = 4;
|
||||
if raw.len() < width {
|
||||
return Some(Err(Utf16Error::Incomplete));
|
||||
}
|
||||
let bytes2 = [raw[2], raw[3]];
|
||||
let value2 = if BIG_ENDIAN {
|
||||
u16::from_be_bytes(bytes2) as u32
|
||||
} else {
|
||||
u16::from_le_bytes(bytes2) as u32
|
||||
};
|
||||
if value2 & 0xfc00 != 0xdc00 {
|
||||
return Some(Err(Utf16Error::ExpectedLowSurrogateArea(value2)));
|
||||
}
|
||||
value = (0x10000 + (value & 0x3ff)) << (10 + (value2 & 0x3ff));
|
||||
} else {
|
||||
width = 2;
|
||||
}
|
||||
|
||||
if let Some(ch) = char::from_u32(value) {
|
||||
raw.drain(..width);
|
||||
Some(Ok(ch))
|
||||
} else {
|
||||
Some(Err(Utf16Error::InvalidUnicode(value)))
|
||||
match core::str::from_utf8(available) {
|
||||
Ok(valid) => {
|
||||
let used = valid.len();
|
||||
// The entire contents of the input buffer was valid UTF-8.
|
||||
for ch in valid.chars() {
|
||||
push_char(out, ch, *offset)?;
|
||||
*offset += ch.len_utf8();
|
||||
}
|
||||
reader.consume(used);
|
||||
Ok(true)
|
||||
}
|
||||
Err(err) => {
|
||||
let valid_bytes = err.valid_up_to();
|
||||
|
||||
// If some of the buffer contents were valid, append that to the
|
||||
// output.
|
||||
let valid = unsafe {
|
||||
// SAFETY: This is safe because of `valid_up_to()`.
|
||||
core::str::from_utf8_unchecked(&available[..valid_bytes])
|
||||
};
|
||||
for ch in valid.chars() {
|
||||
push_char(out, ch, *offset)?;
|
||||
*offset += ch.len_utf8();
|
||||
}
|
||||
|
||||
match err.error_len() {
|
||||
Some(_invalid_len) => {
|
||||
return Err(ReaderError::InvalidUtf8 {
|
||||
value: available[valid_bytes],
|
||||
});
|
||||
}
|
||||
None => {
|
||||
if valid_bytes != 0 {
|
||||
// Some valid UTF-8 characters were present, and the
|
||||
// tail end of the buffer was an incomplete sequence.
|
||||
// Leave the incomplete sequence in the buffer.
|
||||
reader.consume(valid_bytes);
|
||||
Ok(true)
|
||||
} else {
|
||||
// The beginning of the buffer was an incomplete UTF-8
|
||||
// sequence. Read the whole character unbuffered.
|
||||
//
|
||||
// This will return `UnexpectedEof` if the sequence
|
||||
// cannot be completed. Note that `read_exact()` handles
|
||||
// interrupt automatically.
|
||||
let initial = available[0];
|
||||
read_utf8_char_unbuffered(reader, out, initial, offset)?;
|
||||
Ok(true)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn push_char(parser: &mut yaml_parser_t, ch: char) -> Result<(), ReaderError> {
|
||||
fn read_utf8_char_unbuffered(
|
||||
reader: &mut dyn BufRead,
|
||||
out: &mut VecDeque<char>,
|
||||
initial: u8,
|
||||
offset: &mut usize,
|
||||
) -> Result<(), ReaderError> {
|
||||
let width = utf8_char_width(initial);
|
||||
let mut buffer = [0; 4];
|
||||
reader.read_exact(&mut buffer[..width])?;
|
||||
if let Ok(valid) = core::str::from_utf8(&buffer[..width]) {
|
||||
// We read a whole, valid character.
|
||||
let Some(ch) = valid.chars().next() else {
|
||||
unreachable!()
|
||||
};
|
||||
push_char(out, ch, *offset)?;
|
||||
*offset += width;
|
||||
Ok(())
|
||||
} else {
|
||||
// Since we read the exact character width, the only
|
||||
// possible error here is invalid Unicode.
|
||||
Err(ReaderError::InvalidUtf8 { value: buffer[0] })
|
||||
}
|
||||
}
|
||||
|
||||
fn read_utf16_buffered<const BIG_ENDIAN: bool>(
|
||||
reader: &mut dyn BufRead,
|
||||
out: &mut VecDeque<char>,
|
||||
offset: &mut usize,
|
||||
) -> Result<bool, ReaderError> {
|
||||
let available = loop {
|
||||
match reader.fill_buf() {
|
||||
Ok([]) => return Ok(false),
|
||||
Ok(available) => break available,
|
||||
Err(err) if err.kind() == std::io::ErrorKind::Interrupted => continue,
|
||||
Err(err) => return Err(err.into()),
|
||||
}
|
||||
};
|
||||
|
||||
let chunks = available.chunks_exact(2).map(|chunk| {
|
||||
let [a, b] = chunk else { unreachable!() };
|
||||
if BIG_ENDIAN {
|
||||
u16::from_be_bytes([*a, *b])
|
||||
} else {
|
||||
u16::from_le_bytes([*a, *b])
|
||||
}
|
||||
});
|
||||
|
||||
let mut used = 0;
|
||||
for ch in core::char::decode_utf16(chunks) {
|
||||
match ch {
|
||||
Ok(ch) => {
|
||||
push_char(out, ch, *offset)?;
|
||||
let n = ch.len_utf16();
|
||||
*offset += n;
|
||||
used += n;
|
||||
}
|
||||
Err(_) => {
|
||||
// An unpaired surrogate may either be a corrupt stream, but it
|
||||
// can also be that the buffer just happens to contain the first
|
||||
// half of a surrogate pair. Consume all of the valid bytes in
|
||||
// the buffer first, and then handle the unpaired surrogate in
|
||||
// the "slow" path (`read_utf16_char_unbuffered`) the next time
|
||||
// we are called.
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if used != 0 {
|
||||
reader.consume(used);
|
||||
*offset += used;
|
||||
Ok(true)
|
||||
} else {
|
||||
debug_assert!(available.len() != 0 && available.len() < 2);
|
||||
read_utf16_char_unbuffered::<BIG_ENDIAN>(reader, out, offset)?;
|
||||
Ok(true)
|
||||
}
|
||||
}
|
||||
|
||||
fn read_utf16_char_unbuffered<const BIG_ENDIAN: bool>(
|
||||
reader: &mut dyn BufRead,
|
||||
out: &mut VecDeque<char>,
|
||||
offset: &mut usize,
|
||||
) -> Result<(), ReaderError> {
|
||||
let mut buffer = [0; 2];
|
||||
reader.read_exact(&mut buffer)?;
|
||||
let first = if BIG_ENDIAN {
|
||||
u16::from_be_bytes(buffer)
|
||||
} else {
|
||||
u16::from_le_bytes(buffer)
|
||||
};
|
||||
|
||||
if is_utf16_surrogate(first) {
|
||||
reader.read_exact(&mut buffer)?;
|
||||
let second = if BIG_ENDIAN {
|
||||
u16::from_be_bytes(buffer)
|
||||
} else {
|
||||
u16::from_le_bytes(buffer)
|
||||
};
|
||||
|
||||
match core::char::decode_utf16([first, second]).next() {
|
||||
Some(Ok(ch)) => {
|
||||
push_char(out, ch, *offset)?;
|
||||
*offset += 4;
|
||||
Ok(())
|
||||
}
|
||||
Some(Err(err)) => Err(ReaderError::InvalidUtf16 {
|
||||
value: err.unpaired_surrogate(),
|
||||
}),
|
||||
None => unreachable!(),
|
||||
}
|
||||
} else {
|
||||
match core::char::decode_utf16([first]).next() {
|
||||
Some(Ok(ch)) => {
|
||||
push_char(out, ch, *offset)?;
|
||||
*offset += 2;
|
||||
Ok(())
|
||||
}
|
||||
Some(Err(_)) | None => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Length in bytes of the UTF-8 sequence introduced by the byte `initial`.
///
/// The width is encoded in the number of leading one bits of the first byte:
/// none for a single-byte character, two to four for a multi-byte sequence.
/// Any other pattern (a continuation byte, or five or more leading ones) is
/// not a valid first byte and yields `0`.
fn utf8_char_width(initial: u8) -> usize {
    match initial.leading_ones() {
        0 => 1,
        2 => 2,
        3 => 3,
        4 => 4,
        _ => 0,
    }
}
|
||||
|
||||
/// Whether `value` lies in the UTF-16 surrogate range `0xD800..=0xDFFF`
/// (covering both the high and the low halves).
fn is_utf16_surrogate(value: u16) -> bool {
    (0xD800..=0xDFFF).contains(&value)
}
|
||||
|
||||
fn push_char(out: &mut VecDeque<char>, ch: char, offset: usize) -> Result<(), ReaderError> {
|
||||
if !(ch == '\x09'
|
||||
|| ch == '\x0A'
|
||||
|| ch == '\x0D'
|
||||
|
@ -191,16 +276,9 @@ fn push_char(parser: &mut yaml_parser_t, ch: char) -> Result<(), ReaderError> {
|
|||
|| ch >= '\u{E000}' && ch <= '\u{FFFD}'
|
||||
|| ch >= '\u{10000}' && ch <= '\u{10FFFF}')
|
||||
{
|
||||
return yaml_parser_set_reader_error(
|
||||
parser,
|
||||
"control characters are not allowed",
|
||||
parser.offset,
|
||||
ch as _,
|
||||
);
|
||||
return yaml_parser_set_reader_error("control characters are not allowed", offset, ch as _);
|
||||
}
|
||||
parser.buffer.push_back(ch);
|
||||
parser.offset += ch.len_utf8();
|
||||
parser.unread += 1;
|
||||
out.push_back(ch);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
@ -208,145 +286,49 @@ pub(crate) fn yaml_parser_update_buffer(
|
|||
parser: &mut yaml_parser_t,
|
||||
length: usize,
|
||||
) -> Result<(), ReaderError> {
|
||||
let mut first = true;
|
||||
assert!((parser.read_handler).is_some());
|
||||
if parser.eof && parser.raw_buffer.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
let reader = parser.read_handler.as_deref_mut().expect("no read handler");
|
||||
if parser.unread >= length {
|
||||
return Ok(());
|
||||
}
|
||||
if parser.encoding == YAML_ANY_ENCODING {
|
||||
yaml_parser_determine_encoding(parser)?;
|
||||
if let Some(encoding) = yaml_parser_determine_encoding(reader)? {
|
||||
parser.encoding = encoding;
|
||||
} else {
|
||||
parser.eof = true;
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
while parser.unread < length {
|
||||
if parser.eof && parser.raw_buffer.is_empty() {
|
||||
if parser.eof {
|
||||
return Ok(());
|
||||
}
|
||||
if !first || parser.raw_buffer.is_empty() {
|
||||
yaml_parser_update_raw_buffer(parser)?;
|
||||
}
|
||||
first = false;
|
||||
match parser.encoding {
|
||||
|
||||
let tokens_before = parser.buffer.len();
|
||||
|
||||
let not_eof = match parser.encoding {
|
||||
YAML_ANY_ENCODING => unreachable!(),
|
||||
YAML_UTF8_ENCODING => {
|
||||
match read_char_utf8(&mut parser.raw_buffer) {
|
||||
Some(Ok(ch)) => {
|
||||
push_char(parser, ch)?;
|
||||
}
|
||||
Some(Err(Utf8Error::Incomplete)) => {
|
||||
if parser.eof {
|
||||
return yaml_parser_set_reader_error(
|
||||
parser,
|
||||
"incomplete UTF-8 octet sequence",
|
||||
parser.offset,
|
||||
-1,
|
||||
);
|
||||
} else {
|
||||
// Read more
|
||||
}
|
||||
}
|
||||
Some(Err(Utf8Error::InvalidLeadingOctet)) => {
|
||||
return yaml_parser_set_reader_error(
|
||||
parser,
|
||||
"invalid leading UTF-8 octet",
|
||||
parser.offset,
|
||||
parser.raw_buffer[0] as _,
|
||||
);
|
||||
}
|
||||
Some(Err(Utf8Error::InvalidTrailingOctet(offset))) => {
|
||||
return yaml_parser_set_reader_error(
|
||||
parser,
|
||||
"invalid trailing UTF-8 octet",
|
||||
parser.offset + offset,
|
||||
parser.raw_buffer[offset] as _,
|
||||
);
|
||||
}
|
||||
Some(Err(Utf8Error::InvalidLength)) => {
|
||||
return yaml_parser_set_reader_error(
|
||||
parser,
|
||||
"invalid length of a UTF-8 sequence",
|
||||
parser.offset,
|
||||
-1,
|
||||
);
|
||||
}
|
||||
Some(Err(Utf8Error::InvalidUnicode(value))) => {
|
||||
return yaml_parser_set_reader_error(
|
||||
parser,
|
||||
"invalid Unicode character",
|
||||
parser.offset,
|
||||
value as _,
|
||||
);
|
||||
}
|
||||
None => (),
|
||||
}
|
||||
read_utf8_buffered(reader, &mut parser.buffer, &mut parser.offset)?
|
||||
}
|
||||
YAML_UTF16LE_ENCODING | YAML_UTF16BE_ENCODING => {
|
||||
let is_big_endian = parser.encoding == YAML_UTF16BE_ENCODING;
|
||||
let res = if is_big_endian {
|
||||
read_char_utf16::<true>(&mut parser.raw_buffer)
|
||||
} else {
|
||||
read_char_utf16::<false>(&mut parser.raw_buffer)
|
||||
};
|
||||
match res {
|
||||
Some(Ok(ch)) => {
|
||||
push_char(parser, ch)?;
|
||||
}
|
||||
Some(Err(Utf16Error::Incomplete)) => {
|
||||
if parser.eof {
|
||||
return yaml_parser_set_reader_error(
|
||||
parser,
|
||||
"incomplete UTF-16 character",
|
||||
parser.offset,
|
||||
-1,
|
||||
);
|
||||
} else {
|
||||
// Read more
|
||||
}
|
||||
}
|
||||
Some(Err(Utf16Error::UnexpectedLowSurrogateArea(value))) => {
|
||||
return yaml_parser_set_reader_error(
|
||||
parser,
|
||||
"unexpected low surrogate area",
|
||||
parser.offset,
|
||||
value as i32,
|
||||
);
|
||||
}
|
||||
// Some(Err(Utf16Error::IncompleteSurrogatePair)) => {
|
||||
// return yaml_parser_set_reader_error(
|
||||
// parser,
|
||||
// "incomplete UTF-16 surrogate pair",
|
||||
// parser.offset,
|
||||
// -1,
|
||||
// );
|
||||
// }
|
||||
Some(Err(Utf16Error::ExpectedLowSurrogateArea(value))) => {
|
||||
return yaml_parser_set_reader_error(
|
||||
parser,
|
||||
"expected low surrogate area",
|
||||
parser.offset + 2,
|
||||
value as i32,
|
||||
);
|
||||
}
|
||||
Some(Err(Utf16Error::InvalidUnicode(value))) => {
|
||||
return yaml_parser_set_reader_error(
|
||||
parser,
|
||||
"invalid Unicode character",
|
||||
parser.offset,
|
||||
value as i32,
|
||||
);
|
||||
}
|
||||
None => (),
|
||||
}
|
||||
YAML_UTF16LE_ENCODING => {
|
||||
read_utf16_buffered::<false>(reader, &mut parser.buffer, &mut parser.offset)?
|
||||
}
|
||||
_ => {
|
||||
panic!("unhandled encoded enum variant")
|
||||
YAML_UTF16BE_ENCODING => {
|
||||
read_utf16_buffered::<true>(reader, &mut parser.buffer, &mut parser.offset)?
|
||||
}
|
||||
};
|
||||
|
||||
let num_read = parser.buffer.len() - tokens_before;
|
||||
parser.unread += num_read;
|
||||
if !not_eof {
|
||||
parser.eof = true;
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
if parser.offset >= (!0_usize).wrapping_div(2_usize) {
|
||||
return yaml_parser_set_reader_error(parser, "input is too long", parser.offset, -1);
|
||||
return yaml_parser_set_reader_error("input is too long", parser.offset, -1);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
|
|
@ -628,7 +628,7 @@ pub struct yaml_alias_data_t {
|
|||
#[non_exhaustive]
|
||||
pub struct yaml_parser_t<'r> {
|
||||
/// Read handler.
|
||||
pub(crate) read_handler: Option<&'r mut dyn std::io::Read>,
|
||||
pub(crate) read_handler: Option<&'r mut dyn std::io::BufRead>,
|
||||
/// EOF flag
|
||||
pub(crate) eof: bool,
|
||||
/// The working buffer.
|
||||
|
@ -637,12 +637,6 @@ pub struct yaml_parser_t<'r> {
|
|||
pub(crate) buffer: VecDeque<char>,
|
||||
/// The number of unread characters in the buffer.
|
||||
pub(crate) unread: usize,
|
||||
/// The raw buffer.
|
||||
///
|
||||
/// This is the raw unchecked input from the read handler (for example, it
|
||||
/// may be UTF-16 encoded).
|
||||
// TODO: Get rid of this and ask users to provide something implementing `BufRead` instead of `Read`.
|
||||
pub(crate) raw_buffer: VecDeque<u8>,
|
||||
/// The input encoding.
|
||||
pub(crate) encoding: yaml_encoding_t,
|
||||
/// The offset of the current position (in bytes).
|
||||
|
|
Loading…
Reference in a new issue