From 6aef00ecffa4fd56e96d254019658d387d52c653 Mon Sep 17 00:00:00 2001 From: JT Date: Sat, 17 Jul 2021 09:55:12 +1200 Subject: [PATCH] basic signature parse --- src/eval.rs | 1 + src/lex.rs | 66 +++++++++----- src/lite_parse.rs | 2 +- src/main.rs | 6 +- src/parser.rs | 226 ++++++++++++++++++++++++++++++++++++++++++---- 5 files changed, 255 insertions(+), 46 deletions(-) diff --git a/src/eval.rs b/src/eval.rs index 078467ab73..3cabc4f75e 100644 --- a/src/eval.rs +++ b/src/eval.rs @@ -64,6 +64,7 @@ impl Engine { Expr::Table(_, _) => Err(ShellError::Unsupported(expr.span)), Expr::Literal(_) => Err(ShellError::Unsupported(expr.span)), Expr::String(_) => Err(ShellError::Unsupported(expr.span)), + Expr::Signature(_) => Err(ShellError::Unsupported(expr.span)), Expr::Garbage => Err(ShellError::Unsupported(expr.span)), } } diff --git a/src/lex.rs b/src/lex.rs index 47220aa13b..ca58acf70e 100644 --- a/src/lex.rs +++ b/src/lex.rs @@ -38,18 +38,28 @@ impl BlockKind { } } -#[derive(PartialEq, Eq, Debug, Clone, Copy)] +#[derive(PartialEq, Eq, Debug, Clone)] pub enum LexMode { Normal, - CommaIsSpace, - NewlineIsSpace, - CommaAndNewlineIsSpace, + Custom { + whitespace: Vec, + special: Vec, + }, +} + +impl LexMode { + pub fn whitespace_contains(&self, b: u8) -> bool { + match self { + LexMode::Custom { ref whitespace, .. } => whitespace.contains(&b), + _ => false, + } + } } // A baseline token is terminated if it's not nested inside of a paired // delimiter and the next character is one of: `|`, `;`, `#` or any // whitespace. -fn is_item_terminator(block_level: &[BlockKind], c: u8, lex_mode: LexMode) -> bool { +fn is_item_terminator(block_level: &[BlockKind], c: u8, lex_mode: &LexMode) -> bool { block_level.is_empty() && (c == b' ' || c == b'\t' @@ -57,14 +67,25 @@ fn is_item_terminator(block_level: &[BlockKind], c: u8, lex_mode: LexMode) -> bo || c == b'|' || c == b';' || c == b'#' - || (c == b',' && lex_mode == LexMode::CommaIsSpace) - || (c == b',' && lex_mode == LexMode::CommaAndNewlineIsSpace)) + || lex_mode.whitespace_contains(c)) +} + +// A special token is one that is a byte that stands alone as its own token. For example +// when parsing a signature you may want to have `:` be able to separate tokens and also +// to be handled as its own token to notify you you're about to parse a type in the example +// `foo:bar` +fn is_special_item(block_level: &[BlockKind], c: u8, lex_mode: &LexMode) -> bool { + block_level.is_empty() + && (match lex_mode { + LexMode::Custom { special, .. } => special.contains(&c), + _ => false, + }) } pub fn lex_item( input: &[u8], curr_offset: &mut usize, - lex_mode: LexMode, + lex_mode: &LexMode, ) -> (Span, Option) { // This variable tracks the starting character of a string literal, so that // we remain inside the string literal lexer mode until we encounter the @@ -99,19 +120,22 @@ pub fn lex_item( quote_start = None; } } else if c == b'#' { - if is_item_terminator(&block_level, c, lex_mode) { + if is_item_terminator(&block_level, c, &lex_mode) { break; } in_comment = true; } else if c == b'\n' { in_comment = false; - if is_item_terminator(&block_level, c, lex_mode) { + if is_item_terminator(&block_level, c, &lex_mode) { break; } } else if in_comment { - if is_item_terminator(&block_level, c, lex_mode) { + if is_item_terminator(&block_level, c, &lex_mode) { break; } + } else if is_special_item(&block_level, c, &lex_mode) && token_start == *curr_offset { + *curr_offset += 1; + break; } else if c == b'\'' || c == b'"' { // We encountered the opening quote of a string literal. quote_start = Some(c); @@ -140,7 +164,7 @@ pub fn lex_item( if let Some(BlockKind::Paren) = block_level.last() { let _ = block_level.pop(); } - } else if is_item_terminator(&block_level, c, lex_mode) { + } else if is_item_terminator(&block_level, c, &lex_mode) { break; } @@ -182,7 +206,7 @@ pub fn lex_item( pub fn lex( input: &[u8], span_offset: usize, - lex_mode: LexMode, + lex_mode: &LexMode, ) -> (Vec, Option) { let mut error = None; @@ -239,7 +263,7 @@ pub fn lex( let idx = curr_offset; curr_offset += 1; - if lex_mode != LexMode::NewlineIsSpace && lex_mode != LexMode::CommaAndNewlineIsSpace { + if !lex_mode.whitespace_contains(c) { output.push(Token::new(TokenContents::Eol, Span::new(idx, idx + 1))); } } else if c == b'#' { @@ -265,17 +289,13 @@ pub fn lex( Span::new(start, curr_offset), )); } - } else if c == b' ' - || c == b'\t' - || (c == b',' && lex_mode == LexMode::CommaIsSpace) - || (c == b',' && lex_mode == LexMode::CommaAndNewlineIsSpace) - { + } else if c == b' ' || c == b'\t' || lex_mode.whitespace_contains(c) { // If the next character is non-newline whitespace, skip it. curr_offset += 1; } else { // Otherwise, try to consume an unclassified token. - let (span, err) = lex_item(input, &mut curr_offset, lex_mode); + let (span, err) = lex_item(input, &mut curr_offset, &lex_mode); if error.is_none() { error = err; } @@ -294,7 +314,7 @@ mod lex_tests { fn lex_basic() { let file = b"let x = 4"; - let output = lex(file, 0, LexMode::Normal); + let output = lex(file, 0, &LexMode::Normal); assert!(output.1.is_none()); } @@ -303,7 +323,7 @@ mod lex_tests { fn lex_newline() { let file = b"let x = 300\nlet y = 500;"; - let output = lex(file, 0, LexMode::Normal); + let output = lex(file, 0, &LexMode::Normal); println!("{:#?}", output.0); assert!(output.0.contains(&Token { @@ -316,7 +336,7 @@ mod lex_tests { fn lex_empty() { let file = b""; - let output = lex(file, 0, LexMode::Normal); + let output = lex(file, 0, &LexMode::Normal); assert!(output.0.is_empty()); assert!(output.1.is_none()); diff --git a/src/lite_parse.rs b/src/lite_parse.rs index 9e3e15a3d9..6a28638391 100644 --- a/src/lite_parse.rs +++ b/src/lite_parse.rs @@ -128,7 +128,7 @@ mod tests { use crate::{lex, lite_parse, LiteBlock, ParseError, Span}; fn lite_parse_helper(input: &[u8]) -> Result { - let (output, err) = lex(input, 0, crate::LexMode::Normal); + let (output, err) = lex(input, 0, &crate::LexMode::Normal); if let Some(err) = err { return Err(err); } diff --git a/src/main.rs b/src/main.rs index a699ae5ab6..14af5a479d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -56,11 +56,7 @@ fn main() -> std::io::Result<()> { let sig = Signature::build("def") .required("def_name", SyntaxShape::String, "definition name") - .required( - "params", - SyntaxShape::List(Box::new(SyntaxShape::VarWithOptType)), - "parameters", - ) + .required("params", SyntaxShape::Signature, "parameters") .required("block", SyntaxShape::Block, "body of the definition"); working_set.add_decl(sig.into()); diff --git a/src/parser.rs b/src/parser.rs index 3afafe3912..3201f5067a 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -3,8 +3,8 @@ use std::ops::{Index, IndexMut}; use crate::{ lex, lite_parse, parser_state::{Type, VarId}, - signature::Flag, - BlockId, DeclId, Declaration, LiteBlock, ParseError, ParserWorkingSet, Signature, Span, + signature::{Flag, PositionalArg}, + BlockId, DeclId, Declaration, LiteBlock, ParseError, ParserWorkingSet, Signature, Span, Token, }; /// The syntactic shapes that values must match to be passed into a command. You can think of this as the type-checking that occurs when you call a function. @@ -71,6 +71,9 @@ pub enum SyntaxShape { /// A variable with optional type, `x` or `x: int` VarWithOptType, + /// A signature for a definition, `[x:int, --foo]` + Signature, + /// A general expression, eg `1 + 2` or `foo --bar` Expression, } @@ -135,6 +138,7 @@ pub enum Expr { Table(Vec, Vec>), Literal(Vec), String(String), // FIXME: improve this in the future? + Signature(Signature), Garbage, } @@ -185,6 +189,13 @@ impl Expression { } } + pub fn as_signature(self) -> Option { + match self.expr { + Expr::Signature(sig) => Some(sig), + _ => None, + } + } + pub fn as_list(self) -> Option> { match self.expr { Expr::List(list) => Some(list), @@ -787,7 +798,7 @@ impl ParserWorkingSet { let source = self.get_span_contents(span); - let (output, err) = lex(&source, start, crate::LexMode::Normal); + let (output, err) = lex(&source, start, &crate::LexMode::Normal); error = error.or(err); let (output, err) = lite_parse(&output); @@ -826,6 +837,28 @@ impl ParserWorkingSet { } } + //TODO: Handle error case + pub fn parse_shape_name(&self, bytes: &[u8]) -> SyntaxShape { + match bytes { + b"any" => SyntaxShape::Any, + b"string" => SyntaxShape::String, + b"column-path" => SyntaxShape::ColumnPath, + b"number" => SyntaxShape::Number, + b"range" => SyntaxShape::Range, + b"int" => SyntaxShape::Int, + b"path" => SyntaxShape::FilePath, + b"glob" => SyntaxShape::GlobPattern, + b"block" => SyntaxShape::Block, + b"cond" => SyntaxShape::RowCondition, + b"operator" => SyntaxShape::Operator, + b"math" => SyntaxShape::MathExpression, + b"variable" => SyntaxShape::Variable, + b"signature" => SyntaxShape::Signature, + b"expr" => SyntaxShape::Expression, + _ => SyntaxShape::Any, + } + } + pub fn parse_type(&self, bytes: &[u8]) -> Type { if bytes == b"int" { Type::Int @@ -887,6 +920,140 @@ impl ParserWorkingSet { self.parse_math_expression(spans) } + pub fn parse_signature(&mut self, span: Span) -> (Expression, Option) { + enum ParseMode { + ArgMode, + TypeMode, + } + + enum Arg { + Positional(PositionalArg), + Flag(Flag), + } + + println!("parse signature"); + let bytes = self.get_span_contents(span); + + let mut error = None; + let mut start = span.start; + let mut end = span.end; + + if bytes.starts_with(b"[") { + start += 1; + } + if bytes.ends_with(b"]") { + end -= 1; + } else { + error = error.or_else(|| { + Some(ParseError::Unclosed( + "]".into(), + Span { + start: end, + end: end + 1, + }, + )) + }); + } + + let span = Span { start, end }; + let source = &self.file_contents[..span.end]; + + let (output, err) = lex( + &source, + span.start, + &crate::LexMode::Custom { + whitespace: vec![b'\n', b','], + special: vec![b':', b'?'], + }, + ); + error = error.or(err); + + let mut args: Vec = vec![]; + let mut parse_mode = ParseMode::ArgMode; + + for token in &output { + match token { + Token { + contents: crate::TokenContents::Item, + span, + } => { + let contents = &self.file_contents[span.start..span.end]; + + if contents == b":" { + match parse_mode { + ParseMode::ArgMode => { + parse_mode = ParseMode::TypeMode; + } + ParseMode::TypeMode => { + // We're seeing two types for the same thing for some reason, error + error = error.or(Some(ParseError::Mismatch("type".into(), *span))); + } + } + } else { + match parse_mode { + ParseMode::ArgMode => { + if contents.starts_with(b"--") { + // Long flag + args.push(Arg::Flag(Flag { + arg: None, + desc: String::new(), + long: String::from_utf8_lossy(contents).to_string(), + short: None, + required: true, + })); + } else { + // Positional arg + args.push(Arg::Positional(PositionalArg { + desc: String::new(), + name: String::from_utf8_lossy(contents).to_string(), + shape: SyntaxShape::Any, + })) + } + } + ParseMode::TypeMode => { + if let Some(last) = args.last_mut() { + let syntax_shape = self.parse_shape_name(contents); + //TODO check if we're replacing one already + match last { + Arg::Positional(PositionalArg { name, desc, shape }) => { + *shape = syntax_shape; + } + Arg::Flag(Flag { + long, + short, + arg, + required, + desc, + }) => *arg = Some(syntax_shape), + } + } + parse_mode = ParseMode::ArgMode; + } + } + } + } + _ => {} + } + } + + let mut sig = Signature::new(String::new()); + + for arg in args { + match arg { + Arg::Positional(positional) => sig.required_positional.push(positional), + Arg::Flag(flag) => sig.named.push(flag), + } + } + + ( + Expression { + expr: Expr::Signature(sig), + span, + }, + error, + ) + } + pub fn parse_list_expression( &mut self, span: Span, @@ -919,7 +1086,14 @@ impl ParserWorkingSet { let span = Span { start, end }; let source = &self.file_contents[..span.end]; - let (output, err) = lex(&source, span.start, crate::LexMode::CommaAndNewlineIsSpace); + let (output, err) = lex( + &source, + span.start, + &crate::LexMode::Custom { + whitespace: vec![b'\n', b','], + special: vec![], + }, + ); error = error.or(err); let (output, err) = lite_parse(&output); @@ -983,7 +1157,14 @@ impl ParserWorkingSet { let source = &self.file_contents[..end]; - let (output, err) = lex(&source, start, crate::LexMode::CommaAndNewlineIsSpace); + let (output, err) = lex( + &source, + start, + &crate::LexMode::Custom { + whitespace: vec![b'\n', b','], + special: vec![], + }, + ); error = error.or(err); let (output, err) = lite_parse(&output); @@ -1073,7 +1254,7 @@ impl ParserWorkingSet { let source = &self.file_contents[..end]; - let (output, err) = lex(&source, start, crate::LexMode::Normal); + let (output, err) = lex(&source, start, &crate::LexMode::Normal); error = error.or(err); let (output, err) = lite_parse(&output); @@ -1116,11 +1297,14 @@ impl ParserWorkingSet { return self.parse_full_column_path(span); } else if bytes.starts_with(b"[") { match shape { - SyntaxShape::Any | SyntaxShape::List(_) | SyntaxShape::Table => {} + SyntaxShape::Any + | SyntaxShape::List(_) + | SyntaxShape::Table + | SyntaxShape::Signature => {} _ => { return ( Expression::garbage(span), - Some(ParseError::Mismatch("non-table/non-list".into(), span)), + Some(ParseError::Mismatch("non-[] value".into(), span)), ); } } @@ -1179,6 +1363,16 @@ impl ParserWorkingSet { ) } } + SyntaxShape::Signature => { + if bytes.starts_with(b"[") { + self.parse_signature(span) + } else { + ( + Expression::garbage(span), + Some(ParseError::Mismatch("signature".into(), span)), + ) + } + } SyntaxShape::List(elem) => { if bytes.starts_with(b"[") { self.parse_list_expression(span, &elem) @@ -1419,22 +1613,20 @@ impl ParserWorkingSet { .remove(0) .as_string() .expect("internal error: expected def name"); - let args = call + let mut signature = call .positional .remove(0) - .as_list() - .expect("internal error: expected param list") - .into_iter() - .map(|x| x.as_var().expect("internal error: expected parameter")) - .collect::>(); + .as_signature() + .expect("internal error: expected param list"); let block_id = call .positional .remove(0) .as_block() .expect("internal error: expected block"); + signature.name = name; let decl = Declaration { - signature: Signature::new(name), + signature, body: Some(block_id), }; @@ -1526,7 +1718,7 @@ impl ParserWorkingSet { pub fn parse_file(&mut self, fname: &str, contents: Vec) -> (Block, Option) { let mut error = None; - let (output, err) = lex(&contents, 0, crate::LexMode::Normal); + let (output, err) = lex(&contents, 0, &crate::LexMode::Normal); error = error.or(err); self.add_file(fname.into(), contents); @@ -1545,7 +1737,7 @@ impl ParserWorkingSet { self.add_file("source".into(), source.into()); - let (output, err) = lex(source, 0, crate::LexMode::Normal); + let (output, err) = lex(source, 0, &crate::LexMode::Normal); error = error.or(err); let (output, err) = lite_parse(&output);