From fd2a0dffa9656d4f63f2074ffee2a4edfe3c60ec Mon Sep 17 00:00:00 2001
From: ridiculousfish
Date: Sun, 18 Feb 2018 14:37:44 -0800
Subject: [PATCH] Reflect newlines after pipes in fish grammar

The previous attempt to support newlines after pipes changed the lexer to
swallow newlines after encountering a pipe. This has two problems that are
difficult to fix:

1. Comments cannot be placed after the pipe.
2. fish_indent won't know about the newlines, so it will erase them.

Address these problems by removing the lexer behavior and replacing it with
a new parser symbol "optional_newlines", allowing the newlines to be
reflected directly in the fish grammar.
---
 CHANGELOG.md                   |  1 +
 src/fish_tests.cpp             | 12 +++++++++
 src/parse_constants.h          |  2 ++
 src/parse_execution.cpp        |  4 +--
 src/parse_grammar.h            |  9 ++++++-
 src/parse_grammar_elements.inc |  1 +
 src/parse_productions.cpp      | 45 ++++++++++------------------------
 src/parse_tree.cpp             | 15 +++++-------
 src/parse_tree.h               |  1 +
 src/tnode.cpp                  |  2 +-
 src/tokenizer.cpp              | 35 +++-----------------------
 src/tokenizer.h                |  3 ---
 12 files changed, 50 insertions(+), 80 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4a997ddaf..4c7adf602 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -38,6 +38,7 @@ This section is for changes merged to the `major` branch that are not also merge
 - Slicing $history (in particular, `$history[1]` for the last executed command) is much faster.
 - The pager will now show the full command instead of just its last line if the number of completions is large (#4702).
 - Tildes in file names are now properly escaped in completions (#2274)
+- A pipe at the end of a line now allows the job to continue on the next line (#1285)
 
 ## Other significant changes
 - Command substitution output is now limited to 10 MB by default (#3822).
diff --git a/src/fish_tests.cpp b/src/fish_tests.cpp
index 5252a2b3a..0d9ceb208 100644
--- a/src/fish_tests.cpp
+++ b/src/fish_tests.cpp
@@ -739,6 +739,18 @@ static void test_parser() {
         err(L"leading pipe not reported properly");
     }
 
+    if (parse_util_detect_errors(L"true | # comment") != PARSER_TEST_INCOMPLETE) {
+        err(L"comment after pipe not reported as incomplete");
+    }
+
+    if (parse_util_detect_errors(L"true | # comment \n false ")) {
+        err(L"comment and newline after pipe wrongly reported as error");
+    }
+
+    if (parse_util_detect_errors(L"true | ; false ") != PARSER_TEST_ERROR) {
+        err(L"semicolon after pipe not detected as error");
+    }
+
     if (detect_argument_errors(L"foo")) {
         err(L"simple argument reported as error");
     }
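
For illustration, the three new tests above correspond to the following fish
input (shown here for exposition; not part of the patch):

    true | # comment

is now reported as incomplete rather than as an error, since the parser is
waiting for a command on a following line;

    true | # comment
    false

parses successfully, because newlines (and comments) may now follow a pipe;
and

    true | ; false

remains a parse error, because only newline-style token ends may follow a
pipe, never semicolons.
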
diff --git a/src/parse_constants.h b/src/parse_constants.h
index ca6e739e4..b8e43fe0c 100644
--- a/src/parse_constants.h
+++ b/src/parse_constants.h
@@ -45,6 +45,7 @@ enum parse_token_type_t {
     symbol_argument,
     symbol_redirection,
     symbol_optional_background,
+    symbol_optional_newlines,
     symbol_end_command,
     // Terminal types.
     parse_token_type_string,
@@ -99,6 +100,7 @@ const enum_map<parse_token_type_t> token_enum_map[] = {
     {symbol_job, L"symbol_job"},
     {symbol_job_continuation, L"symbol_job_continuation"},
     {symbol_job_list, L"symbol_job_list"},
+    {symbol_optional_newlines, L"symbol_optional_newlines"},
     {symbol_optional_background, L"symbol_optional_background"},
     {symbol_plain_statement, L"symbol_plain_statement"},
     {symbol_redirection, L"symbol_redirection"},
diff --git a/src/parse_execution.cpp b/src/parse_execution.cpp
index e1ff43dc6..428e088b9 100644
--- a/src/parse_execution.cpp
+++ b/src/parse_execution.cpp
@@ -1056,7 +1056,7 @@ parse_execution_result_t parse_execution_context_t::populate_job_from_job_node(
         if (result != parse_execution_success) {
             break;
         }
-        tnode_t<g::statement> statement = job_cont.require_get_child<g::statement, 1>();
+        tnode_t<g::statement> statement = job_cont.require_get_child<g::statement, 2>();
 
         // Handle the pipe, whose fd may not be the obvious stdout.
         int pipe_write_fd = fd_redirected_by_pipe(get_source(pipe));
@@ -1071,7 +1071,7 @@ parse_execution_result_t parse_execution_context_t::populate_job_from_job_node(
         result = this->populate_job_process(j, processes.back().get(), statement);
 
         // Get the next continuation.
-        job_cont = job_cont.require_get_child<g::job_continuation, 2>();
+        job_cont = job_cont.require_get_child<g::job_continuation, 3>();
         assert(job_cont);
     }
 
diff --git a/src/parse_grammar.h b/src/parse_grammar.h
index c6ea29dee..0526ef055 100644
--- a/src/parse_grammar.h
+++ b/src/parse_grammar.h
@@ -210,7 +210,7 @@ DEF_ALT(job_list) {
 DEF(job) produces_sequence<statement, job_continuation, optional_background>{BODY(job)};
 
 DEF_ALT(job_continuation) {
-    using piped = seq<tok_pipe, statement, job_continuation>;
+    using piped = seq<tok_pipe, optional_newlines, statement, job_continuation>;
     using empty = grammar::empty;
     ALT_BODY(job_continuation, piped, empty);
 };
@@ -343,6 +343,13 @@ DEF_ALT(optional_background) {
 
 DEF(end_command) produces_single<keyword<parse_keyword_end>>{BODY(end_command)};
 
+// Note optional_newlines only allows newline-style tok_end, not semicolons.
+DEF_ALT(optional_newlines) {
+    using empty = grammar::empty;
+    using newlines = seq<tok_end, optional_newlines>;
+    ALT_BODY(optional_newlines, empty, newlines);
+};
+
 // A freestanding_argument_list is equivalent to a normal argument list, except it may contain
 // TOK_END (newlines, and even semicolons, for historical reasons)
 DEF_ALT(freestanding_argument_list) {
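
In BNF-style notation (an informal paraphrase of the C++ template encoding
above), the changed productions read:

    job_continuation  ::= <pipe> optional_newlines statement job_continuation
                        | <empty>
    optional_newlines ::= <empty>
                        | <newline> optional_newlines

Because optional_newlines becomes child 1 of the piped production, statement
moves from child index 1 to 2 and the trailing job_continuation from 2 to 3;
the require_get_child index changes in parse_execution.cpp above track
exactly that shift.
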
diff --git a/src/parse_grammar_elements.inc b/src/parse_grammar_elements.inc
index 4142db65f..ff087eeb7 100644
--- a/src/parse_grammar_elements.inc
+++ b/src/parse_grammar_elements.inc
@@ -25,6 +25,7 @@ ELEM(arguments_or_redirections_list)
 ELEM(argument)
 ELEM(redirection)
 ELEM(optional_background)
+ELEM(optional_newlines)
 ELEM(end_command)
 ELEM(freestanding_argument_list)
 #undef ELEM
diff --git a/src/parse_productions.cpp b/src/parse_productions.cpp
index 28d371d51..e97b27208 100644
--- a/src/parse_productions.cpp
+++ b/src/parse_productions.cpp
@@ -309,6 +309,12 @@ RESOLVE(arguments_or_redirections_list) {
     }
 }
 
+RESOLVE(optional_newlines) {
+    UNUSED(token2);
+    if (token1.is_newline) return production_for<newlines>();
+    return production_for<empty>();
+}
+
 RESOLVE(optional_background) {
     UNUSED(token2);
 
@@ -324,10 +330,6 @@ RESOLVE(optional_background) {
     }
 }
 
-#define TEST(SYM) \
-    case (symbol_##SYM): \
-        resolver = SYM::resolve; \
-        break;
 
 const production_element_t *parse_productions::production_for_token(parse_token_type_t node_type,
                                                                     const parse_token_t &input1,
@@ -342,35 +344,14 @@ const production_element_t *parse_productions::production_for_token(parse_token_
                                             parse_node_tag_t *out_tag) = //!OCLINT(unused param)
         NULL;
     switch (node_type) {
-        TEST(job_list)
-        TEST(job)
-        TEST(statement)
-        TEST(job_continuation)
-        TEST(boolean_statement)
-        TEST(block_statement)
-        TEST(if_statement)
-        TEST(if_clause)
-        TEST(else_clause)
-        TEST(else_continuation)
-        TEST(switch_statement)
-        TEST(decorated_statement)
-        TEST(case_item_list)
-        TEST(case_item)
-        TEST(argument_list)
-        TEST(freestanding_argument_list)
-        TEST(block_header)
-        TEST(for_header)
-        TEST(while_header)
-        TEST(begin_header)
-        TEST(function_header)
-        TEST(plain_statement)
-        TEST(andor_job_list)
-        TEST(arguments_or_redirections_list)
-        TEST(argument)
-        TEST(redirection)
-        TEST(optional_background)
-        TEST(end_command)
+// Handle all of our grammar elements
+#define ELEM(SYM) \
+    case (symbol_##SYM): \
+        resolver = SYM::resolve; \
+        break;
+#include "parse_grammar_elements.inc"
+
         // Everything else is an error.
         case parse_token_type_string:
         case parse_token_type_pipe:
         case parse_token_type_redirection:
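
production_for_token() now derives its switch cases by expanding
parse_grammar_elements.inc, so a grammar symbol added to the .inc file
automatically gains a resolver case, and the hand-maintained TEST list can no
longer fall out of sync. For readers unfamiliar with this X-macro pattern,
here is a minimal self-contained sketch of the same technique (hypothetical
color_t example, not fish code):

    #include <cstdio>

    // One central list of elements. Each expansion site defines ELEM,
    // expands the list, then undefines it, just as the includes of
    // parse_grammar_elements.inc do above.
    #define COLOR_LIST \
        ELEM(red)      \
        ELEM(green)    \
        ELEM(blue)

    // Expansion 1: build an enum from the list.
    #define ELEM(NAME) color_##NAME,
    enum color_t { COLOR_LIST };
    #undef ELEM

    // Expansion 2: build a switch over the same list.
    static const char *color_name(color_t c) {
    #define ELEM(NAME) \
        case color_##NAME: \
            return #NAME;
        switch (c) { COLOR_LIST }
    #undef ELEM
        return "unknown";
    }

    int main() {
        std::printf("%s\n", color_name(color_green));  // prints "green"
        return 0;
    }
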
diff --git a/src/parse_tree.cpp b/src/parse_tree.cpp
index b06bb54c2..ec5d31d4e 100644
--- a/src/parse_tree.cpp
+++ b/src/parse_tree.cpp
@@ -256,9 +256,6 @@ static inline parse_token_type_t parse_token_type_from_tokenizer_token(
     return result;
 }
 
-#if 1
-// Disabled for the 2.2.0 release: https://github.com/fish-shell/fish-shell/issues/1809.
-
 /// Helper function for parse_dump_tree().
 static void dump_tree_recursive(const parse_node_tree_t &nodes, const wcstring &src,
                                 node_offset_t node_idx, size_t indent, wcstring *result,
@@ -284,7 +281,6 @@ static void dump_tree_recursive(const parse_node_tree_t &nodes, const wcstring &
 
     append_format(*result, L"%2lu - %l2u  ", *line, node_idx);
     result->append(indent * spacesPerIndent, L' ');
-    ;
     result->append(node.describe());
     if (node.child_count > 0) {
         append_format(*result, L" <%lu children>", node.child_count);
@@ -335,7 +331,6 @@ wcstring parse_dump_tree(const parse_node_tree_t &nodes, const wcstring &src) {
     }
     return result;
 }
-#endif
 
 /// Struct representing elements of the symbol stack, used in the internal state of the LL parser.
 struct parse_stack_element_t {
@@ -1046,12 +1041,12 @@ static parse_keyword_t keyword_for_token(token_type tok, const wcstring &token)
 }
 
 /// Placeholder invalid token.
-static const parse_token_t kInvalidToken = {
-    token_type_invalid, parse_keyword_none, false, false, SOURCE_OFFSET_INVALID, 0};
+static constexpr parse_token_t kInvalidToken = {
+    token_type_invalid, parse_keyword_none, false, false, false, SOURCE_OFFSET_INVALID, 0};
 
 /// Terminal token.
-static const parse_token_t kTerminalToken = {
-    parse_token_type_terminate, parse_keyword_none, false, false, SOURCE_OFFSET_INVALID, 0};
+static constexpr parse_token_t kTerminalToken = {
+    parse_token_type_terminate, parse_keyword_none, false, false, false, SOURCE_OFFSET_INVALID, 0};
 
 static inline bool is_help_argument(const wcstring &txt) {
     return txt == L"-h" || txt == L"--help";
@@ -1074,6 +1069,7 @@ static inline parse_token_t next_parse_token(tokenizer_t *tok, tok_t *token) {
     result.keyword = keyword_for_token(token->type, token->text);
     result.has_dash_prefix = !token->text.empty() && token->text.at(0) == L'-';
     result.is_help_argument = result.has_dash_prefix && is_help_argument(token->text);
+    result.is_newline = (result.type == parse_token_type_end && token->text == L"\n");
 
     // These assertions are totally bogus. Basically our tokenizer works in size_t but we work in
     // uint32_t to save some space. If we have a source file larger than 4 GB, we'll probably just
@@ -1157,6 +1153,7 @@ bool parse_tree_from_string(const wcstring &str, parse_tree_flags_t parse_flags,
                                   parse_keyword_none,
                                   false,
                                   false,
+                                  false,
                                   queue[error_token_idx].source_start,
                                   queue[error_token_idx].source_length};
         parser.accept_tokens(token, kInvalidToken);
diff --git a/src/parse_tree.h b/src/parse_tree.h
index 7c5a1028b..ac46d6500 100644
--- a/src/parse_tree.h
+++ b/src/parse_tree.h
@@ -31,6 +31,7 @@ struct parse_token_t {
     enum parse_keyword_t keyword;  // Any keyword represented by this token
     bool has_dash_prefix;          // Hackish: whether the source contains a dash prefix
     bool is_help_argument;         // Hackish: whether the source looks like '-h' or '--help'
+    bool is_newline;               // Hackish: if TOK_END, whether the source is a newline.
     source_offset_t source_start;
     source_offset_t source_length;
 
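
The is_newline flag added here is what RESOLVE(optional_newlines) consults,
and it is only computable because the tokenizer change below starts recording
the text of TOK_END tokens. A minimal sketch of the classification
(illustrative helper, not fish code):

    #include <string>

    // A TOK_END token now carries its source character: "\n", "\r", or ";".
    // Only a literal newline counts as newline-style, which is why
    // "true | ; false" still fails to parse while a newline after the
    // pipe succeeds.
    static bool end_is_newline(const std::wstring &end_token_text) {
        return end_token_text == L"\n";
    }
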
diff --git a/src/tnode.cpp b/src/tnode.cpp
index 419d985cc..3df585d3b 100644
--- a/src/tnode.cpp
+++ b/src/tnode.cpp
@@ -121,7 +121,7 @@ bool statement_is_in_pipeline(tnode_t<grammar::statement> st, bool include_first
     // has a non-empty continuation.
     if (include_first) {
         tnode_t<job_continuation> jc = st.try_get_parent<job>().child<1>();
-        if (jc.try_get_child<statement, 1>()) {
+        if (jc.try_get_child<statement, 2>()) {
             return true;
         }
     }
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index bdbd4d2a8..dccbd8191 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -102,16 +102,7 @@ bool tokenizer_t::next(struct tok_t *result) {
     }
     assert(this->buff >= this->orig_buff);
 
-    if (this->last_type == TOK_PIPE) {
-        // Ignore subsequent whitespaces or a newline after a pipe (#1285).
-        int pipe_pos = current_pos - 1;
-        while (this->orig_buff[pipe_pos] != L'|') {
-            pipe_pos--;
-        }
-        result->length = pipe_pos - this->last_pos + 1;
-    } else {
-        result->length = current_pos >= this->last_pos ? current_pos - this->last_pos : 0;
-    }
+    result->length = current_pos >= this->last_pos ? current_pos - this->last_pos : 0;
 
     this->tok_next();
     return true;
@@ -538,12 +529,14 @@ void tokenizer_t::tok_next() {
             this->last_type = TOK_END;
             // fwprintf( stderr, L"End of string\n" );
             this->has_next = false;
+            this->last_token.clear();
             break;
         }
         case L'\r':  // carriage-return
         case L'\n':  // newline
         case L';': {
             this->last_type = TOK_END;
+            this->last_token.assign(1, *this->buff);
             this->buff++;
             // Hack: when we get a newline, swallow as many as we can. This compresses multiple
             // subsequent newlines into a single one.
@@ -553,7 +546,6 @@ void tokenizer_t::tok_next() {
                     this->buff++;
                 }
             }
-            this->last_token.clear();
             break;
         }
         case L'&': {
@@ -565,7 +557,6 @@ void tokenizer_t::tok_next() {
             this->last_token = L"1";
             this->last_type = TOK_PIPE;
             this->buff++;
-            skip_newline_after_pipe();
             break;
         }
         case L'>':
@@ -580,9 +571,6 @@ void tokenizer_t::tok_next() {
                 TOK_CALL_ERROR(this, TOK_OTHER, REDIRECT_ERROR, this->buff);
             } else {
                 this->buff += consumed;
-                if (mode == TOK_PIPE) {
-                    skip_newline_after_pipe();
-                }
                 this->last_type = mode;
                 this->last_token = to_string(fd);
             }
@@ -606,9 +594,6 @@ void tokenizer_t::tok_next() {
                 TOK_CALL_ERROR(this, TOK_OTHER, PIPE_ERROR, error_location);
             } else {
                 this->buff += consumed;
-                if (mode == TOK_PIPE) {
-                    skip_newline_after_pipe();
-                }
                 this->last_type = mode;
                 this->last_token = to_string(fd);
             }
@@ -621,20 +606,6 @@ void tokenizer_t::tok_next() {
         }
     }
 }
 
-/// If the line ends with pipe, continue to the next line (#1285).
-void tokenizer_t::skip_newline_after_pipe() {
-    while (1) {
-        if (this->buff[0] == L'\n') {
-            this->buff++;
-            break;
-        } else if (my_iswspace(this->buff[0])) {
-            this->buff++;
-        } else {
-            break;
-        }
-    }
-}
-
 wcstring tok_first(const wcstring &str) {
     wcstring result;
     tokenizer_t t(str.c_str(), TOK_SQUASH_ERRORS);
diff --git a/src/tokenizer.h b/src/tokenizer.h
index aaada6e7e..c87a696b2 100644
--- a/src/tokenizer.h
+++ b/src/tokenizer.h
@@ -107,9 +107,6 @@ class tokenizer_t {
     void read_comment();
     void tok_next();
 
-    /// Skip whitespaces and a newline after a pipe at the end of the line.
-    void skip_newline_after_pipe();
-
    public:
     /// Constructor for a tokenizer. b is the string that is to be tokenized. It is not copied, and
     /// should not be freed by the caller until after the tokenizer is destroyed.
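
With these pieces in place, input such as "true |" followed by "false" on the
next line yields a parse tree in which the newline is an ordinary node rather
than something the lexer silently discarded; that is what lets fish_indent
preserve the user's layout. An informal sketch of the resulting tree, per the
productions above:

    job
    +- statement                "true"
    +- job_continuation (piped)
    |  +- <pipe>                "|"
    |  +- optional_newlines
    |  |  +- <newline>          "\n"
    |  |  +- optional_newlines  (empty)
    |  +- statement             "false"
    |  +- job_continuation      (empty)
    +- optional_background      (empty)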