From f30bf4030090034fd4a2cfc917d576bae098aca4 Mon Sep 17 00:00:00 2001
From: ridiculousfish <corydoras@ridiculousfish.com>
Date: Mon, 19 Feb 2018 15:47:02 -0800
Subject: [PATCH] Clean up comment parsing in tokenizer

Unify the show_comments and non-show_comments paths in tok_next(),
folding the former read_comment() helper into it.
---
 src/tokenizer.cpp | 51 +++++++++++++++++++++++++++--------------------
 src/tokenizer.h   |  1 -
 2 files changed, 29 insertions(+), 23 deletions(-)

diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index c777e11ca..2de682a16 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -322,16 +322,6 @@ void tokenizer_t::read_string() {
     this->last_type = TOK_STRING;
 }
 
-/// Read the next token as a comment.
-void tokenizer_t::read_comment() {
-    const wchar_t *start = this->buff;
-    while (*(this->buff) != L'\n' && *(this->buff) != L'\0') this->buff++;
-
-    size_t len = this->buff - start;
-    this->last_token.assign(start, len);
-    this->last_type = TOK_COMMENT;
-}
-
 /// Reads a redirection or an "fd pipe" (like 2>|) from a string. Returns how many characters were
 /// consumed. If zero, then this string was not a redirection. Also returns by reference the
 /// redirection mode, and the fd to redirection. If there is overflow, *out_fd is set to -1.
@@ -465,18 +455,30 @@ int oflags_for_redirection_type(enum token_type type) {
 
 /// Test if a character is whitespace. Differs from iswspace in that it does not consider a newline
 /// to be whitespace.
-static bool my_iswspace(wchar_t c) { return c != L'\n' && iswspace(c); }
+static bool iswspace_not_nl(wchar_t c) {
+    switch (c) {
+        case L' ':
+        case L'\t':
+        case L'\r':
+            return true;  // space, tab and CR are answered directly, without calling iswspace()
+        case L'\n':
+            return false;  // newline is deliberately never reported as whitespace
+        default:
+            return iswspace(c);  // every other character defers to the C library's iswspace()
+    }
+}
 
 bool tokenizer_t::tok_next() {
     if (!this->has_next) {
         return false;
     }
 
-    while (1) {
+    // Consume non-newline whitespace. If we get an escaped newline, mark it and continue past it.
+    for (;;) {
         if (this->buff[0] == L'\\' && this->buff[1] == L'\n') {
             this->buff += 2;
             this->continue_line_after_comment = true;
-        } else if (my_iswspace(this->buff[0])) {
+        } else if (iswspace_not_nl(this->buff[0])) {
             this->buff++;
         } else {
             break;
@@ -484,21 +486,26 @@ bool tokenizer_t::tok_next() {
     }
 
     while (*this->buff == L'#') {
-        if (this->show_comments) {
-            this->last_pos = this->buff - this->start;
-            this->read_comment();
+        // We have a comment, walk over the comment.
+        const wchar_t *comment_start = this->buff;
+        while (this->buff[0] != L'\n' && this->buff[0] != L'\0') this->buff++;
+        size_t comment_len = this->buff - comment_start;
 
-            if (this->buff[0] == L'\n' && this->continue_line_after_comment) this->buff++;
+        // If we are going to continue after the comment, skip any trailing newline.
+        if (this->buff[0] == L'\n' && this->continue_line_after_comment) this->buff++;
+
+        // Maybe return the comment.
+        if (this->show_comments) {
+            this->last_pos = comment_start - this->start;
+            this->last_token.assign(comment_start, comment_len);
+            this->last_type = TOK_COMMENT;
             return true;
         }
-
-        while (*this->buff != L'\n' && *this->buff != L'\0') this->buff++;
-        if (this->buff[0] == L'\n' && this->continue_line_after_comment) this->buff++;
-        while (my_iswspace(*this->buff)) this->buff++;
+        while (iswspace_not_nl(this->buff[0])) this->buff++;
     }
 
+    // We made it past the comments and ate any trailing newlines we wanted to ignore.
     this->continue_line_after_comment = false;
-
     this->last_pos = this->buff - this->start;
 
     switch (*this->buff) {
diff --git a/src/tokenizer.h b/src/tokenizer.h
index f870aaff9..9d755f7e5 100644
--- a/src/tokenizer.h
+++ b/src/tokenizer.h
@@ -104,7 +104,6 @@ class tokenizer_t {
     void call_error(enum tokenizer_error error_type, const wchar_t *where,
                     const wchar_t *error_message);
     void read_string();
-    void read_comment();
     bool tok_next();
 
    public: