Merge branch 'string_split0'

This merges support for `string split0` and `string join0`, easing working with nul-separated output. Fixes #3164
2024-11-15 17:28:19 +00:00 · 2018-07-01 16:35:30 -07:00 · 2018-07-01 16:35:30 -07:00 · 84b7c2b152
commit 84b7c2b152
parent a6031c42bf 62d73bee5e
12 changed files with 364 additions and 194 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -59,6 +59,7 @@ This section is for changes merged to the `major` branch that are not also merge
 - The universal variables file no longer contains the MAC address. It is now at the fixed location `.config/fish/fish_universal_variables` (#1912).
 - `alias` now has a `-s` and `--save` option to save the function generated by the alias using `funcsave` (#4878).
 - Path completions now support expansions, meaning expressions like `python ~/<TAB>` now provides file suggestions just like any other relative or absolute path. (This includes support for other expansions, too.)
+- The `string` builtin has new commands `split0` and `join0` for working with NUL-delimited output.

 ## Other significant changes
 - Command substitution output is now limited to 10 MB by default (#3822).
--- a/doc_src/string.txt
+++ b/doc_src/string.txt
@ -4,6 +4,7 @@
 \fish{synopsis}
 string escape [(-n | --no-quoted)] [--style=xxx] [STRING...]
 string join [(-q | --quiet)] SEP [STRING...]
+string join0 [(-q | --quiet)] [STRING...]
 string length [(-q | --quiet)] [STRING...]
 string lower [(-q | --quiet)] [STRING...]
 string match [(-a | --all)] [(-e | --entire)] [(-i | --ignore-case)] [(-r | --regex)]
@ -14,6 +15,8 @@ string replace [(-a | --all)] [(-f | --filter)] [(-i | --ignore-case)] [(-r | --
               [(-q | --quiet)] PATTERN REPLACEMENT [STRING...]
 string split [(-m | --max) MAX] [(-n | --no-empty)] [(-q | --quiet)] [(-r | --right)] SEP
             [STRING...]
+string split0 [(-m | --max) MAX] [(-n | --no-empty)] [(-q | --quiet)] [(-r | --right)]
+              [STRING...]
 string sub [(-s | --start) START] [(-l | --length) LENGTH] [(-q | --quiet)]
           [STRING...]
 string trim [(-l | --left)] [(-r | --right)] [(-c | --chars CHARS)]
@ -49,6 +52,10 @@ The third is `--style=url` which ensures the string can be used as a URL by hex

 `string join` joins its STRING arguments into a single string separated by SEP, which can be an empty string. Exit status: 0 if at least one join was performed, or 1 otherwise.

+\subsection string-join0 "join0" subcommand
+
+`string join0` joins its STRING arguments into a single string separated by the zero byte (NUL), and adds a trailing NUL. This is most useful in conjunction with tools that accept NUL-delimited input, such as `sort -z`. Exit status: 0 if at least one join was performed, or 1 otherwise.
+
 \subsection string-length "length" subcommand

 `string length` reports the length of each string argument in characters. Exit status: 0 if at least one non-empty STRING was given, or 1 otherwise.
@ -93,6 +100,12 @@ Exit status: 0 if at least one replacement was performed, or 1 otherwise.

 See also `read --delimiter`.

+\subsection string-split0 "split0" subcommand
+
+`string split0` splits each STRING on the zero byte (NUL). Options are the same as `string split` except that no separator is given.
+
+`split0` has the important property that its output is not further split when used in a command substitution, allowing for the command substitution to produce elements containing newlines. This is most useful when used with Unix tools that produce zero bytes, such as `find -print0` or `sort -z`. See split0 examples below.
+
 \subsection string-sub "sub" subcommand

 `string sub` prints a substring of each string argument. The start of the substring can be specified with `-s` or `--start` followed by a 1-based index value. Positive index values are relative to the start of the string and negative index values are relative to the end of the string. The default start value is 1. The length of the substring can be specified with `-l` or `--length`. If the length is not specified, the substring continues to the end of each STRING. Exit status: 0 if at least one substring operation was performed, 1 otherwise.
@ -240,6 +253,20 @@ foo2
 <outp>0xBadC0de</outp>
 \endfish

+\subsection string-example-split0 NUL Delimited Examples
+
+\fish{cli-dark}
+>_ # Count files in a directory, without being confused by newlines.
+>_ count (find . -print0 | string split0)
+<outp>42</outp>
+
+>_ # Sort a list of elements which may contain newlines
+>_ set foo beta alpha\ngamma
+>_ set foo (string join0 $foo | sort -z | string split0)
+>_ string escape $foo[1]
+<outp>alpha\ngamma</outp>
+\endfish
+
 \subsection string-example-replace-literal Replace Literal Examples

 \fish{cli-dark}
--- a/src/builtin_string.cpp
+++ b/src/builtin_string.cpp
@ -75,25 +75,29 @@ class arg_iterator_t {
    int argidx_;
    // If not using argv, a string to store bytes that have been read but not yet returned.
    std::string buffer_;
+    // If set, when reading from a stream, split on zeros instead of newlines.
+    const bool split0_;
    // Backing storage for the next() string.
    wcstring storage_;
    const io_streams_t &streams_;

-    /// \return the next argument from stdin
-    const wchar_t *get_arg_stdin() {
+    /// Reads the next argument from stdin, returning true if an argument was produced and false if
+    /// not. On true, the string is stored in storage_.
+    bool get_arg_stdin() {
        assert(string_args_from_stdin(streams_) && "should not be reading from stdin");
-        // Read in chunks from fd until buffer has a line.
+        // Read in chunks from fd until buffer has a line (or zero if split0_ is set).
+        const char sep = split0_ ? '\0' : '\n';
        size_t pos;
-        while ((pos = buffer_.find('\n')) == std::string::npos) {
+        while ((pos = buffer_.find(sep)) == std::string::npos) {
            char buf[STRING_CHUNK_SIZE];
            long n = read_blocked(streams_.stdin_fd, buf, STRING_CHUNK_SIZE);
            if (n == 0) {
                // If we still have buffer contents, flush them,
-                // in case there was no trailing '\n'.
-                if (buffer_.empty()) return NULL;
+                // in case there was no trailing sep.
+                if (buffer_.empty()) return false;
                storage_ = str2wcstring(buffer_);
                buffer_.clear();
-                return storage_.c_str();
+                return true;
            }
            if (n == -1) {
                // Some error happened. We can't do anything about it,
@ -101,20 +105,21 @@ class arg_iterator_t {
                // (read_blocked already retries for EAGAIN and EINTR)
                storage_ = str2wcstring(buffer_);
                buffer_.clear();
-                return NULL;
+                return false;
            }
            buffer_.append(buf, n);
        }

-        // Split the buffer on the '\n' and return the first part.
+        // Split the buffer on the sep and return the first part.
        storage_ = str2wcstring(buffer_, pos);
        buffer_.erase(0, pos + 1);
-        return storage_.c_str();
+        return true;
    }

   public:
-    arg_iterator_t(const wchar_t *const *argv, int argidx, const io_streams_t &streams)
-        : argv_(argv), argidx_(argidx), streams_(streams) {}
+    arg_iterator_t(const wchar_t *const *argv, int argidx, const io_streams_t &streams,
+                   bool split0 = false)
+        : argv_(argv), argidx_(argidx), split0_(split0), streams_(streams) {}

    const wcstring *nextstr() {
        if (string_args_from_stdin(streams_)) {
@ -537,14 +542,15 @@ static int string_unescape(parser_t &parser, io_streams_t &streams, int argc, wc
    DIE("should never reach this statement");
 }

-static int string_join(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv) {
+static int string_join_maybe0(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv,
+                              bool is_join0) {
    options_t opts;
    opts.quiet_valid = true;
    int optind;
-    int retval = parse_opts(&opts, &optind, 1, argc, argv, parser, streams);
+    int retval = parse_opts(&opts, &optind, is_join0 ? 0 : 1, argc, argv, parser, streams);
    if (retval != STATUS_CMD_OK) return retval;

-    const wchar_t *sep = opts.arg1;
+    const wcstring sep = is_join0 ? wcstring(1, L'\0') : wcstring(opts.arg1);
    int nargs = 0;
    arg_iterator_t aiter(argv, optind, streams);
    while (const wcstring *arg = aiter.nextstr()) {
@ -557,12 +563,20 @@ static int string_join(parser_t &parser, io_streams_t &streams, int argc, wchar_
        nargs++;
    }
    if (nargs > 0 && !opts.quiet) {
-        streams.out.push_back(L'\n');
+        streams.out.push_back(is_join0 ? L'\0' : L'\n');
    }

    return nargs > 1 ? STATUS_CMD_OK : STATUS_CMD_ERROR;
 }

+static int string_join(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv) {
+    return string_join_maybe0(parser, streams, argc, argv, false /* is_join0 */);
+}
+
+static int string_join0(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv) {
+    return string_join_maybe0(parser, streams, argc, argv, true /* is_join0 */);
+}
+
 static int string_length(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv) {
    options_t opts;
    opts.quiet_valid = true;
@ -1037,7 +1051,8 @@ static int string_replace(parser_t &parser, io_streams_t &streams, int argc, wch
    return replacer->replace_count() > 0 ? STATUS_CMD_OK : STATUS_CMD_ERROR;
 }

-static int string_split(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv) {
+static int string_split_maybe0(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv,
+                               bool is_split0) {
    options_t opts;
    opts.quiet_valid = true;
    opts.right_valid = true;
@ -1045,14 +1060,14 @@ static int string_split(parser_t &parser, io_streams_t &streams, int argc, wchar
    opts.max = LONG_MAX;
    opts.no_empty_valid = true;
    int optind;
-    int retval = parse_opts(&opts, &optind, 1, argc, argv, parser, streams);
+    int retval = parse_opts(&opts, &optind, is_split0 ? 0 : 1, argc, argv, parser, streams);
    if (retval != STATUS_CMD_OK) return retval;

-    const wcstring sep(opts.arg1);
+    const wcstring sep = is_split0 ? wcstring(1, L'\0') : wcstring(opts.arg1);

    wcstring_list_t splits;
    size_t arg_count = 0;
-    arg_iterator_t aiter(argv, optind, streams);
+    arg_iterator_t aiter(argv, optind, streams, is_split0);
    while (const wcstring *arg = aiter.nextstr()) {
        if (opts.right) {
            split_about(arg->rbegin(), arg->rend(), sep.rbegin(), sep.rend(), &splits, opts.max, opts.no_empty);
@ -1070,15 +1085,24 @@ static int string_split(parser_t &parser, io_streams_t &streams, int argc, wchar
        std::reverse(splits.begin(), splits.end());
    }

+    const size_t split_count = splits.size();
    if (!opts.quiet) {
-        for (wcstring_list_t::const_iterator si = splits.begin(); si != splits.end(); ++si) {
-            streams.out.append(*si);
-            streams.out.append(L'\n');
+        auto &buff = streams.out.buffer();
+        for (const wcstring &split : splits) {
+            buff.append(split, separation_type_t::explicitly);
        }
    }

    // We split something if we have more split values than args.
-    return splits.size() > arg_count ? STATUS_CMD_OK : STATUS_CMD_ERROR;
+    return split_count > arg_count ? STATUS_CMD_OK : STATUS_CMD_ERROR;
+}
+
+static int string_split(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv) {
+    return string_split_maybe0(parser, streams, argc, argv, false /* is_split0 */);
+}
+
+static int string_split0(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv) {
+    return string_split_maybe0(parser, streams, argc, argv, true /* is_split0 */);
 }

 // Helper function to abstract the repeat logic from string_repeat
@ -1256,19 +1280,12 @@ static const struct string_subcommand {
                   wchar_t **argv);                       //!OCLINT(unused param)
 }

-string_subcommands[] = {{L"escape", &string_escape},
-                        {L"join", &string_join},
-                        {L"length", &string_length},
-                        {L"match", &string_match},
-                        {L"replace", &string_replace},
-                        {L"split", &string_split},
-                        {L"sub", &string_sub},
-                        {L"trim", &string_trim},
-                        {L"lower", &string_lower},
-                        {L"upper", &string_upper},
-                        {L"repeat", &string_repeat},
-                        {L"unescape", &string_unescape},
-                        {NULL, NULL}};
+string_subcommands[] = {
+    {L"escape", &string_escape}, {L"join", &string_join},         {L"join0", &string_join0},
+    {L"length", &string_length}, {L"match", &string_match},       {L"replace", &string_replace},
+    {L"split", &string_split},   {L"split0", &string_split0},     {L"sub", &string_sub},
+    {L"trim", &string_trim},     {L"lower", &string_lower},       {L"upper", &string_upper},
+    {L"repeat", &string_repeat}, {L"unescape", &string_unescape}, {NULL, NULL}};

 /// The string builtin, for manipulating strings.
 int builtin_string(parser_t &parser, io_streams_t &streams, wchar_t **argv) {
--- a/src/exec.cpp
+++ b/src/exec.cpp
@ -540,7 +540,7 @@ void exec_job(parser_t &parser, job_t *j) {
        if ((io->io_mode == IO_BUFFER)) {
            io_buffer_t *io_buffer = static_cast<io_buffer_t *>(io.get());
            assert(!io_buffer->is_input);
-            stdout_read_limit = io_buffer->get_buffer_limit();
+            stdout_read_limit = io_buffer->buffer().limit();
        }
    }

@ -891,8 +891,10 @@ void exec_job(parser_t &parser, job_t *j) {

                block_output_io_buffer->read();

-                const char *buffer = block_output_io_buffer->out_buffer_ptr();
-                size_t count = block_output_io_buffer->out_buffer_size();
+                const std::string buffer_contents =
+                    block_output_io_buffer->buffer().newline_serialized();
+                const char *buffer = buffer_contents.data();
+                size_t count = buffer_contents.size();
                if (count > 0) {
                    // We don't have to drain threads here because our child process is simple.
                    const char *fork_reason = p->type == INTERNAL_BLOCK_NODE ? "internal block io" : "internal function io";
@ -925,8 +927,8 @@ void exec_job(parser_t &parser, job_t *j) {
                    process_net_io_chain.get_io_for_fd(STDERR_FILENO);

                assert(builtin_io_streams.get() != NULL);
-                const wcstring &stdout_buffer = builtin_io_streams->out.buffer();
-                const wcstring &stderr_buffer = builtin_io_streams->err.buffer();
+                const output_stream_t &stdout_stream = builtin_io_streams->out;
+                const output_stream_t &stderr_stream = builtin_io_streams->err;

                // If we are outputting to a file, we have to actually do it, even if we have no
                // output, so that we can truncate the file. Does not apply to /dev/null.
@ -936,9 +938,9 @@ void exec_job(parser_t &parser, job_t *j) {
                    // We are handling reads directly in the main loop. Note that we may still end
                    // up forking.
                    const bool stdout_is_to_buffer = stdout_io && stdout_io->io_mode == IO_BUFFER;
-                    const bool no_stdout_output = stdout_buffer.empty();
-                    const bool no_stderr_output = stderr_buffer.empty();
-                    const bool stdout_discarded = builtin_io_streams->out.output_discarded();
+                    const bool no_stdout_output = stdout_stream.empty();
+                    const bool no_stderr_output = stderr_stream.empty();
+                    const bool stdout_discarded = stdout_stream.buffer().discarded();

                    if (!stdout_discarded && no_stdout_output && no_stderr_output) {
                        // The builtin produced no output and is not inside of a pipeline. No
@ -950,23 +952,24 @@ void exec_job(parser_t &parser, job_t *j) {
                        // The builtin produced no stderr, and its stdout is going to an
                        // internal buffer. There is no need to fork. This helps out the
                        // performance quite a bit in complex completion code.
+                        // TODO: we're sloppy about handling explicitly separated output.
+                        // Theoretically we could have explicitly separated output on stdout and
+                        // also stderr output; in that case we ought to thread the exp-sep output
+                        // through to the io buffer. We're getting away with this because the only
+                        // thing that can output exp-sep output is `string split0` which doesn't
+                        // also produce stderr.
                        debug(3, L"Skipping fork: buffered output for internal builtin '%ls'",
                              p->argv0());

                        io_buffer_t *io_buffer = static_cast<io_buffer_t *>(stdout_io.get());
-                        if (stdout_discarded) {
-                            io_buffer->set_discard();
-                        } else {
-                            const std::string res = wcs2string(builtin_io_streams->out.buffer());
-                            io_buffer->out_buffer_append(res.data(), res.size());
-                        }
+                        io_buffer->append_from_stream(stdout_stream);
                        fork_was_skipped = true;
                    } else if (stdout_io.get() == NULL && stderr_io.get() == NULL) {
                        // We are writing to normal stdout and stderr. Just do it - no need to fork.
                        debug(3, L"Skipping fork: ordinary output for internal builtin '%ls'",
                              p->argv0());
-                        const std::string outbuff = wcs2string(stdout_buffer);
-                        const std::string errbuff = wcs2string(stderr_buffer);
+                        const std::string outbuff = wcs2string(stdout_stream.contents());
+                        const std::string errbuff = wcs2string(stderr_stream.contents());
                        bool builtin_io_done = do_builtin_io(outbuff.data(), outbuff.size(),
                                                             errbuff.data(), errbuff.size());
                        if (!builtin_io_done && errno != EPIPE) {
@ -995,11 +998,11 @@ void exec_job(parser_t &parser, job_t *j) {
                    // in the child.
                    //
                    // These strings may contain embedded nulls, so don't treat them as C strings.
-                    const std::string outbuff_str = wcs2string(stdout_buffer);
+                    const std::string outbuff_str = wcs2string(stdout_stream.contents());
                    const char *outbuff = outbuff_str.data();
                    size_t outbuff_len = outbuff_str.size();

-                    const std::string errbuff_str = wcs2string(stderr_buffer);
+                    const std::string errbuff_str = wcs2string(stderr_stream.contents());
                    const char *errbuff = errbuff_str.data();
                    size_t errbuff_len = errbuff_str.size();

@ -1191,7 +1194,7 @@ static int exec_subshell_internal(const wcstring &cmd, wcstring_list_t *lst, boo
        io_buffer->read();
    }

-    if (io_buffer->output_discarded()) subcommand_status = STATUS_READ_TOO_MUCH;
+    if (io_buffer->buffer().discarded()) subcommand_status = STATUS_READ_TOO_MUCH;

    // If the caller asked us to preserve the exit status, restore the old status. Otherwise set the
    // status of the subcommand.
@ -1201,33 +1204,41 @@ static int exec_subshell_internal(const wcstring &cmd, wcstring_list_t *lst, boo
    if (lst == NULL || io_buffer.get() == NULL) {
        return subcommand_status;
    }
+    // Walk over all the elements.
+    for (const auto &elem : io_buffer->buffer().elements()) {
+        if (elem.is_explicitly_separated()) {
+            // Just append this one.
+            lst->push_back(str2wcstring(elem.contents));
+            continue;
+        }

-    const char *begin = io_buffer->out_buffer_ptr();
-    const char *end = begin + io_buffer->out_buffer_size();
-    if (split_output) {
-        const char *cursor = begin;
-        while (cursor < end) {
-            // Look for the next separator.
-            const char *stop = (const char *)memchr(cursor, '\n', end - cursor);
-            const bool hit_separator = (stop != NULL);
-            if (!hit_separator) {
-                // If it's not found, just use the end.
-                stop = end;
+        // Not explicitly separated. We have to split it explicitly.
+        assert(!elem.is_explicitly_separated() && "should not be explicitly separated");
+        const char *begin = elem.contents.data();
+        const char *end = begin + elem.contents.size();
+        if (split_output) {
+            const char *cursor = begin;
+            while (cursor < end) {
+                // Look for the next separator.
+                const char *stop = (const char *)memchr(cursor, '\n', end - cursor);
+                const bool hit_separator = (stop != NULL);
+                if (!hit_separator) {
+                    // If it's not found, just use the end.
+                    stop = end;
+                }
+                // Stop now points at the first character we do not want to copy.
+                lst->push_back(str2wcstring(cursor, stop - cursor));
+
+                // If we hit a separator, skip over it; otherwise we're at the end.
+                cursor = stop + (hit_separator ? 1 : 0);
            }
-            // Stop now points at the first character we do not want to copy.
-            const wcstring wc = str2wcstring(cursor, stop - cursor);
-            lst->push_back(wc);
-
-            // If we hit a separator, skip over it; otherwise we're at the end.
-            cursor = stop + (hit_separator ? 1 : 0);
+        } else {
+            // We're not splitting output, but we still want to trim off a trailing newline.
+            if (end != begin && end[-1] == '\n') {
+                --end;
+            }
+            lst->push_back(str2wcstring(begin, end - begin));
        }
-    } else {
-        // We're not splitting output, but we still want to trim off a trailing newline.
-        if (end != begin && end[-1] == '\n') {
-            --end;
-        }
-        const wcstring wc = str2wcstring(begin, end - begin);
-        lst->push_back(wc);
    }

    return subcommand_status;
--- a/src/fish_tests.cpp
+++ b/src/fish_tests.cpp
@ -929,9 +929,9 @@ static void test_1_cancellation(const wchar_t *src) {
    });
    parser_t::principal_parser().eval(src, io_chain, TOP);
    out_buff->read();
-    if (out_buff->out_buffer_size() != 0) {
+    if (out_buff->buffer().size() != 0) {
        err(L"Expected 0 bytes in out_buff, but instead found %lu bytes\n",
-            out_buff->out_buffer_size());
+            out_buff->buffer().size());
    }
    iothread_drain_all();
 }
@ -4107,10 +4107,10 @@ static void run_one_string_test(const wchar_t *const *argv, int expected_rc,
    if (rc != expected_rc) {
        err(L"Test failed on line %lu: [%ls]: expected return code %d but got %d", __LINE__,
            args.c_str(), expected_rc, rc);
-    } else if (streams.out.buffer() != expected_out) {
+    } else if (streams.out.contents() != expected_out) {
        err(L"Test failed on line %lu: [%ls]: expected [%ls] but got [%ls]", __LINE__, args.c_str(),
            escape_string(expected_out, ESCAPE_ALL).c_str(),
-            escape_string(streams.out.buffer(), ESCAPE_ALL).c_str());
+            escape_string(streams.out.contents(), ESCAPE_ALL).c_str());
    }
 }

--- a/src/io.cpp
+++ b/src/io.cpp
@ -27,8 +27,17 @@ void io_pipe_t::print() const {
 }

 void io_buffer_t::print() const {
-    fwprintf(stderr, L"buffer %p (input: %s, size %lu)\n", out_buffer_ptr(),
-             is_input ? "yes" : "no", (unsigned long)out_buffer_size());
+    fwprintf(stderr, L"buffer (input: %s, size %lu)\n",
+             is_input ? "yes" : "no", (unsigned long)buffer_.size());
+}
+
+void io_buffer_t::append_from_stream(const output_stream_t &stream) {
+    if (buffer_.discarded()) return;
+    if (stream.buffer().discarded()) {
+        buffer_.set_discard();
+        return;
+    }
+    buffer_.append_wide_buffer(stream.buffer());
 }

 void io_buffer_t::read() {
@ -44,11 +53,10 @@ void io_buffer_t::read() {
        debug(4, L"io_buffer_t::read: blocking read on fd %d", pipe_fd[0]);
        while (1) {
            char b[4096];
-            long l;
-            l = read_blocked(pipe_fd[0], b, 4096);
-            if (l == 0) {
+            long len = read_blocked(pipe_fd[0], b, 4096);
+            if (len == 0) {
                break;
-            } else if (l < 0) {
+            } else if (len < 0) {
                // exec_read_io_buffer is only called on jobs that have exited, and will therefore
                // never block. But a broken pipe seems to cause some flags to reset, causing the
                // EOF flag to not be set. Therefore, EAGAIN is ignored and we exit anyway.
@ -61,7 +69,7 @@ void io_buffer_t::read() {

                break;
            } else {
-                out_buffer_append(b, l);
+                buffer_.append(&b[0], &b[len]);
            }
        }
    }
--- a/src/io.h
+++ b/src/io.h
@ -22,6 +22,129 @@ using std::tr1::shared_ptr;
 #include "common.h"
 #include "env.h"

+/// separated_buffer_t is composed of a sequence of elements, some of which may be explicitly
+/// separated (e.g. through string split0) and some of which the separation is inferred. This enum
+/// tracks the type.
+enum class separation_type_t {
+    /// This element's separation should be inferred, e.g. through IFS.
+    inferred,
+    /// This element was explicitly separated and should not be separated further.
+    explicitly
+};
+
+/// A separated_buffer_t contains a list of elements, some of which may be separated explicitly and
+/// others which must be separated further by the user (e.g. via IFS).
+template <typename StringType>
+class separated_buffer_t {
+   public:
+    struct element_t {
+        StringType contents;
+        separation_type_t separation;
+
+        element_t(StringType contents, separation_type_t sep)
+            : contents(std::move(contents)), separation(sep) {}
+
+        bool is_explicitly_separated() const { return separation == separation_type_t::explicitly; }
+    };
+
+   private:
+    /// Limit on how much data we'll buffer. Zero means no limit.
+    size_t buffer_limit_;
+
+    /// Current size of all contents.
+    size_t contents_size_{0};
+
+    /// List of buffer elements.
+    std::vector<element_t> elements_;
+
+    /// True if we're discarding input because our buffer_limit has been exceeded.
+    bool discard = false;
+
+    /// Mark that we are about to add the given size \p delta to the buffer. \return true if we
+    /// succeed, false if we exceed buffer_limit.
+    bool try_add_size(size_t delta) {
+        if (discard) return false;
+        contents_size_ += delta;
+        if (contents_size_ < delta) {
+            // Overflow!
+            set_discard();
+            return false;
+        }
+        if (buffer_limit_ > 0 && contents_size_ > buffer_limit_) {
+            set_discard();
+            return false;
+        }
+        return true;
+    }
+
+    /// separated_buffer_t may not be copied.
+    separated_buffer_t(const separated_buffer_t &) = delete;
+    void operator=(const separated_buffer_t &) = delete;
+
+public:
+ /// Construct a separated_buffer_t with the given buffer limit \p limit, or 0 for no limit.
+ separated_buffer_t(size_t limit) : buffer_limit_(limit) {}
+
+ /// \return the buffer limit size, or 0 for no limit.
+ size_t limit() const { return buffer_limit_; }
+
+ /// \return the contents size.
+ size_t size() const { return contents_size_; }
+
+ /// \return whether the output has been discarded.
+ bool discarded() const { return discard; }
+
+ /// Mark the contents as discarded.
+ void set_discard() {
+     elements_.clear();
+     contents_size_ = 0;
+     discard = true;
+ }
+
+ /// Serialize the contents to a single string, where explicitly separated elements have a
+ /// newline appended.
+ StringType newline_serialized() const {
+     StringType result;
+     result.reserve(size());
+     for (const auto &elem : elements_) {
+         result.append(elem.contents);
+         if (elem.is_explicitly_separated()) {
+             result.push_back('\n');
+         }
+     }
+     return result;
+    }
+
+    /// \return the list of elements.
+    const std::vector<element_t> &elements() const { return elements_; }
+
+    /// Append an element with range [begin, end) and the given separation type \p sep.
+    template <typename Iterator>
+    void append(Iterator begin, Iterator end, separation_type_t sep = separation_type_t::inferred) {
+        if (!try_add_size(std::distance(begin, end))) return;
+        // Try merging with the last element.
+        if (sep == separation_type_t::inferred && !elements_.empty() && !elements_.back().is_explicitly_separated()) {
+            elements_.back().contents.append(begin, end);
+        } else {
+            elements_.emplace_back(StringType(begin, end), sep);
+        }
+    }
+
+    /// Append a string \p str with the given separation type \p sep.
+    void append(const StringType &str, separation_type_t sep = separation_type_t::inferred) {
+        append(str.begin(), str.end(), sep);
+    }
+
+    // Given that this is a narrow stream, convert a wide stream \p rhs to narrow and then append
+    // it.
+    template <typename RHSStringType>
+    void append_wide_buffer(const separated_buffer_t<RHSStringType> &rhs) {
+        for (const auto &rhs_elem : rhs.elements()) {
+            append(wcs2string(rhs_elem.contents), rhs_elem.separation);
+        }
+    }
+};
+
 /// Describes what type of IO operation an io_data_t represents.
 enum io_mode_t { IO_FILE, IO_PIPE, IO_FD, IO_BUFFER, IO_CLOSE };

@ -98,57 +221,25 @@ class io_pipe_t : public io_data_t {
 };

 class io_chain_t;
+class output_stream_t;
 class io_buffer_t : public io_pipe_t {
   private:
-    /// True if we're discarding input.
-    bool discard;
-    /// Limit on how much data we'll buffer. Zero means no limit.
-    size_t buffer_limit;
-    /// Buffer to save output in.
-    std::vector<char> out_buffer;
+    separated_buffer_t<std::string> buffer_;

    explicit io_buffer_t(int f, size_t limit)
        : io_pipe_t(IO_BUFFER, f, false /* not input */),
-          discard(false),
-          buffer_limit(limit),
-          out_buffer() {}
+          buffer_(limit) {}

   public:
    void print() const override;

    ~io_buffer_t() override;

+    /// Access the underlying buffer.
+    const separated_buffer_t<std::string> &buffer() const { return buffer_; }
+
    /// Function to append to the buffer.
-    void out_buffer_append(const char *ptr, size_t count) {
-        if (discard) return;
-        if (buffer_limit && out_buffer.size() + count > buffer_limit) {
-            discard = true;
-            out_buffer.clear();
-            return;
-        }
-        out_buffer.insert(out_buffer.end(), ptr, ptr + count);
-    }
-
-    /// Function to get a pointer to the buffer.
-    char *out_buffer_ptr(void) { return out_buffer.empty() ? NULL : &out_buffer.at(0); }
-
-    const char *out_buffer_ptr(void) const { return out_buffer.empty() ? NULL : &out_buffer.at(0); }
-
-    /// Function to get the size of the buffer.
-    size_t out_buffer_size(void) const { return out_buffer.size(); }
-
-    /// Function that returns true if we discarded the input because there was too much data.
-    bool output_discarded(void) { return discard; }
-
-    /// Function to explicitly put the object in discard mode. Meant to be used when moving
-    /// the results from an output_stream_t to an io_buffer_t.
-    void set_discard(void) {
-        discard = true;
-        out_buffer.clear();
-    }
-
-    /// This is used to transfer the buffer limit for this object to a output_stream_t object.
-    size_t get_buffer_limit(void) { return buffer_limit; }
+    void append(const char *ptr, size_t count) { buffer_.append(ptr, ptr + count); }

    /// Ensures that the pipes do not conflict with any fd redirections in the chain.
    bool avoid_conflicts_with_io_chain(const io_chain_t &ios);
@ -156,6 +247,10 @@ class io_buffer_t : public io_pipe_t {
    /// Close output pipe, and read from input pipe until eof.
    void read();

+    /// Appends data from a given output_stream_t.
+    /// Marks the receiver as discarded if the stream was discarded.
+    void append_from_stream(const output_stream_t &stream);
+
-    /// Create a IO_BUFFER type io redirection, complete with a pipe and a vector<char> for output.
+    /// Create an IO_BUFFER type io redirection, complete with a pipe and a buffer for output.
    /// The default file descriptor used is STDOUT_FILENO for buffering.
    ///
@ -193,81 +288,42 @@ bool pipe_avoid_conflicts_with_io_chain(int fds[2], const io_chain_t &ios);
 /// Class representing the output that a builtin can generate.
 class output_stream_t {
   private:
-    /// Limit on how much data we'll buffer. Zero means no limit.
-    size_t buffer_limit;
-    /// True if we're discarding input.
-    bool discard;
+    /// Storage for our data.
+    separated_buffer_t<wcstring> buffer_;
+
    // No copying.
-    output_stream_t(const output_stream_t &s);
-    void operator=(const output_stream_t &s);
-
-    wcstring buffer_;
-
-    void check_for_overflow() {
-        if (buffer_limit && buffer_.size() > buffer_limit) {
-            discard = true;
-            buffer_.clear();
-        }
-    }
+    output_stream_t(const output_stream_t &s) = delete;
+    void operator=(const output_stream_t &s) = delete;

   public:
-    output_stream_t(size_t buffer_limit_) : buffer_limit(buffer_limit_), discard(false) {}
+    output_stream_t(size_t buffer_limit) : buffer_(buffer_limit) {}

-#if 0
-    void set_buffer_limit(size_t buffer_limit_) { buffer_limit = buffer_limit_; }
-#endif
+    void append(const wcstring &s) { buffer_.append(s.begin(), s.end()); }

-    void append(const wcstring &s) {
-        if (discard) return;
-        buffer_.append(s);
-        check_for_overflow();
-    }
+    separated_buffer_t<wcstring> &buffer() { return buffer_; }

-    void append(const wchar_t *s) {
-        if (discard) return;
-        buffer_.append(s);
-        check_for_overflow();
-    }
+    const separated_buffer_t<wcstring> &buffer() const { return buffer_; }

-    void append(wchar_t s) {
-        if (discard) return;
-        buffer_.push_back(s);
-        check_for_overflow();
-    }
+    void append(const wchar_t *s) { append(s, wcslen(s)); }

-    void append(const wchar_t *s, size_t amt) {
-        if (discard) return;
-        buffer_.append(s, amt);
-        check_for_overflow();
-    }
+    void append(wchar_t s) { append(&s, 1); }

-    void push_back(wchar_t c) {
-        if (discard) return;
-        buffer_.push_back(c);
-        check_for_overflow();
-    }
+    void append(const wchar_t *s, size_t amt) { buffer_.append(s, s + amt); }
+
+    void push_back(wchar_t c) { append(c); }

    void append_format(const wchar_t *format, ...) {
-        if (discard) return;
        va_list va;
        va_start(va, format);
-        ::append_formatv(buffer_, format, va);
+        append_formatv(format, va);
        va_end(va);
-        check_for_overflow();
    }

-    void append_formatv(const wchar_t *format, va_list va_orig) {
-        if (discard) return;
-        ::append_formatv(buffer_, format, va_orig);
-        check_for_overflow();
-    }
+    void append_formatv(const wchar_t *format, va_list va) { append(vformat_string(format, va)); }

-    const wcstring &buffer() const { return buffer_; }
+    bool empty() const { return buffer_.size() == 0; }

-    /// Function that returns true if we discarded the input because there was too much data.
-    bool output_discarded(void) { return discard; }
-
-    bool empty() const { return buffer_.empty(); }
+    wcstring contents() const { return buffer_.newline_serialized(); }
 };

 struct io_streams_t {
--- a/src/parse_execution.cpp
+++ b/src/parse_execution.cpp
@ -328,7 +328,7 @@ parse_execution_result_t parse_execution_context_t::run_function_statement(
    proc_set_last_status(err);

    if (!streams.err.empty()) {
-        this->report_error(header, L"%ls", streams.err.buffer().c_str());
+        this->report_error(header, L"%ls", streams.err.contents().c_str());
        result = parse_execution_errored;
    }

--- a/src/proc.cpp
+++ b/src/proc.cpp
@ -758,19 +758,17 @@ static void read_try(job_t *j) {
        debug(3, L"proc::read_try('%ls')", j->command_wcstr());
        while (1) {
            char b[BUFFER_SIZE];
-            long l;
-
-            l = read_blocked(buff->pipe_fd[0], b, BUFFER_SIZE);
-            if (l == 0) {
+            long len = read_blocked(buff->pipe_fd[0], b, BUFFER_SIZE);
+            if (len == 0) {
                break;
-            } else if (l < 0) {
+            } else if (len < 0) {
                if (errno != EAGAIN) {
                    debug(1, _(L"An error occured while reading output from code block"));
                    wperror(L"read_try");
                }
                break;
            } else {
-                buff->out_buffer_append(b, l);
+                buff->append(b, len);
            }
        }
    }
--- a/tests/string.err
+++ b/tests/string.err
@ -294,3 +294,12 @@ string repeat -l fakearg

 ####################
 # Check NUL
+
+####################
+# string split0
+
+####################
+# string join0
+
+####################
+# string split0 in functions
--- a/tests/string.in
+++ b/tests/string.in
@ -340,4 +340,28 @@ printf 'a\0b' | string replace -r b g | string escape
 # TODO: These do not yet work!
 # printf 'a\0b' | string match '*b' | string escape

+logmsg string split0
+count (echo -ne 'abcdefghi' | string split0)
+count (echo -ne 'abc\x00def\x00ghi\x00' | string split0)
+count (echo -ne 'abc\x00def\x00ghi\x00\x00' | string split0)
+count (echo -ne 'abc\x00def\x00ghi' | string split0)
+count (echo -ne 'abc\ndef\x00ghi\x00' | string split0)
+count (echo -ne 'abc\ndef\nghi' | string split0)
+
+logmsg string join0
+set tmp beta alpha\ngamma
+count (string join \n $tmp)
+count (string join0 $tmp)
+count (string join0 $tmp | string split0)
+
+logmsg string split0 in functions
+# This function outputs some newline-separated content, and some
+# explicitly separated content.
+function dualsplit
+  echo alpha
+  echo beta
+  echo -ne 'gamma\x00delta' | string split0
+end
+count (dualsplit)
+
 exit 0
--- a/tests/string.out
+++ b/tests/string.out
@ -433,3 +433,22 @@ d
 a\x00b
 a\x00g
 a\x00g
+
+####################
+# string split0
+1
+3
+4
+3
+2
+1
+
+####################
+# string join0
+3
+2
+2
+
+####################
+# string split0 in functions
+4