Introduce wcs2string_callback

This is like wcs2string, but instead of returning a std::string, it invokes
a user-supplied function with each converted character.

The idea is to allow interleaved conversion and output.
This commit is contained in:
ridiculousfish 2020-07-29 17:16:51 -07:00
parent c9b42c6f1f
commit a0cb23bea5
3 changed files with 52 additions and 29 deletions

View file

@ -332,38 +332,13 @@ wcstring str2wcstring(const std::string &in, size_t len) {
return str2wcs_internal(in.data(), len); return str2wcs_internal(in.data(), len);
} }
/// This function is distinguished from wcs2str_internal in that it allows embedded null bytes.
std::string wcs2string(const wcstring &input) { std::string wcs2string(const wcstring &input) {
std::string result; std::string result;
result.reserve(input.size()); result.reserve(input.size());
wcs2string_callback(input.data(), input.size(), [&](const char *buff, size_t bufflen) {
mbstate_t state = {}; result.append(buff, bufflen);
char converted[MB_LEN_MAX]; return true;
});
for (auto wc : input) {
if (wc == INTERNAL_SEPARATOR) {
// do nothing
} else if (wc >= ENCODE_DIRECT_BASE && wc < ENCODE_DIRECT_BASE + 256) {
result.push_back(wc - ENCODE_DIRECT_BASE);
} else if (MB_CUR_MAX == 1) { // single-byte locale (C/POSIX/ISO-8859)
// If `wc` contains a wide character we emit a question-mark.
if (wc & ~0xFF) {
wc = '?';
}
converted[0] = wc;
result.append(converted, 1);
} else {
std::memset(converted, 0, sizeof converted);
size_t len = std::wcrtomb(converted, wc, &state);
if (len == static_cast<size_t>(-1)) {
FLOGF(char_encoding, L"Wide character U+%4X has no narrow representation", wc);
std::memset(&state, 0, sizeof(state));
} else {
result.append(converted, len);
}
}
}
return result; return result;
} }

View file

@ -8,6 +8,7 @@
#include <locale> #include <locale>
#include "common.h" #include "common.h"
#include "flog.h"
wcstring_range wcstring_tok(wcstring &str, const wcstring &needle, wcstring_range last) { wcstring_range wcstring_tok(wcstring &str, const wcstring &needle, wcstring_range last) {
using size_type = wcstring::size_type; using size_type = wcstring::size_type;
@ -196,3 +197,7 @@ wcstring join_strings(const wcstring_list_t &vals, wchar_t sep) {
} }
return result; return result;
} }
/// Out-of-line helper for wcs2string_callback: logs, in the char_encoding category, that \p wc
/// has no narrow representation.
/// \param wc the wide character that could not be converted.
void wcs2string_bad_char(wchar_t wc) {
    // Zero-pad to at least four hex digits so the message reads as a conventional code point
    // (U+00E9 rather than "U+  E9"), and pass the value as the unsigned int that %X expects.
    FLOGF(char_encoding, L"Wide character U+%04X has no narrow representation",
          static_cast<unsigned int>(wc));
}

View file

@ -3,10 +3,12 @@
#define FISH_WCSTRINGUTIL_H #define FISH_WCSTRINGUTIL_H
#include <algorithm> #include <algorithm>
#include <cstring>
#include <string> #include <string>
#include <utility> #include <utility>
#include "common.h" #include "common.h"
#include "expand.h"
/// Test if a string prefixes another. Returns true if a is a prefix of b. /// Test if a string prefixes another. Returns true if a is a prefix of b.
bool string_prefixes_string(const wcstring &proposed_prefix, const wcstring &value); bool string_prefixes_string(const wcstring &proposed_prefix, const wcstring &value);
@ -136,6 +138,47 @@ wcstring trim(wcstring input, const wchar_t *any_of);
/// Converts a string to lowercase. /// Converts a string to lowercase.
wcstring wcstolower(wcstring input); wcstring wcstolower(wcstring input);
// Out-of-line helper for wcs2string_callback.
void wcs2string_bad_char(wchar_t);
/// Implementation of wcs2string that accepts a callback.
/// Converts \p len wide characters starting at \p input to narrow bytes, one character at a time,
/// and invokes \p func with a (const char*, size_t) pair for each chunk of converted bytes.
/// The buffer handed to \p func is a scratch array that is overwritten on the next iteration, so
/// the callback must consume or copy the bytes before it returns.
/// If \p func returns false, it stops; otherwise it continues.
/// \return false if the callback returned false, otherwise true.
template <typename Func>
bool wcs2string_callback(const wchar_t *input, size_t len, const Func &func) {
    mbstate_t state = {};
    char converted[MB_LEN_MAX];

    for (size_t i = 0; i < len; i++) {
        wchar_t wc = input[i];
        // TODO: this doesn't seem sound.
        if (wc == INTERNAL_SEPARATOR) {
            // Nothing is emitted for this character.
        } else if (wc >= ENCODE_DIRECT_BASE && wc < ENCODE_DIRECT_BASE + 256) {
            // Characters in this 256-wide window are emitted as the single byte given by their
            // offset from ENCODE_DIRECT_BASE.
            converted[0] = static_cast<char>(wc - ENCODE_DIRECT_BASE);
            if (!func(converted, 1)) return false;
        } else if (MB_CUR_MAX == 1) {  // single-byte locale (C/POSIX/ISO-8859)
            // If `wc` contains a wide character we emit a question-mark.
            if (wc & ~0xFF) {
                wc = '?';
            }
            converted[0] = static_cast<char>(wc);
            if (!func(converted, 1)) return false;
        } else {
            std::memset(converted, 0, sizeof converted);
            // Named distinctly from the `len` parameter, which still bounds the enclosing loop.
            const size_t converted_len = std::wcrtomb(converted, wc, &state);
            if (converted_len == static_cast<size_t>(-1)) {
                // Conversion failed: report it, emit nothing for this character, and clear the
                // conversion state before moving on to the next character.
                wcs2string_bad_char(wc);
                std::memset(&state, 0, sizeof(state));
            } else {
                if (!func(converted, converted_len)) return false;
            }
        }
    }
    return true;
}
/// Support for iterating over a newline-separated string. /// Support for iterating over a newline-separated string.
template <typename Collection> template <typename Collection>
class line_iterator_t { class line_iterator_t {