Simplify the cached_esc_sequences_t structure

The type cached_esc_sequences_t caches escape sequences, and is tasked with finding an escape sequence that prefixes a given string. Before this fix, it did so by storing the lengths of cached escape sequences, and searching for substrings of that length. The new implementation instead stores all cached escape sequences in a sorted vector, and uses binary search to find the shortest escape sequence that is a prefix of the input. This is a substantial simplification that also reduces allocations.
2025-01-13 05:28:49 +00:00 · 2017-09-01 14:33:59 -07:00 · 2017-09-01 14:33:59 -07:00 · cb352317bd
commit cb352317bd
parent 91a0ba9be5
5 changed files with 62 additions and 62 deletions
--- a/src/common.cpp
+++ b/src/common.cpp
@ -1633,6 +1633,16 @@ bool string_prefixes_string(const wcstring &proposed_prefix, const wcstring &val
    return prefix_size <= value.size() && value.compare(0, prefix_size, proposed_prefix) == 0;
 }

+bool string_prefixes_string(const wchar_t *proposed_prefix, const wchar_t *value) {
+    for (size_t idx = 0; proposed_prefix[idx] != L'\0'; idx++) {
+        // Note if the prefix is longer than value, then we will compare a nonzero prefix character
+        // against a zero value character, and so we'll return false;
+        if (proposed_prefix[idx] != value[idx]) return false;
+    }
+    // We must have that proposed_prefix[idx] == L'\0', so we have a prefix match.
+    return true;
+}
+
 bool string_prefixes_string_case_insensitive(const wcstring &proposed_prefix,
                                             const wcstring &value) {
    size_t prefix_size = proposed_prefix.size();
--- a/src/common.h
+++ b/src/common.h
@ -286,6 +286,7 @@ std::string wcs2string(const wcstring &input);
 /// Test if a string prefixes another. Returns true if a is a prefix of b.
 bool string_prefixes_string(const wcstring &proposed_prefix, const wcstring &value);
 bool string_prefixes_string(const wchar_t *proposed_prefix, const wcstring &value);
+bool string_prefixes_string(const wchar_t *proposed_prefix, const wchar_t *value);

 /// Test if a string is a suffix of another.
 bool string_suffixes_string(const wcstring &proposed_suffix, const wcstring &value);
--- a/src/fish_tests.cpp
+++ b/src/fish_tests.cpp
@ -4280,6 +4280,30 @@ void test_maybe() {
    do_test(m2.missing_or_empty());
 }

+void test_cached_esc_sequences() {
+    cached_esc_sequences_t seqs;
+    do_test(seqs.find_entry(L"abc") == 0);
+    seqs.add_entry(L"abc");
+    seqs.add_entry(L"abc");
+    do_test(seqs.size() == 1);
+    do_test(seqs.find_entry(L"abc") == 3);
+    do_test(seqs.find_entry(L"abcd") == 3);
+    do_test(seqs.find_entry(L"abcde") == 3);
+    do_test(seqs.find_entry(L"xabcde") == 0);
+    seqs.add_entry(L"ac");
+    do_test(seqs.find_entry(L"abcd") == 3);
+    do_test(seqs.find_entry(L"acbd") == 2);
+    seqs.add_entry(L"wxyz");
+    do_test(seqs.find_entry(L"abc") == 3);
+    do_test(seqs.find_entry(L"abcd") == 3);
+    do_test(seqs.find_entry(L"wxyz123") == 4);
+    do_test(seqs.find_entry(L"qwxyz123") == 0);
+    do_test(seqs.size() == 3);
+    seqs.clear();
+    do_test(seqs.size() == 0);
+    do_test(seqs.find_entry(L"abcd") == 0);
+}
+
 /// Main test.
 int main(int argc, char **argv) {
    UNUSED(argc);
@ -4376,6 +4400,7 @@ int main(int argc, char **argv) {
    if (should_test_function("string")) test_string();
    if (should_test_function("illegal_command_exit_code")) test_illegal_command_exit_code();
    if (should_test_function("maybe")) test_maybe();
+    if (should_test_function("cached_esc_sequences")) test_cached_esc_sequences();
    // history_tests_t::test_history_speed();

    say(L"Encountered %d errors in low-level tests", err_count);
--- a/src/screen.cpp
+++ b/src/screen.cpp
@ -80,7 +80,8 @@ class scoped_buffer_t {
 };

 // Singleton of the cached escape sequences seen in prompts and similar strings.
-cached_esc_sequences_t cached_esc_sequences = cached_esc_sequences_t();
+// Note this is deliberately exported so that init_curses can clear it.
+cached_esc_sequences_t cached_esc_sequences;

 /// Tests if the specified narrow character sequence is present at the specified position of the
 /// specified wide character string. All of \c seq must match, but str may be longer than seq.
@ -278,7 +279,7 @@ size_t escape_code_length(const wchar_t *code) {
    if (!found) found = is_single_byte_escape_seq(code, &esc_seq_len);
    if (!found) found = is_csi_style_escape_seq(code, &esc_seq_len);
    if (!found) found = is_two_byte_escape_seq(code, &esc_seq_len);
-    if (found) cached_esc_sequences.add_entry(code, esc_seq_len);
+    if (found) cached_esc_sequences.add_entry(wcstring(code, esc_seq_len));
    return esc_seq_len;
 }

--- a/src/screen.h
+++ b/src/screen.h
@ -202,75 +202,38 @@ size_t escape_code_length(const wchar_t *code);
 // Maintain a mapping of escape sequences to their length for fast lookup.
 class cached_esc_sequences_t {
   private:
-    // Cached escape sequences we've already detected in the prompt and similar strings.
-    std::unordered_set<wcstring> cache;
-    // The escape sequence lengths we've cached. My original implementation used min and max
-    // length variables. The cache was then iterated over using a loop like this:
-    // `for (size_t l = min; l <= max; l++)`.
-    //
-    // However that is inefficient when there are big gaps in the lengths. This has been observed
-    // with the BobTheFish theme which has a bunch of 5 and 6 char sequences and 16 to 19 char
-    // sequences and almost nothing in between. So instead we keep track of only those escape
-    // sequence lengths we've actually cached to avoid checking for matches of lengths we know are
-    // not in our cache.
-    std::vector<size_t> lengths;
-    std::unordered_map<size_t, size_t> lengths_match_count;
-    size_t cache_hits;
+    // Cached escape sequences we've already detected in the prompt and similar strings, ordered
+    // lexicographically.
+    std::vector<wcstring> cache_;

   public:
-    explicit cached_esc_sequences_t() : cache(), lengths(), lengths_match_count(), cache_hits(0) {}
+    /// \return the size of the cache.
+    size_t size() const { return cache_.size(); }

-    void add_entry(const wchar_t *entry, size_t len) {
-        auto str = wcstring(entry, len);
-
-#if 0
-        // This is a can't happen scenario. I only wrote this to validate during testing that it
-        // wouldn't be triggered. I'm leaving it in but commented out in case someone feels the need
-        // to re-enable the check.
-        auto match = cache.find(str);
-        if (match != cache.end()) {
-            debug(0, "unexpected add_entry() call of a value already in the cache: '%ls'",
-                  escape(str.c_str(), ESCAPE_ALL).c_str());
-            return;
-        }
-#endif
-
-        cache.emplace(str);
-        if (std::find(lengths.begin(), lengths.end(), len) == lengths.end()) {
-            lengths.push_back(len);
-            lengths_match_count[len] = 0;
+    /// Insert the entry \p str in its sorted position, if it is not already present in the cache.
+    void add_entry(wcstring str) {
+        auto where = std::upper_bound(cache_.begin(), cache_.end(), str);
+        if (where == cache_.begin() || where[-1] != str) {
+            cache_.emplace(where, std::move(str));
        }
    }

-    size_t find_entry(const wchar_t *entry) {
-        size_t entry_len = wcslen(entry);
-        for (auto len : lengths) {
-            if (len > entry_len) continue;
-            auto match = cache.find(wcstring(entry, len));
-            if (match != cache.end()) {  // we found a matching cached sequence
-                // Periodically sort  the sequence lengths so we check for matches going from the
-                // most frequently matching lengths to least frequent.
-                lengths_match_count[len]++;
-                if (++cache_hits % 1000 == 0) {
-                    // std::sort(lengths.begin(), lengths.end(), custom_cmp(lengths_match_count));
-                    std::sort(lengths.begin(), lengths.end(), [&](size_t l1, size_t l2) {
-                        return lengths_match_count[l1] > lengths_match_count[l2];
-                    });
-                }
-
-                return len;  // return the length of the matching cached sequence
-            }
+    /// \return the length of a string that matches a prefix of \p entry.
+    size_t find_entry(const wchar_t *entry) const {
+        // Do a binary search and see if the escape code right before our entry is a prefix of our
+        // entry. Note this assumes that escape codes are prefix-free: no escape code is a prefix of
+        // another one. This seems like a safe assumption.
+        auto where = std::upper_bound(cache_.begin(), cache_.end(), entry);
+        // 'where' is now the first element that is greater than entry. Thus where-1 is the last
+        // element that is less than or equal to entry.
+        if (where != cache_.begin()) {
+            const wcstring &candidate = where[-1];
+            if (string_prefixes_string(candidate.c_str(), entry)) return candidate.size();
        }
-
-        return 0;  // no cached sequence matches the entry
+        return 0;
    }

-    void clear() {
-        cache.clear();
-        lengths.clear();
-        lengths_match_count.clear();
-        cache_hits = 0;
-    }
+    void clear() { cache_.clear(); }
 };

 // Singleton that is exposed so that the cache can be invalidated when terminal related variables