Optimize keyword detection

The data stored in these containers is small enough that it is worth creating distinct sets for each lookup. In a microbenchmark of these changes, the single-lookup version of the function with lookups gated on the length of input (bypassed entirely if the input is longer than the longest key in the container) provided a 1.5x-3.5x speedup over the previous implementation. Additionally, as the collections are static and their contents are never modified after startup, it makes no sense to continuously calculate the location of and allocate an iterator for the `!= foo.end()` comparison; the end iterator is now statically cached. I'm not expecting massive speed gains out of this change, but the parser does perform enough of these to make it worth optimizing in this way.
2024-12-27 05:13:10 +00:00 · 2019-04-03 20:38:29 -05:00 · 2019-04-03 20:38:29 -05:00 · bc66921ac9
commit bc66921ac9
parent e2ed6baf43
1 changed files with 70 additions and 14 deletions
--- a/src/parser_keywords.cpp
+++ b/src/parser_keywords.cpp
@ -8,26 +8,82 @@
 #include "fallback.h"  // IWYU pragma: keep
 #include "parser_keywords.h"

-bool parser_keywords_skip_arguments(const wcstring &cmd) {
-    static const wcstring el = L"else";
-    static const wcstring beg = L"begin";
-    return cmd == el || cmd == beg;
-}
+typedef std::unordered_set<wcstring> string_set_t;

-static const std::unordered_set<wcstring> subcommand_keywords = {L"command", L"builtin", L"while", L"exec",
-                                                     L"if",      L"and",     L"or",    L"not"};
-bool parser_keywords_is_subcommand(const wcstring &cmd) {
-    return parser_keywords_skip_arguments(cmd) || contains(subcommand_keywords, cmd);
-}
+static const wcstring skip_keywords[] {
+        L"else",
+        L"begin",
+};

-static const std::unordered_set<wcstring> block_keywords = {L"for",      L"while",  L"if",
-                                                L"function", L"switch", L"begin"};
-bool parser_keywords_is_block(const wcstring &word) { return contains(block_keywords, word); }
+static const wcstring subcommand_keywords[] {
+        L"command", L"builtin", L"while", L"exec",
+        L"if",      L"and",     L"or",    L"not"
+};

-static const std::unordered_set<wcstring> reserved_keywords = {L"end",      L"case",   L"else",     L"return",
+static const string_set_t block_keywords = {
+        L"for",      L"while",  L"if",
+        L"function", L"switch", L"begin"
+};
+
+static const wcstring reserved_keywords[] = {
+        L"end",      L"case",   L"else",     L"return",
        L"continue", L"break",  L"argparse", L"read",
-                                                   L"set",      L"status", L"test",     L"["};
-bool parser_keywords_is_reserved(const wcstring &word) {
-    return parser_keywords_is_block(word) || parser_keywords_is_subcommand(word) ||
-           contains(reserved_keywords, word);
+        L"set",      L"status", L"test",     L"["
+};
+
+// The lists above are purposely implemented separately from the logic below, so that future
+// maintainers may assume the contents of the list based off their names, and not off what the
+// functions below require them to contain.
+
+static size_t list_max_length(const string_set_t &list) {
+    size_t result = 0;
+    for (const auto &w: list) {
+        if (w.length() > result) {
+            result = w.length();
+        }
+    }
+    return result;
+}
+
+bool parser_keywords_skip_arguments(const wcstring &cmd) {
+    return cmd == skip_keywords[0] || cmd == skip_keywords[1];
+}
+
+bool parser_keywords_is_subcommand(const wcstring &cmd) {
+    const static string_set_t search_list = ([](){
+        string_set_t results;
+        results.insert(std::begin(subcommand_keywords), std::end(subcommand_keywords));
+        results.insert(std::begin(skip_keywords), std::end(skip_keywords));
+        return results;
+    })();
+
+    const static auto max_len = list_max_length(search_list);
+    const static auto not_found = search_list.end();
+
+    // Everything above is executed only at startup, this is the actual optimized search routine:
+    return cmd.length() <= max_len && search_list.find(cmd) != not_found;
+}
+
+bool parser_keywords_is_block(const wcstring &word) {
+    const static auto max_len = list_max_length(block_keywords);
+    const static auto not_found = block_keywords.end();
+
+    // Everything above is executed only at startup, this is the actual optimized search routine:
+    return word.length() <= max_len && block_keywords.find(word) != not_found;
+}
+
+bool parser_keywords_is_reserved(const wcstring &word) {
+    const static string_set_t search_list = ([](){
+        string_set_t results;
+        results.insert(std::begin(subcommand_keywords), std::end(subcommand_keywords));
+        results.insert(std::begin(skip_keywords), std::end(skip_keywords));
+        results.insert(std::begin(block_keywords), std::end(block_keywords));
+        results.insert(std::begin(reserved_keywords), std::end(reserved_keywords));
+        return results;
+    })();
+    const static auto max_len = list_max_length(search_list);
+    const static auto not_found = search_list.end();
+
+    // Everything above is executed only at startup, this is the actual optimized search routine:
+    return word.length() <= max_len && search_list.find(word) != not_found;
 }