Space and time optimizations for parse_node_t. Reduced size from 48 bytes to 20 bytes.

This commit is contained in:
parent b520a03c57
commit 9fece3fdf1

5 changed files with 63 additions and 48 deletions
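
Where the 48 -> 20 bytes comes from, as a rough sketch: on a typical LP64 build, size_t is 8 bytes and a plain enum is 4, while the new layout uses 4-byte offsets, 1-byte counts, and (via __packed) 1-byte enums. The standalone program below approximates both layouts and prints their sizes; it is an illustration only, not fish code, and the exact figures depend on the compiler and ABI.

// Rough, standalone approximation of the old and new parse_node_t layouts.
// Field names follow the diff below; the struct definitions themselves are
// illustrative, not the real fish types.
#include <cstdio>
#include <cstdint>
#include <cstddef>

enum fake_type_t { fake_type_invalid };

struct old_layout_t                 // before this commit
{
    enum fake_type_t type;          // 4 bytes, then 4 bytes padding
    size_t source_start;            // 8
    size_t source_length;           // 8
    size_t parent;                  // node_offset_t was size_t: 8
    size_t child_start;             // 8
    uint8_t child_count;            // 1
    uint8_t production_idx;         // 1, then 6 bytes tail padding
};                                  // typically 48 bytes

struct new_layout_t                 // after this commit
{
    uint32_t source_start;          // source_offset_t: 4
    uint32_t source_length;         // 4
    uint32_t parent;                // node_offset_t is now uint32_t: 4
    uint32_t child_start;           // 4
    uint8_t child_count;            // 1
    uint8_t production_idx;         // 1
    uint8_t type;                   // stands in for the 1-byte __packed enum
};                                  // 19 bytes of fields, padded to 20

int main(void)
{
    printf("old: %zu bytes, new: %zu bytes\n", sizeof(old_layout_t), sizeof(new_layout_t));
    return 0;
}
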
@@ -218,7 +218,9 @@
 #if __GNUC__ >= 3
 #define __warn_unused __attribute__ ((warn_unused_result))
 #define __sentinel __attribute__ ((sentinel))
+#define __packed __attribute__ ((packed))
 #else
 #define __warn_unused
 #define __sentinel
+#define __packed
 #endif
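
The new __packed macro wraps GCC's packed attribute; applied to an enum (as the next two hunks do) it lets the compiler store the enum in the smallest integer type that holds all enumerators. A minimal illustration, assuming GCC or Clang:

// Minimal check of the packed-enum behaviour (GCC/Clang extension).
#include <cstdio>

enum plain_e  { plain_a, plain_b };
enum packed_e { packed_a, packed_b } __attribute__ ((packed));

int main(void)
{
    // On a typical GCC/Clang target this prints "4 1".
    printf("%zu %zu\n", sizeof(enum plain_e), sizeof(enum packed_e));
    return 0;
}
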
@@ -70,7 +70,7 @@ enum parse_token_type_t
 
     LAST_TOKEN_OR_SYMBOL = parse_token_type_terminate,
     FIRST_PARSE_TOKEN_TYPE = parse_token_type_string
-};
+} __packed;
 
 enum parse_keyword_t
 {
@@ -93,7 +93,7 @@ enum parse_keyword_t
     parse_keyword_exec,
 
     LAST_KEYWORD = parse_keyword_builtin
-};
+} __packed;
 
 /* Statement decorations. This matches the order of productions in decorated_statement */
 enum parse_statement_decoration_t
@@ -72,7 +72,8 @@ node_offset_t parse_execution_context_t::get_offset(const parse_node_t &node) co
     const parse_node_t *addr = &node;
     const parse_node_t *base = &this->tree.at(0);
     assert(addr >= base);
-    node_offset_t offset = addr - base;
+    assert(addr - base < SOURCE_OFFSET_INVALID);
+    node_offset_t offset = static_cast<node_offset_t>(addr - base);
     assert(offset < this->tree.size());
     assert(&tree.at(offset) == &node);
     return offset;
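
The get_offset() change above is an instance of a general pattern: assert that a pointer difference fits before narrowing it into a 32-bit offset, so a truncated index cannot slip through silently. A generic sketch of the pattern (hypothetical names, not fish's API):

// Generic sketch: compute a 32-bit offset of an element within a vector,
// asserting that the difference fits before the narrowing cast.
#include <cassert>
#include <cstdint>
#include <vector>

typedef uint32_t offset32_t;
#define OFFSET32_INVALID (static_cast<offset32_t>(-1))

template <typename T>
offset32_t offset_within(const std::vector<T> &vec, const T &elem)
{
    const T *base = &vec.at(0);
    assert(&elem >= base);
    assert(&elem - base < OFFSET32_INVALID);   // must fit in 32 bits
    offset32_t offset = static_cast<offset32_t>(&elem - base);
    assert(offset < vec.size());
    return offset;
}
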
@@ -13,6 +13,13 @@ static bool production_is_empty(const production_t *production)
     return (*production)[0] == token_type_invalid;
 }
 
+void swap2(parse_node_tree_t &a, parse_node_tree_t &b)
+{
+    fprintf(stderr, "Swapping!\n");
+    // This uses the base vector implementation
+    a.swap(b);
+}
+
 /** Returns a string description of this parse error */
 wcstring parse_error_t::describe_with_prefix(const wcstring &src, const wcstring &prefix, bool skip_caret) const
 {
@@ -404,7 +411,7 @@ static void dump_tree_recursive(const parse_node_tree_t &nodes, const wcstring &
 
     result->push_back(L'\n');
     ++*line;
-    for (size_t child_idx = node.child_start; child_idx < node.child_start + node.child_count; child_idx++)
+    for (node_offset_t child_idx = node.child_start; child_idx < node.child_start + node.child_count; child_idx++)
     {
         dump_tree_recursive(nodes, src, child_idx, indent + 1, result, line, inout_first_node_not_dumped);
     }
@@ -529,11 +536,18 @@ class parse_ll_t
        }
 
        // Get the parent index. But we can't get the parent parse node yet, since it may be made invalid by adding children
-        const size_t parent_node_idx = symbol_stack.back().node_idx;
+        const node_offset_t parent_node_idx = symbol_stack.back().node_idx;
 
        // Add the children. Confusingly, we want our nodes to be in forwards order (last token last, so dumps look nice), but the symbols should be reverse order (last token first, so it's lowest on the stack)
-        const size_t child_start = nodes.size();
-        size_t child_count = 0;
+        const size_t child_start_big = nodes.size();
+        assert(child_start_big < NODE_OFFSET_INVALID);
+        node_offset_t child_start = static_cast<node_offset_t>(child_start_big);
+
+        // To avoid constructing multiple nodes, we push_back a single one that we modify
+        parse_node_t representative_child(token_type_invalid);
+        representative_child.parent = parent_node_idx;
+
+        node_offset_t child_count = 0;
        for (size_t i=0; i < MAX_SYMBOLS_PER_PRODUCTION; i++)
        {
            production_element_t elem = (*production)[i];
@@ -542,16 +556,12 @@ class parse_ll_t
                // All done, bail out
                break;
            }
-            else
-            {
-                // Append the parse node.
-                parse_token_type_t child_type = production_element_type(elem);
-                parse_node_t child = parse_node_t(child_type);
-                child.parent = parent_node_idx;
-                nodes.push_back(child);
-                child_count++;
-            }
+
+            // Generate the parse node.
+            representative_child.type = production_element_type(elem);
+            nodes.push_back(representative_child);
+            child_count++;
        }
 
        // Update the parent
        parse_node_t &parent_node = nodes.at(parent_node_idx);
@@ -566,7 +576,7 @@ class parse_ll_t
        // Replace the top of the stack with new stack elements corresponding to our new nodes. Note that these go in reverse order.
        symbol_stack.pop_back();
        symbol_stack.reserve(symbol_stack.size() + child_count);
-        size_t idx = child_count;
+        node_offset_t idx = child_count;
        while (idx--)
        {
            production_element_t elem = (*production)[idx];
@@ -652,18 +662,17 @@ void parse_ll_t::dump_stack(void) const
 // Since children always appear after their parents, we can implement this very simply by walking backwards
 void parse_ll_t::determine_node_ranges(void)
 {
-    const size_t source_start_invalid = -1;
     size_t idx = nodes.size();
     while (idx--)
     {
-        parse_node_t *parent = &nodes.at(idx);
+        parse_node_t *parent = &nodes[idx];
 
         // Skip nodes that already have a source range. These are terminal nodes.
-        if (parent->source_start != source_start_invalid)
+        if (parent->source_start != SOURCE_OFFSET_INVALID)
            continue;
 
         // Ok, this node needs a source range. Get all of its children, and then set its range.
-        size_t min_start = source_start_invalid, max_end = 0; //note source_start_invalid is huge
+        source_offset_t min_start = SOURCE_OFFSET_INVALID, max_end = 0; //note SOURCE_OFFSET_INVALID is huge
        for (node_offset_t i=0; i < parent->child_count; i++)
        {
            const parse_node_t &child = nodes.at(parent->child_offset(i));
@@ -674,7 +683,7 @@ void parse_ll_t::determine_node_ranges(void)
            }
        }
 
-        if (min_start != source_start_invalid)
+        if (min_start != SOURCE_OFFSET_INVALID)
        {
            assert(max_end >= min_start);
            parent->source_start = min_start;
@@ -831,7 +840,7 @@ void parse_ll_t::parse_error(const wchar_t *expected, parse_token_t token)
 void parse_ll_t::reset_symbols(enum parse_token_type_t goal)
 {
     /* Add a new goal node, and then reset our symbol list to point at it */
-    node_offset_t where = nodes.size();
+    node_offset_t where = static_cast<node_offset_t>(nodes.size());
     nodes.push_back(parse_node_t(goal));
 
     symbol_stack.clear();
@@ -1047,14 +1056,10 @@ static parse_keyword_t keyword_for_token(token_type tok, const wchar_t *tok_txt)
 }
 
 /* Placeholder invalid token */
-static const parse_token_t kInvalidToken = {token_type_invalid,
-        parse_keyword_none, false, false, static_cast<size_t>(-1),
-        static_cast<size_t>(-1)};
+static const parse_token_t kInvalidToken = {token_type_invalid, parse_keyword_none, false, false, SOURCE_OFFSET_INVALID, 0};
 
 /* Terminal token */
-static const parse_token_t kTerminalToken = {parse_token_type_terminate,
-        parse_keyword_none, false, false, static_cast<size_t>(-1),
-        static_cast<size_t>(-1)};
+static const parse_token_t kTerminalToken = {parse_token_type_terminate, parse_keyword_none, false, false, SOURCE_OFFSET_INVALID, 0};
 
 static inline bool is_help_argument(const wchar_t *txt)
 {
@@ -1082,8 +1087,8 @@ static inline parse_token_t next_parse_token(tokenizer_t *tok)
     result.keyword = keyword_for_token(tok_type, tok_txt);
     result.has_dash_prefix = (tok_txt[0] == L'-');
     result.is_help_argument = result.has_dash_prefix && is_help_argument(tok_txt);
-    result.source_start = (size_t)tok_start;
-    result.source_length = tok_extent;
+    result.source_start = (source_offset_t)tok_start;
+    result.source_length = (source_offset_t)tok_extent;
 
     tok_next(tok);
     return result;
@@ -1195,7 +1200,7 @@ const parse_node_t *parse_node_tree_t::get_child(const parse_node_t &parent, nod
 
 const parse_node_t &parse_node_tree_t::find_child(const parse_node_t &parent, parse_token_type_t type) const
 {
-    for (size_t i=0; i < parent.child_count; i++)
+    for (node_offset_t i=0; i < parent.child_count; i++)
     {
         const parse_node_t *child = this->get_child(parent, i);
         if (child->type == type)
@@ -1241,7 +1246,7 @@ static void find_nodes_recursive(const parse_node_tree_t &tree, const parse_node
     if (result->size() < max_count)
     {
         if (parent.type == type) result->push_back(&parent);
-        for (size_t i=0; i < parent.child_count; i++)
+        for (node_offset_t i=0; i < parent.child_count; i++)
         {
             const parse_node_t *child = tree.get_child(parent, i);
             assert(child != NULL);
@@ -1478,7 +1483,7 @@ const parse_node_t *parse_node_tree_t::next_node_in_node_list(const parse_node_t
        const parse_node_t *next_cursor = NULL;
 
        /* Walk through the children */
-        for (size_t i=0; i < list_cursor->child_count; i++)
+        for (node_offset_t i=0; i < list_cursor->child_count; i++)
        {
            const parse_node_t *child = this->get_child(*list_cursor, i);
            if (child->type == entry_type)

parse_tree.h (33 changed lines)
@@ -18,9 +18,15 @@
 
 class parse_node_t;
 class parse_node_tree_t;
-typedef size_t node_offset_t;
+
+typedef uint32_t node_offset_t;
+
 #define NODE_OFFSET_INVALID (static_cast<node_offset_t>(-1))
 
+typedef uint32_t source_offset_t;
+
+#define SOURCE_OFFSET_INVALID (static_cast<source_offset_t>(-1))
+
 struct parse_error_t
 {
     /** Text of the error */
@@ -51,8 +57,8 @@ struct parse_token_t
     enum parse_keyword_t keyword; // Any keyword represented by this token
     bool has_dash_prefix; // Hackish: whether the source contains a dash prefix
     bool is_help_argument; // Hackish: whether the source looks like '-h' or '--help'
-    size_t source_start;
-    size_t source_length;
+    source_offset_t source_start;
+    source_offset_t source_length;
 
     wcstring describe() const;
     wcstring user_presentable_description() const;
@@ -83,35 +89,36 @@ wcstring parse_dump_tree(const parse_node_tree_t &tree, const wcstring &src);
 wcstring token_type_description(parse_token_type_t type);
 wcstring keyword_description(parse_keyword_t type);
 
-/** Class for nodes of a parse tree */
+/** Class for nodes of a parse tree. Since there's a lot of these, the size and order of the fields is important. */
 class parse_node_t
 {
 public:
 
-    /* Type of the node */
-    enum parse_token_type_t type;
-
     /* Start in the source code */
-    size_t source_start;
+    source_offset_t source_start;
 
     /* Length of our range in the source code */
-    size_t source_length;
+    source_offset_t source_length;
 
     /* Parent */
     node_offset_t parent;
 
     /* Children */
     node_offset_t child_start;
 
+    /* Number of children */
     uint8_t child_count;
 
     /* Which production was used */
     uint8_t production_idx;
 
+    /* Type of the node */
+    enum parse_token_type_t type;
+
     /* Description */
     wcstring describe(void) const;
 
     /* Constructor */
-    explicit parse_node_t(parse_token_type_t ty) : type(ty), source_start(-1), source_length(0), parent(NODE_OFFSET_INVALID), child_start(0), child_count(0), production_idx(-1)
+    explicit parse_node_t(parse_token_type_t ty) : source_start(SOURCE_OFFSET_INVALID), source_length(0), parent(NODE_OFFSET_INVALID), child_start(0), child_count(0), production_idx(-1), type(ty)
     {
     }
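
A side note on the constructor change in the hunk above: C++ initializes non-static members in declaration order, not in the order the initializer list is written, so moving type to the end of the class is mirrored by moving type(ty) to the end of the list (otherwise compilers warn under -Wreorder). A small, generic illustration, not fish code:

// Members are initialized in declaration order; keeping the initializer list
// in that same order avoids -Wreorder warnings and reads truthfully.
#include <cstdint>

struct small_node_t
{
    uint32_t source_start;
    uint8_t  child_count;
    uint8_t  type;   // declared last, so it is initialized last

    explicit small_node_t(uint8_t ty)
        : source_start(0), child_count(0), type(ty)   // matches declaration order
    {
    }
};

int main(void)
{
    small_node_t n(1);
    return n.type == 1 ? 0 : 1;
}
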
@@ -124,7 +131,7 @@ public:
     /* Indicate if this node has a range of source code associated with it */
     bool has_source() const
     {
-        return source_start != (size_t)(-1);
+        return source_start != SOURCE_OFFSET_INVALID;
     }
 
     /* Gets source for the node, or the empty string if it has no source */
@@ -143,7 +150,6 @@ public:
     }
 };
 
-
 /* The parse tree itself */
 class parse_node_tree_t : public std::vector<parse_node_t>
 {
@@ -200,6 +206,7 @@ public:
     parse_node_list_t specific_statements_for_job(const parse_node_t &job) const;
 };
 
+
 /* The big entry point. Parse a string, attempting to produce a tree for the given goal type */
 bool parse_tree_from_string(const wcstring &str, parse_tree_flags_t flags, parse_node_tree_t *output, parse_error_list_t *errors, parse_token_type_t goal = symbol_job_list);
 