From 9fece3fdf17ae642260ab8f8b85cb97a65bd14f7 Mon Sep 17 00:00:00 2001 From: ridiculousfish Date: Tue, 25 Mar 2014 20:06:34 -0700 Subject: [PATCH] Space and time optimizations for parse_node_t. Reduced size from 48 bytes to 20 bytes. --- osx/config.h | 2 ++ parse_constants.h | 4 +-- parse_execution.cpp | 3 +- parse_tree.cpp | 69 ++++++++++++++++++++++++--------------------- parse_tree.h | 33 +++++++++++++--------- 5 files changed, 63 insertions(+), 48 deletions(-) diff --git a/osx/config.h b/osx/config.h index 4968a78b2..99f837427 100644 --- a/osx/config.h +++ b/osx/config.h @@ -218,7 +218,9 @@ #if __GNUC__ >= 3 #define __warn_unused __attribute__ ((warn_unused_result)) #define __sentinel __attribute__ ((sentinel)) +#define __packed __attribute__ ((packed)) #else #define __warn_unused #define __sentinel +#define __packed #endif diff --git a/parse_constants.h b/parse_constants.h index 8ff96407f..10818ffd0 100644 --- a/parse_constants.h +++ b/parse_constants.h @@ -70,7 +70,7 @@ enum parse_token_type_t LAST_TOKEN_OR_SYMBOL = parse_token_type_terminate, FIRST_PARSE_TOKEN_TYPE = parse_token_type_string -}; +} __packed; enum parse_keyword_t { @@ -93,7 +93,7 @@ enum parse_keyword_t parse_keyword_exec, LAST_KEYWORD = parse_keyword_builtin -}; +} __packed; /* Statement decorations. 
This matches the order of productions in decorated_statement */ enum parse_statement_decoration_t diff --git a/parse_execution.cpp b/parse_execution.cpp index 9267fb79b..484d8741e 100644 --- a/parse_execution.cpp +++ b/parse_execution.cpp @@ -72,7 +72,8 @@ node_offset_t parse_execution_context_t::get_offset(const parse_node_t &node) co const parse_node_t *addr = &node; const parse_node_t *base = &this->tree.at(0); assert(addr >= base); - node_offset_t offset = addr - base; + assert(addr - base < SOURCE_OFFSET_INVALID); + node_offset_t offset = static_cast<node_offset_t>(addr - base); assert(offset < this->tree.size()); assert(&tree.at(offset) == &node); return offset; diff --git a/parse_tree.cpp b/parse_tree.cpp index a1de4f966..7ea8481a9 100644 --- a/parse_tree.cpp +++ b/parse_tree.cpp @@ -13,6 +13,13 @@ static bool production_is_empty(const production_t *production) return (*production)[0] == token_type_invalid; } +void swap2(parse_node_tree_t &a, parse_node_tree_t &b) +{ + fprintf(stderr, "Swapping!\n"); + // This uses the base vector implementation + a.swap(b); +} + /** Returns a string description of this parse error */ wcstring parse_error_t::describe_with_prefix(const wcstring &src, const wcstring &prefix, bool skip_caret) const { @@ -404,7 +411,7 @@ static void dump_tree_recursive(const parse_node_tree_t &nodes, const wcstring & result->push_back(L'\n'); ++*line; - for (size_t child_idx = node.child_start; child_idx < node.child_start + node.child_count; child_idx++) + for (node_offset_t child_idx = node.child_start; child_idx < node.child_start + node.child_count; child_idx++) { dump_tree_recursive(nodes, src, child_idx, indent + 1, result, line, inout_first_node_not_dumped); } @@ -529,28 +536,31 @@ class parse_ll_t } // Get the parent index. 
But we can't get the parent parse node yet, since it may be made invalid by adding children - const size_t parent_node_idx = symbol_stack.back().node_idx; + const node_offset_t parent_node_idx = symbol_stack.back().node_idx; // Add the children. Confusingly, we want our nodes to be in forwards order (last token last, so dumps look nice), but the symbols should be reverse order (last token first, so it's lowest on the stack) - const size_t child_start = nodes.size(); - size_t child_count = 0; + const size_t child_start_big = nodes.size(); + assert(child_start_big < NODE_OFFSET_INVALID); + node_offset_t child_start = static_cast<node_offset_t>(child_start_big); + + // To avoid constructing multiple nodes, we push_back a single one that we modify + parse_node_t representative_child(token_type_invalid); + representative_child.parent = parent_node_idx; + + node_offset_t child_count = 0; for (size_t i=0; i < MAX_SYMBOLS_PER_PRODUCTION; i++) { production_element_t elem = (*production)[i]; - if (!production_element_is_valid(elem)) + if (! production_element_is_valid(elem)) { // All done, bail out break; } - else - { - // Generate the parse node. - parse_token_type_t child_type = production_element_type(elem); - parse_node_t child = parse_node_t(child_type); - child.parent = parent_node_idx; - nodes.push_back(child); - child_count++; - } + + // Append the parse node. + representative_child.type = production_element_type(elem); + nodes.push_back(representative_child); + child_count++; } // Update the parent @@ -566,7 +576,7 @@ class parse_ll_t // Replace the top of the stack with new stack elements corresponding to our new nodes. Note that these go in reverse order. 
symbol_stack.pop_back(); symbol_stack.reserve(symbol_stack.size() + child_count); - size_t idx = child_count; + node_offset_t idx = child_count; while (idx--) { production_element_t elem = (*production)[idx]; @@ -652,18 +662,17 @@ void parse_ll_t::dump_stack(void) const // Since children always appear after their parents, we can implement this very simply by walking backwards void parse_ll_t::determine_node_ranges(void) { - const size_t source_start_invalid = -1; size_t idx = nodes.size(); while (idx--) { - parse_node_t *parent = &nodes.at(idx); + parse_node_t *parent = &nodes[idx]; // Skip nodes that already have a source range. These are terminal nodes. - if (parent->source_start != source_start_invalid) + if (parent->source_start != SOURCE_OFFSET_INVALID) continue; // Ok, this node needs a source range. Get all of its children, and then set its range. - size_t min_start = source_start_invalid, max_end = 0; //note source_start_invalid is huge + source_offset_t min_start = SOURCE_OFFSET_INVALID, max_end = 0; //note SOURCE_OFFSET_INVALID is huge for (node_offset_t i=0; i < parent->child_count; i++) { const parse_node_t &child = nodes.at(parent->child_offset(i)); @@ -674,7 +683,7 @@ void parse_ll_t::determine_node_ranges(void) } } - if (min_start != source_start_invalid) + if (min_start != SOURCE_OFFSET_INVALID) { assert(max_end >= min_start); parent->source_start = min_start; @@ -831,7 +840,7 @@ void parse_ll_t::parse_error(const wchar_t *expected, parse_token_t token) void parse_ll_t::reset_symbols(enum parse_token_type_t goal) { /* Add a new goal node, and then reset our symbol list to point at it */ - node_offset_t where = nodes.size(); + node_offset_t where = static_cast<node_offset_t>(nodes.size()); nodes.push_back(parse_node_t(goal)); symbol_stack.clear(); @@ -1047,14 +1056,10 @@ static parse_keyword_t keyword_for_token(token_type tok, const wchar_t *tok_txt) } /* Placeholder invalid token */ -static const parse_token_t kInvalidToken = {token_type_invalid, 
-parse_keyword_none, false, false, static_cast<size_t>(-1), - static_cast<size_t>(-1)}; +static const parse_token_t kInvalidToken = {token_type_invalid, parse_keyword_none, false, false, SOURCE_OFFSET_INVALID, 0}; /* Terminal token */ -static const parse_token_t kTerminalToken = {parse_token_type_terminate, -parse_keyword_none, false, false, static_cast<size_t>(-1), - static_cast<size_t>(-1)}; +static const parse_token_t kTerminalToken = {parse_token_type_terminate, parse_keyword_none, false, false, SOURCE_OFFSET_INVALID, 0}; static inline bool is_help_argument(const wchar_t *txt) { @@ -1082,8 +1087,8 @@ static inline parse_token_t next_parse_token(tokenizer_t *tok) result.keyword = keyword_for_token(tok_type, tok_txt); result.has_dash_prefix = (tok_txt[0] == L'-'); result.is_help_argument = result.has_dash_prefix && is_help_argument(tok_txt); - result.source_start = (size_t)tok_start; - result.source_length = tok_extent; + result.source_start = (source_offset_t)tok_start; + result.source_length = (source_offset_t)tok_extent; tok_next(tok); return result; @@ -1195,7 +1200,7 @@ const parse_node_t *parse_node_tree_t::get_child(const parse_node_t &parent, nod const parse_node_t &parse_node_tree_t::find_child(const parse_node_t &parent, parse_token_type_t type) const { - for (size_t i=0; i < parent.child_count; i++) + for (node_offset_t i=0; i < parent.child_count; i++) { const parse_node_t *child = this->get_child(parent, i); if (child->type == type) @@ -1241,7 +1246,7 @@ static void find_nodes_recursive(const parse_node_tree_t &tree, const parse_node if (result->size() < max_count) { if (parent.type == type) result->push_back(&parent); - for (size_t i=0; i < parent.child_count; i++) + for (node_offset_t i=0; i < parent.child_count; i++) { const parse_node_t *child = tree.get_child(parent, i); assert(child != NULL); @@ -1478,7 +1483,7 @@ const parse_node_t *parse_node_tree_t::next_node_in_node_list(const parse_node_t const parse_node_t *next_cursor = NULL; /* Walk through the children */ - for 
(size_t i=0; i < list_cursor->child_count; i++) + for (node_offset_t i=0; i < list_cursor->child_count; i++) { const parse_node_t *child = this->get_child(*list_cursor, i); if (child->type == entry_type) diff --git a/parse_tree.h b/parse_tree.h index f6406be4f..2f0beb293 100644 --- a/parse_tree.h +++ b/parse_tree.h @@ -18,9 +18,15 @@ class parse_node_t; class parse_node_tree_t; -typedef size_t node_offset_t; + +typedef uint32_t node_offset_t; + #define NODE_OFFSET_INVALID (static_cast<node_offset_t>(-1)) +typedef uint32_t source_offset_t; + +#define SOURCE_OFFSET_INVALID (static_cast<source_offset_t>(-1)) + struct parse_error_t { /** Text of the error */ @@ -51,8 +57,8 @@ struct parse_token_t enum parse_keyword_t keyword; // Any keyword represented by this token bool has_dash_prefix; // Hackish: whether the source contains a dash prefix bool is_help_argument; // Hackish: whether the source looks like '-h' or '--help' - size_t source_start; - size_t source_length; + source_offset_t source_start; + source_offset_t source_length; wcstring describe() const; wcstring user_presentable_description() const; @@ -83,35 +89,36 @@ wcstring parse_dump_tree(const parse_node_tree_t &tree, const wcstring &src); wcstring token_type_description(parse_token_type_t type); wcstring keyword_description(parse_keyword_t type); -/** Class for nodes of a parse tree */ +/** Class for nodes of a parse tree. Since there's a lot of these, the size and order of the fields is important. 
*/ class parse_node_t { public: - - /* Type of the node */ - enum parse_token_type_t type; - /* Start in the source code */ - size_t source_start; + source_offset_t source_start; /* Length of our range in the source code */ - size_t source_length; + source_offset_t source_length; /* Parent */ node_offset_t parent; /* Children */ node_offset_t child_start; + + /* Number of children */ uint8_t child_count; /* Which production was used */ uint8_t production_idx; + + /* Type of the node */ + enum parse_token_type_t type; /* Description */ wcstring describe(void) const; /* Constructor */ - explicit parse_node_t(parse_token_type_t ty) : type(ty), source_start(-1), source_length(0), parent(NODE_OFFSET_INVALID), child_start(0), child_count(0), production_idx(-1) + explicit parse_node_t(parse_token_type_t ty) : source_start(SOURCE_OFFSET_INVALID), source_length(0), parent(NODE_OFFSET_INVALID), child_start(0), child_count(0), production_idx(-1), type(ty) { } @@ -124,7 +131,7 @@ public: /* Indicate if this node has a range of source code associated with it */ bool has_source() const { - return source_start != (size_t)(-1); + return source_start != SOURCE_OFFSET_INVALID; } /* Gets source for the node, or the empty string if it has no source */ @@ -143,7 +150,6 @@ public: } }; - /* The parse tree itself */ class parse_node_tree_t : public std::vector<parse_node_t> { @@ -200,6 +206,7 @@ public: parse_node_list_t specific_statements_for_job(const parse_node_t &job) const; }; + /* The big entry point. Parse a string, attempting to produce a tree for the given goal type */ bool parse_tree_from_string(const wcstring &str, parse_tree_flags_t flags, parse_node_tree_t *output, parse_error_list_t *errors, parse_token_type_t goal = symbol_job_list);