diff --git a/src/js/config/Categories.js b/src/js/config/Categories.js
index 5120e5c9..09912a95 100755
--- a/src/js/config/Categories.js
+++ b/src/js/config/Categories.js
@@ -187,6 +187,7 @@ var Categories = [
             "Extract dates",
             "Regular expression",
             "XPath expression",
+            "CSS selector",
         ]
     },
     {
diff --git a/src/js/config/OperationConfig.js b/src/js/config/OperationConfig.js
index dec017e0..98539c3a 100755
--- a/src/js/config/OperationConfig.js
+++ b/src/js/config/OperationConfig.js
@@ -1911,6 +1911,24 @@ var OperationConfig = {
             }
         ]
     },
+    "CSS selector": {
+        description: "Extract information from an HTML document with a CSS selector",
+        run: Extract.run_css_query,
+        input_type: "string",
+        output_type: "string",
+        args: [
+            {
+                name: "CSS selector",
+                type: "string",
+                value: Extract.SELECTOR_INITIAL
+            },
+            {
+                name: "Delimiter",
+                type: "binary_short_string",
+                value: Extract.CSS_QUERY_DELIMITER
+            }
+        ]
+    },
     "From UNIX Timestamp": {
         description: "Converts a UNIX timestamp to a datetime string.

e.g. 978346800 becomes Mon 1 January 2001 11:00:00 UTC",
         run: DateTime.run_from_unix_timestamp,
diff --git a/src/js/operations/Extract.js b/src/js/operations/Extract.js
index 6ee0b071..5582baee 100755
--- a/src/js/operations/Extract.js
+++ b/src/js/operations/Extract.js
@@ -314,8 +314,8 @@ var Extract = {
      * @returns {string}
      */
     run_xpath:function(input, args) {
-        var query = args[0];
-        var delimiter = args[1];
+        const query = args[0];
+        const delimiter = args[1];
 
         try {
             var xml = $.parseXML(input);
@@ -329,7 +329,7 @@ var Extract = {
             return "Invalid XPath. Details:\n" + err.message;
         }
 
-        var serializer = new XMLSerializer();
+        const serializer = new XMLSerializer();
         const nodeToString = function(node) {
             const { nodeType, value, wholeText, data } = node;
             switch (nodeType) {
@@ -344,5 +344,57 @@ var Extract = {
         return Object.values(result).slice(0, -1) // all values except last (length)
             .map(nodeToString)
             .join(delimiter);
-    }
+    },
+
+
+    /**
+     * @constant
+     * @default
+     */
+    SELECTOR_INITIAL: "",
+    /**
+     * @constant
+     * @default
+     */
+    CSS_QUERY_DELIMITER: "\\n",
+
+    /**
+     * Extract information (from an HTML document) with a CSS selector.
+     *
+     * @param {string} input - HTML source to search
+     * @param {Object[]} args - [CSS selector, delimiter placed between matches]
+     * @returns {string} outerHTML of every matching element joined by the
+     *     delimiter, or an error message if parsing or the selector fails
+     */
+    run_css_query: function(input, args) {
+        const query = args[0];
+        const delimiter = args[1];
+
+        // An empty selector is a syntax error for querySelectorAll; with nothing
+        // to select (or nothing to select from) the output is simply empty.
+        if (!query || !input) {
+            return "";
+        }
+
+        let html;
+        try {
+            // Parse into a separate, full document so that elements at the top
+            // level of the input are reachable by the selector too.
+            html = new DOMParser().parseFromString(input, "text/html");
+        } catch (err) {
+            return "Invalid input HTML.";
+        }
+
+        let result;
+        try {
+            result = html.querySelectorAll(query);
+        } catch (err) {
+            return "Invalid CSS Selector. Details:\n" + err.message;
+        }
+
+        // A selector only ever matches elements, so outerHTML is always defined.
+        return Array.from(result, function(element) {
+            return element.outerHTML;
+        }).join(delimiter);
+    }
 };