mirror of
https://github.com/gchq/CyberChef
synced 2024-11-15 00:57:08 +00:00
Added RAKE functionality into CC
This commit is contained in:
parent
6ed9d4554a
commit
7cfb5e0b2a
4 changed files with 174 additions and 1 deletions
|
@ -328,7 +328,8 @@
|
|||
"CSS selector",
|
||||
"Extract EXIF",
|
||||
"Extract ID3",
|
||||
"Extract Files"
|
||||
"Extract Files",
|
||||
"RAKE"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
149
src/core/operations/RAKE.mjs
Normal file
149
src/core/operations/RAKE.mjs
Normal file
|
@ -0,0 +1,149 @@
|
|||
/**
|
||||
* @author sw5678
|
||||
* @copyright Crown Copyright 2024
|
||||
* @license Apache-2.0
|
||||
*/
|
||||
|
||||
import Operation from "../Operation.mjs";
|
||||
|
||||
/**
|
||||
* RAKE operation
|
||||
*/
|
||||
class RAKE extends Operation {
|
||||
|
||||
/**
|
||||
* RAKE constructor
|
||||
*/
|
||||
constructor() {
|
||||
super();
|
||||
|
||||
this.name = "RAKE";
|
||||
this.module = "Default";
|
||||
this.description = [
|
||||
"Rapid Keyword Extraction (RAKE)",
|
||||
"<br><br>",
|
||||
"RAKE is a domain-independent keyword extraction algorithm in Natural Language Processing.",
|
||||
"<br><br>",
|
||||
"The list of stop words are from the NLTK python package",
|
||||
].join("\n");
|
||||
this.inputType = "string";
|
||||
this.outputType = "string";
|
||||
this.args = [
|
||||
{
|
||||
name: "Word Delimiter (Regex)",
|
||||
type: "text",
|
||||
value: "\\s"
|
||||
},
|
||||
{
|
||||
name: "Sentence Delimiter (Regex)",
|
||||
type: "text",
|
||||
value: "\\.\\s|\\n"
|
||||
},
|
||||
{
|
||||
name: "Stop Words",
|
||||
type: "text",
|
||||
value: "i,me,my,myself,we,our,ours,ourselves,you,you're,you've,you'll,you'd,your,yours,yourself,yourselves,he,him,his,himself,she,she's,her,hers,herself,it,it's,its,itsef,they,them,their,theirs,themselves,what,which,who,whom,this,that,that'll,these,those,am,is,are,was,were,be,been,being,have,has,had,having,do,does',did,doing,a,an,the,and,but,if,or,because,as,until,while,of,at,by,for,with,about,against,between,into,through,during,before,after,above,below,to,from,up,down,in,out,on,off,over,under,again,further,then,once,here,there,when,where,why,how,all,any,both,each,few,more,most,other,some,such,no,nor,not,only,own,same,so,than,too,very,s,t,can,will,just,don,don't,should,should've,now,d,ll,m,o,re,ve,y,ain,aren,aren't,couldn,couldn't,didn,didn't,doesn,doesn't,hadn,hadn't,hasn,hasn't,haven,haven't,isn,isn't,ma,mightn,mightn't,mustn,mustn't,needn,needn't,shan,shan't,shouldn,shouldn't,wasn,wasn't,weren,weren't,won,won't,wouldn,wouldn't"
|
||||
}
|
||||
];
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} input
|
||||
* @param {Object[]} args
|
||||
* @returns {string}
|
||||
*/
|
||||
run(input, args) {
|
||||
|
||||
// Get delimiter regexs
|
||||
const wordDelim = new RegExp(args[0], "g");
|
||||
const sentDelim = new RegExp(args[1], "g");
|
||||
|
||||
// Deduplicate the stop words and add the empty string
|
||||
const stopWords = args[2].toLowerCase().replace(/ /g, "").split(",").unique();
|
||||
stopWords.push("");
|
||||
|
||||
// Lower case input and remove start and ending whitespace
|
||||
input = input.toLowerCase().trim();
|
||||
|
||||
// Get tokens, token count, and phrases
|
||||
const tokens = [];
|
||||
const wordFrequencies = [];
|
||||
let phrases = [];
|
||||
|
||||
// Build up list of phrases and token counts
|
||||
const sentences = input.split(sentDelim);
|
||||
for (const sent of sentences) {
|
||||
|
||||
// Split sentence into words
|
||||
const splitSent = sent.split(wordDelim);
|
||||
let startIndex = 0;
|
||||
|
||||
for (let i = 0; i < splitSent.length; i++) {
|
||||
const token = splitSent[i];
|
||||
if (stopWords.includes(token)) {
|
||||
// If token is stop word then split to create phrase
|
||||
phrases.push(splitSent.slice(startIndex, i));
|
||||
startIndex = i + 1;
|
||||
} else {
|
||||
// If token is not a stop word add to the count of the list of words
|
||||
if (tokens.includes(token)) {
|
||||
wordFrequencies[tokens.indexOf(token)]+=1;
|
||||
} else {
|
||||
tokens.push(token);
|
||||
wordFrequencies.push(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
phrases.push(splitSent.slice(startIndex));
|
||||
}
|
||||
|
||||
// remove empty phrases
|
||||
phrases = phrases.filter(subArray => subArray.length > 0);
|
||||
|
||||
// Remove duplicate phrases
|
||||
const uniquePhrases = [...new Set(phrases.map(function (phrase) {
|
||||
return phrase.join(" ");
|
||||
}))];
|
||||
phrases = uniquePhrases.map(function (phrase) {
|
||||
return phrase.split(" ");
|
||||
});
|
||||
|
||||
// Generate word_degree_matrix and populate
|
||||
const wordDegreeMatrix = Array.from(Array(tokens.length), _ => Array(tokens.length).fill(0));
|
||||
phrases.forEach(function (phrase) {
|
||||
phrase.forEach(function (word1) {
|
||||
phrase.forEach(function (word2) {
|
||||
wordDegreeMatrix[tokens.indexOf(word1)][tokens.indexOf(word2)]++;
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
// Calculate degree score for each token
|
||||
const degreeScores = Array(tokens.length).fill(0);
|
||||
for (let i=0; i<tokens.length; i++) {
|
||||
let wordDegree = 0;
|
||||
for (let j=0; j<wordDegreeMatrix.length; j++) {
|
||||
wordDegree += wordDegreeMatrix[j][i];
|
||||
}
|
||||
degreeScores[i] = wordDegree / wordFrequencies[i];
|
||||
}
|
||||
|
||||
// Calculate score for each phrase
|
||||
const scores = phrases.map(function (phrase) {
|
||||
let score = 0;
|
||||
phrase.forEach(function (token) {
|
||||
score += degreeScores[tokens.indexOf(token)];
|
||||
});
|
||||
return new Array(score, phrase.join(" "));
|
||||
});
|
||||
scores.sort((a, b) => b[0] - a[0]);
|
||||
scores.unshift(new Array("Scores: ", "Keywords: "));
|
||||
|
||||
// Output works with the 'To Table' functionality already built into CC
|
||||
return scores.map(function (score) {
|
||||
return score.join(", ");
|
||||
}).join("\n");
|
||||
}
|
||||
}
|
||||
|
||||
export default RAKE;
|
|
@ -74,6 +74,7 @@ import "./tests/ParseIPRange.mjs";
|
|||
import "./tests/ParseQRCode.mjs";
|
||||
import "./tests/PEMtoHex.mjs";
|
||||
import "./tests/PowerSet.mjs";
|
||||
import "./tests/RAKE.mjs";
|
||||
import "./tests/Regex.mjs";
|
||||
import "./tests/Register.mjs";
|
||||
import "./tests/Rotate.mjs";
|
||||
|
|
22
tests/operations/tests/RAKE.mjs
Normal file
22
tests/operations/tests/RAKE.mjs
Normal file
|
@ -0,0 +1,22 @@
|
|||
/**
|
||||
* RAKE, Rapid Automatic Keyword Extraction tests.
|
||||
*
|
||||
* @author sw5678
|
||||
* @copyright Crown Copyright 2024
|
||||
* @license Apache-2.0
|
||||
*/
|
||||
import TestRegister from "../../lib/TestRegister.mjs";
|
||||
|
||||
TestRegister.addTests([
|
||||
{
|
||||
"name": "RAKE: Basic Example",
|
||||
"input": "test1 test2. test2",
|
||||
"expectedOutput": "Scores: , Keywords: \n3.5, test1 test2\n1.5, test2",
|
||||
"recipeConfig": [
|
||||
{
|
||||
"op": "RAKE",
|
||||
"args": ["\\s", "\\.\\s|\\n", "i,me,my,myself,we,our"]
|
||||
},
|
||||
],
|
||||
}
|
||||
]);
|
Loading…
Reference in a new issue