diff --git a/src/core/config/Categories.json b/src/core/config/Categories.json
index 434c8bb619..03f12023b8 100644
--- a/src/core/config/Categories.json
+++ b/src/core/config/Categories.json
@@ -336,6 +336,7 @@
             "Pseudo-Random Number Generator",
             "Sleep",
             "File Tree",
+            "N-gram",
             "Take nth bytes",
             "Drop nth bytes"
         ]
diff --git a/src/core/operations/Ngram.mjs b/src/core/operations/Ngram.mjs
new file mode 100644
index 0000000000..269c788dc5
--- /dev/null
+++ b/src/core/operations/Ngram.mjs
@@ -0,0 +1,77 @@
+/**
+ * @author benjcal [benj.calderon@gmail.com]
+ * @copyright Crown Copyright 2024
+ * @license Apache-2.0
+ */
+
+import Operation from "../Operation.mjs";
+import {JOIN_DELIM_OPTIONS} from "../lib/Delim.mjs";
+
+/**
+ * N-gram operation
+ */
+class Ngram extends Operation {
+
+    /**
+     * Ngram constructor
+     */
+    constructor() {
+        super();
+
+        this.name = "N-gram";
+        this.module = "Default";
+        this.description = "Extracts n-grams from the input text. N-grams are contiguous sequences of n characters from a given text sample.";
+        this.infoURL = "https://wikipedia.org/wiki/N-gram";
+        this.inputType = "string";
+        this.outputType = "string";
+        this.args = [
+            {
+                name: "N-gram size",
+                type: "number",
+                value: 3
+            },
+            {
+                name: "Join delimiter",
+                type: "editableOptionShort",
+                value: JOIN_DELIM_OPTIONS
+            }
+        ];
+    }
+
+    /**
+     * Extracts every contiguous run of n characters from the input.
+     *
+     * Characters are counted as Unicode code points so that a surrogate pair
+     * is never split between two n-grams. The size is coerced to a number and
+     * rounded down; a size that is not a number or is below 1 produces an
+     * empty string, as does a size larger than the input.
+     *
+     * @param {string} input
+     * @param {Object[]} args - N-gram size followed by the join delimiter
+     * @returns {string} The n-grams joined with the delimiter
+     */
+    run(input, args) {
+        const [rawSize, joinDelim] = args;
+
+        // Accept numeric strings and ignore any fractional part.
+        const nGramSize = Math.floor(Number(rawSize));
+
+        // NaN, zero and negative sizes do not describe an n-gram.
+        if (!(nGramSize >= 1)) {
+            return "";
+        }
+
+        // Split by code point rather than by UTF-16 code unit.
+        const chars = Array.from(input);
+
+        const ngrams = [];
+        for (let i = 0; i + nGramSize <= chars.length; i++) {
+            ngrams.push(chars.slice(i, i + nGramSize).join(""));
+        }
+
+        return ngrams.join(joinDelim);
+    }
+
+}
+
+export default Ngram;
diff --git a/tests/operations/index.mjs b/tests/operations/index.mjs
index f147e9e7c7..7487480f67 100644
--- a/tests/operations/index.mjs
+++ b/tests/operations/index.mjs
@@ -114,6 +114,7 @@ import "./tests/MS.mjs";
 import "./tests/MultipleBombe.mjs";
 import "./tests/MurmurHash3.mjs";
 import "./tests/NetBIOS.mjs";
+import "./tests/Ngram.mjs";
 import "./tests/NormaliseUnicode.mjs";
 import "./tests/NTLM.mjs";
 import "./tests/OTP.mjs";
diff --git a/tests/operations/tests/Ngram.mjs b/tests/operations/tests/Ngram.mjs
new file mode 100644
index 0000000000..97af9d125e
--- /dev/null
+++ b/tests/operations/tests/Ngram.mjs
@@ -0,0 +1,67 @@
+/**
+ * Ngram tests.
+ *
+ * @author jg42526
+ *
+ * @copyright Crown Copyright 2017
+ * @license Apache-2.0
+ */
+import TestRegister from "../../lib/TestRegister.mjs";
+
+TestRegister.addTests([
+    {
+        name: "Ngram 3",
+        input: "hello",
+        expectedOutput: "hel,ell,llo",
+        recipeConfig: [
+            {
+                op: "N-gram",
+                args: [3, ","],
+            },
+        ],
+    },
+    {
+        name: "Ngram longer than input",
+        input: "hello",
+        expectedOutput: "",
+        recipeConfig: [
+            {
+                op: "N-gram",
+                args: [6, ","],
+            },
+        ],
+    },
+    {
+        name: "Ngram same length as input",
+        input: "hello",
+        expectedOutput: "hello",
+        recipeConfig: [
+            {
+                op: "N-gram",
+                args: [5, ","],
+            },
+        ],
+    },
+    {
+        name: "Ngram size zero",
+        input: "hello",
+        expectedOutput: "",
+        recipeConfig: [
+            {
+                op: "N-gram",
+                args: [0, ","],
+            },
+        ],
+    },
+    {
+        name: "Ngram keeps surrogate pairs intact",
+        input: "a\u{1F600}b",
+        expectedOutput: "a\u{1F600},\u{1F600}b",
+        recipeConfig: [
+            {
+                op: "N-gram",
+                args: [2, ","],
+            },
+        ],
+    },
+]);