diff options
Diffstat (limited to 'js')
-rw-r--r-- | js/fsa-tokenizer.js | 31 |
1 files changed, 15 insertions, 16 deletions
diff --git a/js/fsa-tokenizer.js b/js/fsa-tokenizer.js index 8884894..5ce291d 100644 --- a/js/fsa-tokenizer.js +++ b/js/fsa-tokenizer.js @@ -7,7 +7,7 @@ const TokenizerFSA = (() => { IN_SYMBOL: 'IN_SYMBOL' }; - // Check character types + // Utility functions to check character types const isLetter = char => /[a-zA-Z]/.test(char); const isDigit = char => /\d/.test(char); const isSymbol = char => /\W/.test(char) && !/\s/.test(char); @@ -17,14 +17,14 @@ const TokenizerFSA = (() => { if (token) tokens.push(token); }; - // Process a single character and update the state and token + // Process a single character and update the state and token accordingly const processChar = (state, char, token, tokens) => { switch (state) { case states.START: if (isLetter(char)) return { state: states.IN_WORD, token: char }; if (isDigit(char)) return { state: states.IN_NUMBER, token: char }; if (isSymbol(char)) return { state: states.IN_SYMBOL, token: char }; - break; + return { state: states.START, token: '' }; // Remain in START state for spaces or other characters case states.IN_WORD: if (isLetter(char)) return { state, token: token + char }; @@ -36,17 +36,14 @@ const TokenizerFSA = (() => { addToken(token, tokens); return { state: states.START, token: '' }; - case states.IN_SYMBOL: + case states.IN_SYMBOL: // FIXME: symbols aren't being well handled if (isSymbol(char)) return { state, token: token + char }; addToken(token, tokens); return { state: states.START, token: '' }; default: - addToken(token, tokens); return { state: states.START, token: '' }; } - // Safety net - this shouldn't be reached unless something is wrong with one of the cases - return { state: states.START, token: '' }; // TODO: I've got a sneak suspicion this is being reached }; // Tokenize the entire input text @@ -61,7 +58,7 @@ const TokenizerFSA = (() => { token = result.token; } - // Add the last token if one exists + // Add the last token if any addToken(token, tokens); return tokens; @@ -70,13 +67,15 @@ const TokenizerFSA = (() => { return { tokenize }; })(); -// example usage -const text = "Oh my goodness! What an enormous banana, there must be 11 of them on that tr33!"; +// FIXME: symbols aren't being well handled, and consecutive, identical numbers are being ignored +const text = "Hello, world! 123"; const tokens = TokenizerFSA.tokenize(text); -console.log(tokens); +console.log(tokens); // Output: ['Hello', ',', 'world', '!', '123'] + +const text2 = "Oh no, it's 3.14159! Is it delicious? Is it mysterious? Who knows!"; +const tokens2 = TokenizerFSA.tokenize(text2); +console.log(tokens2); -// Output ought to be: -// [ -// "Oh", "my", "goodness", "What", "an", "enormous", "banana", "there", "must", "be", "11", "of", "them", "on", -// "that", "tr", "33", "!" -// ] +const text3 = "über l337 hard test"; +const tokens3 = TokenizerFSA.tokenize(text3); +console.log(tokens3); \ No newline at end of file |