const TokenizerFSA = (() => {
  // Define the states
  const states = {
    START: 'START',
    IN_WORD: 'IN_WORD',
    IN_NUMBER: 'IN_NUMBER',
    IN_SYMBOL: 'IN_SYMBOL'
  };

  // Utility functions to check character types
  const isLetter = char => /\p{L}/u.test(char); // Unicode-aware, so e.g. 'ü' counts as a letter
  const isDigit = char => /\d/.test(char);
  const isSymbol = char => !isLetter(char) && !isDigit(char) && !/\s/.test(char);

  // Add a token to the list if it's not empty
  const addToken = (token, tokens) => {
    if (token) tokens.push(token);
  };

  // Process a single character and update the state and token accordingly
  const processChar = (state, char, token, tokens) => {
    switch (state) {
      case states.START:
        if (isLetter(char)) return { state: states.IN_WORD, token: char };
        if (isDigit(char)) return { state: states.IN_NUMBER, token: char };
        if (isSymbol(char)) return { state: states.IN_SYMBOL, token: char };
        return { state: states.START, token: '' }; // Ignore whitespace; remain in START

      case states.IN_WORD:
        if (isLetter(char)) return { state, token: token + char };
        // Flush the finished word, then reprocess the boundary character from
        // START so it isn't silently dropped
        addToken(token, tokens);
        return processChar(states.START, char, '', tokens);

      case states.IN_NUMBER:
        if (isDigit(char)) return { state, token: token + char };
        addToken(token, tokens);
        return processChar(states.START, char, '', tokens);

      case states.IN_SYMBOL:
        if (isSymbol(char)) return { state, token: token + char };
        addToken(token, tokens);
        return processChar(states.START, char, '', tokens);

      default:
        return { state: states.START, token: '' };
    }
  };

  // Tokenize the entire input text
  const tokenize = text => {
    let state = states.START;
    let token = '';
    const tokens = [];

    for (const char of text) {
      const result = processChar(state, char, token, tokens);
      state = result.state;
      token = result.token;
    }

    // Add the last token if any
    addToken(token, tokens);

    return tokens;
  };

  return { tokenize };
})();

const text = "Hello, world! 123";
const tokens = TokenizerFSA.tokenize(text);
console.log(tokens); // Output: ['Hello', ',', 'world', '!', '123']

const text2 = "Oh no, it's 3.14159! Is it delicious? Is it mysterious? Who knows!";
const tokens2 = TokenizerFSA.tokenize(text2);
console.log(tokens2);
// Output: ['Oh', 'no', ',', 'it', "'", 's', '3', '.', '14159', '!',
//          'Is', 'it', 'delicious', '?', 'Is', 'it', 'mysterious', '?', 'Who', 'knows', '!']

const text3 = "über l337 hard test";
const tokens3 = TokenizerFSA.tokenize(text3);
console.log(tokens3); // Output: ['über', 'l', '337', 'hard', 'test']
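
// One extra sanity check (illustrative; text4 is not part of the original tests):
// the FSA should reprocess each boundary character rather than drop it, so
// single-character tokens survive every state transition.
const text4 = "a1b2!?";
const tokens4 = TokenizerFSA.tokenize(text4);
console.log(tokens4); // Output: ['a', '1', 'b', '2', '!?']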