diff options
author | elioat <elioat@tilde.institute> | 2024-06-23 13:19:41 -0400 |
---|---|---|
committer | elioat <elioat@tilde.institute> | 2024-06-23 13:19:41 -0400 |
commit | dff45625c50d2314c66199fc9df6071409ca4afe (patch) | |
tree | cc457ca09beea94d99c7b504b64445c64d7626e7 /js/fsa-tokenizer.js | |
parent | 20b3050a18b0902d8620f3dbd82d9fc4c7a61cbd (diff) | |
download | tour-dff45625c50d2314c66199fc9df6071409ca4afe.tar.gz |
*
Diffstat (limited to 'js/fsa-tokenizer.js')
-rw-r--r-- | js/fsa-tokenizer.js | 74 |
1 files changed, 74 insertions, 0 deletions
const TokenizerFSA = (() => {
  // Finite-state-machine states for the tokenizer.
  const states = {
    START: 'START',         // between tokens (whitespace or start of input)
    IN_WORD: 'IN_WORD',     // accumulating an alphabetic word
    IN_NUMBER: 'IN_NUMBER', // accumulating a run of digits
    IN_SYMBOL: 'IN_SYMBOL'  // accumulating punctuation/symbol characters
  };

  // Character-class predicates.
  const isLetter = char => /[a-zA-Z]/.test(char);
  const isDigit = char => /\d/.test(char);
  // A symbol is any non-word, non-whitespace character (e.g. ',', '!', '+').
  const isSymbol = char => /\W/.test(char) && !/\s/.test(char);

  // Push a completed token onto the list; empty strings are ignored.
  const addToken = (token, tokens) => {
    if (token) tokens.push(token);
  };

  /**
   * Advance the FSA by one character.
   *
   * @param {string} state  - current state (one of `states`)
   * @param {string} char   - the character being consumed
   * @param {string} token  - the token accumulated so far
   * @param {string[]} tokens - output list (mutated when a token completes)
   * @returns {{state: string, token: string}} the next state and accumulator
   *
   * BUG FIX vs. the original: (1) whitespace in START used to fall through
   * the switch and return `undefined`, crashing the caller on `result.state`;
   * (2) the character that terminated a token was discarded, so delimiters
   * like ',' and '!' never appeared in the output. The terminating character
   * is now reprocessed from START so it starts its own token.
   */
  const processChar = (state, char, token, tokens) => {
    switch (state) {
      case states.START:
        if (isLetter(char)) return { state: states.IN_WORD, token: char };
        if (isDigit(char)) return { state: states.IN_NUMBER, token: char };
        if (isSymbol(char)) return { state: states.IN_SYMBOL, token: char };
        // Whitespace (or any unclassified char): stay in START, no token.
        return { state: states.START, token: '' };

      case states.IN_WORD:
        if (isLetter(char)) return { state, token: token + char };
        addToken(token, tokens);
        // Reprocess the terminating char from START so it is not lost.
        return processChar(states.START, char, '', tokens);

      case states.IN_NUMBER:
        if (isDigit(char)) return { state, token: token + char };
        addToken(token, tokens);
        return processChar(states.START, char, '', tokens);

      case states.IN_SYMBOL:
        if (isSymbol(char)) return { state, token: token + char };
        addToken(token, tokens);
        return processChar(states.START, char, '', tokens);

      default:
        return { state: states.START, token: '' };
    }
  };

  /**
   * Tokenize `text` into words ([a-zA-Z]+ runs), numbers (\d+ runs) and
   * symbol runs; whitespace separates tokens and is discarded.
   *
   * @param {string} text - input string to tokenize
   * @returns {string[]} tokens in input order
   */
  const tokenize = text => {
    let state = states.START;
    let token = '';
    const tokens = [];

    for (const char of text) {
      const result = processChar(state, char, token, tokens);
      state = result.state;
      token = result.token;
    }

    // Flush the final in-progress token, if any.
    addToken(token, tokens);

    return tokens;
  };

  // Expose the tokenize function as a public method
  return { tokenize };
})();

// Example usage
const text = "Hello, world! 123";
const tokens = TokenizerFSA.tokenize(text);
console.log(tokens); // Output: ['Hello', ',', 'world', '!', '123']