about summary refs log tree commit diff stats
path: root/js/fsa-tokenizer.js
diff options
context:
space:
mode:
authorelioat <elioat@tilde.institute>2024-06-23 13:19:41 -0400
committerelioat <elioat@tilde.institute>2024-06-23 13:19:41 -0400
commitdff45625c50d2314c66199fc9df6071409ca4afe (patch)
treecc457ca09beea94d99c7b504b64445c64d7626e7 /js/fsa-tokenizer.js
parent20b3050a18b0902d8620f3dbd82d9fc4c7a61cbd (diff)
downloadtour-dff45625c50d2314c66199fc9df6071409ca4afe.tar.gz
*
Diffstat (limited to 'js/fsa-tokenizer.js')
-rw-r--r--js/fsa-tokenizer.js74
1 files changed, 74 insertions, 0 deletions
diff --git a/js/fsa-tokenizer.js b/js/fsa-tokenizer.js
new file mode 100644
index 0000000..601583c
--- /dev/null
+++ b/js/fsa-tokenizer.js
@@ -0,0 +1,74 @@
/**
 * TokenizerFSA — a finite-state tokenizer that splits a string into
 * word ([a-zA-Z]+), number (\d+), and symbol (runs of non-word,
 * non-space characters) tokens. Whitespace separates tokens and is
 * discarded.
 *
 * Public API: tokenize(text) -> string[] (tokens in input order).
 */
const TokenizerFSA = (() => {
  // Define the states
  const states = {
    START: 'START',         // between tokens
    IN_WORD: 'IN_WORD',     // accumulating letters
    IN_NUMBER: 'IN_NUMBER', // accumulating digits
    IN_SYMBOL: 'IN_SYMBOL'  // accumulating symbol characters
  };

  // Utility functions to check character types
  const isLetter = char => /[a-zA-Z]/.test(char);
  const isDigit = char => /\d/.test(char);
  const isSymbol = char => /\W/.test(char) && !/\s/.test(char);

  // Add a token to the list if it's not empty
  const addToken = (token, tokens) => {
    if (token) tokens.push(token);
  };

  /**
   * Consume one character and return the next { state, token } pair,
   * pushing any completed token onto `tokens` as a side effect.
   *
   * Fixes vs. the original:
   *  - START now returns { START, '' } for whitespace (and any other
   *    unclassified char such as '_'); previously it fell off the end
   *    of the switch and returned undefined, crashing tokenize() on
   *    the first space.
   *  - A character that terminates the current token is re-dispatched
   *    through START so it begins the next token instead of being
   *    silently dropped (e.g. the ',' right after "Hello").
   */
  const processChar = (state, char, token, tokens) => {
    switch (state) {
      case states.START:
        if (isLetter(char)) return { state: states.IN_WORD, token: char };
        if (isDigit(char)) return { state: states.IN_NUMBER, token: char };
        if (isSymbol(char)) return { state: states.IN_SYMBOL, token: char };
        // Whitespace / unclassified: stay between tokens.
        return { state: states.START, token: '' };

      case states.IN_WORD:
        if (isLetter(char)) return { state, token: token + char };
        addToken(token, tokens);
        return processChar(states.START, char, '', tokens); // reprocess terminator

      case states.IN_NUMBER:
        if (isDigit(char)) return { state, token: token + char };
        addToken(token, tokens);
        return processChar(states.START, char, '', tokens); // reprocess terminator

      case states.IN_SYMBOL:
        if (isSymbol(char)) return { state, token: token + char };
        addToken(token, tokens);
        return processChar(states.START, char, '', tokens); // reprocess terminator

      default:
        return { state: states.START, token: '' };
    }
  };

  /**
   * Tokenize the entire input text.
   * @param {string} text - input to split.
   * @returns {string[]} word/number/symbol tokens in order of appearance.
   */
  const tokenize = text => {
    let state = states.START;
    let token = '';
    const tokens = [];

    for (const char of text) {
      const result = processChar(state, char, token, tokens);
      state = result.state;
      token = result.token;
    }

    // Add the last token if any
    addToken(token, tokens);

    return tokens;
  };

  // Expose the tokenize function as a public method
  return { tokenize };
})();
+
// Example usage
const text = "Hello, world! 123";
const tokens = TokenizerFSA.tokenize(text);
console.log(tokens); // Expected: ['Hello', ',', 'world', '!', '123']