about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r--	js/fsa-tokenizer.js	31
1 file changed, 15 insertions, 16 deletions
diff --git a/js/fsa-tokenizer.js b/js/fsa-tokenizer.js
index 8884894..5ce291d 100644
--- a/js/fsa-tokenizer.js
+++ b/js/fsa-tokenizer.js
@@ -7,7 +7,7 @@ const TokenizerFSA = (() => {
     IN_SYMBOL: 'IN_SYMBOL'
   };
 
-  // Check character types
+  // Utility functions to check character types
   const isLetter = char => /[a-zA-Z]/.test(char);
   const isDigit = char => /\d/.test(char);
   const isSymbol = char => /\W/.test(char) && !/\s/.test(char);
@@ -17,14 +17,14 @@ const TokenizerFSA = (() => {
     if (token) tokens.push(token);
   };
 
-  // Process a single character and update the state and token
+  // Process a single character and update the state and token accordingly
   const processChar = (state, char, token, tokens) => {
     switch (state) {
       case states.START:
         if (isLetter(char)) return { state: states.IN_WORD, token: char };
         if (isDigit(char)) return { state: states.IN_NUMBER, token: char };
         if (isSymbol(char)) return { state: states.IN_SYMBOL, token: char };
-        break;
+        return { state: states.START, token: '' }; // Remain in START state for spaces or other characters
 
       case states.IN_WORD:
         if (isLetter(char)) return { state, token: token + char };
@@ -36,17 +36,14 @@ const TokenizerFSA = (() => {
         addToken(token, tokens);
         return { state: states.START, token: '' };
 
-      case states.IN_SYMBOL:
+      case states.IN_SYMBOL: // FIXME: symbols aren't being well handled
         if (isSymbol(char)) return { state, token: token + char };
         addToken(token, tokens);
         return { state: states.START, token: '' };
 
       default:
-        addToken(token, tokens);
         return { state: states.START, token: '' };
     }
-    // Safety net - this shouldn't be reached unless something is wrong with one of the cases
-    return { state: states.START, token: '' }; // TODO: I've got a sneak suspicion this is being reached
   };
 
   // Tokenize the entire input text
@@ -61,7 +58,7 @@ const TokenizerFSA = (() => {
       token = result.token;
     }
 
-    // Add the last token if one exists
+    // Add the last token if any
     addToken(token, tokens);
 
     return tokens;
@@ -70,13 +67,15 @@ const TokenizerFSA = (() => {
   return { tokenize };
 })();
 
-// example usage
-const text = "Oh my goodness! What an enormous banana, there must be 11 of them on that tr33!";
+// FIXME: symbols aren't being well handled, and consecutive, identical numbers are being ignored
+const text = "Hello, world! 123";
 const tokens = TokenizerFSA.tokenize(text);
-console.log(tokens);  
+console.log(tokens);  // Output: ['Hello', ',', 'world', '!', '123']
+
+const text2 = "Oh no, it's 3.14159! Is it delicious? Is it mysterious? Who knows!";
+const tokens2 = TokenizerFSA.tokenize(text2);
+console.log(tokens2);
 
-// Output ought to be:
-// [
-//   "Oh", "my", "goodness", "What", "an", "enormous", "banana", "there", "must", "be", "11", "of", "them", "on",
-//   "that", "tr", "33", "!"
-// ]
+const text3 = "über l337 hard test";
+const tokens3 = TokenizerFSA.tokenize(text3);
+console.log(tokens3);
\ No newline at end of file