class TokenizerFSA:
    """Finite-state tokenizer that splits text into word, number, and symbol tokens."""

    def __init__(self):
        self.state = 'START'
        self.token = ''
        self.tokens = []

    def is_letter(self, char):
        return char.isalpha()

    def is_digit(self, char):
        return char.isdigit()

    def is_symbol(self, char):
        return not char.isalnum() and not char.isspace()

    def add_token(self):
        # Emit the buffered token, if any, and clear the buffer.
        if self.token:
            self.tokens.append(self.token)
            self.token = ''

    def process_char(self, char):
        # Perform one FSA transition for a single character.
        # Note: re-processing goes through this method rather than process(),
        # so a token boundary never flushes the token that is just starting.
        if self.state == 'START':
            if self.is_letter(char):
                self.state = 'IN_WORD'
                self.token += char
            elif self.is_digit(char):
                self.state = 'IN_NUMBER'
                self.token += char
            elif self.is_symbol(char):
                self.state = 'IN_SYMBOL'
                self.token += char
            # Whitespace in START is skipped.
        elif self.state == 'IN_WORD':
            if self.is_letter(char):
                self.token += char
            else:
                self.add_token()
                self.state = 'START'
                if not char.isspace():
                    self.process_char(char)  # Reprocess this character from START
        elif self.state == 'IN_NUMBER':
            if self.is_digit(char):
                self.token += char
            else:
                self.add_token()
                self.state = 'START'
                if not char.isspace():
                    self.process_char(char)  # Reprocess this character from START
        elif self.state == 'IN_SYMBOL':
            if self.is_symbol(char):
                self.token += char
            else:
                self.add_token()
                self.state = 'START'
                if not char.isspace():
                    self.process_char(char)  # Reprocess this character from START

    def process(self, text):
        for char in text:
            self.process_char(char)
        # End of input: flush any remaining token.
        self.add_token()

    def tokenize(self, text):
        # Reset state so the same instance can tokenize more than one string.
        self.state = 'START'
        self.token = ''
        self.tokens = []
        self.process(text)
        return self.tokens


# Example usage
tokenizer = TokenizerFSA()
# text = "Hello, world! 123"
# tokens = tokenizer.tokenize(text)
# print(tokens)  # Output: ['Hello', ',', 'world', '!', '123']

t = "this is a test l33t, banana, banana! Bang? Wh00T2? We hope;; 12396 233,973,000"
tt = tokenizer.tokenize(t)
print(tt)