class TokenizerFSA:
    """A simple finite-state tokenizer that splits text into word, number and symbol tokens."""

    def __init__(self):
        self.state = 'START'
        self.token = ''
        self.tokens = []

    def is_letter(self, char):
        return char.isalpha()

    def is_digit(self, char):
        return char.isdigit()

    def is_symbol(self, char):
        return not char.isalnum() and not char.isspace()

    def add_token(self):
        if self.token:
            self.tokens.append(self.token)
            self.token = ''

    def process_char(self, char):
        # Dispatch a single character based on the current state. A character
        # that ends a token is re-dispatched here (not through process()), so
        # the end-of-input flush cannot fire mid-token and split tokens such
        # as "33" or ";;" into single characters.
        if self.state == 'START':
            if self.is_letter(char):
                self.state = 'IN_WORD'
                self.token += char
            elif self.is_digit(char):
                self.state = 'IN_NUMBER'
                self.token += char
            elif self.is_symbol(char):
                self.state = 'IN_SYMBOL'
                self.token += char
            # Whitespace in the START state is skipped.
        elif self.state == 'IN_WORD':
            if self.is_letter(char):
                self.token += char
            else:
                self.add_token()
                self.state = 'START'
                if not char.isspace():
                    self.process_char(char)  # Reprocess this character
        elif self.state == 'IN_NUMBER':
            if self.is_digit(char):
                self.token += char
            else:
                self.add_token()
                self.state = 'START'
                if not char.isspace():
                    self.process_char(char)  # Reprocess this character
        elif self.state == 'IN_SYMBOL':
            if self.is_symbol(char):
                self.token += char
            else:
                self.add_token()
                self.state = 'START'
                if not char.isspace():
                    self.process_char(char)  # Reprocess this character

    def process(self, text):
        for char in text:
            self.process_char(char)
        # End of input, add any remaining token
        self.add_token()

    def tokenize(self, text):
        # Reset internal state so the same instance can tokenize several texts.
        self.state = 'START'
        self.token = ''
        self.tokens = []
        self.process(text)
        return self.tokens

# Example usage
tokenizer = TokenizerFSA()
# text = "Hello, world! 123"
# tokens = tokenizer.tokenize(text)
# print(tokens)  # Output: ['Hello', ',', 'world', '!', '123']

t = "this is a test l33t, banana, banana! Bang? Wh00T2? We hope;; 12396 233,973,000"
tt = tokenizer.tokenize(t)
print(tt)
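
# Because tokenize() resets the tokenizer's state, the same instance can be
# reused for another string; a minimal sanity check using the example above:
print(tokenizer.tokenize("Hello, world! 123"))  # ['Hello', ',', 'world', '!', '123']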