path: root/py/fsa-tokenizer.py
blob: 2d73693ebdda89453abb30ac79fcc1fa5f378481
class TokenizerFSA:
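    """Character-level finite state automaton that splits text into word
    (letter), number (digit), and symbol tokens; whitespace only separates
    tokens and is never emitted."""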
    def __init__(self):
        self.state = 'START'
        self.token = ''
        self.tokens = []

    def is_letter(self, char):
        return char.isalpha()

    def is_digit(self, char):
        return char.isdigit()

    def is_symbol(self, char):
        return not char.isalnum() and not char.isspace()

    def add_token(self):
        if self.token:
            self.tokens.append(self.token)
            self.token = ''

    def handle_char(self, char):
        # Apply one FSA transition for a single character.
        if self.state == 'START':
            if self.is_letter(char):
                self.state = 'IN_WORD'
                self.token += char
            elif self.is_digit(char):
                self.state = 'IN_NUMBER'
                self.token += char
            elif self.is_symbol(char):
                self.state = 'IN_SYMBOL'
                self.token += char
        elif self.state == 'IN_WORD':
            if self.is_letter(char):
                self.token += char
            else:
                self.add_token()
                self.state = 'START'
                if not char.isspace():
                    self.handle_char(char)  # Reprocess this character from START
        elif self.state == 'IN_NUMBER':
            if self.is_digit(char):
                self.token += char
            else:
                self.add_token()
                self.state = 'START'
                if not char.isspace():
                    self.handle_char(char)  # Reprocess this character from START
        elif self.state == 'IN_SYMBOL':
            if self.is_symbol(char):
                self.token += char
            else:
                self.add_token()
                self.state = 'START'
                if not char.isspace():
                    self.handle_char(char)  # Reprocess this character from START

    def process(self, text):
        for char in text:
            self.handle_char(char)
        # End of input, add any remaining token
        self.add_token()

    def tokenize(self, text):
        # Reset before each run so one instance can tokenize several texts.
        self.state = 'START'
        self.token = ''
        self.tokens = []
        self.process(text)
        return self.tokens

# Example usage
tokenizer = TokenizerFSA()
# text = "Hello, world! 123"
# tokens = tokenizer.tokenize(text)
# print(tokens)  # Output: ['Hello', ',', 'world', '!', '123']

t = "this is a test l33t, banana, banana! Bang? Wh00T2? We hope;; 12396 233,973,000"
tt = tokenizer.tokenize(t)
print(tt)
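
# A minimal sanity check added for illustration (not part of the original
# script): it replays the documented example above on a fresh tokenizer.
checker = TokenizerFSA()
assert checker.tokenize("Hello, world! 123") == ['Hello', ',', 'world', '!', '123']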