class TokenizerFSA:
    """A small finite-state tokenizer that splits text into runs of
    letters, digits, and symbol characters. Whitespace separates tokens
    and is discarded."""

    def __init__(self):
        self.state = 'START'
        self.token = ''
        self.tokens = []

    def is_letter(self, char):
        return char.isalpha()

    def is_digit(self, char):
        return char.isdigit()

    def is_symbol(self, char):
        # Anything that is neither alphanumeric nor whitespace.
        return not char.isalnum() and not char.isspace()

    def add_token(self):
        # Flush the token built so far, if any.
        if self.token:
            self.tokens.append(self.token)
            self.token = ''

    def step(self, char):
        # Advance the automaton by one character.
        if self.state == 'START':
            if self.is_letter(char):
                self.state = 'IN_WORD'
                self.token += char
            elif self.is_digit(char):
                self.state = 'IN_NUMBER'
                self.token += char
            elif self.is_symbol(char):
                self.state = 'IN_SYMBOL'
                self.token += char
        elif self.state == 'IN_WORD':
            if self.is_letter(char):
                self.token += char
            else:
                self.add_token()
                self.state = 'START'
                if not char.isspace():
                    self.step(char)  # Reprocess this character from START
        elif self.state == 'IN_NUMBER':
            if self.is_digit(char):
                self.token += char
            else:
                self.add_token()
                self.state = 'START'
                if not char.isspace():
                    self.step(char)  # Reprocess this character from START
        elif self.state == 'IN_SYMBOL':
            if self.is_symbol(char):
                self.token += char
            else:
                self.add_token()
                self.state = 'START'
                if not char.isspace():
                    self.step(char)  # Reprocess this character from START

    def process(self, text):
        for char in text:
            self.step(char)
        # End of input: flush any remaining token.
        self.add_token()

    def tokenize(self, text):
        # Reset so the same instance can be reused on new input.
        self.state = 'START'
        self.token = ''
        self.tokens = []
        self.process(text)
        return self.tokens
# Example usage
tokenizer = TokenizerFSA()
# text = "Hello, world! 123"
# tokens = tokenizer.tokenize(text)
# print(tokens) # Output: ['Hello', ',', 'world', '!', '123']
t = "this is a test l33t, banana, banana! Bang? Wh00T2? We hope;; 12396 233,973,000"
tt = tokenizer.tokenize(t)
print(tt)