class TokenizerFSA:
    """Finite-state tokenizer: splits text into runs of letters (words),
    runs of digits (numbers), and runs of symbols, skipping whitespace."""

    def __init__(self):
        self.state = 'START'
        self.token = ''
        self.tokens = []

    def is_letter(self, char):
        return char.isalpha()

    def is_digit(self, char):
        return char.isdigit()

    def is_symbol(self, char):
        # Anything that is neither alphanumeric nor whitespace.
        return not char.isalnum() and not char.isspace()

    def add_token(self):
        # Flush the in-progress token, if any, to the output list.
        if self.token:
            self.tokens.append(self.token)
            self.token = ''

    def _start(self, char):
        # From START: classify the first character of a new token and
        # transition into the matching state; whitespace keeps us in START.
        if self.is_letter(char):
            self.state = 'IN_WORD'
            self.token += char
        elif self.is_digit(char):
            self.state = 'IN_NUMBER'
            self.token += char
        elif self.is_symbol(char):
            self.state = 'IN_SYMBOL'
            self.token += char

    def process(self, text):
        for char in text:
            if self.state == 'START':
                self._start(char)
            elif self.state == 'IN_WORD':
                if self.is_letter(char):
                    self.token += char
                else:
                    # Word ended: flush it, then reprocess this character
                    # from START so it can begin the next token. (Recursing
                    # into process(char) here would hit the trailing
                    # add_token() below and flush the new token after a
                    # single character, splitting 'l33t' into 'l','3','3','t'.)
                    self.add_token()
                    self.state = 'START'
                    self._start(char)
            elif self.state == 'IN_NUMBER':
                if self.is_digit(char):
                    self.token += char
                else:
                    self.add_token()
                    self.state = 'START'
                    self._start(char)
            elif self.state == 'IN_SYMBOL':
                if self.is_symbol(char):
                    self.token += char
                else:
                    self.add_token()
                    self.state = 'START'
                    self._start(char)
        # End of input: flush any remaining token.
        self.add_token()

    def tokenize(self, text):
        # Reset the machine so repeated calls don't accumulate tokens
        # or carry state over from a previous run.
        self.state, self.token, self.tokens = 'START', '', []
        self.process(text)
        return self.tokens
# Example usage
tokenizer = TokenizerFSA()
# text = "Hello, world! 123"
# tokens = tokenizer.tokenize(text)
# print(tokens) # Output: ['Hello', ',', 'world', '!', '123']
t = "this is a test l33t, banana, banana! Bang? Wh00T2? We hope;; 12396 233,973,000"
tt = tokenizer.tokenize(t)
print(tt)
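# Expected output (hand-traced against the FSA above): mixed runs such as
# 'l33t' and 'Wh00T2' split at every letter/digit boundary, and the commas
# inside '233,973,000' become separate symbol tokens:
# ['this', 'is', 'a', 'test', 'l', '33', 't', ',', 'banana', ',', 'banana',
#  '!', 'Bang', '?', 'Wh', '00', 'T', '2', '?', 'We', 'hope', ';;', '12396',
#  '233', ',', '973', ',', '000']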