https://github.com/akkartik/mu/blob/master/403unicode.mu
  1 # Helpers for Unicode.
  2 #
  3 # Mu has no characters, only code points and graphemes.
  4 # Code points are the indivisible atoms of text streams.
  5 #   https://en.wikipedia.org/wiki/Code_point
  6 # Graphemes are the smallest self-contained unit of text.
  7 # Graphemes may consist of multiple code points.
  8 #
  9 # Mu graphemes are always represented in utf-8, and they are required to fit
 10 # in 4 bytes.
 11 #
 12 # Mu doesn't currently support combining code points, or graphemes made of
 13 # multiple code points. One day we will.
 14 # We also don't currently support code points that translate into multiple
 15 # or wide graphemes. (In particular, Tab will never be supported.)
 16 
 17 # transliterated from tb_utf8_unicode_to_char in https://github.com/nsf/termbox
 18 # https://wiki.tcl-lang.org/page/UTF%2D8+bit+by+bit explains the algorithm
 19 #
 20 # The day we want to support combining characters, this function will need to
 21 # take multiple code points. Or something.
 22 fn to-grapheme in: code-point -> _/eax: grapheme {
 23   var c/eax: int <- copy in
 24   var num-trailers/ecx: int <- copy 0
 25   var first/edx: int <- copy 0
 26   $to-grapheme:compute-length: {
 27     # single byte: just return it
 28     compare c, 0x7f
 29     {
 30       break-if->
 31       var g/eax: grapheme <- copy c
 32       return g
 33     }
 34     # 2 bytes
 35     compare c, 0x7ff
 36     {
 37       break-if->
 38       num-trailers <- copy 1
 39       first <- copy 0xc0pre { line-height: 125%; }
td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; }
span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; }
td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; }
span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; }
.highlight .hll { background-color: #ffffcc }
.highlight .c { color: #888888 } /* Comment */
.highlight .err { color: #a61717; background-color: #e3d2d2 } /* Error */
.highlight .k { color: #008800; font-weight: bold } /* Keyword */
.highlight .ch { color: #888888 } /* Comment.Hashbang */
.highlight .cm { color: #888888 } /* Comment.Multiline */
.highlight .cp { color: #cc0000; font-weight: bold } /* Comment.Preproc */
.highlight .cpf { color: #888888 } /* Comment.PreprocFile */
.highlight .c1 { color: #888888 } /* Comment.Single */
.highlight .cs { color: #cc0000; font-weight: bold; background-color: #fff0f0 } /* Comment.Special */
.highlight .gd { color: #000000; background-color: #ffdddd } /* Generic.Deleted */
.highlight .ge { font-style: italic } /* Generic.Emph */
.highlight .ges { font-weight: bold; font-style: italic } /* Generic.EmphStrong */
.highlight .gr { color: #aa0000 } /* Generic.Error */
.highlight .gh { color: #333333 } /* Generic.Heading */
.highlight .gi { color: #000000; background-color: #ddffdd } /* Generic.Inserted */
.highlight .go { color: #888888 } /* Generic.Output */
.highlight .gp { color: #555555 } /* Generic.Prompt */
.highlight .gs { font-weight: bold } /* Generic.Strong */
.highlight .gu { color: #666666 } /* Generic.Subheading */
.highlight .gt { color: #aa0000 } /* Generic.Traceback */
.highlight .kc { color: #008800; font-weight: bold } /* Keyword.Constant */
.highlight .kd { color: #008800; font-weight: bold } /* Keyword.Declaration */
.highlight .kn { color: #008800; font-weight: bold } /* Keyword.Namespace */
.highlight .kp { color: #008800 } /* Keyword.Pseudo */
.highlight .kr { color: #008800; font-weight: bold } /* Keyword.Reserved */
.highlight .kt { color: #888888; font-weight: bold } /* Keyword.Type */
.highlight .m { color: #0000DD; font-weight: bold } /* Literal.Number */
.highlight .s { color: #dd2200; background-color: #fff0f0 } /* Literal.String */
.highlight .na { color: #336699 } /* Name.Attribute */
.highlight .nb { color: #003388 } /* Name.Builtin */
.highlight .nc { color: #bb0066; font-weight: bold } /* Name.Class */
.highlight .no { color: #003366; font-weight: bold } /* Name.Constant */
.highlight .nd { color: #555555 } /* Name.Decorator */
.highlight .ne { color: #bb0066; font-weight: bold } /* Name.Exception */
.highlight .nf { color: #0066bb; font-weight: bold } /* Name.Function */
.highlight .nl { color: #336699; font-style: italic } /* Name.Label */
.highlight .nn { color: #bb0066; font-weight: bold } /* Name.Namespace */
.highlight .py { color: #336699; font-weight: bold } /* Name.Property */
.highlight .nt { color: #bb0066; font-weight: bold } /* Name.Tag */
.highlight .nv { color: #336699 } /* Name.Variable */
.highlight .ow { color: #008800 } /* Operator.Word */
.highlight .w { color: #bbbbbb } /* Text.Whitespace */
.highlight .mb { color: #0000DD; font-weight: bold } /* Literal.Number.Bin */
.highlight .mf { color: #0000DD; font-weight: bold } /* Literal.Number.Float */
.highlight .mh { color: #0000DD; font-weight: bold } /* Literal.Number.Hex */
.highlight .mi { color: #0000DD; font-weight: bold } /* Literal.Number.Integer */
.highlight .mo { color: #0000DD; font-weight: bold } /* Literal.Number.Oct */
.highlight .sa { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Affix */
.highlight .sb { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Backtick */
.highlight .sc { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Char */
.highlight .dl { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Delimiter */
.highlight .sd { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Doc */
.highlight .s2 { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Double */
.highlight .se { color: #0044dd; background-color: #fff0f0 } /* Literal.String.Escape */
.highlight .sh { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Heredoc */
.highlight .si { color: #3333bb; background-color: #fff0f0 } /* Literal.String.Interpol */
.highlight .sx { color: #22bb22; background-color: #f0fff0 } /* Literal.String.Other */
.highlight .sr { color: #008800; background-color: #fff0ff } /* Literal.String.Regex */
.highlight .s1 { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Single */
.highlight .ss { color: #aa6600; background-color: #fff0f0 } /* Literal.String.Symbol */
.highlight .bp { color: #003388 } /* Name.Builtin.Pseudo */
.highlight .fm { color: #0066bb; font-weight: bold } /* Name.Function.Magic */
.highlight .vc { color: #336699 } /* Name.Variable.Class */
.highlight .vg { color: #dd7700 } /* Name.Variable.Global */
.highlight .vi { color: #3333bb } /* Name.Variable.Instance */
.highlight .vm { color: #336699 } /* Name.Variable.Magic */
.highlight .il { color: #0000DD; font-weight: bold } /* Literal.Number.Integer.Long */
# Copyright (C) 2009, 2010  Roman Zimbelmann <romanz@lavabit.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

# When this file is executed directly (rather than imported), pull in
# init() from the local __init__ module and call it before anything else.
if __name__ == '__main__':
	from __init__ import init
	init()

from os.path import realpath, join, dirname

from ranger import fsobject
from ranger.fsobject.file import File
from ranger.fsobject.directory import Directory
from ranger.shared.settings import SettingsAware

# Run SettingsAware's setup hook once at import time.
# NOTE(review): presumably Directory/File rely on this having run -- confirm.
SettingsAware._setup()

# Resolved absolute path of the 'testdir' directory that sits next to this file.
TESTDIR = realpath(join(dirname(__file__), 'testdir'))
# Scratch file inside TESTDIR; created and removed again to bump TESTDIR's mtime.
TESTFILE = join(TESTDIR, 'testfile5234148')
# A path inside TESTDIR that the tests expect not to exist.
NONEXISTANT_DIR = join(TESTDIR, 'nonexistant')

import unittest
class Test1(unittest.TestCase):
	"""Checks of Directory objects against the TESTDIR fixture directory."""

	def test_initial_condition(self):
		"""A freshly constructed Directory has its path set but nothing loaded."""
		directory = Directory(TESTDIR)

		self.assertEqual(directory.path, TESTDIR)
		self.assertFalse(directory.content_loaded)
		self.assertEqual(directory.filenames, None)
		self.assertEqual(directory.files, None)
		# len() is only meaningful once the content has been loaded
		self.assertRaises(fsobject.NotLoadedYet, len, directory)

	def test_after_content_loaded(self):
		"""After load_content(), filenames and files mirror os.listdir()."""
		import os

		directory = Directory(TESTDIR)
		directory.load_content()

		self.assertTrue(directory.exists)
		self.assertEqual(type(directory.filenames), list)

		# No particular order is expected right after loading, so both
		# the expected paths and the loaded filenames are sorted first.
		expected_paths = sorted(
			os.path.join(TESTDIR, entry) for entry in os.listdir(TESTDIR))
		directory.filenames.sort()

		self.assertTrue(len(directory) > 0)
		self.assertEqual(directory.filenames, expected_paths)

		# Every expected path must have a counterpart in directory.files
		# with the same path and the same stat result.
		for path in expected_paths:
			reference = File(path)
			reference.load()
			matched = any(
				reference.path == loaded.path and reference.stat == loaded.stat
				for loaded in directory.files)
			if not matched:
				self.fail("couldn't find file {0}".format(path))

	def test_nonexistant_dir(self):
		"""Loading a missing directory marks it loaded but not existing."""
		directory = Directory(NONEXISTANT_DIR)
		directory.load_content()

		self.assertTrue(directory.content_loaded)
		self.assertFalse(directory.exists)
		self.assertFalse(directory.accessible)
		self.assertEqual(directory.filenames, None)
		self.assertRaises(fsobject.NotLoadedYet, len, directory)

	def test_load_if_outdated(self):
		"""load_if_outdated() reports a reload once TESTDIR's mtime changed."""
		import os
		import time

		def touch_testdir():
			# creating and deleting a file changes the directory itself
			open(TESTFILE, 'w').close()
			os.unlink(TESTFILE)

		def current_mtime():
			return os.stat(TESTDIR).st_mtime

		directory = Directory(TESTDIR)
		directory.load()

		# A modification within the same mtime tick as the previous one
		# leaves mtime unchanged, so keep modifying (for at most 50 rounds
		# of 0.1s, i.e. about 5 seconds) until the value actually differs.
		baseline = current_mtime()
		mtime_changed = False
		attempts_left = 50
		while attempts_left > 0:
			touch_testdir()
			if baseline != current_mtime():
				mtime_changed = True
				break
			time.sleep(0.1)
			attempts_left -= 1

		if not mtime_changed:
			self.fail(
					"Cannot perform test: mtime of TESTDIR is not being updated.")

		self.assertTrue(directory.load_if_outdated())

# Run every test case in this module when the file is executed directly.
if __name__ == '__main__':
	unittest.main()
# NOTE(review): the beginning of read-grapheme (its signature and the
# 1-, 2- and 3-byte cases) is missing from this copy of the file; only the
# tail below survived. It is reproduced unchanged apart from restoring the
# line breaks -- restore the head from upstream before building.
      break $read-grapheme:compute-length
    }
    # 4 bytes
    compare c, 0xf8
    {
      break-if->=
      num-trailers <- copy 3
      break $read-grapheme:compute-length
    }
    $read-grapheme:abort: {
      # TODO: print to stderr
      print-string-to-real-screen "utf-8 encodings larger than 4 bytes are not supported. First byte seen: "
      var n/eax: int <- copy c
      print-int32-hex-to-real-screen n
      print-string-to-real-screen "\n"
      var exit-status/ebx: int <- copy 1
      syscall_exit
    }
  }
  # prepend trailer bytes
  var result/edi: grapheme <- copy c
  var num-byte-shifts/edx: int <- copy 1
  {
    compare num-trailers, 0
    break-if-<=
    var tmp/eax: byte <- read-byte in
    var tmp2/eax: int <- copy tmp
    # each trailer lands one byte higher than the previous one
    tmp2 <- shift-left-bytes tmp2, num-byte-shifts
    result <- or tmp2
    # update loop state
    num-byte-shifts <- increment
    num-trailers <- decrement
    loop
  }
  return result
}

# Read a mix of 1-, 2- and 3-byte utf-8 sequences back out of a stream and
# check each resulting grapheme. The expected values hold the utf-8 bytes
# with the first byte of the sequence in the lowest-order position.
fn test-read-grapheme {
  var s: (stream byte 0x30)
  var s2/ecx: (addr stream byte) <- address s
  write s2, "aΒc世d界e"
  var c/eax: grapheme <- read-grapheme s2
  var n/eax: int <- copy c
  check-ints-equal n, 0x61, "F - test grapheme/0"
  var c/eax: grapheme <- read-grapheme s2
  var n/eax: int <- copy c
  check-ints-equal n, 0x92ce, "F - test grapheme/1"  # greek capital letter beta
  var c/eax: grapheme <- read-grapheme s2
  var n/eax: int <- copy c
  check-ints-equal n, 0x63, "F - test grapheme/2"
  var c/eax: grapheme <- read-grapheme s2
  var n/eax: int <- copy c
  check-ints-equal n, 0x96b8e4, "F - test grapheme/3"
  var c/eax: grapheme <- read-grapheme s2
  var n/eax: int <- copy c
  check-ints-equal n, 0x64, "F - test grapheme/4"
  var c/eax: grapheme <- read-grapheme s2
  var n/eax: int <- copy c
  check-ints-equal n, 0x8c95e7, "F - test grapheme/5"
  var c/eax: grapheme <- read-grapheme s2
  var n/eax: int <- copy c
  check-ints-equal n, 0x65, "F - test grapheme/6"
}

# Read one utf-8 encoded grapheme from a buffered file.
#
# The first byte decides how many trailer bytes follow:
#   < 0xc0          returned as is, no trailers
#   >= 0xfe         returned as is, no trailers
#   0xc0 .. 0xdf    1 trailer
#   0xe0 .. 0xef    2 trailers
#   0xf0 .. 0xf7    3 trailers
#   0xf8 .. 0xfd    unsupported: print a message and exit with status 1
# Trailers are OR'd into successively higher bytes of the result, so the
# first byte read ends up in the lowest-order byte of the grapheme.
fn read-grapheme-buffered in: (addr buffered-file) -> _/eax: grapheme {
  var c/eax: byte <- read-byte-buffered in
  var num-trailers/ecx: int <- copy 0
  $read-grapheme-buffered:compute-length: {
    # single byte: just return it
    compare c, 0xc0
    {
      break-if->=
      var g/eax: grapheme <- copy c
      return g
    }
    compare c, 0xfe
    {
      break-if-<
      var g/eax: grapheme <- copy c
      return g
    }
    # 2 bytes
    compare c, 0xe0
    {
      break-if->=
      num-trailers <- copy 1
      break $read-grapheme-buffered:compute-length
    }
    # 3 bytes
    compare c, 0xf0
    {
      break-if->=
      num-trailers <- copy 2
      break $read-grapheme-buffered:compute-length
    }
    # 4 bytes
    compare c, 0xf8
    {
      break-if->=
      num-trailers <- copy 3
      break $read-grapheme-buffered:compute-length
    }
    $read-grapheme-buffered:abort: {
      # TODO: print to stderr
      print-string-to-real-screen "utf-8 encodings larger than 4 bytes are not supported. First byte seen: "
      var n/eax: int <- copy c
      print-int32-hex-to-real-screen n
      print-string-to-real-screen "\n"
      var exit-status/ebx: int <- copy 1
      syscall_exit
    }
  }
  # prepend trailer bytes
  var result/edi: grapheme <- copy c
  var num-byte-shifts/edx: int <- copy 1
  {
    compare num-trailers, 0
    break-if-<=
    var tmp/eax: byte <- read-byte-buffered in
    var tmp2/eax: int <- copy tmp
    # each trailer lands one byte higher than the previous one
    tmp2 <- shift-left-bytes tmp2, num-byte-shifts
    result <- or tmp2
    # update loop state
    num-byte-shifts <- increment
    num-trailers <- decrement
    loop
  }
  return result
}

# Shift 'n' left by 'k' whole bytes (8 bits at a time), stopping after at
# most 4 shifts.
# needed because available primitives only shift by a literal/constant number of bits
fn shift-left-bytes n: int, k: int -> _/eax: int {
  var i/ecx: int <- copy 0
  var result/eax: int <- copy n
  {
    compare i, k
    break-if->=
    compare i, 4  # only 4 bytes in 32 bits
    break-if->=
    result <- shift-left 8
    i <- increment
    loop
  }
  return result
}

fn test-shift-left-bytes-0 {
  var result/eax: int <- shift-left-bytes 1, 0
  check-ints-equal result, 1, "F - shift-left-bytes 0"
}

fn test-shift-left-bytes-1 {
  var result/eax: int <- shift-left-bytes 1, 1
  check-ints-equal result, 0x100, "F - shift-left-bytes 1"
}

fn test-shift-left-bytes-2 {
  var result/eax: int <- shift-left-bytes 1, 2
  check-ints-equal result, 0x10000, "F - shift-left-bytes 2"
}

fn test-shift-left-bytes-3 {
  var result/eax: int <- shift-left-bytes 1, 3
  check-ints-equal result, 0x1000000, "F - shift-left-bytes 3"
}

fn test-shift-left-bytes-4 {
  var result/eax: int <- shift-left-bytes 1, 4
  check-ints-equal result, 0, "F - shift-left-bytes 4"
}

fn test-shift-left-bytes-5 {
  var result/eax: int <- shift-left-bytes 1, 5
  check-ints-equal result, 0, "F - shift-left-bytes >4"
}

# To run all tests, uncomment this and run:
#   $ ./translate_mu && ./a.elf
#? fn main -> _/ebx: int {
#?   run-tests
#?   r <- copy 0
#? }

# write a grapheme to a stream of bytes
# this is like write-to-stream, except we skip leading 0 bytes
#
# Bytes are emitted lowest-order first. After the first byte, emission stops
# as soon as what remains of the grapheme is 0; at most 4 bytes are written.
fn write-grapheme out: (addr stream byte), g: grapheme {
  $write-grapheme:body: {
    var c/eax: int <- copy g
    append-byte out, c  # first byte is always written
    c <- shift-right 8
    compare c, 0
    break-if-= $write-grapheme:body
    append-byte out, c
    c <- shift-right 8
    compare c, 0
    break-if-= $write-grapheme:body
    append-byte out, c
    c <- shift-right 8
    compare c, 0
    break-if-= $write-grapheme:body
    append-byte out, c
  }
}