https://github.com/akkartik/mu/blob/main/403unicode.mu
  1 # Helpers for Unicode.
  2 #
  3 # The basic unit for rendering Unicode is the code point.
  4 #   https://en.wikipedia.org/wiki/Code_point
  5 # The glyph a non-cursive font displays may represent multiple code points.
  6 #
  7 # In addition to raw code points (just integers assigned special meaning), Mu
  8 # provides a common encoding as a convenience: code-point-utf8.
  9 
 10 fn test-unicode-serialization-and-deserialization {
 11   var i/ebx: int <- copy 0
 12   var init?/esi: boolean <- copy 1/true
 13   {
 14     compare i, 0x10000  # 32 bits of utf-8 are sufficient for https://en.wikipedia.org/wiki/Plane_(Unicode)#Basic_Multilingual_Plane
 15                         # but not emoji
 16     break-if->=
 17     var c/eax: code-point <- copy i
 18     var _g/eax: code-point-utf8 <- to-utf8 c
 19     var g/ecx: code-point-utf8 <- copy _g
 20     var c2/eax: code-point <- to-code-point g
 21     compare i, c2
 22     {
 23       break-if-=
 24       {
 25         compare init?, 0/false
 26         break-if-=
 27         draw-text-wrapping-right-then-down-from-cursor-over-full-screen 
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en">
	<head>
		<title>andrewyu.org Mailing List Guidelines</title>
		<link rel="stylesheet" href="/plain.css" />
		<link rel="shortcut icon" href="/favicon.ico" type="image/x-icon" />
		<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
	</head>
	<body class="indent">
		<h1><code>andrewyu.org</code> Mailing List Guidelines</h1>
		<p>The following guidelines apply to most <a href="https://mail.andrewyu.org/mailman/listinfo">public mailing lists on <code>andrewyu.org</code></a>.  Different lists may have different specific guidelines.</p>
		<ul>
			<li>
				Be considerate of other subscribers on the mailing lists.
			</li>
			<li>
				Plain text, preferably wrapped at 72 characters per line.
				<br />
				Many subscribers and maintainers read their mail on text-based mailers like mail(1), emacs, mutt, etc., and they often find HTML-formatted messages (or lines that stretch beyond 72 characters) unreadable. Many mailing lists strip messages of MIME content before sending them out to the rest of the list. If you don't use plain text, your messages will be reformatted. If they cannot be reformatted, they will be summarily rejected. Use attachments with care: they will be removed from messages on some lists.
			</li>
			<li>
				Include a useful Subject line.
				<br />
				Messages with an empty Subject will get bounced to the list manager and will take longer to show up. Including a relevant Subject in the message will ensure that more people actually read what you've written. Also, avoid Subject lines with excessive uppercase and exclamations. "Help!" or "I can't get it to work!" are not useful subject lines. Do not change the subject line while on the same topic. You may know what it is regarding, but the rest of us, who get several hundred messages a day, will have no idea.
			</li>
			<li>
				Trim your signature.
				<br />
				Keep the signature lines at the bottom of your mail to a reasonable length.  PGP signatures should be attachments rather than inline.
			</li>
			<li>
				Stay on topic.
				<br />
				Please keep the subject of the post relevant to the topic of the mailing list.
			</li>
			<li>
				Include important information for questions.
				<br />
				Don't waste everyone's time with a hopelessly incomplete question. No one other than you has the information needed to resolve your problem; it is better to provide more information than needed than too little.
			</li>
			<li>
				Respect differences in opinion and philosophy.
				<br />
				Intelligent people may look at the same set of facts and come to very different conclusions. Repeating the same points that didn't convince someone previously rarely changes their mind, and irritates all the other readers.
			</li>
			<li>
				No spam.
			</li>
			<li>
				Interweave quoted original messages with replies.
				<br />
				Post inline, and trim quotes. This means that your replies are interspersed with the original mail, and any irrelevant content is removed (trimmed):
				<pre>From: A
Subject: Re: S
To: B
CC: list

B wrote:
&gt; ...
&gt; Do you think that this is correct?

Yes, I believe so.

&gt; How about iterating through the items and applying the function to each of them?

Sounds good.</pre>
			</li>
			<li>
				Reply to the list.
				<br />
				For most discussions that are not going wildly off-topic and may still be of interest to those on the list, CC the list so others can follow up.  However, if the discussion is becoming irrelevant to the list's topic, reply in private.
			</li>
		</ul>
		<div id="footer">
			<hr />
			<p><a href="/">Andrew Yu's Website</a></p>
		</div>
	</body>
</html>
ss="Special"><- decrement 154 loop 155 } 156 # emit engine 157 result <- shift-left 8 158 result <- or c 159 result <- or first 160 # 161 return result 162 } 163 164 # single-byte code point have identical code-point-utf8s 165 fn test-to-utf8-single-byte { 166 var in-int/ecx: int <- copy 0 167 { 168 compare in-int, 0x7f 169 break-if-> 170 var in/eax: code-point <- copy in-int 171 var out/eax: code-point-utf8 <- to-utf8 in 172 var out-int/eax: int <- copy out 173 check-ints-equal out-int, in-int, "F - test-to-utf8-single-byte" 174 in-int <- increment 175 loop 176 } 177 } 178 179 # byte | byte | byte | byte 180 # smallest 2-byte utf-8 181 fn test-to-utf8-two-bytes-min { 182 var in/eax: code-point <- copy 0x80 # 10 00-0000 183 var out/eax: code-point-utf8 <- to-utf8 in 184 var out-int/eax: int <- copy out 185 check-ints-equal out-int, 0x80c2, "F - to-utf8/2a" # 110 0-0010 10 00-0000 186 } 187 188 # largest 2-byte utf-8 189 fn test-to-utf8-two-bytes-max { 190 var in/eax: code-point <- copy 0x7ff # 1-1111 11-1111 191 var out/eax: code-point-utf8 <- to-utf8 in 192 var out-int/eax: int <- copy out 193 check-ints-equal out-int, 0xbfdf, "F - to-utf8/2b" # 110 1-1111 10 11-1111 194 } 195 196 # smallest 3-byte utf-8 197 fn test-to-utf8-three-bytes-min { 198 var in/eax: code-point <- copy 0x800 # 10-0000 00-0000 199 var out/eax: code-point-utf8 <- to-utf8 in 200 var out-int/eax: int <- copy out 201 check-ints-equal out-int, 0x80a0e0, "F - to-utf8/3a" # 1110 0000 10 10-0000 10 00-0000 202 } 203 204 # largest 3-byte utf-8 205 fn test-to-utf8-three-bytes-max { 206 var in/eax: code-point <- copy 0xffff # 1111 11-1111 11-1111 207 var out/eax: code-point-utf8 <- to-utf8 in 208 var out-int/eax: int <- copy out 209 check-ints-equal out-int, 0xbfbfef, "F - to-utf8/3b" # 1110 1111 10 11-1111 10 11-1111 210 } 211 212 # smallest 4-byte utf-8 213 fn test-to-utf8-four-bytes-min { 214 var in/eax: code-point <- copy 0x10000 # 1-0000 00-0000 00-0000 215 var out/eax: code-point-utf8 <- to-utf8 
in 216 var out-int/eax: int <- copy out 217 check-ints-equal out-int, 0x808090f0, "F - to-utf8/4a" # 1111-0 000 10 01-0000 10 00-0000 10 00-0000 218 } 219 220 # largest 4-byte utf-8 221 fn test-to-utf8-four-bytes-max { 222 var in/eax: code-point <- copy 0x1fffff # 111 11-1111 11-1111 11-1111 223 var out/eax: code-point-utf8 <- to-utf8 in 224 var out-int/eax: int <- copy out 225 check-ints-equal out-int, 0xbfbfbff7, "F - to-utf8/4b" # 1111-0 111 10 11-1111 10 11-1111 10 11-1111 226 } 227 228 # read the next code-point-utf8 from a stream of bytes 229 fn read-code-point-utf8 in: (addr stream byte) -> _/eax: code-point-utf8 { 230 # if at eof, return EOF 231 { 232 var eof?/eax: boolean <- stream-empty? in 233 compare eof?, 0/false 234 break-if-= 235 return 0xffffffff 236 } 237 var c/eax: byte <- read-byte in 238 var num-trailers/ecx: int <- copy 0 239 $read-code-point-utf8:compute-length: { 240 # single byte: just return it 241 compare c, 0xc0 242 { 243 break-if->= 244 var g/eax: code-point-utf8 <- copy c 245 return g 246 } 247 compare c, 0xfe 248 { 249 break-if-< 250 var g/eax: code-point-utf8 <- copy c 251 return g 252 } 253 # 2 bytes 254 compare c, 0xe0 255 { 256 break-if->= 257 num-trailers <- copy 1 258 break $read-code-point-utf8:compute-length 259 } 260 # 3 bytes 261 compare c, 0xf0 262 { 263 break-if->= 264 num-trailers <- copy 2 265 break $read-code-point-utf8:compute-length 266 } 267 # 4 bytes 268 compare c, 0xf8 269 { 270 break-if->= 271 num-trailers <- copy 3 272 break $read-code-point-utf8:compute-length 273 } 274 abort "utf-8 encodings larger than 4 bytes are not yet supported" 275 return 0 276 } 277 # prepend trailer bytes 278 var result/edi: code-point-utf8 <- copy c 279 var num-byte-shifts/edx: int <- copy 1 280 { 281 compare num-trailers, 0 282 break-if-<= 283 var tmp/eax: byte <- read-byte in 284 var tmp2/eax: int <- copy tmp 285 tmp2 <- shift-left-bytes tmp2, num-byte-shifts 286 result <- or tmp2 287 # update loop state 288 num-byte-shifts <- 
increment 289 num-trailers <- decrement 290 loop 291 } 292 return result 293 } 294 295 fn test-read-code-point-utf8 { 296 var s: (stream byte 0x30) 297 var s2/ecx: (addr stream byte) <- address s 298 write s2, "aΒc世d界e" 299 var c/eax: code-point-utf8 <- read-code-point-utf8 s2 300 var n/eax: int <- copy c 301 check-ints-equal n, 0x61, "F - test code-point-utf8/0" 302 var c/eax: code-point-utf8 <- read-code-point-utf8 s2 303 var n/eax: int <- copy c 304 check-ints-equal n, 0x92ce/greek-capital-letter-beta, "F - test code-point-utf8/1" 305 var c/eax: code-point-utf8 <- read-code-point-utf8 s2 306 var n/eax: int <- copy c 307 check-ints-equal n, 0x63, "F - test code-point-utf8/2" 308 var c/eax: code-point-utf8 <- read-code-point-utf8 s2 309 var n/eax: int <- copy c 310 check-ints-equal n, 0x96b8e4, "F - test code-point-utf8/3" 311 var c/eax: code-point-utf8 <- read-code-point-utf8 s2 312 var n/eax: int <- copy c 313 check-ints-equal n, 0x64, "F - test code-point-utf8/4" 314 var c/eax: code-point-utf8 <- read-code-point-utf8 s2 315 var n/eax: int <- copy c 316 check-ints-equal n, 0x8c95e7, "F - test code-point-utf8/5" 317 var c/eax: code-point-utf8 <- read-code-point-utf8 s2 318 var n/eax: int <- copy c 319 check-ints-equal n, 0x65, "F - test code-point-utf8/6" 320 } 321 322 fn utf8-length g: code-point-utf8 -> _/edx: int { 323 { 324 compare g, 0xff 325 break-if-> 326 return 1 327 } 328 { 329 compare g, 0xffff 330 break-if-> 331 return 2 332 } 333 { 334 compare g, 0xffffff 335 break-if-> 336 return 3 337 } 338 return 4 339 } 340 341 # needed because available primitives only shift by a literal/constant number of bits 342 fn shift-left-bytes n: int, k: int -> _/eax: int { 343 var i/ecx: int <- copy 0 344 var result/eax: int <- copy n 345 { 346 compare i, k 347 break-if->= 348 compare i, 4 # only 4 bytes in 32 bits 349 break-if->= 350 result <- shift-left 8 351 i <- increment 352 loop 353 } 354 return result 355 } 356 357 fn test-shift-left-bytes-0 { 358 var result/eax: 
int <- shift-left-bytes 1, 0 359 check-ints-equal result, 1, "F - shift-left-bytes 0" 360 } 361 362 fn test-shift-left-bytes-1 { 363 var result/eax: int <- shift-left-bytes 1, 1 364 check-ints-equal result, 0x100, "F - shift-left-bytes 1" 365 } 366 367 fn test-shift-left-bytes-2 { 368 var result/eax: int <- shift-left-bytes 1, 2 369 check-ints-equal result, 0x10000, "F - shift-left-bytes 2" 370 } 371 372 fn test-shift-left-bytes-3 { 373 var result/eax: int <- shift-left-bytes 1, 3 374 check-ints-equal result, 0x1000000, "F - shift-left-bytes 3" 375 } 376 377 fn test-shift-left-bytes-4 { 378 var result/eax: int <- shift-left-bytes 1, 4 379 check-ints-equal result, 0, "F - shift-left-bytes 4" 380 } 381 382 fn test-shift-left-bytes-5 { 383 var result/eax: int <- shift-left-bytes 1, 5 384 check-ints-equal result, 0, "F - shift-left-bytes >4" 385 } 386 387 # write a code-point-utf8 to a stream of bytes 388 # this is like write-to-stream, except we skip leading 0 bytes 389 fn write-code-point-utf8 out: (addr stream byte), g: code-point-utf8 { 390 $write-code-point-utf8:body: { 391 var c/eax: int <- copy g 392 append-byte out, c # first byte is always written 393 c <- shift-right 8 394 compare c, 0 395 break-if-= $write-code-point-utf8:body 396 append-byte out, c 397 c <- shift-right 8 398 compare c, 0 399 break-if-= $write-code-point-utf8:body 400 append-byte out, c 401 c <- shift-right 8 402 compare c, 0 403 break-if-= $write-code-point-utf8:body 404 append-byte out, c 405 } 406 }