From 2b36eee9b13eb16fb2e4b05d7e26f6d09f431912 Mon Sep 17 00:00:00 2001
From: Kartik Agaram <vc@akkartik.com>
Date: Sat, 22 Sep 2018 21:56:00 -0700
Subject: 4502 - support string literals directly in code

Doesn't de-duplicate in the data segment, though. If you use the literal
"foo" a hundred times in your code segment you're gonna spend a hundred
times the space you need to.

We can now simplify our test harness a bit in the factorial app, but we
still have to manually spell out the commandline args we compare against.
We only support length-prefixed strings, not null-terminated ones.
---
 subx/011run.cc                |  29 ++++--
 subx/038---literal_strings.cc | 215 ++++++++++++++++++++++++++++++++++++++++++
 subx/040---tests.cc           |  28 +-----
 subx/apps/factorial.subx      |  16 +---
 4 files changed, 243 insertions(+), 45 deletions(-)
 create mode 100644 subx/038---literal_strings.cc

(limited to 'subx')

diff --git a/subx/011run.cc b/subx/011run.cc
index 2549ae04..d3963e3e 100644
--- a/subx/011run.cc
+++ b/subx/011run.cc
@@ -138,6 +138,7 @@ void parse(istream& fin, program& out) {
     getline(fin, line_data);
     curr.original = line_data;
     trace(99, "parse") << "line: " << line_data << end();
+    // End Line Parsing Special-cases(line_data -> l)
     istringstream lin(line_data);
     while (has_data(lin)) {
       string word_data;
@@ -166,14 +167,8 @@ void parse(istream& fin, program& out) {
         break;
       }
       curr.words.push_back(word());
-      curr.words.back().original = word_data;
-      istringstream win(word_data);
-      if (getline(win, curr.words.back().data, '/')) {
-        string m;
-        while (getline(win, m, '/'))
-          curr.words.back().metadata.push_back(m);
-      }
-      trace(99, "parse") << "new word: " << curr.words.back().data << end();
+      parse_word(word_data, curr.words.back());
+      trace(99, "parse") << "word: " << to_string(curr.words.back());
     }
     if (!curr.words.empty())
       l.push_back(curr);
@@ -186,6 +181,24 @@ void parse(istream& fin, program& out) {
   trace(99, "parse") << "done" << end();
 }
 
+// Split 'data' on '/': the first field becomes out.data and every later field
+// is appended to out.metadata. The unsplit text is saved in out.original.
+void parse_word(const string& data, word& out) {
+  out.original = data;
+  istringstream fields(data);
+  if (!getline(fields, out.data, '/')) return;  // no first field to read
+  for (string meta;  getline(fields, meta, '/');  )
+    out.metadata.push_back(meta);
+}
+
+// Render a word as text: its data, then each metadata item preceded by " /".
+string to_string(const word& w) {
+  string result = w.data;
+  for (int i = 0;  i < SIZE(w.metadata);  ++i)
+    result += " /" + w.metadata.at(i);
+  return result;
+}
+
 //:: transform
 
 :(before "End Types")
diff --git a/subx/038---literal_strings.cc b/subx/038---literal_strings.cc
new file mode 100644
index 00000000..97542f43
--- /dev/null
+++ b/subx/038---literal_strings.cc
@@ -0,0 +1,215 @@
+//: Allow instructions to mention literals directly.
+//:
+//: This layer will transparently move them to the global segment (assumed to
+//: always be the second segment).
+
+:(scenario transform_literal_string)
+% Mem_offset = CODE_START;
+% Mem.resize(AFTER_STACK - CODE_START);
+== code
+  b8/copy "test"/imm32  # copy to EAX
++transform: -- move literal strings to data segment
++transform: adding global variable '__subx_global_1' containing "test"
++transform: instruction after transform: 'b8 __subx_global_1'
+
+//: We don't rely on any transforms running in previous layers, but this layer
+//: knows about labels and global variables and will emit them for previous
+//: layers to transform.
+:(after "Begin Transforms")
+// Begin Level-3 Transforms
+Transform.push_back(transform_literal_strings);
+// End Level-3 Transforms
+
+:(before "End Globals")
+int Next_auto_global = 1;
+:(code)
+// Replace each string literal in the code segment (segment 0) with a fresh
+// auto-generated global name, and append the literal's bytes to the data segment.
+void transform_literal_strings(program& p) {
+  trace(99, "transform") << "-- move literal strings to data segment" << end();
+  if (p.segments.empty()) return;
+  segment& code = p.segments.at(0);
+  segment data;
+  for (int i = 0;  i < SIZE(code.lines);  ++i) {
+    line& inst = code.lines.at(i);
+    for (int j = 0;  j < SIZE(inst.words);  ++j) {
+      word& curr = inst.words.at(j);
+      // data is empty for a word that starts with '/', so check before indexing
+      if (curr.data.empty() || curr.data.at(0) != '"') continue;
+      ostringstream global_name;
+      global_name << "__subx_global_" << Next_auto_global;
+      ++Next_auto_global;
+      add_global_to_data_segment(global_name.str(), curr, data);
+      curr.data = global_name.str();
+    }
+    trace(99, "transform") << "instruction after transform: '" << data_to_string(inst) << "'" << end();
+  }
+  if (data.lines.empty()) return;
+  if (SIZE(p.segments) < 2) p.segments.resize(2);  // create the data segment on demand
+  vector<line>& existing_data = p.segments.at(1).lines;
+  existing_data.insert(existing_data.end(), data.lines.begin(), data.lines.end());
+}
+
+// Append to 'data' a label 'name' followed by a length-prefixed array holding
+// the contents of the quoted literal in value.data (surrounding quotes excluded).
+void add_global_to_data_segment(const string& name, const word& value, segment& data) {
+  trace(99, "transform") << "adding global variable '" << name << "' containing " << value.data << end();
+  const int payload_size = SIZE(value.data) - /*both quotes*/2;
+  data.lines.push_back(label(name));
+  data.lines.push_back(line());  // size prefix
+  emit_hex_bytes(data.lines.back(), payload_size, 4/*bytes*/);
+  data.lines.push_back(line());  // payload, one word per byte
+  line& bytes = data.lines.back();
+  for (int i = /*skip start quote*/1;  i <= payload_size;  ++i) {
+    const char c = value.data.at(i);
+    bytes.words.push_back(word());
+    bytes.words.back().data = hex_byte_to_string(c);
+    bytes.words.back().metadata.push_back(string(1, c));  // the character itself, as metadata
+  }
+}
+
+line label(string s) {
+  line result;
+  result.words.resize(1);  // a single default-constructed word
+  result.words.at(0).data = s + ":";  // label name followed by a colon
+  return result;
+}
+
+//: Within strings, whitespace is significant. So we need to redo our instruction
+//: parsing.
+
+:(scenarios parse_instruction_character_by_character)
+:(scenario instruction_with_string_literal)
+a "abc  def" z  # two spaces inside string
++parse2: word: a
++parse2: word: "abc  def"
++parse2: word: z
+# no other words
+$parse2: 3
+
+:(before "End Line Parsing Special-cases(line_data -> l)")
+if (line_data.find('"') != string::npos) {  // can cause false-positives, but we can handle them
+  parse_instruction_character_by_character(line_data, l);
+  continue;
+}
+
+:(code)
+void parse_instruction_character_by_character(const string& line_data, vector<line>& out) {
+  // tokenize one character at a time so whitespace inside "..." stays within a single word
+  istringstream in(line_data);
+  in >> std::noskipws;
+  line result;
+  result.original = line_data;  // parse() records the source text of other lines; don't lose it here
+  while (has_data(in)) {  // add tokens (words or strings) one by one
+    skip_whitespace(in);
+    if (!has_data(in)) break;
+    char c = in.get();
+    if (c == '#') break;  // comment; drop rest of line
+    if (c == ':') break;  // line metadata; skip for now
+    if (c == '.') {
+      if (!has_data(in)) break;  // comment token at end of line
+      if (isspace(in.peek()))
+        continue;  // '.' followed by space is comment token; skip
+    }
+    ostringstream w;
+    w << c;
+    if (c == '"') {
+      // slurp until '"'
+      while (has_data(in)) {
+        in >> c;
+        w << c;
+        if (c == '"') break;
+      }
+    }
+    // slurp any remaining characters until whitespace
+    while (!isspace(in.peek()) && has_data(in)) {  // peek can sometimes trigger eof(), so do it first
+      in >> c;
+      w << c;
+    }
+    result.words.push_back(word());
+    parse_word(w.str(), result.words.back());
+    trace(99, "parse2") << "word: " << to_string(result.words.back()) << end();
+  }
+  if (!result.words.empty())
+    out.push_back(result);
+}
+
+// Consume characters from 'in' until the next one is not whitespace or no
+// data remains.
+void skip_whitespace(istream& in) {
+  while (has_data(in) && isspace(in.peek()))
+    in.get();
+}
+
+// If the next character is '#', consume it and everything after it up to (but
+// not including) the next newline.
+void skip_comment(istream& in) {
+  if (!has_data(in) || in.peek() != '#') return;
+  do { in.get(); } while (has_data(in) && in.peek() != '\n');
+}
+
+// helper for tests: parses one line purely for its trace and discards the parsed words
+void parse_instruction_character_by_character(const string& line_data) {
+  vector<line> out;
+  parse_instruction_character_by_character(line_data, out);
+}
+
+:(scenario parse2_comment_token_in_middle)
+a . z
++parse2: word: a
++parse2: word: z
+-parse2: word: .
+# no other words
+$parse2: 2
+
+:(scenario parse2_word_starting_with_dot)
+a .b c
++parse2: word: a
++parse2: word: .b
++parse2: word: c
+
+:(scenario parse2_comment_token_at_start)
+. a b
++parse2: word: a
++parse2: word: b
+-parse2: word: .
+
+:(scenario parse2_comment_token_at_end)
+a b .
++parse2: word: a
++parse2: word: b
+-parse2: word: .
+
+:(scenario parse2_word_starting_with_dot_at_start)
+.a b c
++parse2: word: .a
++parse2: word: b
++parse2: word: c
+
+:(scenario parse2_metadata)
+.a b/c d
++parse2: word: .a
++parse2: word: b /c
++parse2: word: d
+
+:(scenario parse2_string_with_metadata)
+a "bc  def"/disp32 g
++parse2: word: a
++parse2: word: "bc  def" /disp32
++parse2: word: g
+
+:(scenario parse2_string_with_metadata_at_end)
+a "bc  def"/disp32
++parse2: word: a
++parse2: word: "bc  def" /disp32
+
+:(code)
+void test_parse2_string_with_metadata_at_end_of_line_without_newline() {
+  parse_instruction_character_by_character(
+      "68/push \"test\"/f"  // no newline, which is how calls from parse() will look
+  );
+  CHECK_TRACE_CONTENTS(
+      "parse2: word: 68 /push"  // NOTE(review): no separator before the next literal; adjacent literals concatenate -- confirm the delimiter CHECK_TRACE_CONTENTS expects
+      "parse2: word: \"test\" /f"
+  );
+}
diff --git a/subx/040---tests.cc b/subx/040---tests.cc
index 10356174..b0bcef38 100644
--- a/subx/040---tests.cc
+++ b/subx/040---tests.cc
@@ -1,14 +1,11 @@
-//: Beginning of level 3: support for automatically aggregating functions into
-//: test suites.
-//:
-//: (As explained in the transform layer, level 3 runs before level 2. We
-//: can't use any of the transforms in previous layers. But we *do* rely on
-//: those concepts being present in the input. Particularly labels.)
+//: Automatically aggregating functions into test suites.
 
+//: We don't rely on any transforms running in previous layers, but this layer
+//: knows about labels and will emit labels for previous layers to transform.
 :(after "Begin Transforms")
-// Begin Level-3 Transforms
+// Begin Level-4 Transforms
 Transform.push_back(create_test_function);
-// End Level-3 Transforms
+// End Level-4 Transforms
 
 :(scenario run_test)
 % Reg[ESP].u = 0x100;
@@ -59,21 +56,6 @@ string to_string(const segment& s) {
   return out.str();
 }
 
-string to_string(const word& w) {
-  ostringstream out;
-  out << w.data;
-  for (int i = 0;  i < SIZE(w.metadata);  ++i)
-    out << '/' << w.metadata.at(i);
-  return out.str();
-}
-
-line label(string s) {
-  line result;
-  result.words.push_back(word());
-  result.words.back().data = (s+":");
-  return result;
-}
-
 line call(string s) {
   line result;
   result.words.push_back(call());
diff --git a/subx/apps/factorial.subx b/subx/apps/factorial.subx
index c8de4790..9aef6634 100644
--- a/subx/apps/factorial.subx
+++ b/subx/apps/factorial.subx
@@ -94,7 +94,7 @@ test_factorial:
   75/jump-if-unequal              .               .             .           .             .           .           $test_factorial:else/disp8
     # print('.')
       # push args
-  68/push  Test_passed/imm32
+  68/push  "."/imm32
       # call
   e8/call  write_stderr/disp32
       # discard arg
@@ -105,7 +105,7 @@ test_factorial:
 $test_factorial:else:
     # print('F')
       # push args
-  68/push  Test_failed/imm32
+  68/push  "F"/imm32
       # call
   e8/call  write_stderr/disp32
       # discard arg
@@ -181,16 +181,4 @@ Test_argv:  # null-terminated
   # data
   74/t 65/e 73/s 74/t 00/null
 
-Test_passed:
-  # size
-  01 00 00 00
-  # data
-  2e/dot
-
-Test_failed:
-  # size
-  01 00 00 00
-  # data
-  46/F
-
 # vim:ft=subx:nowrap:so=0
-- 
cgit 1.4.1-2-gfad0