From 2b36eee9b13eb16fb2e4b05d7e26f6d09f431912 Mon Sep 17 00:00:00 2001 From: Kartik Agaram Date: Sat, 22 Sep 2018 21:56:00 -0700 Subject: 4502 - support string literals directly in code Doesn't de-duplicate in the data segment, though. If you use the literal "foo" a hundred times in your code segment you're gonna spend a hundred times the space you need to. We can now simplify our test harness a bit in the factorial app, but we still have to put in commandline args to compare with manually. We only support length-prefixed strings, not null-terminated ones. --- subx/011run.cc | 29 ++++-- subx/038---literal_strings.cc | 215 ++++++++++++++++++++++++++++++++++++++++++ subx/040---tests.cc | 28 +----- subx/apps/factorial.subx | 16 +--- 4 files changed, 243 insertions(+), 45 deletions(-) create mode 100644 subx/038---literal_strings.cc (limited to 'subx') diff --git a/subx/011run.cc b/subx/011run.cc index 2549ae04..d3963e3e 100644 --- a/subx/011run.cc +++ b/subx/011run.cc @@ -138,6 +138,7 @@ void parse(istream& fin, program& out) { getline(fin, line_data); curr.original = line_data; trace(99, "parse") << "line: " << line_data << end(); + // End Line Parsing Special-cases(line_data -> l) istringstream lin(line_data); while (has_data(lin)) { string word_data; @@ -166,14 +167,8 @@ void parse(istream& fin, program& out) { break; } curr.words.push_back(word()); - curr.words.back().original = word_data; - istringstream win(word_data); - if (getline(win, curr.words.back().data, '/')) { - string m; - while (getline(win, m, '/')) - curr.words.back().metadata.push_back(m); - } - trace(99, "parse") << "new word: " << curr.words.back().data << end(); + parse_word(word_data, curr.words.back()); + trace(99, "parse") << "word: " << to_string(curr.words.back()); } if (!curr.words.empty()) l.push_back(curr); @@ -186,6 +181,24 @@ void parse(istream& fin, program& out) { trace(99, "parse") << "done" << end(); } +void parse_word(const string& data, word& out) { + out.original = data; + 
istringstream win(data); + if (getline(win, out.data, '/')) { + string m; + while (getline(win, m, '/')) + out.metadata.push_back(m); + } +} + +string to_string(const word& w) { + ostringstream out; + out << w.data; + for (int i = 0; i < SIZE(w.metadata); ++i) + out << " /" << w.metadata.at(i); + return out.str(); +} + //:: transform :(before "End Types") diff --git a/subx/038---literal_strings.cc b/subx/038---literal_strings.cc new file mode 100644 index 00000000..97542f43 --- /dev/null +++ b/subx/038---literal_strings.cc @@ -0,0 +1,215 @@ +//: Allow instructions to mention literals directly. +//: +//: This layer will transparently move them to the global segment (assumed to +//: always be the second segment). + +:(scenario transform_literal_string) +% Mem_offset = CODE_START; +% Mem.resize(AFTER_STACK - CODE_START); +== code + b8/copy "test"/imm32 # copy to EAX ++transform: -- move literal strings to data segment ++transform: adding global variable '__subx_global_1' containing "test" ++transform: instruction after transform: 'b8 __subx_global_1' + +//: We don't rely on any transforms running in previous layers, but this layer +//: knows about labels and global variables and will emit them for previous +//: layers to transform. 
+:(after "Begin Transforms") +// Begin Level-3 Transforms +Transform.push_back(transform_literal_strings); +// End Level-3 Transforms + +:(before "End Globals") +int Next_auto_global = 1; +:(code) +void transform_literal_strings(program& p) { + trace(99, "transform") << "-- move literal strings to data segment" << end(); + if (p.segments.empty()) return; + segment& code = p.segments.at(0); + segment data; + for (int i = 0; i < SIZE(code.lines); ++i) { + line& inst = code.lines.at(i); + for (int j = 0; j < SIZE(inst.words); ++j) { + word& curr = inst.words.at(j); + if (curr.data.at(0) != '"') continue; + ostringstream global_name; + global_name << "__subx_global_" << Next_auto_global; + ++Next_auto_global; + add_global_to_data_segment(global_name.str(), curr, data); + curr.data = global_name.str(); + } + trace(99, "transform") << "instruction after transform: '" << data_to_string(inst) << "'" << end(); + } + if (data.lines.empty()) return; + if (SIZE(p.segments) < 2) { + p.segments.resize(2); + p.segments.at(1).lines.swap(data.lines); + } + vector<line>& existing_data = p.segments.at(1).lines; + existing_data.insert(existing_data.end(), data.lines.begin(), data.lines.end()); +} + +void add_global_to_data_segment(const string& name, const word& value, segment& data) { + trace(99, "transform") << "adding global variable '" << name << "' containing " << value.data << end(); + // emit label + data.lines.push_back(label(name)); + // emit size for size-prefixed array + data.lines.push_back(line()); + emit_hex_bytes(data.lines.back(), SIZE(value.data)-/*skip quotes*/2, 4/*bytes*/); + // emit data byte by byte + data.lines.push_back(line()); + line& curr = data.lines.back(); + for (int i = /*skip start quote*/1; i < SIZE(value.data)-/*skip end quote*/1; ++i) { + char c = value.data.at(i); + curr.words.push_back(word()); + curr.words.back().data = hex_byte_to_string(c); + curr.words.back().metadata.push_back(string(1, c)); + } +} + +line label(string s) { + line result; +
result.words.push_back(word()); + result.words.back().data = (s+":"); + return result; +} + +//: Within strings, whitespace is significant. So we need to redo our instruction +//: parsing. + +:(scenarios parse_instruction_character_by_character) +:(scenario instruction_with_string_literal) +a "abc  def" z # two spaces inside string ++parse2: word: a ++parse2: word: "abc  def" ++parse2: word: z +# no other words +$parse2: 3 + +:(before "End Line Parsing Special-cases(line_data -> l)") +if (line_data.find('"') != string::npos) { // can cause false-positives, but we can handle them + parse_instruction_character_by_character(line_data, l); + continue; +} + +:(code) +void parse_instruction_character_by_character(const string& line_data, vector<line>& out) { + // parse literals + istringstream in(line_data); + in >> std::noskipws; + line result; + // add tokens (words or strings) one by one + while (has_data(in)) { + skip_whitespace(in); + if (!has_data(in)) break; + char c = in.get(); + if (c == '#') break; // comment; drop rest of line + if (c == ':') break; // line metadata; skip for now + if (c == '.') { + if (!has_data(in)) break; // comment token at end of line + if (isspace(in.peek())) + continue; // '.' 
followed by space is comment token; skip + } + ostringstream w; + w << c; + if (c == '"') { + // slurp until '"' + while (has_data(in)) { + in >> c; + w << c; + if (c == '"') break; + } + } + // slurp any remaining characters until whitespace + while (!isspace(in.peek()) && has_data(in)) { // peek can sometimes trigger eof(), so do it first + in >> c; + w << c; + } + result.words.push_back(word()); + parse_word(w.str(), result.words.back()); + trace(99, "parse2") << "word: " << to_string(result.words.back()) << end(); + } + if (!result.words.empty()) + out.push_back(result); +} + +void skip_whitespace(istream& in) { + while (true) { + if (has_data(in) && isspace(in.peek())) in.get(); + else break; + } +} + +void skip_comment(istream& in) { + if (has_data(in) && in.peek() == '#') { + in.get(); + while (has_data(in) && in.peek() != '\n') in.get(); + } +} + +// helper for tests +void parse_instruction_character_by_character(const string& line_data) { + vector<line> out; + parse_instruction_character_by_character(line_data, out); +} + +:(scenario parse2_comment_token_in_middle) +a . z ++parse2: word: a ++parse2: word: z +-parse2: word: . +# no other words +$parse2: 2 + +:(scenario parse2_word_starting_with_dot) +a .b c ++parse2: word: a ++parse2: word: .b ++parse2: word: c + +:(scenario parse2_comment_token_at_start) +. a b ++parse2: word: a ++parse2: word: b +-parse2: word: . + +:(scenario parse2_comment_token_at_end) +a b . ++parse2: word: a ++parse2: word: b +-parse2: word: . 
+ +:(scenario parse2_word_starting_with_dot_at_start) +.a b c ++parse2: word: .a ++parse2: word: b ++parse2: word: c + +:(scenario parse2_metadata) +.a b/c d ++parse2: word: .a ++parse2: word: b /c ++parse2: word: d + +:(scenario parse2_string_with_metadata) +a "bc def"/disp32 g ++parse2: word: a ++parse2: word: "bc def" /disp32 ++parse2: word: g + +:(scenario parse2_string_with_metadata_at_end) +a "bc def"/disp32 ++parse2: word: a ++parse2: word: "bc def" /disp32 + +:(code) +void test_parse2_string_with_metadata_at_end_of_line_without_newline() { + parse_instruction_character_by_character( + "68/push \"test\"/f" // no newline, which is how calls from parse() will look + ); + CHECK_TRACE_CONTENTS( + "parse2: word: 68 /push" + "parse2: word: \"test\" /f" + ); +} diff --git a/subx/040---tests.cc b/subx/040---tests.cc index 10356174..b0bcef38 100644 --- a/subx/040---tests.cc +++ b/subx/040---tests.cc @@ -1,14 +1,11 @@ -//: Beginning of level 3: support for automatically aggregating functions into -//: test suites. -//: -//: (As explained in the transform layer, level 3 runs before level 2. We -//: can't use any of the transforms in previous layers. But we *do* rely on -//: those concepts being present in the input. Particularly labels.) +//: Automatically aggregating functions into test suites. +//: We don't rely on any transforms running in previous layers, but this layer +//: knows about labels and will emit labels for previous layers to transform. 
:(after "Begin Transforms") -// Begin Level-3 Transforms +// Begin Level-4 Transforms Transform.push_back(create_test_function); -// End Level-3 Transforms +// End Level-4 Transforms :(scenario run_test) % Reg[ESP].u = 0x100; @@ -59,21 +56,6 @@ string to_string(const segment& s) { return out.str(); } -string to_string(const word& w) { - ostringstream out; - out << w.data; - for (int i = 0; i < SIZE(w.metadata); ++i) - out << '/' << w.metadata.at(i); - return out.str(); -} - -line label(string s) { - line result; - result.words.push_back(word()); - result.words.back().data = (s+":"); - return result; -} - line call(string s) { line result; result.words.push_back(call()); diff --git a/subx/apps/factorial.subx b/subx/apps/factorial.subx index c8de4790..9aef6634 100644 --- a/subx/apps/factorial.subx +++ b/subx/apps/factorial.subx @@ -94,7 +94,7 @@ test_factorial: 75/jump-if-unequal . . . . . . $test_factorial:else/disp8 # print('.') # push args - 68/push Test_passed/imm32 + 68/push "."/imm32 # call e8/call write_stderr/disp32 # discard arg @@ -105,7 +105,7 @@ test_factorial: $test_factorial:else: # print('F') # push args - 68/push Test_failed/imm32 + 68/push "F"/imm32 # call e8/call write_stderr/disp32 # discard arg @@ -181,16 +181,4 @@ Test_argv: # null-terminated # data 74/t 65/e 73/s 74/t 00/null -Test_passed: - # size - 01 00 00 00 - # data - 2e/dot - -Test_failed: - # size - 01 00 00 00 - # data - 46/F - # vim:ft=subx:nowrap:so=0 -- cgit 1.4.1-2-gfad0