From a630d7fb9666bbacdd714ddc12d3c42ef1466ba7 Mon Sep 17 00:00:00 2001 From: Kartik Agaram Date: Wed, 13 Feb 2019 00:01:14 -0800 Subject: 4959 It's always seemed ugly to explain the rules for segment names. Let's just always require a fixed name for the code and data segments. --- html/subx/034compute_segment_address.cc.html | 205 +++++++++++++++------------ subx/034compute_segment_address.cc | 39 +++-- subx/Readme.md | 3 +- 3 files changed, 142 insertions(+), 105 deletions(-) diff --git a/html/subx/034compute_segment_address.cc.html b/html/subx/034compute_segment_address.cc.html index 8f579b69..ae9a2461 100644 --- a/html/subx/034compute_segment_address.cc.html +++ b/html/subx/034compute_segment_address.cc.html @@ -71,7 +71,7 @@ if ('onhashchange' in window) { 11 +load: 0x09000056 -> 0b 12 +load: 0x09000057 -> 0c 13 +load: 0x09000058 -> 0d - 14 +run: add imm32 0x0d0c0b0a to reg EAX + 14 +run: add imm32 0x0d0c0b0a to reg EAX 15 +run: storing 0x0d0c0b0a 16 17 //: Update the parser to handle non-numeric segment name. @@ -81,45 +81,45 @@ if ('onhashchange' in window) { 21 //: the new data to existing data for the segment. 
22 23 :(before "End Globals") - 24 map</*name*/string, int> Segment_index; - 25 bool Currently_parsing_named_segment = false; // global to permit cross-layer communication - 26 int Currently_parsing_segment_index = -1; // global to permit cross-layer communication + 24 map</*name*/string, int> Segment_index; + 25 bool Currently_parsing_named_segment = false; // global to permit cross-layer communication + 26 int Currently_parsing_segment_index = -1; // global to permit cross-layer communication 27 :(before "End Reset") - 28 Segment_index.clear(); - 29 Currently_parsing_named_segment = false; - 30 Currently_parsing_segment_index = -1; + 28 Segment_index.clear(); + 29 Currently_parsing_named_segment = false; + 30 Currently_parsing_segment_index = -1; 31 32 :(before "End Segment Parsing Special-cases(segment_title)") 33 if (!starts_with(segment_title, "0x")) { - 34 Currently_parsing_named_segment = true; - 35 if (!contains_key(Segment_index, segment_title)) { - 36 trace(99, "parse") << "new segment '" << segment_title << "'" << end(); - 37 if (segment_title == "code") - 38 put(Segment_index, segment_title, 0); - 39 else if (segment_title == "data") - 40 put(Segment_index, segment_title, 1); - 41 else - 42 put(Segment_index, segment_title, max(2, SIZE(out.segments))); - 43 out.segments.push_back(segment()); - 44 } - 45 else { - 46 trace(99, "parse") << "prepending to segment '" << segment_title << "'" << end(); + 34 Currently_parsing_named_segment = true; + 35 if (!contains_key(Segment_index, segment_title)) { + 36 trace(99, "parse") << "new segment '" << segment_title << "'" << end(); + 37 if (out.segments.empty() && segment_title != "code") { + 38 raise << "first segment must be 'code' but is '" << segment_title << "'\n" << end(); + 39 return; + 40 } + 41 if (SIZE(out.segments) == 1 && segment_title != "data") { + 42 raise << "second segment must be 'data' but is '" << segment_title << "'\n" << end(); + 43 return; + 44 } + 45 put(Segment_index, segment_title, 
SIZE(out.segments)); + 46 out.segments.push_back(segment()); 47 } - 48 Currently_parsing_segment_index = get(Segment_index, segment_title); - 49 } - 50 - 51 :(before "End flush(p, lines) Special-cases") - 52 if (Currently_parsing_named_segment) { - 53 if (p.segments.empty() || Currently_parsing_segment_index < 0) { - 54 raise << "input does not start with a '==' section header\n" << end(); - 55 return; - 56 } - 57 trace(99, "parse") << "flushing to segment" << end(); - 58 vector<line>& curr_segment_data = p.segments.at(Currently_parsing_segment_index).lines; - 59 curr_segment_data.insert(curr_segment_data.begin(), lines.begin(), lines.end()); - 60 lines.clear(); - 61 Currently_parsing_named_segment = false; - 62 Currently_parsing_segment_index = -1; + 48 else { + 49 trace(99, "parse") << "prepending to segment '" << segment_title << "'" << end(); + 50 } + 51 Currently_parsing_segment_index = get(Segment_index, segment_title); + 52 } + 53 + 54 :(before "End flush(p, lines) Special-cases") + 55 if (Currently_parsing_named_segment) { + 56 assert(!p.segments.empty()); + 57 trace(99, "parse") << "flushing to segment" << end(); + 58 vector<line>& curr_segment_data = p.segments.at(Currently_parsing_segment_index).lines; + 59 curr_segment_data.insert(curr_segment_data.begin(), lines.begin(), lines.end()); + 60 lines.clear(); + 61 Currently_parsing_named_segment = false; + 62 Currently_parsing_segment_index = -1; 63 return; 64 } 65 @@ -141,65 +141,84 @@ if ('onhashchange' in window) { 81 +load: 0x0900005c -> 0c 82 +load: 0x0900005d -> 0d 83 - 84 //: compute segment address - 85 - 86 :(before "End Level-2 Transforms") - 87 Transform.push_back(compute_segment_starts); + 84 :(scenario error_on_missing_segment_header) + 85 % Hide_errors = true; + 86 05/add-to-EAX 0/imm32 + 87 +error: input does not start with a '==' section header 88 - 89 :(code) - 90 void compute_segment_starts(program& p) { - 91 trace(99, "transform") << "-- compute segment addresses" << end(); - 92 uint32_t 
p_offset = /*size of ehdr*/0x34 + SIZE(p.segments)*0x20/*size of each phdr*/; - 93 for (size_t i = 0; i < p.segments.size(); ++i) { - 94 segment& curr = p.segments.at(i); - 95 if (curr.start == 0) { - 96 curr.start = CODE_SEGMENT + i*SPACE_FOR_SEGMENT + p_offset; - 97 trace(99, "transform") << "segment " << i << " begins at address 0x" << HEXWORD << curr.start << end(); - 98 } - 99 p_offset += size_of(curr); -100 assert(p_offset < SEGMENT_ALIGNMENT); // for now we get less and less available space in each successive segment -101 } -102 } -103 -104 uint32_t size_of(const segment& s) { -105 uint32_t sum = 0; -106 for (int i = 0; i < SIZE(s.lines); ++i) -107 sum += num_bytes(s.lines.at(i)); -108 return sum; -109 } -110 -111 // Assumes all bitfields are packed. -112 uint32_t num_bytes(const line& inst) { -113 uint32_t sum = 0; -114 for (int i = 0; i < SIZE(inst.words); ++i) -115 sum += size_of(inst.words.at(i)); -116 return sum; -117 } -118 -119 int size_of(const word& w) { -120 if (has_operand_metadata(w, "disp32") || has_operand_metadata(w, "imm32")) -121 return 4; -122 else if (has_operand_metadata(w, "disp16")) -123 return 2; -124 // End size_of(word w) Special-cases -125 else -126 return 1; -127 } -128 -129 //: Dependencies: -130 //: - We'd like to compute segment addresses before setting up global variables, -131 //: because computing addresses for global variables requires knowing where -132 //: the data segment starts. -133 //: - We'd like to finish expanding labels before computing segment addresses, -134 //: because it would make computing the sizes of segments more self-contained -135 //: (num_bytes). -136 //: -137 //: Decision: compute segment addresses before expanding labels, by being -138 //: aware in this layer of certain operand types that will eventually occupy -139 //: multiple bytes. -140 //: -141 //: The layer to expand labels later hooks into num_bytes() to teach this -142 //: layer that labels occupy zero space in the binary. 
+ 89 :(scenario error_on_first_segment_not_code) + 90 % Hide_errors = true; + 91 == data + 92 05 00 00 00 00 + 93 +error: first segment must be 'code' but is 'data' + 94 + 95 :(scenario error_on_second_segment_not_data) + 96 % Hide_errors = true; + 97 == code + 98 05/add-to-EAX 0/imm32 + 99 == bss +100 05 00 00 00 00 +101 +error: second segment must be 'data' but is 'bss' +102 +103 //: compute segment address +104 +105 :(before "End Level-2 Transforms") +106 Transform.push_back(compute_segment_starts); +107 +108 :(code) +109 void compute_segment_starts(program& p) { +110 trace(99, "transform") << "-- compute segment addresses" << end(); +111 uint32_t p_offset = /*size of ehdr*/0x34 + SIZE(p.segments)*0x20/*size of each phdr*/; +112 for (size_t i = 0; i < p.segments.size(); ++i) { +113 segment& curr = p.segments.at(i); +114 if (curr.start == 0) { +115 curr.start = CODE_SEGMENT + i*SPACE_FOR_SEGMENT + p_offset; +116 trace(99, "transform") << "segment " << i << " begins at address 0x" << HEXWORD << curr.start << end(); +117 } +118 p_offset += size_of(curr); +119 assert(p_offset < SEGMENT_ALIGNMENT); // for now we get less and less available space in each successive segment +120 } +121 } +122 +123 uint32_t size_of(const segment& s) { +124 uint32_t sum = 0; +125 for (int i = 0; i < SIZE(s.lines); ++i) +126 sum += num_bytes(s.lines.at(i)); +127 return sum; +128 } +129 +130 // Assumes all bitfields are packed. 
+131 uint32_t num_bytes(const line& inst) { +132 uint32_t sum = 0; +133 for (int i = 0; i < SIZE(inst.words); ++i) +134 sum += size_of(inst.words.at(i)); +135 return sum; +136 } +137 +138 int size_of(const word& w) { +139 if (has_operand_metadata(w, "disp32") || has_operand_metadata(w, "imm32")) +140 return 4; +141 else if (has_operand_metadata(w, "disp16")) +142 return 2; +143 // End size_of(word w) Special-cases +144 else +145 return 1; +146 } +147 +148 //: Dependencies: +149 //: - We'd like to compute segment addresses before setting up global variables, +150 //: because computing addresses for global variables requires knowing where +151 //: the data segment starts. +152 //: - We'd like to finish expanding labels before computing segment addresses, +153 //: because it would make computing the sizes of segments more self-contained +154 //: (num_bytes). +155 //: +156 //: Decision: compute segment addresses before expanding labels, by being +157 //: aware in this layer of certain operand types that will eventually occupy +158 //: multiple bytes. +159 //: +160 //: The layer to expand labels later hooks into num_bytes() to teach this +161 //: layer that labels occupy zero space in the binary. 
diff --git a/subx/034compute_segment_address.cc b/subx/034compute_segment_address.cc index b6c191e4..d37060a4 100644 --- a/subx/034compute_segment_address.cc +++ b/subx/034compute_segment_address.cc @@ -34,12 +34,15 @@ if (!starts_with(segment_title, "0x")) { Currently_parsing_named_segment = true; if (!contains_key(Segment_index, segment_title)) { trace(99, "parse") << "new segment '" << segment_title << "'" << end(); - if (segment_title == "code") - put(Segment_index, segment_title, 0); - else if (segment_title == "data") - put(Segment_index, segment_title, 1); - else - put(Segment_index, segment_title, max(2, SIZE(out.segments))); + if (out.segments.empty() && segment_title != "code") { + raise << "first segment must be 'code' but is '" << segment_title << "'\n" << end(); + return; + } + if (SIZE(out.segments) == 1 && segment_title != "data") { + raise << "second segment must be 'data' but is '" << segment_title << "'\n" << end(); + return; + } + put(Segment_index, segment_title, SIZE(out.segments)); out.segments.push_back(segment()); } else { @@ -50,10 +53,7 @@ if (!starts_with(segment_title, "0x")) { :(before "End flush(p, lines) Special-cases") if (Currently_parsing_named_segment) { - if (p.segments.empty() || Currently_parsing_segment_index < 0) { - raise << "input does not start with a '==' section header\n" << end(); - return; - } + assert(!p.segments.empty()); trace(99, "parse") << "flushing to segment" << end(); vector<line>& curr_segment_data = p.segments.at(Currently_parsing_segment_index).lines; curr_segment_data.insert(curr_segment_data.begin(), lines.begin(), lines.end()); @@ -81,6 +81,25 @@ if (Currently_parsing_named_segment) { +load: 0x0900005c -> 0c +load: 0x0900005d -> 0d +:(scenario error_on_missing_segment_header) +% Hide_errors = true; +05/add-to-EAX 0/imm32 ++error: input does not start with a '==' section header + +:(scenario error_on_first_segment_not_code) +% Hide_errors = true; +== data +05 00 00 00 00 ++error: first segment must be 'code'
but is 'data' + +:(scenario error_on_second_segment_not_data) +% Hide_errors = true; +== code +05/add-to-EAX 0/imm32 +== bss +05 00 00 00 00 ++error: second segment must be 'data' but is 'bss' + //: compute segment address :(before "End Level-2 Transforms") diff --git a/subx/Readme.md b/subx/Readme.md index de7a0fa7..b03176f0 100644 --- a/subx/Readme.md +++ b/subx/Readme.md @@ -274,8 +274,7 @@ SubX programs map to the same ELF binaries that a conventional Linux system uses. Linux ELF binaries consist of a series of _segments_. In particular, they distinguish between code and data. Correspondingly, SubX programs consist of a series of segments, each starting with a header line: `==` followed by a name. -The first segment is assumed to be for code, and the second for data. By -convention, I name them `code` and `data`. +The first segment must be named `code`; the second must be named `data`. Execution always begins at the start of the `code` segment. -- cgit 1.4.1-2-gfad0