Tom Cherry | 2a5a4e7 | 2018-06-26 13:56:34 -0700 | [diff] [blame] | 1 | // |
| 2 | // Copyright (C) 2018 The Android Open Source Project |
| 3 | // |
| 4 | // Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | // you may not use this file except in compliance with the License. |
| 6 | // You may obtain a copy of the License at |
| 7 | // |
| 8 | // http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | // |
| 10 | // Unless required by applicable law or agreed to in writing, software |
| 11 | // distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | // See the License for the specific language governing permissions and |
| 14 | // limitations under the License. |
| 15 | // |
| 16 | |
| 17 | #include "tokenizer.h" |
| 18 | |
| 19 | #include <string> |
| 20 | #include <vector> |
| 21 | |
| 22 | #include <gtest/gtest.h> |
| 23 | |
| 24 | namespace android { |
| 25 | namespace init { |
| 26 | |
| 27 | namespace { |
| 28 | |
| 29 | void RunTest(const std::string& data, const std::vector<std::vector<std::string>>& expected_tokens) { |
| 30 | auto data_copy = std::string{data}; |
Tom Cherry | 85f2bc9 | 2020-04-10 10:15:30 -0700 | [diff] [blame] | 31 | data_copy.push_back('\n'); |
Tom Cherry | 2a5a4e7 | 2018-06-26 13:56:34 -0700 | [diff] [blame] | 32 | data_copy.push_back('\0'); |
| 33 | |
| 34 | parse_state state; |
| 35 | state.line = 0; |
| 36 | state.ptr = data_copy.data(); |
| 37 | state.nexttoken = 0; |
| 38 | |
| 39 | std::vector<std::string> current_line; |
| 40 | std::vector<std::vector<std::string>> tokens; |
| 41 | |
| 42 | while (true) { |
| 43 | switch (next_token(&state)) { |
| 44 | case T_EOF: |
| 45 | EXPECT_EQ(expected_tokens, tokens) << data; |
| 46 | return; |
| 47 | case T_NEWLINE: |
| 48 | tokens.emplace_back(std::move(current_line)); |
Tom Cherry | 247ffbf | 2019-07-08 15:09:36 -0700 | [diff] [blame] | 49 | current_line.clear(); |
Tom Cherry | 2a5a4e7 | 2018-06-26 13:56:34 -0700 | [diff] [blame] | 50 | break; |
| 51 | case T_TEXT: |
| 52 | current_line.emplace_back(state.text); |
| 53 | break; |
| 54 | } |
| 55 | } |
| 56 | } |
| 57 | |
| 58 | } // namespace |
| 59 | |
| 60 | TEST(tokenizer, null) { |
| 61 | RunTest("", {{}}); |
| 62 | } |
| 63 | |
| 64 | TEST(tokenizer, simple_oneline) { |
| 65 | RunTest("one two\tthree\rfour", {{"one", "two", "three", "four"}}); |
| 66 | } |
| 67 | |
| 68 | TEST(tokenizer, simple_multiline) { |
| 69 | RunTest("1 2 3\n4 5 6\n7 8 9", {{"1", "2", "3"}, {"4", "5", "6"}, {"7", "8", "9"}}); |
| 70 | } |
| 71 | |
| 72 | TEST(tokenizer, preceding_space) { |
| 73 | // Preceding spaces are ignored. |
| 74 | RunTest(" 1 2 3\n\t\t\t\t4 5 6\n\r\r\r\r7 8 9", |
| 75 | {{"1", "2", "3"}, {"4", "5", "6"}, {"7", "8", "9"}}); |
| 76 | } |
| 77 | |
| 78 | TEST(tokenizer, comments) { |
| 79 | // Entirely commented lines still produce a T_NEWLINE token for tracking line count. |
| 80 | RunTest("1 2 3\n#4 5 6\n7 8 9", {{"1", "2", "3"}, {}, {"7", "8", "9"}}); |
| 81 | |
| 82 | RunTest("#1 2 3\n4 5 6\n7 8 9", {{}, {"4", "5", "6"}, {"7", "8", "9"}}); |
| 83 | |
| 84 | RunTest("1 2 3\n4 5 6\n#7 8 9", {{"1", "2", "3"}, {"4", "5", "6"}, {}}); |
| 85 | |
| 86 | RunTest("1 2 #3\n4 #5 6\n#7 8 9", {{"1", "2"}, {"4"}, {}}); |
| 87 | } |
| 88 | |
| 89 | TEST(tokenizer, control_chars) { |
| 90 | // Literal \n, \r, \t, and \\ produce the control characters \n, \r, \t, and \\ respectively. |
| 91 | // Literal \? produces ? for all other character '?' |
| 92 | |
| 93 | RunTest(R"(1 token\ntoken 2)", {{"1", "token\ntoken", "2"}}); |
| 94 | RunTest(R"(1 token\rtoken 2)", {{"1", "token\rtoken", "2"}}); |
| 95 | RunTest(R"(1 token\ttoken 2)", {{"1", "token\ttoken", "2"}}); |
| 96 | RunTest(R"(1 token\\token 2)", {{"1", "token\\token", "2"}}); |
| 97 | RunTest(R"(1 token\btoken 2)", {{"1", "tokenbtoken", "2"}}); |
| 98 | |
| 99 | RunTest(R"(1 token\n 2)", {{"1", "token\n", "2"}}); |
| 100 | RunTest(R"(1 token\r 2)", {{"1", "token\r", "2"}}); |
| 101 | RunTest(R"(1 token\t 2)", {{"1", "token\t", "2"}}); |
| 102 | RunTest(R"(1 token\\ 2)", {{"1", "token\\", "2"}}); |
| 103 | RunTest(R"(1 token\b 2)", {{"1", "tokenb", "2"}}); |
| 104 | |
| 105 | RunTest(R"(1 \ntoken 2)", {{"1", "\ntoken", "2"}}); |
| 106 | RunTest(R"(1 \rtoken 2)", {{"1", "\rtoken", "2"}}); |
| 107 | RunTest(R"(1 \ttoken 2)", {{"1", "\ttoken", "2"}}); |
| 108 | RunTest(R"(1 \\token 2)", {{"1", "\\token", "2"}}); |
| 109 | RunTest(R"(1 \btoken 2)", {{"1", "btoken", "2"}}); |
| 110 | |
| 111 | RunTest(R"(1 \n 2)", {{"1", "\n", "2"}}); |
| 112 | RunTest(R"(1 \r 2)", {{"1", "\r", "2"}}); |
| 113 | RunTest(R"(1 \t 2)", {{"1", "\t", "2"}}); |
| 114 | RunTest(R"(1 \\ 2)", {{"1", "\\", "2"}}); |
| 115 | RunTest(R"(1 \b 2)", {{"1", "b", "2"}}); |
| 116 | } |
| 117 | |
| 118 | TEST(tokenizer, cr_lf) { |
| 119 | // \ before \n, \r, or \r\n is interpreted as a line continuation |
| 120 | // Extra whitespace on the next line is eaten, except \r unlike in the above tests. |
| 121 | |
| 122 | RunTest("lf\\\ncont", {{"lfcont"}}); |
| 123 | RunTest("lf\\\n \t\t\t\tcont", {{"lfcont"}}); |
| 124 | |
| 125 | RunTest("crlf\\\r\ncont", {{"crlfcont"}}); |
| 126 | RunTest("crlf\\\r\n \t\t\t\tcont", {{"crlfcont"}}); |
| 127 | |
| 128 | RunTest("cr\\\rcont", {{"crcont"}}); |
| 129 | |
| 130 | RunTest("lfspace \\\ncont", {{"lfspace", "cont"}}); |
| 131 | RunTest("lfspace \\\n \t\t\t\tcont", {{"lfspace", "cont"}}); |
| 132 | |
| 133 | RunTest("crlfspace \\\r\ncont", {{"crlfspace", "cont"}}); |
| 134 | RunTest("crlfspace \\\r\n \t\t\t\tcont", {{"crlfspace", "cont"}}); |
| 135 | |
| 136 | RunTest("crspace \\\rcont", {{"crspace", "cont"}}); |
| 137 | } |
| 138 | |
| 139 | TEST(tokenizer, quoted) { |
| 140 | RunTest("\"quoted simple string\"", {{"quoted simple string"}}); |
| 141 | |
| 142 | // Unterminated quotes just return T_EOF without any T_NEWLINE. |
| 143 | RunTest("\"unterminated quoted string", {}); |
| 144 | |
| 145 | RunTest("\"1 2 3\"\n \"unterminated quoted string", {{"1 2 3"}}); |
| 146 | |
| 147 | // Escaping quotes is not allowed and are treated as an unterminated quoted string. |
| 148 | RunTest("\"quoted escaped quote\\\"\"", {}); |
| 149 | RunTest("\"quoted escaped\\\" quote\"", {}); |
| 150 | RunTest("\"\\\"quoted escaped quote\"", {}); |
| 151 | |
| 152 | RunTest("\"quoted control characters \\n \\r \\t \\\\ \\b \\\r \\\n \r \n\"", |
| 153 | {{"quoted control characters \\n \\r \\t \\\\ \\b \\\r \\\n \r \n"}}); |
| 154 | |
| 155 | RunTest("\"quoted simple string\" \"second quoted string\"", |
| 156 | {{"quoted simple string", "second quoted string"}}); |
| 157 | |
| 158 | RunTest("\"# comment quoted string\"", {{"# comment quoted string"}}); |
| 159 | |
| 160 | RunTest("\"Adjacent \"\"quoted strings\"", {{"Adjacent quoted strings"}}); |
| 161 | } |
| 162 | |
| 163 | } // namespace init |
| 164 | } // namespace android |