AAPT2: Fix styled string whitespace processing
Change styled string whitespace processing to match AAPT's behavior.
Main changes:
- whitespace around tags is preserved.
- tags start exactly where they are supposed to, not off by one.
Bug: 72406283
Test: make aapt2_tests
Change-Id: I4d12728c493efd8c978e2e3d2718b56534ff52ef
diff --git a/tools/aapt2/util/Util.cpp b/tools/aapt2/util/Util.cpp
index e42145d..d1c9ca1 100644
--- a/tools/aapt2/util/Util.cpp
+++ b/tools/aapt2/util/Util.cpp
@@ -76,6 +76,34 @@
return str.substr(str.size() - suffix.size(), suffix.size()) == suffix;
}
+// Strips leading isspace() characters; the returned piece points into str's buffer.
+StringPiece TrimLeadingWhitespace(const StringPiece& str) {
+  if (str.size() == 0 || str.data() == nullptr) {
+    return str;
+  }
+  const char* start = str.data();
+  const char* end = start + str.length();
+  // isspace() is undefined for negative values, so pass each byte as unsigned char.
+  while (start != end && isspace(static_cast<unsigned char>(*start))) {
+    start++;
+  }
+  return StringPiece(start, end - start);
+}
+
+// Strips trailing isspace() characters; the returned piece points into str's buffer.
+StringPiece TrimTrailingWhitespace(const StringPiece& str) {
+  if (str.size() == 0 || str.data() == nullptr) {
+    return str;
+  }
+  const char* start = str.data();
+  const char* end = start + str.length();
+  // isspace() is undefined for negative values, so pass each byte as unsigned char.
+  while (end != start && isspace(static_cast<unsigned char>(*(end - 1)))) {
+    end--;
+  }
+  return StringPiece(start, end - start);
+}
+
StringPiece TrimWhitespace(const StringPiece& str) {
if (str.size() == 0 || str.data() == nullptr) {
return str;
@@ -269,162 +297,6 @@
return true;
}
-static bool AppendCodepointToUtf8String(char32_t codepoint, std::string* output) {
- ssize_t len = utf32_to_utf8_length(&codepoint, 1);
- if (len < 0) {
- return false;
- }
-
- const size_t start_append_pos = output->size();
-
- // Make room for the next character.
- output->resize(output->size() + len);
-
- char* dst = &*(output->begin() + start_append_pos);
- utf32_to_utf8(&codepoint, 1, dst, len + 1);
- return true;
-}
-
-static bool AppendUnicodeCodepoint(Utf8Iterator* iter, std::string* output) {
- char32_t code = 0;
- for (size_t i = 0; i < 4 && iter->HasNext(); i++) {
- char32_t codepoint = iter->Next();
- char32_t a;
- if (codepoint >= U'0' && codepoint <= U'9') {
- a = codepoint - U'0';
- } else if (codepoint >= U'a' && codepoint <= U'f') {
- a = codepoint - U'a' + 10;
- } else if (codepoint >= U'A' && codepoint <= U'F') {
- a = codepoint - U'A' + 10;
- } else {
- return {};
- }
- code = (code << 4) | a;
- }
- return AppendCodepointToUtf8String(code, output);
-}
-
-static bool IsCodepointSpace(char32_t codepoint) {
- if (static_cast<uint32_t>(codepoint) & 0xffffff00u) {
- return false;
- }
- return isspace(static_cast<char>(codepoint));
-}
-
-StringBuilder::StringBuilder(bool preserve_spaces) : preserve_spaces_(preserve_spaces) {
-}
-
-StringBuilder& StringBuilder::Append(const StringPiece& str) {
- if (!error_.empty()) {
- return *this;
- }
-
- // Where the new data will be appended to.
- const size_t new_data_index = str_.size();
-
- Utf8Iterator iter(str);
- while (iter.HasNext()) {
- const char32_t codepoint = iter.Next();
-
- if (last_char_was_escape_) {
- switch (codepoint) {
- case U't':
- str_ += '\t';
- break;
-
- case U'n':
- str_ += '\n';
- break;
-
- case U'#':
- case U'@':
- case U'?':
- case U'"':
- case U'\'':
- case U'\\':
- str_ += static_cast<char>(codepoint);
- break;
-
- case U'u':
- if (!AppendUnicodeCodepoint(&iter, &str_)) {
- error_ = "invalid unicode escape sequence";
- return *this;
- }
- break;
-
- default:
- // Ignore the escape character and just include the codepoint.
- AppendCodepointToUtf8String(codepoint, &str_);
- break;
- }
- last_char_was_escape_ = false;
-
- } else if (!preserve_spaces_ && codepoint == U'"') {
- if (!quote_ && trailing_space_) {
- // We found an opening quote, and we have trailing space, so we should append that
- // space now.
- if (trailing_space_) {
- // We had trailing whitespace, so replace with a single space.
- if (!str_.empty()) {
- str_ += ' ';
- }
- trailing_space_ = false;
- }
- }
- quote_ = !quote_;
-
- } else if (!preserve_spaces_ && codepoint == U'\'' && !quote_) {
- // This should be escaped.
- error_ = "unescaped apostrophe";
- return *this;
-
- } else if (codepoint == U'\\') {
- // This is an escape sequence, convert to the real value.
- if (!quote_ && trailing_space_) {
- // We had trailing whitespace, so
- // replace with a single space.
- if (!str_.empty()) {
- str_ += ' ';
- }
- trailing_space_ = false;
- }
- last_char_was_escape_ = true;
- } else {
- if (preserve_spaces_ || quote_) {
- // Quotes mean everything is taken, including whitespace.
- AppendCodepointToUtf8String(codepoint, &str_);
- } else {
- // This is not quoted text, so we will accumulate whitespace and only emit a single
- // character of whitespace if it is followed by a non-whitespace character.
- if (IsCodepointSpace(codepoint)) {
- // We found whitespace.
- trailing_space_ = true;
- } else {
- if (trailing_space_) {
- // We saw trailing space before, so replace all
- // that trailing space with one space.
- if (!str_.empty()) {
- str_ += ' ';
- }
- trailing_space_ = false;
- }
- AppendCodepointToUtf8String(codepoint, &str_);
- }
- }
- }
- }
-
- // Accumulate the added string's UTF-16 length.
- ssize_t len = utf8_to_utf16_length(reinterpret_cast<const uint8_t*>(str_.data()) + new_data_index,
- str_.size() - new_data_index);
- if (len < 0) {
- error_ = "invalid unicode code point";
- return *this;
- }
- utf16_len_ += len;
- return *this;
-}
-
std::u16string Utf8ToUtf16(const StringPiece& utf8) {
ssize_t utf16_length = utf8_to_utf16_length(
reinterpret_cast<const uint8_t*>(utf8.data()), utf8.length());