AAPT2: Fix pseudolocalization (again)
Pseudolocalization didn't properly handle spans in
strings like "<small><small>Hello</small></small>".
The spans would be identical and when doing range checks
only one of them would be updated.
Switched to a more robust way of extracting the relevant
chunks of a styled string. This uses a stack, which is more
in line with the real representation in XML.
Bug: 34088357
Test: make aapt2_tests
Change-Id: Ia4e4501713e688c96a89e26e4e2b1384f4cd3889
diff --git a/tools/aapt2/compile/PseudolocaleGenerator.cpp b/tools/aapt2/compile/PseudolocaleGenerator.cpp
index fad9edd..a031ea4 100644
--- a/tools/aapt2/compile/PseudolocaleGenerator.cpp
+++ b/tools/aapt2/compile/PseudolocaleGenerator.cpp
@@ -22,136 +22,194 @@
#include "ResourceValues.h"
#include "ValueVisitor.h"
#include "compile/Pseudolocalizer.h"
+#include "util/Util.h"
using android::StringPiece;
+using android::StringPiece16;
namespace aapt {
-std::unique_ptr<StyledString> PseudolocalizeStyledString(
- StyledString* string, Pseudolocalizer::Method method, StringPool* pool) {
+// The struct that represents both Span objects and UntranslatableSections.
+struct UnifiedSpan {
+ // Only present for Span objects. If not present, this was an UntranslatableSection.
+ Maybe<std::string> tag;
+
+ // The UTF-16 index into the string where this span starts.
+ uint32_t first_char;
+
+ // The UTF-16 index into the string where this span ends, inclusive.
+ uint32_t last_char;
+};
+
+inline static bool operator<(const UnifiedSpan& left, const UnifiedSpan& right) {
+ if (left.first_char < right.first_char) {
+ return true;
+ } else if (left.first_char > right.first_char) {
+ return false;
+ } else if (left.last_char < right.last_char) {
+ return true;
+ }
+ return false;
+}
+
+inline static UnifiedSpan SpanToUnifiedSpan(const StringPool::Span& span) {
+ return UnifiedSpan{*span.name, span.first_char, span.last_char};
+}
+
+inline static UnifiedSpan UntranslatableSectionToUnifiedSpan(const UntranslatableSection& section) {
+ return UnifiedSpan{
+ {}, static_cast<uint32_t>(section.start), static_cast<uint32_t>(section.end) - 1};
+}
+
+// Merges the Span and UntranslatableSections of this StyledString into a single vector of
+// UnifiedSpans. This will first check that the Spans are sorted in ascending order.
+static std::vector<UnifiedSpan> MergeSpans(const StyledString& string) {
+ // Ensure the Spans are sorted and converted.
+ std::vector<UnifiedSpan> sorted_spans;
+ sorted_spans.reserve(string.value->spans.size());
+ std::transform(string.value->spans.begin(), string.value->spans.end(),
+ std::back_inserter(sorted_spans), SpanToUnifiedSpan);
+
+ // Stable sort to ensure tag sequences like "<b><i>" are preserved.
+ std::stable_sort(sorted_spans.begin(), sorted_spans.end());
+
+ // Ensure the UntranslatableSections are sorted and converted.
+ std::vector<UnifiedSpan> sorted_untranslatable_sections;
+ sorted_untranslatable_sections.reserve(string.untranslatable_sections.size());
+ std::transform(string.untranslatable_sections.begin(), string.untranslatable_sections.end(),
+ std::back_inserter(sorted_untranslatable_sections),
+ UntranslatableSectionToUnifiedSpan);
+ std::sort(sorted_untranslatable_sections.begin(), sorted_untranslatable_sections.end());
+
+ std::vector<UnifiedSpan> merged_spans;
+ merged_spans.reserve(sorted_spans.size() + sorted_untranslatable_sections.size());
+ auto span_iter = sorted_spans.begin();
+ auto untranslatable_iter = sorted_untranslatable_sections.begin();
+ while (span_iter != sorted_spans.end() &&
+ untranslatable_iter != sorted_untranslatable_sections.end()) {
+ if (*span_iter < *untranslatable_iter) {
+ merged_spans.push_back(std::move(*span_iter));
+ ++span_iter;
+ } else {
+ merged_spans.push_back(std::move(*untranslatable_iter));
+ ++untranslatable_iter;
+ }
+ }
+
+ while (span_iter != sorted_spans.end()) {
+ merged_spans.push_back(std::move(*span_iter));
+ ++span_iter;
+ }
+
+ while (untranslatable_iter != sorted_untranslatable_sections.end()) {
+ merged_spans.push_back(std::move(*untranslatable_iter));
+ ++untranslatable_iter;
+ }
+ return merged_spans;
+}
+
+std::unique_ptr<StyledString> PseudolocalizeStyledString(StyledString* string,
+ Pseudolocalizer::Method method,
+ StringPool* pool) {
Pseudolocalizer localizer(method);
- const StringPiece original_text = *string->value->str;
+ // Collect the spans and untranslatable sections into one set of spans, sorted by first_char.
+ // This will effectively subdivide the string into multiple sections that can be individually
+ // pseudolocalized, while keeping the span indices synchronized.
+ std::vector<UnifiedSpan> merged_spans = MergeSpans(*string);
+
+ // All Span indices are UTF-16 based, according to the resources.arsc format expected by the
+ // runtime. So we will do all our processing in UTF-16, then convert back.
+ const std::u16string text16 = util::Utf8ToUtf16(*string->value->str);
+
+ // Convenient wrapper around the text that allows us to work with StringPieces.
+ const StringPiece16 text(text16);
+
+ // The new string.
+ std::string new_string = localizer.Start();
+
+ // The stack that keeps track of what nested Span we're in.
+ std::vector<size_t> span_stack;
+
+ // The current position in the original text.
+ uint32_t cursor = 0u;
+
+ // The current position in the new text.
+ uint32_t new_cursor = utf8_to_utf16_length(reinterpret_cast<const uint8_t*>(new_string.data()),
+ new_string.size(), false);
+
+ // We assume no nesting of untranslatable sections, since XLIFF doesn't allow it.
+ bool translatable = true;
+ size_t span_idx = 0u;
+ while (span_idx < merged_spans.size() || !span_stack.empty()) {
+ UnifiedSpan* span = span_idx >= merged_spans.size() ? nullptr : &merged_spans[span_idx];
+ UnifiedSpan* parent_span = span_stack.empty() ? nullptr : &merged_spans[span_stack.back()];
+
+ if (span != nullptr) {
+ if (parent_span == nullptr || parent_span->last_char > span->first_char) {
+ // There is no parent, or this span is the child of the parent.
+ // Pseudolocalize all the text until this span.
+ const StringPiece16 substr = text.substr(cursor, span->first_char - cursor);
+ cursor += substr.size();
+
+ // Pseudolocalize the substring.
+ std::string new_substr = util::Utf16ToUtf8(substr);
+ if (translatable) {
+ new_substr = localizer.Text(new_substr);
+ }
+ new_cursor += utf8_to_utf16_length(reinterpret_cast<const uint8_t*>(new_substr.data()),
+ new_substr.size(), false);
+ new_string += new_substr;
+
+ // Rewrite the first_char.
+ span->first_char = new_cursor;
+ if (!span->tag) {
+ // An untranslatable section has begun!
+ translatable = false;
+ }
+ span_stack.push_back(span_idx);
+ ++span_idx;
+ continue;
+ }
+ }
+
+ if (parent_span != nullptr) {
+ // There is a parent, and either this span is not a child of it, or there are no more spans.
+ // Pop this off the stack.
+ const StringPiece16 substr = text.substr(cursor, parent_span->last_char - cursor + 1);
+ cursor += substr.size();
+
+ // Pseudolocalize the substring.
+ std::string new_substr = util::Utf16ToUtf8(substr);
+ if (translatable) {
+ new_substr = localizer.Text(new_substr);
+ }
+ new_cursor += utf8_to_utf16_length(reinterpret_cast<const uint8_t*>(new_substr.data()),
+ new_substr.size(), false);
+ new_string += new_substr;
+
+ parent_span->last_char = new_cursor - 1;
+ if (parent_span->tag) {
+ // An end to an untranslatable section.
+ translatable = true;
+ }
+ span_stack.pop_back();
+ }
+ }
+
+ // Finish the pseudolocalization at the end of the string.
+ new_string += localizer.Text(util::Utf16ToUtf8(text.substr(cursor, text.size() - cursor)));
+ new_string += localizer.End();
StyleString localized;
+ localized.str = std::move(new_string);
- // Copy the spans. We will update their offsets when we localize.
- localized.spans.reserve(string->value->spans.size());
- for (const StringPool::Span& span : string->value->spans) {
- localized.spans.push_back(
- Span{*span.name, span.first_char, span.last_char});
- }
-
- // The ranges are all represented with a single value. This is the start of
- // one range and end of another.
- struct Range {
- size_t start;
-
- // If set to true, toggles the state of translatability.
- bool toggle_translatability;
-
- // Once the new string is localized, these are the pointers to the spans to adjust.
- // Since this struct represents the start of one range and end of another,
- // we have the two pointers respectively.
- uint32_t* update_start;
- uint32_t* update_end;
- };
-
- auto cmp = [](const Range& r, size_t index) -> bool {
- return r.start < index;
- };
-
- // Construct the ranges. The ranges are represented like so: [0, 2, 5, 7]
- // The ranges are the spaces in between. In this example, with a total string
- // length of 9, the vector represents: (0,1], (2,4], (5,6], (7,9]
- //
- std::vector<Range> ranges;
- ranges.push_back(Range{0, false, nullptr, nullptr});
- ranges.push_back(Range{original_text.size() - 1, false, nullptr, nullptr});
- for (size_t i = 0; i < string->value->spans.size(); i++) {
- const StringPool::Span& span = string->value->spans[i];
-
- // Insert or update the Range marker for the start of this span.
- auto iter =
- std::lower_bound(ranges.begin(), ranges.end(), span.first_char, cmp);
- if (iter != ranges.end() && iter->start == span.first_char) {
- iter->update_start = &localized.spans[i].first_char;
- } else {
- ranges.insert(iter, Range{span.first_char, false, &localized.spans[i].first_char, nullptr});
- }
-
- // Insert or update the Range marker for the end of this span.
- iter = std::lower_bound(ranges.begin(), ranges.end(), span.last_char, cmp);
- if (iter != ranges.end() && iter->start == span.last_char) {
- iter->update_end = &localized.spans[i].last_char;
- } else {
- ranges.insert(iter, Range{span.last_char, false, nullptr, &localized.spans[i].last_char});
+ // Convert the UnifiedSpans into regular Spans, skipping the UntranslatableSections.
+ for (UnifiedSpan& span : merged_spans) {
+ if (span.tag) {
+ localized.spans.push_back(Span{std::move(span.tag.value()), span.first_char, span.last_char});
}
}
-
- // Parts of the string may be untranslatable. Merge those ranges
- // in as well, so that we have continuous sections of text to
- // feed into the pseudolocalizer.
- // We do this by marking the beginning of a range as either toggling
- // the translatability state or not.
- for (const UntranslatableSection& section : string->untranslatable_sections) {
- auto iter = std::lower_bound(ranges.begin(), ranges.end(), section.start, cmp);
- if (iter != ranges.end() && iter->start == section.start) {
- // An existing span starts (or ends) here. We just need to mark that
- // the translatability should toggle here. If translatability was
- // already being toggled, then that means we have two adjacent ranges of untranslatable
- // text, so remove the toggle and only toggle at the end of this range,
- // effectively merging these ranges.
- iter->toggle_translatability = !iter->toggle_translatability;
- } else {
- // Insert a new range that specifies to toggle the translatability.
- iter = ranges.insert(iter, Range{section.start, true, nullptr, nullptr});
- }
-
- // Update/create an end to the untranslatable section.
- iter = std::lower_bound(iter, ranges.end(), section.end, cmp);
- if (iter != ranges.end() && iter->start == section.end) {
- iter->toggle_translatability = true;
- } else {
- iter = ranges.insert(iter, Range{section.end, true, nullptr, nullptr});
- }
- }
-
- localized.str += localizer.Start();
-
- // Iterate over the ranges and localize each section.
- // The text starts as translatable, and each time a range has toggle_translatability
- // set to true, we toggle whether to translate or not.
- // This assumes no untranslatable ranges overlap.
- bool translatable = true;
- for (size_t i = 0; i < ranges.size(); i++) {
- const size_t start = ranges[i].start;
- size_t len = original_text.size() - start;
- if (i + 1 < ranges.size()) {
- len = ranges[i + 1].start - start;
- }
-
- if (ranges[i].update_start) {
- *ranges[i].update_start = localized.str.size();
- }
-
- if (ranges[i].update_end) {
- *ranges[i].update_end = localized.str.size();
- }
-
- if (ranges[i].toggle_translatability) {
- translatable = !translatable;
- }
-
- if (translatable) {
- localized.str += localizer.Text(original_text.substr(start, len));
- } else {
- localized.str += original_text.substr(start, len);
- }
- }
-
- localized.str += localizer.End();
-
return util::make_unique<StyledString>(pool->MakeRef(localized));
}
@@ -175,8 +233,7 @@
if (sub_visitor.value) {
localized->values[i] = std::move(sub_visitor.item);
} else {
- localized->values[i] =
- std::unique_ptr<Item>(plural->values[i]->Clone(pool_));
+ localized->values[i] = std::unique_ptr<Item>(plural->values[i]->Clone(pool_));
}
}
}
@@ -210,8 +267,7 @@
}
result += localizer_.End();
- std::unique_ptr<String> localized =
- util::make_unique<String>(pool_->MakeRef(result));
+ std::unique_ptr<String> localized = util::make_unique<String>(pool_->MakeRef(result));
localized->SetSource(string->GetSource());
localized->SetWeak(true);
item = std::move(localized);
@@ -282,14 +338,10 @@
}
}
-/**
- * A value is pseudolocalizable if it does not define a locale (or is the
- * default locale)
- * and is translatable.
- */
+// A value is pseudolocalizable if it does not define a locale (or is the default locale) and is
+// translatable.
static bool IsPseudolocalizable(ResourceConfigValue* config_value) {
- const int diff =
- config_value->config.diff(ConfigDescription::DefaultConfig());
+ const int diff = config_value->config.diff(ConfigDescription::DefaultConfig());
if (diff & ConfigDescription::CONFIG_LOCALE) {
return false;
}
@@ -298,19 +350,16 @@
} // namespace
-bool PseudolocaleGenerator::Consume(IAaptContext* context,
- ResourceTable* table) {
+bool PseudolocaleGenerator::Consume(IAaptContext* context, ResourceTable* table) {
for (auto& package : table->packages) {
for (auto& type : package->types) {
for (auto& entry : type->entries) {
- std::vector<ResourceConfigValue*> values =
- entry->FindValuesIf(IsPseudolocalizable);
-
+ std::vector<ResourceConfigValue*> values = entry->FindValuesIf(IsPseudolocalizable);
for (ResourceConfigValue* value : values) {
- PseudolocalizeIfNeeded(Pseudolocalizer::Method::kAccent, value,
- &table->string_pool, entry.get());
- PseudolocalizeIfNeeded(Pseudolocalizer::Method::kBidi, value,
- &table->string_pool, entry.get());
+ PseudolocalizeIfNeeded(Pseudolocalizer::Method::kAccent, value, &table->string_pool,
+ entry.get());
+ PseudolocalizeIfNeeded(Pseudolocalizer::Method::kBidi, value, &table->string_pool,
+ entry.get());
}
}
}