Skip PtNodes with non-Unicode code points for suggestion.

Bug: 14119293
Change-Id: Id1d3b789b5f18757070878dba35a7980bfb44591
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h
index 91192fc..bef401f 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h
@@ -23,6 +23,7 @@
 #include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
 #include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
 #include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
+#include "utils/char_utils.h"
 
 namespace latinime {
 
@@ -158,6 +159,10 @@
         return PatriciaTrieReadingUtils::hasShortcutTargets(mFlags);
     }
 
+    AK_FORCE_INLINE bool representsNonWordInfo() const {
+        return getCodePointCount() > 0 && CharUtils::isInUnicodeSpace(getCodePoints()[0]);
+    }
+
     // Parent node position
     AK_FORCE_INLINE int getParentPos() const {
         return mParentPos;
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp
index b3af1f4..30dcfba 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp
@@ -24,6 +24,7 @@
 #include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
 #include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
 #include "suggest/policyimpl/dictionary/utils/probability_utils.h"
+#include "utils/char_utils.h"
 
 namespace latinime {
 
@@ -318,12 +319,15 @@
     PatriciaTrieReadingUtils::readPtNodeInfo(mDictRoot, ptNodePos, getShortcutsStructurePolicy(),
             getBigramsStructurePolicy(), &flags, &mergedNodeCodePointCount, mergedNodeCodePoints,
             &probability, &childrenPos, &shortcutPos, &bigramPos, &siblingPos);
-    childDicNodes->pushLeavingChild(dicNode, ptNodePos, childrenPos, probability,
-            PatriciaTrieReadingUtils::isTerminal(flags),
-            PatriciaTrieReadingUtils::hasChildrenInFlags(flags),
-            PatriciaTrieReadingUtils::isBlacklisted(flags)
-                    || PatriciaTrieReadingUtils::isNotAWord(flags),
-            mergedNodeCodePointCount, mergedNodeCodePoints);
+    // Skip PtNodes don't start with Unicode code point because they represent non-word information.
+    if (CharUtils::isInUnicodeSpace(mergedNodeCodePoints[0])) {
+        childDicNodes->pushLeavingChild(dicNode, ptNodePos, childrenPos, probability,
+                PatriciaTrieReadingUtils::isTerminal(flags),
+                PatriciaTrieReadingUtils::hasChildrenInFlags(flags),
+                PatriciaTrieReadingUtils::isBlacklisted(flags)
+                        || PatriciaTrieReadingUtils::isNotAWord(flags),
+                mergedNodeCodePointCount, mergedNodeCodePoints);
+    }
     return siblingPos;
 }
 
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp
index bbfd22e..f8587f5 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp
@@ -59,13 +59,17 @@
             // valid terminal DicNode.
             isTerminal = ptNodeParams.getProbability() != NOT_A_PROBABILITY;
         }
+        readingHelper.readNextSiblingNode(ptNodeParams);
+        if (!ptNodeParams.representsNonWordInfo()) {
+            // Skip PtNodes that represent non-word information.
+            continue;
+        }
         childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getHeadPos(),
                 ptNodeParams.getChildrenPos(), ptNodeParams.getProbability(), isTerminal,
                 ptNodeParams.hasChildren(),
                 ptNodeParams.isBlacklisted()
                         || ptNodeParams.isNotAWord() /* isBlacklistedOrNotAWord */,
                 ptNodeParams.getCodePointCount(), ptNodeParams.getCodePoints());
-        readingHelper.readNextSiblingNode(ptNodeParams);
     }
     if (readingHelper.isError()) {
         mIsCorrupted = true;
diff --git a/native/jni/src/utils/char_utils.cpp b/native/jni/src/utils/char_utils.cpp
index adc474b..b17e084 100644
--- a/native/jni/src/utils/char_utils.cpp
+++ b/native/jni/src/utils/char_utils.cpp
@@ -22,6 +22,9 @@
 
 namespace latinime {
 
+const int CharUtils::MIN_UNICODE_CODE_POINT = 0;
+const int CharUtils::MAX_UNICODE_CODE_POINT = 0x10FFFF;
+
 struct LatinCapitalSmallPair {
   unsigned short capital;
   unsigned short small;
diff --git a/native/jni/src/utils/char_utils.h b/native/jni/src/utils/char_utils.h
index 239419d..634c45b 100644
--- a/native/jni/src/utils/char_utils.h
+++ b/native/jni/src/utils/char_utils.h
@@ -86,12 +86,19 @@
         return spaceCount;
     }
 
+    static AK_FORCE_INLINE int isInUnicodeSpace(const int codePoint) {
+        return codePoint >= MIN_UNICODE_CODE_POINT && codePoint <= MAX_UNICODE_CODE_POINT;
+    }
+
     static unsigned short latin_tolower(const unsigned short c);
     static const std::vector<int> EMPTY_STRING;
 
  private:
     DISALLOW_IMPLICIT_CONSTRUCTORS(CharUtils);
 
+    static const int MIN_UNICODE_CODE_POINT;
+    static const int MAX_UNICODE_CODE_POINT;
+
     /**
      * Table mapping most combined Latin, Greek, and Cyrillic characters
      * to their base characters.  If c is in range, BASE_CHARS[c] == c