patch 9.0.1771: regex: combining chars in collections not handled
Problem: regex: combining chars in collections not handled
Solution: Check for combining characters that follow a character inside a collection, in both the NFA and BT engines
closes: #10459
closes: #10286
Signed-off-by: Christian Brabandt <cb@256bit.org>
diff --git a/src/regexp_nfa.c b/src/regexp_nfa.c
index d724d52..60cd29c 100644
--- a/src/regexp_nfa.c
+++ b/src/regexp_nfa.c
@@ -1764,6 +1764,7 @@
endp = skip_anyof(p);
if (*endp == ']')
{
+ int plen;
/*
* Try to reverse engineer character classes. For example,
* recognize that [0-9] stands for \d and [A-Za-z_] for \h,
@@ -2035,11 +2036,34 @@
if (got_coll_char == TRUE && startc == 0)
EMIT(0x0a);
else
+ {
EMIT(startc);
- EMIT(NFA_CONCAT);
+ if (!(enc_utf8 && (utf_ptr2len(regparse) != (plen = utfc_ptr2len(regparse)))))
+ {
+ EMIT(NFA_CONCAT);
+ }
+ }
}
}
+ if (enc_utf8 && (utf_ptr2len(regparse) != (plen = utfc_ptr2len(regparse))))
+ {
+ int i = utf_ptr2len(regparse);
+
+ c = utf_ptr2char(regparse + i);
+
+ // Add composing characters
+ for (;;)
+ {
+ EMIT(c);
+ EMIT(NFA_CONCAT);
+ if ((i += utf_char2len(c)) >= plen)
+ break;
+ c = utf_ptr2char(regparse + i);
+ }
+ EMIT(NFA_COMPOSING);
+ EMIT(NFA_CONCAT);
+ }
MB_PTR_ADV(regparse);
} // while (p < endp)
@@ -6418,6 +6442,84 @@
result_if_matched = (t->state->c == NFA_START_COLL);
for (;;)
{
+ if (state->c == NFA_COMPOSING)
+ {
+ int mc = curc;
+ int len = 0;
+ nfa_state_T *end;
+ nfa_state_T *sta;
+ int cchars[MAX_MCO];
+ int ccount = 0;
+ int j;
+
+ sta = t->state->out->out;
+ len = 0;
+ if (utf_iscomposing(sta->c))
+ {
+ // Only match composing character(s), ignore base
+ // character. Used for ".{composing}" and "{composing}"
+ // (no preceding character).
+ len += mb_char2len(mc);
+ }
+ if (rex.reg_icombine && len == 0)
+ {
+ // If \Z was present, then ignore composing characters.
+ // When ignoring the base character this always matches.
+ if (sta->c != curc)
+ result = FAIL;
+ else
+ result = OK;
+ while (sta->c != NFA_END_COMPOSING)
+ sta = sta->out;
+ }
+ // Check base character matches first, unless ignored.
+ else if (len > 0 || mc == sta->c)
+// if (len > 0 || mc == sta->c)
+ {
+ if (len == 0)
+ {
+ len += mb_char2len(mc);
+ sta = sta->out;
+ }
+
+ // We don't care about the order of composing characters.
+ // Get them into cchars[] first.
+ while (len < clen)
+ {
+ mc = mb_ptr2char(rex.input + len);
+ cchars[ccount++] = mc;
+ len += mb_char2len(mc);
+ if (ccount == MAX_MCO)
+ break;
+ }
+
+ // Check that each composing char in the pattern matches a
+ // composing char in the text. We do not check if all
+ // composing chars are matched.
+ result = OK;
+ while (sta->c != NFA_END_COMPOSING)
+ {
+ for (j = 0; j < ccount; ++j)
+ if (cchars[j] == sta->c)
+ break;
+ if (j == ccount)
+ {
+ result = FAIL;
+ break;
+ }
+ sta = sta->out;
+ }
+ }
+ else
+ result = FAIL;
+
+ if (t->state->out->out1->c == NFA_END_COMPOSING)
+ {
+ end = t->state->out->out1;
+ ADD_STATE_IF_MATCH(end);
+ }
+ break;
+ }
if (state->c == NFA_END_COLL)
{
result = !result_if_matched;