vim-patch:9.1.0011: regexp cannot match combining chars in collection (#26992)

Problem: regexp cannot match combining chars in collection Solution: Check for combining characters in regex collections for the NFA and BT Regex Engine Also, while at it, make debug mode work again. fixes vim/vim#10286 closes: vim/vim#12871 d2cc51f9a1 Co-authored-by: Christian Brabandt <cb@256bit.org>
2024-12-19 18:55:14 -07:00 · 2024-01-12 14:09:10 +08:00 · 2024-01-12 14:09:10 +08:00 · 3bcf8e5622
commit 3bcf8e5622
parent 1813661a61
2 changed files with 141 additions and 6 deletions
--- a/src/nvim/regexp.c
+++ b/src/nvim/regexp.c
@ -140,7 +140,9 @@ struct regengine {
  int (*regexec_nl)(regmatch_T *, uint8_t *, colnr_T, bool);
  /// bt_regexec_mult or nfa_regexec_mult
  int (*regexec_multi)(regmmatch_T *, win_T *, buf_T *, linenr_T, colnr_T, proftime_T *, int *);
-  // uint8_t *expr;
+#ifdef REGEXP_DEBUG
+  uint8_t *expr;
+#endif
 };

 // Structure used to save the current input state, when it needs to be
@ -6426,15 +6428,33 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out)
        break;

        case ANYOF:
-        case ANYBUT:
+        case ANYBUT: {
+          uint8_t *q = OPERAND(scan);
+
          if (c == NUL) {
            status = RA_NOMATCH;
-          } else if ((cstrchr((char *)OPERAND(scan), c) == NULL) == (op == ANYOF)) {
+          } else if ((cstrchr((char *)q, c) == NULL) == (op == ANYOF)) {
            status = RA_NOMATCH;
-          } else {
-            ADVANCE_REGINPUT();
+          } else {  // Check following combining characters
+            int len = utfc_ptr2len((char *)q) - utf_ptr2len((char *)q);
+
+            rex.input += utf_ptr2len((char *)rex.input);
+            q += utf_ptr2len((char *)q);
+
+            if (len == 0) {
+              break;
+            }
+
+            for (int i = 0; i < len; i++) {
+              if (q[i] != rex.input[i]) {
+                status = RA_NOMATCH;
+                break;
+              }
+            }
+            rex.input += len;
          }
          break;
+        }

        case MULTIBYTECODE: {
          int i, len;
@ -10448,13 +10468,39 @@ collection:
          } else {
            if (got_coll_char == true && startc == 0) {
              EMIT(0x0a);
+              EMIT(NFA_CONCAT);
            } else {
              EMIT(startc);
-            }
+              if (utf_ptr2len(regparse) == utfc_ptr2len(regparse)) {
                EMIT(NFA_CONCAT);
              }
            }
+          }
+        }

+        int plen;
+        if (utf_ptr2len(regparse) != (plen = utfc_ptr2len(regparse))) {
+          int i = utf_ptr2len(regparse);
+
+          c = utf_ptr2char(regparse + i);
+
+          // Add composing characters
+          while (true) {
+            if (c == 0) {
+              // \x00 is translated to \x0a, start at \x01.
+              EMIT(1);
+            } else {
+              EMIT(c);
+            }
+            EMIT(NFA_CONCAT);
+            if ((i += utf_char2len(c)) >= plen) {
+              break;
+            }
+            c = utf_ptr2char(regparse + i);
+          }
+          EMIT(NFA_COMPOSING);
+          EMIT(NFA_CONCAT);
+        }
        MB_PTR_ADV(regparse);
      }           // while (p < endp)

@ -14503,6 +14549,78 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm
        state = t->state->out;
        result_if_matched = (t->state->c == NFA_START_COLL);
        while (true) {
+          if (state->c == NFA_COMPOSING) {
+            int mc = curc;
+            int len = 0;
+            nfa_state_T *end;
+            nfa_state_T *sta;
+            int cchars[MAX_MCO];
+            int ccount = 0;
+            int j;
+
+            sta = t->state->out->out;
+            if (utf_iscomposing(sta->c)) {
+              // Only match composing character(s), ignore base
+              // character.  Used for ".{composing}" and "{composing}"
+              // (no preceding character).
+              len += utf_char2len(mc);
+            }
+            if (rex.reg_icombine && len == 0) {
+              // If \Z was present, then ignore composing characters.
+              // When ignoring the base character this always matches.
+              if (sta->c != curc) {
+                result = FAIL;
+              } else {
+                result = OK;
+              }
+              while (sta->c != NFA_END_COMPOSING) {
+                sta = sta->out;
+              }
+            }
+            // Check base character matches first, unless ignored.
+            else if (len > 0 || mc == sta->c) {
+              if (len == 0) {
+                len += utf_char2len(mc);
+                sta = sta->out;
+              }
+
+              // We don't care about the order of composing characters.
+              // Get them into cchars[] first.
+              while (len < clen) {
+                mc = utf_ptr2char((char *)rex.input + len);
+                cchars[ccount++] = mc;
+                len += utf_char2len(mc);
+                if (ccount == MAX_MCO) {
+                  break;
+                }
+              }
+
+              // Check that each composing char in the pattern matches a
+              // composing char in the text.  We do not check if all
+              // composing chars are matched.
+              result = OK;
+              while (sta->c != NFA_END_COMPOSING) {
+                for (j = 0; j < ccount; j++) {
+                  if (cchars[j] == sta->c) {
+                    break;
+                  }
+                }
+                if (j == ccount) {
+                  result = FAIL;
+                  break;
+                }
+                sta = sta->out;
+              }
+            } else {
+              result = FAIL;
+            }
+
+            if (t->state->out->out1->c == NFA_END_COMPOSING) {
+              end = t->state->out->out1;
+              ADD_STATE_IF_MATCH(end);
+            }
+            break;
+          }
          if (state->c == NFA_END_COLL) {
            result = !result_if_matched;
            break;
@ -15645,6 +15763,9 @@ static regengine_T bt_regengine = {
  bt_regfree,
  bt_regexec_nl,
  bt_regexec_multi,
+#ifdef REGEXP_DEBUG
+  "",
+#endif
 };

 static regengine_T nfa_regengine = {
@ -15652,6 +15773,9 @@ static regengine_T nfa_regengine = {
  nfa_regfree,
  nfa_regexec_nl,
  nfa_regexec_multi,
+#ifdef REGEXP_DEBUG
+  "",
+#endif
 };

 // Which regexp engine to use? Needed for vim_regcomp().
--- a/test/old/testdir/test_regexp_utf8.vim
+++ b/test/old/testdir/test_regexp_utf8.vim
@ -599,5 +599,16 @@ func Test_match_too_complicated()
  set regexpengine=0
 endfunc

+func Test_combining_chars_in_collection()
+  new
+  for i in range(0,2)
+    exe "set re=".i
+    put =['ɔ̃', 'ɔ',  '̃  ã', 'abcd']
+    :%s/[ɔ̃]//
+    call assert_equal(['', '', 'ɔ', '̃  ã', 'abcd'], getline(1,'$'))
+    %d
+  endfor
+  bw!
+endfunc

 " vim: shiftwidth=2 sts=2 expandtab