From 3bcf8e5622f29a65564bf2f7672997432e183dfa Mon Sep 17 00:00:00 2001
From: zeertzjq <zeertzjq@outlook.com>
Date: Fri, 12 Jan 2024 14:09:10 +0800
Subject: [PATCH] vim-patch:9.1.0011: regexp cannot match combining chars in
 collection (#26992)

Problem:  regexp cannot match combining chars in collection
Solution: Check for combining characters in regex collections for the
          NFA and BT Regex Engine

Also, while at it, make debug mode work again.

fixes vim/vim#10286
closes: vim/vim#12871

https://github.com/vim/vim/commit/d2cc51f9a1a5a30ef5d2e732f49d7f495cae24cf

Co-authored-by: Christian Brabandt <cb@256bit.org>
---
 src/nvim/regexp.c                     | 136 ++++++++++++++++++++++++--
 test/old/testdir/test_regexp_utf8.vim |  11 +++
 2 files changed, 141 insertions(+), 6 deletions(-)

diff --git a/src/nvim/regexp.c b/src/nvim/regexp.c
index eff31e4d33..74f3b3452b 100644
--- a/src/nvim/regexp.c
+++ b/src/nvim/regexp.c
@@ -140,7 +140,9 @@ struct regengine {
   int (*regexec_nl)(regmatch_T *, uint8_t *, colnr_T, bool);
   /// bt_regexec_mult or nfa_regexec_mult
   int (*regexec_multi)(regmmatch_T *, win_T *, buf_T *, linenr_T, colnr_T, proftime_T *, int *);
-  // uint8_t *expr;
+#ifdef REGEXP_DEBUG
+  uint8_t *expr;
+#endif
 };
 
 // Structure used to save the current input state, when it needs to be
@@ -6426,15 +6428,33 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out)
         break;
 
         case ANYOF:
-        case ANYBUT:
+        case ANYBUT: {
+          uint8_t *q = OPERAND(scan);
+
           if (c == NUL) {
             status = RA_NOMATCH;
-          } else if ((cstrchr((char *)OPERAND(scan), c) == NULL) == (op == ANYOF)) {
+          } else if ((cstrchr((char *)q, c) == NULL) == (op == ANYOF)) {
             status = RA_NOMATCH;
-          } else {
-            ADVANCE_REGINPUT();
+          } else {  // Check following combining characters
+            int len = utfc_ptr2len((char *)q) - utf_ptr2len((char *)q);
+
+            rex.input += utf_ptr2len((char *)rex.input);
+            q += utf_ptr2len((char *)q);
+
+            if (len == 0) {
+              break;
+            }
+
+            for (int i = 0; i < len; i++) {
+              if (q[i] != rex.input[i]) {
+                status = RA_NOMATCH;
+                break;
+              }
+            }
+            rex.input += len;
           }
           break;
+        }
 
         case MULTIBYTECODE: {
           int i, len;
@@ -10448,13 +10468,39 @@ collection:
           } else {
             if (got_coll_char == true && startc == 0) {
               EMIT(0x0a);
+              EMIT(NFA_CONCAT);
             } else {
               EMIT(startc);
+              if (utf_ptr2len(regparse) == utfc_ptr2len(regparse)) {
+                EMIT(NFA_CONCAT);
+              }
             }
-            EMIT(NFA_CONCAT);
           }
         }
 
+        int plen;
+        if (utf_ptr2len(regparse) != (plen = utfc_ptr2len(regparse))) {
+          int i = utf_ptr2len(regparse);
+
+          c = utf_ptr2char(regparse + i);
+
+          // Add composing characters
+          while (true) {
+            if (c == 0) {
+              // \x00 is translated to \x0a, start at \x01.
+              EMIT(1);
+            } else {
+              EMIT(c);
+            }
+            EMIT(NFA_CONCAT);
+            if ((i += utf_char2len(c)) >= plen) {
+              break;
+            }
+            c = utf_ptr2char(regparse + i);
+          }
+          EMIT(NFA_COMPOSING);
+          EMIT(NFA_CONCAT);
+        }
         MB_PTR_ADV(regparse);
       }           // while (p < endp)
 
@@ -14503,6 +14549,78 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm
         state = t->state->out;
         result_if_matched = (t->state->c == NFA_START_COLL);
         while (true) {
+          if (state->c == NFA_COMPOSING) {
+            int mc = curc;
+            int len = 0;
+            nfa_state_T *end;
+            nfa_state_T *sta;
+            int cchars[MAX_MCO];
+            int ccount = 0;
+            int j;
+
+            sta = t->state->out->out;
+            if (utf_iscomposing(sta->c)) {
+              // Only match composing character(s), ignore base
+              // character.  Used for ".{composing}" and "{composing}"
+              // (no preceding character).
+              len += utf_char2len(mc);
+            }
+            if (rex.reg_icombine && len == 0) {
+              // If \Z was present, then ignore composing characters.
+              // When ignoring the base character this always matches.
+              if (sta->c != curc) {
+                result = FAIL;
+              } else {
+                result = OK;
+              }
+              while (sta->c != NFA_END_COMPOSING) {
+                sta = sta->out;
+              }
+            }
+            // Check base character matches first, unless ignored.
+            else if (len > 0 || mc == sta->c) {
+              if (len == 0) {
+                len += utf_char2len(mc);
+                sta = sta->out;
+              }
+
+              // We don't care about the order of composing characters.
+              // Get them into cchars[] first.
+              while (len < clen) {
+                mc = utf_ptr2char((char *)rex.input + len);
+                cchars[ccount++] = mc;
+                len += utf_char2len(mc);
+                if (ccount == MAX_MCO) {
+                  break;
+                }
+              }
+
+              // Check that each composing char in the pattern matches a
+              // composing char in the text.  We do not check if all
+              // composing chars are matched.
+              result = OK;
+              while (sta->c != NFA_END_COMPOSING) {
+                for (j = 0; j < ccount; j++) {
+                  if (cchars[j] == sta->c) {
+                    break;
+                  }
+                }
+                if (j == ccount) {
+                  result = FAIL;
+                  break;
+                }
+                sta = sta->out;
+              }
+            } else {
+              result = FAIL;
+            }
+
+            if (t->state->out->out1->c == NFA_END_COMPOSING) {
+              end = t->state->out->out1;
+              ADD_STATE_IF_MATCH(end);
+            }
+            break;
+          }
           if (state->c == NFA_END_COLL) {
             result = !result_if_matched;
             break;
@@ -15645,6 +15763,9 @@ static regengine_T bt_regengine = {
   bt_regfree,
   bt_regexec_nl,
   bt_regexec_multi,
+#ifdef REGEXP_DEBUG
+  "",
+#endif
 };
 
 static regengine_T nfa_regengine = {
@@ -15652,6 +15773,9 @@ static regengine_T nfa_regengine = {
   nfa_regfree,
   nfa_regexec_nl,
   nfa_regexec_multi,
+#ifdef REGEXP_DEBUG
+  "",
+#endif
 };
 
 // Which regexp engine to use? Needed for vim_regcomp().
diff --git a/test/old/testdir/test_regexp_utf8.vim b/test/old/testdir/test_regexp_utf8.vim
index 2253242a7c..97f48a0c09 100644
--- a/test/old/testdir/test_regexp_utf8.vim
+++ b/test/old/testdir/test_regexp_utf8.vim
@@ -599,5 +599,16 @@ func Test_match_too_complicated()
   set regexpengine=0
 endfunc
 
+func Test_combining_chars_in_collection()
+  new
+  for i in range(0,2)
+    exe "set re=".i
+    put =['ɔ̃', 'ɔ',  '̃  ã', 'abcd']
+    :%s/[ɔ̃]//
+    call assert_equal(['', '', 'ɔ', '̃  ã', 'abcd'], getline(1,'$'))
+    %d
+  endfor
+  bw!
+endfunc
 
 " vim: shiftwidth=2 sts=2 expandtab