From 23290e7676e6f0a5cb5d9dc9fa1933df815aed33 Mon Sep 17 00:00:00 2001
From: bfredl <bjorn.linse@gmail.com>
Date: Sun, 29 Sep 2024 10:05:27 +0200
Subject: [PATCH] feat(editor): handle new multibyte sequences in normal mode
 replacement

While the implementation is not tied to screen chars, it is reasonable
to expect it to support the same maximum size: if nvim is able to
display a multibyte character, it will accept the same character as
input, including in normal mode commands like r{char}.
---
 src/nvim/grid_defs.h                        |  4 --
 src/nvim/mbyte.c                            | 10 +++-
 src/nvim/normal.c                           | 55 ++++++++++++---------
 src/nvim/normal_defs.h                      |  5 +-
 src/nvim/search.c                           | 15 +++---
 src/nvim/types_defs.h                       |  4 ++
 test/functional/editor/mode_normal_spec.lua | 20 ++++++++
 7 files changed, 73 insertions(+), 40 deletions(-)

diff --git a/src/nvim/grid_defs.h b/src/nvim/grid_defs.h
index 19a79ff810..8fa3092fd6 100644
--- a/src/nvim/grid_defs.h
+++ b/src/nvim/grid_defs.h
@@ -7,10 +7,6 @@
 #include "nvim/pos_defs.h"
 #include "nvim/types_defs.h"
 
-// Includes final NUL. MAX_MCO is no longer used, but at least 4*(MAX_MCO+1)+1=29
-// ensures we can fit all composed chars which did fit before.
-#define MAX_SCHAR_SIZE 32
-
 enum {
   kZIndexDefaultGrid = 0,
   kZIndexFloatDefault = 50,
diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c
index 6340ff8c94..65f718f925 100644
--- a/src/nvim/mbyte.c
+++ b/src/nvim/mbyte.c
@@ -839,6 +839,13 @@ bool utf_composinglike(const char *p1, const char *p2, GraphemeState *state)
   return arabic_combine(first, second);
 }
 
+/// Same as utf_composinglike(), but operating on UCS-4 values instead of string pointers.
+bool utf_iscomposing(int c1, int c2, GraphemeState *state)
+{
+  return (!utf8proc_grapheme_break_stateful(c1, c2, state)
+          || arabic_combine(c1, c2));
+}
+
 /// Get the screen char at the beginning of a string
 ///
 /// Caller is expected to check for things like unprintable chars etc
@@ -1852,8 +1859,7 @@ StrCharInfo utfc_next_impl(StrCharInfo cur)
   while (true) {
     uint8_t const next_len = utf8len_tab[*next];
     int32_t const next_code = utf_ptr2CharInfo_impl(next, (uintptr_t)next_len);
-    if (utf8proc_grapheme_break_stateful(prev_code, next_code, &state)
-        && !arabic_combine(prev_code, next_code)) {
+    if (!utf_iscomposing(prev_code, next_code, &state)) {
       return (StrCharInfo){
         .ptr = (char *)next,
         .chr = (CharInfo){ .value = next_code, .len = (next_code < 0 ? 1 : next_len) },
diff --git a/src/nvim/normal.c b/src/nvim/normal.c
index be9987cc7f..aa247e39e6 100644
--- a/src/nvim/normal.c
+++ b/src/nvim/normal.c
@@ -835,21 +835,29 @@ static void normal_get_additional_char(NormalState *s)
       // because if it's put back with vungetc() it's too late to apply
       // mapping.
       no_mapping--;
+      GraphemeState state = GRAPHEME_STATE_INIT;
+      int prev_code = s->ca.nchar;
+
       while ((s->c = vpeekc()) > 0
              && (s->c >= 0x100 || MB_BYTE2LEN(vpeekc()) > 1)) {
         s->c = plain_vgetc();
-        // TODO(bfredl): only allowing up to two composing chars is cringe af.
-        // Could reuse/abuse schar_T to at least allow us to input anything we are able
-        // to display and use the stateful utf8proc algorithm like utf_composinglike
-        if (!utf_iscomposing_legacy(s->c)) {
+
+        if (!utf_iscomposing(prev_code, s->c, &state)) {
           vungetc(s->c);                   // it wasn't, put it back
           break;
-        } else if (s->ca.ncharC1 == 0) {
-          s->ca.ncharC1 = s->c;
-        } else {
-          s->ca.ncharC2 = s->c;
         }
+
+        // On the first composing char, put the base char into the buffer first.
+        if (s->ca.nchar_len == 0) {
+          s->ca.nchar_len = utf_char2bytes(s->ca.nchar, s->ca.nchar_composing);
+        }
+
+        if (s->ca.nchar_len + utf_char2len(s->c) < (int)sizeof(s->ca.nchar_composing)) {
+          s->ca.nchar_len += utf_char2bytes(s->c, s->ca.nchar_composing + s->ca.nchar_len);
+        }
+        prev_code = s->c;
       }
+      s->ca.nchar_composing[s->ca.nchar_len] = NUL;
       no_mapping++;
       // Vim may be in a different mode when the user types the next key,
       // but when replaying a recording the next key is already in the
@@ -1735,7 +1743,12 @@ size_t find_ident_at_pos(win_T *wp, linenr_T lnum, colnr_T startcol, char **text
 static void prep_redo_cmd(cmdarg_T *cap)
 {
   prep_redo(cap->oap->regname, cap->count0,
-            NUL, cap->cmdchar, NUL, NUL, cap->nchar);
+            NUL, cap->cmdchar, NUL, NUL, NUL);
+  if (cap->nchar_len > 0) {
+    AppendToRedobuff(cap->nchar_composing);
+  } else {
+    AppendCharToRedobuff(cap->nchar);
+  }
 }
 
 /// Prepare for redo of any command.
@@ -4548,17 +4561,15 @@ static void nv_replace(cmdarg_T *cap)
     // Give 'r' to edit(), to get the redo command right.
     invoke_edit(cap, true, 'r', false);
   } else {
-    prep_redo(cap->oap->regname, cap->count1,
-              NUL, 'r', NUL, had_ctrl_v, cap->nchar);
+    prep_redo(cap->oap->regname, cap->count1, NUL, 'r', NUL, had_ctrl_v, 0);
 
     curbuf->b_op_start = curwin->w_cursor;
     const int old_State = State;
 
-    if (cap->ncharC1 != 0) {
-      AppendCharToRedobuff(cap->ncharC1);
-    }
-    if (cap->ncharC2 != 0) {
-      AppendCharToRedobuff(cap->ncharC2);
+    if (cap->nchar_len > 0) {
+      AppendToRedobuff(cap->nchar_composing);
+    } else {
+      AppendCharToRedobuff(cap->nchar);
     }
 
     // This is slow, but it handles replacing a single-byte with a
@@ -4576,15 +4587,13 @@ static void nv_replace(cmdarg_T *cap)
           curwin->w_cursor.col++;
         }
       } else {
-        ins_char(cap->nchar);
+        if (cap->nchar_len) {
+          ins_char_bytes(cap->nchar_composing, (size_t)cap->nchar_len);
+        } else {
+          ins_char(cap->nchar);
+        }
       }
       State = old_State;
-      if (cap->ncharC1 != 0) {
-        ins_char(cap->ncharC1);
-      }
-      if (cap->ncharC2 != 0) {
-        ins_char(cap->ncharC2);
-      }
     }
     curwin->w_cursor.col--;         // cursor on the last replaced char
     // if the character on the left of the current cursor is a multi-byte
diff --git a/src/nvim/normal_defs.h b/src/nvim/normal_defs.h
index 0309f6bc80..7b49b28a0f 100644
--- a/src/nvim/normal_defs.h
+++ b/src/nvim/normal_defs.h
@@ -3,6 +3,7 @@
 #include <stdbool.h>
 
 #include "nvim/pos_defs.h"
+#include "nvim/types_defs.h"
 
 /// Motion types, used for operators and for yank/delete registers.
 ///
@@ -47,8 +48,8 @@ typedef struct {
   int prechar;      ///< prefix character (optional, always 'g')
   int cmdchar;      ///< command character
   int nchar;        ///< next command character (optional)
-  int ncharC1;      ///< first composing character (optional)
-  int ncharC2;      ///< second composing character (optional)
+  char nchar_composing[MAX_SCHAR_SIZE];  ///< next char with composing chars (optional)
+  int nchar_len;    ///< len of nchar_composing (when zero, use nchar instead)
   int extra_char;   ///< yet another character (optional)
   int opcount;      ///< count before an operator
   int count0;       ///< count before command, default 0
diff --git a/src/nvim/search.c b/src/nvim/search.c
index 2a935f6878..5d3d3db3fe 100644
--- a/src/nvim/search.c
+++ b/src/nvim/search.c
@@ -113,7 +113,7 @@ static int last_idx = 0;        // index in spats[] for RE_LAST
 static uint8_t lastc[2] = { NUL, NUL };   // last character searched for
 static Direction lastcdir = FORWARD;      // last direction of character search
 static bool last_t_cmd = true;            // last search t_cmd
-static char lastc_bytes[MB_MAXBYTES + 1];
+static char lastc_bytes[MAX_SCHAR_SIZE + 1];
 static int lastc_bytelen = 1;             // >1 for multi-byte char
 
 // copy of spats[], for keeping the search patterns while executing autocmds
@@ -1550,14 +1550,11 @@ int searchc(cmdarg_T *cap, bool t_cmd)
       *lastc = (uint8_t)c;
       set_csearch_direction(dir);
       set_csearch_until(t_cmd);
-      lastc_bytelen = utf_char2bytes(c, lastc_bytes);
-      if (cap->ncharC1 != 0) {
-        lastc_bytelen += utf_char2bytes(cap->ncharC1,
-                                        lastc_bytes + lastc_bytelen);
-        if (cap->ncharC2 != 0) {
-          lastc_bytelen += utf_char2bytes(cap->ncharC2,
-                                          lastc_bytes + lastc_bytelen);
-        }
+      if (cap->nchar_len) {
+        lastc_bytelen = cap->nchar_len;
+        memcpy(lastc_bytes, cap->nchar_composing, (size_t)cap->nchar_len);
+      } else {
+        lastc_bytelen = utf_char2bytes(c, lastc_bytes);
       }
     }
   } else {            // repeat previous search
diff --git a/src/nvim/types_defs.h b/src/nvim/types_defs.h
index 2dd2b01adf..bec0950653 100644
--- a/src/nvim/types_defs.h
+++ b/src/nvim/types_defs.h
@@ -12,6 +12,10 @@ typedef int32_t sattr_T;
 // must be at least as big as the biggest of schar_T, sattr_T, colnr_T
 typedef int32_t sscratch_T;
 
+// Includes final NUL. MAX_MCO is no longer used, but at least 4*(MAX_MCO+1)+1=29
+// ensures we can fit all composed chars which did fit before.
+#define MAX_SCHAR_SIZE 32
+
 // Opaque handle used by API clients to refer to various objects in vim
 typedef int handle_T;
 
diff --git a/test/functional/editor/mode_normal_spec.lua b/test/functional/editor/mode_normal_spec.lua
index b3ef4866dc..cca244e06c 100644
--- a/test/functional/editor/mode_normal_spec.lua
+++ b/test/functional/editor/mode_normal_spec.lua
@@ -9,6 +9,7 @@ local feed = n.feed
 local fn = n.fn
 local command = n.command
 local eq = t.eq
+local api = n.api
 
 describe('Normal mode', function()
   before_each(clear)
@@ -41,4 +42,23 @@ describe('Normal mode', function()
       attr_ids = {},
     })
   end)
+
+  it('replacing with ZWJ emoji sequences', function()
+    local screen = Screen.new(30, 8)
+    screen:attach()
+    api.nvim_buf_set_lines(0, 0, -1, true, { 'abcdefg' })
+    feed('05r🧑‍🌾') -- ZWJ
+    screen:expect([[
+      🧑‍🌾🧑‍🌾🧑‍🌾🧑‍🌾^🧑‍🌾fg                  |
+      {1:~                             }|*6
+                                    |
+    ]])
+
+    feed('2r🏳️‍⚧️') -- ZWJ and variant selectors
+    screen:expect([[
+      🧑‍🌾🧑‍🌾🧑‍🌾🧑‍🌾🏳️‍⚧️^🏳️‍⚧️g                 |
+      {1:~                             }|*6
+                                    |
+    ]])
+  end)
 end)