From 23290e7676e6f0a5cb5d9dc9fa1933df815aed33 Mon Sep 17 00:00:00 2001 From: bfredl Date: Sun, 29 Sep 2024 10:05:27 +0200 Subject: [PATCH] feat(editor): handle new multibyte sequences in normal mode replacement while the implementation is not tied to screen chars, it is a reasonable expectation to support the same size. If nvim is able to display a multibyte character, it will accept the same character as input, including in normal mode commands like r{char} --- src/nvim/grid_defs.h | 4 -- src/nvim/mbyte.c | 10 +++- src/nvim/normal.c | 55 ++++++++++++--------- src/nvim/normal_defs.h | 5 +- src/nvim/search.c | 15 +++--- src/nvim/types_defs.h | 4 ++ test/functional/editor/mode_normal_spec.lua | 20 ++++++++ 7 files changed, 73 insertions(+), 40 deletions(-) diff --git a/src/nvim/grid_defs.h b/src/nvim/grid_defs.h index 19a79ff810..8fa3092fd6 100644 --- a/src/nvim/grid_defs.h +++ b/src/nvim/grid_defs.h @@ -7,10 +7,6 @@ #include "nvim/pos_defs.h" #include "nvim/types_defs.h" -// Includes final NUL. MAX_MCO is no longer used, but at least 4*(MAX_MCO+1)+1=29 -// ensures we can fit all composed chars which did fit before. 
-#define MAX_SCHAR_SIZE 32 - enum { kZIndexDefaultGrid = 0, kZIndexFloatDefault = 50, diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c index 6340ff8c94..65f718f925 100644 --- a/src/nvim/mbyte.c +++ b/src/nvim/mbyte.c @@ -839,6 +839,13 @@ bool utf_composinglike(const char *p1, const char *p2, GraphemeState *state) return arabic_combine(first, second); } +/// same as utf_composinglike but operating on UCS-4 values +bool utf_iscomposing(int c1, int c2, GraphemeState *state) +{ + return (!utf8proc_grapheme_break_stateful(c1, c2, state) + || arabic_combine(c1, c2)); +} + /// Get the screen char at the beginning of a string /// /// Caller is expected to check for things like unprintable chars etc @@ -1852,8 +1859,7 @@ StrCharInfo utfc_next_impl(StrCharInfo cur) while (true) { uint8_t const next_len = utf8len_tab[*next]; int32_t const next_code = utf_ptr2CharInfo_impl(next, (uintptr_t)next_len); - if (utf8proc_grapheme_break_stateful(prev_code, next_code, &state) - && !arabic_combine(prev_code, next_code)) { + if (!utf_iscomposing(prev_code, next_code, &state)) { return (StrCharInfo){ .ptr = (char *)next, .chr = (CharInfo){ .value = next_code, .len = (next_code < 0 ? 1 : next_len) }, diff --git a/src/nvim/normal.c b/src/nvim/normal.c index be9987cc7f..aa247e39e6 100644 --- a/src/nvim/normal.c +++ b/src/nvim/normal.c @@ -835,21 +835,29 @@ static void normal_get_additional_char(NormalState *s) // because if it's put back with vungetc() it's too late to apply // mapping. no_mapping--; + GraphemeState state = GRAPHEME_STATE_INIT; + int prev_code = s->ca.nchar; + while ((s->c = vpeekc()) > 0 && (s->c >= 0x100 || MB_BYTE2LEN(vpeekc()) > 1)) { s->c = plain_vgetc(); - // TODO(bfredl): only allowing up to two composing chars is cringe af. 
- // Could reuse/abuse schar_T to at least allow us to input anything we are able - // to display and use the stateful utf8proc algorithm like utf_composinglike - if (!utf_iscomposing_legacy(s->c)) { + + if (!utf_iscomposing(prev_code, s->c, &state)) { vungetc(s->c); // it wasn't, put it back break; - } else if (s->ca.ncharC1 == 0) { - s->ca.ncharC1 = s->c; - } else { - s->ca.ncharC2 = s->c; } + + // first composing char, first put base char into buffer + if (s->ca.nchar_len == 0) { + s->ca.nchar_len = utf_char2bytes(s->ca.nchar, s->ca.nchar_composing); + } + + if (s->ca.nchar_len + utf_char2len(s->c) < (int)sizeof(s->ca.nchar_composing)) { + s->ca.nchar_len += utf_char2bytes(s->c, s->ca.nchar_composing + s->ca.nchar_len); + } + prev_code = s->c; } + s->ca.nchar_composing[s->ca.nchar_len] = NUL; no_mapping++; // Vim may be in a different mode when the user types the next key, // but when replaying a recording the next key is already in the @@ -1735,7 +1743,12 @@ size_t find_ident_at_pos(win_T *wp, linenr_T lnum, colnr_T startcol, char **text static void prep_redo_cmd(cmdarg_T *cap) { prep_redo(cap->oap->regname, cap->count0, - NUL, cap->cmdchar, NUL, NUL, cap->nchar); + NUL, cap->cmdchar, NUL, NUL, NUL); + if (cap->nchar_len > 0) { + AppendToRedobuff(cap->nchar_composing); + } else { + AppendCharToRedobuff(cap->nchar); + } } /// Prepare for redo of any command. @@ -4548,17 +4561,15 @@ static void nv_replace(cmdarg_T *cap) // Give 'r' to edit(), to get the redo command right. 
invoke_edit(cap, true, 'r', false); } else { - prep_redo(cap->oap->regname, cap->count1, - NUL, 'r', NUL, had_ctrl_v, cap->nchar); + prep_redo(cap->oap->regname, cap->count1, NUL, 'r', NUL, had_ctrl_v, 0); curbuf->b_op_start = curwin->w_cursor; const int old_State = State; - if (cap->ncharC1 != 0) { - AppendCharToRedobuff(cap->ncharC1); - } - if (cap->ncharC2 != 0) { - AppendCharToRedobuff(cap->ncharC2); + if (cap->nchar_len > 0) { + AppendToRedobuff(cap->nchar_composing); + } else { + AppendCharToRedobuff(cap->nchar); } // This is slow, but it handles replacing a single-byte with a @@ -4576,15 +4587,13 @@ static void nv_replace(cmdarg_T *cap) curwin->w_cursor.col++; } } else { - ins_char(cap->nchar); + if (cap->nchar_len) { + ins_char_bytes(cap->nchar_composing, (size_t)cap->nchar_len); + } else { + ins_char(cap->nchar); + } } State = old_State; - if (cap->ncharC1 != 0) { - ins_char(cap->ncharC1); - } - if (cap->ncharC2 != 0) { - ins_char(cap->ncharC2); - } } curwin->w_cursor.col--; // cursor on the last replaced char // if the character on the left of the current cursor is a multi-byte diff --git a/src/nvim/normal_defs.h b/src/nvim/normal_defs.h index 0309f6bc80..7b49b28a0f 100644 --- a/src/nvim/normal_defs.h +++ b/src/nvim/normal_defs.h @@ -3,6 +3,7 @@ #include #include "nvim/pos_defs.h" +#include "nvim/types_defs.h" /// Motion types, used for operators and for yank/delete registers. 
/// @@ -47,8 +48,8 @@ typedef struct { int prechar; ///< prefix character (optional, always 'g') int cmdchar; ///< command character int nchar; ///< next command character (optional) - int ncharC1; ///< first composing character (optional) - int ncharC2; ///< second composing character (optional) + char nchar_composing[MAX_SCHAR_SIZE]; ///< next char with composing chars (optional) + int nchar_len; ///< len of nchar_composing (when zero, use nchar instead) int extra_char; ///< yet another character (optional) int opcount; ///< count before an operator int count0; ///< count before command, default 0 diff --git a/src/nvim/search.c b/src/nvim/search.c index 2a935f6878..5d3d3db3fe 100644 --- a/src/nvim/search.c +++ b/src/nvim/search.c @@ -113,7 +113,7 @@ static int last_idx = 0; // index in spats[] for RE_LAST static uint8_t lastc[2] = { NUL, NUL }; // last character searched for static Direction lastcdir = FORWARD; // last direction of character search static bool last_t_cmd = true; // last search t_cmd -static char lastc_bytes[MB_MAXBYTES + 1]; +static char lastc_bytes[MAX_SCHAR_SIZE + 1]; static int lastc_bytelen = 1; // >1 for multi-byte char // copy of spats[], for keeping the search patterns while executing autocmds @@ -1550,14 +1550,11 @@ int searchc(cmdarg_T *cap, bool t_cmd) *lastc = (uint8_t)c; set_csearch_direction(dir); set_csearch_until(t_cmd); - lastc_bytelen = utf_char2bytes(c, lastc_bytes); - if (cap->ncharC1 != 0) { - lastc_bytelen += utf_char2bytes(cap->ncharC1, - lastc_bytes + lastc_bytelen); - if (cap->ncharC2 != 0) { - lastc_bytelen += utf_char2bytes(cap->ncharC2, - lastc_bytes + lastc_bytelen); - } + if (cap->nchar_len) { + lastc_bytelen = cap->nchar_len; + memcpy(lastc_bytes, cap->nchar_composing, (size_t)cap->nchar_len); + } else { + lastc_bytelen = utf_char2bytes(c, lastc_bytes); } } } else { // repeat previous search diff --git a/src/nvim/types_defs.h b/src/nvim/types_defs.h index 2dd2b01adf..bec0950653 100644 --- a/src/nvim/types_defs.h +++ 
b/src/nvim/types_defs.h @@ -12,6 +12,10 @@ typedef int32_t sattr_T; // must be at least as big as the biggest of schar_T, sattr_T, colnr_T typedef int32_t sscratch_T; +// Includes final NUL. MAX_MCO is no longer used, but at least 4*(MAX_MCO+1)+1=29 +// ensures we can fit all composed chars which did fit before. +#define MAX_SCHAR_SIZE 32 + // Opaque handle used by API clients to refer to various objects in vim typedef int handle_T; diff --git a/test/functional/editor/mode_normal_spec.lua b/test/functional/editor/mode_normal_spec.lua index b3ef4866dc..cca244e06c 100644 --- a/test/functional/editor/mode_normal_spec.lua +++ b/test/functional/editor/mode_normal_spec.lua @@ -9,6 +9,7 @@ local feed = n.feed local fn = n.fn local command = n.command local eq = t.eq +local api = n.api describe('Normal mode', function() before_each(clear) @@ -41,4 +42,23 @@ describe('Normal mode', function() attr_ids = {}, }) end) + + it('replacing with ZWJ emoji sequences', function() + local screen = Screen.new(30, 8) + screen:attach() + api.nvim_buf_set_lines(0, 0, -1, true, { 'abcdefg' }) + feed('05r🧑‍🌾') -- ZWJ + screen:expect([[ + 🧑‍🌾🧑‍🌾🧑‍🌾🧑‍🌾^🧑‍🌾fg | + {1:~ }|*6 + | + ]]) + + feed('2r🏳️‍⚧️') -- ZWJ and variant selectors + screen:expect([[ + 🧑‍🌾🧑‍🌾🧑‍🌾🧑‍🌾🏳️‍⚧️^🏳️‍⚧️g | + {1:~ }|*6 + | + ]]) + end) end)