From c0993ed3433ef4111a39e59642d15b15261e8b68 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Linse?= <bjorn.linse@gmail.com>
Date: Sun, 4 Aug 2019 12:22:22 +0200
Subject: [PATCH] lua: support getting UTF-32 and UTF-16 sizes of replaced text

---
 runtime/doc/api.txt                         |  9 ++-
 src/nvim/api/buffer.c                       | 11 ++-
 src/nvim/buffer_defs.h                      | 17 ++++-
 src/nvim/buffer_updates.c                   | 16 ++++-
 src/nvim/fileio.c                           |  2 +
 src/nvim/globals.h                          |  2 +
 src/nvim/mbyte.c                            | 33 +++++++++
 src/nvim/memline.c                          | 48 +++++++++----
 src/nvim/misc1.c                            |  4 +-
 test/functional/lua/buffer_updates_spec.lua | 80 +++++++++++++++++++--
 10 files changed, 192 insertions(+), 30 deletions(-)

diff --git a/runtime/doc/api.txt b/runtime/doc/api.txt
index f4366cb1af..2c6b053994 100644
--- a/runtime/doc/api.txt
+++ b/runtime/doc/api.txt
@@ -208,14 +208,17 @@ they are allowed.
 
 |nvim_buf_attach| will take keyword args for the callbacks. "on_lines" will
 receive parameters ("lines", {buf}, {changedtick}, {firstline}, {lastline},
-{new_lastline}, {old_bytecount}).
+{new_lastline}, {old_byte_size}[, {old_utf32_size}, {old_utf16_size}]).
 Unlike remote channel events the text contents are not passed. The new text can
 be accessed inside the callback as
 
     `vim.api.nvim_buf_get_lines(buf, firstline, new_lastline, true)`
 
-{old_bytecount} is the total size of the replaced region {firstline} to
-{lastline} in bytes, including the final newline after {lastline}.
+{old_byte_size} is the total size of the replaced region {firstline} to
+{lastline} in bytes, including the final newline after {lastline}. If
+`utf_sizes` is set to true in |nvim_buf_attach()| keyword args, then the
+UTF-32 and UTF-16 sizes of the deleted region are also passed as additional
+arguments {old_utf32_size} and {old_utf16_size}.
 
 "on_changedtick" is invoked when |b:changedtick| was incremented but no text
 was changed. The parameters recieved are ("changedtick", {buf}, {changedtick}).
diff --git a/src/nvim/api/buffer.c b/src/nvim/api/buffer.c
index 497b4ae9a4..c6f82e9d85 100644
--- a/src/nvim/api/buffer.c
+++ b/src/nvim/api/buffer.c
@@ -109,9 +109,11 @@ String buffer_get_line(Buffer buffer, Integer index, Error *err)
 ///        `nvim_buf_lines_event`. Otherwise, the first notification will be
 ///        a `nvim_buf_changedtick_event`. Not used for lua callbacks.
 /// @param  opts  Optional parameters.
-///               `on_lines`: lua callback received on change.
+///               `on_lines`:       lua callback received on change.
 ///               `on_changedtick`: lua callback received on changedtick
 ///                                 increment without text change.
+///               `utf_sizes`:      include UTF-32 and UTF-16 size of
+///                                 the replaced region.
 ///               See |api-buffer-updates-lua| for more information
 /// @param[out] err Error details, if any
 /// @return False when updates couldn't be enabled because the buffer isn't
@@ -156,6 +158,12 @@ Boolean nvim_buf_attach(uint64_t channel_id,
       }
       cb.on_detach = v->data.luaref;
       v->data.integer = LUA_NOREF;
+    } else if (is_lua && strequal("utf_sizes", k.data)) {
+      if (v->type != kObjectTypeBoolean) {
+        api_set_error(err, kErrorTypeValidation, "utf_sizes must be boolean");
+        goto error;
+      }
+      cb.utf_sizes = v->data.boolean;
     } else {
       api_set_error(err, kErrorTypeValidation, "unexpected key: %s", k.data);
       goto error;
@@ -1196,6 +1204,7 @@ Dictionary nvim__buf_stats(Buffer buffer, Error *err)
   // NB: this should be zero at any time API functions are called,
   // this exists to debug issues
   PUT(rv, "dirty_bytes", INTEGER_OBJ((Integer)buf->deleted_bytes));
+
   return rv;
 }
 
diff --git a/src/nvim/buffer_defs.h b/src/nvim/buffer_defs.h
index eb26e4ad8e..b11eaefdd0 100644
--- a/src/nvim/buffer_defs.h
+++ b/src/nvim/buffer_defs.h
@@ -459,8 +459,9 @@ typedef struct {
   LuaRef on_lines;
   LuaRef on_changedtick;
   LuaRef on_detach;
+  bool utf_sizes;
 } BufUpdateCallbacks;
-#define BUF_UPDATE_CALLBACKS_INIT { LUA_NOREF, LUA_NOREF, LUA_NOREF }
+#define BUF_UPDATE_CALLBACKS_INIT { LUA_NOREF, LUA_NOREF, LUA_NOREF, false }
 
 #define BUF_HAS_QF_ENTRY 1
 #define BUF_HAS_LL_ENTRY 2
@@ -802,12 +803,24 @@ struct file_buffer {
 
   kvec_t(BufhlLine *) b_bufhl_move_space;  // temporary space for highlights
 
-  // array of channelids which have asked to receive updates for this
+  // array of channel_id:s which have asked to receive updates for this
   // buffer.
   kvec_t(uint64_t) update_channels;
+  // array of lua callbacks for buffer updates.
   kvec_t(BufUpdateCallbacks) update_callbacks;
 
+  // whether an update callback has requested codepoint size of deleted regions.
+  bool update_need_codepoints;
+
+  // Measurements of the deleted or replaced region since the last update
+  // event. Some consumers of buffer changes need to know the byte size (like
+  // tree-sitter) or the corresponding UTF-32/UTF-16 size (like LSP) of the
+  // deleted text.
   size_t deleted_bytes;
+  size_t deleted_codepoints;
+  size_t deleted_codeunits;
+
+  // The number of times the current line has been flushed in the memline.
   int flush_count;
 
   int b_diff_failed;    // internal diff failed for this buffer
diff --git a/src/nvim/buffer_updates.c b/src/nvim/buffer_updates.c
index 7dea8bfac5..3604578b50 100644
--- a/src/nvim/buffer_updates.c
+++ b/src/nvim/buffer_updates.c
@@ -26,6 +26,9 @@ bool buf_updates_register(buf_T *buf, uint64_t channel_id,
 
   if (channel_id == LUA_INTERNAL_CALL) {
     kv_push(buf->update_callbacks, cb);
+    if (cb.utf_sizes) {
+      buf->update_need_codepoints = true;
+    }
     return true;
   }
 
@@ -169,7 +172,9 @@ void buf_updates_send_changes(buf_T *buf,
                               int64_t num_removed,
                               bool send_tick)
 {
-  size_t deleted_bytes = ml_flush_deleted_bytes(buf);
+  size_t deleted_codepoints, deleted_codeunits;
+  size_t deleted_bytes = ml_flush_deleted_bytes(buf, &deleted_codepoints,
+                                                &deleted_codeunits);
 
   if (!buf_updates_active(buf)) {
     return;
@@ -233,8 +238,8 @@ void buf_updates_send_changes(buf_T *buf,
     bool keep = true;
     if (cb.on_lines != LUA_NOREF) {
       Array args = ARRAY_DICT_INIT;
-      Object items[6];
-      args.size = 6;
+      Object items[8];
+      args.size = 6;  // may be increased to 8 below
       args.items = items;
 
       // the first argument is always the buffer handle
@@ -254,6 +259,11 @@ void buf_updates_send_changes(buf_T *buf,
 
       // byte count of previous contents
       args.items[5] = INTEGER_OBJ((Integer)deleted_bytes);
+      if (cb.utf_sizes) {
+        args.size = 8;
+        args.items[6] = INTEGER_OBJ((Integer)deleted_codepoints);
+        args.items[7] = INTEGER_OBJ((Integer)deleted_codeunits);
+      }
       textlock++;
       Object res = executor_exec_lua_cb(cb.on_lines, "lines", args, true);
       textlock--;
diff --git a/src/nvim/fileio.c b/src/nvim/fileio.c
index 2232de8c1e..d03b9138d0 100644
--- a/src/nvim/fileio.c
+++ b/src/nvim/fileio.c
@@ -1756,6 +1756,8 @@ failed:
       linecnt--;
     }
     curbuf->deleted_bytes = 0;
+    curbuf->deleted_codepoints = 0;
+    curbuf->deleted_codeunits = 0;
     linecnt = curbuf->b_ml.ml_line_count - linecnt;
     if (filesize == 0)
       linecnt = 0;
diff --git a/src/nvim/globals.h b/src/nvim/globals.h
index de6f59b3f1..4524c4b2c0 100644
--- a/src/nvim/globals.h
+++ b/src/nvim/globals.h
@@ -627,6 +627,8 @@ EXTERN pos_T Insstart_orig;
 EXTERN int orig_line_count INIT(= 0);       /* Line count when "gR" started */
 EXTERN int vr_lines_changed INIT(= 0);      /* #Lines changed by "gR" so far */
 
+// incremented around internal delete/replace, to inhibit counting deleted text
+EXTERN int inhibit_delete_count INIT(= 0);
 
 /*
  * These flags are set based upon 'fileencoding'.
diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c
index e7579399f3..bf8ce46113 100644
--- a/src/nvim/mbyte.c
+++ b/src/nvim/mbyte.c
@@ -1438,6 +1438,39 @@ int utf16_to_utf8(const wchar_t *strw, char **str)
 
 #endif
 
+/// Measure the length of a string in corresponding UTF-32 and UTF-16 units.
+///
+/// Invalid UTF-8 bytes, or embedded surrogates, count as one code point/unit
+/// each.
+///
+/// The out parameters are incremented. This is used to measure the size of
+/// a buffer region consisting of multiple line segments.
+///
+/// @param s the string
+/// @param len maximum length (an earlier NUL terminates)
+/// @param[out] codepoints incremented with UTF-32 code point size
+/// @param[out] codeunits incremented with UTF-16 code unit size
+void mb_utflen(const char_u *s, size_t len, size_t *codepoints,
+               size_t *codeunits)
+  FUNC_ATTR_NONNULL_ALL
+{
+  size_t count = 0, extra = 0;
+  size_t clen;
+  for (size_t i = 0; i < len && s[i] != NUL; i += clen) {
+    clen = utf_ptr2len_len(s+i, len-i);
+    // NB: gets the byte value of invalid sequence bytes.
+    // we only care whether the char fits in the BMP or not
+    int c = (clen > 1) ? utf_ptr2char(s+i) : s[i];
+    count++;
+    if (c > 0xFFFF) {
+      extra++;
+    }
+  }
+  *codepoints += count;
+  *codeunits += count + extra;
+}
+
+
 /*
  * Version of strnicmp() that handles multi-byte characters.
  * Needed for Big5, Shift-JIS and UTF-8 encoding.  Other DBCS encodings can
diff --git a/src/nvim/memline.c b/src/nvim/memline.c
index 0b16f86416..3220c7d9b8 100644
--- a/src/nvim/memline.c
+++ b/src/nvim/memline.c
@@ -2383,6 +2383,23 @@ static int ml_append_int(
   return OK;
 }
 
+void ml_add_deleted_len(char_u *ptr, ssize_t len)
+{
+  if (inhibit_delete_count) {
+    return;
+  }
+  if (len == -1) {
+    len = STRLEN(ptr);
+  }
+  curbuf->deleted_bytes += len+1;
+  if (curbuf->update_need_codepoints) {
+    mb_utflen(ptr, len, &curbuf->deleted_codepoints,
+              &curbuf->deleted_codeunits);
+    curbuf->deleted_codepoints++;  // NL char
+    curbuf->deleted_codeunits++;
+  }
+}
+
 /*
  * Replace line lnum, with buffering, in current buffer.
  *
@@ -2408,19 +2425,17 @@ int ml_replace(linenr_T lnum, char_u *line, bool copy)
   if (copy) {
     line = vim_strsave(line);
   }
-  if (curbuf->b_ml.ml_line_lnum != lnum) {           /* other line buffered */
-    ml_flush_line(curbuf);                          /* flush it */
-  } else if (curbuf->b_ml.ml_flags & ML_LINE_DIRTY) {  /* same line allocated */
-    // TODO FIXME: see other "TODO FIXME"
-    curbuf->deleted_bytes += STRLEN(curbuf->b_ml.ml_line_ptr)+1;
-    xfree(curbuf->b_ml.ml_line_ptr);             /* free it */
-    readlen = false; // already read it.
+  if (curbuf->b_ml.ml_line_lnum != lnum) {  // other line buffered
+    ml_flush_line(curbuf);  // flush it
+  } else if (curbuf->b_ml.ml_flags & ML_LINE_DIRTY) {  // same line allocated
+    ml_add_deleted_len(curbuf->b_ml.ml_line_ptr, -1);
+    readlen = false;  // already added the length
+
+    xfree(curbuf->b_ml.ml_line_ptr);  // free it
   }
 
-  if (readlen) {
-    if (true) { // TODO: buffer updates active
-      curbuf->deleted_bytes += STRLEN(ml_get_buf(curbuf, lnum, false))+1;
-    }
+  if (readlen && kv_size(curbuf->update_callbacks)) {
+    ml_add_deleted_len(ml_get_buf(curbuf, lnum, false), -1);
   }
 
   curbuf->b_ml.ml_line_ptr = line;
@@ -2504,7 +2519,10 @@ static int ml_delete_int(buf_T *buf, linenr_T lnum, bool message)
   else
     line_size = ((dp->db_index[idx - 1]) & DB_INDEX_MASK) - line_start;
 
-  buf->deleted_bytes += line_size;
+  // Line should always have an NL char internally (represented as NUL),
+  // even if 'noeol' is set.
+  assert(line_size >= 1);
+  ml_add_deleted_len((char_u *)dp + line_start, line_size-1);
 
   /*
    * special case: If there is only one line in the data block it becomes empty.
@@ -2690,10 +2708,14 @@ void ml_clearmarked(void)
   return;
 }
 
-size_t ml_flush_deleted_bytes(buf_T *buf)
+size_t ml_flush_deleted_bytes(buf_T *buf, size_t *codepoints, size_t *codeunits)
 {
   size_t ret = buf->deleted_bytes;
+  *codepoints = buf->deleted_codepoints;
+  *codeunits = buf->deleted_codeunits;
   buf->deleted_bytes = 0;
+  buf->deleted_codepoints = 0;
+  buf->deleted_codeunits = 0;
   return ret;
 }
 
diff --git a/src/nvim/misc1.c b/src/nvim/misc1.c
index 112ca6f287..a62fa6d585 100644
--- a/src/nvim/misc1.c
+++ b/src/nvim/misc1.c
@@ -780,6 +780,7 @@ open_line (
     did_append = FALSE;
   }
 
+  inhibit_delete_count++;
   if (newindent
       || did_si
       ) {
@@ -821,6 +822,7 @@ open_line (
       did_si = false;
     }
   }
+  inhibit_delete_count--;
 
   /*
    * In REPLACE mode, for each character in the extra leader, there must be
@@ -1685,7 +1687,7 @@ int del_bytes(colnr_T count, bool fixpos_arg, bool use_delcombine)
   bool was_alloced = ml_line_alloced();     // check if oldp was allocated
   char_u *newp;
   if (was_alloced) {
-    curbuf->deleted_bytes += (size_t)oldlen+1;
+    ml_add_deleted_len(curbuf->b_ml.ml_line_ptr, oldlen);
     newp = oldp;                            // use same allocated memory
   } else {                                  // need to allocate a new line
     newp = xmalloc((size_t)(oldlen + 1 - count));
diff --git a/test/functional/lua/buffer_updates_spec.lua b/test/functional/lua/buffer_updates_spec.lua
index 16c38bc20b..990cb97fec 100644
--- a/test/functional/lua/buffer_updates_spec.lua
+++ b/test/functional/lua/buffer_updates_spec.lua
@@ -13,7 +13,8 @@ local origlines = {"original line 1",
                    "original line 3",
                    "original line 4",
                    "original line 5",
-                   "original line 6"}
+                   "original line 6",
+                   "    indented line"}
 
 describe('lua: buffer event callbacks', function()
   before_each(function()
@@ -21,14 +22,14 @@ describe('lua: buffer event callbacks', function()
     exec_lua([[
       local events = {}
 
-      function test_register(bufnr, id, changedtick)
+      function test_register(bufnr, id, changedtick, utf_sizes)
         local function callback(...)
           table.insert(events, {id, ...})
           if test_unreg == id then
             return true
           end
         end
-        local opts = {on_lines=callback, on_detach=callback}
+        local opts = {on_lines=callback, on_detach=callback, utf_sizes=utf_sizes}
         if changedtick then
           opts.on_changedtick = callback
         end
@@ -48,18 +49,26 @@ describe('lua: buffer event callbacks', function()
   -- assert the wrong thing), but masks errors with unflushed lines (as
   -- nvim_buf_get_offset forces a flush of the memline). To be safe run the
   -- test both ways.
-  local function check(verify)
+  local function check(verify,utf_sizes)
     local lastsize
     meths.buf_set_lines(0, 0, -1, true, origlines)
     if verify then
       lastsize = meths.buf_get_offset(0, meths.buf_line_count(0))
     end
-    exec_lua("return test_register(...)", 0, "test1")
+    exec_lua("return test_register(...)", 0, "test1",false,utf_sizes)
     local tick = meths.buf_get_changedtick(0)
 
     local verify_name = "test1"
     local function check_events(expected)
       local events = exec_lua("return get_events(...)" )
+      if utf_sizes then
+        -- this test case uses ASCII only, so sizes should be the same.
+        -- Unicode is tested below.
+        for _, event in ipairs(expected) do
+          event[9] = event[8]
+          event[10] = event[8]
+        end
+      end
       eq(expected, events)
       if verify then
         for _, event in ipairs(events) do
@@ -75,6 +84,7 @@ describe('lua: buffer event callbacks', function()
       end
     end
 
+    command('set autoindent')
     command('normal! GyyggP')
     tick = tick + 1
     check_events({{ "test1", "lines", 1, tick, 0, 0, 1, 0}})
@@ -83,7 +93,7 @@ describe('lua: buffer event callbacks', function()
     tick = tick + 1
     check_events({{ "test1", "lines", 1, tick, 3, 5, 4, 32 }})
 
-    exec_lua("return test_register(...)", 0, "test2", true)
+    exec_lua("return test_register(...)", 0, "test2", true, utf_sizes)
     tick = tick + 1
     command('undo')
 
@@ -124,7 +134,13 @@ describe('lua: buffer event callbacks', function()
     tick = tick + 1
     check_events({{ "test2", "lines", 1, tick, 4, 5, 5, 19 }})
 
-    feed('<esc>')
+    feed('<esc>Go')
+    tick = tick + 1
+    check_events({{ "test2", "lines", 1, tick, 11, 11, 12, 0 }})
+
+    feed('x')
+    tick = tick + 1
+    check_events({{ "test2", "lines", 1, tick, 11, 12, 12, 5 }})
 
     command('bwipe!')
     check_events({{ "test2", "detach", 1 }})
@@ -137,4 +153,54 @@ describe('lua: buffer event callbacks', function()
   it('works with verify', function()
     check(true)
   end)
+
+  it('works with utf_sizes and ASCII text', function()
+    check(false,true)
+  end)
+
+  it('works with utf_sizes and unicode text', function()
+    local unicode_text = {"ascii text",
+                          "latin text åäö",
+                          "BMP text ɧ αλφά",
+                          "BMP text 汉语 ↥↧",
+                          "SMP 🤦 🦄🦃",
+                          "combining å بِيَّة"}
+    meths.buf_set_lines(0, 0, -1, true, unicode_text)
+    feed('gg')
+    exec_lua("return test_register(...)", 0, "test1", false, true)
+    local tick = meths.buf_get_changedtick(0)
+
+    feed('dd')
+    tick = tick + 1
+    eq({{ "test1", "lines", 1, tick, 0, 1, 0, 11, 11, 11 }}, exec_lua("return get_events(...)" ))
+
+    feed('A<bs>')
+    tick = tick + 1
+    eq({{ "test1", "lines", 1, tick, 0, 1, 1, 18, 15, 15 }}, exec_lua("return get_events(...)" ))
+
+    feed('<esc>jylp')
+    tick = tick + 1
+    eq({{ "test1", "lines", 1, tick, 1, 2, 2, 21, 16, 16 }}, exec_lua("return get_events(...)" ))
+
+    feed('+eea<cr>')
+    tick = tick + 1
+    eq({{ "test1", "lines", 1, tick, 2, 3, 4, 23, 15, 15 }}, exec_lua("return get_events(...)" ))
+
+    feed('<esc>jdw')
+    tick = tick + 1
+    -- non-BMP chars count as 2 UTF-16 code units
+    eq({{ "test1", "lines", 1, tick, 4, 5, 5, 18, 9, 12 }}, exec_lua("return get_events(...)" ))
+
+    feed('+rx')
+    tick = tick + 1
+    -- count the individual codepoints of a composed character.
+    eq({{ "test1", "lines", 1, tick, 5, 6, 6, 27, 20, 20 }}, exec_lua("return get_events(...)" ))
+
+    feed('kJ')
+    tick = tick + 1
+    -- NB: this is inefficient (but not really wrong).
+    eq({{ "test1", "lines", 1,   tick, 4, 5, 5, 14, 5, 8 },
+        { "test1", "lines", 1, tick+1, 5, 6, 5, 27, 20, 20 }}, exec_lua("return get_events(...)" ))
+  end)
+
 end)