From c0993ed3433ef4111a39e59642d15b15261e8b68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Linse?= Date: Sun, 4 Aug 2019 12:22:22 +0200 Subject: [PATCH] lua: support getting UTF-32 and UTF-16 sizes of replaced text --- runtime/doc/api.txt | 9 ++- src/nvim/api/buffer.c | 11 ++- src/nvim/buffer_defs.h | 17 ++++- src/nvim/buffer_updates.c | 16 ++++- src/nvim/fileio.c | 2 + src/nvim/globals.h | 2 + src/nvim/mbyte.c | 33 +++++++++ src/nvim/memline.c | 48 +++++++++---- src/nvim/misc1.c | 4 +- test/functional/lua/buffer_updates_spec.lua | 80 +++++++++++++++++++-- 10 files changed, 192 insertions(+), 30 deletions(-) diff --git a/runtime/doc/api.txt b/runtime/doc/api.txt index f4366cb1af..2c6b053994 100644 --- a/runtime/doc/api.txt +++ b/runtime/doc/api.txt @@ -208,14 +208,17 @@ they are allowed. |nvim_buf_attach| will take keyword args for the callbacks. "on_lines" will receive parameters ("lines", {buf}, {changedtick}, {firstline}, {lastline}, -{new_lastline}, {old_bytecount}). +{new_lastline}, {old_byte_size}[, {old_utf32_size}, {old_utf16_size}]). Unlike remote channel events the text contents are not passed. The new text can be accessed inside the callback as `vim.api.nvim_buf_get_lines(buf, firstline, new_lastline, true)` -{old_bytecount} is the total size of the replaced region {firstline} to -{lastline} in bytes, including the final newline after {lastline}. +{old_byte_size} is the total size of the replaced region {firstline} to +{lastline} in bytes, including the final newline after {lastline}. If +`utf_sizes` is set to true in |nvim_buf_attach()| keyword args, then the +UTF-32 and UTF-16 sizes of the deleted region are also passed as additional +arguments {old_utf32_size} and {old_utf16_size}. "on_changedtick" is invoked when |b:changedtick| was incremented but no text was changed. The parameters recieved are ("changedtick", {buf}, {changedtick}). 
diff --git a/src/nvim/api/buffer.c b/src/nvim/api/buffer.c index 497b4ae9a4..c6f82e9d85 100644 --- a/src/nvim/api/buffer.c +++ b/src/nvim/api/buffer.c @@ -109,9 +109,11 @@ String buffer_get_line(Buffer buffer, Integer index, Error *err) /// `nvim_buf_lines_event`. Otherwise, the first notification will be /// a `nvim_buf_changedtick_event`. Not used for lua callbacks. /// @param opts Optional parameters. -/// `on_lines`: lua callback received on change. +/// `on_lines`: lua callback received on change. /// `on_changedtick`: lua callback received on changedtick /// increment without text change. +/// `utf_sizes`: include UTF-32 and UTF-16 size of +/// the replaced region. /// See |api-buffer-updates-lua| for more information /// @param[out] err Error details, if any /// @return False when updates couldn't be enabled because the buffer isn't @@ -156,6 +158,12 @@ Boolean nvim_buf_attach(uint64_t channel_id, } cb.on_detach = v->data.luaref; v->data.integer = LUA_NOREF; + } else if (is_lua && strequal("utf_sizes", k.data)) { + if (v->type != kObjectTypeBoolean) { + api_set_error(err, kErrorTypeValidation, "utf_sizes must be boolean"); + goto error; + } + cb.utf_sizes = v->data.boolean; } else { api_set_error(err, kErrorTypeValidation, "unexpected key: %s", k.data); goto error; @@ -1196,6 +1204,7 @@ Dictionary nvim__buf_stats(Buffer buffer, Error *err) // NB: this should be zero at any time API functions are called, // this exists to debug issues PUT(rv, "dirty_bytes", INTEGER_OBJ((Integer)buf->deleted_bytes)); + return rv; } diff --git a/src/nvim/buffer_defs.h b/src/nvim/buffer_defs.h index eb26e4ad8e..b11eaefdd0 100644 --- a/src/nvim/buffer_defs.h +++ b/src/nvim/buffer_defs.h @@ -459,8 +459,9 @@ typedef struct { LuaRef on_lines; LuaRef on_changedtick; LuaRef on_detach; + bool utf_sizes; } BufUpdateCallbacks; -#define BUF_UPDATE_CALLBACKS_INIT { LUA_NOREF, LUA_NOREF, LUA_NOREF } +#define BUF_UPDATE_CALLBACKS_INIT { LUA_NOREF, LUA_NOREF, LUA_NOREF, false } #define 
BUF_HAS_QF_ENTRY 1 #define BUF_HAS_LL_ENTRY 2 @@ -802,12 +803,24 @@ struct file_buffer { kvec_t(BufhlLine *) b_bufhl_move_space; // temporary space for highlights - // array of channelids which have asked to receive updates for this + // array of channel_id:s which have asked to receive updates for this // buffer. kvec_t(uint64_t) update_channels; + // array of lua callbacks for buffer updates. kvec_t(BufUpdateCallbacks) update_callbacks; + // whether an update callback has requested codepoint size of deleted regions. + bool update_need_codepoints; + + // Measurements of the deleted or replaced region since the last update + // event. Some consumers of buffer changes need to know the byte size (like + // tree-sitter) or the corresponding UTF-32/UTF-16 size (like LSP) of the + // deleted text. size_t deleted_bytes; + size_t deleted_codepoints; + size_t deleted_codeunits; + + // The number of times the current line has been flushed in the memline. int flush_count; int b_diff_failed; // internal diff failed for this buffer diff --git a/src/nvim/buffer_updates.c b/src/nvim/buffer_updates.c index 7dea8bfac5..3604578b50 100644 --- a/src/nvim/buffer_updates.c +++ b/src/nvim/buffer_updates.c @@ -26,6 +26,9 @@ bool buf_updates_register(buf_T *buf, uint64_t channel_id, if (channel_id == LUA_INTERNAL_CALL) { kv_push(buf->update_callbacks, cb); + if (cb.utf_sizes) { + buf->update_need_codepoints = true; + } return true; } @@ -169,7 +172,9 @@ void buf_updates_send_changes(buf_T *buf, int64_t num_removed, bool send_tick) { - size_t deleted_bytes = ml_flush_deleted_bytes(buf); + size_t deleted_codepoints, deleted_codeunits; + size_t deleted_bytes = ml_flush_deleted_bytes(buf, &deleted_codepoints, + &deleted_codeunits); if (!buf_updates_active(buf)) { return; @@ -233,8 +238,8 @@ void buf_updates_send_changes(buf_T *buf, bool keep = true; if (cb.on_lines != LUA_NOREF) { Array args = ARRAY_DICT_INIT; - Object items[6]; - args.size = 6; + Object items[8]; + args.size = 6; // may be 
increased to 8 below args.items = items; // the first argument is always the buffer handle @@ -254,6 +259,11 @@ void buf_updates_send_changes(buf_T *buf, // byte count of previous contents args.items[5] = INTEGER_OBJ((Integer)deleted_bytes); + if (cb.utf_sizes) { + args.size = 8; + args.items[6] = INTEGER_OBJ((Integer)deleted_codepoints); + args.items[7] = INTEGER_OBJ((Integer)deleted_codeunits); + } textlock++; Object res = executor_exec_lua_cb(cb.on_lines, "lines", args, true); textlock--; diff --git a/src/nvim/fileio.c b/src/nvim/fileio.c index 2232de8c1e..d03b9138d0 100644 --- a/src/nvim/fileio.c +++ b/src/nvim/fileio.c @@ -1756,6 +1756,8 @@ failed: linecnt--; } curbuf->deleted_bytes = 0; + curbuf->deleted_codepoints = 0; + curbuf->deleted_codeunits = 0; linecnt = curbuf->b_ml.ml_line_count - linecnt; if (filesize == 0) linecnt = 0; diff --git a/src/nvim/globals.h b/src/nvim/globals.h index de6f59b3f1..4524c4b2c0 100644 --- a/src/nvim/globals.h +++ b/src/nvim/globals.h @@ -627,6 +627,8 @@ EXTERN pos_T Insstart_orig; EXTERN int orig_line_count INIT(= 0); /* Line count when "gR" started */ EXTERN int vr_lines_changed INIT(= 0); /* #Lines changed by "gR" so far */ +// increase around internal delete/replace +EXTERN int inhibit_delete_count INIT(= 0); /* * These flags are set based upon 'fileencoding'. diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c index e7579399f3..bf8ce46113 100644 --- a/src/nvim/mbyte.c +++ b/src/nvim/mbyte.c @@ -1438,6 +1438,39 @@ int utf16_to_utf8(const wchar_t *strw, char **str) #endif +/// Measure the length of a string in corresponding UTF-32 and UTF-16 units. +/// +/// Invalid UTF-8 bytes, or embedded surrogates, count as one code point/unit +/// each. +/// +/// The out parameters are incremented. This is used to measure the size of +/// a buffer region consisting of multiple line segments. 
+/// +/// @param s the string +/// @param len maximum length (an earlier NUL terminates) +/// @param[out] codepoints incremented with UTF-32 code point size +/// @param[out] codeunits incremented with UTF-16 code unit size +void mb_utflen(const char_u *s, size_t len, size_t *codepoints, + size_t *codeunits) + FUNC_ATTR_NONNULL_ALL +{ + size_t count = 0, extra = 0; + size_t clen; + for (size_t i = 0; i < len && s[i] != NUL; i += clen) { + clen = utf_ptr2len_len(s+i, len-i); + // NB: gets the byte value of invalid sequence bytes. + // we only care whether the char fits in the BMP or not + int c = (clen > 1) ? utf_ptr2char(s+i) : s[i]; + count++; + if (c > 0xFFFF) { + extra++; + } + } + *codepoints += count; + *codeunits += count + extra; +} + + /* * Version of strnicmp() that handles multi-byte characters. * Needed for Big5, Shift-JIS and UTF-8 encoding. Other DBCS encodings can diff --git a/src/nvim/memline.c b/src/nvim/memline.c index 0b16f86416..3220c7d9b8 100644 --- a/src/nvim/memline.c +++ b/src/nvim/memline.c @@ -2383,6 +2383,23 @@ static int ml_append_int( return OK; } +void ml_add_deleted_len(char_u *ptr, ssize_t len) +{ + if (inhibit_delete_count) { + return; + } + if (len == -1) { + len = STRLEN(ptr); + } + curbuf->deleted_bytes += len+1; + if (curbuf->update_need_codepoints) { + mb_utflen(ptr, len, &curbuf->deleted_codepoints, + &curbuf->deleted_codeunits); + curbuf->deleted_codepoints++; // NL char + curbuf->deleted_codeunits++; + } +} + /* * Replace line lnum, with buffering, in current buffer. 
* @@ -2408,19 +2425,17 @@ int ml_replace(linenr_T lnum, char_u *line, bool copy) if (copy) { line = vim_strsave(line); } - if (curbuf->b_ml.ml_line_lnum != lnum) { /* other line buffered */ - ml_flush_line(curbuf); /* flush it */ - } else if (curbuf->b_ml.ml_flags & ML_LINE_DIRTY) { /* same line allocated */ - // TODO FIXME: see other "TODO FIXME" - curbuf->deleted_bytes += STRLEN(curbuf->b_ml.ml_line_ptr)+1; - xfree(curbuf->b_ml.ml_line_ptr); /* free it */ - readlen = false; // already read it. + if (curbuf->b_ml.ml_line_lnum != lnum) { // other line buffered + ml_flush_line(curbuf); // flush it + } else if (curbuf->b_ml.ml_flags & ML_LINE_DIRTY) { // same line allocated + ml_add_deleted_len(curbuf->b_ml.ml_line_ptr, -1); + readlen = false; // already added the length + + xfree(curbuf->b_ml.ml_line_ptr); // free it } - if (readlen) { - if (true) { // TODO: buffer updates active - curbuf->deleted_bytes += STRLEN(ml_get_buf(curbuf, lnum, false))+1; - } + if (readlen && kv_size(curbuf->update_callbacks)) { + ml_add_deleted_len(ml_get_buf(curbuf, lnum, false), -1); } curbuf->b_ml.ml_line_ptr = line; @@ -2504,7 +2519,10 @@ static int ml_delete_int(buf_T *buf, linenr_T lnum, bool message) else line_size = ((dp->db_index[idx - 1]) & DB_INDEX_MASK) - line_start; - buf->deleted_bytes += line_size; + // Line should always have an NL char internally (represented as NUL), + // even if 'noeol' is set. + assert(line_size >= 1); + ml_add_deleted_len((char_u *)dp + line_start, line_size-1); /* * special case: If there is only one line in the data block it becomes empty. 
@@ -2690,10 +2708,14 @@ void ml_clearmarked(void) return; } -size_t ml_flush_deleted_bytes(buf_T *buf) +size_t ml_flush_deleted_bytes(buf_T *buf, size_t *codepoints, size_t *codeunits) { size_t ret = buf->deleted_bytes; + *codepoints = buf->deleted_codepoints; + *codeunits = buf->deleted_codeunits; buf->deleted_bytes = 0; + buf->deleted_codepoints = 0; + buf->deleted_codeunits = 0; return ret; } diff --git a/src/nvim/misc1.c b/src/nvim/misc1.c index 112ca6f287..a62fa6d585 100644 --- a/src/nvim/misc1.c +++ b/src/nvim/misc1.c @@ -780,6 +780,7 @@ open_line ( did_append = FALSE; } + inhibit_delete_count++; if (newindent || did_si ) { @@ -821,6 +822,7 @@ open_line ( did_si = false; } } + inhibit_delete_count--; /* * In REPLACE mode, for each character in the extra leader, there must be @@ -1685,7 +1687,7 @@ int del_bytes(colnr_T count, bool fixpos_arg, bool use_delcombine) bool was_alloced = ml_line_alloced(); // check if oldp was allocated char_u *newp; if (was_alloced) { - curbuf->deleted_bytes += (size_t)oldlen+1; + ml_add_deleted_len(curbuf->b_ml.ml_line_ptr, oldlen); newp = oldp; // use same allocated memory } else { // need to allocate a new line newp = xmalloc((size_t)(oldlen + 1 - count)); diff --git a/test/functional/lua/buffer_updates_spec.lua b/test/functional/lua/buffer_updates_spec.lua index 16c38bc20b..990cb97fec 100644 --- a/test/functional/lua/buffer_updates_spec.lua +++ b/test/functional/lua/buffer_updates_spec.lua @@ -13,7 +13,8 @@ local origlines = {"original line 1", "original line 3", "original line 4", "original line 5", - "original line 6"} + "original line 6", + " indented line"} describe('lua: buffer event callbacks', function() before_each(function() @@ -21,14 +22,14 @@ describe('lua: buffer event callbacks', function() exec_lua([[ local events = {} - function test_register(bufnr, id, changedtick) + function test_register(bufnr, id, changedtick, utf_sizes) local function callback(...) 
table.insert(events, {id, ...}) if test_unreg == id then return true end end - local opts = {on_lines=callback, on_detach=callback} + local opts = {on_lines=callback, on_detach=callback, utf_sizes=utf_sizes} if changedtick then opts.on_changedtick = callback end @@ -48,18 +49,26 @@ describe('lua: buffer event callbacks', function() -- assert the wrong thing), but masks errors with unflushed lines (as -- nvim_buf_get_offset forces a flush of the memline). To be safe run the -- test both ways. - local function check(verify) + local function check(verify,utf_sizes) local lastsize meths.buf_set_lines(0, 0, -1, true, origlines) if verify then lastsize = meths.buf_get_offset(0, meths.buf_line_count(0)) end - exec_lua("return test_register(...)", 0, "test1") + exec_lua("return test_register(...)", 0, "test1",false,utf_sizes) local tick = meths.buf_get_changedtick(0) local verify_name = "test1" local function check_events(expected) local events = exec_lua("return get_events(...)" ) + if utf_sizes then + -- this test case uses ASCII only, so sizes should be the same. + -- Unicode is tested below. + for _, event in ipairs(expected) do + event[9] = event[8] + event[10] = event[8] + end + end eq(expected, events) if verify then for _, event in ipairs(events) do @@ -75,6 +84,7 @@ describe('lua: buffer event callbacks', function() end end + command('set autoindent') command('normal! 
GyyggP') tick = tick + 1 check_events({{ "test1", "lines", 1, tick, 0, 0, 1, 0}}) @@ -83,7 +93,7 @@ describe('lua: buffer event callbacks', function() tick = tick + 1 check_events({{ "test1", "lines", 1, tick, 3, 5, 4, 32 }}) - exec_lua("return test_register(...)", 0, "test2", true) + exec_lua("return test_register(...)", 0, "test2", true, utf_sizes) tick = tick + 1 command('undo') @@ -124,7 +134,13 @@ describe('lua: buffer event callbacks', function() tick = tick + 1 check_events({{ "test2", "lines", 1, tick, 4, 5, 5, 19 }}) - feed('') + feed('Go') + tick = tick + 1 + check_events({{ "test2", "lines", 1, tick, 11, 11, 12, 0 }}) + + feed('x') + tick = tick + 1 + check_events({{ "test2", "lines", 1, tick, 11, 12, 12, 5 }}) command('bwipe!') check_events({{ "test2", "detach", 1 }}) @@ -137,4 +153,54 @@ describe('lua: buffer event callbacks', function() it('works with verify', function() check(true) end) + + it('works with utf_sizes and ASCII text', function() + check(false,true) + end) + + it('works with utf_sizes and unicode text', function() + local unicode_text = {"ascii text", + "latin text åäö", + "BMP text ɧ αλφά", + "BMP text 汉语 ↥↧", + "SMP 🤦 🦄🦃", + "combining å بِيَّة"} + meths.buf_set_lines(0, 0, -1, true, unicode_text) + feed('gg') + exec_lua("return test_register(...)", 0, "test1", false, true) + local tick = meths.buf_get_changedtick(0) + + feed('dd') + tick = tick + 1 + eq({{ "test1", "lines", 1, tick, 0, 1, 0, 11, 11, 11 }}, exec_lua("return get_events(...)" )) + + feed('A') + tick = tick + 1 + eq({{ "test1", "lines", 1, tick, 0, 1, 1, 18, 15, 15 }}, exec_lua("return get_events(...)" )) + + feed('jylp') + tick = tick + 1 + eq({{ "test1", "lines", 1, tick, 1, 2, 2, 21, 16, 16 }}, exec_lua("return get_events(...)" )) + + feed('+eea') + tick = tick + 1 + eq({{ "test1", "lines", 1, tick, 2, 3, 4, 23, 15, 15 }}, exec_lua("return get_events(...)" )) + + feed('jdw') + tick = tick + 1 + -- non-BMP chars count as 2 UTF-2 codeunits + eq({{ "test1", "lines", 1, 
tick, 4, 5, 5, 18, 9, 12 }}, exec_lua("return get_events(...)" )) + + feed('+rx') + tick = tick + 1 + -- count the individual codepoints of a composed character. + eq({{ "test1", "lines", 1, tick, 5, 6, 6, 27, 20, 20 }}, exec_lua("return get_events(...)" )) + + feed('kJ') + tick = tick + 1 + -- NB: this is inefficient (but not really wrong). + eq({{ "test1", "lines", 1, tick, 4, 5, 5, 14, 5, 8 }, + { "test1", "lines", 1, tick+1, 5, 6, 5, 27, 20, 20 }}, exec_lua("return get_events(...)" )) + end) + end)