lua: support getting UTF-32 and UTF-16 sizes of replaced text

This commit is contained in:
Björn Linse 2019-08-04 12:22:22 +02:00
parent b0e26199ec
commit c0993ed343
10 changed files with 192 additions and 30 deletions

View File

@ -208,14 +208,17 @@ they are allowed.
|nvim_buf_attach| will take keyword args for the callbacks. "on_lines" will
receive parameters ("lines", {buf}, {changedtick}, {firstline}, {lastline},
{new_lastline}, {old_bytecount}).
{new_lastline}, {old_byte_size}[, {old_utf32_size}, {old_utf16_size}]).
Unlike remote channel events the text contents are not passed. The new text can
be accessed inside the callback as
`vim.api.nvim_buf_get_lines(buf, firstline, new_lastline, true)`
{old_bytecount} is the total size of the replaced region {firstline} to
{lastline} in bytes, including the final newline after {lastline}.
{old_byte_size} is the total size of the replaced region {firstline} to
{lastline} in bytes, including the final newline after {lastline}. If
`utf_sizes` is set to true in |nvim_buf_attach()| keyword args, then the
UTF-32 and UTF-16 sizes of the deleted region are also passed as additional
arguments {old_utf32_size} and {old_utf16_size}.
"on_changedtick" is invoked when |b:changedtick| was incremented but no text
was changed. The parameters received are ("changedtick", {buf}, {changedtick}).

View File

@ -109,9 +109,11 @@ String buffer_get_line(Buffer buffer, Integer index, Error *err)
/// `nvim_buf_lines_event`. Otherwise, the first notification will be
/// a `nvim_buf_changedtick_event`. Not used for lua callbacks.
/// @param opts Optional parameters.
/// `on_lines`: lua callback received on change.
/// `on_lines`: lua callback received on change.
/// `on_changedtick`: lua callback received on changedtick
/// increment without text change.
/// `utf_sizes`: include UTF-32 and UTF-16 size of
/// the replaced region.
/// See |api-buffer-updates-lua| for more information
/// @param[out] err Error details, if any
/// @return False when updates couldn't be enabled because the buffer isn't
@ -156,6 +158,12 @@ Boolean nvim_buf_attach(uint64_t channel_id,
}
cb.on_detach = v->data.luaref;
v->data.integer = LUA_NOREF;
} else if (is_lua && strequal("utf_sizes", k.data)) {
if (v->type != kObjectTypeBoolean) {
api_set_error(err, kErrorTypeValidation, "utf_sizes must be boolean");
goto error;
}
cb.utf_sizes = v->data.boolean;
} else {
api_set_error(err, kErrorTypeValidation, "unexpected key: %s", k.data);
goto error;
@ -1196,6 +1204,7 @@ Dictionary nvim__buf_stats(Buffer buffer, Error *err)
// NB: this should be zero at any time API functions are called,
// this exists to debug issues
PUT(rv, "dirty_bytes", INTEGER_OBJ((Integer)buf->deleted_bytes));
return rv;
}

View File

@ -459,8 +459,9 @@ typedef struct {
LuaRef on_lines;
LuaRef on_changedtick;
LuaRef on_detach;
bool utf_sizes;
} BufUpdateCallbacks;
#define BUF_UPDATE_CALLBACKS_INIT { LUA_NOREF, LUA_NOREF, LUA_NOREF }
#define BUF_UPDATE_CALLBACKS_INIT { LUA_NOREF, LUA_NOREF, LUA_NOREF, false }
#define BUF_HAS_QF_ENTRY 1
#define BUF_HAS_LL_ENTRY 2
@ -802,12 +803,24 @@ struct file_buffer {
kvec_t(BufhlLine *) b_bufhl_move_space; // temporary space for highlights
// array of channelids which have asked to receive updates for this
// array of channel_id:s which have asked to receive updates for this
// buffer.
kvec_t(uint64_t) update_channels;
// array of lua callbacks for buffer updates.
kvec_t(BufUpdateCallbacks) update_callbacks;
// whether an update callback has requested codepoint size of deleted regions.
bool update_need_codepoints;
// Measurements of the deleted or replaced region since the last update
// event. Some consumers of buffer changes need to know the byte size (like
// tree-sitter) or the corresponding UTF-32/UTF-16 size (like LSP) of the
// deleted text.
size_t deleted_bytes;
size_t deleted_codepoints;
size_t deleted_codeunits;
// The number of times the current line has been flushed in the memline.
int flush_count;
int b_diff_failed; // internal diff failed for this buffer

View File

@ -26,6 +26,9 @@ bool buf_updates_register(buf_T *buf, uint64_t channel_id,
if (channel_id == LUA_INTERNAL_CALL) {
kv_push(buf->update_callbacks, cb);
if (cb.utf_sizes) {
buf->update_need_codepoints = true;
}
return true;
}
@ -169,7 +172,9 @@ void buf_updates_send_changes(buf_T *buf,
int64_t num_removed,
bool send_tick)
{
size_t deleted_bytes = ml_flush_deleted_bytes(buf);
size_t deleted_codepoints, deleted_codeunits;
size_t deleted_bytes = ml_flush_deleted_bytes(buf, &deleted_codepoints,
&deleted_codeunits);
if (!buf_updates_active(buf)) {
return;
@ -233,8 +238,8 @@ void buf_updates_send_changes(buf_T *buf,
bool keep = true;
if (cb.on_lines != LUA_NOREF) {
Array args = ARRAY_DICT_INIT;
Object items[6];
args.size = 6;
Object items[8];
args.size = 6; // may be increased to 8 below
args.items = items;
// the first argument is always the buffer handle
@ -254,6 +259,11 @@ void buf_updates_send_changes(buf_T *buf,
// byte count of previous contents
args.items[5] = INTEGER_OBJ((Integer)deleted_bytes);
if (cb.utf_sizes) {
args.size = 8;
args.items[6] = INTEGER_OBJ((Integer)deleted_codepoints);
args.items[7] = INTEGER_OBJ((Integer)deleted_codeunits);
}
textlock++;
Object res = executor_exec_lua_cb(cb.on_lines, "lines", args, true);
textlock--;

View File

@ -1756,6 +1756,8 @@ failed:
linecnt--;
}
curbuf->deleted_bytes = 0;
curbuf->deleted_codepoints = 0;
curbuf->deleted_codeunits = 0;
linecnt = curbuf->b_ml.ml_line_count - linecnt;
if (filesize == 0)
linecnt = 0;

View File

@ -627,6 +627,8 @@ EXTERN pos_T Insstart_orig;
EXTERN int orig_line_count INIT(= 0); /* Line count when "gR" started */
EXTERN int vr_lines_changed INIT(= 0); /* #Lines changed by "gR" so far */
// increase around internal delete/replace
EXTERN int inhibit_delete_count INIT(= 0);
/*
* These flags are set based upon 'fileencoding'.

View File

@ -1438,6 +1438,39 @@ int utf16_to_utf8(const wchar_t *strw, char **str)
#endif
/// Count the UTF-32 code points and UTF-16 code units that a UTF-8 string
/// corresponds to.
///
/// Every invalid UTF-8 byte, and every embedded surrogate, is counted as a
/// single code point and a single code unit.
///
/// The results are added to the out parameters rather than stored in them, so
/// the size of a buffer region made up of several line segments can be
/// accumulated over repeated calls.
///
/// @param s the string
/// @param len maximum number of bytes to examine (an earlier NUL terminates)
/// @param[out] codepoints incremented with UTF-32 code point size
/// @param[out] codeunits incremented with UTF-16 code unit size
void mb_utflen(const char_u *s, size_t len, size_t *codepoints,
               size_t *codeunits)
  FUNC_ATTR_NONNULL_ALL
{
  size_t n_chars = 0;   // characters seen so far
  size_t n_astral = 0;  // characters above U+FFFF, one extra UTF-16 unit each
  size_t pos = 0;

  while (pos < len && s[pos] != NUL) {
    size_t char_len = utf_ptr2len_len(s + pos, len - pos);
    // For a single byte (which includes an invalid sequence byte) the byte
    // value itself is used. Only whether the character fits in the BMP
    // matters here.
    int ch = s[pos];
    if (char_len > 1) {
      ch = utf_ptr2char(s + pos);
    }
    n_chars++;
    if (ch > 0xFFFF) {
      n_astral++;
    }
    pos += char_len;
  }

  *codepoints += n_chars;
  *codeunits += n_chars + n_astral;
}
/*
* Version of strnicmp() that handles multi-byte characters.
* Needed for Big5, Shift-JIS and UTF-8 encoding. Other DBCS encodings can

View File

@ -2383,6 +2383,23 @@ static int ml_append_int(
return OK;
}
/// Add the size of a deleted or replaced line to the counters of curbuf.
///
/// Does nothing while `inhibit_delete_count` is nonzero. Otherwise the byte
/// size is added to `curbuf->deleted_bytes` and, when
/// `curbuf->update_need_codepoints` is set, the UTF-32 and UTF-16 sizes are
/// added to `curbuf->deleted_codepoints` and `curbuf->deleted_codeunits`.
/// Each size includes one extra unit for the NL char.
///
/// @param ptr the text of the line
/// @param len length of the text in bytes, or -1 to measure it with STRLEN()
void ml_add_deleted_len(char_u *ptr, ssize_t len)
{
  if (inhibit_delete_count) {
    return;
  }

  ssize_t text_len = len;
  if (text_len == -1) {
    text_len = STRLEN(ptr);
  }
  curbuf->deleted_bytes += text_len+1;

  if (!curbuf->update_need_codepoints) {
    return;
  }
  mb_utflen(ptr, text_len, &curbuf->deleted_codepoints,
            &curbuf->deleted_codeunits);
  // account for the NL char in both measures
  curbuf->deleted_codepoints += 1;
  curbuf->deleted_codeunits += 1;
}
/*
* Replace line lnum, with buffering, in current buffer.
*
@ -2408,19 +2425,17 @@ int ml_replace(linenr_T lnum, char_u *line, bool copy)
if (copy) {
line = vim_strsave(line);
}
if (curbuf->b_ml.ml_line_lnum != lnum) { /* other line buffered */
ml_flush_line(curbuf); /* flush it */
} else if (curbuf->b_ml.ml_flags & ML_LINE_DIRTY) { /* same line allocated */
// TODO FIXME: see other "TODO FIXME"
curbuf->deleted_bytes += STRLEN(curbuf->b_ml.ml_line_ptr)+1;
xfree(curbuf->b_ml.ml_line_ptr); /* free it */
readlen = false; // already read it.
if (curbuf->b_ml.ml_line_lnum != lnum) { // other line buffered
ml_flush_line(curbuf); // flush it
} else if (curbuf->b_ml.ml_flags & ML_LINE_DIRTY) { // same line allocated
ml_add_deleted_len(curbuf->b_ml.ml_line_ptr, -1);
readlen = false; // already added the length
xfree(curbuf->b_ml.ml_line_ptr); // free it
}
if (readlen) {
if (true) { // TODO: buffer updates active
curbuf->deleted_bytes += STRLEN(ml_get_buf(curbuf, lnum, false))+1;
}
if (readlen && kv_size(curbuf->update_callbacks)) {
ml_add_deleted_len(ml_get_buf(curbuf, lnum, false), -1);
}
curbuf->b_ml.ml_line_ptr = line;
@ -2504,7 +2519,10 @@ static int ml_delete_int(buf_T *buf, linenr_T lnum, bool message)
else
line_size = ((dp->db_index[idx - 1]) & DB_INDEX_MASK) - line_start;
buf->deleted_bytes += line_size;
// Line should always have an NL char internally (represented as NUL),
// even if 'noeol' is set.
assert(line_size >= 1);
ml_add_deleted_len((char_u *)dp + line_start, line_size-1);
/*
* special case: If there is only one line in the data block it becomes empty.
@ -2690,10 +2708,14 @@ void ml_clearmarked(void)
return;
}
size_t ml_flush_deleted_bytes(buf_T *buf)
size_t ml_flush_deleted_bytes(buf_T *buf, size_t *codepoints, size_t *codeunits)
{
size_t ret = buf->deleted_bytes;
*codepoints = buf->deleted_codepoints;
*codeunits = buf->deleted_codeunits;
buf->deleted_bytes = 0;
buf->deleted_codepoints = 0;
buf->deleted_codeunits = 0;
return ret;
}

View File

@ -780,6 +780,7 @@ open_line (
did_append = FALSE;
}
inhibit_delete_count++;
if (newindent
|| did_si
) {
@ -821,6 +822,7 @@ open_line (
did_si = false;
}
}
inhibit_delete_count--;
/*
* In REPLACE mode, for each character in the extra leader, there must be
@ -1685,7 +1687,7 @@ int del_bytes(colnr_T count, bool fixpos_arg, bool use_delcombine)
bool was_alloced = ml_line_alloced(); // check if oldp was allocated
char_u *newp;
if (was_alloced) {
curbuf->deleted_bytes += (size_t)oldlen+1;
ml_add_deleted_len(curbuf->b_ml.ml_line_ptr, oldlen);
newp = oldp; // use same allocated memory
} else { // need to allocate a new line
newp = xmalloc((size_t)(oldlen + 1 - count));

View File

@ -13,7 +13,8 @@ local origlines = {"original line 1",
"original line 3",
"original line 4",
"original line 5",
"original line 6"}
"original line 6",
" indented line"}
describe('lua: buffer event callbacks', function()
before_each(function()
@ -21,14 +22,14 @@ describe('lua: buffer event callbacks', function()
exec_lua([[
local events = {}
function test_register(bufnr, id, changedtick)
function test_register(bufnr, id, changedtick, utf_sizes)
local function callback(...)
table.insert(events, {id, ...})
if test_unreg == id then
return true
end
end
local opts = {on_lines=callback, on_detach=callback}
local opts = {on_lines=callback, on_detach=callback, utf_sizes=utf_sizes}
if changedtick then
opts.on_changedtick = callback
end
@ -48,18 +49,26 @@ describe('lua: buffer event callbacks', function()
-- assert the wrong thing), but masks errors with unflushed lines (as
-- nvim_buf_get_offset forces a flush of the memline). To be safe run the
-- test both ways.
local function check(verify)
local function check(verify,utf_sizes)
local lastsize
meths.buf_set_lines(0, 0, -1, true, origlines)
if verify then
lastsize = meths.buf_get_offset(0, meths.buf_line_count(0))
end
exec_lua("return test_register(...)", 0, "test1")
exec_lua("return test_register(...)", 0, "test1",false,utf_sizes)
local tick = meths.buf_get_changedtick(0)
local verify_name = "test1"
local function check_events(expected)
local events = exec_lua("return get_events(...)" )
if utf_sizes then
-- this test case uses ASCII only, so sizes should be the same.
-- Unicode is tested below.
for _, event in ipairs(expected) do
event[9] = event[8]
event[10] = event[8]
end
end
eq(expected, events)
if verify then
for _, event in ipairs(events) do
@ -75,6 +84,7 @@ describe('lua: buffer event callbacks', function()
end
end
command('set autoindent')
command('normal! GyyggP')
tick = tick + 1
check_events({{ "test1", "lines", 1, tick, 0, 0, 1, 0}})
@ -83,7 +93,7 @@ describe('lua: buffer event callbacks', function()
tick = tick + 1
check_events({{ "test1", "lines", 1, tick, 3, 5, 4, 32 }})
exec_lua("return test_register(...)", 0, "test2", true)
exec_lua("return test_register(...)", 0, "test2", true, utf_sizes)
tick = tick + 1
command('undo')
@ -124,7 +134,13 @@ describe('lua: buffer event callbacks', function()
tick = tick + 1
check_events({{ "test2", "lines", 1, tick, 4, 5, 5, 19 }})
feed('<esc>')
feed('<esc>Go')
tick = tick + 1
check_events({{ "test2", "lines", 1, tick, 11, 11, 12, 0 }})
feed('x')
tick = tick + 1
check_events({{ "test2", "lines", 1, tick, 11, 12, 12, 5 }})
command('bwipe!')
check_events({{ "test2", "detach", 1 }})
@ -137,4 +153,54 @@ describe('lua: buffer event callbacks', function()
-- Run the shared scenario with `verify` enabled and `utf_sizes` left unset.
it('works with verify', function()
  local verify = true
  check(verify)
end)

-- Run the shared scenario without `verify` but with `utf_sizes` enabled.
it('works with utf_sizes and ASCII text', function()
  local verify, utf_sizes = false, true
  check(verify, utf_sizes)
end)
-- Checks the sizes reported to "on_lines" for non-ASCII text. Each event is
-- {id, "lines", buf, changedtick, firstline, lastline, new_lastline,
--  old_byte_size, old_utf32_size, old_utf16_size}.
it('works with utf_sizes and unicode text', function()
  local lines = {
    "ascii text",
    "latin text åäö",
    "BMP text ɧ αλφά",
    "BMP text 汉语 ↥↧",
    "SMP 🤦 🦄🦃",
    "combining å بِيَّة",
  }
  meths.buf_set_lines(0, 0, -1, true, lines)
  feed('gg')
  exec_lua("return test_register(...)", 0, "test1", false, true)
  local tick = meths.buf_get_changedtick(0)

  -- Compare `expected` with the events returned by get_events().
  local function expect_events(expected)
    eq(expected, exec_lua("return get_events(...)"))
  end

  feed('dd')
  tick = tick + 1
  expect_events({{ "test1", "lines", 1, tick, 0, 1, 0, 11, 11, 11 }})

  feed('A<bs>')
  tick = tick + 1
  expect_events({{ "test1", "lines", 1, tick, 0, 1, 1, 18, 15, 15 }})

  feed('<esc>jylp')
  tick = tick + 1
  expect_events({{ "test1", "lines", 1, tick, 1, 2, 2, 21, 16, 16 }})

  feed('+eea<cr>')
  tick = tick + 1
  expect_events({{ "test1", "lines", 1, tick, 2, 3, 4, 23, 15, 15 }})

  feed('<esc>jdw')
  tick = tick + 1
  -- non-BMP chars count as 2 UTF-16 code units each
  expect_events({{ "test1", "lines", 1, tick, 4, 5, 5, 18, 9, 12 }})

  feed('+rx')
  tick = tick + 1
  -- the individual codepoints of a composed character are counted
  expect_events({{ "test1", "lines", 1, tick, 5, 6, 6, 27, 20, 20 }})

  feed('kJ')
  tick = tick + 1
  -- NB: two events for one join is inefficient (but not really wrong).
  expect_events({
    { "test1", "lines", 1, tick, 4, 5, 5, 14, 5, 8 },
    { "test1", "lines", 1, tick+1, 5, 6, 5, 27, 20, 20 },
  })
end)
end)