neovim/test/unit/mbyte_spec.lua
bfredl b522cb1ac3 refactor(grid): make screen rendering more multibyte than ever before
Problem: buffer text with composing chars are converted from UTF-8
to an array of up to seven UTF-32 values and then converted back
to UTF-8 strings.

Solution: Convert buffer text directly to UTF-8 based schar_T values.

The limit of the text size is now in schar_T bytes, which is currently
31+1 but easily could be raised as it no longer multiplies the size
of the entire screen grid when not used, the full size is only required
for temporary scratch buffers.

Also does some general cleanup to win_line text handling, which was
unnecessarily complicated due to multibyte rendering being an "opt-in"
feature long ago. Nowadays, a char is just a char, regardless if it consists
of one ASCII byte or multiple bytes.
2023-11-17 12:58:57 +01:00

174 lines
6.2 KiB
Lua

local helpers = require("test.unit.helpers")(after_each)
local itp = helpers.gen_itp(it)
local ffi = helpers.ffi
local eq = helpers.eq
local lib = helpers.cimport('./src/nvim/mbyte.h', './src/nvim/charset.h', './src/nvim/grid.h')
describe('mbyte', function()
-- Convert from bytes to string
local function to_string(bytes)
local s = {}
for i = 1, #bytes do
s[i] = string.char(bytes[i])
end
return table.concat(s)
end
before_each(function()
end)
itp('utf_ptr2char', function()
-- For strings with length 1 the first byte is returned.
for c = 0, 255 do
eq(c, lib.utf_ptr2char(to_string({c, 0})))
end
-- Some ill formed byte sequences that should not be recognized as UTF-8
-- First byte: 0xc0 or 0xc1
-- Second byte: 0x80 .. 0xbf
--eq(0x00c0, lib.utf_ptr2char(to_string({0xc0, 0x80})))
--eq(0x00c1, lib.utf_ptr2char(to_string({0xc1, 0xbf})))
--
-- Sequences with more than four bytes
end)
for n = 0, 0xF do
itp(('utf_char2bytes for chars 0x%x - 0x%x'):format(n * 0x1000, n * 0x1000 + 0xFFF), function()
local char_p = ffi.typeof('char[?]')
for c = n * 0x1000, n * 0x1000 + 0xFFF do
local p = char_p(4, 0)
lib.utf_char2bytes(c, p)
eq(c, lib.utf_ptr2char(p))
eq(lib.vim_iswordc(c), lib.vim_iswordp(p))
end
end)
end
describe('utfc_ptr2schar_len', function()
local function test_seq(seq)
local firstc = ffi.new("int[1]")
local buf = ffi.new("char[32]")
lib.schar_get(buf, lib.utfc_ptr2schar_len(to_string(seq), #seq, firstc))
return {ffi.string(buf), firstc[0]}
end
local function byte(val)
return {string.char(val), val}
end
itp('1-byte sequences', function()
eq({'', 0}, test_seq{0})
for c = 1, 127 do
eq(byte(c), test_seq{c})
end
for c = 128, 255 do
eq({'', c}, test_seq{c})
end
end)
itp('2-byte sequences', function()
-- No combining characters
eq(byte(0x7f), test_seq{0x7f, 0x7f})
-- No combining characters
eq(byte(0x7f), test_seq{0x7f, 0x80})
-- No UTF-8 sequence
eq({'', 0xc2}, test_seq{0xc2, 0x7f})
-- One UTF-8 character
eq({'\xc2\x80', 0x80}, test_seq{0xc2, 0x80})
-- No UTF-8 sequence
eq({'', 0xc2}, test_seq{0xc2, 0xc0})
end)
itp('3-byte sequences', function()
-- No second UTF-8 character
eq(byte(0x7f), test_seq{0x7f, 0x80, 0x80})
-- No combining character
eq(byte(0x7f), test_seq{0x7f, 0xc2, 0x80})
-- Combining character is U+0300
eq({"\x7f\xcc\x80", 0x7f}, test_seq{0x7f, 0xcc, 0x80})
-- No UTF-8 sequence
eq({'', 0xc2}, test_seq{0xc2, 0x7f, 0xcc})
-- Incomplete combining character
eq({"\xc2\x80", 0x80}, test_seq{0xc2, 0x80, 0xcc})
-- One UTF-8 character (composing only)
eq({" \xe2\x83\x90", 0x20d0}, test_seq{0xe2, 0x83, 0x90})
end)
itp('4-byte sequences', function()
-- No following combining character
eq(byte(0x7f), test_seq{0x7f, 0x7f, 0xcc, 0x80})
-- No second UTF-8 character
eq(byte(0x7f), test_seq{0x7f, 0xc2, 0xcc, 0x80})
-- Combining character U+0300
eq({"\x7f\xcc\x80", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc})
-- No UTF-8 sequence
eq({'', 0xc2}, test_seq{0xc2, 0x7f, 0xcc, 0x80})
-- No following UTF-8 character
eq({"\xc2\x80", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0xcc})
-- Combining character U+0301
eq({"\xc2\x80\xcc\x81", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0x81})
-- One UTF-8 character
eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80})
end)
itp('5+-byte sequences', function()
-- No following combining character
eq(byte(0x7f), test_seq{0x7f, 0x7f, 0xcc, 0x80, 0x80})
-- No second UTF-8 character
eq(byte(0x7f), test_seq{0x7f, 0xc2, 0xcc, 0x80, 0x80})
-- Combining character U+0300
eq({"\x7f\xcc\x80", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x00})
-- Combining characters U+0300 and U+0301
eq({"\x7f\xcc\x80\xcc\x81", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81})
-- Combining characters U+0300, U+0301, U+0302
eq({"\x7f\xcc\x80\xcc\x81\xcc\x82", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82})
-- Combining characters U+0300, U+0301, U+0302, U+0303
eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83})
-- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304
eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84})
-- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305
eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85})
-- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305, U+0306
eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85, 0xcc, 0x86})
-- Only three following combining characters U+0300, U+0301, U+0302
eq({"\x7f\xcc\x80\xcc\x81\xcc\x82", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85})
-- No UTF-8 sequence
eq({'', 0xc2}, test_seq{0xc2, 0x7f, 0xcc, 0x80, 0x80})
-- No following UTF-8 character
eq({"\xc2\x80", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0xcc, 0x80})
-- Combining character U+0301
eq({"\xc2\x80\xcc\x81", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0x81, 0x7f})
-- Combining character U+0301
eq({"\xc2\x80\xcc\x81", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0x81, 0xcc})
-- One UTF-8 character
eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0x7f})
-- One UTF-8 character
eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0x80})
-- One UTF-8 character
eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0xcc})
-- Combining characters U+1AB0 and U+0301
eq({"\xf4\x80\x80\x80\xe1\xaa\xb0\xcc\x81", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0xe1, 0xaa, 0xb0, 0xcc, 0x81})
end)
end)
end)