2017-03-11 03:28:18 -07:00
|
|
|
local helpers = require("test.unit.helpers")(after_each)
|
2017-03-04 18:02:45 -07:00
|
|
|
local itp = helpers.gen_itp(it)
|
2016-04-15 12:15:12 -07:00
|
|
|
|
|
|
|
local ffi = helpers.ffi
|
|
|
|
local eq = helpers.eq
|
|
|
|
|
2023-11-06 06:52:27 -07:00
|
|
|
local lib = helpers.cimport('./src/nvim/mbyte.h', './src/nvim/charset.h', './src/nvim/grid.h')
|
2016-04-15 12:15:12 -07:00
|
|
|
|
|
|
|
describe('mbyte', function()
|
|
|
|
-- Build a Lua string from a sequence of byte values.
-- `bytes` is an array of numbers; each one is passed to string.char and the
-- resulting one-byte strings are joined in order. An empty array gives "".
local function to_string(bytes)
  local pieces = {}
  for pos = 1, #bytes do
    pieces[#pieces + 1] = string.char(bytes[pos])
  end
  return table.concat(pieces)
end
|
|
|
|
|
|
|
|
-- Per-test setup hook; intentionally a no-op (the callback body is empty).
before_each(function() end)
|
|
|
|
|
2017-03-04 18:02:45 -07:00
|
|
|
itp('utf_ptr2char', function()
  -- Every single byte value, followed by a NUL byte, is expected to come
  -- back from utf_ptr2char unchanged.
  for byte_value = 0, 255 do
    local input = to_string({ byte_value, 0 })
    eq(byte_value, lib.utf_ptr2char(input))
  end

  -- Disabled checks, kept for reference:
  -- Some ill formed byte sequences that should not be recognized as UTF-8
  -- First byte: 0xc0 or 0xc1
  -- Second byte: 0x80 .. 0xbf
  --eq(0x00c0, lib.utf_ptr2char(to_string({0xc0, 0x80})))
  --eq(0x00c1, lib.utf_ptr2char(to_string({0xc1, 0xbf})))
  --
  -- Sequences with more than four bytes
end)
|
|
|
|
|
2018-11-15 05:06:37 -07:00
|
|
|
-- Register one test per 0x1000-wide slice of the range 0x0000 - 0xFFFF.
for block = 0, 0xF do
  local first = block * 0x1000
  local last = first + 0xFFF
  local title = string.format('utf_char2bytes for chars 0x%x - 0x%x', first, last)

  itp(title, function()
    local new_buf = ffi.typeof('char[?]')
    for ch = first, last do
      -- Fresh 4-element char buffer, initialised with 0, for every character.
      local encoded = new_buf(4, 0)
      lib.utf_char2bytes(ch, encoded)
      -- Encoding then decoding must give back the same character.
      eq(ch, lib.utf_ptr2char(encoded))
      -- The by-value and by-pointer word-character checks must agree.
      eq(lib.vim_iswordc(ch), lib.vim_iswordp(encoded))
    end
  end)
end
|
2016-04-15 12:15:12 -07:00
|
|
|
|
2023-11-06 06:52:27 -07:00
|
|
|
describe('utfc_ptr2schar_len', function()
|
|
|
|
-- Run utfc_ptr2schar_len on the byte sequence `seq` and unpack the result.
-- Returns a two-element table: the text that schar_get wrote into a 32-byte
-- buffer (read back with ffi.string), and the int stored in the `firstc`
-- out-parameter.
local function test_seq(seq)
  local first_char = ffi.new('int[1]')
  local text = ffi.new('char[32]')
  local input = to_string(seq)
  local schar = lib.utfc_ptr2schar_len(input, #seq, first_char)
  lib.schar_get(text, schar)
  return { ffi.string(text), first_char[0] }
end
|
|
|
|
|
|
|
|
-- Expected-result helper: pairs the one-byte string for `val` with `val`
-- itself, in the same {string, number} shape that test_seq returns.
local function byte(val)
  local char = string.char(val)
  return { char, val }
end
|
2016-04-15 12:15:12 -07:00
|
|
|
|
2017-03-04 18:02:45 -07:00
|
|
|
itp('1-byte sequences', function()
  for c = 0, 255 do
    local expected
    if c >= 1 and c <= 127 then
      -- 1..127: the byte itself is expected as the text.
      expected = byte(c)
    else
      -- 0 and 128..255: empty text is expected, with the byte value as the
      -- first character.
      expected = { '', c }
    end
    eq(expected, test_seq({ c }))
  end
end)
|
|
|
|
|
2017-03-04 18:02:45 -07:00
|
|
|
itp('2-byte sequences', function()
  -- Each case: `want` is the expected test_seq result for the bytes in `seq`.
  local cases = {
    -- No combining characters
    { want = byte(0x7f), seq = { 0x7f, 0x7f } },
    -- No combining characters
    { want = byte(0x7f), seq = { 0x7f, 0x80 } },
    -- No UTF-8 sequence
    { want = { '', 0xc2 }, seq = { 0xc2, 0x7f } },
    -- One UTF-8 character
    { want = { '\xc2\x80', 0x80 }, seq = { 0xc2, 0x80 } },
    -- No UTF-8 sequence
    { want = { '', 0xc2 }, seq = { 0xc2, 0xc0 } },
  }

  for _, case in ipairs(cases) do
    eq(case.want, test_seq(case.seq))
  end
end)
|
|
|
|
|
2017-03-04 18:02:45 -07:00
|
|
|
itp('3-byte sequences', function()
  -- Each case: `want` is the expected test_seq result for the bytes in `seq`.
  local cases = {
    -- No second UTF-8 character
    { want = byte(0x7f), seq = { 0x7f, 0x80, 0x80 } },
    -- No combining character
    { want = byte(0x7f), seq = { 0x7f, 0xc2, 0x80 } },
    -- Combining character is U+0300
    { want = { "\x7f\xcc\x80", 0x7f }, seq = { 0x7f, 0xcc, 0x80 } },
    -- No UTF-8 sequence
    { want = { '', 0xc2 }, seq = { 0xc2, 0x7f, 0xcc } },
    -- Incomplete combining character
    { want = { "\xc2\x80", 0x80 }, seq = { 0xc2, 0x80, 0xcc } },
    -- One UTF-8 character (composing only)
    { want = { " \xe2\x83\x90", 0x20d0 }, seq = { 0xe2, 0x83, 0x90 } },
  }

  for _, case in ipairs(cases) do
    eq(case.want, test_seq(case.seq))
  end
end)
|
|
|
|
|
2017-03-04 18:02:45 -07:00
|
|
|
itp('4-byte sequences', function()
  -- Each case: `want` is the expected test_seq result for the bytes in `seq`.
  local cases = {
    -- No following combining character
    { want = byte(0x7f), seq = { 0x7f, 0x7f, 0xcc, 0x80 } },
    -- No second UTF-8 character
    { want = byte(0x7f), seq = { 0x7f, 0xc2, 0xcc, 0x80 } },
    -- Combining character U+0300
    { want = { "\x7f\xcc\x80", 0x7f }, seq = { 0x7f, 0xcc, 0x80, 0xcc } },
    -- No UTF-8 sequence
    { want = { '', 0xc2 }, seq = { 0xc2, 0x7f, 0xcc, 0x80 } },
    -- No following UTF-8 character
    { want = { "\xc2\x80", 0x80 }, seq = { 0xc2, 0x80, 0xcc, 0xcc } },
    -- Combining character U+0301
    { want = { "\xc2\x80\xcc\x81", 0x80 }, seq = { 0xc2, 0x80, 0xcc, 0x81 } },
    -- One UTF-8 character
    { want = { "\xf4\x80\x80\x80", 0x100000 }, seq = { 0xf4, 0x80, 0x80, 0x80 } },
  }

  for _, case in ipairs(cases) do
    eq(case.want, test_seq(case.seq))
  end
end)
|
|
|
|
|
2017-03-04 18:02:45 -07:00
|
|
|
itp('5+-byte sequences', function()
  -- Each case: `want` is the expected test_seq result for the bytes in `seq`.
  local cases = {
    -- No following combining character
    { want = byte(0x7f), seq = { 0x7f, 0x7f, 0xcc, 0x80, 0x80 } },
    -- No second UTF-8 character
    { want = byte(0x7f), seq = { 0x7f, 0xc2, 0xcc, 0x80, 0x80 } },
    -- Combining character U+0300
    { want = { "\x7f\xcc\x80", 0x7f }, seq = { 0x7f, 0xcc, 0x80, 0xcc, 0x00 } },
    -- Combining characters U+0300 and U+0301
    { want = { "\x7f\xcc\x80\xcc\x81", 0x7f }, seq = { 0x7f, 0xcc, 0x80, 0xcc, 0x81 } },
    -- Combining characters U+0300, U+0301, U+0302
    {
      want = { "\x7f\xcc\x80\xcc\x81\xcc\x82", 0x7f },
      seq = { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82 },
    },
    -- Combining characters U+0300, U+0301, U+0302, U+0303
    {
      want = { "\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83", 0x7f },
      seq = { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83 },
    },
    -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304
    {
      want = { "\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84", 0x7f },
      seq = { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84 },
    },
    -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305
    {
      want = { "\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85", 0x7f },
      seq = { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85 },
    },
    -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305, U+0306
    {
      want = { "\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86", 0x7f },
      seq = { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85, 0xcc, 0x86 },
    },
    -- Only three following combining characters U+0300, U+0301, U+0302
    {
      want = { "\x7f\xcc\x80\xcc\x81\xcc\x82", 0x7f },
      seq = { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85 },
    },
    -- No UTF-8 sequence
    { want = { '', 0xc2 }, seq = { 0xc2, 0x7f, 0xcc, 0x80, 0x80 } },
    -- No following UTF-8 character
    { want = { "\xc2\x80", 0x80 }, seq = { 0xc2, 0x80, 0xcc, 0xcc, 0x80 } },
    -- Combining character U+0301
    { want = { "\xc2\x80\xcc\x81", 0x80 }, seq = { 0xc2, 0x80, 0xcc, 0x81, 0x7f } },
    -- Combining character U+0301
    { want = { "\xc2\x80\xcc\x81", 0x80 }, seq = { 0xc2, 0x80, 0xcc, 0x81, 0xcc } },
    -- One UTF-8 character
    { want = { "\xf4\x80\x80\x80", 0x100000 }, seq = { 0xf4, 0x80, 0x80, 0x80, 0x7f } },
    -- One UTF-8 character
    { want = { "\xf4\x80\x80\x80", 0x100000 }, seq = { 0xf4, 0x80, 0x80, 0x80, 0x80 } },
    -- One UTF-8 character
    { want = { "\xf4\x80\x80\x80", 0x100000 }, seq = { 0xf4, 0x80, 0x80, 0x80, 0xcc } },
    -- Combining characters U+1AB0 and U+0301
    {
      want = { "\xf4\x80\x80\x80\xe1\xaa\xb0\xcc\x81", 0x100000 },
      seq = { 0xf4, 0x80, 0x80, 0x80, 0xe1, 0xaa, 0xb0, 0xcc, 0x81 },
    },
  }

  for _, case in ipairs(cases) do
    eq(case.want, test_seq(case.seq))
  end
end)
|
|
|
|
|
|
|
|
end)
|
|
|
|
|
|
|
|
end)
|