neovim/test/unit/mbyte_spec.lua

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

207 lines
6.7 KiB
Lua
Raw Normal View History

local helpers = require('test.unit.helpers')(after_each)
local itp = helpers.gen_itp(it)
local ffi = helpers.ffi
local eq = helpers.eq
local lib = helpers.cimport('./src/nvim/mbyte.h', './src/nvim/charset.h', './src/nvim/grid.h')
describe('mbyte', function()
-- Build a Lua string from a sequence of byte values: element i of `bytes`
-- becomes character i of the result (via string.char).
local function to_string(bytes)
  local count = #bytes
  local chars = {}
  local pos = 1
  while pos <= count do
    chars[pos] = string.char(bytes[pos])
    pos = pos + 1
  end
  return table.concat(chars)
end
-- No-op setup hook: the callback body is empty, so nothing runs before each test here.
before_each(function() end)
itp('utf_ptr2char', function()
  -- A single byte followed by a NUL byte: utf_ptr2char is expected to return
  -- the value of that first byte, for every byte value 0..255.
  for byte_val = 0, 255 do
    local input = to_string({ byte_val, 0 })
    eq(byte_val, lib.utf_ptr2char(input))
  end
  -- Disabled checks, kept for reference: ill-formed byte sequences that should
  -- not be recognized as UTF-8.
  --   First byte:  0xc0 or 0xc1
  --   Second byte: 0x80 .. 0xbf
  --eq(0x00c0, lib.utf_ptr2char(to_string({0xc0, 0x80})))
  --eq(0x00c1, lib.utf_ptr2char(to_string({0xc1, 0xbf})))
  --
  -- Sequences with more than four bytes
end)
-- Register one test per 0x1000-wide slice of the range 0x0000..0xFFFF
-- (sixteen tests in total).
for n = 0, 0xF do
  local first = n * 0x1000
  local last = first + 0xFFF
  local title = string.format('utf_char2bytes for chars 0x%x - 0x%x', first, last)
  itp(title, function()
    local new_buf = ffi.typeof('char[?]')
    for c = first, last do
      -- Fresh 4-element char buffer, initialised with 0, for every value.
      local encoded = new_buf(4, 0)
      lib.utf_char2bytes(c, encoded)
      -- Reading the written bytes back must give the original value.
      eq(c, lib.utf_ptr2char(encoded))
      -- The by-value and by-pointer word-character checks must agree.
      eq(lib.vim_iswordc(c), lib.vim_iswordp(encoded))
    end
  end)
end
describe('utfc_ptr2schar_len', function()
-- Feed the byte sequence `seq` to utfc_ptr2schar_len and return a pair
-- { text, first }: `text` is what schar_get wrote into a 32-byte buffer for
-- the resulting value, `first` is what was stored through the int
-- out-parameter.
local function test_seq(seq)
  local first_out = ffi.new('int[1]')
  local text_buf = ffi.new('char[32]')
  local input = to_string(seq)
  local schar = lib.utfc_ptr2schar_len(input, #seq, first_out)
  lib.schar_get(text_buf, schar)
  return { ffi.string(text_buf), first_out[0] }
end
-- Build the pair { one-character string for `val`, `val` } that the tests
-- compare against test_seq results.
local function byte(val)
  local as_text = string.char(val)
  return { as_text, val }
end
itp('1-byte sequences', function()
  -- Walk every byte value once, in ascending order. Values 1..127 are
  -- expected to come back as themselves; 0 and 128..255 are expected to give
  -- an empty string while still reporting the byte as the first character.
  for c = 0, 255 do
    local expected
    if c >= 1 and c <= 127 then
      expected = byte(c)
    else
      expected = { '', c }
    end
    eq(expected, test_seq({ c }))
  end
end)
itp('2-byte sequences', function()
  -- Each case is { expected result, input bytes }; cases run in order.
  local cases = {
    -- No combining characters
    { byte(0x7f), { 0x7f, 0x7f } },
    -- No combining characters
    { byte(0x7f), { 0x7f, 0x80 } },
    -- No UTF-8 sequence
    { { '', 0xc2 }, { 0xc2, 0x7f } },
    -- One UTF-8 character
    { { '\xc2\x80', 0x80 }, { 0xc2, 0x80 } },
    -- No UTF-8 sequence
    { { '', 0xc2 }, { 0xc2, 0xc0 } },
  }
  for _, case in ipairs(cases) do
    eq(case[1], test_seq(case[2]))
  end
end)
itp('3-byte sequences', function()
  -- Each case is { expected result, input bytes }; cases run in order.
  local cases = {
    -- No second UTF-8 character
    { byte(0x7f), { 0x7f, 0x80, 0x80 } },
    -- No combining character
    { byte(0x7f), { 0x7f, 0xc2, 0x80 } },
    -- Combining character is U+0300
    { { '\x7f\xcc\x80', 0x7f }, { 0x7f, 0xcc, 0x80 } },
    -- No UTF-8 sequence
    { { '', 0xc2 }, { 0xc2, 0x7f, 0xcc } },
    -- Incomplete combining character
    { { '\xc2\x80', 0x80 }, { 0xc2, 0x80, 0xcc } },
    -- One UTF-8 character (composing only)
    { { ' \xe2\x83\x90', 0x20d0 }, { 0xe2, 0x83, 0x90 } },
  }
  for _, case in ipairs(cases) do
    eq(case[1], test_seq(case[2]))
  end
end)
itp('4-byte sequences', function()
  -- Each case is { expected result, input bytes }; cases run in order.
  local cases = {
    -- No following combining character
    { byte(0x7f), { 0x7f, 0x7f, 0xcc, 0x80 } },
    -- No second UTF-8 character
    { byte(0x7f), { 0x7f, 0xc2, 0xcc, 0x80 } },
    -- Combining character U+0300
    { { '\x7f\xcc\x80', 0x7f }, { 0x7f, 0xcc, 0x80, 0xcc } },
    -- No UTF-8 sequence
    { { '', 0xc2 }, { 0xc2, 0x7f, 0xcc, 0x80 } },
    -- No following UTF-8 character
    { { '\xc2\x80', 0x80 }, { 0xc2, 0x80, 0xcc, 0xcc } },
    -- Combining character U+0301
    { { '\xc2\x80\xcc\x81', 0x80 }, { 0xc2, 0x80, 0xcc, 0x81 } },
    -- One UTF-8 character
    { { '\xf4\x80\x80\x80', 0x100000 }, { 0xf4, 0x80, 0x80, 0x80 } },
  }
  for _, case in ipairs(cases) do
    eq(case[1], test_seq(case[2]))
  end
end)
itp('5+-byte sequences', function()
  -- Each case is { expected result, input bytes }; cases run in order.
  local cases = {
    -- No following combining character
    { byte(0x7f), { 0x7f, 0x7f, 0xcc, 0x80, 0x80 } },
    -- No second UTF-8 character
    { byte(0x7f), { 0x7f, 0xc2, 0xcc, 0x80, 0x80 } },
    -- Combining character U+0300
    { { '\x7f\xcc\x80', 0x7f }, { 0x7f, 0xcc, 0x80, 0xcc, 0x00 } },
    -- Combining characters U+0300 and U+0301
    { { '\x7f\xcc\x80\xcc\x81', 0x7f }, { 0x7f, 0xcc, 0x80, 0xcc, 0x81 } },
    -- Combining characters U+0300, U+0301, U+0302
    {
      { '\x7f\xcc\x80\xcc\x81\xcc\x82', 0x7f },
      { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82 },
    },
    -- Combining characters U+0300, U+0301, U+0302, U+0303
    {
      { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83', 0x7f },
      { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83 },
    },
    -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304
    {
      { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84', 0x7f },
      { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84 },
    },
    -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305
    {
      { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85', 0x7f },
      { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85 },
    },
    -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305, U+0306
    {
      { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86', 0x7f },
      {
        0x7f,
        0xcc, 0x80,
        0xcc, 0x81,
        0xcc, 0x82,
        0xcc, 0x83,
        0xcc, 0x84,
        0xcc, 0x85,
        0xcc, 0x86,
      },
    },
    -- Only three following combining characters U+0300, U+0301, U+0302
    {
      { '\x7f\xcc\x80\xcc\x81\xcc\x82', 0x7f },
      { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85 },
    },
    -- No UTF-8 sequence
    { { '', 0xc2 }, { 0xc2, 0x7f, 0xcc, 0x80, 0x80 } },
    -- No following UTF-8 character
    { { '\xc2\x80', 0x80 }, { 0xc2, 0x80, 0xcc, 0xcc, 0x80 } },
    -- Combining character U+0301
    { { '\xc2\x80\xcc\x81', 0x80 }, { 0xc2, 0x80, 0xcc, 0x81, 0x7f } },
    -- Combining character U+0301
    { { '\xc2\x80\xcc\x81', 0x80 }, { 0xc2, 0x80, 0xcc, 0x81, 0xcc } },
    -- One UTF-8 character
    { { '\xf4\x80\x80\x80', 0x100000 }, { 0xf4, 0x80, 0x80, 0x80, 0x7f } },
    -- One UTF-8 character
    { { '\xf4\x80\x80\x80', 0x100000 }, { 0xf4, 0x80, 0x80, 0x80, 0x80 } },
    -- One UTF-8 character
    { { '\xf4\x80\x80\x80', 0x100000 }, { 0xf4, 0x80, 0x80, 0x80, 0xcc } },
    -- Combining characters U+1AB0 and U+0301
    {
      { '\xf4\x80\x80\x80\xe1\xaa\xb0\xcc\x81', 0x100000 },
      { 0xf4, 0x80, 0x80, 0x80, 0xe1, 0xaa, 0xb0, 0xcc, 0x81 },
    },
  }
  for _, case in ipairs(cases) do
    eq(case[1], test_seq(case[2]))
  end
end)
end)
end)