mirror of
https://github.com/neovim/neovim.git
synced 2024-12-23 20:55:18 -07:00
fix(mbyte): fix bugs in utf_cp_*_off() functions
Problems: - Illegal bytes after valid UTF-8 char cause utf_cp_*_off() to fail. - When stream isn't NUL-terminated, utf_cp_*_off() may go over the end. Solution: Don't go over end of the char of end of the string.
This commit is contained in:
parent
8b4e269156
commit
ad5a155b1f
@ -203,7 +203,8 @@ information.
|
||||
mb_off2cells utf_off2cells
|
||||
mb_ptr2char utf_ptr2char
|
||||
mb_head_off utf_head_off
|
||||
mb_tail_off utf_cp_tail_off
|
||||
mb_tail_off utf_cp_bounds
|
||||
mb_off_next utf_cp_bounds
|
||||
mb_lefthalve grid_lefthalve
|
||||
mb_fix_col grid_fix_col
|
||||
utf_off2cells grid_off2cells
|
||||
|
@ -3436,7 +3436,7 @@ void maketitle(void)
|
||||
int len = (int)strlen(buf_p);
|
||||
if (len > 100) {
|
||||
len -= 100;
|
||||
len += utf_cp_tail_off(buf_p, buf_p + len) + 1;
|
||||
len += utf_cp_bounds(buf_p, buf_p + len).end_off;
|
||||
buf_p += len;
|
||||
}
|
||||
STRCPY(icon_str, buf_p);
|
||||
|
@ -1512,8 +1512,8 @@ static int command_line_erase_chars(CommandLineState *s)
|
||||
}
|
||||
|
||||
if (s->c == K_DEL) {
|
||||
ccline.cmdpos += mb_off_next(ccline.cmdbuff,
|
||||
ccline.cmdbuff + ccline.cmdpos);
|
||||
CharBoundsOff bounds = utf_cp_bounds(ccline.cmdbuff, ccline.cmdbuff + ccline.cmdpos);
|
||||
ccline.cmdpos += bounds.begin_off != 0 ? bounds.end_off : 0;
|
||||
}
|
||||
|
||||
if (ccline.cmdpos > 0) {
|
||||
|
@ -277,7 +277,7 @@ void line_do_arabic_shape(schar_T *buf, int cols)
|
||||
// Too bigly, discard one code-point.
|
||||
// This should be enough as c0 cannot grow more than from 2 to 4 bytes
|
||||
// (base arabic to extended arabic)
|
||||
rest -= (size_t)utf_cp_head_off(scbuf + off, scbuf + off + rest - 1) + 1;
|
||||
rest -= (size_t)utf_cp_bounds(scbuf + off, scbuf + off + rest - 1).begin_off + 1;
|
||||
}
|
||||
memcpy(scbuf_new + len, scbuf + off, rest);
|
||||
buf[i] = schar_from_buf(scbuf_new, len + rest);
|
||||
|
@ -226,11 +226,12 @@ static int nlua_str_utf_start(lua_State *const lstate) FUNC_ATTR_NONNULL_ALL
|
||||
size_t s1_len;
|
||||
const char *s1 = luaL_checklstring(lstate, 1, &s1_len);
|
||||
ptrdiff_t offset = luaL_checkinteger(lstate, 2);
|
||||
if (offset < 0 || offset > (intptr_t)s1_len) {
|
||||
if (offset <= 0 || offset > (intptr_t)s1_len) {
|
||||
return luaL_error(lstate, "index out of range");
|
||||
}
|
||||
int head_offset = -utf_cp_head_off(s1, s1 + offset - 1);
|
||||
lua_pushinteger(lstate, head_offset);
|
||||
size_t const off = (size_t)(offset - 1);
|
||||
int head_off = -utf_cp_bounds_len(s1, s1 + off, (int)(s1_len - off)).begin_off;
|
||||
lua_pushinteger(lstate, head_off);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -246,11 +247,12 @@ static int nlua_str_utf_end(lua_State *const lstate) FUNC_ATTR_NONNULL_ALL
|
||||
size_t s1_len;
|
||||
const char *s1 = luaL_checklstring(lstate, 1, &s1_len);
|
||||
ptrdiff_t offset = luaL_checkinteger(lstate, 2);
|
||||
if (offset < 0 || offset > (intptr_t)s1_len) {
|
||||
if (offset <= 0 || offset > (intptr_t)s1_len) {
|
||||
return luaL_error(lstate, "index out of range");
|
||||
}
|
||||
int tail_offset = utf_cp_tail_off(s1, s1 + offset - 1);
|
||||
lua_pushinteger(lstate, tail_offset);
|
||||
size_t const off = (size_t)(offset - 1);
|
||||
int tail_off = utf_cp_bounds_len(s1, s1 + off, (int)(s1_len - off)).end_off - 1;
|
||||
lua_pushinteger(lstate, tail_off);
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
117
src/nvim/mbyte.c
117
src/nvim/mbyte.c
@ -1884,99 +1884,52 @@ void mb_copy_char(const char **const fp, char **const tp)
|
||||
*fp += l;
|
||||
}
|
||||
|
||||
/// Return the offset from "p_in" to the first byte of a character. When "p_in" is
|
||||
/// at the start of a character 0 is returned, otherwise the offset to the next
|
||||
/// character. Can start anywhere in a stream of bytes.
|
||||
int mb_off_next(const char *base, const char *p_in)
|
||||
{
|
||||
const uint8_t *p = (uint8_t *)p_in;
|
||||
int i;
|
||||
|
||||
if (*p < 0x80) { // be quick for ASCII
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Find the next character that isn't 10xx.xxxx
|
||||
for (i = 0; (p[i] & 0xc0) == 0x80; i++) {}
|
||||
if (i > 0) {
|
||||
int j;
|
||||
// Check for illegal sequence.
|
||||
for (j = 0; p - j > (uint8_t *)base; j++) {
|
||||
if ((p[-j] & 0xc0) != 0x80) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (utf8len_tab[p[-j]] != i + j) {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
/// Return the offset from `p_in` to the last byte of the codepoint it points
|
||||
/// to. Can start anywhere in a stream of bytes.
|
||||
/// Returns the offset in bytes from "p_in" to the first and one-past-end bytes
|
||||
/// of the codepoint it points to.
|
||||
/// "p_in" can point anywhere in a stream of bytes.
|
||||
/// "p_len" limits number of bytes after "p_in".
|
||||
/// Note: Counts individual codepoints of composed characters separately.
|
||||
int utf_cp_tail_off(const char *base, const char *p_in)
|
||||
CharBoundsOff utf_cp_bounds_len(char const *base, char const *p_in, int p_len)
|
||||
FUNC_ATTR_PURE FUNC_ATTR_NONNULL_ALL
|
||||
{
|
||||
const uint8_t *p = (uint8_t *)p_in;
|
||||
int i;
|
||||
int j;
|
||||
|
||||
if (*p == NUL) {
|
||||
return 0;
|
||||
assert(base <= p_in && p_len > 0);
|
||||
uint8_t const *const b = (uint8_t *)base;
|
||||
uint8_t const *const p = (uint8_t *)p_in;
|
||||
if (*p < 0x80U) { // be quick for ASCII
|
||||
return (CharBoundsOff){ 0, 1 };
|
||||
}
|
||||
|
||||
// Find the last character that is 10xx.xxxx
|
||||
for (i = 0; (p[i + 1] & 0xc0) == 0x80; i++) {}
|
||||
|
||||
// Check for illegal sequence.
|
||||
for (j = 0; p_in - j > base; j++) {
|
||||
if ((p[-j] & 0xc0) != 0x80) {
|
||||
break;
|
||||
int const max_first_off = -MIN((int)(p - b), MB_MAXCHAR - 1);
|
||||
int first_off = 0;
|
||||
for (; utf_is_trail_byte(p[first_off]); first_off--) {
|
||||
if (first_off == max_first_off) { // failed to find first byte
|
||||
return (CharBoundsOff){ 0, 1 };
|
||||
}
|
||||
}
|
||||
|
||||
if (utf8len_tab[p[-j]] != i + j + 1) {
|
||||
return 0;
|
||||
int const max_end_off = utf8len_tab[p[first_off]] + first_off;
|
||||
if (max_end_off <= 0 || max_end_off > p_len) { // illegal or incomplete sequence
|
||||
return (CharBoundsOff){ 0, 1 };
|
||||
}
|
||||
return i;
|
||||
|
||||
for (int end_off = 1; end_off < max_end_off; end_off++) {
|
||||
if (!utf_is_trail_byte(p[end_off])) { // not enough trail bytes
|
||||
return (CharBoundsOff){ 0, 1 };
|
||||
}
|
||||
}
|
||||
|
||||
return (CharBoundsOff){ .begin_off = (int8_t)-first_off, .end_off = (int8_t)max_end_off };
|
||||
}
|
||||
|
||||
/// Return the offset from "p" to the first byte of the codepoint it points
|
||||
/// to. Can start anywhere in a stream of bytes.
|
||||
/// Note: Unlike `utf_head_off`, this counts individual codepoints of composed characters
|
||||
/// separately.
|
||||
///
|
||||
/// @param[in] base Pointer to start of string
|
||||
/// @param[in] p Pointer to byte for which to return the offset to the previous codepoint
|
||||
//
|
||||
/// @return 0 if invalid sequence, else number of bytes to previous codepoint
|
||||
int utf_cp_head_off(const char *base, const char *p)
|
||||
/// Returns the offset in bytes from "p_in" to the first and one-past-end bytes
|
||||
/// of the codepoint it points to.
|
||||
/// "p_in" can point anywhere in a stream of bytes.
|
||||
/// Stream must be NUL-terminated.
|
||||
/// Note: Counts individual codepoints of composed characters separately.
|
||||
CharBoundsOff utf_cp_bounds(char const *base, char const *p_in)
|
||||
FUNC_ATTR_PURE FUNC_ATTR_NONNULL_ALL
|
||||
{
|
||||
int i;
|
||||
|
||||
if (*p == NUL) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Find the first character that is not 10xx.xxxx
|
||||
for (i = 0; p - i >= base; i++) {
|
||||
if (((uint8_t)p[-i] & 0xc0) != 0x80) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Find the last character that is 10xx.xxxx (condition terminates on NUL)
|
||||
int j = 1;
|
||||
while (((uint8_t)p[j] & 0xc0) == 0x80) {
|
||||
j++;
|
||||
}
|
||||
|
||||
// Check for illegal sequence.
|
||||
if (utf8len_tab[(uint8_t)p[-i]] != j + i) {
|
||||
return 0;
|
||||
}
|
||||
return i;
|
||||
return utf_cp_bounds_len(base, p_in, INT_MAX);
|
||||
}
|
||||
|
||||
// Find the next illegal byte sequence.
|
||||
|
@ -52,6 +52,16 @@ extern const uint8_t utf8len_tab[256];
|
||||
#define MB_PTR_BACK(s, p) \
|
||||
(p -= utf_head_off((char *)(s), (char *)(p) - 1) + 1)
|
||||
|
||||
/// Check whether a given UTF-8 byte is a trailing byte (10xx.xxxx).
|
||||
static inline bool utf_is_trail_byte(uint8_t byte)
|
||||
REAL_FATTR_CONST REAL_FATTR_ALWAYS_INLINE;
|
||||
|
||||
static inline bool utf_is_trail_byte(uint8_t const byte)
|
||||
{
|
||||
// uint8_t is for clang to use smaller cmp
|
||||
return (uint8_t)(byte & 0xC0U) == 0x80U;
|
||||
}
|
||||
|
||||
static inline CharInfo utf_ptr2CharInfo(char const *p_in)
|
||||
REAL_FATTR_NONNULL_ALL REAL_FATTR_PURE REAL_FATTR_WARN_UNUSED_RESULT REAL_FATTR_ALWAYS_INLINE;
|
||||
|
||||
|
@ -66,3 +66,8 @@ typedef struct {
|
||||
char *ptr; ///< Pointer to the first byte of the character.
|
||||
CharInfo chr; ///< Information about the character.
|
||||
} StrCharInfo;
|
||||
|
||||
typedef struct {
|
||||
int8_t begin_off; ///< Offset to the first byte of the codepoint.
|
||||
int8_t end_off; ///< Offset to one past the end byte of the codepoint.
|
||||
} CharBoundsOff;
|
||||
|
@ -203,4 +203,74 @@ describe('mbyte', function()
|
||||
)
|
||||
end)
|
||||
end)
|
||||
|
||||
describe('utf_cp_bounds_len', function()
|
||||
local to_cstr = helpers.to_cstr
|
||||
|
||||
local tests = {
|
||||
{
|
||||
name = 'for valid string',
|
||||
str = 'iÀiiⱠiⱠⱠ𐀀i',
|
||||
offsets = {
|
||||
b = { 0, 0, 1, 0, 0, 0, 1, 2, 0, 0, 1, 2, 0, 1, 2, 0, 1, 2, 3, 0 },
|
||||
e = { 1, 2, 1, 1, 1, 3, 2, 1, 1, 3, 2, 1, 3, 2, 1, 4, 3, 2, 1, 1 },
|
||||
},
|
||||
},
|
||||
{
|
||||
name = 'for string with incomplete sequence',
|
||||
str = 'i\xC3iÀⱠiÀ\xE2\xB1Ⱡ\xF0\x90\x80',
|
||||
offsets = {
|
||||
b = { 0, 0, 0, 0, 1, 0, 1, 2, 0, 0, 1, 0, 0, 0, 1, 2, 0, 0, 0 },
|
||||
e = { 1, 1, 1, 2, 1, 3, 2, 1, 1, 2, 1, 1, 1, 3, 2, 1, 1, 1, 1 },
|
||||
},
|
||||
},
|
||||
{
|
||||
name = 'for string with trailing bytes after multibyte',
|
||||
str = 'iÀ\xA0Ⱡ\xA0Ⱡ𐀀\xA0i',
|
||||
offsets = {
|
||||
b = { 0, 0, 1, 0, 0, 1, 2, 0, 0, 1, 2, 0, 1, 2, 3, 0, 0 },
|
||||
e = { 1, 2, 1, 1, 3, 2, 1, 1, 3, 2, 1, 4, 3, 2, 1, 1, 1 },
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, test in ipairs(tests) do
|
||||
itp(test.name, function()
|
||||
local cstr = to_cstr(test.str)
|
||||
local b_offsets, e_offsets = {}, {}
|
||||
for i = 1, #test.str do
|
||||
local result = lib.utf_cp_bounds_len(cstr, cstr + i - 1, #test.str - (i - 1))
|
||||
table.insert(b_offsets, result.begin_off)
|
||||
table.insert(e_offsets, result.end_off)
|
||||
end
|
||||
eq(test.offsets, { b = b_offsets, e = e_offsets })
|
||||
end)
|
||||
end
|
||||
|
||||
itp('does not read before start', function()
|
||||
local str = '𐀀'
|
||||
local expected_offsets = { b = { 0, 0, 0 }, e = { 1, 1, 1 } }
|
||||
local cstr = to_cstr(str) + 1
|
||||
local b_offsets, e_offsets = {}, {}
|
||||
for i = 1, 3 do
|
||||
local result = lib.utf_cp_bounds_len(cstr, cstr + i - 1, 3 - (i - 1))
|
||||
table.insert(b_offsets, result.begin_off)
|
||||
table.insert(e_offsets, result.end_off)
|
||||
end
|
||||
eq(expected_offsets, { b = b_offsets, e = e_offsets })
|
||||
end)
|
||||
|
||||
itp('does not read past the end', function()
|
||||
local str = '𐀀'
|
||||
local expected_offsets = { b = { 0, 0, 0 }, e = { 1, 1, 1 } }
|
||||
local cstr = to_cstr(str)
|
||||
local b_offsets, e_offsets = {}, {}
|
||||
for i = 1, 3 do
|
||||
local result = lib.utf_cp_bounds_len(cstr, cstr + i - 1, 3 - (i - 1))
|
||||
table.insert(b_offsets, result.begin_off)
|
||||
table.insert(e_offsets, result.end_off)
|
||||
end
|
||||
eq(expected_offsets, { b = b_offsets, e = e_offsets })
|
||||
end)
|
||||
end)
|
||||
end)
|
||||
|
Loading…
Reference in New Issue
Block a user