mirror of
https://github.com/neovim/neovim.git
synced 2024-12-19 10:45:16 -07:00
perf: don't decode utf8 character multiple times in getvcol()
The optimized virtual column calculation loop in getvcol() was decoding the current character twice: once in ptr2cells() and a second time in utfc_ptr2len(). Combining characters were decoded up to 2 times in utfc_ptr2len(). Additionally, the function used to decode the character could be further optimised.
This commit is contained in:
parent
2f2f12122f
commit
b5653984e5
@ -257,6 +257,7 @@ bool arabic_maycombine(int two)
|
||||
}
|
||||
|
||||
/// Check whether we are dealing with Arabic combining characters.
|
||||
/// Returns false for negative values.
|
||||
/// Note: these are NOT really composing characters!
|
||||
///
|
||||
/// @param one First character.
|
||||
|
@ -111,10 +111,13 @@
|
||||
#endif
|
||||
|
||||
#if defined(__clang__) || defined(__GNUC__)
|
||||
# define EXPECT(cond, value) __builtin_expect((cond), (value))
|
||||
# define UNREACHABLE __builtin_unreachable()
|
||||
#elif defined(_MSVC_VER)
|
||||
#elif defined(_MSC_VER)
|
||||
# define EXPECT(cond, value) (cond)
|
||||
# define UNREACHABLE __assume(false)
|
||||
#else
|
||||
# define EXPECT(cond, value) (cond)
|
||||
# define UNREACHABLE
|
||||
#endif
|
||||
|
||||
|
170
src/nvim/mbyte.c
170
src/nvim/mbyte.c
@ -528,6 +528,74 @@ int utf_ptr2cells(const char *p)
|
||||
return 1;
|
||||
}
|
||||
|
||||
/// Convert a UTF-8 byte sequence to a Unicode code point.
|
||||
/// Doesn't handle ASCII! Only multibyte and illegal sequences
/// (the second byte is always read, see the comment in the body).
|
||||
///
|
||||
/// @param[in] p String to convert.
|
||||
/// @param[in] len Length of the character in bytes, 0 or 1 if illegal.
///                Must be in the range 0-6: it indexes a 7-entry table.
|
||||
///
|
||||
/// @return Unicode code point. A negative value when the sequence is illegal.
|
||||
int32_t utf_ptr2CharInfo_impl(uint8_t const *p, uintptr_t const len)
|
||||
FUNC_ATTR_PURE FUNC_ATTR_NONNULL_ALL FUNC_ATTR_WARN_UNUSED_RESULT
|
||||
{
|
||||
// Return -1 unless `cur` (the byte just read) is a continuation byte (10xxxxxx).
// uint8_t is a reminder for clang to use smaller cmp
|
||||
#define CHECK \
|
||||
do { \
|
||||
if (EXPECT((uint8_t)(cur & 0xC0U) != 0x80U, false)) { \
|
||||
return -1; \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
// Value added to the raw accumulated bytes, indexed by `len`.
static uint32_t const corrections[] = {
|
||||
(1U << 31), // invalid - set invalid bits (safe to add as first 2 bytes
|
||||
(1U << 31), // won't affect highest bit in normal ret)
|
||||
-(0x80U + (0xC0U << 6)), // multibyte - subtract added UTF8 bits (1..10xxx and 10xxx)
|
||||
-(0x80U + (0x80U << 6) + (0xE0U << 12)),
|
||||
-(0x80U + (0x80U << 6) + (0x80U << 12) + (0xF0U << 18)),
|
||||
-(0x80U + (0x80U << 6) + (0x80U << 12) + (0x80U << 18) + (0xF8U << 24)),
|
||||
-(0x80U + (0x80U << 6) + (0x80U << 12) + (0x80U << 18) + (0x80U << 24)), // + (0xFCU << 30)
|
||||
};
|
||||
|
||||
// len is 0-6, but declared uintptr_t to avoid zeroing out upper bits
|
||||
uint32_t const corr = corrections[len];
|
||||
// Holds the byte read by the current step; CHECK tests it.
uint8_t cur;
|
||||
|
||||
// reading second byte unconditionally, safe for invalid
|
||||
// as it cannot be the last byte, not safe for ascii
|
||||
// The bytes are accumulated raw, 6 bits apart; the UTF-8 marker bits are
// removed at the end by adding `corr`.
uint32_t code_point = ((uint32_t)p[0] << 6) + (cur = p[1]);
|
||||
CHECK;
|
||||
if ((uint32_t)len < 3) {
|
||||
goto ret; // len == 0, 1, 2
|
||||
}
|
||||
|
||||
code_point = (code_point << 6) + (cur = p[2]);
|
||||
CHECK;
|
||||
if ((uint32_t)len == 3) {
|
||||
goto ret;
|
||||
}
|
||||
|
||||
code_point = (code_point << 6) + (cur = p[3]);
|
||||
CHECK;
|
||||
if ((uint32_t)len == 4) {
|
||||
goto ret;
|
||||
}
|
||||
|
||||
code_point = (code_point << 6) + (cur = p[4]);
|
||||
CHECK;
|
||||
if ((uint32_t)len == 5) {
|
||||
goto ret;
|
||||
}
|
||||
|
||||
code_point = (code_point << 6) + (cur = p[5]);
|
||||
CHECK;
|
||||
// len == 6
|
||||
|
||||
ret:
|
||||
// For len 0 and 1 the correction sets the top bit, making the result negative.
return (int32_t)(code_point + corr);
|
||||
|
||||
#undef CHECK
|
||||
}
|
||||
|
||||
/// Like utf_ptr2cells(), but limit string length to "size".
|
||||
/// For an empty string or truncated character returns 1.
|
||||
int utf_ptr2cells_len(const char *p, int size)
|
||||
@ -597,45 +665,62 @@ size_t mb_string2cells_len(const char *str, size_t size)
|
||||
///
|
||||
/// @return Unicode codepoint or byte value.
|
||||
int utf_ptr2char(const char *const p_in)
|
||||
FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT
|
||||
FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL
|
||||
{
|
||||
uint8_t *p = (uint8_t *)p_in;
|
||||
if (p[0] < 0x80) { // Be quick for ASCII.
|
||||
return p[0];
|
||||
|
||||
uint32_t const v0 = p[0];
|
||||
if (EXPECT(v0 < 0x80U, true)) { // Be quick for ASCII.
|
||||
return (int)v0;
|
||||
}
|
||||
|
||||
const uint8_t len = utf8len_tab_zero[p[0]];
|
||||
if (len > 1 && (p[1] & 0xc0) == 0x80) {
|
||||
if (len == 2) {
|
||||
return ((p[0] & 0x1f) << 6) + (p[1] & 0x3f);
|
||||
}
|
||||
if ((p[2] & 0xc0) == 0x80) {
|
||||
if (len == 3) {
|
||||
return (((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6)
|
||||
+ (p[2] & 0x3f));
|
||||
}
|
||||
if ((p[3] & 0xc0) == 0x80) {
|
||||
if (len == 4) {
|
||||
return (((p[0] & 0x07) << 18) + ((p[1] & 0x3f) << 12)
|
||||
+ ((p[2] & 0x3f) << 6) + (p[3] & 0x3f));
|
||||
}
|
||||
if ((p[4] & 0xc0) == 0x80) {
|
||||
if (len == 5) {
|
||||
return (((p[0] & 0x03) << 24) + ((p[1] & 0x3f) << 18)
|
||||
+ ((p[2] & 0x3f) << 12) + ((p[3] & 0x3f) << 6)
|
||||
+ (p[4] & 0x3f));
|
||||
}
|
||||
if ((p[5] & 0xc0) == 0x80 && len == 6) {
|
||||
return (((p[0] & 0x01) << 30) + ((p[1] & 0x3f) << 24)
|
||||
+ ((p[2] & 0x3f) << 18) + ((p[3] & 0x3f) << 12)
|
||||
+ ((p[4] & 0x3f) << 6) + (p[5] & 0x3f));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
const uint8_t len = utf8len_tab[v0];
|
||||
if (EXPECT(len < 2, false)) {
|
||||
return (int)v0;
|
||||
}
|
||||
// Illegal value: just return the first byte.
|
||||
return p[0];
|
||||
|
||||
#define CHECK(v) \
|
||||
do { \
|
||||
if (EXPECT((uint8_t)((v) & 0xC0U) != 0x80U, false)) { \
|
||||
return (int)v0; \
|
||||
} \
|
||||
} while (0)
|
||||
#define LEN_RETURN(len_v, result) \
|
||||
do { \
|
||||
if (len == (len_v)) { \
|
||||
return (int)(result); \
|
||||
} \
|
||||
} while (0)
|
||||
#define S(s) ((uint32_t)0x80U << (s))
|
||||
|
||||
uint32_t const v1 = p[1];
|
||||
CHECK(v1);
|
||||
LEN_RETURN(2, (v0 << 6) + v1 - ((0xC0U << 6) + S(0)));
|
||||
|
||||
uint32_t const v2 = p[2];
|
||||
CHECK(v2);
|
||||
LEN_RETURN(3, (v0 << 12) + (v1 << 6) + v2 - ((0xE0U << 12) + S(6) + S(0)));
|
||||
|
||||
uint32_t const v3 = p[3];
|
||||
CHECK(v3);
|
||||
LEN_RETURN(4, (v0 << 18) + (v1 << 12) + (v2 << 6) + v3
|
||||
- ((0xF0U << 18) + S(12) + S(6) + S(0)));
|
||||
|
||||
uint32_t const v4 = p[4];
|
||||
CHECK(v4);
|
||||
LEN_RETURN(5, (v0 << 24) + (v1 << 18) + (v2 << 12) + (v3 << 6) + v4
|
||||
- ((0xF8U << 24) + S(18) + S(12) + S(6) + S(0)));
|
||||
|
||||
uint32_t const v5 = p[5];
|
||||
CHECK(v5);
|
||||
// len == 6
|
||||
return (int)((v0 << 30) + (v1 << 24) + (v2 << 18) + (v3 << 12) + (v4 << 6) + v5
|
||||
// - (0xFCU << 30)
|
||||
- (S(24) + S(18) + S(12) + S(6) + S(0)));
|
||||
|
||||
#undef S
|
||||
#undef CHECK
|
||||
#undef LEN_RETURN
|
||||
}
|
||||
|
||||
// Convert a UTF-8 byte sequence to a wide character.
|
||||
@ -722,6 +807,16 @@ bool utf_composinglike(const char *p1, const char *p2)
|
||||
return arabic_combine(utf_ptr2char(p1), c2);
|
||||
}
|
||||
|
||||
/// Check if the next character is a composing character when it
|
||||
/// comes after the first. For Arabic sometimes "ab" is replaced with "c", which
|
||||
/// behaves like a composing character.
|
||||
/// Returns false for negative values.
///
/// @param[in] first Code point of the preceding character.
/// @param[in] next Code point of the character that follows `first`.
///
/// @return true if utf_iscomposing(next) or arabic_combine(first, next) is true.
|
||||
bool utf_char_composinglike(int32_t const first, int32_t const next)
|
||||
FUNC_ATTR_PURE
|
||||
{
|
||||
return utf_iscomposing(next) || arabic_combine(first, next);
|
||||
}
|
||||
|
||||
/// Get the screen char at the beginning of a string
|
||||
///
|
||||
/// Caller is expected to check for things like unprintable chars etc
|
||||
@ -988,9 +1083,10 @@ int utf_char2bytes(const int c, char *const buf)
|
||||
}
|
||||
}
|
||||
|
||||
// Return true if "c" is a composing UTF-8 character. This means it will be
|
||||
// drawn on top of the preceding character.
|
||||
// Based on code from Markus Kuhn.
|
||||
/// Return true if "c" is a composing UTF-8 character.
|
||||
/// This means it will be drawn on top of the preceding character.
|
||||
/// Based on code from Markus Kuhn.
|
||||
/// Returns false for negative values.
|
||||
bool utf_iscomposing(int c)
|
||||
{
|
||||
return intable(combining, ARRAY_SIZE(combining), c);
|
||||
|
@ -6,6 +6,7 @@
|
||||
|
||||
#include "nvim/cmdexpand_defs.h" // IWYU pragma: keep
|
||||
#include "nvim/eval/typval_defs.h" // IWYU pragma: keep
|
||||
#include "nvim/macros_defs.h"
|
||||
#include "nvim/mbyte_defs.h" // IWYU pragma: keep
|
||||
#include "nvim/types_defs.h" // IWYU pragma: keep
|
||||
|
||||
@ -13,6 +14,10 @@
|
||||
# include "mbyte.h.generated.h"
|
||||
#endif
|
||||
|
||||
/// NOTE(review): kInvalidByteCells looks like the number of display cells
/// charged for a byte of an illegal sequence; the getvcol() hunk of this change
/// uses it as the column increment when the decoded value is negative. Confirm.
enum {
|
||||
kInvalidByteCells = 4,
|
||||
};
|
||||
|
||||
// Return byte length of character that starts with byte "b".
|
||||
// Returns 1 for a single-byte character.
|
||||
// MB_BYTE2LEN_CHECK() can be used to count a special key as one byte.
|
||||
@ -44,3 +49,64 @@ extern const uint8_t utf8len_tab[256];
|
||||
// multi-byte characters if needed. Only use with "p" > "s" !
|
||||
#define MB_PTR_BACK(s, p) \
|
||||
(p -= utf_head_off((char *)(s), (char *)(p) - 1) + 1)
|
||||
|
||||
static inline CharInfo utf_ptr2CharInfo(char const *p_in)
|
||||
REAL_FATTR_NONNULL_ALL REAL_FATTR_PURE REAL_FATTR_WARN_UNUSED_RESULT REAL_FATTR_ALWAYS_INLINE;
|
||||
|
||||
/// Convert a UTF-8 byte sequence to a Unicode code point.
|
||||
/// Handles ASCII, multibyte sequences and illegal sequences.
|
||||
///
|
||||
/// @param[in] p_in String to convert.
|
||||
///
|
||||
/// @return information about the character. When the sequence is illegal,
|
||||
/// 'value' is negative, 'len' is 1.
|
||||
static inline CharInfo utf_ptr2CharInfo(char const *const p_in)
|
||||
{
|
||||
uint8_t const *const p = (uint8_t const *)p_in;
|
||||
uint8_t const first = *p;
|
||||
// ASCII is handled here; utf_ptr2CharInfo_impl() is only called for bytes >= 0x80.
if (first < 0x80) {
|
||||
return (CharInfo){ .value = first, .len = 1 };
|
||||
} else {
|
||||
int len = utf8len_tab[first];
|
||||
int32_t const code_point = utf_ptr2CharInfo_impl(p, (uintptr_t)len);
|
||||
// A negative code point marks an illegal sequence: report a length of one byte.
if (code_point < 0) {
|
||||
len = 1;
|
||||
}
|
||||
return (CharInfo){ .value = code_point, .len = len };
|
||||
}
|
||||
}
|
||||
|
||||
static inline StrCharInfo utfc_next(StrCharInfo cur)
|
||||
REAL_FATTR_NONNULL_ALL REAL_FATTR_ALWAYS_INLINE REAL_FATTR_PURE;
|
||||
|
||||
/// Return information about the next character.
|
||||
/// Composing and combining characters are
|
||||
/// considered a part of the current character.
|
||||
///
|
||||
/// @param[in] cur The current character: 'cur.ptr' points to its first byte
///                (must not point to NUL),
|
||||
///                'cur.chr' is the decoded character at 'cur.ptr'.
///
/// @return pointer to and decoded value of the first following character that
///         is not composing-like. When that sequence is illegal, its 'value'
///         is negative and its 'len' is 1.
|
||||
static inline StrCharInfo utfc_next(StrCharInfo cur)
|
||||
{
|
||||
int32_t prev_code = cur.chr.value;
|
||||
uint8_t *next = (uint8_t *)(cur.ptr + cur.chr.len);
|
||||
|
||||
while (true) {
|
||||
// A byte below 0x80 is returned as the next character right away,
// without decoding or a composing check.
if (EXPECT(*next < 0x80U, true)) {
|
||||
return (StrCharInfo){
|
||||
.ptr = (char *)next,
|
||||
.chr = (CharInfo){ .value = *next, .len = 1 },
|
||||
};
|
||||
}
|
||||
uint8_t const next_len = utf8len_tab[*next];
|
||||
int32_t const next_code = utf_ptr2CharInfo_impl(next, (uintptr_t)next_len);
|
||||
if (!utf_char_composinglike(prev_code, next_code)) {
|
||||
return (StrCharInfo){
|
||||
.ptr = (char *)next,
|
||||
.chr = (CharInfo){ .value = next_code, .len = (next_code < 0 ? 1 : next_len) },
|
||||
};
|
||||
}
|
||||
|
||||
// `next` belongs to the current character: skip it and use it as the
// reference for the following composing check.
prev_code = next_code;
|
||||
next += next_len;
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,7 @@
|
||||
#pragma once
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "nvim/iconv_defs.h"
|
||||
|
||||
@ -55,3 +56,13 @@ typedef struct {
|
||||
bool vc_fail; ///< What to do with invalid characters: if true, fail,
|
||||
///< otherwise use '?'.
|
||||
} vimconv_T;
|
||||
|
||||
/// A decoded character: its code point and its length in bytes.
typedef struct {
|
||||
int32_t value; ///< code point (utf_ptr2CharInfo() stores a negative value for an illegal sequence)
|
||||
int len; ///< length in bytes
|
||||
} CharInfo;
|
||||
|
||||
/// A position in a string together with the character decoded at that position.
typedef struct {
|
||||
char *ptr; ///< pointer to the first byte of the character
|
||||
CharInfo chr; ///< the character
|
||||
} StrCharInfo;
|
||||
|
@ -511,55 +511,47 @@ void getvcol(win_T *wp, pos_T *pos, colnr_T *start, colnr_T *cursor, colnr_T *en
|
||||
cts.cts_max_head_vcol = -1;
|
||||
|
||||
// This function is used very often, do some speed optimizations.
|
||||
// When 'list', 'linebreak', 'showbreak' and 'breakindent' are not set
|
||||
// When 'linebreak', 'showbreak' and 'breakindent' are not set
|
||||
// and there are no virtual text use a simple loop.
|
||||
// Also use this when 'list' is set but tabs take their normal size.
|
||||
if ((!wp->w_p_list || (wp->w_p_lcs_chars.tab1 != NUL))
|
||||
&& !wp->w_p_lbr
|
||||
&& *get_showbreak_value(wp) == NUL
|
||||
&& !wp->w_p_bri
|
||||
&& cts.virt_row < 0) {
|
||||
if (!wp->w_p_lbr && !wp->w_p_bri && cts.virt_row < 0 && *get_showbreak_value(wp) == NUL) {
|
||||
bool const special_tab = !wp->w_p_list || wp->w_p_lcs_chars.tab1 != NUL;
|
||||
CharInfo cur_char = utf_ptr2CharInfo(ptr);
|
||||
while (true) {
|
||||
head = 0;
|
||||
int c = (uint8_t)(*ptr);
|
||||
|
||||
// make sure we don't go past the end of the line
|
||||
if (c == NUL) {
|
||||
if (cur_char.value == 0 && cur_char.len == 1) {
|
||||
// NUL at end of line only takes one column
|
||||
incr = 1;
|
||||
break;
|
||||
}
|
||||
|
||||
// A tab gets expanded, depending on the current column
|
||||
if (c == TAB) {
|
||||
if (cur_char.value == TAB && special_tab) {
|
||||
incr = tabstop_padding(vcol, ts, vts);
|
||||
} else {
|
||||
// For utf-8, if the byte is >= 0x80, need to look at
|
||||
// further bytes to find the cell width.
|
||||
if (c >= 0x80) {
|
||||
incr = utf_ptr2cells(ptr);
|
||||
if (cur_char.value < 0) {
|
||||
incr = kInvalidByteCells;
|
||||
} else {
|
||||
incr = byte2cells(c);
|
||||
incr = char2cells(cur_char.value);
|
||||
}
|
||||
|
||||
// If a double-cell char doesn't fit at the end of a line
|
||||
// it wraps to the next line, it's like this char is three
|
||||
// cells wide.
|
||||
if ((incr == 2)
|
||||
&& wp->w_p_wrap
|
||||
&& (MB_BYTE2LEN((uint8_t)(*ptr)) > 1)
|
||||
&& in_win_border(wp, vcol)) {
|
||||
if (incr == 2 && cur_char.value >= 0x80
|
||||
&& wp->w_p_wrap && in_win_border(wp, vcol)) {
|
||||
incr++;
|
||||
head = 1;
|
||||
}
|
||||
}
|
||||
|
||||
char *const next = ptr + utfc_ptr2len(ptr);
|
||||
if ((uintptr_t)next > last_pos) {
|
||||
StrCharInfo const next_char = utfc_next((StrCharInfo){ ptr, cur_char });
|
||||
if ((uintptr_t)next_char.ptr > last_pos) {
|
||||
break;
|
||||
}
|
||||
|
||||
ptr = next;
|
||||
cur_char = next_char.chr;
|
||||
ptr = next_char.ptr;
|
||||
vcol += incr;
|
||||
}
|
||||
} else {
|
||||
|
Loading…
Reference in New Issue
Block a user