2017-09-10 15:27:46 -07:00
|
|
|
#include <stddef.h>
|
2017-10-08 11:52:38 -07:00
|
|
|
#include <inttypes.h>
|
|
|
|
#include <assert.h>
|
|
|
|
#include <stdbool.h>
|
2017-09-10 15:27:46 -07:00
|
|
|
|
|
|
|
#include "nvim/types.h"
|
|
|
|
#include "nvim/mbyte.h"
|
|
|
|
#include "nvim/ascii.h"
|
|
|
|
|
2017-10-08 11:52:38 -07:00
|
|
|
// Length of a UTF-8 sequence, indexed by its first byte.
// Bytes that cannot start a sequence (0x80..0xBF, 0xFE and 0xFF) map to 0.
const uint8_t utf8len_tab_zero[] = {
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x00..0x0F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x10..0x1F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x20..0x2F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x30..0x3F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x40..0x4F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x50..0x5F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x60..0x6F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x70..0x7F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x80..0x8F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x90..0x9F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xA0..0xAF
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xB0..0xBF
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xC0..0xCF
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xD0..0xDF
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // 0xE0..0xEF
  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0,  // 0xF0..0xFF
};
|
|
|
|
|
|
|
|
// Length of a UTF-8 sequence, indexed by its first byte.
// Unlike utf8len_tab_zero, bytes that cannot start a sequence
// (0x80..0xBF, 0xFE and 0xFF) map to 1, so every entry is at least 1.
const uint8_t utf8len_tab[] = {
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  // 0x00..0x1F
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  // 0x20..0x3F
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  // 0x40..0x5F
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  // 0x60..0x7F
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  // 0x80..0x9F
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  // 0xA0..0xBF
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  // 0xC0..0xDF
  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,  4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1,  // 0xE0..0xFF
};
|
|
|
|
|
|
|
|
int utf_ptr2char(const char_u *const p)
|
|
|
|
{
|
|
|
|
if (p[0] < 0x80) { // Be quick for ASCII.
|
|
|
|
return p[0];
|
|
|
|
}
|
|
|
|
|
|
|
|
const uint8_t len = utf8len_tab_zero[p[0]];
|
|
|
|
if (len > 1 && (p[1] & 0xc0) == 0x80) {
|
|
|
|
if (len == 2) {
|
|
|
|
return ((p[0] & 0x1f) << 6) + (p[1] & 0x3f);
|
|
|
|
}
|
|
|
|
if ((p[2] & 0xc0) == 0x80) {
|
|
|
|
if (len == 3) {
|
|
|
|
return (((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6)
|
|
|
|
+ (p[2] & 0x3f));
|
|
|
|
}
|
|
|
|
if ((p[3] & 0xc0) == 0x80) {
|
|
|
|
if (len == 4) {
|
|
|
|
return (((p[0] & 0x07) << 18) + ((p[1] & 0x3f) << 12)
|
|
|
|
+ ((p[2] & 0x3f) << 6) + (p[3] & 0x3f));
|
|
|
|
}
|
|
|
|
if ((p[4] & 0xc0) == 0x80) {
|
|
|
|
if (len == 5) {
|
|
|
|
return (((p[0] & 0x03) << 24) + ((p[1] & 0x3f) << 18)
|
|
|
|
+ ((p[2] & 0x3f) << 12) + ((p[3] & 0x3f) << 6)
|
|
|
|
+ (p[4] & 0x3f));
|
|
|
|
}
|
|
|
|
if ((p[5] & 0xc0) == 0x80 && len == 6) {
|
|
|
|
return (((p[0] & 0x01) << 30) + ((p[1] & 0x3f) << 24)
|
|
|
|
+ ((p[2] & 0x3f) << 18) + ((p[3] & 0x3f) << 12)
|
|
|
|
+ ((p[4] & 0x3f) << 6) + (p[5] & 0x3f));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// Illegal value: just return the first byte.
|
|
|
|
return p[0];
|
|
|
|
}
|
|
|
|
|
|
|
|
// Stub: reports that the character at `p2` never composes with the one at
// `p1`. Both arguments are ignored.
bool utf_composinglike(const char_u *p1, const char_u *p2)
{
  (void)p1;
  (void)p2;
  return false;
}
|
|
|
|
|
2017-09-10 15:27:46 -07:00
|
|
|
// Stub: performs no conversion and always returns NULL.
// All arguments are ignored; `*size` is not modified.
char_u *string_convert(const vimconv_T *conv, char_u *data, size_t *size)
{
  (void)conv;
  (void)data;
  (void)size;
  return NULL;
}
|
|
|
|
|
|
|
|
// Stub that is not expected to be called: it trips an assertion when
// assertions are enabled and returns 0 otherwise. Both arguments are ignored.
int utfc_ptr2len_len(const char_u *p, int size)
{
  (void)p;
  (void)size;
  assert(false);
  return 0;
}
|
|
|
|
|
|
|
|
// Return the number of bytes (1 to 6) that utf_char2bytes() writes for the
// character number `c`. Any value below 0x80, negative ones included,
// yields 1.
int utf_char2len(const int c)
{
  // Smallest character number that needs (index + 2) bytes.
  static const int threshold[] = {
    0x80, 0x800, 0x10000, 0x200000, 0x4000000,
  };

  int len = 1;
  while (len < 6 && c >= threshold[len - 1]) {
    len++;
  }
  return len;
}
|
|
|
|
|
|
|
|
// Encode the character number `c` as UTF-8 into `buf` and return the number
// of bytes written (1 to 6). No terminating NUL is stored.
//
// Values below 0x80 (negative ones included) are stored as a single byte;
// larger values use the 2- to 6-byte forms holding 11, 16, 21, 26 and
// 31 bits. `buf` must have room for the bytes written.
int utf_char2bytes(const int c, char_u *const buf)
{
  // Smallest character number that needs (index + 2) bytes.
  static const int threshold[] = {
    0x80, 0x800, 0x10000, 0x200000, 0x4000000,
  };
  // Marker bits of the lead byte, indexed by sequence length.
  static const uint8_t lead_marker[] = {
    0x00, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc,
  };

  int len = 1;
  while (len < 6 && c >= threshold[len - 1]) {
    len++;
  }

  if (len == 1) {  // 7 bits: stored as is
    buf[0] = c;
    return 1;
  }

  // Fill the continuation bytes from the tail, six payload bits at a time;
  // whatever remains afterwards belongs in the lead byte.
  unsigned rest = (unsigned)c;
  for (int i = len - 1; i > 0; i--) {
    buf[i] = 0x80 + (rest & 0x3f);
    rest >>= 6;
  }
  buf[0] = lead_marker[len] + rest;
  return len;
}
|
|
|
|
|
|
|
|
int utf_ptr2len(const char_u *const p)
|
|
|
|
{
|
|
|
|
if (*p == NUL) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
const int len = utf8len_tab[*p];
|
|
|
|
for (int i = 1; i < len; i++) {
|
|
|
|
if ((p[i] & 0xc0) != 0x80) {
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return len;
|
|
|
|
}
|
|
|
|
|
|
|
|
int utfc_ptr2len(const char_u *const p)
|
|
|
|
{
|
|
|
|
uint8_t b0 = (uint8_t)(*p);
|
|
|
|
|
|
|
|
if (b0 == NUL) {
|
2017-09-10 15:27:46 -07:00
|
|
|
return 0;
|
|
|
|
}
|
2017-10-08 11:52:38 -07:00
|
|
|
if (b0 < 0x80 && p[1] < 0x80) { // be quick for ASCII
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Skip over first UTF-8 char, stopping at a NUL byte.
|
|
|
|
int len = utf_ptr2len(p);
|
|
|
|
|
|
|
|
// Check for illegal byte.
|
|
|
|
if (len == 1 && b0 >= 0x80) {
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Check for composing characters. We can handle only the first six, but
|
|
|
|
// skip all of them (otherwise the cursor would get stuck).
|
|
|
|
int prevlen = 0;
|
|
|
|
for (;;) {
|
|
|
|
if (p[len] < 0x80 || !UTF_COMPOSINGLIKE(p + prevlen, p + len)) {
|
|
|
|
return len;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Skip over composing char.
|
|
|
|
prevlen = len;
|
|
|
|
len += utf_ptr2len(p + len);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Copy one character, as measured by utfc_ptr2len(), from `*fp` to `*tp`
// and advance both pointers past it.
//
// memmove() is used, so the source and destination may overlap. Nothing is
// copied and neither pointer moves when `*fp` points at a NUL byte.
void mb_copy_char(const char_u **fp, char_u **tp)
{
  const size_t nbytes = utfc_ptr2len(*fp);

  memmove(*tp, *fp, nbytes);
  *fp += nbytes;
  *tp += nbytes;
}
|