neovim/test/unit/mbyte_spec.lua

local helpers = require("test.unit.helpers")(after_each)
local itp = helpers.gen_itp(it)

local ffi     = helpers.ffi
local eq      = helpers.eq

local mbyte = helpers.cimport("./src/nvim/mbyte.h")
local charset = helpers.cimport('./src/nvim/charset.h')

describe('mbyte', function()
  if helpers.isCI('quickbuild') then
    pending("crashes on quickbuild", function() end)
  end

  -- Array for composing characters
  local intp = ffi.typeof('int[?]')
  local function to_intp()
    -- how to get MAX_MCO from globals.h?
    return intp(7, 1)
  end

  -- Convert from bytes to string
  local function to_string(bytes)
    local s = {}
    for i = 1, #bytes do
      s[i] = string.char(bytes[i])
    end
    return table.concat(s)
  end

  before_each(function()
  end)

  itp('utf_ptr2char', function()
    -- For strings with length 1 the first byte is returned.
    for c = 0, 255 do
      eq(c, mbyte.utf_ptr2char(to_string({c, 0})))
    end

    -- Some ill formed byte sequences that should not be recognized as UTF-8
    -- First byte: 0xc0 or 0xc1
    -- Second byte: 0x80 .. 0xbf
    --eq(0x00c0, mbyte.utf_ptr2char(to_string({0xc0, 0x80})))
    --eq(0x00c1, mbyte.utf_ptr2char(to_string({0xc1, 0xbf})))
    --
    -- Sequences with more than four bytes
  end)

  for n = 0, 0xF do
    itp(('utf_char2bytes for chars 0x%x - 0x%x'):format(n * 0x1000, n * 0x1000 + 0xFFF), function()
      local char_p = ffi.typeof('char[?]')
      for c = n * 0x1000, n * 0x1000 + 0xFFF do
        local p = char_p(4, 0)
        mbyte.utf_char2bytes(c, p)
        eq(c, mbyte.utf_ptr2char(p))
        eq(charset.vim_iswordc(c), charset.vim_iswordp(p))
      end
    end)
  end

  describe('utfc_ptr2char_len', function()

    itp('1-byte sequences', function()
      local pcc = to_intp()
      for c = 0, 255 do
        eq(c, mbyte.utfc_ptr2char_len(to_string({c}), pcc, 1))
        eq(0, pcc[0])
      end
    end)

    itp('2-byte sequences', function()
      local pcc = to_intp()
      -- No combining characters
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x7f}), pcc, 2))
      eq(0, pcc[0])
      -- No combining characters
      pcc = to_intp()
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x80}), pcc, 2))
      eq(0, pcc[0])

      -- No UTF-8 sequence
      pcc = to_intp()
      eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x7f}), pcc, 2))
      eq(0, pcc[0])
      -- One UTF-8 character
      pcc = to_intp()
      eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80}), pcc, 2))
      eq(0, pcc[0])
      -- No UTF-8 sequence
      pcc = to_intp()
      eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0xc0}), pcc, 2))
      eq(0, pcc[0])
    end)

    itp('3-byte sequences', function()
      local pcc = to_intp()

      -- No second UTF-8 character
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x80, 0x80}), pcc, 3))
      eq(0, pcc[0])
      -- No combining character
      pcc = to_intp()
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xc2, 0x80}), pcc, 3))
      eq(0, pcc[0])

      -- Combining character is U+0300
      pcc = to_intp()
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80}), pcc, 3))
      eq(0x0300, pcc[0])
      eq(0x0000, pcc[1])

      -- No UTF-8 sequence
      pcc = to_intp()
      eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x7f, 0xcc}), pcc, 3))
      eq(0, pcc[0])
      -- Incomplete combining character
      pcc = to_intp()
      eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc}), pcc, 3))
      eq(0, pcc[0])

      -- One UTF-8 character
      pcc = to_intp()
      eq(0x20d0, mbyte.utfc_ptr2char_len(to_string({0xe2, 0x83, 0x90}), pcc, 3))
      eq(0, pcc[0])
    end)

    itp('4-byte sequences', function()
      local pcc = to_intp()

      -- No following combining character
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x7f, 0xcc, 0x80}), pcc, 4))
      eq(0, pcc[0])
      -- No second UTF-8 character
      pcc = to_intp()
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xc2, 0xcc, 0x80}), pcc, 4))
      eq(0, pcc[0])

      -- Combining character U+0300
      pcc = to_intp()
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc}), pcc, 4))
      eq(0x0300, pcc[0])
      eq(0x0000, pcc[1])

      -- No UTF-8 sequence
      pcc = to_intp()
      eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x7f, 0xcc, 0x80}), pcc, 4))
      eq(0, pcc[0])
      -- No following UTF-8 character
      pcc = to_intp()
      eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0xcc}), pcc, 4))
      eq(0, pcc[0])
      -- Combining character U+0301
      pcc = to_intp()
      eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0x81}), pcc, 4))
      eq(0x0301, pcc[0])
      eq(0x0000, pcc[1])

      -- One UTF-8 character
      pcc = to_intp()
      eq(0x100000, mbyte.utfc_ptr2char_len(to_string({0xf4, 0x80, 0x80, 0x80}), pcc, 4))
      eq(0, pcc[0])
    end)

    itp('5+-byte sequences', function()
      local pcc = to_intp()

      -- No following combining character
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x7f, 0xcc, 0x80, 0x80}), pcc, 5))
      eq(0, pcc[0])
      -- No second UTF-8 character
      pcc = to_intp()
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xc2, 0xcc, 0x80, 0x80}), pcc, 5))
      eq(0, pcc[0])

      -- Combining character U+0300
      pcc = to_intp()
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc}), pcc, 5))
      eq(0x0300, pcc[0])
      eq(0x0000, pcc[1])

      -- Combining characters U+0300 and U+0301
      pcc = to_intp()
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc, 0x81}), pcc, 5))
      eq(0x0300, pcc[0])
      eq(0x0301, pcc[1])
      eq(0x0000, pcc[2])
      -- Combining characters U+0300, U+0301, U+0302
      pcc = to_intp()
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82}), pcc, 7))
      eq(0x0300, pcc[0])
      eq(0x0301, pcc[1])
      eq(0x0302, pcc[2])
      eq(0x0000, pcc[3])
      -- Combining characters U+0300, U+0301, U+0302, U+0303
      pcc = to_intp()
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83}), pcc, 9))
      eq(0x0300, pcc[0])
      eq(0x0301, pcc[1])
      eq(0x0302, pcc[2])
      eq(0x0303, pcc[3])
      eq(0x0000, pcc[4])
      -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304
      pcc = to_intp()
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string(
        {0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84}), pcc, 11))
      eq(0x0300, pcc[0])
      eq(0x0301, pcc[1])
      eq(0x0302, pcc[2])
      eq(0x0303, pcc[3])
      eq(0x0304, pcc[4])
      eq(0x0000, pcc[5])
      -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304,
      -- U+0305
      pcc = to_intp()
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string(
        {0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85}), pcc, 13))
      eq(0x0300, pcc[0])
      eq(0x0301, pcc[1])
      eq(0x0302, pcc[2])
      eq(0x0303, pcc[3])
      eq(0x0304, pcc[4])
      eq(0x0305, pcc[5])
      eq(1, pcc[6])

      -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304,
      -- U+0305, U+0306, but only save six (= MAX_MCO).
      pcc = to_intp()
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string(
        {0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85, 0xcc, 0x86}), pcc, 15))
      eq(0x0300, pcc[0])
      eq(0x0301, pcc[1])
      eq(0x0302, pcc[2])
      eq(0x0303, pcc[3])
      eq(0x0304, pcc[4])
      eq(0x0305, pcc[5])
      eq(0x0001, pcc[6])

      -- Only three following combining characters U+0300, U+0301, U+0302
      pcc = to_intp()
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string(
        {0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85}), pcc, 13))
      eq(0x0300, pcc[0])
      eq(0x0301, pcc[1])
      eq(0x0302, pcc[2])
      eq(0x0000, pcc[3])


      -- No UTF-8 sequence
      pcc = to_intp()
      eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x7f, 0xcc, 0x80, 0x80}), pcc, 5))
      eq(0, pcc[0])
      -- No following UTF-8 character
      pcc = to_intp()
      eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0xcc, 0x80}), pcc, 5))
      eq(0, pcc[0])
      -- Combining character U+0301
      pcc = to_intp()
      eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0x81, 0x7f}), pcc, 5))
      eq(0x0301, pcc[0])
      eq(0x0000, pcc[1])
      -- Combining character U+0301
      pcc = to_intp()
      eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0x81, 0xcc}), pcc, 5))
      eq(0x0301, pcc[0])
      eq(0x0000, pcc[1])

      -- One UTF-8 character
      pcc = to_intp()
      eq(0x100000, mbyte.utfc_ptr2char_len(to_string({0xf4, 0x80, 0x80, 0x80, 0x7f}), pcc, 5))
      eq(0, pcc[0])

      -- One UTF-8 character
      pcc = to_intp()
      eq(0x100000, mbyte.utfc_ptr2char_len(to_string({0xf4, 0x80, 0x80, 0x80, 0x80}), pcc, 5))
      eq(0, pcc[0])
      -- One UTF-8 character
      pcc = to_intp()
      eq(0x100000, mbyte.utfc_ptr2char_len(to_string({0xf4, 0x80, 0x80, 0x80, 0xcc}), pcc, 5))
      eq(0, pcc[0])

      -- Combining characters U+1AB0 and U+0301
      pcc = to_intp()
      eq(0x100000, mbyte.utfc_ptr2char_len(to_string(
        {0xf4, 0x80, 0x80, 0x80, 0xe1, 0xaa, 0xb0, 0xcc, 0x81}), pcc, 9))
      eq(0x1ab0, pcc[0])
      eq(0x0301, pcc[1])
      eq(0x0000, pcc[2])
    end)

  end)

end)