neovim/runtime/lua/vim/lsp/sync.lua

-- Notes on incremental sync:
--  Per the protocol, the text range should be:
--
--  A position inside a document (see Position definition below) is expressed as
--  a zero-based line and character offset. The offsets are based on a UTF-16
--  string representation. So a string of the form a𐐀b the character offset
--  of the character a is 0, the character offset of 𐐀 is 1 and the character
--  offset of b is 3 since 𐐀 is represented using two code units in UTF-16.
--
--  To ensure that both client and server split the string into the same line
--  representation the protocol specifies the following end-of-line sequences: ‘\n’, ‘\r\n’ and ‘\r’.
--
--  Positions are line end character agnostic. So you can not specify a position that
--  denotes \r|\n or \n| where | represents the character offset. This means *not* defining
--  a range that ends on the same line after a terminating character.
--
-- Generic warnings about byte level changes in neovim. Many apparently "single"
-- operations in on_lines callbacks are actually multiple operations.
--
--  Join operation (2 operations):
--  * extends line 1 with the contents of line 2
--  * deletes line 2
--
--  test 1    test 1 test 2    test 1 test 2
--  test 2 -> test 2        -> test 3
--  test 3    test 3
--
--  Deleting (and undoing) two middle lines (1 operation):
--
--  test 1    test 1
--  test 2 -> test 4
--  test 3
--  test 4
--
--  Deleting partial lines (5 operations) deleting between asterisks below:
--
--  test *1   test *    test *     test *    test *4    test *4*
--  test 2 -> test 2 -> test *4 -> *4     -> *4      ->
--  test 3    test 3
--  test *4   test 4

local M = {}

-- local string.byte, unclear if this is necessary for JIT compilation
local str_byte = string.byte
local min = math.min
local str_utfindex = vim.str_utfindex
local str_utf_start = vim.str_utf_start
local str_utf_end = vim.str_utf_end

-- Given a line, byte idx, and offset_encoding convert to the
-- utf-8, utf-16, or utf-32 index.
---@param line string the line to index into
---@param byte integer the byte idx
---@param offset_encoding string utf-8|utf-16|utf-32|nil (default: utf-8)
--@returns integer the utf idx for the given encoding
local function byte_to_utf(line, byte, offset_encoding)
  -- convert to 0 based indexing for str_utfindex
  byte = byte - 1

  local utf_idx, _ --- @type integer, integer
  -- Convert the byte range to utf-{8,16,32} and convert 1-based (lua) indexing to 0-based
  if offset_encoding == 'utf-16' then
    _, utf_idx = str_utfindex(line, byte)
  elseif offset_encoding == 'utf-32' then
    utf_idx, _ = str_utfindex(line, byte)
  else
    utf_idx = byte
  end

  -- convert to 1 based indexing
  return utf_idx + 1
end

---@param line string
---@param offset_encoding string
---@return integer
local function compute_line_length(line, offset_encoding)
  local length, _ --- @type integer, integer
  if offset_encoding == 'utf-16' then
    _, length = str_utfindex(line)
  elseif offset_encoding == 'utf-32' then
    length, _ = str_utfindex(line)
  else
    length = #line
  end
  return length
end

-- Given a line, byte idx, alignment, and offset_encoding convert to the aligned
-- utf-8 index and either the utf-16, or utf-32 index.
---@param line string the line to index into
---@param byte integer the byte idx
---@param offset_encoding string utf-8|utf-16|utf-32|nil (default: utf-8)
---@return integer byte_idx of first change position
---@return integer char_idx of first change position
local function align_end_position(line, byte, offset_encoding)
  local char --- @type integer
  -- If on the first byte, or an empty string: the trivial case
  if byte == 1 or #line == 0 then
    char = byte
    -- Called in the case of extending an empty line "" -> "a"
  elseif byte == #line + 1 then
    char = compute_line_length(line, offset_encoding) + 1
  else
    -- Modifying line, find the nearest utf codepoint
    local offset = str_utf_start(line, byte)
    -- If the byte does not fall on the start of the character, then
    -- align to the start of the next character.
    if offset < 0 then
      byte = byte + str_utf_end(line, byte) + 1
    end
    if byte <= #line then
      char = byte_to_utf(line, byte, offset_encoding)
    else
      char = compute_line_length(line, offset_encoding) + 1
    end
    -- Extending line, find the nearest utf codepoint for the last valid character
  end
  return byte, char
end

--- Finds the first line, byte, and char index of the difference between the previous and current lines buffer normalized to the previous codepoint.
---@param prev_lines string[] list of lines from previous buffer
---@param curr_lines string[] list of lines from current buffer
---@param firstline integer firstline from on_lines, adjusted to 1-index
---@param lastline integer lastline from on_lines, adjusted to 1-index
---@param new_lastline integer new_lastline from on_lines, adjusted to 1-index
---@param offset_encoding string utf-8|utf-16|utf-32|nil (fallback to utf-8)
---@return table result table include line_idx, byte_idx, and char_idx of first change position
local function compute_start_range(
  prev_lines,
  curr_lines,
  firstline,
  lastline,
  new_lastline,
  offset_encoding
)
  local char_idx --- @type integer?
  local byte_idx --- @type integer?
  -- If firstline == lastline, no existing text is changed. All edit operations
  -- occur on a new line pointed to by lastline. This occurs during insertion of
  -- new lines(O), the new newline is inserted at the line indicated by
  -- new_lastline.
  if firstline == lastline then
    local line_idx --- @type integer
    local line = prev_lines[firstline - 1]
    if line then
      line_idx = firstline - 1
      byte_idx = #line + 1
      char_idx = compute_line_length(line, offset_encoding) + 1
    else
      line_idx = firstline
      byte_idx = 1
      char_idx = 1
    end
    return { line_idx = line_idx, byte_idx = byte_idx, char_idx = char_idx }
  end

  -- If firstline == new_lastline, the first change occurred on a line that was deleted.
  -- In this case, the first byte change is also at the first byte of firstline
  if firstline == new_lastline then
    return { line_idx = firstline, byte_idx = 1, char_idx = 1 }
  end

  local prev_line = prev_lines[firstline]
  local curr_line = curr_lines[firstline]

  -- Iterate across previous and current line containing first change
  -- to find the first different byte.
  -- Note: *about -> a*about will register the second a as the first
  -- difference, regardless of edit since we do not receive the first
  -- column of the edit from on_lines.
  local start_byte_idx = 1
  for idx = 1, #prev_line + 1 do
    start_byte_idx = idx
    if str_byte(prev_line, idx) ~= str_byte(curr_line, idx) then
      break
    end
  end

  -- Convert byte to codepoint if applicable
  if start_byte_idx == 1 or (#prev_line == 0 and start_byte_idx == 1) then
    byte_idx = start_byte_idx
    char_idx = 1
  elseif start_byte_idx == #prev_line + 1 then
    byte_idx = start_byte_idx
    char_idx = compute_line_length(prev_line, offset_encoding) + 1
  else
    byte_idx = start_byte_idx + str_utf_start(prev_line, start_byte_idx)
    char_idx = byte_to_utf(prev_line, byte_idx, offset_encoding)
  end

  -- Return the start difference (shared for new and prev lines)
  return { line_idx = firstline, byte_idx = byte_idx, char_idx = char_idx }
end

--- Finds the last line and byte index of the differences between prev and current buffer.
--- Normalized to the next codepoint.
--- prev_end_range is the text range sent to the server representing the changed region.
--- curr_end_range is the text that should be collected and sent to the server.
--
---@param start_range table
---@param prev_lines string[] list of lines
---@param curr_lines string[] list of lines
---@param firstline integer
---@param lastline integer
---@param new_lastline integer
---@param offset_encoding string
---@return integer|table end_line_idx and end_col_idx of range
---@return table|nil end_col_idx of range
local function compute_end_range(
  prev_lines,
  curr_lines,
  start_range,
  firstline,
  lastline,
  new_lastline,
  offset_encoding
)
  -- If firstline == new_lastline, the first change occurred on a line that was deleted.
  -- In this case, the last_byte...
  if firstline == new_lastline then
    return { line_idx = (lastline - new_lastline + firstline), byte_idx = 1, char_idx = 1 }, {
      line_idx = firstline,
      byte_idx = 1,
      char_idx = 1,
    }
  end
  if firstline == lastline then
    return { line_idx = firstline, byte_idx = 1, char_idx = 1 }, {
      line_idx = new_lastline - lastline + firstline,
      byte_idx = 1,
      char_idx = 1,
    }
  end
  -- Compare on last line, at minimum will be the start range
  local start_line_idx = start_range.line_idx

  -- lastline and new_lastline were last lines that were *not* replaced, compare previous lines
  local prev_line_idx = lastline - 1
  local curr_line_idx = new_lastline - 1

  local prev_line = prev_lines[lastline - 1]
  local curr_line = curr_lines[new_lastline - 1]

  local prev_line_length = #prev_line
  local curr_line_length = #curr_line

  local byte_offset = 0

  -- Editing the same line
  -- If the byte offset is zero, that means there is a difference on the last byte (not newline)
  if prev_line_idx == curr_line_idx then
    local max_length --- @type integer
    if start_line_idx == prev_line_idx then
      -- Search until beginning of difference
      max_length = min(
        prev_line_length - start_range.byte_idx,
        curr_line_length - start_range.byte_idx
      ) + 1
    else
      max_length = min(prev_line_length, curr_line_length) + 1
    end
    for idx = 0, max_length do
      byte_offset = idx
      if
        str_byte(prev_line, prev_line_length - byte_offset)
        ~= str_byte(curr_line, curr_line_length - byte_offset)
      then
        break
      end
    end
  end

  -- Iterate from end to beginning of shortest line
  local prev_end_byte_idx = prev_line_length - byte_offset + 1

  -- Handle case where lines match
  if prev_end_byte_idx == 0 then
    prev_end_byte_idx = 1
  end
  local prev_byte_idx, prev_char_idx =
    align_end_position(prev_line, prev_end_byte_idx, offset_encoding)
  local prev_end_range =
    { line_idx = prev_line_idx, byte_idx = prev_byte_idx, char_idx = prev_char_idx }

  local curr_end_range
  -- Deletion event, new_range cannot be before start
  if curr_line_idx < start_line_idx then
    curr_end_range = { line_idx = start_line_idx, byte_idx = 1, char_idx = 1 }
  else
    local curr_end_byte_idx = curr_line_length - byte_offset + 1
    -- Handle case where lines match
    if curr_end_byte_idx == 0 then
      curr_end_byte_idx = 1
    end
    local curr_byte_idx, curr_char_idx =
      align_end_position(curr_line, curr_end_byte_idx, offset_encoding)
    curr_end_range =
      { line_idx = curr_line_idx, byte_idx = curr_byte_idx, char_idx = curr_char_idx }
  end

  return prev_end_range, curr_end_range
end

--- Get the text of the range defined by start and end line/column
---@param lines table list of lines
---@param start_range table table returned by first_difference
---@param end_range table new_end_range returned by last_difference
---@return string text extracted from defined region
local function extract_text(lines, start_range, end_range, line_ending)
  if not lines[start_range.line_idx] then
    return ''
  end
  -- Trivial case: start and end range are the same line, directly grab changed text
  if start_range.line_idx == end_range.line_idx then
    -- string.sub is inclusive, end_range is not
    return string.sub(lines[start_range.line_idx], start_range.byte_idx, end_range.byte_idx - 1)
  else
    -- Handle deletion case
    -- Collect the changed portion of the first changed line
    local result = { string.sub(lines[start_range.line_idx], start_range.byte_idx) }

    -- Collect the full line for intermediate lines
    for idx = start_range.line_idx + 1, end_range.line_idx - 1 do
      table.insert(result, lines[idx])
    end

    if lines[end_range.line_idx] then
      -- Collect the changed portion of the last changed line.
      table.insert(result, string.sub(lines[end_range.line_idx], 1, end_range.byte_idx - 1))
    else
      table.insert(result, '')
    end

    -- Add line ending between all lines
    return table.concat(result, line_ending)
  end
end

-- rangelength depends on the offset encoding
-- bytes for utf-8 (clangd with extension)
-- codepoints for utf-16
-- codeunits for utf-32
-- Line endings count here as 2 chars for \r\n (dos), 1 char for \n (unix), and 1 char for \r (mac)
-- These correspond to Windows, Linux/macOS (OSX and newer), and macOS (version 9 and prior)
---@param lines string[]
---@param start_range table
---@param end_range table
---@param offset_encoding string
---@param line_ending string
---@return integer
local function compute_range_length(lines, start_range, end_range, offset_encoding, line_ending)
  local line_ending_length = #line_ending
  -- Single line case
  if start_range.line_idx == end_range.line_idx then
    return end_range.char_idx - start_range.char_idx
  end

  local start_line = lines[start_range.line_idx]
  local range_length --- @type integer
  if start_line and #start_line > 0 then
    range_length = compute_line_length(start_line, offset_encoding)
      - start_range.char_idx
      + 1
      + line_ending_length
  else
    -- Length of newline character
    range_length = line_ending_length
  end

  -- The first and last range of the line idx may be partial lines
  for idx = start_range.line_idx + 1, end_range.line_idx - 1 do
    -- Length full line plus newline character
    if #lines[idx] > 0 then
      range_length = range_length + compute_line_length(lines[idx], offset_encoding) + #line_ending
    else
      range_length = range_length + line_ending_length
    end
  end

  local end_line = lines[end_range.line_idx]
  if end_line and #end_line > 0 then
    range_length = range_length + end_range.char_idx - 1
  end

  return range_length
end

--- Returns the range table for the difference between prev and curr lines
---@param prev_lines table list of lines
---@param curr_lines table list of lines
---@param firstline integer line to begin search for first difference
---@param lastline integer line to begin search in old_lines for last difference
---@param new_lastline integer line to begin search in new_lines for last difference
---@param offset_encoding string encoding requested by language server
---@param line_ending string
---@return lsp.TextDocumentContentChangeEvent : see https://microsoft.github.io/language-server-protocol/specification/#textDocumentContentChangeEvent
function M.compute_diff(
  prev_lines,
  curr_lines,
  firstline,
  lastline,
  new_lastline,
  offset_encoding,
  line_ending
)
  -- Find the start of changes between the previous and current buffer. Common between both.
  -- Sent to the server as the start of the changed range.
  -- Used to grab the changed text from the latest buffer.
  local start_range = compute_start_range(
    prev_lines,
    curr_lines,
    firstline + 1,
    lastline + 1,
    new_lastline + 1,
    offset_encoding
  )
  -- Find the last position changed in the previous and current buffer.
  -- prev_end_range is sent to the server as as the end of the changed range.
  -- curr_end_range is used to grab the changed text from the latest buffer.
  local prev_end_range, curr_end_range = compute_end_range(
    prev_lines,
    curr_lines,
    start_range,
    firstline + 1,
    lastline + 1,
    new_lastline + 1,
    offset_encoding
  )

  -- Grab the changed text of from start_range to curr_end_range in the current buffer.
  -- The text range is "" if entire range is deleted.
  local text = extract_text(curr_lines, start_range, curr_end_range, line_ending)

  -- Compute the range of the replaced text. Deprecated but still required for certain language servers
  local range_length =
    compute_range_length(prev_lines, start_range, prev_end_range, offset_encoding, line_ending)

  -- convert to 0 based indexing
  local result = {
    range = {
      ['start'] = { line = start_range.line_idx - 1, character = start_range.char_idx - 1 },
      ['end'] = { line = prev_end_range.line_idx - 1, character = prev_end_range.char_idx - 1 },
    },
    text = text,
    rangeLength = range_length,
  }

  return result
end

return M
-												fix(lsp): rewrite incremental sync (#16252)

* use codeunits/points instead of byte ranges when applicable
* take into account different file formats when computing range and
  sending text (dos, unix, and mac supported)
* add tests of incremental sync
											
										
										
											2021-11-09 15:37:48 -07:00
+								-- Notes on incremental sync:
 								--  Per the protocol, the text range should be:
 								--
 								--  A position inside a document (see Position definition below) is expressed as
 								--  a zero-based line and character offset. The offsets are based on a UTF-16
 								--  string representation. So a string of the form a𐐀b the character offset
 								--  of the character a is 0, the character offset of 𐐀 is 1 and the character
 								--  offset of b is 3 since 𐐀 is represented using two code units in UTF-16.
 								--
 								--  To ensure that both client and server split the string into the same line
 								--  representation the protocol specifies the following end-of-line sequences: ‘\n’, ‘\r\n’ and ‘\r’.
 								--
 								--  Positions are line end character agnostic. So you can not specify a position that
 								--  denotes \r|\n or \n| where | represents the character offset. This means *no* defining
 								--  a range than ends on the same line after a terminating character
 								--
 								-- Generic warnings about byte level changes in neovim. Many apparently "single"
 								-- operations in on_lines callbacks are actually multiple operations.
 								--
 								--  Join operation (2 operations):
 								--  * extends line 1 with the contents of line 2
 								--  * deletes line 2
 								--
 								--  test 1    test 1 test 2    test 1 test 2
 								--  test 2 -> test 2        -> test 3
 								--  test 3    test 3
 								--
 								--  Deleting (and undoing) two middle lines (1 operation):
 								--
 								--  test 1    test 1
 								--  test 2 -> test 4
 								--  test 3
 								--  test 4
 								--
 								--  Deleting partial lines (5 operations) deleting between asterisks below:
 								--
 								--  test *1   test *    test *     test *    test *4    test *4*
 								--  test 2 -> test 2 -> test *4 -> *4     -> *4      ->
 								--  test 3    test 3
 								--  test *4   test 4
 								local M = {}
 								-- local string.byte, unclear if this is necessary for JIT compilation
 								local str_byte = string.byte
 								local min = math.min
 								local str_utfindex = vim.str_utfindex
 								local str_utf_start = vim.str_utf_start
 								local str_utf_end = vim.str_utf_end
 								-- Given a line, byte idx, and offset_encoding convert to the
 								-- utf-8, utf-16, or utf-32 index.
 								---@param line string the line to index into
 								---@param byte integer the byte idx
 								---@param offset_encoding string utf-8|utf-16|utf-32|nil (default: utf-8)
 								--@returns integer the utf idx for the given encoding
 								local function byte_to_utf(line, byte, offset_encoding)
 								  -- convert to 0 based indexing for str_utfindex
 								  byte = byte - 1
-												refactor(lsp): add type annotations

											
										
										
											2024-02-10 15:03:44 -07:00
+								  local utf_idx, _ --- @type integer, integer
-												fix(lsp): rewrite incremental sync (#16252)

* use codeunits/points instead of byte ranges when applicable
* take into account different file formats when computing range and
  sending text (dos, unix, and mac supported)
* add tests of incremental sync
											
										
										
											2021-11-09 15:37:48 -07:00
+								  -- Convert the byte range to utf-{8,16,32} and convert 1-based (lua) indexing to 0-based
 								  if offset_encoding == 'utf-16' then
 								    _, utf_idx = str_utfindex(line, byte)
 								  elseif offset_encoding == 'utf-32' then
 								    utf_idx, _ = str_utfindex(line, byte)
 								  else
 								    utf_idx = byte
 								  end
 								  -- convert to 1 based indexing
 								  return utf_idx + 1
 								end
-												feat(lsp): more annotations

											
										
										
											2023-12-13 05:00:11 -07:00
+								---@param line string
 								---@param offset_encoding string
 								---@return integer
-												fix(lsp): improve incremental sync robustness (#16358)

closes https://github.com/neovim/neovim/issues/16352

* improve handling of multi-byte deletions
											
										
										
											2021-11-18 14:49:46 -07:00
+								local function compute_line_length(line, offset_encoding)
-												refactor(lsp): add type annotations

											
										
										
											2024-02-10 15:03:44 -07:00
+								  local length, _ --- @type integer, integer
-												fix(lsp): improve incremental sync robustness (#16358)

closes https://github.com/neovim/neovim/issues/16352

* improve handling of multi-byte deletions
											
										
										
											2021-11-18 14:49:46 -07:00
+								  if offset_encoding == 'utf-16' then
 								    _, length = str_utfindex(line)
 								  elseif offset_encoding == 'utf-32' then
 								    length, _ = str_utfindex(line)
 								  else
 								    length = #line
 								  end
 								  return length
 								end
-												fix(lsp): rewrite incremental sync (#16252)

* use codeunits/points instead of byte ranges when applicable
* take into account different file formats when computing range and
  sending text (dos, unix, and mac supported)
* add tests of incremental sync
											
										
										
											2021-11-09 15:37:48 -07:00
+								-- Given a line, byte idx, alignment, and offset_encoding convert to the aligned
 								-- utf-8 index and either the utf-16, or utf-32 index.
 								---@param line string the line to index into
 								---@param byte integer the byte idx
 								---@param offset_encoding string utf-8|utf-16|utf-32|nil (default: utf-8)
-												fix(lint): lint warnings #24226


											
										
										
											2023-07-10 04:38:15 -07:00
+								---@return integer byte_idx of first change position
 								---@return integer char_idx of first change position
-												fix(lsp): improve incremental sync robustness (#16358)

closes https://github.com/neovim/neovim/issues/16352

* improve handling of multi-byte deletions
											
										
										
											2021-11-18 14:49:46 -07:00
+								local function align_end_position(line, byte, offset_encoding)
-												feat(lsp): more annotations

											
										
										
											2023-12-13 05:00:11 -07:00
+								  local char --- @type integer
-												fix(lsp): rewrite incremental sync (#16252)

* use codeunits/points instead of byte ranges when applicable
* take into account different file formats when computing range and
  sending text (dos, unix, and mac supported)
* add tests of incremental sync
											
										
										
											2021-11-09 15:37:48 -07:00
+								  -- If on the first byte, or an empty string: the trivial case
 								  if byte == 1 or #line == 0 then
 								    char = byte
 								    -- Called in the case of extending an empty line "" -> "a"
 								  elseif byte == #line + 1 then
-												fix(lsp): improve incremental sync robustness (#16358)

closes https://github.com/neovim/neovim/issues/16352

* improve handling of multi-byte deletions
											
										
										
											2021-11-18 14:49:46 -07:00
+								    char = compute_line_length(line, offset_encoding) + 1
-												fix(lsp): rewrite incremental sync (#16252)

* use codeunits/points instead of byte ranges when applicable
* take into account different file formats when computing range and
  sending text (dos, unix, and mac supported)
* add tests of incremental sync
											
										
										
											2021-11-09 15:37:48 -07:00
+								  else
 								    -- Modifying line, find the nearest utf codepoint
-												fix(lsp): correctly align start and end range to codepoints during incremental sync (#16670)

Closes #16624

Fixes two issues with aligning the start position and end position to
codepoints when calculating the start and end range.

When aligning the start position:
* use aligned byte index to calculate character index rather than 
  the unadjusted byte

When aligning the end position:
* do not adjust the end byte if it falls on a UTF-8 codepoint
* align byte to the first byte of the next codepoint rather than the
  last byte of the current codepoint
* compute character character end range on the aligned byte index

This commit also adds additional test coverage, including multibyte operations
that previously failed before this commit.
											
										
										
											2021-12-17 19:05:00 -07:00
+								    local offset = str_utf_start(line, byte)
-												fix(lsp): improve incremental sync robustness (#16358)

closes https://github.com/neovim/neovim/issues/16352

* improve handling of multi-byte deletions
											
										
										
											2021-11-18 14:49:46 -07:00
+								    -- If the byte does not fall on the start of the character, then
 								    -- align to the start of the next character.
-												fix(lsp): correctly align start and end range to codepoints during incremental sync (#16670)

Closes #16624

Fixes two issues with aligning the start position and end position to
codepoints when calculating the start and end range.

When aligning the start position:
* use aligned byte index to calculate character index rather than 
  the unadjusted byte

When aligning the end position:
* do not adjust the end byte if it falls on a UTF-8 codepoint
* align byte to the first byte of the next codepoint rather than the
  last byte of the current codepoint
* compute character character end range on the aligned byte index

This commit also adds additional test coverage, including multibyte operations
that previously failed before this commit.
											
										
										
											2021-12-17 19:05:00 -07:00
+								    if offset < 0 then
 								      byte = byte + str_utf_end(line, byte) + 1
 								    end
 								    if byte <= #line then
-												fix(lsp): improve incremental sync robustness (#16358)

closes https://github.com/neovim/neovim/issues/16352

* improve handling of multi-byte deletions
											
										
										
											2021-11-18 14:49:46 -07:00
+								      char = byte_to_utf(line, byte, offset_encoding)
-												fix(lsp): correctly align start and end range to codepoints during incremental sync (#16670)

Closes #16624

Fixes two issues with aligning the start position and end position to
codepoints when calculating the start and end range.

When aligning the start position:
* use aligned byte index to calculate character index rather than 
  the unadjusted byte

When aligning the end position:
* do not adjust the end byte if it falls on a UTF-8 codepoint
* align byte to the first byte of the next codepoint rather than the
  last byte of the current codepoint
* compute character character end range on the aligned byte index

This commit also adds additional test coverage, including multibyte operations
that previously failed before this commit.
											
										
										
											2021-12-17 19:05:00 -07:00
+								    else
 								      char = compute_line_length(line, offset_encoding) + 1
-												fix(lsp): rewrite incremental sync (#16252)

* use codeunits/points instead of byte ranges when applicable
* take into account different file formats when computing range and
  sending text (dos, unix, and mac supported)
* add tests of incremental sync
											
										
										
											2021-11-09 15:37:48 -07:00
+								    end
 								    -- Extending line, find the nearest utf codepoint for the last valid character
 								  end
 								  return byte, char
 								end
 								--- Finds the first line, byte, and char index of the difference between the previous and current lines buffer normalized to the previous codepoint.
-												feat(lsp): more annotations

											
										
										
											2023-12-13 05:00:11 -07:00
+								---@param prev_lines string[] list of lines from previous buffer
 								---@param curr_lines string[] list of lines from current buffer
-												fix(lsp): rewrite incremental sync (#16252)

* use codeunits/points instead of byte ranges when applicable
* take into account different file formats when computing range and
  sending text (dos, unix, and mac supported)
* add tests of incremental sync
											
										
										
											2021-11-09 15:37:48 -07:00
+								---@param firstline integer firstline from on_lines, adjusted to 1-index
 								---@param lastline integer lastline from on_lines, adjusted to 1-index
 								---@param new_lastline integer new_lastline from on_lines, adjusted to 1-index
 								---@param offset_encoding string utf-8|utf-16|utf-32|nil (fallback to utf-8)
-												fix(lint): lint warnings #24226


											
										
										
											2023-07-10 04:38:15 -07:00
+								---@return table result table include line_idx, byte_idx, and char_idx of first change position
-												fix(lsp): rewrite incremental sync (#16252)

* use codeunits/points instead of byte ranges when applicable
* take into account different file formats when computing range and
  sending text (dos, unix, and mac supported)
* add tests of incremental sync
											
										
										
											2021-11-09 15:37:48 -07:00
local function compute_start_range(
  prev_lines,
  curr_lines,
  firstline,
  lastline,
  new_lastline,
  offset_encoding
)
  -- firstline == lastline: nothing in the previous buffer is replaced, the edit only
  -- inserts new lines. Anchor the range at the end of the line preceding the insertion
  -- when there is one, otherwise at the very first byte of `firstline`.
  if firstline == lastline then
    local preceding = prev_lines[firstline - 1]
    if not preceding then
      return { line_idx = firstline, byte_idx = 1, char_idx = 1 }
    end
    return {
      line_idx = firstline - 1,
      byte_idx = #preceding + 1,
      char_idx = compute_line_length(preceding, offset_encoding) + 1,
    }
  end

  -- firstline == new_lastline: the first change occurred on a line that was deleted, so
  -- the change begins at the first byte of `firstline`.
  if firstline == new_lastline then
    return { line_idx = firstline, byte_idx = 1, char_idx = 1 }
  end

  local old_line = prev_lines[firstline]
  local new_line = curr_lines[firstline]
  local old_len = #old_line

  -- Walk both versions of the first changed line from the left until the bytes differ.
  -- The scan deliberately reaches one byte past the end of the old line, so an edit that
  -- only appends text yields `old_len + 1`.
  -- Note: *about -> a*about reports the second `a` as the first difference, because the
  -- column of the edit is not known here.
  local diff_byte = 1
  for idx = 1, old_len + 1 do
    diff_byte = idx
    if str_byte(old_line, idx) ~= str_byte(new_line, idx) then
      break
    end
  end

  local byte_idx --- @type integer?
  local char_idx --- @type integer?
  if diff_byte == 1 then
    -- Difference on the first byte (this also covers an empty previous line).
    byte_idx, char_idx = 1, 1
  elseif diff_byte == old_len + 1 then
    -- Difference right after the last byte of the previous line.
    byte_idx = diff_byte
    char_idx = compute_line_length(old_line, offset_encoding) + 1
  else
    -- Difference inside the line: shift the byte index by the offset reported by
    -- str_utf_start (presumably back to the first byte of the enclosing codepoint;
    -- confirm against the vim.str_utf_start docs) and convert the aligned byte.
    byte_idx = diff_byte + str_utf_start(old_line, diff_byte)
    char_idx = byte_to_utf(old_line, byte_idx, offset_encoding)
  end

  -- The start position is shared by the previous and the current buffer.
  return { line_idx = firstline, byte_idx = byte_idx, char_idx = char_idx }
end
 								--- Finds the last differing position between the previous and the current buffer.
 								--- The first result locates the end of the change in `prev_lines` (the end of the range
 								--- sent to the server); the second locates it in `curr_lines` (the end of the text that
 								--- is collected and sent). Byte positions are passed through `align_end_position`.
 								---
 								---@param prev_lines string[] list of lines before the change
 								---@param curr_lines string[] list of lines after the change
 								---@param start_range table start of the change (`line_idx`, `byte_idx`, `char_idx`)
 								---@param firstline integer
 								---@param lastline integer line just past the replaced region in `prev_lines`
 								---@param new_lastline integer line just past the replaced region in `curr_lines`
 								---@param offset_encoding string
 								---@return table prev_end_range `{ line_idx, byte_idx, char_idx }` within `prev_lines`
 								---@return table curr_end_range `{ line_idx, byte_idx, char_idx }` within `curr_lines`
 								local function compute_end_range(
 								  prev_lines,
 								  curr_lines,
 								  start_range,
 								  firstline,
 								  lastline,
 								  new_lastline,
 								  offset_encoding
 								)
 								  -- firstline == new_lastline: the first change occurred on a line that was deleted.
 								  -- Both ends sit on the first byte of a line; the previous end is pushed down by the
 								  -- number of removed lines.
 								  if firstline == new_lastline then
 								    local prev_end = { line_idx = lastline - new_lastline + firstline, byte_idx = 1, char_idx = 1 }
 								    local curr_end = { line_idx = firstline, byte_idx = 1, char_idx = 1 }
 								    return prev_end, curr_end
 								  end
 								  -- firstline == lastline: mirror image of the case above; the current end is pushed
 								  -- down by the number of added lines.
 								  if firstline == lastline then
 								    local prev_end = { line_idx = firstline, byte_idx = 1, char_idx = 1 }
 								    local curr_end = { line_idx = new_lastline - lastline + firstline, byte_idx = 1, char_idx = 1 }
 								    return prev_end, curr_end
 								  end
 								  local start_row = start_range.line_idx
 								  -- lastline / new_lastline are the first lines that were *not* replaced, so the last
 								  -- changed lines are the ones just before them.
 								  local old_row, new_row = lastline - 1, new_lastline - 1
 								  local old_line, new_line = prev_lines[old_row], curr_lines[new_row]
 								  local old_len, new_len = #old_line, #new_line
 								  -- Number of trailing bytes shared by both lines; 0 means the very last byte differs.
 								  local tail = 0
 								  -- Only scan for a common suffix when the edit stays on the same line number.
 								  if old_row == new_row then
 								    local limit = min(old_len, new_len) + 1
 								    if start_row == old_row then
 								      -- Do not search back past the start of the difference.
 								      limit = limit - start_range.byte_idx
 								    end
 								    for idx = 0, limit do
 								      tail = idx
 								      if str_byte(old_line, old_len - idx) ~= str_byte(new_line, new_len - idx) then
 								        break
 								      end
 								    end
 								  end
 								  -- End byte in the previous line; a result of 0 (lines match) is clamped to 1.
 								  local old_end_byte = old_len - tail + 1
 								  if old_end_byte == 0 then
 								    old_end_byte = 1
 								  end
 								  local old_byte_idx, old_char_idx = align_end_position(old_line, old_end_byte, offset_encoding)
 								  local prev_end_range = { line_idx = old_row, byte_idx = old_byte_idx, char_idx = old_char_idx }
 								  -- Deletion event: the end in the current buffer cannot lie before the start.
 								  if new_row < start_row then
 								    return prev_end_range, { line_idx = start_row, byte_idx = 1, char_idx = 1 }
 								  end
 								  -- End byte in the current line, clamped the same way.
 								  local new_end_byte = new_len - tail + 1
 								  if new_end_byte == 0 then
 								    new_end_byte = 1
 								  end
 								  local new_byte_idx, new_char_idx = align_end_position(new_line, new_end_byte, offset_encoding)
 								  return prev_end_range, { line_idx = new_row, byte_idx = new_byte_idx, char_idx = new_char_idx }
 								end
 								--- Collects the text lying between two positions of a list of lines.
 								---@param lines string[] list of lines
 								---@param start_range table position (`line_idx`, `byte_idx`) of the first byte to take
 								---@param end_range table position (`line_idx`, `byte_idx`) just past the last byte to take
 								---@param line_ending string separator inserted between the collected lines
 								---@return string text extracted from defined region
 								local function extract_text(lines, start_range, end_range, line_ending)
 								  local first_idx, last_idx = start_range.line_idx, end_range.line_idx
 								  local first_line = lines[first_idx]
 								  -- A start beyond the known lines yields no text at all.
 								  if not first_line then
 								    return ''
 								  end
 								  -- Both positions on one line: a single slice is enough.
 								  -- string.sub is inclusive while end_range is exclusive, hence the `- 1`.
 								  if first_idx == last_idx then
 								    return string.sub(first_line, start_range.byte_idx, end_range.byte_idx - 1)
 								  end
 								  -- Tail of the first line, then every full line in between.
 								  local pieces = { string.sub(first_line, start_range.byte_idx) }
 								  for idx = first_idx + 1, last_idx - 1 do
 								    pieces[#pieces + 1] = lines[idx]
 								  end
 								  -- Head of the last line; an empty piece when that line does not exist, so the
 								  -- result still ends with a line ending.
 								  local last_line = lines[last_idx]
 								  pieces[#pieces + 1] = last_line and string.sub(last_line, 1, end_range.byte_idx - 1) or ''
 								  return table.concat(pieces, line_ending)
 								end
 								--- Computes the length of the range between two positions of a list of lines.
 								--- The unit depends on the offset encoding (line lengths come from compute_line_length):
 								--- bytes for utf-8 (clangd with extension), code units for utf-16, codepoints for utf-32.
 								--- Every line break adds the length of `line_ending`: 2 for \r\n (dos), 1 for \n (unix)
 								--- and 1 for \r (mac).
 								---@param lines string[]
 								---@param start_range table start position (`line_idx`, `char_idx`)
 								---@param end_range table end position (`line_idx`, `char_idx`)
 								---@param offset_encoding string
 								---@param line_ending string
 								---@return integer
 								local function compute_range_length(lines, start_range, end_range, offset_encoding, line_ending)
 								  local eol_length = #line_ending
 								  local first_idx, last_idx = start_range.line_idx, end_range.line_idx
 								  -- Range within one line: the difference of the two character indices.
 								  if first_idx == last_idx then
 								    return end_range.char_idx - start_range.char_idx
 								  end
 								  -- First line: its line ending, plus the part from the start position to the end of
 								  -- the line when the line exists and is not empty.
 								  local total = eol_length
 								  local first_line = lines[first_idx]
 								  if first_line and #first_line > 0 then
 								    total = total + compute_line_length(first_line, offset_encoding) - start_range.char_idx + 1
 								  end
 								  -- Lines strictly between the two positions count in full, line ending included.
 								  for idx = first_idx + 1, last_idx - 1 do
 								    local line = lines[idx]
 								    total = total + eol_length
 								    if #line > 0 then
 								      total = total + compute_line_length(line, offset_encoding)
 								    end
 								  end
 								  -- Last line: only the characters in front of the end position.
 								  local last_line = lines[last_idx]
 								  if last_line and #last_line > 0 then
 								    total = total + end_range.char_idx - 1
 								  end
 								  return total
 								end
 								--- Builds the content-change event describing the difference between prev and curr lines.
 								---@param prev_lines table list of lines before the change
 								---@param curr_lines table list of lines after the change
 								---@param firstline integer line to begin search for first difference
 								---@param lastline integer line to begin search in prev_lines for last difference
 								---@param new_lastline integer line to begin search in curr_lines for last difference
 								---@param offset_encoding string encoding requested by language server
 								---@param line_ending string
 								---@return lsp.TextDocumentContentChangeEvent : see https://microsoft.github.io/language-server-protocol/specification/#textDocumentContentChangeEvent
 								function M.compute_diff(
 								  prev_lines,
 								  curr_lines,
 								  firstline,
 								  lastline,
 								  new_lastline,
 								  offset_encoding,
 								  line_ending
 								)
 								  -- The helpers index the line lists directly, so every line number is shifted by one
 								  -- before it is handed over (presumably the inputs are zero-based; confirm with caller).
 								  local first, last, new_last = firstline + 1, lastline + 1, new_lastline + 1
 								  -- Start of the change, common to both buffers: sent to the server as the start of
 								  -- the range and used to grab the changed text from the current buffer.
 								  local start_range =
 								    compute_start_range(prev_lines, curr_lines, first, last, new_last, offset_encoding)
 								  -- End of the change: prev_end_range is sent to the server as the end of the range,
 								  -- curr_end_range marks the end of the text to grab from the current buffer.
 								  local prev_end_range, curr_end_range =
 								    compute_end_range(prev_lines, curr_lines, start_range, first, last, new_last, offset_encoding)
 								  -- Replacement text, taken from the current buffer ('' when the range is deleted).
 								  local text = extract_text(curr_lines, start_range, curr_end_range, line_ending)
 								  -- Length of the replaced range. Deprecated but still required for certain servers.
 								  local range_length =
 								    compute_range_length(prev_lines, start_range, prev_end_range, offset_encoding, line_ending)
 								  -- Positions are converted back to 0 based indexing for the protocol.
 								  return {
 								    range = {
 								      start = { line = start_range.line_idx - 1, character = start_range.char_idx - 1 },
 								      ['end'] = { line = prev_end_range.line_idx - 1, character = prev_end_range.char_idx - 1 },
 								    },
 								    text = text,
 								    rangeLength = range_length,
 								  }
 								end
 								return M