vim-patch:9.1.0011: regexp cannot match combining chars in collection (#26992)

Problem:  regexp cannot match combining chars in collection
Solution: Check for combining characters in regex collections for the
          NFA and BT Regex Engine

Also, while at it, make debug mode work again.

fixes vim/vim#10286
closes: vim/vim#12871

d2cc51f9a1

Co-authored-by: Christian Brabandt <cb@256bit.org>
This commit is contained in:
zeertzjq 2024-01-12 14:09:10 +08:00 committed by GitHub
parent 1813661a61
commit 3bcf8e5622
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 141 additions and 6 deletions

View File

@ -140,7 +140,9 @@ struct regengine {
int (*regexec_nl)(regmatch_T *, uint8_t *, colnr_T, bool); int (*regexec_nl)(regmatch_T *, uint8_t *, colnr_T, bool);
/// bt_regexec_mult or nfa_regexec_mult /// bt_regexec_mult or nfa_regexec_mult
int (*regexec_multi)(regmmatch_T *, win_T *, buf_T *, linenr_T, colnr_T, proftime_T *, int *); int (*regexec_multi)(regmmatch_T *, win_T *, buf_T *, linenr_T, colnr_T, proftime_T *, int *);
// uint8_t *expr; #ifdef REGEXP_DEBUG
uint8_t *expr;
#endif
}; };
// Structure used to save the current input state, when it needs to be // Structure used to save the current input state, when it needs to be
@ -6426,15 +6428,33 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out)
break; break;
case ANYOF: case ANYOF:
case ANYBUT: case ANYBUT: {
uint8_t *q = OPERAND(scan);
if (c == NUL) { if (c == NUL) {
status = RA_NOMATCH; status = RA_NOMATCH;
} else if ((cstrchr((char *)OPERAND(scan), c) == NULL) == (op == ANYOF)) { } else if ((cstrchr((char *)q, c) == NULL) == (op == ANYOF)) {
status = RA_NOMATCH; status = RA_NOMATCH;
} else { } else { // Check following combining characters
ADVANCE_REGINPUT(); int len = utfc_ptr2len((char *)q) - utf_ptr2len((char *)q);
rex.input += utf_ptr2len((char *)rex.input);
q += utf_ptr2len((char *)q);
if (len == 0) {
break;
}
for (int i = 0; i < len; i++) {
if (q[i] != rex.input[i]) {
status = RA_NOMATCH;
break;
}
}
rex.input += len;
} }
break; break;
}
case MULTIBYTECODE: { case MULTIBYTECODE: {
int i, len; int i, len;
@ -10448,13 +10468,39 @@ collection:
} else { } else {
if (got_coll_char == true && startc == 0) { if (got_coll_char == true && startc == 0) {
EMIT(0x0a); EMIT(0x0a);
EMIT(NFA_CONCAT);
} else { } else {
EMIT(startc); EMIT(startc);
} if (utf_ptr2len(regparse) == utfc_ptr2len(regparse)) {
EMIT(NFA_CONCAT); EMIT(NFA_CONCAT);
} }
} }
}
}
int plen;
if (utf_ptr2len(regparse) != (plen = utfc_ptr2len(regparse))) {
int i = utf_ptr2len(regparse);
c = utf_ptr2char(regparse + i);
// Add composing characters
while (true) {
if (c == 0) {
// \x00 is translated to \x0a, start at \x01.
EMIT(1);
} else {
EMIT(c);
}
EMIT(NFA_CONCAT);
if ((i += utf_char2len(c)) >= plen) {
break;
}
c = utf_ptr2char(regparse + i);
}
EMIT(NFA_COMPOSING);
EMIT(NFA_CONCAT);
}
MB_PTR_ADV(regparse); MB_PTR_ADV(regparse);
} // while (p < endp) } // while (p < endp)
@ -14503,6 +14549,78 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm
state = t->state->out; state = t->state->out;
result_if_matched = (t->state->c == NFA_START_COLL); result_if_matched = (t->state->c == NFA_START_COLL);
while (true) { while (true) {
if (state->c == NFA_COMPOSING) {
int mc = curc;
int len = 0;
nfa_state_T *end;
nfa_state_T *sta;
int cchars[MAX_MCO];
int ccount = 0;
int j;
sta = t->state->out->out;
if (utf_iscomposing(sta->c)) {
// Only match composing character(s), ignore base
// character. Used for ".{composing}" and "{composing}"
// (no preceding character).
len += utf_char2len(mc);
}
if (rex.reg_icombine && len == 0) {
// If \Z was present, then ignore composing characters.
// When ignoring the base character this always matches.
if (sta->c != curc) {
result = FAIL;
} else {
result = OK;
}
while (sta->c != NFA_END_COMPOSING) {
sta = sta->out;
}
}
// Check base character matches first, unless ignored.
else if (len > 0 || mc == sta->c) {
if (len == 0) {
len += utf_char2len(mc);
sta = sta->out;
}
// We don't care about the order of composing characters.
// Get them into cchars[] first.
while (len < clen) {
mc = utf_ptr2char((char *)rex.input + len);
cchars[ccount++] = mc;
len += utf_char2len(mc);
if (ccount == MAX_MCO) {
break;
}
}
// Check that each composing char in the pattern matches a
// composing char in the text. We do not check if all
// composing chars are matched.
result = OK;
while (sta->c != NFA_END_COMPOSING) {
for (j = 0; j < ccount; j++) {
if (cchars[j] == sta->c) {
break;
}
}
if (j == ccount) {
result = FAIL;
break;
}
sta = sta->out;
}
} else {
result = FAIL;
}
if (t->state->out->out1->c == NFA_END_COMPOSING) {
end = t->state->out->out1;
ADD_STATE_IF_MATCH(end);
}
break;
}
if (state->c == NFA_END_COLL) { if (state->c == NFA_END_COLL) {
result = !result_if_matched; result = !result_if_matched;
break; break;
@ -15645,6 +15763,9 @@ static regengine_T bt_regengine = {
bt_regfree, bt_regfree,
bt_regexec_nl, bt_regexec_nl,
bt_regexec_multi, bt_regexec_multi,
#ifdef REGEXP_DEBUG
"",
#endif
}; };
static regengine_T nfa_regengine = { static regengine_T nfa_regengine = {
@ -15652,6 +15773,9 @@ static regengine_T nfa_regengine = {
nfa_regfree, nfa_regfree,
nfa_regexec_nl, nfa_regexec_nl,
nfa_regexec_multi, nfa_regexec_multi,
#ifdef REGEXP_DEBUG
"",
#endif
}; };
// Which regexp engine to use? Needed for vim_regcomp(). // Which regexp engine to use? Needed for vim_regcomp().

View File

@ -599,5 +599,16 @@ func Test_match_too_complicated()
set regexpengine=0 set regexpengine=0
endfunc endfunc
func Test_combining_chars_in_collection()
new
for i in range(0,2)
exe "set re=".i
put =['ɔ̃', 'ɔ', '̃ ã', 'abcd']
:%s/[ɔ̃]//
call assert_equal(['', '', 'ɔ', '̃ ã', 'abcd'], getline(1,'$'))
%d
endfor
bw!
endfunc
" vim: shiftwidth=2 sts=2 expandtab " vim: shiftwidth=2 sts=2 expandtab