Pull request 1924: 6003-relax-rule-validation

Updates #6003.

Squashed commit of the following:

commit 1874860877662999d158631e3a25f8072c24f155
Author: Ainar Garipov <A.Garipov@AdGuard.COM>
Date:   Thu Jul 13 19:36:26 2023 +0300

    filtering/rulelist: imp test

commit 871a41af8039bf4d4fb139622d4296bcaff6729c
Author: Ainar Garipov <A.Garipov@AdGuard.COM>
Date:   Thu Jul 13 19:10:35 2023 +0300

    filtering/rulelist: relax validation
This commit is contained in:
Ainar Garipov 2023-07-13 19:43:53 +03:00
parent f22d893845
commit 2adc8624c0
4 changed files with 46 additions and 29 deletions

View File

@ -25,8 +25,9 @@ NOTE: Add new changes BELOW THIS COMMENT.
### Fixed ### Fixed
- `bufio.Scanner: token too long` errors when trying to add filtering-rule lists - `bufio.Scanner: token too long` and other errors when trying to add
with lines over 1024 bytes long ([#6003]). filtering-rule lists with lines over 1024 bytes long or containing cosmetic
rules ([#6003]).
### Removed ### Removed

View File

@ -6,10 +6,9 @@ import (
"fmt" "fmt"
"hash/crc32" "hash/crc32"
"io" "io"
"unicode"
"unicode/utf8"
"github.com/AdguardTeam/golibs/errors" "github.com/AdguardTeam/golibs/errors"
"golang.org/x/exp/slices"
) )
// Parser is a filtering-rule parser that collects data, such as the checksum // Parser is a filtering-rule parser that collects data, such as the checksum
@ -105,13 +104,11 @@ func (p *Parser) processLine(dst io.Writer, line []byte, lineNum int) (n int, er
badIdx, isRule = p.parseLineTitle(trimmed) badIdx, isRule = p.parseLineTitle(trimmed)
} }
if badIdx != -1 { if badIdx != -1 {
badRune, _ := utf8.DecodeRune(trimmed[badIdx:])
return 0, fmt.Errorf( return 0, fmt.Errorf(
"line %d: character %d: non-printable character %q", "line %d: character %d: likely binary character %q",
lineNum, lineNum,
badIdx+bytes.Index(line, trimmed)+1, badIdx+bytes.Index(line, trimmed)+1,
badRune, trimmed[badIdx],
) )
} }
@ -144,41 +141,37 @@ func hasPrefixFold(b, prefix []byte) (ok bool) {
} }
// parseLine returns true if the parsed line is a filtering rule. line is // parseLine returns true if the parsed line is a filtering rule. line is
// assumed to be trimmed of whitespace characters. nonPrintIdx is the index of // assumed to be trimmed of whitespace characters. badIdx is the index of the
// the first non-printable character, if any; if there are none, nonPrintIdx is // first character that may indicate that this is a binary file, or -1 if none.
// -1.
// //
// A line is considered a rule if it's not empty, not a comment, and contains // A line is considered a rule if it's not empty, not a comment, and contains
// only printable characters. // only printable characters.
func parseLine(line []byte) (nonPrintIdx int, isRule bool) { func parseLine(line []byte) (badIdx int, isRule bool) {
if len(line) == 0 || line[0] == '#' || line[0] == '!' { if len(line) == 0 || line[0] == '#' || line[0] == '!' {
return -1, false return -1, false
} }
nonPrintIdx = bytes.IndexFunc(line, isNotPrintable) badIdx = slices.IndexFunc(line, likelyBinary)
return nonPrintIdx, nonPrintIdx == -1 return badIdx, badIdx == -1
} }
// isNotPrintable returns true if r is not a printable character that can be // likelyBinary returns true if b is likely to be a byte from a binary file.
// contained in a filtering rule. func likelyBinary(b byte) (ok bool) {
func isNotPrintable(r rune) (ok bool) { return (b < ' ' || b == 0x7f) && b != '\n' && b != '\r' && b != '\t'
// Tab isn't included into Unicode's graphic symbols, so include it here
// explicitly.
return r != '\t' && !unicode.IsGraphic(r)
} }
// parseLineTitle is like [parseLine] but additionally looks for a title. line // parseLineTitle is like [parseLine] but additionally looks for a title. line
// is assumed to be trimmed of whitespace characters. // is assumed to be trimmed of whitespace characters.
func (p *Parser) parseLineTitle(line []byte) (nonPrintIdx int, isRule bool) { func (p *Parser) parseLineTitle(line []byte) (badIdx int, isRule bool) {
if len(line) == 0 || line[0] == '#' { if len(line) == 0 || line[0] == '#' {
return -1, false return -1, false
} }
if line[0] != '!' { if line[0] != '!' {
nonPrintIdx = bytes.IndexFunc(line, isNotPrintable) badIdx = slices.IndexFunc(line, likelyBinary)
return nonPrintIdx, nonPrintIdx == -1 return badIdx, badIdx == -1
} }
const titlePattern = "! Title: " const titlePattern = "! Title: "

View File

@ -77,6 +77,14 @@ func TestParser_Parse(t *testing.T) {
wantTitle: "Test Title", wantTitle: "Test Title",
wantRulesNum: 1, wantRulesNum: 1,
wantWritten: len(testRuleTextBlocked), wantWritten: len(testRuleTextBlocked),
}, {
name: "cosmetic_with_zwnj",
in: testRuleTextCosmetic,
wantDst: testRuleTextCosmetic,
wantErrMsg: "",
wantTitle: "",
wantRulesNum: 1,
wantWritten: len(testRuleTextCosmetic),
}, { }, {
name: "bad_char", name: "bad_char",
in: "! Title: Test Title \n" + in: "! Title: Test Title \n" +
@ -85,7 +93,7 @@ func TestParser_Parse(t *testing.T) {
wantDst: testRuleTextBlocked, wantDst: testRuleTextBlocked,
wantErrMsg: "line 3: " + wantErrMsg: "line 3: " +
"character 4: " + "character 4: " +
"non-printable character '\\x7f'", "likely binary character '\\x7f'",
wantTitle: "Test Title", wantTitle: "Test Title",
wantRulesNum: 1, wantRulesNum: 1,
wantWritten: len(testRuleTextBlocked), wantWritten: len(testRuleTextBlocked),
@ -215,6 +223,14 @@ func BenchmarkParser_Parse(b *testing.B) {
require.NoError(b, errSink) require.NoError(b, errSink)
require.NotNil(b, resSink) require.NotNil(b, resSink)
// Most recent result, on a ThinkPad X13 with a Ryzen Pro 7 CPU:
//
// goos: linux
// goarch: amd64
// pkg: github.com/AdguardTeam/AdGuardHome/internal/filtering/rulelist
// cpu: AMD Ryzen 7 PRO 4750U with Radeon Graphics
// BenchmarkParser_Parse-16 100000000 128.0 ns/op 48 B/op 1 allocs/op
} }
func FuzzParser_Parse(f *testing.F) { func FuzzParser_Parse(f *testing.F) {
@ -226,15 +242,17 @@ func FuzzParser_Parse(f *testing.F) {
"! Comment", "! Comment",
"! Title ", "! Title ",
"! Title XXX", "! Title XXX",
testRuleTextBadTab,
testRuleTextBlocked,
testRuleTextCosmetic,
testRuleTextEtcHostsTab, testRuleTextEtcHostsTab,
testRuleTextHTML, testRuleTextHTML,
testRuleTextBlocked,
testRuleTextBadTab,
"1.2.3.4", "1.2.3.4",
"1.2.3.4 etc-hosts.example", "1.2.3.4 etc-hosts.example",
">>>\x00<<<", ">>>\x00<<<",
">>>\x7F<<<", ">>>\x7F<<<",
strings.Repeat("a", n+1), strings.Repeat("a", rulelist.DefaultRuleBufSize+1),
strings.Repeat("a", bufio.MaxScanTokenSize+1),
} }
for _, tc := range testCases { for _, tc := range testCases {

View File

@ -7,8 +7,13 @@ const testTimeout = 1 * time.Second
// Common texts for tests. // Common texts for tests.
const ( const (
testRuleTextHTML = "<!DOCTYPE html>\n"
testRuleTextBlocked = "||blocked.example^\n"
testRuleTextBadTab = "||bad-tab-and-comment.example^\t# A comment.\n" testRuleTextBadTab = "||bad-tab-and-comment.example^\t# A comment.\n"
testRuleTextBlocked = "||blocked.example^\n"
testRuleTextEtcHostsTab = "0.0.0.0 tab..example^\t# A comment.\n" testRuleTextEtcHostsTab = "0.0.0.0 tab..example^\t# A comment.\n"
testRuleTextHTML = "<!DOCTYPE html>\n"
// testRuleTextCosmetic is a cosmetic rule with a zero-width non-joiner.
//
// See https://github.com/AdguardTeam/AdGuardHome/issues/6003.
testRuleTextCosmetic = "||cosmetic.example## :has-text(/\u200c/i)\n"
) )