fix(ignore): ensure normalization of patterns and paths match (fixes #9597) (#9717)

In ignores, normalize the input when parsing it. When scanning, normalize earlier such that the path is already normalized when checking ignores. This requires splitting normalization of the string from normalization of the file, as we don't want to attempt the latter if the file is ignored. Closes #9598 --------- Co-authored-by: Jakob Borg <jakob@kastelo.net>
2024-11-15 09:58:57 -07:00 · 2024-09-28 17:16:44 +02:00 · 2024-09-28 17:16:44 +02:00 · 605fd6d726
commit 605fd6d726
parent 3c476542d2
3 changed files with 59 additions and 35 deletions
--- a/lib/ignore/ignore.go
+++ b/lib/ignore/ignore.go
@ -18,7 +18,9 @@ import (
 	"time"

 	"github.com/gobwas/glob"
+	"golang.org/x/text/unicode/norm"

+	"github.com/syncthing/syncthing/lib/build"
 	"github.com/syncthing/syncthing/lib/fs"
 	"github.com/syncthing/syncthing/lib/ignore/ignoreresult"
 	"github.com/syncthing/syncthing/lib/osutil"
@ -212,6 +214,11 @@ func (m *Matcher) parseLocked(r io.Reader, file string) error {
 }

 // Match matches the patterns plus temporary and internal files.
+//
+// The "file" parameter must be in the OS' native unicode format (NFD on macos,
+// NFC everywhere else). This is always the case in real usage in syncthing, as
+// we ensure native unicode normalisation on all entry points (scanning and from
+// protocol) - so no need to normalize when calling this, except e.g. in tests.
 func (m *Matcher) Match(file string) (result ignoreresult.R) {
 	switch {
 	case fs.IsTemporary(file):
@ -387,6 +394,10 @@ func loadParseIncludeFile(filesystem fs.Filesystem, file string, cd ChangeDetect
 }

 func parseLine(line string) ([]Pattern, error) {
+	// We use native normalization internally, thus the patterns must match
+	// that to avoid false negative matches.
+	line = nativeUnicodeNorm(line)
+
 	pattern := Pattern{
 		result: ignoreresult.Ignored,
 	}
@ -464,6 +475,13 @@ func parseLine(line string) ([]Pattern, error) {
 	return patterns, nil
 }

+func nativeUnicodeNorm(s string) string {
+	if build.IsDarwin || build.IsIOS {
+		return norm.NFD.String(s)
+	}
+	return norm.NFC.String(s)
+}
+
 func parseIgnoreFile(fs fs.Filesystem, fd io.Reader, currentFile string, cd ChangeDetector, linesSeen map[string]struct{}) ([]string, []Pattern, error) {
 	var patterns []Pattern

--- a/lib/ignore/ignore_test.go
+++ b/lib/ignore/ignore_test.go
@ -805,7 +805,10 @@ func TestIssue3174(t *testing.T) {
 		t.Fatal(err)
 	}

-	if !pats.Match("åäö").IsIgnored() {
+	// The pattern above is normalized when parsing, and in order for this
+	// string to match the pattern, it needs to use the same normalization. And
+	// Go always uses NFC regardless of OS, while we use NFD on macos.
+	if !pats.Match(nativeUnicodeNorm("åäö")).IsIgnored() {
 		t.Error("Should match")
 	}
 }
--- a/lib/scanner/walk.go
+++ b/lib/scanner/walk.go
@ -293,6 +293,11 @@ func (w *walker) walkAndHashFiles(ctx context.Context, toHashChan chan<- protoco
 			return skip
 		}

+		// Just in case the filesystem doesn't produce the normalization the OS
+		// uses, and we use internally.
+		nonNormPath := path
+		path = normalizePath(path)
+
 		if m := w.Matcher.Match(path); m.IsIgnored() {
 			l.Debugln(w, "ignored (patterns):", path)
 			// Only descend if matcher says so and the current file is not a symlink.
@ -319,9 +324,23 @@ func (w *walker) walkAndHashFiles(ctx context.Context, toHashChan chan<- protoco
 			return nil
 		}

+		if path != nonNormPath {
+			if !w.AutoNormalize {
+				// We're not authorized to do anything about it, so complain and skip.
+				handleError(ctx, "normalizing path", nonNormPath, errUTF8Normalization, finishedChan)
+				return skip
+			}
+
+			path, err = w.applyNormalization(nonNormPath, path, info)
+			if err != nil {
+				handleError(ctx, "normalizing path", nonNormPath, err, finishedChan)
+				return skip
+			}
+		}
+
 		if ignoredParent == "" {
 			// parent isn't ignored, nothing special
-			return w.handleItem(ctx, path, info, toHashChan, finishedChan, skip)
+			return w.handleItem(ctx, path, info, toHashChan, finishedChan)
 		}

 		// Part of current path below the ignored (potential) parent
@ -330,7 +349,7 @@ func (w *walker) walkAndHashFiles(ctx context.Context, toHashChan chan<- protoco
 		// ignored path isn't actually a parent of the current path
 		if rel == path {
 			ignoredParent = ""
-			return w.handleItem(ctx, path, info, toHashChan, finishedChan, skip)
+			return w.handleItem(ctx, path, info, toHashChan, finishedChan)
 		}

 		// The previously ignored parent directories of the current, not
@ -345,7 +364,7 @@ func (w *walker) walkAndHashFiles(ctx context.Context, toHashChan chan<- protoco
 				handleError(ctx, "scan", ignoredParent, err, finishedChan)
 				return skip
 			}
-			if err = w.handleItem(ctx, ignoredParent, info, toHashChan, finishedChan, skip); err != nil {
+			if err = w.handleItem(ctx, ignoredParent, info, toHashChan, finishedChan); err != nil {
 				return err
 			}
 		}
@ -355,14 +374,7 @@ func (w *walker) walkAndHashFiles(ctx context.Context, toHashChan chan<- protoco
 	}
 }

-func (w *walker) handleItem(ctx context.Context, path string, info fs.FileInfo, toHashChan chan<- protocol.FileInfo, finishedChan chan<- ScanResult, skip error) error {
-	oldPath := path
-	path, err := w.normalizePath(path, info)
-	if err != nil {
-		handleError(ctx, "normalizing path", oldPath, err, finishedChan)
-		return skip
-	}
-
+func (w *walker) handleItem(ctx context.Context, path string, info fs.FileInfo, toHashChan chan<- protocol.FileInfo, finishedChan chan<- ScanResult) error {
 	switch {
 	case info.IsSymlink():
 		if err := w.walkSymlink(ctx, path, info, finishedChan); err != nil {
@ -375,13 +387,13 @@ func (w *walker) handleItem(ctx context.Context, path string, info fs.FileInfo,
 		return nil

 	case info.IsDir():
-		err = w.walkDir(ctx, path, info, finishedChan)
+		return w.walkDir(ctx, path, info, finishedChan)

 	case info.IsRegular():
-		err = w.walkRegular(ctx, path, info, toHashChan)
+		return w.walkRegular(ctx, path, info, toHashChan)
 	}

-	return err
+	return fmt.Errorf("bug: file info for %v is neither symlink, dir nor regular", path)
 }

 func (w *walker) walkRegular(ctx context.Context, relPath string, info fs.FileInfo, toHashChan chan<- protocol.FileInfo) error {
@ -550,30 +562,21 @@ func (w *walker) walkSymlink(ctx context.Context, relPath string, info fs.FileIn
 	return nil
 }

-// normalizePath returns the normalized relative path (possibly after fixing
-// it on disk), or skip is true.
-func (w *walker) normalizePath(path string, info fs.FileInfo) (normPath string, err error) {
+func normalizePath(path string) string {
 	if build.IsDarwin || build.IsIOS {
 		// Mac OS X file names should always be NFD normalized.
-		normPath = norm.NFD.String(path)
-	} else {
+		return norm.NFD.String(path)
+	}
 	// Every other OS in the known universe uses NFC or just plain
 	// doesn't bother to define an encoding. In our case *we* do care,
 	// so we enforce NFC regardless.
-		normPath = norm.NFC.String(path)
-	}
-
-	if path == normPath {
-		// The file name is already normalized: nothing to do
-		return path, nil
-	}
-
-	if !w.AutoNormalize {
-		// We're not authorized to do anything about it, so complain and skip.
-
-		return "", errUTF8Normalization
+	return norm.NFC.String(path)
 }

+// applyNormalization fixes the normalization of the file on disk, i.e. ensures
+// the file at path ends up named normPath. It shouldn't but may happen that the
+// file ends up with a different name, in which case that one should be scanned.
+func (w *walker) applyNormalization(path, normPath string, info fs.FileInfo) (string, error) {
 	// We will attempt to normalize it.
 	normInfo, err := w.Filesystem.Lstat(normPath)
 	if fs.IsNotExist(err) {