From b8f599d69facb271c344fc50e4287cf785190f90 Mon Sep 17 00:00:00 2001 From: PhatPhuckDave Date: Fri, 23 May 2025 13:14:54 +0200 Subject: [PATCH] Filter non-image files and deduplicate by keeping the largest file in each group --- main.go | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 3 deletions(-) diff --git a/main.go b/main.go index f73b1dd..70e69a4 100644 --- a/main.go +++ b/main.go @@ -45,6 +45,12 @@ func main() { go func(file string) { defer wg.Done() log := logger.Default.WithPrefix(file) + ext := filepath.Ext(file) + if ext != ".jpg" && ext != ".jpeg" && ext != ".png" { + log.Debug("Skipping non-image file: %s", file) + return + } + imgfile, err := os.Open(file) if err != nil { log.Error("Failed to open file: %v", err) @@ -68,9 +74,17 @@ func main() { groupedImages := make(map[string][]string) wg.Wait() + + processed := make(map[string]bool) hashes.Range(func(key, value interface{}) bool { filea := key.(string) hasha := value.(*goimagehash.ExtImageHash) + + if processed[filea] { + return true + } + + var group []string hashes.Range(func(key, value interface{}) bool { fileb := key.(string) hashb := value.(*goimagehash.ExtImageHash) @@ -84,15 +98,48 @@ func main() { } logger.Debug("Distance between %v and %v: %v", filea, fileb, distance) if distance <= *thresh { - groupedImages[filea] = append(groupedImages[filea], fileb) + group = append(group, fileb) + processed[fileb] = true } return true }) + + if len(group) > 0 { + groupedImages[filea] = group + processed[filea] = true + } return true }) - for file, files := range groupedImages { - logger.Info("Grouped %v with %v", file, files) + // Deduplicate by keeping the largest file in each group + for file, group := range groupedImages { + // Add the main file to the group for size comparison + allFiles := append([]string{file}, group...) + + // Find the largest file + var largestFile string + var largestSize int64 + for _, f := range allFiles { + info, err := os.Stat(f) + if err != nil { + logger.Error("Failed to get file info for %s: %v", f, err) + continue + } + if info.Size() > largestSize { + largestSize = info.Size() + largestFile = f + } + } + + // Remove all files except the largest one + for _, f := range allFiles { + if f != largestFile { + logger.Info("Removing duplicate: %s (keeping %s)", f, largestFile) + if err := os.Remove(f); err != nil { + logger.Error("Failed to remove file %s: %v", f, err) + } + } + } } logger.Info("Done") }