// This program finds visually similar images among the files matched by the
// glob patterns given on the command line and, for every group of
// near-duplicates, deletes all files except the largest one.
package main

import (
	"flag"
	"image"
	_ "image/jpeg" // registers the JPEG decoder with image.Decode
	_ "image/png"  // registers the PNG decoder with image.Decode
	"os"
	"path/filepath"
	"runtime"
	"strings"
	"sync"

	logger "git.site.quack-lab.dev/dave/cylogger"
	"github.com/bmatcuk/doublestar/v4"
	"github.com/corona10/goimagehash"
)

// main parses the command line, expands the glob patterns, hashes every
// matched image, groups images whose hash distance is within -thresh and
// removes the smaller files of each group.
func main() {
	thresh := flag.Int("thresh", 10, "Threshold for distance")
	flag.Parse()
	logger.InitFlag()

	logger.Info("Starting")
	logger.Info("Threshold: %v", *thresh)
	logger.Info("Patterns: %d", len(flag.Args()))

	files := expandPatterns(flag.Args())
	logger.Info("Patterns expanded to %d files", len(files))

	hashes := hashFiles(files)
	for _, group := range groupSimilar(files, hashes, *thresh) {
		removeDuplicates(group)
	}

	logger.Info("Done")
}

// expandPatterns resolves every doublestar glob pattern to the paths it
// matches. Patterns that fail to glob are logged and skipped. A path matched
// by more than one pattern is returned only once.
func expandPatterns(patterns []string) []string {
	var files []string
	seen := make(map[string]struct{})
	for _, pattern := range patterns {
		base, rest := doublestar.SplitPattern(pattern)
		logger.Debug("Globbing %q from %q", rest, base)
		matches, err := doublestar.Glob(os.DirFS(base), rest)
		if err != nil {
			logger.Error("Failed to glob pattern: %v", err)
			continue
		}
		logger.Debug("Glob %q in %q got %d matches", rest, base, len(matches))
		for _, match := range matches {
			path := filepath.Join(base, match)
			if _, dup := seen[path]; dup {
				continue
			}
			seen[path] = struct{}{}
			logger.Trace("Adding %q", path)
			files = append(files, path)
		}
	}
	return files
}

// hashFiles computes the perceptual hash of every file concurrently. The
// returned slice is index-aligned with files; entries are nil for files that
// were skipped or could not be hashed.
func hashFiles(files []string) []*goimagehash.ExtImageHash {
	hashes := make([]*goimagehash.ExtImageHash, len(files))

	// Bound the number of in-flight workers so that a large input set does
	// not open every file at once.
	sem := make(chan struct{}, runtime.GOMAXPROCS(0))
	var wg sync.WaitGroup
	for i, file := range files {
		wg.Add(1)
		sem <- struct{}{}
		go func(i int, file string) {
			defer wg.Done()
			defer func() { <-sem }()
			// Each goroutine writes only its own slot, so no locking is needed.
			hashes[i] = hashFile(file)
		}(i, file)
	}
	wg.Wait()
	return hashes
}

// hashFile decodes a single image and returns its 8x8 extended perception
// hash. It returns nil (after logging the reason) when the file does not have
// a JPEG/PNG extension or cannot be opened, decoded or hashed.
func hashFile(file string) *goimagehash.ExtImageHash {
	flog := logger.Default.WithPrefix(file)

	// Compare the extension case-insensitively so ".JPG" etc. are accepted.
	switch strings.ToLower(filepath.Ext(file)) {
	case ".jpg", ".jpeg", ".png":
	default:
		flog.Debug("Skipping non-image file: %s", file)
		return nil
	}

	imgfile, err := os.Open(file)
	if err != nil {
		flog.Error("Failed to open file: %v", err)
		return nil
	}
	defer imgfile.Close()

	// image.Decode picks the decoder from the file contents, so PNG files
	// are handled as well as JPEG files.
	img, _, err := image.Decode(imgfile)
	if err != nil {
		flog.Error("Failed to decode image: %v", err)
		return nil
	}

	hash, err := goimagehash.ExtPerceptionHash(img, 8, 8)
	if err != nil {
		flog.Error("Failed to calculate hash: %v", err)
		return nil
	}
	flog.Debug("Hashed: %v", hash)
	return hash
}

// groupSimilar partitions the hashed files into disjoint groups of
// near-duplicates. The first element of each group is the file the others
// were compared against; every other member is within thresh of it. Files
// without a hash (nil entry) and files with no near-duplicate are omitted.
func groupSimilar(files []string, hashes []*goimagehash.ExtImageHash, thresh int) [][]string {
	var groups [][]string
	grouped := make([]bool, len(files))
	for i, ha := range hashes {
		if ha == nil || grouped[i] {
			continue
		}
		group := []string{files[i]}
		// Only later files need checking: any earlier ungrouped file was
		// already compared against this one and found too distant.
		for j := i + 1; j < len(files); j++ {
			hb := hashes[j]
			// Skipping already grouped files keeps the groups disjoint, so
			// no file is considered for deletion twice.
			if hb == nil || grouped[j] {
				continue
			}
			distance, err := ha.Distance(hb)
			if err != nil {
				logger.Error("Failed to calculate distance: %v", err)
				continue
			}
			logger.Debug("Distance between %v and %v: %v", files[i], files[j], distance)
			if distance <= thresh {
				group = append(group, files[j])
				grouped[j] = true
			}
		}
		if len(group) > 1 {
			grouped[i] = true
			groups = append(groups, group)
		}
	}
	return groups
}

// removeDuplicates keeps the largest file of group and deletes the others.
// Files whose size cannot be determined are left untouched, and nothing is
// deleted when no file in the group could be inspected.
func removeDuplicates(group []string) {
	var largestFile string
	largestSize := int64(-1)
	candidates := make([]string, 0, len(group))
	for _, f := range group {
		info, err := os.Stat(f)
		if err != nil {
			logger.Error("Failed to get file info for %s: %v", f, err)
			continue
		}
		candidates = append(candidates, f)
		if info.Size() > largestSize {
			largestSize = info.Size()
			largestFile = f
		}
	}
	if len(candidates) == 0 {
		// Without a known keeper, deleting anything would risk losing
		// every copy of the image.
		return
	}

	for _, f := range candidates {
		if f == largestFile {
			continue
		}
		logger.Info("Removing duplicate: %s (keeping %s)", f, largestFile)
		if err := os.Remove(f); err != nil {
			logger.Error("Failed to remove file %s: %v", f, err)
		}
	}
}