package main

import (
	"flag"
	"image"
	"image/jpeg"
	"image/png"
	"os"
	"path/filepath"
	"strings"
	"sync"

	logger "git.site.quack-lab.dev/dave/cylogger"
	"github.com/bmatcuk/doublestar/v4"
	"github.com/corona10/goimagehash"
)

// main finds near-duplicate images among the files matched by the given
// doublestar glob patterns, groups files whose perceptual-hash distance is
// within -thresh, and removes every file in a group except the largest one.
//
// Flags:
//
//	-thresh  maximum hash distance for two images to count as duplicates
//	-workers maximum number of concurrent hashing/grouping goroutines
func main() {
	thresh := flag.Int("thresh", 10, "Threshold for distance")
	workers := flag.Int("workers", 100, "Number of workers")
	flag.Parse()
	logger.InitFlag()

	// file path -> *goimagehash.ExtImageHash, filled concurrently below.
	hashes := &sync.Map{}

	logger.Info("Starting")
	logger.Info("Threshold: %v", *thresh)
	logger.Info("Patterns: %d", len(flag.Args()))

	// Expand every pattern (doublestar supports ** globs) into a flat file list.
	files := make([]string, 0)
	for _, pattern := range flag.Args() {
		base, pattern := doublestar.SplitPattern(pattern)
		logger.Debug("Globbing %q from %q", pattern, base)
		matches, err := doublestar.Glob(os.DirFS(base), pattern)
		if err != nil {
			logger.Error("Failed to glob pattern: %v", err)
			continue
		}
		logger.Debug("Glob %q in %q got %d matches", pattern, base, len(matches))
		for _, match := range matches {
			match = filepath.Join(base, match)
			logger.Trace("Adding %q", match)
			files = append(files, match)
		}
	}
	logger.Info("Patterns expanded to %d files", len(files))

	// Phase 1: hash every image concurrently. The buffered channel acts as a
	// semaphore bounding concurrency to *workers.
	workerChan := make(chan struct{}, *workers)
	wg := sync.WaitGroup{}
	for _, file := range files {
		workerChan <- struct{}{}
		wg.Add(1)
		go func(file string) {
			defer wg.Done()
			defer func() { <-workerChan }()
			log := logger.Default.WithPrefix(file)

			// filepath.Ext preserves case; lowercase it so .JPG/.PNG are not
			// silently skipped (the original compared case-sensitively).
			ext := strings.ToLower(filepath.Ext(file))
			if ext != ".jpg" && ext != ".jpeg" && ext != ".png" {
				log.Debug("Skipping non-image file: %s", file)
				return
			}
			imgfile, err := os.Open(file)
			if err != nil {
				log.Error("Failed to open file: %v", err)
				return
			}
			defer imgfile.Close()

			var img image.Image
			if ext == ".png" {
				img, err = png.Decode(imgfile)
			} else {
				img, err = jpeg.Decode(imgfile)
			}
			if err != nil {
				log.Error("Failed to decode image: %v", err)
				return
			}
			hash, err := goimagehash.ExtPerceptionHash(img, 8, 8)
			if err != nil {
				log.Error("Failed to calculate hash: %v", err)
				return
			}
			log.Debug("Hashed: %v", hash)
			hashes.Store(file, hash)
		}(file)
	}
	wg.Wait()

	// Phase 2: group files whose hash distance is within the threshold.
	// groupedImages is written from multiple goroutines, so every write must
	// hold groupedMu — the original code wrote to the plain map concurrently,
	// which is a data race (the runtime panics with "concurrent map writes").
	groupedImages := make(map[string][]string)
	var groupedMu sync.Mutex
	processed := &sync.Map{}
	hashes.Range(func(key, value interface{}) bool {
		workerChan <- struct{}{}
		wg.Add(1)
		go func(key, value interface{}) {
			defer wg.Done()
			defer func() { <-workerChan }()
			filea := key.(string)
			hasha := value.(*goimagehash.ExtImageHash)
			// Skip files already claimed by an earlier group. Best-effort:
			// a concurrent goroutine may claim filea after this check.
			if _, ok := processed.Load(filea); ok {
				return
			}
			var group []string
			hashes.Range(func(key, value interface{}) bool {
				fileb := key.(string)
				hashb := value.(*goimagehash.ExtImageHash)
				if filea == fileb {
					return true
				}
				distance, err := hasha.Distance(hashb)
				if err != nil {
					logger.Error("Failed to calculate distance: %v", err)
					return true
				}
				logger.Debug("Distance between %v and %v: %v", filea, fileb, distance)
				if distance <= *thresh {
					group = append(group, fileb)
					processed.Store(fileb, true)
				}
				return true
			})
			if len(group) > 0 {
				groupedMu.Lock()
				groupedImages[filea] = group
				groupedMu.Unlock()
				processed.Store(filea, true)
			}
		}(key, value)
		return true
	})
	wg.Wait()

	// Phase 3: deduplicate by keeping only the largest file in each group.
	for file, group := range groupedImages {
		// Include the group's anchor file in the size comparison.
		allFiles := append([]string{file}, group...)

		var largestFile string
		var largestSize int64
		for _, f := range allFiles {
			info, err := os.Stat(f)
			if err != nil {
				logger.Error("Failed to get file info for %s: %v", f, err)
				continue
			}
			if info.Size() > largestSize {
				largestSize = info.Size()
				largestFile = f
			}
		}
		// If every Stat failed, largestFile is empty; without this guard the
		// loop below would delete every file in the group.
		if largestFile == "" {
			logger.Error("Could not determine largest file for group %s, skipping", file)
			continue
		}

		for _, f := range allFiles {
			if f != largestFile {
				logger.Info("Removing duplicate: %s (keeping %s)", f, largestFile)
				if err := os.Remove(f); err != nil {
					logger.Error("Failed to remove file %s: %v", f, err)
				}
			}
		}
	}
	logger.Info("Done")
}