146 lines
3.5 KiB
Go
146 lines
3.5 KiB
Go
package main
|
|
|
|
import (
|
|
"flag"
|
|
"image/jpeg"
|
|
"os"
|
|
"path/filepath"
|
|
"sync"
|
|
|
|
logger "git.site.quack-lab.dev/dave/cylogger"
|
|
"github.com/bmatcuk/doublestar/v4"
|
|
"github.com/corona10/goimagehash"
|
|
)
|
|
|
|
func main() {
|
|
thresh := flag.Int("thresh", 10, "Threshold for distance")
|
|
flag.Parse()
|
|
logger.InitFlag()
|
|
hashes := &sync.Map{}
|
|
logger.Info("Starting")
|
|
logger.Info("Threshold: %v", *thresh)
|
|
logger.Info("Patterns: %d", len(flag.Args()))
|
|
|
|
files := make([]string, 0)
|
|
for _, pattern := range flag.Args() {
|
|
base, pattern := doublestar.SplitPattern(pattern)
|
|
logger.Debug("Globbing %q from %q", pattern, base)
|
|
matches, err := doublestar.Glob(os.DirFS(base), pattern)
|
|
if err != nil {
|
|
logger.Error("Failed to glob pattern: %v", err)
|
|
continue
|
|
}
|
|
logger.Debug("Glob %q in %q got %d matches", pattern, base, len(matches))
|
|
for _, match := range matches {
|
|
match = filepath.Join(base, match)
|
|
logger.Trace("Adding %q", match)
|
|
files = append(files, match)
|
|
}
|
|
}
|
|
logger.Info("Patterns expanded to %d files", len(files))
|
|
|
|
wg := sync.WaitGroup{}
|
|
for _, file := range files {
|
|
wg.Add(1)
|
|
go func(file string) {
|
|
defer wg.Done()
|
|
log := logger.Default.WithPrefix(file)
|
|
ext := filepath.Ext(file)
|
|
if ext != ".jpg" && ext != ".jpeg" && ext != ".png" {
|
|
log.Debug("Skipping non-image file: %s", file)
|
|
return
|
|
}
|
|
|
|
imgfile, err := os.Open(file)
|
|
if err != nil {
|
|
log.Error("Failed to open file: %v", err)
|
|
return
|
|
}
|
|
defer imgfile.Close()
|
|
img, err := jpeg.Decode(imgfile)
|
|
if err != nil {
|
|
log.Error("Failed to decode image: %v", err)
|
|
return
|
|
}
|
|
hash, err := goimagehash.ExtPerceptionHash(img, 8, 8)
|
|
if err != nil {
|
|
log.Error("Failed to calculate hash: %v", err)
|
|
return
|
|
}
|
|
log.Debug("Hashed: %v", hash)
|
|
hashes.Store(file, hash)
|
|
}(file)
|
|
}
|
|
|
|
groupedImages := make(map[string][]string)
|
|
wg.Wait()
|
|
|
|
processed := make(map[string]bool)
|
|
hashes.Range(func(key, value interface{}) bool {
|
|
filea := key.(string)
|
|
hasha := value.(*goimagehash.ExtImageHash)
|
|
|
|
if processed[filea] {
|
|
return true
|
|
}
|
|
|
|
var group []string
|
|
hashes.Range(func(key, value interface{}) bool {
|
|
fileb := key.(string)
|
|
hashb := value.(*goimagehash.ExtImageHash)
|
|
if filea == fileb {
|
|
return true
|
|
}
|
|
distance, err := hasha.Distance(hashb)
|
|
if err != nil {
|
|
logger.Error("Failed to calculate distance: %v", err)
|
|
return true
|
|
}
|
|
logger.Debug("Distance between %v and %v: %v", filea, fileb, distance)
|
|
if distance <= *thresh {
|
|
group = append(group, fileb)
|
|
processed[fileb] = true
|
|
}
|
|
return true
|
|
})
|
|
|
|
if len(group) > 0 {
|
|
groupedImages[filea] = group
|
|
processed[filea] = true
|
|
}
|
|
return true
|
|
})
|
|
|
|
// Deduplicate by keeping the largest file in each group
|
|
for file, group := range groupedImages {
|
|
// Add the main file to the group for size comparison
|
|
allFiles := append([]string{file}, group...)
|
|
|
|
// Find the largest file
|
|
var largestFile string
|
|
var largestSize int64
|
|
for _, f := range allFiles {
|
|
info, err := os.Stat(f)
|
|
if err != nil {
|
|
logger.Error("Failed to get file info for %s: %v", f, err)
|
|
continue
|
|
}
|
|
if info.Size() > largestSize {
|
|
largestSize = info.Size()
|
|
largestFile = f
|
|
}
|
|
}
|
|
|
|
// Remove all files except the largest one
|
|
for _, f := range allFiles {
|
|
if f != largestFile {
|
|
logger.Info("Removing duplicate: %s (keeping %s)", f, largestFile)
|
|
if err := os.Remove(f); err != nil {
|
|
logger.Error("Failed to remove file %s: %v", f, err)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
logger.Info("Done")
|
|
}
|