Add worker concurrency and support for PNG images

This commit is contained in:
2025-05-23 13:25:10 +02:00
parent 47617fcdc5
commit 6445a2eded

79
main.go
View File

@@ -2,7 +2,9 @@ package main
import ( import (
"flag" "flag"
"image"
"image/jpeg" "image/jpeg"
"image/png"
"os" "os"
"path/filepath" "path/filepath"
"sync" "sync"
@@ -14,6 +16,7 @@ import (
func main() { func main() {
thresh := flag.Int("thresh", 10, "Threshold for distance") thresh := flag.Int("thresh", 10, "Threshold for distance")
workers := flag.Int("workers", 100, "Number of workers")
flag.Parse() flag.Parse()
logger.InitFlag() logger.InitFlag()
hashes := &sync.Map{} hashes := &sync.Map{}
@@ -39,11 +42,14 @@ func main() {
} }
logger.Info("Patterns expanded to %d files", len(files)) logger.Info("Patterns expanded to %d files", len(files))
workerChan := make(chan struct{}, *workers)
wg := sync.WaitGroup{} wg := sync.WaitGroup{}
for _, file := range files { for _, file := range files {
workerChan <- struct{}{}
wg.Add(1) wg.Add(1)
go func(file string) { go func(file string) {
defer wg.Done() defer wg.Done()
defer func() { <-workerChan }()
log := logger.Default.WithPrefix(file) log := logger.Default.WithPrefix(file)
ext := filepath.Ext(file) ext := filepath.Ext(file)
if ext != ".jpg" && ext != ".jpeg" && ext != ".png" { if ext != ".jpg" && ext != ".jpeg" && ext != ".png" {
@@ -57,11 +63,19 @@ func main() {
return return
} }
defer imgfile.Close() defer imgfile.Close()
img, err := jpeg.Decode(imgfile)
isPng := ext == ".png"
var img image.Image
if isPng {
img, err = png.Decode(imgfile)
} else {
img, err = jpeg.Decode(imgfile)
}
if err != nil { if err != nil {
log.Error("Failed to decode image: %v", err) log.Error("Failed to decode image: %v", err)
return return
} }
hash, err := goimagehash.ExtPerceptionHash(img, 8, 8) hash, err := goimagehash.ExtPerceptionHash(img, 8, 8)
if err != nil { if err != nil {
log.Error("Failed to calculate hash: %v", err) log.Error("Failed to calculate hash: %v", err)
@@ -75,41 +89,48 @@ func main() {
groupedImages := make(map[string][]string) groupedImages := make(map[string][]string)
wg.Wait() wg.Wait()
processed := make(map[string]bool) processed := &sync.Map{}
hashes.Range(func(key, value interface{}) bool { hashes.Range(func(key, value interface{}) bool {
filea := key.(string) workerChan <- struct{}{}
hasha := value.(*goimagehash.ExtImageHash) wg.Add(1)
go func(key, value interface{}) {
defer wg.Done()
defer func() { <-workerChan }()
filea := key.(string)
hasha := value.(*goimagehash.ExtImageHash)
if processed[filea] { if _, ok := processed.Load(filea); ok {
return true return
} }
var group []string var group []string
hashes.Range(func(key, value interface{}) bool { hashes.Range(func(key, value interface{}) bool {
fileb := key.(string) fileb := key.(string)
hashb := value.(*goimagehash.ExtImageHash) hashb := value.(*goimagehash.ExtImageHash)
if filea == fileb { if filea == fileb {
return true
}
distance, err := hasha.Distance(hashb)
if err != nil {
logger.Error("Failed to calculate distance: %v", err)
return true
}
logger.Debug("Distance between %v and %v: %v", filea, fileb, distance)
if distance <= *thresh {
group = append(group, fileb)
processed.Store(fileb, true)
}
return true return true
} })
distance, err := hasha.Distance(hashb)
if err != nil {
logger.Error("Failed to calculate distance: %v", err)
return true
}
logger.Debug("Distance between %v and %v: %v", filea, fileb, distance)
if distance <= *thresh {
group = append(group, fileb)
processed[fileb] = true
}
return true
})
if len(group) > 0 { if len(group) > 0 {
groupedImages[filea] = group groupedImages[filea] = group
processed[filea] = true processed.Store(filea, true)
} }
}(key, value)
return true return true
}) })
wg.Wait()
// Deduplicate by keeping the largest file in each group // Deduplicate by keeping the largest file in each group
for file, group := range groupedImages { for file, group := range groupedImages {