Add worker concurrency and support for PNG images
This commit is contained in:
79
main.go
79
main.go
@@ -2,7 +2,9 @@ package main
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"flag"
|
"flag"
|
||||||
|
"image"
|
||||||
"image/jpeg"
|
"image/jpeg"
|
||||||
|
"image/png"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"sync"
|
"sync"
|
||||||
@@ -14,6 +16,7 @@ import (
|
|||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
thresh := flag.Int("thresh", 10, "Threshold for distance")
|
thresh := flag.Int("thresh", 10, "Threshold for distance")
|
||||||
|
workers := flag.Int("workers", 100, "Number of workers")
|
||||||
flag.Parse()
|
flag.Parse()
|
||||||
logger.InitFlag()
|
logger.InitFlag()
|
||||||
hashes := &sync.Map{}
|
hashes := &sync.Map{}
|
||||||
@@ -39,11 +42,14 @@ func main() {
|
|||||||
}
|
}
|
||||||
logger.Info("Patterns expanded to %d files", len(files))
|
logger.Info("Patterns expanded to %d files", len(files))
|
||||||
|
|
||||||
|
workerChan := make(chan struct{}, *workers)
|
||||||
wg := sync.WaitGroup{}
|
wg := sync.WaitGroup{}
|
||||||
for _, file := range files {
|
for _, file := range files {
|
||||||
|
workerChan <- struct{}{}
|
||||||
wg.Add(1)
|
wg.Add(1)
|
||||||
go func(file string) {
|
go func(file string) {
|
||||||
defer wg.Done()
|
defer wg.Done()
|
||||||
|
defer func() { <-workerChan }()
|
||||||
log := logger.Default.WithPrefix(file)
|
log := logger.Default.WithPrefix(file)
|
||||||
ext := filepath.Ext(file)
|
ext := filepath.Ext(file)
|
||||||
if ext != ".jpg" && ext != ".jpeg" && ext != ".png" {
|
if ext != ".jpg" && ext != ".jpeg" && ext != ".png" {
|
||||||
@@ -57,11 +63,19 @@ func main() {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
defer imgfile.Close()
|
defer imgfile.Close()
|
||||||
img, err := jpeg.Decode(imgfile)
|
|
||||||
|
isPng := ext == ".png"
|
||||||
|
var img image.Image
|
||||||
|
if isPng {
|
||||||
|
img, err = png.Decode(imgfile)
|
||||||
|
} else {
|
||||||
|
img, err = jpeg.Decode(imgfile)
|
||||||
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Error("Failed to decode image: %v", err)
|
log.Error("Failed to decode image: %v", err)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
hash, err := goimagehash.ExtPerceptionHash(img, 8, 8)
|
hash, err := goimagehash.ExtPerceptionHash(img, 8, 8)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Error("Failed to calculate hash: %v", err)
|
log.Error("Failed to calculate hash: %v", err)
|
||||||
@@ -75,41 +89,48 @@ func main() {
|
|||||||
groupedImages := make(map[string][]string)
|
groupedImages := make(map[string][]string)
|
||||||
wg.Wait()
|
wg.Wait()
|
||||||
|
|
||||||
processed := make(map[string]bool)
|
processed := &sync.Map{}
|
||||||
hashes.Range(func(key, value interface{}) bool {
|
hashes.Range(func(key, value interface{}) bool {
|
||||||
filea := key.(string)
|
workerChan <- struct{}{}
|
||||||
hasha := value.(*goimagehash.ExtImageHash)
|
wg.Add(1)
|
||||||
|
go func(key, value interface{}) {
|
||||||
|
defer wg.Done()
|
||||||
|
defer func() { <-workerChan }()
|
||||||
|
filea := key.(string)
|
||||||
|
hasha := value.(*goimagehash.ExtImageHash)
|
||||||
|
|
||||||
if processed[filea] {
|
if _, ok := processed.Load(filea); ok {
|
||||||
return true
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
var group []string
|
var group []string
|
||||||
hashes.Range(func(key, value interface{}) bool {
|
hashes.Range(func(key, value interface{}) bool {
|
||||||
fileb := key.(string)
|
fileb := key.(string)
|
||||||
hashb := value.(*goimagehash.ExtImageHash)
|
hashb := value.(*goimagehash.ExtImageHash)
|
||||||
if filea == fileb {
|
if filea == fileb {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
distance, err := hasha.Distance(hashb)
|
||||||
|
if err != nil {
|
||||||
|
logger.Error("Failed to calculate distance: %v", err)
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
logger.Debug("Distance between %v and %v: %v", filea, fileb, distance)
|
||||||
|
if distance <= *thresh {
|
||||||
|
group = append(group, fileb)
|
||||||
|
processed.Store(fileb, true)
|
||||||
|
}
|
||||||
return true
|
return true
|
||||||
}
|
})
|
||||||
distance, err := hasha.Distance(hashb)
|
|
||||||
if err != nil {
|
|
||||||
logger.Error("Failed to calculate distance: %v", err)
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
logger.Debug("Distance between %v and %v: %v", filea, fileb, distance)
|
|
||||||
if distance <= *thresh {
|
|
||||||
group = append(group, fileb)
|
|
||||||
processed[fileb] = true
|
|
||||||
}
|
|
||||||
return true
|
|
||||||
})
|
|
||||||
|
|
||||||
if len(group) > 0 {
|
if len(group) > 0 {
|
||||||
groupedImages[filea] = group
|
groupedImages[filea] = group
|
||||||
processed[filea] = true
|
processed.Store(filea, true)
|
||||||
}
|
}
|
||||||
|
}(key, value)
|
||||||
return true
|
return true
|
||||||
})
|
})
|
||||||
|
wg.Wait()
|
||||||
|
|
||||||
// Deduplicate by keeping the largest file in each group
|
// Deduplicate by keeping the largest file in each group
|
||||||
for file, group := range groupedImages {
|
for file, group := range groupedImages {
|
||||||
|
Reference in New Issue
Block a user