Filter non-image files and deduplicate by keeping the largest file in each group

This commit is contained in:
2025-05-23 13:14:54 +02:00
parent 9b49cfa9a5
commit b8f599d69f

53
main.go
View File

@@ -45,6 +45,12 @@ func main() {
go func(file string) { go func(file string) {
defer wg.Done() defer wg.Done()
log := logger.Default.WithPrefix(file) log := logger.Default.WithPrefix(file)
ext := filepath.Ext(file)
if ext != ".jpg" && ext != ".jpeg" && ext != ".png" {
log.Debug("Skipping non-image file: %s", file)
return
}
imgfile, err := os.Open(file) imgfile, err := os.Open(file)
if err != nil { if err != nil {
log.Error("Failed to open file: %v", err) log.Error("Failed to open file: %v", err)
@@ -68,9 +74,17 @@ func main() {
groupedImages := make(map[string][]string) groupedImages := make(map[string][]string)
wg.Wait() wg.Wait()
processed := make(map[string]bool)
hashes.Range(func(key, value interface{}) bool { hashes.Range(func(key, value interface{}) bool {
filea := key.(string) filea := key.(string)
hasha := value.(*goimagehash.ExtImageHash) hasha := value.(*goimagehash.ExtImageHash)
if processed[filea] {
return true
}
var group []string
hashes.Range(func(key, value interface{}) bool { hashes.Range(func(key, value interface{}) bool {
fileb := key.(string) fileb := key.(string)
hashb := value.(*goimagehash.ExtImageHash) hashb := value.(*goimagehash.ExtImageHash)
@@ -84,15 +98,48 @@ func main() {
} }
logger.Debug("Distance between %v and %v: %v", filea, fileb, distance) logger.Debug("Distance between %v and %v: %v", filea, fileb, distance)
if distance <= *thresh { if distance <= *thresh {
groupedImages[filea] = append(groupedImages[filea], fileb) group = append(group, fileb)
processed[fileb] = true
} }
return true return true
}) })
if len(group) > 0 {
groupedImages[filea] = group
processed[filea] = true
}
return true return true
}) })
for file, files := range groupedImages { // Deduplicate by keeping the largest file in each group
logger.Info("Grouped %v with %v", file, files) for file, group := range groupedImages {
// Add the main file to the group for size comparison
allFiles := append([]string{file}, group...)
// Find the largest file
var largestFile string
var largestSize int64
for _, f := range allFiles {
info, err := os.Stat(f)
if err != nil {
logger.Error("Failed to get file info for %s: %v", f, err)
continue
}
if info.Size() > largestSize {
largestSize = info.Size()
largestFile = f
}
}
// Remove all files except the largest one
for _, f := range allFiles {
if f != largestFile {
logger.Info("Removing duplicate: %s (keeping %s)", f, largestFile)
if err := os.Remove(f); err != nil {
logger.Error("Failed to remove file %s: %v", f, err)
}
}
}
} }
logger.Info("Done") logger.Info("Done")
} }