Filter non-image files and deduplicate by keeping the largest file in each group
This commit is contained in:
57
main.go
57
main.go
@@ -45,6 +45,12 @@ func main() {
|
||||
go func(file string) {
|
||||
defer wg.Done()
|
||||
log := logger.Default.WithPrefix(file)
|
||||
ext := filepath.Ext(file)
|
||||
if ext != ".jpg" && ext != ".jpeg" && ext != ".png" {
|
||||
log.Debug("Skipping non-image file: %s", file)
|
||||
return
|
||||
}
|
||||
|
||||
imgfile, err := os.Open(file)
|
||||
if err != nil {
|
||||
log.Error("Failed to open file: %v", err)
|
||||
@@ -68,9 +74,17 @@ func main() {
|
||||
|
||||
groupedImages := make(map[string][]string)
|
||||
wg.Wait()
|
||||
|
||||
processed := make(map[string]bool)
|
||||
hashes.Range(func(key, value interface{}) bool {
|
||||
filea := key.(string)
|
||||
hasha := value.(*goimagehash.ExtImageHash)
|
||||
|
||||
if processed[filea] {
|
||||
return true
|
||||
}
|
||||
|
||||
var group []string
|
||||
hashes.Range(func(key, value interface{}) bool {
|
||||
fileb := key.(string)
|
||||
hashb := value.(*goimagehash.ExtImageHash)
|
||||
@@ -84,15 +98,48 @@ func main() {
|
||||
}
|
||||
logger.Debug("Distance between %v and %v: %v", filea, fileb, distance)
|
||||
if distance <= *thresh {
|
||||
groupedImages[filea] = append(groupedImages[filea], fileb)
|
||||
group = append(group, fileb)
|
||||
processed[fileb] = true
|
||||
}
|
||||
return true
|
||||
})
|
||||
return true
|
||||
})
|
||||
|
||||
for file, files := range groupedImages {
|
||||
logger.Info("Grouped %v with %v", file, files)
|
||||
if len(group) > 0 {
|
||||
groupedImages[filea] = group
|
||||
processed[filea] = true
|
||||
}
|
||||
return true
|
||||
})
|
||||
|
||||
// Deduplicate by keeping the largest file in each group
|
||||
for file, group := range groupedImages {
|
||||
// Add the main file to the group for size comparison
|
||||
allFiles := append([]string{file}, group...)
|
||||
|
||||
// Find the largest file
|
||||
var largestFile string
|
||||
var largestSize int64
|
||||
for _, f := range allFiles {
|
||||
info, err := os.Stat(f)
|
||||
if err != nil {
|
||||
logger.Error("Failed to get file info for %s: %v", f, err)
|
||||
continue
|
||||
}
|
||||
if info.Size() > largestSize {
|
||||
largestSize = info.Size()
|
||||
largestFile = f
|
||||
}
|
||||
}
|
||||
|
||||
// Remove all files except the largest one
|
||||
for _, f := range allFiles {
|
||||
if f != largestFile {
|
||||
logger.Info("Removing duplicate: %s (keeping %s)", f, largestFile)
|
||||
if err := os.Remove(f); err != nil {
|
||||
logger.Error("Failed to remove file %s: %v", f, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
logger.Info("Done")
|
||||
}
|
||||
|
Reference in New Issue
Block a user