Filter non-image files and deduplicate by keeping the largest file in each group
This commit is contained in:
53
main.go
53
main.go
@@ -45,6 +45,12 @@ func main() {
|
|||||||
go func(file string) {
|
go func(file string) {
|
||||||
defer wg.Done()
|
defer wg.Done()
|
||||||
log := logger.Default.WithPrefix(file)
|
log := logger.Default.WithPrefix(file)
|
||||||
|
ext := filepath.Ext(file)
|
||||||
|
if ext != ".jpg" && ext != ".jpeg" && ext != ".png" {
|
||||||
|
log.Debug("Skipping non-image file: %s", file)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
imgfile, err := os.Open(file)
|
imgfile, err := os.Open(file)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Error("Failed to open file: %v", err)
|
log.Error("Failed to open file: %v", err)
|
||||||
@@ -68,9 +74,17 @@ func main() {
|
|||||||
|
|
||||||
groupedImages := make(map[string][]string)
|
groupedImages := make(map[string][]string)
|
||||||
wg.Wait()
|
wg.Wait()
|
||||||
|
|
||||||
|
processed := make(map[string]bool)
|
||||||
hashes.Range(func(key, value interface{}) bool {
|
hashes.Range(func(key, value interface{}) bool {
|
||||||
filea := key.(string)
|
filea := key.(string)
|
||||||
hasha := value.(*goimagehash.ExtImageHash)
|
hasha := value.(*goimagehash.ExtImageHash)
|
||||||
|
|
||||||
|
if processed[filea] {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
var group []string
|
||||||
hashes.Range(func(key, value interface{}) bool {
|
hashes.Range(func(key, value interface{}) bool {
|
||||||
fileb := key.(string)
|
fileb := key.(string)
|
||||||
hashb := value.(*goimagehash.ExtImageHash)
|
hashb := value.(*goimagehash.ExtImageHash)
|
||||||
@@ -84,15 +98,48 @@ func main() {
|
|||||||
}
|
}
|
||||||
logger.Debug("Distance between %v and %v: %v", filea, fileb, distance)
|
logger.Debug("Distance between %v and %v: %v", filea, fileb, distance)
|
||||||
if distance <= *thresh {
|
if distance <= *thresh {
|
||||||
groupedImages[filea] = append(groupedImages[filea], fileb)
|
group = append(group, fileb)
|
||||||
|
processed[fileb] = true
|
||||||
}
|
}
|
||||||
return true
|
return true
|
||||||
})
|
})
|
||||||
|
|
||||||
|
if len(group) > 0 {
|
||||||
|
groupedImages[filea] = group
|
||||||
|
processed[filea] = true
|
||||||
|
}
|
||||||
return true
|
return true
|
||||||
})
|
})
|
||||||
|
|
||||||
for file, files := range groupedImages {
|
// Deduplicate by keeping the largest file in each group
|
||||||
logger.Info("Grouped %v with %v", file, files)
|
for file, group := range groupedImages {
|
||||||
|
// Put the group's key file in the same list as its matches so it also takes part in the size comparison
|
||||||
|
allFiles := append([]string{file}, group...)
|
||||||
|
|
||||||
|
// Find the largest file by on-disk size; files that cannot be stat'ed are logged and skipped
|
||||||
|
var largestFile string
|
||||||
|
var largestSize int64
|
||||||
|
for _, f := range allFiles {
|
||||||
|
info, err := os.Stat(f)
|
||||||
|
if err != nil {
|
||||||
|
logger.Error("Failed to get file info for %s: %v", f, err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if info.Size() > largestSize {
|
||||||
|
largestSize = info.Size()
|
||||||
|
largestFile = f
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
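// Remove all files except the largest one
// NOTE(review): largestFile is still "" if every os.Stat call failed or every file is 0 bytes
// (Size() > largestSize is never true when starting from 0), in which case the check below
// matches every file and the whole group is removed — consider skipping the group when largestFile == "".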
|
||||||
|
for _, f := range allFiles {
|
||||||
|
if f != largestFile {
|
||||||
|
logger.Info("Removing duplicate: %s (keeping %s)", f, largestFile)
|
||||||
|
if err := os.Remove(f); err != nil {
|
||||||
|
logger.Error("Failed to remove file %s: %v", f, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
logger.Info("Done")
|
logger.Info("Done")
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user