Maybe fix the directory crawler

This commit is contained in:
2025-02-26 11:17:51 +01:00
parent 125bf78c16
commit 83477d5f18

130
util.go
View File

@@ -79,76 +79,104 @@ func GetSyncFilesRecursively(input string, output chan string, status chan error
var filesProcessed int32 var filesProcessed int32
var foldersProcessed int32 var foldersProcessed int32
var activeWorkers int32
progressTicker := time.NewTicker(200 * time.Millisecond) progressTicker := time.NewTicker(200 * time.Millisecond)
defer progressTicker.Stop() defer progressTicker.Stop()
var wg sync.WaitGroup done := make(chan struct{})
var initial sync.Once defer close(done)
var done bool
wg.Add(1)
directories := make(chan string, 100000) directories := make(chan string, 100000)
workerPool := make(chan struct{}, 4000) workerPool := make(chan struct{}, 4000)
directories <- input directories <- input
go func() { go func() {
for { for {
fmt.Printf("\rFiles processed: %d; Folders processed: %d; Workers: %d; Directory Stack Size: %d;", atomic.LoadInt32((&filesProcessed)), atomic.LoadInt32(&foldersProcessed), len(workerPool), len(directories)) select {
<-progressTicker.C case <-progressTicker.C:
dirCount := len(directories)
workers := atomic.LoadInt32(&activeWorkers)
fmt.Printf("\rFiles processed: %d; Folders processed: %d; Active workers: %d; Directory queue: %d",
atomic.LoadInt32(&filesProcessed),
atomic.LoadInt32(&foldersProcessed),
workers,
dirCount)
case <-done:
// Final progress update
fmt.Printf("\rFiles processed: %d; Folders processed: %d; Completed successfully\n",
atomic.LoadInt32(&filesProcessed),
atomic.LoadInt32(&foldersProcessed))
return
}
} }
}() }()
// log.Printf("%+v", len(workerPool)) allDone := make(chan struct{})
go func() {
for directory := range directories {
workerPool <- struct{}{}
wg.Add(1)
go func(directory string) {
atomic.AddInt32(&foldersProcessed, 1)
defer wg.Done()
defer func() { <-workerPool }()
files, err := os.ReadDir(directory) go func() {
if err != nil { var wg sync.WaitGroup
log.Printf("Error reading directory %s: %+v", directory, err)
go func() {
for {
if atomic.LoadInt32(&activeWorkers) == 0 && len(directories) == 0 {
time.Sleep(10 * time.Millisecond)
if atomic.LoadInt32(&activeWorkers) == 0 && len(directories) == 0 {
close(allDone)
return
}
}
time.Sleep(50 * time.Millisecond)
}
}()
for {
select {
case directory, ok := <-directories:
if !ok {
wg.Wait()
return return
} }
for _, file := range files { atomic.AddInt32(&activeWorkers, 1)
// log.Printf("Processing file %s", file.Name())
if file.IsDir() { go func(dir string) {
directories <- filepath.Join(directory, file.Name()) workerPool <- struct{}{}
} else {
// log.Println(file.Name(), DirRegex.MatchString(file.Name())) atomic.AddInt32(&foldersProcessed, 1)
if FileRegex.MatchString(file.Name()) || IsYAMLSyncFile(file.Name()) { processDirectory(dir, directories, output, &filesProcessed)
// log.Printf("Writing")
output <- filepath.Join(directory, file.Name()) <-workerPool
} atomic.AddInt32(&activeWorkers, -1)
atomic.AddInt32(&filesProcessed, 1) }(directory)
} }
}
// log.Printf("Done reading directory %s", directory)
done = len(directories) == 0
if done {
initial.Do(func() {
wg.Done()
})
}
}(directory)
} }
}() }()
// This actually does not go through ALL files sadly... <-allDone
// It so happens (very often) that we manage to quit between one iteration ending
// And another beginning log.Printf("Files processed: %d; Folders processed: %d",
// In such a state workgroup is decreased and, before it has a chance to increase, we are done atomic.LoadInt32(&filesProcessed),
// What I should do here is only terminate if directories is empty atomic.LoadInt32(&foldersProcessed))
// ...but how do I do that? }
// I might be wrong... Fuck knows...
// It also sometimes happens that wg.Wait triggers after wg.Done on line 97 but before the next (what would be!) wg.Add on line 94 func processDirectory(directory string, directories chan<- string, output chan<- string, filesProcessed *int32) {
// This happens much more often with a small number of workers files, err := os.ReadDir(directory)
// Such is the nature of race conditions... if err != nil {
wg.Wait() log.Printf("Error reading directory %s: %+v", directory, err)
log.Printf("Files processed: %d; Folders processed: %d", filesProcessed, foldersProcessed) return
}
for _, file := range files {
if file.IsDir() {
directories <- filepath.Join(directory, file.Name())
} else {
if FileRegex.MatchString(file.Name()) || IsYAMLSyncFile(file.Name()) {
output <- filepath.Join(directory, file.Name())
}
atomic.AddInt32(filesProcessed, 1)
}
}
} }
func IsYAMLSyncFile(filename string) bool { func IsYAMLSyncFile(filename string) bool {