package main

import (
	"bufio"
	"flag"
	"fmt"
	"log"
	"os"
	"path/filepath"
	"sort"
	"strings"
	"sync"
	"sync/atomic"
	"time"
)

// Error and Warning are package-level loggers with ANSI-colored prefixes,
// writing to stderr and stdout respectively.
var (
	Error   *log.Logger
	Warning *log.Logger
)

func init() {
	log.SetFlags(log.Lmicroseconds | log.Lshortfile)
	log.SetOutput(os.Stdout)
	Error = log.New(os.Stderr, fmt.Sprintf("%sERROR:%s ", "\033[0;101m", "\033[0m"), log.Lmicroseconds|log.Lshortfile)
	Warning = log.New(os.Stdout, fmt.Sprintf("%sWarning:%s ", "\033[0;93m", "\033[0m"), log.Lmicroseconds|log.Lshortfile)
}

// main walks the directory given as the first argument (default "."),
// classifies each regular file as text or binary by sniffing its first
// line, and prints the extensions whose files are more often binary
// than text.
func main() {
	flag.Parse()
	dir := flag.Arg(0)
	if dir == "" {
		dir = "."
	}
	dir = NormalizePath(dir)
	log.Printf("Scanning directory: %s", dir)

	files := make(chan string, 10000)
	status := make(chan error)
	go GetSyncFilesRecursively(dir, files, status)

	// extCounts maps a raw extension (e.g. ".exe") to its observed
	// [binary, text] counts. A plain map guarded by a mutex replaces the
	// earlier sync.Map whose LoadOrStore-then-Store pair was a lost-update
	// race under concurrent increments. Counting under the bare extension
	// also fixes the earlier aggregation bug where binary tallies were
	// keyed as "<ext> (binary)" and therefore never merged with the text
	// tallies for the same extension.
	var (
		mu        sync.Mutex
		extCounts = make(map[string][2]int)
	)

	var wg sync.WaitGroup
	// Bound the number of concurrently open files so huge trees do not
	// exhaust file descriptors (previously one unbounded goroutine per file).
	sem := make(chan struct{}, 128)
	for file := range files {
		wg.Add(1)
		sem <- struct{}{}
		go func(file string) {
			defer wg.Done()
			defer func() { <-sem }()
			isBinary, ok := sniffFile(file)
			if !ok {
				// Unreadable or empty files are not counted,
				// matching the original behavior.
				return
			}
			ext := filepath.Ext(file)
			mu.Lock()
			c := extCounts[ext]
			if isBinary {
				c[0]++
			} else {
				c[1]++
			}
			extCounts[ext] = c
			mu.Unlock()
		}(file)
	}
	wg.Wait()

	// Extensions with strictly more binary than text occurrences.
	var binaryDominantExts []string
	for ext, c := range extCounts {
		if c[0] > c[1] {
			binaryDominantExts = append(binaryDominantExts, ext)
		}
	}
	sort.Strings(binaryDominantExts)

	fmt.Println("Extensions that are predominantly binary:")
	for _, ext := range binaryDominantExts {
		c := extCounts[ext]
		fmt.Printf("Extension: %s, Binary Count: %d, Text Count: %d\n", ext, c[0], c[1])
	}
}

// sniffFile reads the first line of path and reports whether it looks
// binary. ok is false when the file cannot be opened or yields no line
// (empty file, or a first line exceeding bufio.Scanner's default 64 KiB
// token limit — a known limitation of this heuristic); such files are
// simply skipped by the caller.
func sniffFile(path string) (isBinary, ok bool) {
	f, err := os.Open(path)
	if err != nil {
		log.Printf("Error opening file %s: %v", path, err)
		return false, false
	}
	defer f.Close() // deferred so the handle is released even on a mid-scan panic
	scanner := bufio.NewScanner(f)
	if scanner.Scan() {
		return IsStringBinary(scanner.Text()), true
	}
	if err := scanner.Err(); err != nil {
		log.Printf("Error reading line from file %s: %v", path, err)
	}
	return false, false
}

// IsStringBinary reports whether s contains any byte outside printable
// ASCII (space through '~'). Note this deliberately coarse heuristic also
// flags tabs and any non-ASCII UTF-8 text as "binary".
func IsStringBinary(s string) bool {
	for _, c := range s {
		if c < ' ' || c > '~' {
			return true
		}
	}
	return false
}

// GetSyncFilesRecursively walks the tree rooted at input with a bounded
// pool of concurrent directory readers, sending every regular file path on
// output. Both output and status are closed when the walk completes; status
// is currently reserved and never carries a value.
//
// Completion is detected with an atomic "pending" counter of directories
// that are queued or in flight: the worker that drops it to zero closes the
// queue. This replaces the previous sleep-and-recheck polling loop, which
// could fire in the window between a directory being dequeued and its
// worker being counted, and whose WaitGroup was never Add-ed.
func GetSyncFilesRecursively(input string, output chan string, status chan error) {
	defer close(output)
	defer close(status)

	var filesProcessed, foldersProcessed, activeWorkers int32

	directories := make(chan string, 100000)
	workerPool := make(chan struct{}, 4000) // caps concurrent ReadDir calls

	// pending counts directories queued or currently being processed.
	// Seed with the root before any worker can decrement it.
	var pending int32 = 1
	directories <- input

	progressTicker := time.NewTicker(200 * time.Millisecond)
	defer progressTicker.Stop()
	done := make(chan struct{})
	reporterDone := make(chan struct{})

	// Progress reporter: periodic one-line status, then a final summary.
	go func() {
		defer close(reporterDone)
		for {
			select {
			case <-progressTicker.C:
				fmt.Printf("\rFiles processed: %8d; Folders processed: %8d; Active workers: %8d; Directory queue: %8d",
					atomic.LoadInt32(&filesProcessed), atomic.LoadInt32(&foldersProcessed),
					atomic.LoadInt32(&activeWorkers), len(directories))
			case <-done:
				fmt.Printf("\nFiles processed: %8d; Folders processed: %8d; Completed successfully\n",
					atomic.LoadInt32(&filesProcessed), atomic.LoadInt32(&foldersProcessed))
				return
			}
		}
	}()

	// Dispatcher: one worker goroutine per dequeued directory, throttled by
	// workerPool. The loop ends when the last worker closes directories.
	var wg sync.WaitGroup
	for directory := range directories {
		wg.Add(1)
		atomic.AddInt32(&activeWorkers, 1)
		go func(dir string) {
			defer wg.Done()
			workerPool <- struct{}{}
			atomic.AddInt32(&foldersProcessed, 1)
			processDirectory(dir, directories, output, &filesProcessed, &pending)
			<-workerPool
			atomic.AddInt32(&activeWorkers, -1)
			// Subdirectory increments happened inside processDirectory,
			// strictly before this decrement, so pending can only reach
			// zero when the entire tree has been visited.
			if atomic.AddInt32(&pending, -1) == 0 {
				close(directories)
			}
		}(directory)
	}
	wg.Wait()
	close(done)
	<-reporterDone // let the final summary print before output closes

	log.Printf("Files processed: %d; Folders processed: %d",
		atomic.LoadInt32(&filesProcessed), atomic.LoadInt32(&foldersProcessed))
}

// processDirectory lists one directory, enqueueing subdirectories (bumping
// pending once per subdirectory before the send) and emitting file paths on
// output. Read errors are logged and the directory is skipped.
func processDirectory(directory string, directories chan<- string, output chan<- string, filesProcessed *int32, pending *int32) {
	entries, err := os.ReadDir(directory)
	if err != nil {
		log.Printf("Error reading directory %s: %+v", directory, err)
		return
	}
	for _, entry := range entries {
		full := filepath.Join(directory, entry.Name())
		if entry.IsDir() {
			atomic.AddInt32(pending, 1)
			directories <- full
		} else {
			output <- full
			atomic.AddInt32(filesProcessed, 1)
		}
	}
}

// NormalizePath cleans input, converts separators to forward slashes, and
// strips any double-quote characters (e.g. from shell-quoted arguments).
func NormalizePath(input string) string {
	input = filepath.Clean(input)
	input = filepath.ToSlash(input)
	input = strings.ReplaceAll(input, "\"", "")
	return input
}