package main

import (
	"flag"
	"fmt"
	"io"
	"log"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"sync/atomic"
	"time"
)

var Error *log.Logger
var Warning *log.Logger

func init() {
	log.SetFlags(log.Lmicroseconds | log.Lshortfile)
	log.SetOutput(os.Stdout)
	Error = log.New(os.Stderr, fmt.Sprintf("%sERROR:%s ", "\033[0;101m", "\033[0m"), log.Lmicroseconds|log.Lshortfile)
	Warning = log.New(os.Stdout, fmt.Sprintf("%sWarning:%s ", "\033[0;93m", "\033[0m"), log.Lmicroseconds|log.Lshortfile)
}

// ExtData tracks, per file extension, how many files were classified as binary
// and how many as text. The counters are updated from many goroutines, so they
// are int64 and must be incremented atomically.
type ExtData struct {
	ext         string
	binaryCount int64
	textCount   int64
}

var debug bool

func main() {
	raw := flag.Bool("r", false, "Raw, application-friendly output (extensions only)")
	debugF := flag.Bool("d", false, "Debug mode")
	flag.Parse()
	debug = *debugF

	dir := flag.Arg(0)
	if dir == "" {
		dir = "."
	}
	dir = NormalizePath(dir)
	if debug {
		log.Printf("Scanning directory: %s", dir)
	}

	files := make(chan string, 10000)
	status := make(chan error)
	go GetSyncFilesRecursively(dir, files, status)

	extensionTypeCount := sync.Map{}
	wg := sync.WaitGroup{}
	for file := range files {
		wg.Add(1)
		go func(file string) {
			defer wg.Done()
			if debug {
				log.Printf("Processing file: %s", file)
			}
			isBinary, err := IsBinaryFile(file)
			if err != nil {
				if debug {
					log.Printf("Error analyzing file %s: %v", file, err)
				}
				return
			}
			ext := filepath.Ext(file)
			extData, _ := extensionTypeCount.LoadOrStore(ext, &ExtData{ext: ext})
			if isBinary {
				atomic.AddInt64(&extData.(*ExtData).binaryCount, 1)
				if debug {
					log.Printf("Binary file detected: %s (%s)", file, ext)
				}
			} else {
				atomic.AddInt64(&extData.(*ExtData).textCount, 1)
				if debug {
					log.Printf("Text file detected: %s (%s)", file, ext)
				}
			}
		}(file)
	}
	wg.Wait()

	extensionTypeCount.Range(func(key, value any) bool {
		extData := value.(*ExtData)
		if extData.ext == "" {
			return true
		}
		// Report an extension as binary when binary files clearly outnumber text files.
		if extData.binaryCount > extData.textCount*2 {
			if *raw {
				fmt.Println(extData.ext)
			} else {
				log.Printf("Extension: %q, Binary Count: %d, Text Count: %d", extData.ext, extData.binaryCount, extData.textCount)
			}
		}
		return true
	})
}

// IsBinaryFile detects whether a file is binary by analyzing a sample of its
// content. It uses multiple heuristics for more reliable detection.
func IsBinaryFile(filename string) (bool, error) {
	file, err := os.Open(filename)
	if err != nil {
		return false, err
	}
	defer file.Close()

	// Read a sample from the file (the first 8 KB is usually enough).
	const sampleSize = 8192
	buffer := make([]byte, sampleSize)
	bytesRead, err := file.Read(buffer)
	if err != nil && err != io.EOF {
		return false, err
	}
	// Trim the buffer to the bytes actually read.
	buffer = buffer[:bytesRead]

	// Null bytes are common in binary files and rare in text files.
	nullCount := 0
	nonPrintableCount := 0
	totalBytes := bytesRead

	for _, b := range buffer {
		if b == 0 {
			nullCount++
		}
		// Count non-printable, non-whitespace bytes. Note that a UTF-8 BOM is
		// the byte sequence 0xEF, 0xBB, 0xBF, not a single byte value.
		if (b < 32 || b > 126) && !isWhitespace(b) {
			nonPrintableCount++
		}
	}

	// Thresholds for binary detection:
	// 1. If more than 1% of the sample is null bytes, the file is likely binary.
	// 2. If more than 20% of the sample is non-printable characters, the file is likely binary.
	nullThreshold := float64(totalBytes) * 0.01
	nonPrintableThreshold := float64(totalBytes) * 0.20

	if debug {
		log.Printf("File: %s, Size: %d, Null bytes: %d, Non-printable: %d", filename, totalBytes, nullCount, nonPrintableCount)
	}

	isBinary := float64(nullCount) > nullThreshold || float64(nonPrintableCount) > nonPrintableThreshold
	return isBinary, nil
}

// isWhitespace reports whether a byte is a whitespace character.
func isWhitespace(b byte) bool {
	switch b {
	case ' ', '\t', '\n', '\r', '\f', '\v':
		return true
	default:
		return false
	}
}

// IsStringBinary is kept for backwards compatibility.
func IsStringBinary(s string) bool {
	if debug {
		log.Printf("Checking if string is binary: %q", s)
	}
	for _, c := range s {
		// 65279 (U+FEFF) is the byte order mark / zero-width no-break space.
		if (c < ' ' || c > '~') && c != 65279 {
			if debug {
				log.Printf("Found non-printable character: %q with code point %d", c, c)
			}
			return true
		}
	}
	if debug {
		log.Println("String is not binary.")
	}
	return false
}

// GetSyncFilesRecursively walks the directory tree rooted at input and sends
// every regular file path it finds on output. Both output and status are
// closed when the walk is complete; no errors are currently reported on status.
func GetSyncFilesRecursively(input string, output chan string, status chan error) {
	defer close(output)
	defer close(status)

	var filesProcessed int32
	var foldersProcessed int32
	var activeWorkers int32

	directories := make(chan string, 100000)
	workerPool := make(chan struct{}, 4000) // bounds the number of concurrent directory readers
	directories <- input

	allDone := make(chan struct{})
	go func() {
		// Completion monitor: when no workers are active and the directory
		// queue is empty, re-check after a short grace period and then signal
		// that the walk is finished.
		go func() {
			for {
				if atomic.LoadInt32(&activeWorkers) == 0 && len(directories) == 0 {
					time.Sleep(10 * time.Millisecond)
					if atomic.LoadInt32(&activeWorkers) == 0 && len(directories) == 0 {
						close(allDone)
						return
					}
				}
				time.Sleep(50 * time.Millisecond)
			}
		}()
		// Dispatcher: hand each queued directory to a worker, bounded by workerPool.
		for directory := range directories {
			atomic.AddInt32(&activeWorkers, 1)
			go func(dir string) {
				workerPool <- struct{}{}
				atomic.AddInt32(&foldersProcessed, 1)
				processDirectory(dir, directories, output, &filesProcessed)
				<-workerPool
				atomic.AddInt32(&activeWorkers, -1)
			}(directory)
		}
	}()
	<-allDone
	if debug {
		log.Printf("Walk complete: %d folders, %d files", atomic.LoadInt32(&foldersProcessed), atomic.LoadInt32(&filesProcessed))
	}
}

// processDirectory lists a single directory, queueing subdirectories for other
// workers and sending regular files straight to output.
func processDirectory(directory string, directories chan<- string, output chan<- string, filesProcessed *int32) {
	files, err := os.ReadDir(directory)
	if err != nil {
		// Unreadable directories (permissions, broken symlinks, ...) are skipped.
		if debug {
			log.Printf("Error reading directory %s: %+v", directory, err)
		}
		return
	}
	for _, file := range files {
		if file.IsDir() {
			directories <- filepath.Join(directory, file.Name())
		} else {
			output <- filepath.Join(directory, file.Name())
			atomic.AddInt32(filesProcessed, 1)
		}
	}
}

// NormalizePath cleans the input path, converts it to forward slashes and
// strips any stray double quotes.
func NormalizePath(input string) string {
	input = filepath.Clean(input)
	input = filepath.ToSlash(input)
	input = strings.ReplaceAll(input, "\"", "")
	return input
}
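
// Example invocation (a sketch, not part of the program; the binary name
// "binext" is only an assumption, build with whatever name you prefer):
//
//	go build -o binext .
//	./binext /path/to/project      // per-extension report: binary vs. text counts
//	./binext -r /path/to/project   // raw mode: print only the matching extensions
//	./binext -d .                  // debug mode: log every file as it is classified
//
// An extension is reported when its binary count exceeds twice its text count;
// files with no extension are ignored.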