package main

// This program walks a directory tree, classifies every file as text or
// binary by sampling its first bytes, and reports the file extensions for
// which binary files outnumber text files by more than two to one.

import (
	"errors"
	"flag"
	"fmt"
	"io"
	"log"
	"os"
	"path/filepath"
	"sort"
	"strings"
	"sync"
	"sync/atomic"
	"time"
)

// Error and Warning are prefixed, colorized loggers configured by init.
// Error writes to stderr, Warning writes to stdout.
var (
	Error   *log.Logger
	Warning *log.Logger
)

// debug enables verbose diagnostic logging. It is set once in main, before
// any worker goroutine is started.
var debug bool

const (
	// binarySampleSize is the number of leading bytes inspected per file.
	binarySampleSize = 8192
	// nullByteRatio is the fraction of NUL bytes above which a sample is
	// considered binary.
	nullByteRatio = 0.01
	// nonPrintableRatio is the fraction of non-printable bytes above which
	// a sample is considered binary.
	nonPrintableRatio = 0.20
	// maxFileWorkers bounds how many files are analyzed (and therefore held
	// open) at the same time.
	maxFileWorkers = 64
	// maxDirReaders bounds how many directories are read at the same time.
	maxDirReaders = 64
	// maxTopChars is how many non-printable byte values the debug report
	// lists.
	maxTopChars = 10
	// byteOrderMark is the Unicode BOM (U+FEFF), which IsStringBinary
	// tolerates.
	byteOrderMark = '\uFEFF'
)

func init() {
	log.SetFlags(log.Lmicroseconds | log.Lshortfile)
	log.SetOutput(os.Stdout)
	Error = log.New(os.Stderr, fmt.Sprintf("%sERROR:%s ", "\033[0;101m", "\033[0m"), log.Lmicroseconds|log.Lshortfile)
	Warning = log.New(os.Stdout, fmt.Sprintf("%sWarning:%s ", "\033[0;93m", "\033[0m"), log.Lmicroseconds|log.Lshortfile)
}

// ExtData holds the number of binary and text files seen for one extension.
type ExtData struct {
	ext         string
	binaryCount int
	textCount   int
}

// main parses the flags (-r for bare output, -d for debug logging), scans the
// directory given as the first positional argument (default "."), and prints
// the extensions that are predominantly binary.
func main() {
	raw := flag.Bool("r", false, "More application friendly output")
	debugF := flag.Bool("d", false, "Debug mode")
	flag.Parse()
	debug = *debugF

	dir := flag.Arg(0)
	if dir == "" {
		dir = "."
	}
	dir = NormalizePath(dir)
	if debug {
		log.Printf("Scanning directory: %s", dir)
	}

	files := make(chan string, 10000)
	status := make(chan error)
	go GetSyncFilesRecursively(dir, files, status)

	reportBinaryExtensions(countExtensions(files), *raw)
}

// countExtensions drains files with a fixed pool of workers, classifies each
// file with IsBinaryFile and tallies the results per extension. Files that
// cannot be analyzed are skipped. It returns once files is closed and every
// worker has finished.
func countExtensions(files <-chan string) map[string]*ExtData {
	var (
		mu     sync.Mutex // guards counts and the ExtData values it holds
		wg     sync.WaitGroup
		counts = make(map[string]*ExtData)
	)

	for i := 0; i < maxFileWorkers; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for file := range files {
				if debug {
					log.Printf("[%s] Processing file", file)
				}
				isBinary, err := IsBinaryFile(file)
				if err != nil {
					if debug {
						log.Printf("[%s] Error analyzing file: %v", file, err)
					}
					continue
				}

				ext := filepath.Ext(file)
				mu.Lock()
				data, ok := counts[ext]
				if !ok {
					data = &ExtData{ext: ext}
					counts[ext] = data
				}
				if isBinary {
					data.binaryCount++
				} else {
					data.textCount++
				}
				mu.Unlock()

				if debug {
					if isBinary {
						log.Printf("[%s] Binary file detected: (%s)", file, ext)
					} else {
						log.Printf("[%s] Text file detected: (%s)", file, ext)
					}
				}
			}
		}()
	}

	wg.Wait()
	return counts
}

// selectBinaryExtensions returns, sorted by extension, the entries whose
// binary count is more than twice their text count. Files without an
// extension are ignored.
func selectBinaryExtensions(counts map[string]*ExtData) []*ExtData {
	selected := make([]*ExtData, 0, len(counts))
	for _, data := range counts {
		if data.ext == "" {
			continue
		}
		if data.binaryCount > data.textCount*2 {
			selected = append(selected, data)
		}
	}
	sort.Slice(selected, func(i, j int) bool { return selected[i].ext < selected[j].ext })
	return selected
}

// reportBinaryExtensions prints the predominantly binary extensions. With raw
// set only the extension is printed, one per line; otherwise a log line with
// both counts is emitted.
func reportBinaryExtensions(counts map[string]*ExtData, raw bool) {
	for _, data := range selectBinaryExtensions(counts) {
		if raw {
			fmt.Println(data.ext)
			continue
		}
		log.Printf("Extension: %q, Binary Count: %d, Text Count: %d", data.ext, data.binaryCount, data.textCount)
	}
}

// sampleStats summarizes the byte classes found in a file sample.
type sampleStats struct {
	total        int // bytes analyzed
	nulls        int // NUL bytes
	nonPrintable int // bytes outside 32..126 that are not whitespace
	control      int // bytes below 32 that are not whitespace
	extended     int // bytes above 127
}

// analyzeSample counts the byte classes present in sample.
func analyzeSample(sample []byte) sampleStats {
	st := sampleStats{total: len(sample)}
	for _, b := range sample {
		if b == 0 {
			st.nulls++
		}
		if b < 32 && !isWhitespace(b) {
			st.control++
		}
		if b > 127 {
			st.extended++
		}
		if isNonPrintable(b) {
			st.nonPrintable++
		}
	}
	return st
}

// tooManyNulls reports whether NUL bytes exceed nullByteRatio of the sample.
func (s sampleStats) tooManyNulls() bool {
	return float64(s.nulls) > float64(s.total)*nullByteRatio
}

// tooManyNonPrintable reports whether non-printable bytes exceed
// nonPrintableRatio of the sample.
func (s sampleStats) tooManyNonPrintable() bool {
	return float64(s.nonPrintable) > float64(s.total)*nonPrintableRatio
}

// isBinary reports whether either threshold is exceeded. An empty sample is
// never binary.
func (s sampleStats) isBinary() bool {
	return s.tooManyNulls() || s.tooManyNonPrintable()
}

// percent returns part as a percentage of total, or 0 when total is 0.
func percent(part, total int) float64 {
	if total == 0 {
		return 0
	}
	return 100.0 * float64(part) / float64(total)
}

// isNonPrintable reports whether b lies outside the printable ASCII range
// 32..126 and is not whitespace.
func isNonPrintable(b byte) bool {
	return (b < 32 || b > 126) && !isWhitespace(b)
}

// IsBinaryFile reports whether the named file looks binary, judging by at
// most its first binarySampleSize bytes. A file is considered binary when
// more than 1% of the sampled bytes are NUL or more than 20% are
// non-printable. Every byte above 126 counts as non-printable, so text that
// consists largely of multi-byte UTF-8 sequences is classified as binary.
//
// An error is returned when the file cannot be opened or read; the boolean is
// meaningless in that case. An empty file is reported as text.
func IsBinaryFile(filename string) (bool, error) {
	if debug {
		log.Printf("[%s] Starting binary detection for file", filename)
	}

	file, err := os.Open(filename)
	if err != nil {
		return false, err
	}
	defer file.Close()

	if debug {
		// The size is informational only, so it is looked up just for the log.
		if info, statErr := file.Stat(); statErr != nil {
			log.Printf("[%s] Failed to get file stats: %v", filename, statErr)
		} else {
			log.Printf("[%s] File size: %d bytes", filename, info.Size())
		}
		log.Printf("[%s] Reading %d byte sample from file", filename, binarySampleSize)
	}

	// ReadFull keeps reading until the buffer is full or the file ends, so a
	// short read cannot shrink the sample. Both EOF variants just mean the
	// file is smaller than the sample size.
	buf := make([]byte, binarySampleSize)
	n, err := io.ReadFull(file, buf)
	if err != nil && !errors.Is(err, io.EOF) && !errors.Is(err, io.ErrUnexpectedEOF) {
		return false, err
	}
	sample := buf[:n]
	if debug {
		log.Printf("[%s] Actually read %d bytes from file", filename, n)
		log.Printf("[%s] Analyzing bytes for binary detection...", filename)
	}

	stats := analyzeSample(sample)
	if debug {
		logSampleStats(filename, sample, stats)
		logVerdict(filename, stats)
	}
	return stats.isBinary(), nil
}

// logSampleStats writes the per-class byte counts of a sample and its most
// frequent non-printable byte values to the debug log.
func logSampleStats(filename string, sample []byte, st sampleStats) {
	log.Printf("[%s] File", filename)
	log.Printf("[%s] Size analyzed: %d bytes", filename, st.total)
	log.Printf("[%s] Null bytes: %d (%.2f%%)", filename, st.nulls, percent(st.nulls, st.total))
	log.Printf("[%s] Non-printable: %d (%.2f%%)", filename, st.nonPrintable, percent(st.nonPrintable, st.total))
	log.Printf("[%s] Control chars: %d (%.2f%%)", filename, st.control, percent(st.control, st.total))
	log.Printf("[%s] Extended ASCII: %d (%.2f%%)", filename, st.extended, percent(st.extended, st.total))
	log.Printf("[%s] Thresholds: nulls > %.2f%%, non-printable > %.2f%%",
		filename, float64(100*nullByteRatio), float64(100*nonPrintableRatio))

	if st.nonPrintable == 0 {
		return
	}

	type charCount struct {
		char  byte
		count int
	}
	var freq [256]int
	for _, b := range sample {
		freq[b]++
	}
	var top []charCount
	for value, count := range freq {
		if count > 0 && isNonPrintable(byte(value)) {
			top = append(top, charCount{byte(value), count})
		}
	}
	// Most frequent first; ties are broken by byte value so the report is
	// stable between runs.
	sort.Slice(top, func(i, j int) bool {
		if top[i].count != top[j].count {
			return top[i].count > top[j].count
		}
		return top[i].char < top[j].char
	})
	if len(top) > maxTopChars {
		top = top[:maxTopChars]
	}

	log.Printf("[%s] Top non-printable characters:", filename)
	for _, pair := range top {
		log.Printf("[%s] Byte 0x%02X: %d occurrences (%.2f%%)",
			filename, pair.char, pair.count, percent(pair.count, st.total))
	}
}

// logVerdict writes the classification result and the thresholds that
// triggered it to the debug log.
func logVerdict(filename string, st sampleStats) {
	if !st.isBinary() {
		log.Printf("[%s] RESULT: %s is detected as TEXT file", filename, filename)
		return
	}
	log.Printf("[%s] RESULT: %s is detected as BINARY file", filename, filename)
	if st.tooManyNulls() {
		log.Printf("[%s] - Detected as binary due to null bytes: %.2f%% > threshold %.2f%%",
			filename, percent(st.nulls, st.total), float64(100*nullByteRatio))
	}
	if st.tooManyNonPrintable() {
		log.Printf("[%s] - Detected as binary due to non-printable chars: %.2f%% > threshold %.2f%%",
			filename, percent(st.nonPrintable, st.total), float64(100*nonPrintableRatio))
	}
}

// isWhitespace reports whether b is a space, tab, newline, carriage return,
// form feed or vertical tab.
func isWhitespace(b byte) bool {
	switch b {
	case ' ', '\t', '\n', '\r', '\f', '\v':
		return true
	default:
		return false
	}
}

// IsStringBinary is kept for backwards compatibility. It reports whether s
// contains any rune outside the printable ASCII range ' '..'~', other than
// the byte order mark. Note that tabs and line breaks count as binary here.
func IsStringBinary(s string) bool {
	if debug {
		log.Printf("Checking if string is binary: %q", s)
	}
	for _, c := range s {
		// A byte order mark is tolerated so that BOM-prefixed text is not
		// reported as binary.
		if (c < ' ' || c > '~') && c != byteOrderMark {
			if debug {
				log.Printf("Found non-printable character: '%c' with ASCII value %d", c, c)
			}
			return true
		}
	}
	if debug {
		log.Println("String is not binary.")
	}
	return false
}

// GetSyncFilesRecursively walks the tree rooted at input concurrently and
// sends the path of every non-directory entry to output. Directories that
// cannot be read are skipped. Both output and status are closed when the walk
// is complete; nothing is ever sent on status.
//
// The caller must keep receiving from output until it is closed, otherwise
// the walk blocks.
func GetSyncFilesRecursively(input string, output chan string, status chan error) {
	defer close(output)
	defer close(status)

	start := time.Now()
	var (
		filesProcessed   int32
		foldersProcessed int32
		// pending counts directories that were discovered but are not fully
		// processed yet. It is incremented before a directory's goroutine is
		// started, so it cannot drop to zero while work remains.
		pending sync.WaitGroup
	)
	readers := make(chan struct{}, maxDirReaders)

	var walk func(dir string)
	walk = func(dir string) {
		defer pending.Done()

		// processDirectory reports subdirectories through a channel. A
		// private one per directory lets this goroutine register each
		// subdirectory with pending before its own Done runs.
		subdirs := make(chan string)
		go func() {
			defer close(subdirs)
			readers <- struct{}{}
			defer func() { <-readers }()
			atomic.AddInt32(&foldersProcessed, 1)
			processDirectory(dir, subdirs, output, &filesProcessed)
		}()

		for sub := range subdirs {
			pending.Add(1)
			go walk(sub)
		}
	}

	pending.Add(1)
	go walk(input)
	pending.Wait()

	if debug {
		log.Printf("Scanned %d folders and %d files in %s",
			atomic.LoadInt32(&foldersProcessed), atomic.LoadInt32(&filesProcessed), time.Since(start))
	}
}

// processDirectory reads a single directory. The path of each subdirectory is
// sent to directories, the path of every other entry is sent to output, and
// filesProcessed is atomically incremented per entry sent to output. A
// directory that cannot be read yields nothing.
func processDirectory(directory string, directories chan<- string, output chan<- string, filesProcessed *int32) {
	entries, err := os.ReadDir(directory)
	if err != nil {
		// Unreadable directories are skipped on purpose (best-effort scan).
		if debug {
			log.Printf("Error reading directory %s: %v", directory, err)
		}
		return
	}

	for _, entry := range entries {
		path := filepath.Join(directory, entry.Name())
		if entry.IsDir() {
			directories <- path
			continue
		}
		output <- path
		atomic.AddInt32(filesProcessed, 1)
	}
}

// NormalizePath removes double quotes from input, cleans the result with
// filepath.Clean and converts separators to forward slashes. Quotes are
// removed first so that a quoted argument is cleaned like an unquoted one.
func NormalizePath(input string) string {
	input = strings.ReplaceAll(input, "\"", "")
	input = filepath.Clean(input)
	return filepath.ToSlash(input)
}