// Command binarysusser walks a directory tree, classifies every file as
// binary or text by sampling its content, and reports the file extensions
// whose files are predominantly binary.
package main

import (
	"errors"
	"flag"
	"fmt"
	"io"
	"log"
	"os"
	"path/filepath"
	"sort"
	"strings"
	"sync"

	logger "git.site.quack-lab.dev/dave/cylogger"
	utils "git.site.quack-lab.dev/dave/cyutils"
	"github.com/bmatcuk/doublestar/v4"
)

// ExtData accumulates, for one file extension, how many files were classified
// as binary and how many as text. The counters are updated from several
// workers at once, so all access goes through mu.
type ExtData struct {
	mu          sync.Mutex
	ext         string
	binaryCount int
	textCount   int
}

// record increments the binary or the text counter under the lock.
func (e *ExtData) record(isBinary bool) {
	e.mu.Lock()
	defer e.mu.Unlock()
	if isBinary {
		e.binaryCount++
		return
	}
	e.textCount++
}

// counts returns a consistent snapshot of both counters.
func (e *ExtData) counts() (binaryCount, textCount int) {
	e.mu.Lock()
	defer e.mu.Unlock()
	return e.binaryCount, e.textCount
}

var debug bool

func main() {
	track := flag.Bool("t", false, "Git add and commit the results")
	flag.Parse()
	logger.InitFlag()
	logger.Info("Starting binarysusser")
	if *track {
		// The flag is accepted so existing invocations keep working, but
		// nothing visible in this file acts on it yet.
		logger.Info("Flag -t is set, but git tracking is not implemented")
	}

	dir := flag.Arg(0)
	if dir == "" {
		dir = "."
	}
	logger.Info("Scanning directory: %s", dir)
	dir = NormalizePath(dir)
	logger.Info("Normalized directory: %s", dir)

	files, err := doublestar.Glob(os.DirFS(dir), "**/*")
	if err != nil {
		logger.Error("Error globbing directory: %v", err)
		os.Exit(1)
	}
	logger.Info("Found %d files", len(files))

	var extensionTypeCount sync.Map
	utils.WithWorkers(20, files, func(worker, i int, file string) {
		filelog := logger.Default.WithPrefix(fmt.Sprintf("file=%s", file))
		filelog.Debug("Processing file")

		// Glob results are relative to dir, not to the working directory,
		// so they must be joined with dir before touching the filesystem.
		path := filepath.Join(dir, filepath.FromSlash(file))

		// The "**/*" pattern also matches directories; they have no content
		// to classify.
		info, err := os.Stat(path)
		if err != nil {
			filelog.Error("Error analyzing file: %v", err)
			return
		}
		if info.IsDir() {
			filelog.Debug("Skipping directory")
			return
		}

		isBinary, err := IsBinaryFile(path)
		if err != nil {
			filelog.Error("Error analyzing file: %v", err)
			return
		}

		ext := filepath.Ext(file)
		entry, _ := extensionTypeCount.LoadOrStore(ext, &ExtData{ext: ext})
		entry.(*ExtData).record(isBinary)
		if isBinary {
			filelog.Debug("Binary file detected: (%s)", ext)
		} else {
			filelog.Debug("Text file detected: (%s)", ext)
		}
	})
	logger.Info("Processing complete")

	extensionTypeCount.Range(func(key, value any) bool {
		extData := value.(*ExtData)
		extlog := logger.Default.WithPrefix(fmt.Sprintf("ext=%s", extData.ext))
		if extData.ext == "" {
			extlog.Debug("Skipping empty extension")
			return true
		}
		binaryCount, textCount := extData.counts()
		// Report only extensions with more than twice as many binary files
		// as text files.
		if binaryCount > textCount*2 {
			extlog.Info("Extension: %q, Binary Count: %d, Text Count: %d", extData.ext, binaryCount, textCount)
		}
		return true
	})
}

// IsBinaryFile reports whether the file at filename looks binary, judged from
// a sample of at most the first 8 KiB of its content.
//
// The file is considered binary when more than 1% of the sampled bytes are
// NUL, or more than 20% are non-printable (outside 32..126 and not
// whitespace). An empty file is reported as text. A non-nil error is returned
// when the file cannot be opened, inspected, or read.
func IsBinaryFile(filename string) (bool, error) {
	const (
		sampleSize           = 8192
		nullFraction         = 0.01
		nonPrintableFraction = 0.20
	)

	filelog := logger.Default.WithPrefix(fmt.Sprintf("file=%s", filename))
	filelog.Debug("Starting binary detection for file")

	file, err := os.Open(filename)
	if err != nil {
		filelog.Error("Failed to open file: %v", err)
		return false, err
	}
	defer file.Close()

	fileInfo, err := file.Stat()
	if err != nil {
		filelog.Error("Failed to get file stats: %v", err)
		return false, err
	}
	filelog.Debug("File size: %d bytes", fileInfo.Size())

	buffer := make([]byte, sampleSize)
	filelog.Debug("Reading %d byte sample from file", sampleSize)
	// ReadFull keeps reading until the buffer is full or the file ends; a
	// file shorter than the sample ends with io.EOF or io.ErrUnexpectedEOF,
	// neither of which is a failure here.
	bytesRead, err := io.ReadFull(file, buffer)
	if err != nil && !errors.Is(err, io.EOF) && !errors.Is(err, io.ErrUnexpectedEOF) {
		filelog.Error("Error reading from file: %v", err)
		return false, err
	}
	buffer = buffer[:bytesRead]
	filelog.Debug("Actually read %d bytes from file", bytesRead)

	var (
		nullCount          int
		nonPrintableCount  int
		controlCharCount   int
		extendedAsciiCount int
		charFreq           [256]int // per-byte-value histogram, used for the diagnostic dump below
	)
	totalBytes := bytesRead

	filelog.Debug("Analyzing bytes for binary detection...")
	for _, b := range buffer {
		charFreq[b]++
		if b == 0 {
			nullCount++
		}
		// Control characters: 0-31 except whitespace.
		if b < 32 && !isWhitespace(b) {
			controlCharCount++
		}
		if b > 127 {
			extendedAsciiCount++
		}
		// A UTF-8 BOM is the three bytes 0xEF,0xBB,0xBF, so it counts as
		// three non-printable bytes here.
		if isNonPrintable(b) {
			nonPrintableCount++
		}
	}

	// percent expresses count as a percentage of the sample, 0 for an empty sample.
	percent := func(count int) float64 {
		if totalBytes == 0 {
			return 0
		}
		return 100.0 * float64(count) / float64(totalBytes)
	}

	nullThreshold := float64(totalBytes) * nullFraction
	nonPrintableThreshold := float64(totalBytes) * nonPrintableFraction
	nullPercentage := percent(nullCount)
	nonPrintablePercentage := percent(nonPrintableCount)

	filelog.Trace("File size analyzed: %d bytes", totalBytes)
	filelog.Trace("Null bytes: %d (%.2f%%)", nullCount, nullPercentage)
	filelog.Trace("Non-printable: %d (%.2f%%)", nonPrintableCount, nonPrintablePercentage)
	filelog.Trace("Control chars: %d (%.2f%%)", controlCharCount, percent(controlCharCount))
	filelog.Trace("Extended ASCII: %d (%.2f%%)", extendedAsciiCount, percent(extendedAsciiCount))
	// The thresholds are fixed percentages; printing the constants avoids a
	// division by zero (NaN) on empty files.
	filelog.Trace("Thresholds: nulls > %.2f%%, non-printable > %.2f%%",
		100.0*nullFraction, 100.0*nonPrintableFraction)

	// Dump the ten most frequent non-printable byte values, if any.
	if nonPrintableCount > 0 {
		type charCountPair struct {
			char  byte
			count int
		}
		var nonPrintableChars []charCountPair
		for value, count := range charFreq {
			if count > 0 && isNonPrintable(byte(value)) {
				nonPrintableChars = append(nonPrintableChars, charCountPair{byte(value), count})
			}
		}
		// Stable sort over an array walked in byte order: ties are listed by
		// ascending byte value, so the output is deterministic.
		sort.SliceStable(nonPrintableChars, func(i, j int) bool {
			return nonPrintableChars[i].count > nonPrintableChars[j].count
		})

		log.Printf("[%s] Top non-printable characters:", filename)
		maxToShow := 10
		if len(nonPrintableChars) < maxToShow {
			maxToShow = len(nonPrintableChars)
		}
		for _, pair := range nonPrintableChars[:maxToShow] {
			log.Printf("[%s] Byte 0x%02X: %d occurrences (%.2f%%)",
				filename, pair.char, pair.count, percent(pair.count))
		}
	}

	tooManyNulls := float64(nullCount) > nullThreshold
	tooManyNonPrintable := float64(nonPrintableCount) > nonPrintableThreshold
	isBinary := tooManyNulls || tooManyNonPrintable

	if !isBinary {
		filelog.Debug("File is detected as TEXT file")
		return false, nil
	}

	filelog.Debug("File is detected as BINARY file")
	if tooManyNulls {
		filelog.Trace("Detected as binary due to null bytes: %.2f%% > threshold %.2f%%",
			nullPercentage, 100.0*nullFraction)
	}
	if tooManyNonPrintable {
		filelog.Trace("Detected as binary due to non-printable chars: %.2f%% > threshold %.2f%%",
			nonPrintablePercentage, 100.0*nonPrintableFraction)
	}
	return true, nil
}

// isNonPrintable reports whether b lies outside printable ASCII (32..126) and
// is not one of the whitespace bytes accepted by isWhitespace.
func isNonPrintable(b byte) bool {
	return (b < 32 || b > 126) && !isWhitespace(b)
}

// isWhitespace reports whether b is space, tab, newline, carriage return,
// form feed, or vertical tab.
func isWhitespace(b byte) bool {
	switch b {
	case ' ', '\t', '\n', '\r', '\f', '\v':
		return true
	default:
		return false
	}
}

// IsStringBinary reports whether s contains any rune outside the printable
// ASCII range ' '..'~', ignoring the byte-order mark U+FEFF.
//
// Note that tabs and newlines are outside that range and therefore make the
// string count as binary. It is kept for backwards compatibility.
func IsStringBinary(s string) bool {
	filelog := logger.Default.WithPrefix(fmt.Sprintf("file=%s", s))
	filelog.Debug("Checking if string is binary: %q", s)
	for _, c := range s {
		// 65279 is U+FEFF, the byte-order mark some editors prepend to text.
		if (c < ' ' || c > '~') && c != 65279 {
			filelog.Trace("Found non-printable character: '%c' with ASCII value %d", c, c)
			return true
		}
	}
	filelog.Debug("String is not binary.")
	return false
}

// NormalizePath strips double quotes from input, cleans the result with
// filepath.Clean, and converts separators to forward slashes.
//
// Quotes are removed before cleaning so that an input consisting only of
// quotes normalizes to "." rather than an empty path, and a path such as
// `dir/"` does not keep a trailing slash.
func NormalizePath(input string) string {
	input = strings.ReplaceAll(input, "\"", "")
	input = filepath.Clean(input)
	return filepath.ToSlash(input)
}