From 098d642a86d7ec5358deb8eec31183598257137b Mon Sep 17 00:00:00 2001
From: PhatPhuckDave
Date: Wed, 12 Mar 2025 22:01:55 +0100
Subject: [PATCH] Make more better distinction detection

---
 main.go | 112 ++++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 88 insertions(+), 24 deletions(-)

diff --git a/main.go b/main.go
index 9eeabf1..aeb360c 100644
--- a/main.go
+++ b/main.go
@@ -1,7 +1,8 @@
 package main
 
 import (
-	"bufio"
+	"errors"
 	"flag"
 	"fmt"
+	"io"
 	"log"
@@ -53,8 +54,10 @@ func main() {
 	files := make(chan string, 10000)
 	status := make(chan error)
 	go GetSyncFilesRecursively(dir, files, status)
+	//files <- "SmarterConstruction.pdb"
 
 	extensionTypeCount := sync.Map{}
+
 	wg := sync.WaitGroup{}
 	for file := range files {
 		wg.Add(1)
@@ -63,33 +66,26 @@ func main() {
 			if debug {
 				log.Printf("Processing file: %s", file) // Log the file being processed
 			}
-			f, err := os.Open(file)
+
+			isBinary, err := IsBinaryFile(file)
 			if err != nil {
 				if debug {
-					log.Printf("Error opening file %s: %v", file, err)
+					log.Printf("Error analyzing file %s: %v", file, err)
 				}
 				return
 			}
-			defer f.Close() // Ensure the file is closed after processing
 
-			scanner := bufio.NewScanner(f)
-			if scanner.Scan() {
-				ext := filepath.Ext(file)
-				extData, _ := extensionTypeCount.LoadOrStore(ext, &ExtData{ext: ext, binaryCount: 0, textCount: 0})
-				if IsStringBinary(scanner.Text()) {
-					extData.(*ExtData).binaryCount++
-					if debug {
-						log.Printf("Binary file detected: %s (%s)", file, ext) // Log binary file detection
-					}
-				} else {
-					extData.(*ExtData).textCount++
-					if debug {
-						log.Printf("Text file detected: %s (%s)", file, ext) // Log text file detection
-					}
-				}
-			} else if err := scanner.Err(); err != nil {
+			ext := filepath.Ext(file)
+			extData, _ := extensionTypeCount.LoadOrStore(ext, &ExtData{ext: ext, binaryCount: 0, textCount: 0})
+			if isBinary {
+				extData.(*ExtData).binaryCount++
 				if debug {
-					log.Printf("Error reading line from file %s: %v", file, err)
+					log.Printf("Binary file detected: %s (%s)", file, ext)
+				}
+			} else {
+				extData.(*ExtData).textCount++
+				if debug {
+					log.Printf("Text file detected: %s (%s)", file, ext)
 				}
 			}
 		}(file)
@@ -105,15 +101,83 @@ func main() {
 			if *raw {
 				fmt.Println(extData.ext)
 			} else {
-				if debug {
-					log.Printf("Extension: %q, Binary Count: %d, Text Count: %d", extData.ext, extData.binaryCount, extData.textCount)
-				}
+				log.Printf("Extension: %q, Binary Count: %d, Text Count: %d", extData.ext, extData.binaryCount, extData.textCount)
 			}
 		}
 		return true
 	})
 }
 
+// IsBinaryFile reports whether the file at filename looks binary, judging by
+// a sample of at most 8192 bytes read from the start of the file.
+//
+// The sample is treated as binary when more than 1% of its bytes are NUL, or
+// when more than 20% of its bytes fall outside printable ASCII (32..126) and
+// are not ASCII whitespace. An empty file is reported as text. Every byte
+// above 126 counts as non-printable, so text made up mostly of multi-byte
+// UTF-8 sequences can cross the 20% threshold and be reported as binary.
+//
+// A non-nil error means the file could not be opened or read; the boolean
+// result is false in that case.
+func IsBinaryFile(filename string) (bool, error) {
+	file, err := os.Open(filename)
+	if err != nil {
+		return false, err
+	}
+	defer file.Close()
+
+	// The first 8KB is usually enough to tell text from binary.
+	const sampleSize = 8192
+	buffer := make([]byte, sampleSize)
+
+	// io.ReadFull keeps reading until the buffer is full or the file ends, so
+	// a short read cannot shrink the sample. A file smaller than sampleSize
+	// ends the read with io.EOF (empty) or io.ErrUnexpectedEOF (partial);
+	// neither is a failure here. The sentinels are matched with errors.Is
+	// instead of comparing the error text.
+	bytesRead, err := io.ReadFull(file, buffer)
+	if err != nil && !errors.Is(err, io.EOF) && !errors.Is(err, io.ErrUnexpectedEOF) {
+		return false, err
+	}
+	buffer = buffer[:bytesRead]
+
+	nullCount := 0
+	nonPrintableCount := 0
+	for _, b := range buffer {
+		if b == 0 {
+			nullCount++
+		}
+		// A NUL byte is also below 32, so it lands in both tallies.
+		if (b < 32 || b > 126) && !isWhitespace(b) {
+			nonPrintableCount++
+		}
+	}
+
+	if debug {
+		log.Printf("File: %s, Size: %d, Null bytes: %d, Non-printable: %d",
+			filename, bytesRead, nullCount, nonPrintableCount)
+	}
+
+	// Both thresholds are fractions of the sample size, not of the whole file.
+	nullThreshold := float64(bytesRead) * 0.01
+	nonPrintableThreshold := float64(bytesRead) * 0.20
+
+	return float64(nullCount) > nullThreshold ||
+		float64(nonPrintableCount) > nonPrintableThreshold, nil
+}
+
+// isWhitespace reports whether b is an ASCII whitespace byte: space, tab,
+// newline, carriage return, form feed or vertical tab.
+func isWhitespace(b byte) bool {
+	switch b {
+	case ' ', '\t', '\n', '\r', '\f', '\v':
+		return true
+	default:
+		return false
+	}
+}
+
+// IsStringBinary is kept for backwards compatibility
 func IsStringBinary(s string) bool {
 	if debug {
 		log.Printf("Checking if string is binary: %q", s)