diff --git a/main.go b/main.go index aeb360c..d463aca 100644 --- a/main.go +++ b/main.go @@ -6,6 +6,7 @@ import ( "log" "os" "path/filepath" + "sort" "strings" "sync" "sync/atomic" @@ -62,13 +63,13 @@ func main() { go func(file string) { defer wg.Done() if debug { - log.Printf("Processing file: %s", file) // Log the file being processed + log.Printf("[%s] Processing file", file) // Log the file being processed } isBinary, err := IsBinaryFile(file) if err != nil { if debug { - log.Printf("Error analyzing file %s: %v", file, err) + log.Printf("[%s] Error analyzing file: %v", file, err) } return } @@ -78,12 +79,12 @@ func main() { if isBinary { extData.(*ExtData).binaryCount++ if debug { - log.Printf("Binary file detected: %s (%s)", file, ext) + log.Printf("[%s] Binary file detected: (%s)", file, ext) } } else { extData.(*ExtData).textCount++ if debug { - log.Printf("Text file detected: %s (%s)", file, ext) + log.Printf("[%s] Text file detected: (%s)", file, ext) } } }(file) @@ -109,41 +110,91 @@ func main() { // IsBinaryFile detects if a file is binary by analyzing a sample of its content // It uses multiple heuristics for more reliable detection func IsBinaryFile(filename string) (bool, error) { + if debug { + log.Printf("[%s] Starting binary detection for file", filename) + } + // Open the file file, err := os.Open(filename) if err != nil { + if debug { + log.Printf("[%s] Failed to open file: %v", filename, err) + } return false, err } defer file.Close() + // Get file info for size + fileInfo, err := file.Stat() + if err != nil { + if debug { + log.Printf("[%s] Failed to get file stats: %v", filename, err) + } + } else if debug { + log.Printf("[%s] File size: %d bytes", filename, fileInfo.Size()) + } + // Create a buffer to read a sample (first 8KB is usually enough) // Adjust the buffer size as needed const sampleSize = 8192 buffer := make([]byte, sampleSize) // Read a sample from the file + if debug { + log.Printf("[%s] Reading %d byte sample from file", filename, sampleSize) + } bytesRead, err := file.Read(buffer) if err != nil && err.Error() != "EOF" { + if debug { + log.Printf("[%s] Error reading from file: %v", filename, err) + } return false, err } // Adjust buffer to actual bytes read buffer = buffer[:bytesRead] + if debug { + log.Printf("[%s] Actually read %d bytes from file", filename, bytesRead) + } // Null byte check - common in binary files, rare in text files nullCount := 0 nonPrintableCount := 0 + controlCharCount := 0 + extendedAsciiCount := 0 + + // Character frequency map (for debug) + charFreq := make(map[byte]int) // Count of characters analyzed totalBytes := bytesRead + if debug { + log.Printf("[%s] Analyzing bytes for binary detection...", filename) + } + // Check each byte in the sample for _, b := range buffer { + // Update character frequency (debug only) + if debug { + charFreq[b]++ + } + // Count null bytes if b == 0 { nullCount++ } + // Track control characters (0-31 except whitespace) + if b < 32 && !isWhitespace(b) { + controlCharCount++ + } + + // Track extended ASCII + if b > 127 { + extendedAsciiCount++ + } + // Count non-printable, non-whitespace characters // BOM in UTF-8 is represented by bytes 0xEF,0xBB,0xBF, not a single byte value if (b < 32 || b > 126) && !isWhitespace(b) { @@ -157,13 +208,85 @@ func IsBinaryFile(filename string) (bool, error) { nullThreshold := float64(totalBytes) * 0.01 nonPrintableThreshold := float64(totalBytes) * 0.20 + nullPercentage := 0.0 + nonPrintablePercentage := 0.0 + controlCharPercentage := 0.0 + extendedAsciiPercentage := 0.0 + + if totalBytes > 0 { + nullPercentage = 100.0 * float64(nullCount) / float64(totalBytes) + nonPrintablePercentage = 100.0 * float64(nonPrintableCount) / float64(totalBytes) + controlCharPercentage = 100.0 * float64(controlCharCount) / float64(totalBytes) + extendedAsciiPercentage = 100.0 * float64(extendedAsciiCount) / float64(totalBytes) + } + if debug { - log.Printf("File: %s, Size: %d, Null bytes: %d, Non-printable: %d", - filename, totalBytes, nullCount, nonPrintableCount) + log.Printf("[%s] File", filename) + log.Printf("[%s] Size analyzed: %d bytes", filename, totalBytes) + log.Printf("[%s] Null bytes: %d (%.2f%%)", filename, nullCount, nullPercentage) + log.Printf("[%s] Non-printable: %d (%.2f%%)", filename, nonPrintableCount, nonPrintablePercentage) + log.Printf("[%s] Control chars: %d (%.2f%%)", filename, controlCharCount, controlCharPercentage) + log.Printf("[%s] Extended ASCII: %d (%.2f%%)", filename, extendedAsciiCount, extendedAsciiPercentage) + log.Printf("[%s] Thresholds: nulls > %.2f%%, non-printable > %.2f%%", + filename, + 100.0*nullThreshold/float64(totalBytes), + 100.0*nonPrintableThreshold/float64(totalBytes)) + + // Print top 10 most frequent non-printable characters if any were found + if nonPrintableCount > 0 { + type charCountPair struct { + char byte + count int + } + + // Filter to non-printable chars and sort by frequency + nonPrintableChars := []charCountPair{} + for char, count := range charFreq { + if (char < 32 || char > 126) && !isWhitespace(char) { + nonPrintableChars = append(nonPrintableChars, charCountPair{char, count}) + } + } + + // Sort by frequency (descending) + if len(nonPrintableChars) > 0 { + sort.Slice(nonPrintableChars, func(i, j int) bool { + return nonPrintableChars[i].count > nonPrintableChars[j].count + }) + + // Print top 10 or fewer + log.Printf("[%s] Top non-printable characters:", filename) + maxToShow := 10 + if len(nonPrintableChars) < maxToShow { + maxToShow = len(nonPrintableChars) + } + for i := 0; i < maxToShow; i++ { + pair := nonPrintableChars[i] + log.Printf("[%s] Byte 0x%02X: %d occurrences (%.2f%%)", + filename, pair.char, pair.count, + 100.0*float64(pair.count)/float64(totalBytes)) + } + } + } } isBinary := float64(nullCount) > nullThreshold || float64(nonPrintableCount) > nonPrintableThreshold + if debug { + if isBinary { + log.Printf("[%s] RESULT: %s is detected as BINARY file", filename, filename) + if float64(nullCount) > nullThreshold { + log.Printf("[%s] - Detected as binary due to null bytes: %.2f%% > threshold %.2f%%", + filename, nullPercentage, 100.0*nullThreshold/float64(totalBytes)) + } + if float64(nonPrintableCount) > nonPrintableThreshold { + log.Printf("[%s] - Detected as binary due to non-printable chars: %.2f%% > threshold %.2f%%", + filename, nonPrintablePercentage, 100.0*nonPrintableThreshold/float64(totalBytes)) + } + } else { + log.Printf("[%s] RESULT: %s is detected as TEXT file", filename, filename) + } + } + return isBinary, nil }