More betterer logging
This commit is contained in:
135
main.go
135
main.go
@@ -6,6 +6,7 @@ import (
|
|||||||
"log"
|
"log"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
"sync/atomic"
|
"sync/atomic"
|
||||||
@@ -62,13 +63,13 @@ func main() {
|
|||||||
go func(file string) {
|
go func(file string) {
|
||||||
defer wg.Done()
|
defer wg.Done()
|
||||||
if debug {
|
if debug {
|
||||||
log.Printf("Processing file: %s", file) // Log the file being processed
|
log.Printf("[%s] Processing file", file) // Log the file being processed
|
||||||
}
|
}
|
||||||
|
|
||||||
isBinary, err := IsBinaryFile(file)
|
isBinary, err := IsBinaryFile(file)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if debug {
|
if debug {
|
||||||
log.Printf("Error analyzing file %s: %v", file, err)
|
log.Printf("[%s] Error analyzing file: %v", file, err)
|
||||||
}
|
}
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@@ -78,12 +79,12 @@ func main() {
|
|||||||
if isBinary {
|
if isBinary {
|
||||||
extData.(*ExtData).binaryCount++
|
extData.(*ExtData).binaryCount++
|
||||||
if debug {
|
if debug {
|
||||||
log.Printf("Binary file detected: %s (%s)", file, ext)
|
log.Printf("[%s] Binary file detected: (%s)", file, ext)
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
extData.(*ExtData).textCount++
|
extData.(*ExtData).textCount++
|
||||||
if debug {
|
if debug {
|
||||||
log.Printf("Text file detected: %s (%s)", file, ext)
|
log.Printf("[%s] Text file detected: (%s)", file, ext)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}(file)
|
}(file)
|
||||||
@@ -109,41 +110,91 @@ func main() {
|
|||||||
// IsBinaryFile detects if a file is binary by analyzing a sample of its content
|
// IsBinaryFile detects if a file is binary by analyzing a sample of its content
|
||||||
// It uses multiple heuristics for more reliable detection
|
// It uses multiple heuristics for more reliable detection
|
||||||
func IsBinaryFile(filename string) (bool, error) {
|
func IsBinaryFile(filename string) (bool, error) {
|
||||||
|
if debug {
|
||||||
|
log.Printf("[%s] Starting binary detection for file", filename)
|
||||||
|
}
|
||||||
|
|
||||||
// Open the file
|
// Open the file
|
||||||
file, err := os.Open(filename)
|
file, err := os.Open(filename)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
if debug {
|
||||||
|
log.Printf("[%s] Failed to open file: %v", filename, err)
|
||||||
|
}
|
||||||
return false, err
|
return false, err
|
||||||
}
|
}
|
||||||
defer file.Close()
|
defer file.Close()
|
||||||
|
|
||||||
|
// Get file info for size
|
||||||
|
fileInfo, err := file.Stat()
|
||||||
|
if err != nil {
|
||||||
|
if debug {
|
||||||
|
log.Printf("[%s] Failed to get file stats: %v", filename, err)
|
||||||
|
}
|
||||||
|
} else if debug {
|
||||||
|
log.Printf("[%s] File size: %d bytes", filename, fileInfo.Size())
|
||||||
|
}
|
||||||
|
|
||||||
// Create a buffer to read a sample (first 8KB is usually enough)
|
// Create a buffer to read a sample (first 8KB is usually enough)
|
||||||
// Adjust the buffer size as needed
|
// Adjust the buffer size as needed
|
||||||
const sampleSize = 8192
|
const sampleSize = 8192
|
||||||
buffer := make([]byte, sampleSize)
|
buffer := make([]byte, sampleSize)
|
||||||
|
|
||||||
// Read a sample from the file
|
// Read a sample from the file
|
||||||
|
if debug {
|
||||||
|
log.Printf("[%s] Reading %d byte sample from file", filename, sampleSize)
|
||||||
|
}
|
||||||
bytesRead, err := file.Read(buffer)
|
bytesRead, err := file.Read(buffer)
|
||||||
if err != nil && err.Error() != "EOF" {
|
if err != nil && err.Error() != "EOF" {
|
||||||
|
if debug {
|
||||||
|
log.Printf("[%s] Error reading from file: %v", filename, err)
|
||||||
|
}
|
||||||
return false, err
|
return false, err
|
||||||
}
|
}
|
||||||
|
|
||||||
// Adjust buffer to actual bytes read
|
// Adjust buffer to actual bytes read
|
||||||
buffer = buffer[:bytesRead]
|
buffer = buffer[:bytesRead]
|
||||||
|
if debug {
|
||||||
|
log.Printf("[%s] Actually read %d bytes from file", filename, bytesRead)
|
||||||
|
}
|
||||||
|
|
||||||
// Null byte check - common in binary files, rare in text files
|
// Null byte check - common in binary files, rare in text files
|
||||||
nullCount := 0
|
nullCount := 0
|
||||||
nonPrintableCount := 0
|
nonPrintableCount := 0
|
||||||
|
controlCharCount := 0
|
||||||
|
extendedAsciiCount := 0
|
||||||
|
|
||||||
|
// Character frequency map (for debug)
|
||||||
|
charFreq := make(map[byte]int)
|
||||||
|
|
||||||
// Count of characters analyzed
|
// Count of characters analyzed
|
||||||
totalBytes := bytesRead
|
totalBytes := bytesRead
|
||||||
|
|
||||||
|
if debug {
|
||||||
|
log.Printf("[%s] Analyzing bytes for binary detection...", filename)
|
||||||
|
}
|
||||||
|
|
||||||
// Check each byte in the sample
|
// Check each byte in the sample
|
||||||
for _, b := range buffer {
|
for _, b := range buffer {
|
||||||
|
// Update character frequency (debug only)
|
||||||
|
if debug {
|
||||||
|
charFreq[b]++
|
||||||
|
}
|
||||||
|
|
||||||
// Count null bytes
|
// Count null bytes
|
||||||
if b == 0 {
|
if b == 0 {
|
||||||
nullCount++
|
nullCount++
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Track control characters (0-31 except whitespace)
|
||||||
|
if b < 32 && !isWhitespace(b) {
|
||||||
|
controlCharCount++
|
||||||
|
}
|
||||||
|
|
||||||
|
// Track extended ASCII
|
||||||
|
if b > 127 {
|
||||||
|
extendedAsciiCount++
|
||||||
|
}
|
||||||
|
|
||||||
// Count non-printable, non-whitespace characters
|
// Count non-printable, non-whitespace characters
|
||||||
// BOM in UTF-8 is represented by bytes 0xEF,0xBB,0xBF, not a single byte value
|
// BOM in UTF-8 is represented by bytes 0xEF,0xBB,0xBF, not a single byte value
|
||||||
if (b < 32 || b > 126) && !isWhitespace(b) {
|
if (b < 32 || b > 126) && !isWhitespace(b) {
|
||||||
@@ -157,13 +208,85 @@ func IsBinaryFile(filename string) (bool, error) {
|
|||||||
nullThreshold := float64(totalBytes) * 0.01
|
nullThreshold := float64(totalBytes) * 0.01
|
||||||
nonPrintableThreshold := float64(totalBytes) * 0.20
|
nonPrintableThreshold := float64(totalBytes) * 0.20
|
||||||
|
|
||||||
|
nullPercentage := 0.0
|
||||||
|
nonPrintablePercentage := 0.0
|
||||||
|
controlCharPercentage := 0.0
|
||||||
|
extendedAsciiPercentage := 0.0
|
||||||
|
|
||||||
|
if totalBytes > 0 {
|
||||||
|
nullPercentage = 100.0 * float64(nullCount) / float64(totalBytes)
|
||||||
|
nonPrintablePercentage = 100.0 * float64(nonPrintableCount) / float64(totalBytes)
|
||||||
|
controlCharPercentage = 100.0 * float64(controlCharCount) / float64(totalBytes)
|
||||||
|
extendedAsciiPercentage = 100.0 * float64(extendedAsciiCount) / float64(totalBytes)
|
||||||
|
}
|
||||||
|
|
||||||
if debug {
|
if debug {
|
||||||
log.Printf("File: %s, Size: %d, Null bytes: %d, Non-printable: %d",
|
log.Printf("[%s] File", filename)
|
||||||
filename, totalBytes, nullCount, nonPrintableCount)
|
log.Printf("[%s] Size analyzed: %d bytes", filename, totalBytes)
|
||||||
|
log.Printf("[%s] Null bytes: %d (%.2f%%)", filename, nullCount, nullPercentage)
|
||||||
|
log.Printf("[%s] Non-printable: %d (%.2f%%)", filename, nonPrintableCount, nonPrintablePercentage)
|
||||||
|
log.Printf("[%s] Control chars: %d (%.2f%%)", filename, controlCharCount, controlCharPercentage)
|
||||||
|
log.Printf("[%s] Extended ASCII: %d (%.2f%%)", filename, extendedAsciiCount, extendedAsciiPercentage)
|
||||||
|
log.Printf("[%s] Thresholds: nulls > %.2f%%, non-printable > %.2f%%",
|
||||||
|
filename,
|
||||||
|
100.0*nullThreshold/float64(totalBytes),
|
||||||
|
100.0*nonPrintableThreshold/float64(totalBytes))
|
||||||
|
|
||||||
|
// Print top 10 most frequent non-printable characters if any were found
|
||||||
|
if nonPrintableCount > 0 {
|
||||||
|
type charCountPair struct {
|
||||||
|
char byte
|
||||||
|
count int
|
||||||
|
}
|
||||||
|
|
||||||
|
// Filter to non-printable chars and sort by frequency
|
||||||
|
nonPrintableChars := []charCountPair{}
|
||||||
|
for char, count := range charFreq {
|
||||||
|
if (char < 32 || char > 126) && !isWhitespace(char) {
|
||||||
|
nonPrintableChars = append(nonPrintableChars, charCountPair{char, count})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sort by frequency (descending)
|
||||||
|
if len(nonPrintableChars) > 0 {
|
||||||
|
sort.Slice(nonPrintableChars, func(i, j int) bool {
|
||||||
|
return nonPrintableChars[i].count > nonPrintableChars[j].count
|
||||||
|
})
|
||||||
|
|
||||||
|
// Print top 10 or fewer
|
||||||
|
log.Printf("[%s] Top non-printable characters:", filename)
|
||||||
|
maxToShow := 10
|
||||||
|
if len(nonPrintableChars) < maxToShow {
|
||||||
|
maxToShow = len(nonPrintableChars)
|
||||||
|
}
|
||||||
|
for i := 0; i < maxToShow; i++ {
|
||||||
|
pair := nonPrintableChars[i]
|
||||||
|
log.Printf("[%s] Byte 0x%02X: %d occurrences (%.2f%%)",
|
||||||
|
filename, pair.char, pair.count,
|
||||||
|
100.0*float64(pair.count)/float64(totalBytes))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
isBinary := float64(nullCount) > nullThreshold || float64(nonPrintableCount) > nonPrintableThreshold
|
isBinary := float64(nullCount) > nullThreshold || float64(nonPrintableCount) > nonPrintableThreshold
|
||||||
|
|
||||||
|
if debug {
|
||||||
|
if isBinary {
|
||||||
|
log.Printf("[%s] RESULT: %s is detected as BINARY file", filename, filename)
|
||||||
|
if float64(nullCount) > nullThreshold {
|
||||||
|
log.Printf("[%s] - Detected as binary due to null bytes: %.2f%% > threshold %.2f%%",
|
||||||
|
filename, nullPercentage, 100.0*nullThreshold/float64(totalBytes))
|
||||||
|
}
|
||||||
|
if float64(nonPrintableCount) > nonPrintableThreshold {
|
||||||
|
log.Printf("[%s] - Detected as binary due to non-printable chars: %.2f%% > threshold %.2f%%",
|
||||||
|
filename, nonPrintablePercentage, 100.0*nonPrintableThreshold/float64(totalBytes))
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
log.Printf("[%s] RESULT: %s is detected as TEXT file", filename, filename)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return isBinary, nil
|
return isBinary, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user