Improve binary/text distinction detection
This commit is contained in:
96
main.go
96
main.go
@@ -1,7 +1,6 @@
|
||||
package main
|
||||
|
||||
import (
	"bufio"
	"flag"
	"fmt"
	"io"
	"log"
|
||||
@@ -53,8 +52,10 @@ func main() {
|
||||
files := make(chan string, 10000)
|
||||
status := make(chan error)
|
||||
go GetSyncFilesRecursively(dir, files, status)
|
||||
//files <- "SmarterConstruction.pdb"
|
||||
|
||||
extensionTypeCount := sync.Map{}
|
||||
|
||||
wg := sync.WaitGroup{}
|
||||
for file := range files {
|
||||
wg.Add(1)
|
||||
@@ -63,33 +64,26 @@ func main() {
|
||||
if debug {
|
||||
log.Printf("Processing file: %s", file) // Log the file being processed
|
||||
}
|
||||
f, err := os.Open(file)
|
||||
|
||||
isBinary, err := IsBinaryFile(file)
|
||||
if err != nil {
|
||||
if debug {
|
||||
log.Printf("Error opening file %s: %v", file, err)
|
||||
log.Printf("Error analyzing file %s: %v", file, err)
|
||||
}
|
||||
return
|
||||
}
|
||||
defer f.Close() // Ensure the file is closed after processing
|
||||
|
||||
scanner := bufio.NewScanner(f)
|
||||
if scanner.Scan() {
|
||||
ext := filepath.Ext(file)
|
||||
extData, _ := extensionTypeCount.LoadOrStore(ext, &ExtData{ext: ext, binaryCount: 0, textCount: 0})
|
||||
if IsStringBinary(scanner.Text()) {
|
||||
if isBinary {
|
||||
extData.(*ExtData).binaryCount++
|
||||
if debug {
|
||||
log.Printf("Binary file detected: %s (%s)", file, ext) // Log binary file detection
|
||||
log.Printf("Binary file detected: %s (%s)", file, ext)
|
||||
}
|
||||
} else {
|
||||
extData.(*ExtData).textCount++
|
||||
if debug {
|
||||
log.Printf("Text file detected: %s (%s)", file, ext) // Log text file detection
|
||||
}
|
||||
}
|
||||
} else if err := scanner.Err(); err != nil {
|
||||
if debug {
|
||||
log.Printf("Error reading line from file %s: %v", file, err)
|
||||
log.Printf("Text file detected: %s (%s)", file, ext)
|
||||
}
|
||||
}
|
||||
}(file)
|
||||
@@ -105,15 +99,85 @@ func main() {
|
||||
if *raw {
|
||||
fmt.Println(extData.ext)
|
||||
} else {
|
||||
if debug {
|
||||
log.Printf("Extension: %q, Binary Count: %d, Text Count: %d", extData.ext, extData.binaryCount, extData.textCount)
|
||||
}
|
||||
}
|
||||
}
|
||||
return true
|
||||
})
|
||||
}
|
||||
|
||||
// IsBinaryFile detects if a file is binary by analyzing a sample of its content
|
||||
// It uses multiple heuristics for more reliable detection
|
||||
func IsBinaryFile(filename string) (bool, error) {
|
||||
// Open the file
|
||||
file, err := os.Open(filename)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
// Create a buffer to read a sample (first 8KB is usually enough)
|
||||
// Adjust the buffer size as needed
|
||||
const sampleSize = 8192
|
||||
buffer := make([]byte, sampleSize)
|
||||
|
||||
// Read a sample from the file
|
||||
bytesRead, err := file.Read(buffer)
|
||||
if err != nil && err.Error() != "EOF" {
|
||||
return false, err
|
||||
}
|
||||
|
||||
// Adjust buffer to actual bytes read
|
||||
buffer = buffer[:bytesRead]
|
||||
|
||||
// Null byte check - common in binary files, rare in text files
|
||||
nullCount := 0
|
||||
nonPrintableCount := 0
|
||||
|
||||
// Count of characters analyzed
|
||||
totalBytes := bytesRead
|
||||
|
||||
// Check each byte in the sample
|
||||
for _, b := range buffer {
|
||||
// Count null bytes
|
||||
if b == 0 {
|
||||
nullCount++
|
||||
}
|
||||
|
||||
// Count non-printable, non-whitespace characters
|
||||
// BOM in UTF-8 is represented by bytes 0xEF,0xBB,0xBF, not a single byte value
|
||||
if (b < 32 || b > 126) && !isWhitespace(b) {
|
||||
nonPrintableCount++
|
||||
}
|
||||
}
|
||||
|
||||
// Thresholds for binary detection
|
||||
// 1. If more than 1% are null bytes, likely binary
|
||||
// 2. If more than 20% are non-printable characters, likely binary
|
||||
nullThreshold := float64(totalBytes) * 0.01
|
||||
nonPrintableThreshold := float64(totalBytes) * 0.20
|
||||
|
||||
if debug {
|
||||
log.Printf("File: %s, Size: %d, Null bytes: %d, Non-printable: %d",
|
||||
filename, totalBytes, nullCount, nonPrintableCount)
|
||||
}
|
||||
|
||||
isBinary := float64(nullCount) > nullThreshold || float64(nonPrintableCount) > nonPrintableThreshold
|
||||
|
||||
return isBinary, nil
|
||||
}
|
||||
|
||||
// isWhitespace reports whether b is an ASCII whitespace byte: space,
// horizontal tab, line feed, vertical tab, form feed or carriage return.
func isWhitespace(b byte) bool {
	if b == ' ' {
		return true
	}
	// '\t' (9), '\n' (10), '\v' (11), '\f' (12) and '\r' (13) occupy a
	// contiguous range in ASCII, so a single range check covers all of them.
	return b >= '\t' && b <= '\r'
}
|
||||
|
||||
// IsStringBinary is kept for backwards compatibility
|
||||
func IsStringBinary(s string) bool {
|
||||
if debug {
|
||||
log.Printf("Checking if string is binary: %q", s)
|
||||
|
Reference in New Issue
Block a user