Files
binary-susser/main.go

306 lines
8.7 KiB
Go

package main
import (
	"flag"
	"fmt"
	"io"
	"log"
	"os"
	"os/exec"
	"path/filepath"
	"sort"
	"strings"
	"sync"

	logger "git.site.quack-lab.dev/dave/cylogger"
	utils "git.site.quack-lab.dev/dave/cyutils"
	"github.com/bmatcuk/doublestar/v4"
)
// ExtData tallies, for a single file extension, how many scanned files were
// classified as binary and how many as text.
type ExtData struct {
	// ext is the extension the tallies belong to.
	ext string
	// binaryCount and textCount are the number of files with this extension
	// that were classified as binary and as text respectively.
	binaryCount, textCount int
}
// debug is a package-level flag that nothing in the visible file reads or sets.
// NOTE(review): presumably a leftover toggle — confirm it is unused elsewhere
// before removing it.
var debug bool
// main scans a directory tree, classifies every regular file in it as binary
// or text, and reports the extensions whose files are predominantly binary.
//
// Usage: binarysusser [-t] [dir]
//
// dir defaults to the current directory. With -t, a `git add` followed by a
// `git commit` is run for every non-empty extension that was seen.
func main() {
	track := flag.Bool("t", false, "Git add and commit the results")
	flag.Parse()
	// NOTE(review): InitFlag is called after flag.Parse, as before. If it
	// registers command-line flags it probably has to run first — confirm
	// against the cylogger documentation.
	logger.InitFlag()
	logger.Info("Starting binarysusser")

	dir := flag.Arg(0)
	if dir == "" {
		dir = "."
	}
	logger.Info("Scanning directory: %s", dir)
	dir = NormalizePath(dir)
	logger.Info("Normalized directory: %s", dir)

	// Paths returned here are slash-separated and relative to dir, not to the
	// process working directory.
	files, err := doublestar.Glob(os.DirFS(dir), "**/*")
	if err != nil {
		logger.Error("Error globbing directory: %v", err)
		os.Exit(1)
	}
	logger.Info("Found %d files", len(files))

	// counts is written by all workers concurrently, so every access inside
	// the worker callback is guarded by mu.
	var (
		mu     sync.Mutex
		counts = make(map[string]*ExtData)
	)
	utils.WithWorkers(20, files, func(worker, i int, file string) {
		filelog := logger.Default.WithPrefix(fmt.Sprintf("file=%s", file))
		filelog.Debug("Processing file")

		// Re-anchor the glob result at dir; otherwise scanning anything but
		// "." would open paths relative to the wrong directory.
		fullPath := filepath.Join(dir, filepath.FromSlash(file))

		// "**/*" also matches directories; only regular files can be sampled.
		info, err := os.Stat(fullPath)
		if err != nil {
			filelog.Error("Error analyzing file: %v", err)
			return
		}
		if !info.Mode().IsRegular() {
			filelog.Debug("Skipping non-regular file")
			return
		}

		isBinary, err := IsBinaryFile(fullPath)
		if err != nil {
			filelog.Error("Error analyzing file: %v", err)
			return
		}

		ext := filepath.Ext(file)
		mu.Lock()
		data, ok := counts[ext]
		if !ok {
			data = &ExtData{ext: ext}
			counts[ext] = data
		}
		if isBinary {
			data.binaryCount++
		} else {
			data.textCount++
		}
		mu.Unlock()

		if isBinary {
			filelog.Debug("Binary file detected: (%s)", ext)
		} else {
			filelog.Debug("Text file detected: (%s)", ext)
		}
	})
	// NOTE(review): everything below assumes WithWorkers returns only after
	// all callbacks have finished — confirm against cyutils.
	logger.Info("Processing complete")

	// Sort the extensions so the report and the commit order are stable
	// between runs.
	exts := make([]string, 0, len(counts))
	for ext := range counts {
		if ext == "" {
			logger.Default.WithPrefix("ext=").Debug("Skipping empty extension")
			continue
		}
		exts = append(exts, ext)
	}
	sort.Strings(exts)

	// Report extensions with more than twice as many binary files as text files.
	for _, ext := range exts {
		data := counts[ext]
		if data.binaryCount > data.textCount*2 {
			extlog := logger.Default.WithPrefix(fmt.Sprintf("ext=%s", ext))
			extlog.Info("Extension: %q, Binary Count: %d, Text Count: %d", data.ext, data.binaryCount, data.textCount)
		}
	}

	if !*track {
		return
	}

	logger.Info("Tracking results")
	// NOTE(review): this runs `git add <ext>` (e.g. `git add .png`) in the
	// working directory for every extension seen, not only the
	// binary-dominant ones. That is the pre-existing behavior; whether a
	// pathspec such as "*.png", an LFS track, or a filter on the report above
	// was intended should be confirmed.
	for _, ext := range exts {
		extlog := logger.Default.WithPrefix(fmt.Sprintf("ext=%s", ext))

		output, err := exec.Command("git", "add", ext).CombinedOutput()
		if err != nil {
			extlog.Error("Error adding extension: %v: %s", err, output)
			continue
		}
		extlog.Debug("Added extension")
		extlog.Trace("output: %s", output)

		output, err = exec.Command("git", "commit", "-m", fmt.Sprintf("Track %s", ext)).CombinedOutput()
		if err != nil {
			extlog.Error("Error committing extension: %v: %s", err, output)
			continue
		}
		extlog.Debug("Committed extension")
		extlog.Trace("output: %s", output)
	}
}
// IsBinaryFile reports whether the file at filename looks binary, judged from
// a sample of at most the first 8 KiB of its content.
//
// The sample is classified as binary when more than 1% of its bytes are NUL,
// or when more than 20% are non-printable (outside 32..126 and not whitespace
// according to isWhitespace). An empty file is reported as text.
//
// The returned bool is meaningful only when the error is nil. Open, stat and
// read failures are returned wrapped and are not logged here, so that the
// caller reports them exactly once.
//
// NOTE(review): every byte above 126 counts as non-printable, so text made up
// mostly of multi-byte UTF-8 characters is classified as binary — confirm
// whether that is intended.
func IsBinaryFile(filename string) (bool, error) {
	const (
		sampleSize        = 8192 // the first 8 KiB is usually enough
		nullRatio         = 0.01
		nonPrintableRatio = 0.20
	)

	filelog := logger.Default.WithPrefix(fmt.Sprintf("file=%s", filename))
	filelog.Debug("Starting binary detection for file")

	file, err := os.Open(filename)
	if err != nil {
		return false, fmt.Errorf("opening file: %w", err)
	}
	defer file.Close()

	fileInfo, err := file.Stat()
	if err != nil {
		return false, fmt.Errorf("getting file stats: %w", err)
	}
	filelog.Debug("File size: %d bytes", fileInfo.Size())

	// ReadFull keeps reading until the buffer is full or the file ends, so a
	// short first read cannot shrink the sample. io.EOF (empty file) and
	// io.ErrUnexpectedEOF (file shorter than the buffer) are both expected.
	buffer := make([]byte, sampleSize)
	filelog.Debug("Reading %d byte sample from file", sampleSize)
	bytesRead, err := io.ReadFull(file, buffer)
	if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
		return false, fmt.Errorf("reading sample: %w", err)
	}
	buffer = buffer[:bytesRead]
	filelog.Debug("Actually read %d bytes from file", bytesRead)

	var (
		nullCount          int
		nonPrintableCount  int
		controlCharCount   int
		extendedAsciiCount int
		charFreq           [256]int // per-byte-value frequency, used for the diagnostic listing
	)
	totalBytes := bytesRead

	filelog.Debug("Analyzing bytes for binary detection...")
	for _, b := range buffer {
		charFreq[b]++
		whitespace := isWhitespace(b)
		if b == 0 {
			nullCount++
		}
		// Control characters: 0-31 except whitespace.
		if b < 32 && !whitespace {
			controlCharCount++
		}
		if b > 127 {
			extendedAsciiCount++
		}
		// Non-printable, non-whitespace bytes. A UTF-8 BOM is the three bytes
		// 0xEF,0xBB,0xBF, so it contributes three counts here.
		if (b < 32 || b > 126) && !whitespace {
			nonPrintableCount++
		}
	}

	// Thresholds are expressed as byte counts for the decision and as fixed
	// percentages for logging, so an empty sample never divides by zero.
	nullThreshold := float64(totalBytes) * nullRatio
	nonPrintableThreshold := float64(totalBytes) * nonPrintableRatio

	percent := func(count int) float64 {
		if totalBytes == 0 {
			return 0
		}
		return 100.0 * float64(count) / float64(totalBytes)
	}
	nullPercentage := percent(nullCount)
	nonPrintablePercentage := percent(nonPrintableCount)

	filelog.Trace("File size analyzed: %d bytes", totalBytes)
	filelog.Trace("Null bytes: %d (%.2f%%)", nullCount, nullPercentage)
	filelog.Trace("Non-printable: %d (%.2f%%)", nonPrintableCount, nonPrintablePercentage)
	filelog.Trace("Control chars: %d (%.2f%%)", controlCharCount, percent(controlCharCount))
	filelog.Trace("Extended ASCII: %d (%.2f%%)", extendedAsciiCount, percent(extendedAsciiCount))
	filelog.Trace("Thresholds: nulls > %.2f%%, non-printable > %.2f%%", 100.0*nullRatio, 100.0*nonPrintableRatio)

	if nonPrintableCount > 0 {
		logTopNonPrintable(filename, &charFreq, totalBytes)
	}

	tooManyNulls := float64(nullCount) > nullThreshold
	tooManyNonPrintable := float64(nonPrintableCount) > nonPrintableThreshold
	isBinary := tooManyNulls || tooManyNonPrintable

	if !isBinary {
		filelog.Debug("File is detected as TEXT file")
		return false, nil
	}

	filelog.Debug("File is detected as BINARY file")
	if tooManyNulls {
		filelog.Trace("Detected as binary due to null bytes: %.2f%% > threshold %.2f%%", nullPercentage, 100.0*nullRatio)
	}
	if tooManyNonPrintable {
		filelog.Trace("Detected as binary due to non-printable chars: %.2f%% > threshold %.2f%%", nonPrintablePercentage, 100.0*nonPrintableRatio)
	}
	return true, nil
}

// logTopNonPrintable prints, via the standard logger, up to ten of the most
// frequent non-printable byte values found in a sample of totalBytes bytes.
// Ties are listed in ascending byte order so the output is deterministic.
func logTopNonPrintable(filename string, charFreq *[256]int, totalBytes int) {
	type charCountPair struct {
		char  byte
		count int
	}

	var pairs []charCountPair
	for value, count := range charFreq {
		b := byte(value)
		if count > 0 && (b < 32 || b > 126) && !isWhitespace(b) {
			pairs = append(pairs, charCountPair{char: b, count: count})
		}
	}
	if len(pairs) == 0 {
		return
	}

	// Stable sort: pairs were collected in ascending byte order, which is
	// therefore preserved among equal counts.
	sort.SliceStable(pairs, func(i, j int) bool {
		return pairs[i].count > pairs[j].count
	})

	const maxToShow = 10
	if len(pairs) > maxToShow {
		pairs = pairs[:maxToShow]
	}

	log.Printf("[%s] Top non-printable characters:", filename)
	for _, pair := range pairs {
		log.Printf("[%s] Byte 0x%02X: %d occurrences (%.2f%%)",
			filename, pair.char, pair.count,
			100.0*float64(pair.count)/float64(totalBytes))
	}
}
// isWhitespace reports whether b is an ASCII whitespace byte: space, tab,
// line feed, vertical tab, form feed or carriage return.
func isWhitespace(b byte) bool {
	// '\t', '\n', '\v', '\f' and '\r' are the contiguous byte values 9..13.
	return b == ' ' || ('\t' <= b && b <= '\r')
}
// IsStringBinary reports whether s contains a rune outside the printable
// ASCII range ' ' through '~'. The byte order mark U+FEFF is tolerated; tabs,
// newlines and other control characters are not.
//
// It is kept for backwards compatibility.
func IsStringBinary(s string) bool {
	const byteOrderMark = '\uFEFF' // decimal 65279

	strlog := logger.Default.WithPrefix(fmt.Sprintf("file=%s", s))
	strlog.Debug("Checking if string is binary: %q", s)

	for _, r := range s {
		if r == byteOrderMark || (r >= ' ' && r <= '~') {
			continue
		}
		strlog.Trace("Found non-printable character: '%c' with ASCII value %d", r, r)
		return true
	}

	strlog.Debug("String is not binary.")
	return false
}
// NormalizePath turns a user-supplied path into a cleaned, slash-separated
// path.
//
// Double quotes are removed first (presumably leftovers from shell quoting —
// confirm), then the result is passed through filepath.Clean and
// filepath.ToSlash. Stripping the quotes before cleaning means a quoted or
// empty argument still comes out fully cleaned: `""` yields "." rather than an
// empty string, and `"a/b/"` yields "a/b" rather than "a/b/".
func NormalizePath(input string) string {
	unquoted := strings.ReplaceAll(input, "\"", "")
	return filepath.ToSlash(filepath.Clean(unquoted))
}