binary-susser/main.go

package main

import (
	"errors"
	"flag"
	"fmt"
	"io"
	"log"
	"os"
	"path/filepath"
	"sort"
	"strings"
	"sync"
	"sync/atomic"
	"time"
)

var Error *log.Logger
var Warning *log.Logger

// init configures the standard logger (microsecond timestamps, short file
// names, output on os.Stdout) and creates the Error and Warning loggers with
// ANSI-colored prefixes and the same flags.
func init() {
	const logFlags = log.Lmicroseconds | log.Lshortfile

	errorPrefix := fmt.Sprintf("%sERROR:%s ", "\033[0;101m", "\033[0m")
	warningPrefix := fmt.Sprintf("%sWarning:%s ", "\033[0;93m", "\033[0m")

	// Errors go to stderr; warnings share stdout with the standard logger.
	Error = log.New(os.Stderr, errorPrefix, logFlags)
	Warning = log.New(os.Stdout, warningPrefix, logFlags)

	log.SetOutput(os.Stdout)
	log.SetFlags(logFlags)
}

// ExtData tallies, for one file extension, how many files were classified as
// binary and how many as text.
type ExtData struct {
	// ext is the extension the counters belong to.
	ext string
	// binaryCount and textCount are the number of files seen in each class.
	binaryCount, textCount int
}

// debug enables verbose per-file logging; main sets it from the -d flag.
var debug bool

// main scans the directory given as the first positional argument (default
// ".") and reports every non-empty file extension whose binary file count is
// more than twice its text file count.
//
// Flags:
//
//	-r  print only the matching extensions, one per line
//	-d  enable debug logging
//
// Files that cannot be analyzed are skipped; the reason is logged in debug
// mode only.
func main() {
	raw := flag.Bool("r", false, "More application friendly output")
	debugF := flag.Bool("d", false, "Debug mode")
	flag.Parse()
	debug = *debugF

	dir := flag.Arg(0)
	if dir == "" {
		dir = "."
	}
	dir = NormalizePath(dir)
	if debug {
		log.Printf("Scanning directory: %s", dir)
	}

	files := make(chan string, 10000)
	status := make(chan error)
	go GetSyncFilesRecursively(dir, files, status)

	// Drain status so the walker can never block on it, whether it reports
	// errors or merely closes the channel.
	go func() {
		for err := range status {
			if debug {
				log.Printf("Error while walking %s: %v", dir, err)
			}
		}
	}()

	// A fixed pool keeps the number of simultaneously open files bounded; one
	// goroutine per file can exhaust file descriptors on large trees, and the
	// affected files would then be skipped without notice.
	const classifierCount = 64

	// mu guards counts and the counters inside every *ExtData it holds.
	var mu sync.Mutex
	counts := make(map[string]*ExtData)

	var wg sync.WaitGroup
	wg.Add(classifierCount)
	for i := 0; i < classifierCount; i++ {
		go func() {
			defer wg.Done()
			for file := range files {
				if debug {
					log.Printf("[%s] Processing file", file)
				}

				isBinary, err := IsBinaryFile(file)
				if err != nil {
					if debug {
						log.Printf("[%s] Error analyzing file: %v", file, err)
					}
					continue
				}

				ext := filepath.Ext(file)

				mu.Lock()
				extData, ok := counts[ext]
				if !ok {
					extData = &ExtData{ext: ext}
					counts[ext] = extData
				}
				if isBinary {
					extData.binaryCount++
				} else {
					extData.textCount++
				}
				mu.Unlock()

				if debug {
					if isBinary {
						log.Printf("[%s] Binary file detected: (%s)", file, ext)
					} else {
						log.Printf("[%s] Text file detected: (%s)", file, ext)
					}
				}
			}
		}()
	}
	wg.Wait()

	// Report in sorted order so repeated runs produce identical output.
	exts := make([]string, 0, len(counts))
	for ext := range counts {
		// Files without an extension are never reported.
		if ext != "" {
			exts = append(exts, ext)
		}
	}
	sort.Strings(exts)

	for _, ext := range exts {
		extData := counts[ext]
		if extData.binaryCount <= extData.textCount*2 {
			continue
		}
		if *raw {
			fmt.Println(extData.ext)
		} else {
			log.Printf("Extension: %q, Binary Count: %d, Text Count: %d", extData.ext, extData.binaryCount, extData.textCount)
		}
	}
}

// IsBinaryFile reports whether the file at filename looks like binary data.
//
// It reads up to the first 8192 bytes and classifies the sample as binary
// when more than 1% of its bytes are NUL or more than 20% are non-printable
// (outside 32..126 and not whitespace according to isWhitespace). An empty
// file is reported as text.
//
// A non-nil error is returned when the file cannot be opened or read; the
// boolean result carries no meaning in that case. In debug mode every step
// and the collected statistics are logged.
func IsBinaryFile(filename string) (bool, error) {
	const (
		sampleSize        = 8192
		nullRatio         = 0.01
		nonPrintableRatio = 0.20
	)

	if debug {
		log.Printf("[%s] Starting binary detection for file", filename)
	}

	file, err := os.Open(filename)
	if err != nil {
		if debug {
			log.Printf("[%s] Failed to open file: %v", filename, err)
		}
		return false, err
	}
	defer file.Close()

	// The size is only reported, never used, so skip the Stat call unless
	// debugging.
	if debug {
		if fileInfo, statErr := file.Stat(); statErr != nil {
			log.Printf("[%s] Failed to get file stats: %v", filename, statErr)
		} else {
			log.Printf("[%s] File size: %d bytes", filename, fileInfo.Size())
		}
		log.Printf("[%s] Reading %d byte sample from file", filename, sampleSize)
	}

	// ReadFull keeps reading until the buffer is full or the file ends, so a
	// short read cannot shrink the sample. io.EOF (empty file) and
	// io.ErrUnexpectedEOF (file shorter than the sample) are not failures.
	buffer := make([]byte, sampleSize)
	bytesRead, err := io.ReadFull(file, buffer)
	if err != nil && !errors.Is(err, io.EOF) && !errors.Is(err, io.ErrUnexpectedEOF) {
		if debug {
			log.Printf("[%s] Error reading from file: %v", filename, err)
		}
		return false, err
	}
	buffer = buffer[:bytesRead]
	if debug {
		log.Printf("[%s] Actually read %d bytes from file", filename, bytesRead)
		log.Printf("[%s] Analyzing bytes for binary detection...", filename)
	}

	stats := countSampleBytes(buffer)
	tooManyNulls := float64(stats.nulls) > float64(stats.total)*nullRatio
	tooManyNonPrintable := float64(stats.nonPrintable) > float64(stats.total)*nonPrintableRatio
	isBinary := tooManyNulls || tooManyNonPrintable

	if debug {
		logSampleStats(filename, &stats, 100*nullRatio, 100*nonPrintableRatio)
		if isBinary {
			log.Printf("[%s] RESULT: %s is detected as BINARY file", filename, filename)
			if tooManyNulls {
				log.Printf("[%s]   - Detected as binary due to null bytes: %.2f%% > threshold %.2f%%",
					filename, percentOf(stats.nulls, stats.total), 100*nullRatio)
			}
			if tooManyNonPrintable {
				log.Printf("[%s]   - Detected as binary due to non-printable chars: %.2f%% > threshold %.2f%%",
					filename, percentOf(stats.nonPrintable, stats.total), 100*nonPrintableRatio)
			}
		} else {
			log.Printf("[%s] RESULT: %s is detected as TEXT file", filename, filename)
		}
	}

	return isBinary, nil
}

// sampleStats holds the per-category byte counts gathered from a file sample.
type sampleStats struct {
	// total is the number of bytes in the sample.
	total int
	// nulls counts bytes equal to zero.
	nulls int
	// nonPrintable counts bytes outside 32..126 that are not whitespace.
	nonPrintable int
	// controlChars counts bytes below 32 that are not whitespace.
	controlChars int
	// extendedASCII counts bytes above 127.
	extendedASCII int
	// freq records how often each byte value occurs.
	freq [256]int
}

// isNonPrintableByte reports whether b lies outside 32..126 and is not
// whitespace.
func isNonPrintableByte(b byte) bool {
	return (b < 32 || b > 126) && !isWhitespace(b)
}

// percentOf returns count as a percentage of total, or 0 when total is 0.
func percentOf(count, total int) float64 {
	if total == 0 {
		return 0
	}
	return 100.0 * float64(count) / float64(total)
}

// countSampleBytes classifies every byte of sample into the sampleStats
// categories.
func countSampleBytes(sample []byte) sampleStats {
	stats := sampleStats{total: len(sample)}
	for _, b := range sample {
		stats.freq[b]++
		if b == 0 {
			stats.nulls++
		}
		if b < 32 && !isWhitespace(b) {
			stats.controlChars++
		}
		if b > 127 {
			stats.extendedASCII++
		}
		if isNonPrintableByte(b) {
			stats.nonPrintable++
		}
	}
	return stats
}

// logSampleStats writes the collected statistics, the two thresholds (given
// as percentages) and the ten most frequent non-printable byte values to the
// standard logger.
func logSampleStats(filename string, stats *sampleStats, nullLimitPct, nonPrintableLimitPct float64) {
	log.Printf("[%s] File", filename)
	log.Printf("[%s]   Size analyzed: %d bytes", filename, stats.total)
	log.Printf("[%s]   Null bytes: %d (%.2f%%)", filename, stats.nulls, percentOf(stats.nulls, stats.total))
	log.Printf("[%s]   Non-printable: %d (%.2f%%)", filename, stats.nonPrintable, percentOf(stats.nonPrintable, stats.total))
	log.Printf("[%s]   Control chars: %d (%.2f%%)", filename, stats.controlChars, percentOf(stats.controlChars, stats.total))
	log.Printf("[%s]   Extended ASCII: %d (%.2f%%)", filename, stats.extendedASCII, percentOf(stats.extendedASCII, stats.total))
	log.Printf("[%s]   Thresholds: nulls > %.2f%%, non-printable > %.2f%%",
		filename, nullLimitPct, nonPrintableLimitPct)

	if stats.nonPrintable == 0 {
		return
	}

	type byteCount struct {
		value byte
		count int
	}
	var offenders []byteCount
	for value, count := range stats.freq[:] {
		if b := byte(value); count > 0 && isNonPrintableByte(b) {
			offenders = append(offenders, byteCount{b, count})
		}
	}
	// Most frequent first; equal counts are ordered by byte value so the
	// listing is stable between runs.
	sort.Slice(offenders, func(i, j int) bool {
		if offenders[i].count != offenders[j].count {
			return offenders[i].count > offenders[j].count
		}
		return offenders[i].value < offenders[j].value
	})

	const maxToShow = 10
	if len(offenders) > maxToShow {
		offenders = offenders[:maxToShow]
	}
	log.Printf("[%s]   Top non-printable characters:", filename)
	for _, o := range offenders {
		log.Printf("[%s]     Byte 0x%02X: %d occurrences (%.2f%%)",
			filename, o.value, o.count, percentOf(o.count, stats.total))
	}
}

// isWhitespace reports whether b is a space, tab, line feed, vertical tab,
// form feed, or carriage return.
func isWhitespace(b byte) bool {
	// '\t', '\n', '\v', '\f' and '\r' are the contiguous byte values 9..13.
	return b == ' ' || (b >= '\t' && b <= '\r')
}

// IsStringBinary is kept for backwards compatibility. It reports whether s
// contains any rune outside the printable ASCII range ' '..'~', ignoring the
// byte order mark U+FEFF. Tabs and newlines therefore count as binary here.
func IsStringBinary(s string) bool {
	// U+FEFF (65279) is the byte order mark some editors put at the start of
	// text files; it must not make a string look binary.
	const byteOrderMark = '\uFEFF'

	if debug {
		log.Printf("Checking if string is binary: %q", s)
	}
	for _, r := range s {
		printable := r >= ' ' && r <= '~'
		if printable || r == byteOrderMark {
			continue
		}
		if debug {
			log.Printf("Found non-printable character: '%c' with ASCII value %d", r, r)
		}
		return true
	}
	if debug {
		log.Println("String is not binary.")
	}
	return false
}

func GetSyncFilesRecursively(input string, output chan string, status chan error) {
	defer close(output)
	defer close(status)

	var filesProcessed int32
	var foldersProcessed int32
	var activeWorkers int32

	done := make(chan struct{})
	defer close(done)

	directories := make(chan string, 100000)
	workerPool := make(chan struct{}, 4000)
	directories <- input

	allDone := make(chan struct{})

	go func() {
		var wg sync.WaitGroup

		go func() {
			for {
				if atomic.LoadInt32(&activeWorkers) == 0 && len(directories) == 0 {
					time.Sleep(10 * time.Millisecond)
					if atomic.LoadInt32(&activeWorkers) == 0 && len(directories) == 0 {
						close(allDone)
						return
					}
				}
				time.Sleep(50 * time.Millisecond)
			}
		}()

		for {
			select {
			case directory, ok := <-directories:
				if !ok {
					wg.Wait()
					return
				}

				atomic.AddInt32(&activeWorkers, 1)

				go func(dir string) {
					workerPool <- struct{}{}

					atomic.AddInt32(&foldersProcessed, 1)
					processDirectory(dir, directories, output, &filesProcessed)

					<-workerPool
					atomic.AddInt32(&activeWorkers, -1)
				}(directory)
			}
		}
	}()

	<-allDone
}

// processDirectory reads one directory and routes its entries: the joined
// path of every subdirectory is sent to directories, the joined path of every
// other entry is sent to output, and *filesProcessed is atomically
// incremented after each send to output. A directory that cannot be read is
// skipped without reporting the error.
func processDirectory(directory string, directories chan<- string, output chan<- string, filesProcessed *int32) {
	entries, readErr := os.ReadDir(directory)
	if readErr != nil {
		return
	}

	for i := range entries {
		entry := entries[i]
		fullPath := filepath.Join(directory, entry.Name())
		if !entry.IsDir() {
			output <- fullPath
			atomic.AddInt32(filesProcessed, 1)
			continue
		}
		directories <- fullPath
	}
}

// NormalizePath removes every double quote from input, cleans the result with
// filepath.Clean and converts it to forward slashes.
//
// Quotes are stripped before cleaning: a quote glued to a path element (as in
// `"a/b/"`) would otherwise hide a trailing or doubled separator from Clean,
// and an input made only of quotes would come back empty instead of ".".
func NormalizePath(input string) string {
	unquoted := strings.ReplaceAll(input, "\"", "")
	return filepath.ToSlash(filepath.Clean(unquoted))
}