282 lines
6.2 KiB
Go
282 lines
6.2 KiB
Go
package main
|
|
|
|
import (
|
|
"flag"
|
|
"fmt"
|
|
"log"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"sync"
|
|
"sync/atomic"
|
|
"time"
|
|
)
|
|
|
|
var Error *log.Logger
|
|
var Warning *log.Logger
|
|
|
|
func init() {
|
|
log.SetFlags(log.Lmicroseconds | log.Lshortfile)
|
|
log.SetOutput(os.Stdout)
|
|
|
|
Error = log.New(os.Stderr,
|
|
fmt.Sprintf("%sERROR:%s ", "\033[0;101m", "\033[0m"),
|
|
log.Lmicroseconds|log.Lshortfile)
|
|
Warning = log.New(os.Stdout,
|
|
fmt.Sprintf("%sWarning:%s ", "\033[0;93m", "\033[0m"),
|
|
log.Lmicroseconds|log.Lshortfile)
|
|
}
|
|
|
|
type ExtData struct {
|
|
ext string
|
|
binaryCount int
|
|
textCount int
|
|
}
|
|
|
|
var debug bool
|
|
|
|
func main() {
|
|
raw := flag.Bool("r", false, "More application friendly output")
|
|
debugF := flag.Bool("d", false, "Debug mode")
|
|
flag.Parse()
|
|
debug = *debugF
|
|
dir := flag.Arg(0)
|
|
if dir == "" {
|
|
dir = "."
|
|
}
|
|
dir = NormalizePath(dir)
|
|
if debug {
|
|
log.Printf("Scanning directory: %s", dir)
|
|
}
|
|
|
|
files := make(chan string, 10000)
|
|
status := make(chan error)
|
|
go GetSyncFilesRecursively(dir, files, status)
|
|
//files <- "SmarterConstruction.pdb"
|
|
|
|
extensionTypeCount := sync.Map{}
|
|
|
|
wg := sync.WaitGroup{}
|
|
for file := range files {
|
|
wg.Add(1)
|
|
go func(file string) {
|
|
defer wg.Done()
|
|
if debug {
|
|
log.Printf("Processing file: %s", file) // Log the file being processed
|
|
}
|
|
|
|
isBinary, err := IsBinaryFile(file)
|
|
if err != nil {
|
|
if debug {
|
|
log.Printf("Error analyzing file %s: %v", file, err)
|
|
}
|
|
return
|
|
}
|
|
|
|
ext := filepath.Ext(file)
|
|
extData, _ := extensionTypeCount.LoadOrStore(ext, &ExtData{ext: ext, binaryCount: 0, textCount: 0})
|
|
if isBinary {
|
|
extData.(*ExtData).binaryCount++
|
|
if debug {
|
|
log.Printf("Binary file detected: %s (%s)", file, ext)
|
|
}
|
|
} else {
|
|
extData.(*ExtData).textCount++
|
|
if debug {
|
|
log.Printf("Text file detected: %s (%s)", file, ext)
|
|
}
|
|
}
|
|
}(file)
|
|
}
|
|
wg.Wait()
|
|
|
|
extensionTypeCount.Range(func(key, value any) bool {
|
|
extData := value.(*ExtData)
|
|
if extData.ext == "" {
|
|
return true
|
|
}
|
|
if extData.binaryCount > extData.textCount*2 {
|
|
if *raw {
|
|
fmt.Println(extData.ext)
|
|
} else {
|
|
log.Printf("Extension: %q, Binary Count: %d, Text Count: %d", extData.ext, extData.binaryCount, extData.textCount)
|
|
}
|
|
}
|
|
return true
|
|
})
|
|
}
|
|
|
|
// IsBinaryFile detects if a file is binary by analyzing a sample of its content
|
|
// It uses multiple heuristics for more reliable detection
|
|
func IsBinaryFile(filename string) (bool, error) {
|
|
// Open the file
|
|
file, err := os.Open(filename)
|
|
if err != nil {
|
|
return false, err
|
|
}
|
|
defer file.Close()
|
|
|
|
// Create a buffer to read a sample (first 8KB is usually enough)
|
|
// Adjust the buffer size as needed
|
|
const sampleSize = 8192
|
|
buffer := make([]byte, sampleSize)
|
|
|
|
// Read a sample from the file
|
|
bytesRead, err := file.Read(buffer)
|
|
if err != nil && err.Error() != "EOF" {
|
|
return false, err
|
|
}
|
|
|
|
// Adjust buffer to actual bytes read
|
|
buffer = buffer[:bytesRead]
|
|
|
|
// Null byte check - common in binary files, rare in text files
|
|
nullCount := 0
|
|
nonPrintableCount := 0
|
|
|
|
// Count of characters analyzed
|
|
totalBytes := bytesRead
|
|
|
|
// Check each byte in the sample
|
|
for _, b := range buffer {
|
|
// Count null bytes
|
|
if b == 0 {
|
|
nullCount++
|
|
}
|
|
|
|
// Count non-printable, non-whitespace characters
|
|
// BOM in UTF-8 is represented by bytes 0xEF,0xBB,0xBF, not a single byte value
|
|
if (b < 32 || b > 126) && !isWhitespace(b) {
|
|
nonPrintableCount++
|
|
}
|
|
}
|
|
|
|
// Thresholds for binary detection
|
|
// 1. If more than 1% are null bytes, likely binary
|
|
// 2. If more than 20% are non-printable characters, likely binary
|
|
nullThreshold := float64(totalBytes) * 0.01
|
|
nonPrintableThreshold := float64(totalBytes) * 0.20
|
|
|
|
if debug {
|
|
log.Printf("File: %s, Size: %d, Null bytes: %d, Non-printable: %d",
|
|
filename, totalBytes, nullCount, nonPrintableCount)
|
|
}
|
|
|
|
isBinary := float64(nullCount) > nullThreshold || float64(nonPrintableCount) > nonPrintableThreshold
|
|
|
|
return isBinary, nil
|
|
}
|
|
|
|
// isWhitespace checks if a byte is a whitespace character
|
|
func isWhitespace(b byte) bool {
|
|
switch b {
|
|
case ' ', '\t', '\n', '\r', '\f', '\v':
|
|
return true
|
|
default:
|
|
return false
|
|
}
|
|
}
|
|
|
|
// IsStringBinary is kept for backwards compatibility
|
|
func IsStringBinary(s string) bool {
|
|
if debug {
|
|
log.Printf("Checking if string is binary: %q", s)
|
|
}
|
|
for _, c := range s {
|
|
// 65279 is GOD DAMNED BOM dogshit
|
|
if (c < ' ' || c > '~') && c != 65279 {
|
|
if debug {
|
|
log.Printf("Found non-printable character: '%c' with ASCII value %d", c, c)
|
|
}
|
|
return true
|
|
}
|
|
}
|
|
if debug {
|
|
log.Println("String is not binary.")
|
|
}
|
|
return false
|
|
}
|
|
|
|
func GetSyncFilesRecursively(input string, output chan string, status chan error) {
|
|
defer close(output)
|
|
defer close(status)
|
|
|
|
var filesProcessed int32
|
|
var foldersProcessed int32
|
|
var activeWorkers int32
|
|
|
|
done := make(chan struct{})
|
|
defer close(done)
|
|
|
|
directories := make(chan string, 100000)
|
|
workerPool := make(chan struct{}, 4000)
|
|
directories <- input
|
|
|
|
allDone := make(chan struct{})
|
|
|
|
go func() {
|
|
var wg sync.WaitGroup
|
|
|
|
go func() {
|
|
for {
|
|
if atomic.LoadInt32(&activeWorkers) == 0 && len(directories) == 0 {
|
|
time.Sleep(10 * time.Millisecond)
|
|
if atomic.LoadInt32(&activeWorkers) == 0 && len(directories) == 0 {
|
|
close(allDone)
|
|
return
|
|
}
|
|
}
|
|
time.Sleep(50 * time.Millisecond)
|
|
}
|
|
}()
|
|
|
|
for {
|
|
select {
|
|
case directory, ok := <-directories:
|
|
if !ok {
|
|
wg.Wait()
|
|
return
|
|
}
|
|
|
|
atomic.AddInt32(&activeWorkers, 1)
|
|
|
|
go func(dir string) {
|
|
workerPool <- struct{}{}
|
|
|
|
atomic.AddInt32(&foldersProcessed, 1)
|
|
processDirectory(dir, directories, output, &filesProcessed)
|
|
|
|
<-workerPool
|
|
atomic.AddInt32(&activeWorkers, -1)
|
|
}(directory)
|
|
}
|
|
}
|
|
}()
|
|
|
|
<-allDone
|
|
}
|
|
|
|
func processDirectory(directory string, directories chan<- string, output chan<- string, filesProcessed *int32) {
|
|
files, err := os.ReadDir(directory)
|
|
if err != nil {
|
|
//log.Printf("Error reading directory %s: %+v", directory, err)
|
|
return
|
|
}
|
|
|
|
for _, file := range files {
|
|
if file.IsDir() {
|
|
directories <- filepath.Join(directory, file.Name())
|
|
} else {
|
|
output <- filepath.Join(directory, file.Name())
|
|
atomic.AddInt32(filesProcessed, 1)
|
|
}
|
|
}
|
|
}
|
|
|
|
func NormalizePath(input string) string {
|
|
input = filepath.Clean(input)
|
|
input = filepath.ToSlash(input)
|
|
input = strings.ReplaceAll(input, "\"", "")
|
|
return input
|
|
}
|