#!/usr/bin/perl -w # # This perl script shows you what percentage of a file is # binary data vs plain text in the current directory. # And it also shows whether the file is a DOS text file, # LINUX text file, OSX, etc.. # Files with DOS line breaks will be represented with $$ signs. # Files with Linux line breaks will be represented with /////. # OSX files will be represented with letter O and mixed lines # will be x x x. "Undetermined" means that the file contains # no line breaks at all. A DOS text file contains CR-LF pairs. # Linux text files have LF characters only. OSX text files # contain CR characters only. And "mixed" means that an unequal # number of CR and LF characters are both present.) # Binary data will be shown as bbbbbb. # ################################################## use strict; use warnings; my $RECURSIVE = 0; my $MAXSIZE = 1000000; # Don't analyze files larger than this. my $MSWIN = index($^O, 'MSWin') >= 0 ? 1 : 0; # The following file extensions will be skipped: my $SKIP .= '.ZIP .LNK .MID .PDF .MP3 .MOV .M4A .MP4 .AVI .VOB .WMV .WMA .WAV .3GP .MPG .FLV .MKV .SWF .WEBM .OGG .MP2 .AAC .AC3 .TS .DOC .DOCX .RAR .GZ .TGZ .CAB .EXE .COM .PNG .JPG .BMP .GIF .ICO .PCX .DRV .MSI .CAT .OCX .SYS .CPL .CPX .NLS .CHM .TTF .FON .MST .AX .TSP .DB .MBX .ANI .JAR .GADGET .SO .LIB .DLL '; my $OS = GetOS(); my $DIR = GetCurrentDirectory(); CheckDIR($DIR); ################################################## # # This function scans a string looking for special characters # and determines what percentage of the string is plain text # and also tries to determine the text format. # # Returns an integer whose lower 8 bits is the percentage (0-100). # Bit 9 will be set if any LF characters were found. # Bit 10 will be set if any CR characters were found. # Bit 11 will be set if there are equal number of CR and LF # characters in the string. These can be interpreted as follows: # # 000 = Format is undetermined. # 001 = LINUX string (LF only) # 010 = OSX string (CR only) # 011 = MIXED format # 111 = DOS text (CR-LF pairs) # # Usage: INTEGER = isText(STRING) # sub isText { defined $_[0] or return 100; my $L = length($_[0]); $L or return 100; # We will simply count the number of plain text characters # and the number of CR and LF characters in the string. my $TOTAL = $L; # Total length of string my $C; my $TX = 0; # Number of plain text characters my $CR = 0; # Number of 0D characters my $LF = 0; # Number of 0A characters while ($L--) { $C = vec($_[0], $L, 8); next if ($C > 126); if ($C > 31 || $C == 9) { $TX++; next; } $LF++ if ($C == 10); $CR++ if ($C == 13); } # Now, we will try to determine what type of string # we're dealing with. There are 5 possibilities: # LINUX, DOS, OSX, MIXED, or "undetermined." # # Explanation of formats: # * OSX files contain CR characters as line break. # * Linux text files contain LF characters as line break. # * DOS text files contain an equal number of CR and LF # characters in pairs. # * "MIXED" means that the string contains an unequal number of # both CR and LF characters, so this may be a binary string. # * "Undeteremined" means that the string does not contain # any line break characters at all, so it could be either # a DOS text or Linux text or anything. $C = $LF ? 0x100 : 0; # We use $C to store the string format. $C |= 0x200 if ($CR); $C |= 0x400 if ($CR == $LF); # The percentage is stored in the lower 7 bits, # and the format is stored in bits 9-11. return $C | int(($TX+$LF+$CR) / $TOTAL * 100); } ################################################## # # This function is automatically called by CheckDIR() every time # a file is found. This function gets the full name of the file. # Returns the percentage as an integer (0-100). The value "100" # means that 100% of the file is plain text with no binary # characters in it at all. # # Usage: INTEGER = CatchFile(FULLNAME) <-- Called by CheckDIR() # sub CatchFile { my $F = _FileName(\@_); # Remove unsafe characters from file name # Check file extension to see if we should skip this file if (length($F) > 4) { my $EXT = rindex($F, '.'); if ($EXT >= 0) { $EXT = uc(substr($F, $EXT, length($F))) . ' '; return -1 if (index($SKIP, $EXT) >= 0); # SKIP FILE } } -e $F or return 0; # File exists? # To save time, we're only going to process small files... my $FILE_SIZE = -s $F; if ($FILE_SIZE > $MAXSIZE) { # Print file size in Megabytes printf("%dMB\t too big ", int(($FILE_SIZE+999999) / 1000000) ); } else { # Print file size in Kilobytes printf("%dKB\t", int(($FILE_SIZE+999) / 1000) ); # ANALYZE FILE CONTENTS... my $S = ReadFile($F); my $FORMAT = isText($S); my $PERCENT = $FORMAT & 255; $FORMAT >>= 8; $S = ''; if ($FORMAT == 1) { $S = '/' x 13; } # LINUX if ($FORMAT == 2) { $S = 'O' x 13; } # OSX if ($FORMAT == 3) { $S = 'x ' x 7; } # MIXED if ($FORMAT == 4) { $S = 't' x 13; } # undetermined if ($FORMAT == 7) { $S = '$' x 13; } # DOS my $VISUAL = int($PERCENT * .13); print '|', substr($S, 0, $VISUAL), '|', 'b' x (13-$VISUAL); } print " $F\n"; } ################################################## # # This function reads the contents of a folder and calls # CatchFile() for each file that was found. # Usage: CheckDIR(PATH) # sub CheckDIR { my $PATH = defined $_[0] ? $_[0] : ''; length($PATH) or return; # Change / to \ on Windows computers if ($MSWIN) { $PATH =~ tr#/#\\#; } # print "Reading directory: $PATH\n"; # Make sure that PATH ends with a backslash or forward slash if (index("/\\", substr($PATH, length($PATH)-1, 1)) < 0) { $PATH .= ($MSWIN ? "\\" : '/'); } my $DIR; my $FULLNAME; opendir($DIR, $PATH) or return; my $NAME = 1; while ($NAME) { $NAME = readdir($DIR); defined $NAME or last; $FULLNAME = "$PATH$NAME"; if (-d($FULLNAME)) { # Check into subdirectory if RECURSIVE == 1 # Skip directory if its name starts with "." if ($RECURSIVE) { CheckDIR($FULLNAME) unless (vec($NAME, 0, 8) == 46); } next; } CatchFile($FULLNAME); } closedir($DIR); } ################################################## # # Checks if the argument string ends with a forward slash or # backslash, and if it does, then removes it and returns 1, # or returns 0 if no slash was found at the end of the string. # Usage: INTEGER = EndsWithSlash(STRING) # sub EndsWithSlash { my $P = defined $_[0] ? $_[0] : ''; length($P) or return 0; index("\\/", substr($P, length($P)-1, 1)) >= 0 or return 0; chop $_[0]; return 1; } ################################################## # Usage: STRING = _FileName(\@_) - Removes the first argument from @_ just like shift() does and returns a file name. This function does not check syntax, but it does remove some illegal characters (<>|*?) from the name that obviously should not occur in a file name. If the file name doesn't contain any valid characters, then returns an empty string. sub _FileName { @_ or return ''; my $N = shift; $N = shift(@$N); defined $N or return ''; length($N) or return ''; my $c; my $j = 0; my $V = 0; for (my $i = 0; $i < length($N); $i++) { $c = vec($N, $i, 8); next if ($c == 63 || $c == 42 || $c < 32); last if ($c == 60 || $c == 62 || $c == 124); if ($c > 32) { $V = $j + 1; } if ($V) { $i == $j or vec($N, $j, 8) = $c; $j++; } } return substr($N, 0, $V); } # Usage: STRING = ReadFile(FILE_NAME, [BYTES_TO_READ, [START]]) - Reads an entire file in binary mode and returns the contents in one string. A second argument may be provided to read only a certain number of bytes. And a third argument may be provided to set the file pointer to a certain address before reading. sub ReadFile { my $F = _FileName(\@_); length($F) or return ''; my $L = @_ ? shift : 99999999; defined $L or return ''; my $A = @_ ? shift : 0; defined $A or $A = 0; -f $F or return ''; -s $F or return ''; my $B; open FH, "<$F" or return ''; binmode FH; if ($A) { sysseek(FH, $A, 0); } sysread FH, $B, $L; close FH; defined $B or return ''; return $B; } ################################################## # # This function returns the current working directory. # Usage: STRING = GetCurrentDirectory() # sub GetCurrentDirectory { Trim($OS < 3 ? `cd` : (exists($ENV{PWD}) ? $ENV{PWD} : `pwd`)); } ################################################## # # This function returns the OS type as a number. # 1=DOS 2=WINDOWS 3=LINUX 4=OSX 9=OTHER # Usage: INTEGER = GetOS() # sub GetOS { my $OS = uc($^O); index($OS, 'LINUX') >= 0 ? 3 : index($OS, 'MSWIN') >= 0 ? 2 : index($OS, 'DOS') >= 0 ? 1 : index($OS, 'DARWIN') >= 0 ? 4 : 9; } ################################################## # v2019.8.25 # Removes whitespace from before and after STRING. # Whitespace is here defined as any byte whose # ASCII value is less than 33. That includes # tabs, esc, null, vertical tab, new lines, etc. # Usage: STRING = Trim(STRING) # sub Trim { defined $_[0] or return ''; my $L = length($_[0]); $L or return ''; my $P = 0; while ($P <= $L && vec($_[0], $P++, 8) < 33) {} for ($P--; $P <= $L && vec($_[0], $L--, 8) < 33;) {} substr($_[0], $P, $L - $P + 2) } ##################################################