#!/usr/bin/perl ##################################################################### # # CRLF v1.0 Last Update: 2020.1.20 # # This perl script searches for PL, PM, CGI, HTM, HTML, CSS, JS, # TXT and LOG files in the current directory and changes the # line-breaks to CR+LF (DOS format). The program will skip files # that larger than 1MB or whose name contains Unicode characters. # # Written by Zsolt N Perry on January 20, 2020 in Pensacola, Fla. # For questions, comments, feature requests, or bug reports, # write to zsnp@juno.com. # # This script does not require any Perl modules. This Perl script # was tested with TinyPerl 5.8 under Windows XP. # This file was downloaded from http://www.wzsn.net/perl # # THIS ENTIRE SOURCE CODE IS FREEWARE. # If you want to incorporate it or parts of it into your program, # just copy and paste whatever you need! There is no need to ask # for permission. This software is distributed "AS IS." There is # no warranty of any kind. The author will not be held liable for # any loss resulting from the use or misuse of this software. # ##################################################################### use 5.004; use strict; use warnings; $| = 1; ################################################## ## SETTINGS my $BR = "\r\n"; # Insert DOS line break sequence my $MAXSIZE = 1000000; # Skip very large files (over 1MB) my $MINTEXT = 95; # File must be at least 95% plain text my $RECURSIVE = 0; # Recursive mode? my $TRIM = 1; # Remove whitespace from the end of lines? my $DONT_WRITE = 0; # Don't save any changes? my $TOUCH = 0; # Update a file's last-modified time? # Process files with the following extensions: my $FILES = '.PM .PL .CGI .HTM .HTML .JS .CSS .TXT .LOG'; my @FILE_LIST; ################################################## ## VARIABLES # This script uses a lot of global variables # instead of passing a bunch of parameters around. my $i; # file ptr my $M; # File mode/attribute my $T; # File last-modified date&time my $OS; # operating system my $DIR; # work directory my $BRU; # Line break for unicode files my $BRUB; # Line break for unicode files (big endian) my $LINES = 0; # Total number of lines in a file my $INPUT = ''; # Read buffer my $OUTPUT = ''; # Write buffer my $UNICODE = 0; # Unicode format? my $text_end; # Marks the end of text my $line_start; # Beginning of line ################################################## ## PROGRAM STARTS HERE $OS = GetOS(); $DIR = GetCWD(); print "Current directory : $DIR\n"; CheckDIR($DIR); exit; ################################################## # # This function reads the contents of a folder and # calls CatchFile() for each file that was found. # Usage: CheckDIR(PATH) # sub CheckDIR { my $PATH = defined $_[0] ? $_[0] : ''; length($PATH) or return; # Change / to \ on Windows and DOS computers if ($OS < 3) { $PATH =~ tr#/#\\#; } if ($DONT_WRITE == 0) { print "Reading directory: $PATH\n\n"; } # Make sure that PATH ends with a backslash or forward slash if (index("/\\", substr($PATH, length($PATH)-1, 1)) < 0) { $PATH .= ($OS < 3 ? "\\" : '/'); } my $FULLNAME; local *DIR; opendir(DIR, $PATH) or return; my $NAME = 1; while ($NAME) { $NAME = readdir(DIR); defined $NAME or last; $FULLNAME = "$PATH$NAME"; if (-d($FULLNAME)) { # Check into subdirectory if RECURSIVE == 1 # Skip directory if its name starts with "." if ($RECURSIVE) { CheckDIR($FULLNAME) unless (vec($NAME, 0, 8) == 46); } next; } CatchFile($PATH, $NAME); } closedir(DIR); } ################################################## # # This function is automatically called # by CheckDIR() every time a file is found. # Its role is to see if the current file's name # matches the list of names we're looking for # specified in $FILES and @FILE_LIST. # Usage: CatchFile(PATH_ONLY, NAME_ONLY) # sub CatchFile { my ($P, $F) = @_; if (length($F) > 2) # Match extension { (my $EXT = rindex($F, '.')) >= 0 or next; $EXT = uc(substr($F, $EXT, length($F))) . ' '; if (index($FILES, $EXT) >= 0) { UpdateFile("$P$F"); return; } } } ################################################## # # This function scans the file contents and replaces # various line-break sequences with a new sequence. # Usage: UpdateFile(FILENAME) # sub UpdateFile { $INPUT = ''; $OUTPUT = ''; my $F = shift; if (-s $F > $MAXSIZE) { print "$F\n SKIPPED. File is bigger than ", Commify($MAXSIZE), " bytes.\n\n"; return; } print "\n", $F; return if ($DONT_WRITE); print ' ...'; # We do a series of checks first. Return if we # can't read the file or if the file size is zero. my $SIZE = ReadFile($F); if ($SIZE == 0) { print "SKIPPED. Nothing to read.\n\n"; return; } print Commify($SIZE), " BYTES READ.\n"; my $X = AnalyzeFile(); my $PERCENT_TEXT = $X & 255; my $FLAGS = ($X >> 8) & 7; if ($X & 0x800) # Possibly unicode format { print "Cannot decode unicode format. Sorry.\n"; return; } # Return if the file contains less than 95% plain text # or whatever minimum amount is specified in $MINTEXT # unless it's a unicode file. if ($UNICODE && $PERCENT_TEXT > 45) { print 'UNICODE TEXT. '; } elsif ($PERCENT_TEXT < $MINTEXT) { print "SKIPPED. File is only $PERCENT_TEXT% plain text! Should be at least $MINTEXT%.\n\n"; return; } # File contains NO line-breaks at all? if ($FLAGS == 4 && !$TRIM) { print "SKIPPED. File is all one line!\n\n"; return; } # Input file is in Linux format: my $SAVE_AND_RETURN = 0; if ($FLAGS == 1 && !$TRIM) { # Converting from Linux to DOS: if ($BR eq "\r\n") { $INPUT =~ s/\n/\r\n/g; } $SAVE_AND_RETURN = 1; } if ($SAVE_AND_RETURN == 1) { ($OUTPUT, $INPUT) = ($INPUT, ''); SaveFile($F); return; } # If we get to this point, it means our input file # contains mixed content. So, we will read it # slowly, processing one byte at a time. my $c = 0; # current character my $prev; # previous character $LINES = 0; $text_end = 0; $line_start = -2; my $RESTORE = $BR; $BR = $BRU if ($UNICODE == 1); $BR = $BRUB if ($UNICODE == 2); for ($i = 0; $i < $SIZE; $i++) { $c = vec($INPUT, $i, 8); next if ($UNICODE && $c == 0); if ($c == 10 || $c == 13) # Detect LF CR { if (PrintLine()) { } # PrintLine() returns 0 if we had nothing to print, # so then we check if the previous character was # also the same new-line character... elsif ($prev == $c) { # OK. What we have here is two LF-LF characters # or two CR-CR characters one after the other, # which means we should print a blank line. $OUTPUT .= $BR; $LINES++; } # Okay, what if the previous character was a # different new-line character? elsif ($prev == 10 || $prev == 13) { # We may have the second byte of a CR/LF pair # here, so just skip this byte. And print a # New-Line sequence in the next cycle. $line_start = -2; } } else # All kinds of other characters: { if ($line_start < 0) { $line_start = $i; } ($c == 32 || $c == 9) or $text_end = $i; } $prev = $c; # Save previous char } if ($line_start >= 0) { PrintLine(); } # Print the last line (if any) $BR = $RESTORE; SaveFile($F . '.bak'); # Overwrite original file } ################################################## # # This function sends one line to a global variable # named $OUTPUT which collects all the data to # be written to a file. The line to be stored may # contain text only, or it may be just a blank line. # # When $line_start == -2, we need to store a blank # line. When $line_start == -1, we're in the middle # of a line-break sequence. This function returns 1 # if something was stored, or 0 otherwise. # # Usage: INTEGER = PrintLine() # sub PrintLine { if ($line_start == -1) # Nothing to print yet? { return 0; } if ($line_start >= 0) # Got something to print! { if ($UNICODE == 1) { $OUTPUT .= substr($INPUT, $line_start, ($TRIM ? $text_end + 1 : $i) - $line_start + 1); } else { $OUTPUT .= substr($INPUT, $line_start, ($TRIM ? $text_end + 1 : $i) - $line_start); } } $OUTPUT .= $BR; # Print New-line sequence $LINES++; $line_start = -1; return 1; } ################################################## # # This function writes the contents of $OUTPUT # to a file, replacing the existing file, # and prints a little summary. # sub SaveFile { my $F = shift; if ($INPUT eq $OUTPUT) { print "SKIPPED. NO CHANGES NEEDED.\n\n"; return; } if ($DONT_WRITE) { print "SKIPPED. NO CHANGES MADE!\n\n"; return; } # if ($CREATE_BAK) # { system("ren $F *.BAK"); } chmod $M | 0660, $F; # Make file writable if (open(FH, ">$F") == 0) { print "\tCAN'T WRITE!\n\n"; return; } binmode FH; print FH $OUTPUT; close FH; print "\t", Commify($LINES), ' line(s) ', Commify(length($OUTPUT)), " bytes written.\n\n"; chmod $M, $F; # Restore original mode # The date & time the file was last modified will be reset # back to original, but the file last-access date # will most likely change to the current time. $TOUCH or utime time, $T, $F; } ################################################## # # This function reads the entire contents of a # file into a global variable named $INPUT and # returns the number of bytes read OR # returns 0 if something went wrong. # # Usage: INTEGER = ReadFile(FILENAME) # sub ReadFile { my $F = shift; $LINES = 0; $INPUT = ''; $OUTPUT = ''; my $SIZE = -s $F; $SIZE or return 0; local *FH; sysopen(FH, $F, 0) or return 0; my @INFO = stat(FH); $M = $INFO[2]; # Get file mode/attribute $T = $INFO[9]; # Get last-modified date binmode FH; sysread(FH, $INPUT, $SIZE) or return 0; close FH; $SIZE == length($INPUT) or return 0; return $SIZE; } ################################################## # # This function scans the content of $INPUT string # looking for special characters and determines # what percentage of the string is plain text, and # also tries to determine the text format. # # Returns an integer whose lower 8 bits is the percentage (0-100). # Bit 9 will be set if any LF characters were found. # Bit 10 will be set if any CR characters were found. # Bit 11 will be set if there are equal number of CR and LF # characters in the string. # Bit 12 will be set if there are lots of zeros in the string, # which is an indication that it may be unicode format. # These can be interpreted as follows: # # .000 = Format is undetermined. # .001 = LINUX string (LF only) # .010 = OSX string (CR only) # .011 = MIXED format # .111 = DOS text (CR-LF pairs) # # Usage: INTEGER = AnalyzeFile() # sub AnalyzeFile { my $L = length($INPUT); # We will simply count the number of plain text characters # and the number of CR and LF characters in the string. my $c; my $TOTAL = $L; # Total length of string my $TX = 0; # Number of plain text characters my $CR = 0; # Number of 0D characters my $LF = 0; # Number of 0A characters my $NUL = 0; # Number of 00 characters while ($L--) { $c = vec($INPUT, $L, 8); next if ($c > 126); if ($c > 31 || $c == 9) { $TX++; next; } $LF++ if ($c == 10); $CR++ if ($c == 13); $NUL++ if ($c == 0); } $LINES = ($LF > $CR) ? $LF : $CR; # Number of line breaks # Now, we will try to determine what type of string # we're dealing with. There are 5 possibilities: # LINUX, DOS, OSX, MIXED, or "undetermined." # # Explanation of formats: # * OSX files contain CR characters as line break. # * Linux text files contain LF characters as line break. # * DOS text files contain an equal number of CR and LF # characters in pairs. # * "MIXED" means that the string contains an unequal number of # both CR and LF characters, so this may be a binary data. # * "Undeteremined" means that the string does not contain # any line break characters at all, so it could be either # a DOS text or Linux text or anything. $c = $LF ? 0x100 : 0; # Now we use $c to store the string format. $c |= 0x200 if ($CR); $c |= 0x400 if ($CR == $LF); # Check for possible unicode format $NUL = int($NUL / $TOTAL * 100); # Calculate % of null chars $c |= 0x800 if ($NUL > 40 && $NUL < 60); # The percentage is stored in the lower 8 bits, # and the format is stored in bits 9-12. return $c | int(($TX+$LF+$CR) / $TOTAL * 100); } ################################################## # # This function returns true if a filename matches a # certain wildcard pattern. There may be several # question marks in the search pattern, but only # one asterisk is allowed! The matching is # NOT case sensitive! # Usage: INTEGER = isMatch(FILENAME, WILDCARD) # # Example: isMatch("New_Document.txt", "n*.txt") = 1 # sub isMatch { @_ > 1 or return 0; my $F = shift; defined $F or return 0; length($F) or return 0; my $W = shift; defined $W or return 0; length($W) or return 0; $F = uc($F); $W = uc($W); # If there are invalid characters... if (CountChars($W.$F, '<|>')) { return 0; } # If there aren't any wildcards at all... if (CountChars($W, '*?') == 0) { return ($F eq $W) ? 1 : 0; } # Match what's before the asterisk... return 0 unless (_isMatch($F, $W, 1)); # Match what comes after the asterisk... return _isMatch($F, $W, -1); } ################################################## # # This function is called by isMatch() # This function compares two strings and returns 1 if # both strings match until the first asterisk. # This function can start comparing strings # from the beginning or starting from the end! # DIRECTION must be either 1 or -1. # Usage: INTEGER = _isMatch(FILENAME, WILDCARD, DIRECTION) # sub _isMatch { my $F = shift; my $f; my $LF = length($F)-1; my $W = shift; my $w; my $LW = length($W)-1; my $DIR = shift; my $STOP = $LW; my $START = 0; my $FSTART = 0; if ($DIR < 0) { $STOP = 0; $START = $LW; $FSTART = $LF; } while ($START != $STOP) { $w = vec($W, $START, 8); # Grab byte from wildcard pattern $f = vec($F, $FSTART, 8); # Grab byte from filename $START += $DIR; $FSTART += $DIR; if ($w == 42) # ASTERISK? { return 1; } else { # If the character is "?" then skip, but if # it's not "?", then the characters must match. if ($w != 63) { ($f == $w) or return 0; } } } return 1; } ################################################## # # This function counts how many times STRING contains # any of the characters of SUBSTR. # Usage: INTEGER = CountChars(STRING, SUBSTR) # sub CountChars { @_ > 1 or return 0; my $S = shift; defined $S or return 0; length($S) or return 0; my $L = shift; defined $L or return 0; length($L) or return 0; my $P; my $i = length($L); my $N = 0; while ($i-- > 0) { $P = 0; while (($P = 1+index($S, substr($L, $i, 1), $P)) > 0) { $N++; } } return $N; } ################################################## # # This function returns the OS type as a number. # 1=DOS 2=WINDOWS 3=LINUX 4=OSX 9=OTHER # sub GetOS { my $OS = uc($^O); index($OS, 'LINUX') >= 0 ? 3 : index($OS, 'MSWIN') >= 0 ? 2 : index($OS, 'DOS') >= 0 ? 1 : index($OS, 'DARWIN') >= 0 ? 4 : 9; } ################################################## # # This function returns the NAME, the PATH, or the # CONTENT of this perl script depending on which # item is requested in the argument. # # Usage: STRING = Self(STRING) # # Examples: # Self('NAME') # Self('PATH') # Self('CONTENT') # sub Self { my $S = defined $_[0] ? uc($_[0]) : ''; my $P = rindex($0, ($OS < 3 ? "\\" : '/')); if ($S eq 'NAME') { return ($P < 0) ? $0 : substr($0, $P+1, length($0)); } if ($S eq 'PATH') { return ($P < 0) ? $0 : substr($0, 0, $P); } local *FH; sysopen(FH, $0, 0); binmode FH; sysread(FH, $S, -s $0); close FH; return $S; } ################################################## # # This function removes all whitespace from before # and after text and returns a new string. This # function removes every character whose ASCII # value is less than 33. This includes tab, space, # null, vertical tab, esc, new lines, etc.. # # Usage: STRING = Trim(STRING) # sub Trim { my $S = defined $_[0] ? $_[0] : ''; my $L = length($S) or return ''; my $START = 0; my $LAST = 0; while ($L--) { if (vec($S, $L, 8) > 32) { $START = $L; $LAST or $LAST = $L + 1; } } return substr($S, $START, $LAST - $START); } ################################################## # # This function returns the current working directory. # If the current working directory cannot be determined # then returns the first argument. If there are no # arguments passed, then returns the path where this # script is located. # # Usage: STRING = GetCWD( [DEFAULT] ) # sub GetCWD { my $DIR = Trim($OS < 3 ? `cd` : `pwd`); return length($DIR) ? $DIR : defined $_[0] ? $_[0] : Self('PATH'); } ################################################## # # This function inserts commas into a number at # every 3 digits and returns a string. # Copied from www.PerlMonks.org/?node_id=157725 # sub Commify { my $N = reverse $_[0]; $N =~ s/(\d\d\d)(?=\d)(?!\d*\.)/$1,/g; return scalar reverse $N; } ################################################## # # This function converts a binary string to a # series of hex numbers. # Usage: STRING = Str2Hex(STRING) # sub Str2Hex { my $X = ''; for (my $i = 0; $i < length($_[0]); $i++) { $X .= sprintf('%.02X', vec($_[0], $i, 8)); } return $X; } ################################################## # # This function works just like the index() function # except it can compare more than one string. It will # return 0 if there is no match. Returns 1 if the first # substring was found. Returns 2 if the second substring # was found or 3 if both the first and second were found... # Usage: INTEGER = Find(STRING, SUBSTR1, [SUBSTR2...]) # sub Find { my $P = 1; my $BIT = 1; my $FOUND = 0; while (defined $_[$P]) { $FOUND |= index($_[0], $_[$P++]) < 0 ? 0 : $BIT; $BIT += $BIT; } return $FOUND; } ################################################## # # Prints the description of this program and exits. # Usage: PrintHelp_and_Exit() # sub PrintHelp_and_Exit() { my $c; print "\n $0\n\n"; ReadFile($0); (my $S = index($INPUT, '# ')) >= 0 or exit; my $E = index($INPUT, '###', $S); for (my $i = $S; $i < $E; $i++) { $c = vec($INPUT, $i, 8); if ($c < 32 || $c == 35) { print "\n" if ($c == 13); next; } print chr($c); } exit; } ################################################## # # Prints an error message and exits. # Usage: Abort(STRING) # sub Abort { print "\nOops. $_[0]\nType perl newline.pl ? for help.\n"; exit; } ################################################## # # Returns the ASCII code of the first character # of string. Returns 0 if the string is empty. # Usage: INTEGER = ASC(STRING) # sub ASC { return defined $_[0] ? vec($_[0], 0, 8) : 0; } ################################################## # # This function tests if a file is read-only by # trying to open it for writing. Returns 1 if # the file is read-only, or 0 if not. # Usage: INTEGER = isReadOnly(FILENAME) # sub isReadOnly { local *FH; open(FH, ">>$_[0]") or return 1; close FH; return 0; } ##################################################