#!/usr/bin/perl

#####################################################################
#
#  CRLF v1.0                                   Last Update: 2020.1.20
#
#  This perl script searches for PL, PM, CGI, HTM, HTML, CSS, JS,
#  TXT and LOG files in the current directory and changes the
#  line-breaks to CR+LF (DOS format). The program will skip files
#  that larger than 1MB or whose name contains Unicode characters.
#
#  Written by Zsolt N Perry on January 20, 2020 in Pensacola, Fla.
#  For questions, comments, feature requests, or bug reports,
#  write to zsnp@juno.com.
#
#  This script does not require any Perl modules. This Perl script
#  was tested with TinyPerl 5.8 under Windows XP.
#  This file was downloaded from http://www.wzsn.net/perl
#
#               THIS ENTIRE SOURCE CODE IS FREEWARE.
#  If you want to incorporate it or parts of it into your program,
#  just copy and paste whatever you need! There is no need to ask
#  for permission. This software is distributed "AS IS." There is
#  no warranty of any kind. The author will not be held liable for
#  any loss resulting from the use or misuse of this software.
#
#####################################################################

use 5.004;
use strict;
use warnings;
$| = 1;

##################################################
## SETTINGS

my $BR          = "\r\n";  # Insert DOS line break sequence
my $MAXSIZE     = 1000000; # Skip very large files (over 1MB)
my $MINTEXT     = 95;      # File must be at least 95% plain text
my $RECURSIVE   = 0;       # Recursive mode?
my $TRIM        = 1;  # Remove whitespace from the end of lines?
my $DONT_WRITE  = 0;  # Don't save any changes?
my $TOUCH       = 0;  # Update a file's last-modified time?

# Process files with the following extensions:
my $FILES = '.PM .PL .CGI .HTM .HTML .JS .CSS .TXT .LOG';
my @FILE_LIST;

##################################################
## VARIABLES

# This script uses a lot of global variables
# instead of passing a bunch of parameters around.

my $i;               # file ptr
my $M;               # File mode/attribute
my $T;               # File last-modified date&time
my $OS;              # operating system
my $DIR;             # work directory
my $BRU;             # Line break for unicode files
my $BRUB;            # Line break for unicode files (big endian)
my $LINES = 0;       # Total number of lines in a file
my $INPUT = '';      # Read buffer
my $OUTPUT = '';     # Write buffer
my $UNICODE = 0;     # Unicode format?
my $text_end;        # Marks the end of text
my $line_start;      # Beginning of line

##################################################
## PROGRAM STARTS HERE

$OS  = GetOS();
$DIR = GetCWD();
print "Current directory : $DIR\n";
CheckDIR($DIR);
exit;

##################################################
#
# This function reads the contents of a folder and
# calls CatchFile() for each file that was found.
# Usage: CheckDIR(PATH)
#
sub CheckDIR
{
  my $PATH = defined $_[0] ? $_[0] : '';
  length($PATH) or return;

  # Change / to \ on Windows and DOS computers
  if ($OS < 3) { $PATH =~ tr#/#\\#; }

  if ($DONT_WRITE == 0)
  { print "Reading directory: $PATH\n\n"; }

  # Make sure that PATH ends with a backslash or forward slash
  if (index("/\\", substr($PATH, length($PATH)-1, 1)) < 0)
  { $PATH .= ($OS < 3 ? "\\" : '/'); }

  my $FULLNAME;
  local *DIR;
  opendir(DIR, $PATH) or return;
  my $NAME = 1;
  while ($NAME)
  {
    $NAME = readdir(DIR);
    defined $NAME or last;
    $FULLNAME = "$PATH$NAME";
    if (-d($FULLNAME))
    {
      # Check into subdirectory if RECURSIVE == 1
      # Skip directory if its name starts with "."
      if ($RECURSIVE)
      { CheckDIR($FULLNAME) unless (vec($NAME, 0, 8) == 46); }
      next;
    }
    CatchFile($PATH, $NAME);
  }
  closedir(DIR);
}

##################################################
#
# This function is automatically called
# by CheckDIR() every time a file is found.
# Its role is to see if the current file's name
# matches the list of names we're looking for
# specified in $FILES and @FILE_LIST.
# Usage: CatchFile(PATH_ONLY, NAME_ONLY)
#
sub CatchFile
{
  my ($P, $F) = @_;
  if (length($F) > 2)  # Match extension
  {
    (my $EXT = rindex($F, '.')) >= 0 or next;
    $EXT = uc(substr($F, $EXT, length($F))) . ' ';
    if (index($FILES, $EXT) >= 0)
    {
      UpdateFile("$P$F");
      return;
    }
  }
}

##################################################
#
# This function scans the file contents and replaces
# various line-break sequences with a new sequence.
# Usage: UpdateFile(FILENAME)
#
sub UpdateFile
{
  $INPUT = '';
  $OUTPUT = '';
  my $F = shift;

  if (-s $F > $MAXSIZE)
  { print "$F\n SKIPPED. File is bigger than ", Commify($MAXSIZE), " bytes.\n\n"; return; }

  print "\n", $F;
  return if ($DONT_WRITE);
  print ' ...';

  # We do a series of checks first. Return if we
  # can't read the file or if the file size is zero.
  my $SIZE = ReadFile($F);

  if ($SIZE == 0)
  { print "SKIPPED. Nothing to read.\n\n"; return; }
  print Commify($SIZE), " BYTES READ.\n";

  my $X = AnalyzeFile();
  my $PERCENT_TEXT = $X & 255;
  my $FLAGS = ($X >> 8) & 7;
  if ($X & 0x800)     # Possibly unicode format
  {
    print "Cannot decode unicode format. Sorry.\n";
    return;
  }

  # Return if the file contains less than 95% plain text
  # or whatever minimum amount is specified in $MINTEXT
  # unless it's a unicode file.
  if ($UNICODE && $PERCENT_TEXT > 45)
  { print 'UNICODE TEXT. '; }
  elsif ($PERCENT_TEXT < $MINTEXT)
  { print "SKIPPED. File is only $PERCENT_TEXT% plain text! Should be at least $MINTEXT%.\n\n"; return; }

  # File contains NO line-breaks at all?
  if ($FLAGS == 4 && !$TRIM)
  { print "SKIPPED. File is all one line!\n\n"; return; }

  # Input file is in Linux format:
  my $SAVE_AND_RETURN = 0;
  if ($FLAGS == 1 && !$TRIM)
  {
    # Converting from Linux to DOS:
    if ($BR eq "\r\n") { $INPUT =~ s/\n/\r\n/g; }
    $SAVE_AND_RETURN = 1;
  }

  if ($SAVE_AND_RETURN == 1)
  {
    ($OUTPUT, $INPUT) = ($INPUT, '');
    SaveFile($F);
    return;
  }

  # If we get to this point, it means our input file
  # contains mixed content. So, we will read it
  # slowly, processing one byte at a time.

  my $c = 0;       # current character
  my $prev;        # previous character
  $LINES = 0;
  $text_end = 0;
  $line_start = -2;

  my $RESTORE = $BR;
  $BR = $BRU  if ($UNICODE == 1);
  $BR = $BRUB if ($UNICODE == 2);
  for ($i = 0; $i < $SIZE; $i++)
  {
    $c = vec($INPUT, $i, 8);
    next if ($UNICODE && $c == 0);

    if ($c == 10 || $c == 13)     # Detect LF CR
    {
      if (PrintLine()) { }

      # PrintLine() returns 0 if we had nothing to print,
      # so then we check if the previous character was
      # also the same new-line character...

      elsif ($prev == $c)
      {
        # OK. What we have here is two LF-LF characters
        # or two CR-CR characters one after the other,
        # which means we should print a blank line.

        $OUTPUT .= $BR;
        $LINES++;
      }

      # Okay, what if the previous character was a
      # different new-line character?

      elsif ($prev == 10 || $prev == 13)
      {
        # We may have the second byte of a CR/LF pair
        # here, so just skip this byte. And print a
        # New-Line sequence in the next cycle.

        $line_start = -2;
      }
    }
    else	  # All kinds of other characters:
    {
      if ($line_start < 0) { $line_start = $i; }
      ($c == 32 || $c == 9) or $text_end = $i;
    }
    $prev = $c;   # Save previous char
  }
  if ($line_start >= 0)
  { PrintLine(); } # Print the last line (if any)
  $BR = $RESTORE;
  SaveFile($F . '.bak');    # Overwrite original file
}

##################################################
#
# This function sends one line to a global variable
# named $OUTPUT which collects all the data to
# be written to a file. The line to be stored may
# contain text only, or it may be just a blank line.
#
# When $line_start == -2, we need to store a blank
# line. When $line_start == -1, we're in the middle
# of a line-break sequence. This function returns 1
# if something was stored, or 0 otherwise.
#
# Usage: INTEGER = PrintLine()
#
sub PrintLine
{
  if ($line_start == -1)   # Nothing to print yet?
  {
    return 0;
  }
  if ($line_start >= 0)    # Got something to print!
  {
    if ($UNICODE == 1)
    {
      $OUTPUT .= substr($INPUT, $line_start, ($TRIM ? $text_end + 1 : $i) - $line_start + 1);
    }
    else
    {
      $OUTPUT .= substr($INPUT, $line_start, ($TRIM ? $text_end + 1 : $i) - $line_start);
    }
  }
  $OUTPUT .= $BR;          # Print New-line sequence
  $LINES++;
  $line_start = -1;
  return 1;
}

##################################################
#
# This function writes the contents of $OUTPUT
# to a file, replacing the existing file,
# and prints a little summary.
#
sub SaveFile
{
  my $F = shift;

  if ($INPUT eq $OUTPUT)
    { print "SKIPPED. NO CHANGES NEEDED.\n\n"; return; }

  if ($DONT_WRITE)
    { print "SKIPPED. NO CHANGES MADE!\n\n"; return; }

#  if ($CREATE_BAK)
#    { system("ren $F *.BAK"); }

  chmod $M | 0660, $F;     # Make file writable

  if (open(FH, ">$F") == 0)
  {
    print "\tCAN'T WRITE!\n\n";
    return;
  }
  binmode FH;
  print FH $OUTPUT;
  close FH;

  print "\t", Commify($LINES), ' line(s) ', Commify(length($OUTPUT)), " bytes written.\n\n";

  chmod $M, $F;              # Restore original mode

# The date & time the file was last modified will be reset
# back to original, but the file last-access date
# will most likely change to the current time.

  $TOUCH or utime time, $T, $F;
}

##################################################
#
# This function reads the entire contents of a
# file into a global variable named $INPUT and
# returns the number of bytes read OR
# returns 0 if something went wrong.
#
# Usage: INTEGER = ReadFile(FILENAME)
#
sub ReadFile
{
  my $F = shift;

  $LINES = 0;
  $INPUT = '';
  $OUTPUT = '';

  my $SIZE = -s $F;
  $SIZE or return 0;

  local *FH;
  sysopen(FH, $F, 0) or return 0;
  my @INFO = stat(FH);
  $M = $INFO[2];	# Get file mode/attribute
  $T = $INFO[9];	# Get last-modified date
  binmode FH;

  sysread(FH, $INPUT, $SIZE) or return 0;
  close FH;

  $SIZE == length($INPUT) or return 0;

  return $SIZE;
}

##################################################
#
# This function scans the content of $INPUT string
# looking for special characters and determines
# what percentage of the string is plain text, and
# also tries to determine the text format.
#
# Returns an integer whose lower 8 bits is the percentage (0-100).
# Bit 9 will be set if any LF characters were found.
# Bit 10 will be set if any CR characters were found.
# Bit 11 will be set if there are equal number of CR and LF
# characters in the string.
# Bit 12 will be set if there are lots of zeros in the string,
# which is an indication that it may be unicode format.
# These can be interpreted as follows:
#
#  .000 = Format is undetermined.
#  .001 = LINUX string (LF only)
#  .010 = OSX string (CR only)
#  .011 = MIXED format
#  .111 = DOS text (CR-LF pairs)
#
# Usage: INTEGER = AnalyzeFile()
#
sub AnalyzeFile
{
  my $L = length($INPUT);

  # We will simply count the number of plain text characters
  # and the number of CR and LF characters in the string.

  my $c;
  my $TOTAL = $L;   # Total length of string
  my $TX = 0;       # Number of plain text characters
  my $CR = 0;       # Number of 0D characters
  my $LF = 0;       # Number of 0A characters
  my $NUL = 0;      # Number of 00 characters

  while ($L--)
  {
    $c = vec($INPUT, $L, 8);
    next if ($c > 126);
    if ($c > 31 || $c == 9) { $TX++; next; }
    $LF++ if ($c == 10);
    $CR++ if ($c == 13);
    $NUL++ if ($c == 0);
  }

  $LINES = ($LF > $CR) ? $LF : $CR;  # Number of line breaks

  # Now, we will try to determine what type of string
  # we're dealing with. There are 5 possibilities:
  # LINUX, DOS, OSX, MIXED, or "undetermined."
  #
  # Explanation of formats:
  # * OSX files contain CR characters as line break.
  # * Linux text files contain LF characters as line break.
  # * DOS text files contain an equal number of CR and LF
  #   characters in pairs.
  # * "MIXED" means that the string contains an unequal number of
  #   both CR and LF characters, so this may be a binary data.
  # * "Undeteremined" means that the string does not contain
  #   any line break characters at all, so it could be either
  #   a DOS text or Linux text or anything.

  $c = $LF ? 0x100 : 0;   # Now we use $c to store the string format.
  $c |= 0x200 if ($CR);
  $c |= 0x400 if ($CR == $LF);

  # Check for possible unicode format
  $NUL = int($NUL / $TOTAL * 100);   # Calculate % of null chars
  $c |= 0x800 if ($NUL > 40 && $NUL < 60);

  # The percentage is stored in the lower 8 bits,
  # and the format is stored in bits 9-12.
  return $c | int(($TX+$LF+$CR) / $TOTAL * 100);
}

##################################################
#
# This function returns true if a filename matches a
# certain wildcard pattern. There may be several
# question marks in the search pattern, but only
# one asterisk is allowed! The matching is
# NOT case sensitive!
# Usage: INTEGER = isMatch(FILENAME, WILDCARD)
#
# Example: isMatch("New_Document.txt", "n*.txt") = 1
#
sub isMatch
{
  @_ > 1 or return 0;
  my $F = shift; defined $F or return 0; length($F) or return 0;
  my $W = shift; defined $W or return 0; length($W) or return 0;
  $F = uc($F);
  $W = uc($W);

  # If there are invalid characters...
  if (CountChars($W.$F, '<|>')) { return 0; }

  # If there aren't any wildcards at all...
  if (CountChars($W, '*?') == 0) { return ($F eq $W) ? 1 : 0; }

  # Match what's before the asterisk...
  return 0 unless (_isMatch($F, $W, 1));

  # Match what comes after the asterisk...
  return _isMatch($F, $W, -1);
}

##################################################
#
# This function is called by isMatch()
# This function compares two strings and returns 1 if
# both strings match until the first asterisk.
# This function can start comparing strings
# from the beginning or starting from the end!
# DIRECTION must be either 1 or -1.
# Usage: INTEGER = _isMatch(FILENAME, WILDCARD, DIRECTION)
#
sub _isMatch
{
  my $F = shift; my $f; my $LF = length($F)-1;
  my $W = shift; my $w; my $LW = length($W)-1;
  my $DIR = shift;
  my $STOP = $LW; my $START = 0; my $FSTART = 0;
  if ($DIR < 0) { $STOP = 0; $START = $LW; $FSTART = $LF; }
  while ($START != $STOP)
  {
    $w = vec($W, $START, 8);  # Grab byte from wildcard pattern
    $f = vec($F, $FSTART, 8); # Grab byte from filename
    $START += $DIR;
    $FSTART += $DIR;
    if ($w == 42)   # ASTERISK?
    { return 1; }
    else
    {
      # If the character is "?" then skip, but if
      # it's not "?", then the characters must match.
      if ($w != 63) { ($f == $w) or return 0; }
    }
  }
  return 1;
}
##################################################
#
# This function counts how many times STRING contains
# any of the characters of SUBSTR.
# Usage: INTEGER = CountChars(STRING, SUBSTR)
#
sub CountChars
{
  @_ > 1 or return 0;
  my $S = shift; defined $S or return 0; length($S) or return 0;
  my $L = shift; defined $L or return 0; length($L) or return 0;
  my $P;
  my $i = length($L);
  my $N = 0;
  while ($i-- > 0)
  {
    $P = 0;
    while (($P = 1+index($S, substr($L, $i, 1), $P)) > 0)
    { $N++; }
  }
  return $N;
}

##################################################
#
# This function returns the OS type as a number.
# 1=DOS 2=WINDOWS 3=LINUX 4=OSX 9=OTHER
#
sub GetOS
{
  my $OS = uc($^O);
  index($OS, 'LINUX')  >= 0 ? 3 :
  index($OS, 'MSWIN')  >= 0 ? 2 :
  index($OS, 'DOS')    >= 0 ? 1 :
  index($OS, 'DARWIN') >= 0 ? 4 : 9;
}

##################################################
#
# This function returns the NAME, the PATH, or the
# CONTENT of this perl script depending on which
# item is requested in the argument.
#
# Usage: STRING = Self(STRING)
#
# Examples:
#   Self('NAME')
#   Self('PATH')
#   Self('CONTENT')
#
sub Self
{
  my $S = defined $_[0] ? uc($_[0]) : '';
  my $P = rindex($0, ($OS < 3 ? "\\" : '/'));
  if ($S eq 'NAME') { return ($P < 0) ? $0 : substr($0, $P+1, length($0)); }
  if ($S eq 'PATH') { return ($P < 0) ? $0 : substr($0, 0, $P); }
  local *FH;
  sysopen(FH, $0, 0);
  binmode FH;
  sysread(FH, $S, -s $0);
  close FH;
  return $S;
}

##################################################
#
# This function removes all whitespace from before
# and after text and returns a new string. This
# function removes every character whose ASCII
# value is less than 33. This includes tab, space,
# null, vertical tab, esc, new lines, etc..
#
# Usage: STRING = Trim(STRING)
#
sub Trim
{
  my $S = defined $_[0] ? $_[0] : '';
  my $L = length($S) or return '';
  my $START = 0;
  my $LAST = 0;
  while ($L--)
  {
    if (vec($S, $L, 8) > 32)
    {
      $START = $L;
      $LAST or $LAST = $L + 1;
    }
  }
  return substr($S, $START, $LAST - $START);
}

##################################################
#
# This function returns the current working directory.
# If the current working directory cannot be determined
# then returns the first argument. If there are no
# arguments passed, then returns the path where this
# script is located.
#
# Usage: STRING = GetCWD( [DEFAULT] )
#
sub GetCWD
{
  my $DIR = Trim($OS < 3 ? `cd` : `pwd`);
  return length($DIR) ? $DIR : defined $_[0] ? $_[0] : Self('PATH');
}

##################################################
#
# This function inserts commas into a number at
# every 3 digits and returns a string.
# Copied from www.PerlMonks.org/?node_id=157725
#
sub Commify
{
  my $N = reverse $_[0];
  $N =~ s/(\d\d\d)(?=\d)(?!\d*\.)/$1,/g;
  return scalar reverse $N;
}

##################################################
#
# This function converts a binary string to a
# series of hex numbers.
# Usage: STRING = Str2Hex(STRING)
#
sub Str2Hex
{
  my $X = '';
  for (my $i = 0; $i < length($_[0]); $i++)
  { $X .= sprintf('%.02X', vec($_[0], $i, 8)); }
  return $X;
}

##################################################
#
# This function works just like the index() function
# except it can compare more than one string. It will
# return 0 if there is no match. Returns 1 if the first
# substring was found. Returns 2 if the second substring
# was found or 3 if both the first and second were found...
# Usage: INTEGER = Find(STRING, SUBSTR1, [SUBSTR2...])
#
sub Find
{
  my $P = 1;
  my $BIT = 1;
  my $FOUND = 0;
  while (defined $_[$P])
  {
    $FOUND |= index($_[0], $_[$P++]) < 0 ? 0 : $BIT;
    $BIT += $BIT;
  }
  return $FOUND;
}
##################################################
#
# Prints the description of this program and exits.
# Usage: PrintHelp_and_Exit()
#
sub PrintHelp_and_Exit()
{
  my $c;
  print "\n $0\n\n";
  ReadFile($0);
 (my $S = index($INPUT, '# ')) >= 0 or exit;
  my $E = index($INPUT, '###', $S);
  for (my $i = $S; $i < $E; $i++)
  {
    $c = vec($INPUT, $i, 8);
    if ($c < 32 || $c == 35)
    { print "\n" if ($c == 13); next; }
    print chr($c);
  }
  exit;
}
##################################################
#
# Prints an error message and exits.
# Usage: Abort(STRING)
#
sub Abort
{
  print "\nOops. $_[0]\nType perl newline.pl ? for help.\n";
  exit;
}
##################################################
#
# Returns the ASCII code of the first character
# of string. Returns 0 if the string is empty.
# Usage: INTEGER = ASC(STRING)
#
sub ASC
{
  return defined $_[0] ? vec($_[0], 0, 8) : 0;
}
##################################################
#
# This function tests if a file is read-only by
# trying to open it for writing. Returns 1 if
# the file is read-only, or 0 if not.
# Usage: INTEGER = isReadOnly(FILENAME)
#
sub isReadOnly
{
  local *FH;
  open(FH, ">>$_[0]") or return 1;
  close FH;
  return 0;
}
##################################################