#!/usr/bin/perl

#####################################################################
#
#  WordCount v1.0                              Last Update: 2023.3.18
#
#  This Perl script reads an English or Spanish text file and displays
#  the number of total letters, number of total words, and the number
#  of individual words used. It also displays character statistics
#  showing how frequently each character was used in the text.
#  It displays the percentages also. It displays the number of lines,
#  the number of sentences, the number of words that are written in
#  all lower case vs. the number of words written with a capital letter
#  and all caps, average length of a line, the longest line, the
#  longest word used. So, it's a pretty handy tool for analyzing text.
#  NOTE: The input file name must not contain Unicode letters!
#
#  Written by Zsolt N Perry in March 2023. For questions, comments,
#  feature requests, or bug reports, write to zsnp@juno.com.
#  This file was downloaded from http://www.wzsn.net/perl
#
#  This script does not use any modules or inline C code.
#  It was developed and tested using TinyPerl 5.8 under Windows XP,
#  however it should run fine on Linux and MacOS as well.
#
#               THIS ENTIRE SOURCE CODE IS FREEWARE.
#  If you want to incorporate it or parts of it into your program,
#  just copy and paste whatever you need! There is no need to ask
#  for permission. This software is distributed "AS IS." There is
#  no warranty of any kind. The author will not be held liable for
#  any loss resulting from the use or misuse of this software.
#
#####################################################################

use 5.004;
use strict;
use warnings;
$| = 1;

my @CHARS;
my %WORDS;
my $OVERFLOW = 0;
my $TOTAL_WORDS = 0;
my $TOTAL_DIGITS = 0;
my $PUNCTUATION = 0;
my $ALLCAPS_WORDS = 0;
my $CAPITAL_WORDS = 0;
my $LWRCASE_WORDS = 0;
my $TOTAL_LETTERS = 0;
my $TOTAL_SENTENCES = 0;
my $LONGEST_WORD = '';
my $LONGEST_LINE = 0;
my $LONGEST_LINE_TEXT = '';
my $TOTAL_LINE_LENGTH = 0;

# Print a little header showing information about this program.
print "\n\n  ", 'WordCount v1.0 by Zsolt N. Perry (zsnp@juno.com)',
  "\n  This perl script counts the number of words in a file.",
  "\n\n  You are using Perl $] ", (length(pack('P', 0)) << 3),
  "-bit running on $^O.\n  The current local time is " . localtime(),
  ".\n  Script name: $0\n\n  ", '-' x 76;

if (@ARGV == 0)
{
  print "\n\n  Usage: perl wordcount.pl <text_filename>\n\n";
  exit;
}

my $F = shift(@ARGV);

# Remove illegal characters from file name.
$F =~ tr`<>*%$?\x00-\x1F\"\|``d;

print "\n\n  Reading file: $F  ";

local *FILE;   # The file handle
my $ERR = '';  # Error messages
my $FILESIZE = 0;

# Here we set up a try {} catch {} structure using a foreach() loop.
# In the first round, we open the file for reading, then if anything
# goes wrong, we exit using "next;" and go for the next round where
# we print the error message and exit. On the other hand, if
# everything goes well, things inside the foreach() loop will
# only run once, and we will exit at the bottom using "last;"
foreach (0, 1)
{
  if ($_) { print "\n\n  Error: $ERR!\n"; exit; }  # Error handler.

  unless (-e $F) { $ERR = 'File not found'; next; }
  unless (-f $F) { $ERR = 'Not a plain file'; next; }
  $FILESIZE = -s $F;
  unless ($FILESIZE) { $ERR = 'File size is zero'; next; }
  unless (open(FILE, "<$F"))
    { $ERR = 'Cannot open file for reading'; next; }
  binmode FILE;
  my $BYTES = ($FILESIZE > 1) ? ' bytes)' : ' byte)';
  print '(', Commify($FILESIZE), $BYTES, "\n";
  last;
}

# Initialize each element of @CHARS. This array will hold the
# character codes 0-255 and how many times each one occurs in the file.
# The lower 8 bits will hold the character code, and the higher bits
# will hold the character count. So, when we sort this array by
# numberical value, we get a list that shows the most often
# used character last.
for (my $i = 0; $i < 256; $i++) { $CHARS[$i] = $i; }

print "\n\n  Analyzing file. Please wait. This may take a while...";

{
  my $c;
  my $P;
  my $EOF = 0;
  my $PREV;
  my $ASCII = -1;
  my $WORD_SEPARATOR = " \t\r\n_-+*=<>()[]{}/\\#%&~`^;:.,|\$\"'!?";
  my $WORD = '';
  my $LINE = '';
  my $LINELEN = 0;
  for (my $i = 0; $EOF == 0 && $i <= $FILESIZE; $i++)
  {
    $PREV = $ASCII;
    $c = getc(FILE);
    $EOF = !defined($c);
    if ($EOF) { $c = ' '; }
    $ASCII = ord($c) & 255;
    $EOF or $CHARS[$ASCII] += 256;   # Count characters
    $P = index($WORD_SEPARATOR, $c);
    if ($ASCII > 47 && $ASCII < 58) { $TOTAL_DIGITS++; }
    if ($ASCII == 46 || $ASCII == 63 || $ASCII == 33)
    {
      if (($PREV > 64 && $PREV < 91) || ($PREV > 96 && $PREV < 123))
      { $TOTAL_SENTENCES++; }
    }
    if (($ASCII > 64 && $ASCII < 91) || ($ASCII > 96 && $ASCII < 123))
    { $TOTAL_LETTERS++; }
    if ($ASCII == 10 || $ASCII == 13)  # Line break captured!
    {
      $TOTAL_LINE_LENGTH += $LINELEN;
      if ($LINELEN > $LONGEST_LINE)
      {
        $LONGEST_LINE = $LINELEN;
        $LONGEST_LINE_TEXT = $LINE;
      }
      $LINELEN = 0;
      $LINE = '';
    }
    else
    { $LINELEN++; $LINE .= $c; }
    if ($P < 0)
    {
      if (index("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZÁáÉéÖöÜüÑñíóôúû", $c) >= 0)
      {
        if (length($WORD) < 300)
        { $WORD .= $c; }
        else
        { $OVERFLOW = 1; }  # Word is longer than 300 bytes?
      }
    }
    else
    {
      if (length($WORD))
      {
        if (length($WORD) > length($LONGEST_WORD))
        { $LONGEST_WORD = $WORD; }

        # Process new word
        $TOTAL_WORDS++;
        my $UWORD = $WORD;
        my $LWORD = $WORD;
        $UWORD =~ tr|abcdefghijklmnopqrstuvwxyzáéöüñ|ABCDEFGHIJKLMNOPQRSTUVWXYZÁÉÖÜÑ|;  # Convert to upper case
        $LWORD =~ tr|ABCDEFGHIJKLMNOPQRSTUVWXYZÁÉÖÜÑ|abcdefghijklmnopqrstuvwxyzáéöüñ|;  # Convert to lower case
        if ($WORD eq $UWORD) { $ALLCAPS_WORDS++; }
        if ($WORD eq $LWORD) { $LWRCASE_WORDS++; }
        if (length($WORD) > 1)
        {
          my $CWORD = substr($UWORD, 0, 1) . (length($WORD) > 1 ? substr($LWORD, 1) : '');
          if ($WORD eq $CWORD) { $CAPITAL_WORDS++; }
        }
        if (exists($WORDS{$LWORD}))
        { $WORDS{$LWORD}++; }
        else
        { $WORDS{$LWORD} = 1; }
      }
      $WORD = '';
    }

    $i & 0x3ffff or print '.';
  }
}
close FILE;  # We're done reading.

my $NULL = GetValueAbove8($CHARS[0]);  # How many null characters were found?
my $TAB = GetValueAbove8($CHARS[9]);   # How may tab characters were found?
my $CR = GetValueAbove8($CHARS[13]);
my $LF = GetValueAbove8($CHARS[10]);
my $TOTAL_LINES = ($CR > $LF) ? $CR : $LF;
my $BR = ($CR == $LF) ? 'DOS' : ($LF > $CR ? 'Linux' : 'MacIntosh');
my $AVERAGE_LINE_LEN = int(($TOTAL_LINES > 0) ? $TOTAL_LINE_LENGTH / $TOTAL_LINES : 0);

print "\n\n  Sorting words and characters...";
# First we sort the character array
@CHARS = sort {$a <=> $b} @CHARS;
# That was easy.

# Next, we put all the words into an array where the first four bytes
# hold the word count, followed by the word itself.
# The word count is encoded in binary format, so when this array
# is sorted alphabetically, it will automatically sort all the
# words exactly the way we want them.
my @ALLWORDS;
my $TOTAL_WORD_LEN = 0;
foreach (keys %WORDS)
{
  my $COUNT = $WORDS{$_};
  push(@ALLWORDS, pack('N', $COUNT) . $_);
  $TOTAL_WORD_LEN += length($_) * $COUNT;
}
undef %WORDS;  # We don't need this anymore.
my $AVERAGE_WORD_LEN = int($TOTAL_WORD_LEN / @ALLWORDS);

# Okay, now we sort all the words.
@ALLWORDS = sort(@ALLWORDS);

# Find those characters that do not appear anywhere in the file.
my $TXT = 0;
my $UNUSED = 0;
my $BINARY = 0;
my $BINTOTAL = 0;
my $CTRLTOTAL = 0;
my $TEXT_TOTAL = 0;
for (my $i = 0; $i < 256; $i++)
{
  my $FREQ = GetValueAbove8($CHARS[$i]);
  if ($FREQ == 0) { $UNUSED++; next; }

  # Count number of binary characters used in the file, the number
  # of plain text characters used, and the number of control characters
  # used. Control characters are those whose ASCII value is under 32
  # EXCEPT CR, LF, and TAB which are classified as "plain text."

  my $C = $CHARS[$i] & 255;

  # Identify binary characters:
  if ($C > 126) {
    $BINARY++;
    $BINTOTAL += $FREQ;
    next;
  }

  # Identify plain text characters:
  if ($C > 31 || $C == 10 || $C == 13 || $C == 9) {
    $TXT++;
    $TEXT_TOTAL += $FREQ;
    next;
  }

  # The rest are control characters:
  $CTRLTOTAL += $FREQ;
}
my $PERCENT_UNUSED = int($UNUSED / 2.56);
my $USED = 256 - $UNUSED;
my $PERCENT_USED = 100 - $PERCENT_UNUSED;
my $PERCENT_TEXT = int(($TEXT_TOTAL / $FILESIZE) * 10000) / 100;
my $PERCENT_BINARY = 100 - $PERCENT_TEXT;
my $PERCENT_CTRL = int(($CTRLTOTAL / $FILESIZE) * 10000) / 100;

print "\n\n  Of the 256 ASCII characters, the number of\n  characters that did not occur anywhere in the file : $UNUSED ($PERCENT_UNUSED %)";
print "\n\n  Of the 256 ASCII characters, the number of\n  characters that were used : $USED ($PERCENT_USED %)";
print "\n\n  PLAIN TEXT: There are 97 plain text characters in the ASCII character set.\n  Of these 97 characters, this file included : $TXT";
print "\n\n  BINARY: There are 129 binary characters in the ASCII characters set.\n  Of these 129 characters, this file included : $BINARY";
print "\n\n  CONTROL CHARACTERS: These are characters whose ASCII value is less than\n  32 with the exception of CR, LF, and TAB which are classified as plain text.\n  What portion of the entire file was made up of control characters? : $CTRLTOTAL ($PERCENT_CTRL %)";
print "\n\n";
CENTER('CHARACTER STATISTICS', 78);
print "\n";
{
  my $i = 0;
  my $CHAR = 0;
  for (my $T = 0; $T < 64; $T++)
  {
    $i = $CHAR;
    my $Z = 0;
    for (my $X = 0; $X < 4; $X++)
    {
      $Z += GetValueAbove8($CHARS[$i++]);
    }
    if ($Z == 0) { $CHAR += 4; next; }

    print "\n";
    $i = $CHAR;
    for (my $X = 0; $X < 4; $X++, $i++)
    {
      my $C = $CHARS[$i] & 255;
      if ($C < 32) { $C = 32; }
      my $FREQ = GetValueAbove8($CHARS[$i]);
      CENTER('"' . chr($C) . '" x ' . $FREQ, 19);
    }
    print "\n";
    $i = $CHAR;
    for (my $X = 0; $X < 4; $X++)
    {
      my $C = $CHARS[$i++] & 255;
      CENTER(sprintf('0x%0.2X (%d)', $C, $C), 19);
    }
    print "\n";
    $i = $CHAR;
    for (my $X = 0; $X < 4; $X++)
    {
      my $PERCENT = int(GetValueAbove8($CHARS[$i++]) / $FILESIZE * 10000) / 100;
      CENTER("$PERCENT%", 19);
    }
    print "\n";
    $CHAR += 4;
  }
}

print "\n\n";
CENTER('WORD STATISTICS', 78);
print "\n";

for (my $i = 0; $i < @ALLWORDS; $i++)
{
  my $WORD = substr($ALLWORDS[$i], 4);
  my $COUNT = unpack('N', substr($ALLWORDS[$i], 0, 4)) + 1;
  print "\n$i. $WORD x $COUNT";
}

my $PERCENT_LETTERS = int(($TOTAL_LETTERS / $FILESIZE) * 10000) / 100;
my $PERCENT_DIGITS = int(($TOTAL_DIGITS / $FILESIZE) * 10000) / 100;
my $PERCENT_NULL = int(($NULL / $FILESIZE) * 10000) / 100;
my $PERCENT_TAB = int(($TAB / $FILESIZE) * 10000) / 100;
my $PERCENT_BR = int((($CR + $LF) / $FILESIZE) * 10000) / 100;

print "\n\n";
print "\n  Plain text characters       : ", Commify($TEXT_TOTAL), " ($PERCENT_TEXT %)";
print "\n  Binary characters           : ", Commify($BINTOTAL), " ($PERCENT_BINARY %)";
print "\n  Number of letters (a-z A-Z) : ", Commify($TOTAL_LETTERS), " ($PERCENT_LETTERS %)";
print "\n  Number of digits (0-9)      : ", Commify($TOTAL_DIGITS), " ($PERCENT_DIGITS %)";
print "\n  Number of NULL bytes        : ", Commify($NULL), " ($PERCENT_NULL %)";
print "\n  Number of TAB characters    : ", Commify($TAB), " ($PERCENT_TAB %)";
print "\n  Number of CR+LF characters  : ", Commify($CR), ' + ', Commify($LF), " ($PERCENT_BR %)";
print "\n  This file contains $BR-style line breaks.";
print "\n  Number of lower-case words  : ", Commify($LWRCASE_WORDS);
print "\n  Number of all-caps words    : ", Commify($ALLCAPS_WORDS);
print "\n  Number of capitalized words : ", Commify($CAPITAL_WORDS);
print "\n  Total words                 : ", Commify($TOTAL_WORDS);
print "\n  Longest word                : $LONGEST_WORD  (", length($LONGEST_WORD), ' letters)';
print "\n  Average word length         : $AVERAGE_WORD_LEN letters";
print "\n  Total sentences             : ", Commify($TOTAL_SENTENCES);
print "\n  Total lines                 : ", Commify($TOTAL_LINES);
print "\n  Average line length         : $AVERAGE_LINE_LEN bytes";
print "\n  Longest line                : $LONGEST_LINE bytes";
print "\n  Longest line looks like this:\n\n  $LONGEST_LINE_TEXT\n\n";
exit;

##################################################
# This function inserts commas into a number at
# every 3 digits and returns a string.
# Usage: STRING = Commify(INTEGER)
# Copied from www.PerlMonks.org/?node_id=157725
#
sub Commify
{
  my $N = reverse $_[0];
  $N =~ s/(\d\d\d)(?=\d)(?!\d*\.)/$1,/g;
  return scalar reverse $N;
}
##################################################
#                              String | v2023.3.17
# This function extracts the number that is stored
# in the upper bits of an integer. We are assuming
# that the lower 8 bits store some other value(s)
# which we are not concerned about, and the
# higher bits store the value that we want.
# The input number can be a 32-bit integer, but
# it is not limited to 32 bits. It can be more.
#
# Usage: INTEGER = GetValueAbove8(INTEGER)
#
sub GetValueAbove8
{
  my $FREQ = $_[0];
  return ($FREQ < 2147483648) ? $FREQ >> 8 : int($FREQ / 256);
}
##################################################
sub CENTER
{
  my ($Text, $MaxWidth) = @_;
  my $L = length($Text);
  ($L + 2) < $MaxWidth or
    return print ' ', substr($Text, 0, $MaxWidth - 2), ' ';
  my $LeftPad = ($MaxWidth - $L) >> 1;
  my $RightPad = $MaxWidth - $L - $LeftPad;
  print ' ' x $LeftPad, $Text, ' ' x $RightPad;
}
##################################################