#!/usr/bin/perl

use strict;
use warnings;

#
# This Perl script searches all the HTML files in the
# current directory and collects all the links and
# displays them in a list. Written by Zsolt in 2021.
#
##################################################

my $PATH = SelfPath();
my $RECURSIVE = 1;
my @NAME;
my @HREF;
my @ALLFILES;
my $BIG_REF_LIST = '';

About();

CheckDIR($PATH);  # Collect HTML file names
my $OUTPUT_FILE = "$PATH/links.txt";
$OUTPUT_FILE =~ tr|/||s;
my $DATA;
my $FILE;
my $NAMEONLY;
my @EXT = qw(HTML HTM XML PDF DOC TXT JS JSP ASP ASX PHP PL CGI JPE JPG JPEG BMP GIF PNG TIF PCX);

for (my $i = 0; $i < @ALLFILES; $i++)
{
  $FILE = $ALLFILES[$i];
  $NAMEONLY = cut($FILE, '/', 0x10101);
  $DATA = ReadFile($FILE);
  $DATA =~ s/\<A[ A-Za-z0-9_\'\"\=\;]* HREF=["']+([ A-Z-a-z0-9.,:\/\-\+\&\%\_\?\=\#]+)["']+[ A-Za-z0-9\;\=\&\:\'\"\-\_]*\>*([ A-Za-z0-9.,:\/\-\+\&\%\_\?\#]+)\<\/A\>//i;
  if (defined $1)
  {
    push(@HREF, $1);
    push(@NAME, $2);  
  }
}

my @BIGLIST;
for (my $i = 0; $i < @NAME; $i++)
{
  $BIGLIST[$i] = "\r\n\"" . $HREF[$i] . "|" . $NAME[$i] . "\",";
}
@BIGLIST = sort(@BIGLIST);
CreateFile($OUTPUT_FILE, join("", @BIGLIST));
EXIT();

##################################################
#
# This function searches STRING to see if any of its
# characters match any of the characters of SUBSTR.
# Returns the pointer where the first match occurred
# in STRING. Returns -1 if no matches were found.
# If START is provided, the search will start at a
# certain position in STRING. If the start is a negative
# value, then the search will start from the end of string.
# When INVERT is defined, this function will return the
# first non-matching character's position.
#
# Usage: INTEGER = FindChar(STRING, SUBSTR, [START, [INVERT]])
#
sub FindChar
{
  defined $_[0] or return -1;
  defined $_[1] or return -1;
  (my $END = length($_[0])) && length($_[1]) or return -1;
  my $i = defined $_[2] ? $_[2] : 0;
  my $MATCH = defined $_[3] ? 0 : 1;
  $i < $END or return -1;
  my $DIR = 1;
  if ($i < 0)
  {
    $i += $END;
    $END = $DIR = -1;
  }
  $i >= 0 or return -1;
  for (; $i != $END; $i += $DIR)
  {
    if ((index($_[1], substr($_[0], $i, 1)) < 0 ? 0 : 1) == $MATCH)
    { return $i; }
  }
  return -1;
}
##################################################
#
# This function removes all whitespace from before
# and after text and returns a new string. This
# function removes every character whose ASCII
# value is less than 33. This includes tab, space,
# null, vertical tab, esc, new lines, etc..
#
# Usage: STRING = Trim(STRING)
#
sub Trim
{
  my $S = defined $_[0] ? $_[0] : '';
  my $L = length($S) or return '';
  my $START = 0;
  my $LAST = 0;
  while ($L--)
  {
    if (vec($S, $L, 8) > 32)
    {
      $START = $L;
      $LAST or $LAST = $L + 1;
    }
  }
  return substr($S, $START, $LAST - $START);
}
##################################################
#
# This function returns the OS type as a number.
# 1=DOS 2=WINDOWS 3=LINUX 4=OSX 9=OTHER
#
sub GetOS
{
  my $OS = uc($^O);
  index($OS, 'LINUX')  >= 0 ? 3 :
  index($OS, 'MSWIN')  >= 0 ? 2 :
  index($OS, 'DOS')    >= 0 ? 1 :
  index($OS, 'DARWIN') >= 0 ? 4 : 9;
}
##################################################
# Prints the description of this program.
# Usage: About()
#
sub About
{
  my $PTRSIZE = `$^X -V:ptrsize`;
  $PTRSIZE =~ s/[^0-9]//g;
  print "\nPerl $] ", ($PTRSIZE << 3), '-bit ', $^O, ', ' . localtime(), "\n";
  my $S = ReadFile($0);
  my $P = 1 + index($S, '# ');
  my $E = 1 + index($S, '###', $P);
  $E && $P or die "\nMissing argument.\n";
  $S = substr($S, $P, $E - $P);
  $S =~ tr/#//d;
  print "\n $0\n\n$S";
}
##################################################
#
# This function reads the entire contents of a file
# in binary mode and returns it as a string. If an
# errors occur, an empty string is returned silently.
# A second argument will move the file pointer before
# reading. And a third argument limits the number
# of bytes to read.
# Usage: STRING = ReadFile(FILENAME, [START, [LENGTH]])
#
sub ReadFile
{
  my $NAME = defined $_[0] ? $_[0] : '';
  $NAME =~ tr/\"\0*?|<>//d;	# Remove special characters
  -e $NAME or return '';
  -f $NAME or return '';
  my $SIZE = -s $NAME;
  $SIZE or return '';
  my $LEN = defined $_[2] ? $_[2] : $SIZE;
  $LEN > 0 or return '';
  local *FH;
  sysopen(FH, $NAME, 0) or return '';
  binmode FH;
  my $POS = defined $_[1] ? $_[1] : 0;
  $POS < $SIZE or return '';
  $POS < 1 or sysseek(FH, 0, $POS);	# Move file ptr
  my $DATA = '';
  sysread(FH, $DATA, $LEN);		# Read file
  close FH;
  return $DATA;
}
##################################################
#                                       v2019.6.15
# Just like Trim(), this function can remove spaces
# or tabs from before and after STRING but it can also
# remove any other character, whatever is found in SUBSTR.
# Usage: STRING = TrimChar(STRING, SUBSTR)
#
sub TrimChar
{
  defined $_[0] or return '';
  my $L = length($_[0]);
  $L or return '';
  defined $_[1] or return $_[0];
  length($_[1]) or return $_[0];
  my $START = 0;
  my $LAST = 0;
  while ($L--)
  {
    if (index($_[1], substr($_[0], $L, 1)) < 0)
    {
      $START = $L;
      $LAST or $LAST = $L + 1;
    }
  }
  return substr($_[0], $START, $LAST - $START);
}
##################################################
#
# This function reads the contents of a folder and
# calls CatchFile() for each file that was found.
# Usage: CheckDIR(PATH)
#
sub CheckDIR
{
  my $PATH = defined $_[0] ? $_[0] : '';
  length($PATH) or return;

  $PATH =~ tr|\\|/|;
  $PATH .= '/';
  $PATH =~ tr|/||s;

  my $EXT;
  my @SUBDIRS;
  my $FULLNAME;
  local *DIR;
  opendir(DIR, $PATH) or return;
  while ((my $NAME = readdir(DIR)))
  {
    if (length($NAME) < 3)
    { $NAME ne '.' && $NAME ne '..' or next; }

    $FULLNAME = "$PATH$NAME";

    # We will display directories first, followed by
    # symbolic links, then all other special files,
    # and plain files last.
    if (-d($FULLNAME)) { push(@SUBDIRS, $FULLNAME); next; }
    if (-f($FULLNAME))
    {
      $EXT = uc(cut($FULLNAME, '.', 0x10001));
      if ($EXT eq 'HTM' || $EXT eq 'HTML')
      { push(@ALLFILES, $FULLNAME); next; }
    }
  }
  closedir(DIR);
  $RECURSIVE or return;
  foreach $PATH (@SUBDIRS) { CheckDIR($PATH); }
}
##################################################
#                                      v2019.11.24
# Creates and overwrites a file in binary mode.
# Returns 1 on success or 0 if something went wrong.
# Usage: INTEGER = CreateFile(FILE_NAME, CONTENT)
#
sub CreateFile
{
  defined $_[0] or return 0;
  my $F = $_[0];
  $F =~ tr/\"\0*?|<>//d;	# Remove special characters
  length($F) or return 0;
  local *FH;
  open(FH, ">$F") or return 0;
  binmode FH;
  if (defined $_[1] ? length($_[1]) : 0) { print FH $_[1]; }
  close FH or return 0;
  return 1;
}
####################################################################################################
#                                       v2021.2.12
# This function splits string into two parts along
# the first occurrence of substring and places the
# two resulting parts into $a and $b. If substring
# is not found, then both $a and $b will be empty!
#
# The return value of this function will depend on
# the value of the third argument (CMD). Without
# the third argument, this function will simply cut
# the first instance of substring from the string
# and return what's left.
#
# The CMD argument is a sum of several numbers, and
# it should be given as a hexadecimal number. Here's
# how it works:
#
#   cut('Hello World!', ' ', 0x001) returns 'World!'
#   cut('Hello World!', ' ', 0x010) returns 'Hello'
#   cut('Hello World!', ' ', 0x011) returns 'HelloWorld!'
#
# The third digit from the right controls what is
# returned when the substring is not found.
# 0=Return an empty string
# 1=Return the original string
#
#   cut('Hello World!', ' ', 0x101) returns 'World!'
#   cut('Hello World!', 'L', 0x101) returns 'Hello World!'
#
# To ignore case, we set the 4th digit:
#
#   cut('Hello World!', 'L', 0x1101) returns 'lo World!'
#
# To search from the end of string, we set the 5th digit:
#
#   cut('Hello World!', 'L', 0x11101) returns 'd!'
#
# Usage: STRING = cut(STRING, SUBSTR, [CMD])
#
sub cut
{
  my $STR = defined $_[0] ? $_[0] : '';
  my $SUB = defined $_[1] ? $_[1] : '';
  my $CMD = defined $_[2] ? $_[2] : 0x111;
  my $P = ($CMD & 0x1000)  ?
    (($CMD & 0x10000) ? rindex(uc($STR), $SUB) : index(uc($STR), $SUB))
   :
    (($CMD & 0x10000) ? rindex($STR, $SUB) : index($STR, $SUB));
  $a = $b = '';
  $P < 0 and return ($CMD & 256) ? $STR : '';
  $a = substr($STR, 0, $P);
  $b = substr($STR, $P + length($SUB));
  return ($CMD & 16 ? $a : '') . ($CMD & 1 ? $b : '');
}
##################################################
#                                       v2019.12.7
# This function scans string S and replaces the
# first N occurrences of string A with string B
# and returns a new string. If N is -1 then only
# the last instance is replaced.
# Usage: STRING = Replace(STRING_S, STRING_A, [STRING_B, [N]])
#
sub Replace
{
  # First, we make sure that required arguments are available
  # and any special scenarios are handled correctly.

  defined $_[0] or return '';     # Missing arguments?
  defined $_[1] or return $_[0];  # Missing arguments?
  my $B = defined $_[2] ? $_[2] : '';    # Replace to --> $B
  my $N = defined $_[3] ? $_[3] : 0x7FFFFFFF;   # Get $N
  my ($LA, $LB) = (length($_[1]), length($B));  # Get string lengths

  # The search string must not be an empty string, or we exit.
  # The string that we search for must not be longer than
  # the string in which we search.
  ($N && $LA && $LA <= length($_[0])) or return $_[0];
  my ($LAST, $F, $X) = (0, 0, $_[0]);

  if ($N > 0x7FFFFFFE)
  {
    # If N was not provided, then that means we have to
    # replace every instance, so we'll use regex...

    my $A = $_[1];
    $X =~ s/\Q$A\E/$B/g;
    return $X;
  }

  if ($N < 0)
  {
    # If we get here, we must not replace every
    # instance, and we must go from right to left.

    $F = length($X);
    while (($F = rindex($X, $_[1], $F)) >= 0)
    {
      substr($X, $F, $LA) = $B;
      ++$N or last;
    }
    return $X;
  }

  if ($LA == $LB)
  {
    # In this case, output string will be the
    # same length as the input string.
    # We must not replace every instance,
    # and we search from left to right.

    while (($F = index($X, $_[1], $F)) >= 0)
    {
      substr($X, $F, $LA) = $B;
      $F += $LB;
      --$N or last;
    }
    return $X;
  }

  # In this final scenario, the output string will
  # NOT be the same length as the input string.
  # We must not replace every instance,
  # and we search from left to right.

  # For performance reasons, we build a new string.
  $X = '';
  while (($F = index($_[0], $_[1], $F)) >= 0)
  {
    $X .= substr($_[0], $LAST, $F - $LAST);
    $X .= $B;
    $F += $LA;
    $LAST = $F;
    --$N or last;
  }
  return $X . substr($_[0], $LAST);
}
##################################################
#                                       v2019.12.9
# Returns one or more words from a string.
# The string is treated as a list of words separated
# by whitespace. In this case, a "whitespace" is any
# character whose ASCII value is less than 33. This
# includes new line characters, tab, space, null, etc.
# PTR tells which word to grab starting with 1.
# If PTR is 3, the third word is returned.
# If PTR is not specified, the default value is 1.
# COUNT tells how many words to return. Default is 1.
# When COUNT has a negative value, returns every word
# from PTR all the way to the end of the string.
# The words in the return value will always be separated
# by a space character regardless of how many spaces or
# tabs were between them in the input string.
#
# Usage: STRING = GetWord(STRING, [PTR, [COUNT]])
#
sub GetWord
{
  defined $_[0] or return '';
  my $LEN   = length($_[0]);
  my $PTR   = defined $_[1] ? $_[1] : 1;
  my $COUNT = defined $_[2] ? $_[2] : 1;
  return '' if ($LEN == 0 || $COUNT == 0 || $PTR >= $LEN);

  my $START = -1;
  my $OUTPUT = '';
  $PTR > 0 or $PTR = 1;
  for (my $i = 0; $i <= $LEN; $i++)
  {
    if (vec($_[0], $i, 8) > 32)
    {
      $START >= 0 or $START = $i;
      next;
    }
    if ($START >= 0)
    {
      if ($PTR-- < 2)
      {
        length($OUTPUT) == 0 or $OUTPUT .= ' ';
        $OUTPUT .= substr($_[0], $START, $i - $START);
        last if ($COUNT-- == 1);
      }
      $START = -1;
    }
  }
  return $OUTPUT;
}
##################################################
#
# This function prints all HREF= website addresses
# from a HTML file. This function requires the
# presence of a global variable called $DATA
# which holds the file contents.
#
sub ListLinks
{
  my @A;
  my $L = length($DATA);
  $L or return @A;
  my $U = uc($DATA);
  my $COUNT = 0;

  for (my $i = 0; $i < $L; $i++)
  {
    my $P = index($U, '<A ', $i);
    if ($P < 0)
    {
      return @A;
    }
    else
    {
      $i = $P + 3;
      $P = index($U, " HREF=", $i);
      if ($P < 0)
      {
        next;
      }
      else
      {
        $i = $P + 6;
        my $TEMP = substr($DATA, $P + 7);
        $P = FindChar($TEMP, '>\'"', $i);
        if ($P < 0)
        {
          next;
        }
        else
        {
          my $REF = substr($TEMP, 0, $P);
          $REF =~ tr|"'||d;
          
          if (index($BIG_REF_LIST, $REF) < 0)
          {
            print "\n  <A HREF=\"$REF\">$COUNT</A>";
            $COUNT++;
            $BIG_REF_LIST .= ' *' . $REF;
          }
        }
      }
    }
  }
}
##################################################
#                                       v2021.2.21
# Returns the path where this script resides.
# Usage: STRING = SelfPath()
#
sub SelfPath
{
  my $SELF = $0;
  $SELF =~ tr|\\|/|;
  my $PATH = cut($SELF, '/', 0x10010);
  if (length($PATH)) { return $PATH; }
  $PATH = ($^O =~ /WIN|DOS/i) ? `CD` : (exists($ENV{PWD}) ? $ENV{PWD} : `pwd`);
  $PATH =~ tr|\\?*'\"\x00-\x1F|/|d;
  return $PATH;
}
##################################################
#
# Terminates the script. In Linux, this function
# pauses the screen before exiting.
# Usage: EXIT([ERRORCODE])
#
sub EXIT
{
  my $E = defined $_[0] ? $_[0] : 0;
  if (!($^O =~ /WIN|DOS/i))
  {
    $| = 1;
    print "\n\nEXITCODE = $E\n\nPress [ENTER] to exit...";
    <STDIN>;
  }
  exit($E);
}