#!/usr/bin/perl
use strict;
use warnings;
#
# This Perl script searches all the HTML files in the
# current directory and collects all the links and
# displays them in a list. Written by Zsolt in 2021.
#
##################################################
my $PATH = SelfPath();
my $RECURSIVE = 1;
my @NAME;
my @HREF;
my @ALLFILES;
my $BIG_REF_LIST = '';
About();
CheckDIR($PATH); # Collect HTML file names
my $OUTPUT_FILE = "$PATH/links.txt";
$OUTPUT_FILE =~ tr|/||s;
my $DATA;
my $FILE;
my $NAMEONLY;
my @EXT = qw(HTML HTM XML PDF DOC TXT JS JSP ASP ASX PHP PL CGI JPE JPG JPEG BMP GIF PNG TIF PCX);
for (my $i = 0; $i < @ALLFILES; $i++)
{
$FILE = $ALLFILES[$i];
$NAMEONLY = cut($FILE, '/', 0x10101);
$DATA = ReadFile($FILE);
$DATA =~ s/\*([ A-Za-z0-9.,:\/\-\+\&\%\_\?\#]+)\<\/A\>//i;
if (defined $1)
{
push(@HREF, $1);
push(@NAME, $2);
}
}
my @BIGLIST;
for (my $i = 0; $i < @NAME; $i++)
{
$BIGLIST[$i] = "\r\n\"" . $HREF[$i] . "|" . $NAME[$i] . "\",";
}
@BIGLIST = sort(@BIGLIST);
CreateFile($OUTPUT_FILE, join("", @BIGLIST));
EXIT();
##################################################
#
# This function searches STRING to see if any of its
# characters match any of the characters of SUBSTR.
# Returns the pointer where the first match occurred
# in STRING. Returns -1 if no matches were found.
# If START is provided, the search will start at a
# certain position in STRING. If the start is a negative
# value, then the search will start from the end of string.
# When INVERT is defined, this function will return the
# first non-matching character's position.
#
# Usage: INTEGER = FindChar(STRING, SUBSTR, [START, [INVERT]])
#
sub FindChar
{
defined $_[0] or return -1;
defined $_[1] or return -1;
(my $END = length($_[0])) && length($_[1]) or return -1;
my $i = defined $_[2] ? $_[2] : 0;
my $MATCH = defined $_[3] ? 0 : 1;
$i < $END or return -1;
my $DIR = 1;
if ($i < 0)
{
$i += $END;
$END = $DIR = -1;
}
$i >= 0 or return -1;
for (; $i != $END; $i += $DIR)
{
if ((index($_[1], substr($_[0], $i, 1)) < 0 ? 0 : 1) == $MATCH)
{ return $i; }
}
return -1;
}
##################################################
#
# This function removes all whitespace from before
# and after text and returns a new string. This
# function removes every character whose ASCII
# value is less than 33. This includes tab, space,
# null, vertical tab, esc, new lines, etc..
#
# Usage: STRING = Trim(STRING)
#
sub Trim
{
my $S = defined $_[0] ? $_[0] : '';
my $L = length($S) or return '';
my $START = 0;
my $LAST = 0;
while ($L--)
{
if (vec($S, $L, 8) > 32)
{
$START = $L;
$LAST or $LAST = $L + 1;
}
}
return substr($S, $START, $LAST - $START);
}
##################################################
#
# This function returns the OS type as a number.
# 1=DOS 2=WINDOWS 3=LINUX 4=OSX 9=OTHER
#
sub GetOS
{
my $OS = uc($^O);
index($OS, 'LINUX') >= 0 ? 3 :
index($OS, 'MSWIN') >= 0 ? 2 :
index($OS, 'DOS') >= 0 ? 1 :
index($OS, 'DARWIN') >= 0 ? 4 : 9;
}
##################################################
# Prints the description of this program.
# Usage: About()
#
sub About
{
my $PTRSIZE = `$^X -V:ptrsize`;
$PTRSIZE =~ s/[^0-9]//g;
print "\nPerl $] ", ($PTRSIZE << 3), '-bit ', $^O, ', ' . localtime(), "\n";
my $S = ReadFile($0);
my $P = 1 + index($S, '# ');
my $E = 1 + index($S, '###', $P);
$E && $P or die "\nMissing argument.\n";
$S = substr($S, $P, $E - $P);
$S =~ tr/#//d;
print "\n $0\n\n$S";
}
##################################################
#
# This function reads the entire contents of a file
# in binary mode and returns it as a string. If an
# errors occur, an empty string is returned silently.
# A second argument will move the file pointer before
# reading. And a third argument limits the number
# of bytes to read.
# Usage: STRING = ReadFile(FILENAME, [START, [LENGTH]])
#
sub ReadFile
{
my $NAME = defined $_[0] ? $_[0] : '';
$NAME =~ tr/\"\0*?|<>//d; # Remove special characters
-e $NAME or return '';
-f $NAME or return '';
my $SIZE = -s $NAME;
$SIZE or return '';
my $LEN = defined $_[2] ? $_[2] : $SIZE;
$LEN > 0 or return '';
local *FH;
sysopen(FH, $NAME, 0) or return '';
binmode FH;
my $POS = defined $_[1] ? $_[1] : 0;
$POS < $SIZE or return '';
$POS < 1 or sysseek(FH, 0, $POS); # Move file ptr
my $DATA = '';
sysread(FH, $DATA, $LEN); # Read file
close FH;
return $DATA;
}
##################################################
# v2019.6.15
# Just like Trim(), this function can remove spaces
# or tabs from before and after STRING but it can also
# remove any other character, whatever is found in SUBSTR.
# Usage: STRING = TrimChar(STRING, SUBSTR)
#
sub TrimChar
{
defined $_[0] or return '';
my $L = length($_[0]);
$L or return '';
defined $_[1] or return $_[0];
length($_[1]) or return $_[0];
my $START = 0;
my $LAST = 0;
while ($L--)
{
if (index($_[1], substr($_[0], $L, 1)) < 0)
{
$START = $L;
$LAST or $LAST = $L + 1;
}
}
return substr($_[0], $START, $LAST - $START);
}
##################################################
#
# This function reads the contents of a folder and
# calls CatchFile() for each file that was found.
# Usage: CheckDIR(PATH)
#
sub CheckDIR
{
my $PATH = defined $_[0] ? $_[0] : '';
length($PATH) or return;
$PATH =~ tr|\\|/|;
$PATH .= '/';
$PATH =~ tr|/||s;
my $EXT;
my @SUBDIRS;
my $FULLNAME;
local *DIR;
opendir(DIR, $PATH) or return;
while ((my $NAME = readdir(DIR)))
{
if (length($NAME) < 3)
{ $NAME ne '.' && $NAME ne '..' or next; }
$FULLNAME = "$PATH$NAME";
# We will display directories first, followed by
# symbolic links, then all other special files,
# and plain files last.
if (-d($FULLNAME)) { push(@SUBDIRS, $FULLNAME); next; }
if (-f($FULLNAME))
{
$EXT = uc(cut($FULLNAME, '.', 0x10001));
if ($EXT eq 'HTM' || $EXT eq 'HTML')
{ push(@ALLFILES, $FULLNAME); next; }
}
}
closedir(DIR);
$RECURSIVE or return;
foreach $PATH (@SUBDIRS) { CheckDIR($PATH); }
}
##################################################
# v2019.11.24
# Creates and overwrites a file in binary mode.
# Returns 1 on success or 0 if something went wrong.
# Usage: INTEGER = CreateFile(FILE_NAME, CONTENT)
#
sub CreateFile
{
defined $_[0] or return 0;
my $F = $_[0];
$F =~ tr/\"\0*?|<>//d; # Remove special characters
length($F) or return 0;
local *FH;
open(FH, ">$F") or return 0;
binmode FH;
if (defined $_[1] ? length($_[1]) : 0) { print FH $_[1]; }
close FH or return 0;
return 1;
}
####################################################################################################
# v2021.2.12
# This function splits string into two parts along
# the first occurrence of substring and places the
# two resulting parts into $a and $b. If substring
# is not found, then both $a and $b will be empty!
#
# The return value of this function will depend on
# the value of the third argument (CMD). Without
# the third argument, this function will simply cut
# the first instance of substring from the string
# and return what's left.
#
# The CMD argument is a sum of several numbers, and
# it should be given as a hexadecimal number. Here's
# how it works:
#
# cut('Hello World!', ' ', 0x001) returns 'World!'
# cut('Hello World!', ' ', 0x010) returns 'Hello'
# cut('Hello World!', ' ', 0x011) returns 'HelloWorld!'
#
# The third digit from the right controls what is
# returned when the substring is not found.
# 0=Return an empty string
# 1=Return the original string
#
# cut('Hello World!', ' ', 0x101) returns 'World!'
# cut('Hello World!', 'L', 0x101) returns 'Hello World!'
#
# To ignore case, we set the 4th digit:
#
# cut('Hello World!', 'L', 0x1101) returns 'lo World!'
#
# To search from the end of string, we set the 5th digit:
#
# cut('Hello World!', 'L', 0x11101) returns 'd!'
#
# Usage: STRING = cut(STRING, SUBSTR, [CMD])
#
sub cut
{
my $STR = defined $_[0] ? $_[0] : '';
my $SUB = defined $_[1] ? $_[1] : '';
my $CMD = defined $_[2] ? $_[2] : 0x111;
my $P = ($CMD & 0x1000) ?
(($CMD & 0x10000) ? rindex(uc($STR), $SUB) : index(uc($STR), $SUB))
:
(($CMD & 0x10000) ? rindex($STR, $SUB) : index($STR, $SUB));
$a = $b = '';
$P < 0 and return ($CMD & 256) ? $STR : '';
$a = substr($STR, 0, $P);
$b = substr($STR, $P + length($SUB));
return ($CMD & 16 ? $a : '') . ($CMD & 1 ? $b : '');
}
##################################################
# v2019.12.7
# This function scans string S and replaces the
# first N occurrences of string A with string B
# and returns a new string. If N is -1 then only
# the last instance is replaced.
# Usage: STRING = Replace(STRING_S, STRING_A, [STRING_B, [N]])
#
sub Replace
{
# First, we make sure that required arguments are available
# and any special scenarios are handled correctly.
defined $_[0] or return ''; # Missing arguments?
defined $_[1] or return $_[0]; # Missing arguments?
my $B = defined $_[2] ? $_[2] : ''; # Replace to --> $B
my $N = defined $_[3] ? $_[3] : 0x7FFFFFFF; # Get $N
my ($LA, $LB) = (length($_[1]), length($B)); # Get string lengths
# The search string must not be an empty string, or we exit.
# The string that we search for must not be longer than
# the string in which we search.
($N && $LA && $LA <= length($_[0])) or return $_[0];
my ($LAST, $F, $X) = (0, 0, $_[0]);
if ($N > 0x7FFFFFFE)
{
# If N was not provided, then that means we have to
# replace every instance, so we'll use regex...
my $A = $_[1];
$X =~ s/\Q$A\E/$B/g;
return $X;
}
if ($N < 0)
{
# If we get here, we must not replace every
# instance, and we must go from right to left.
$F = length($X);
while (($F = rindex($X, $_[1], $F)) >= 0)
{
substr($X, $F, $LA) = $B;
++$N or last;
}
return $X;
}
if ($LA == $LB)
{
# In this case, output string will be the
# same length as the input string.
# We must not replace every instance,
# and we search from left to right.
while (($F = index($X, $_[1], $F)) >= 0)
{
substr($X, $F, $LA) = $B;
$F += $LB;
--$N or last;
}
return $X;
}
# In this final scenario, the output string will
# NOT be the same length as the input string.
# We must not replace every instance,
# and we search from left to right.
# For performance reasons, we build a new string.
$X = '';
while (($F = index($_[0], $_[1], $F)) >= 0)
{
$X .= substr($_[0], $LAST, $F - $LAST);
$X .= $B;
$F += $LA;
$LAST = $F;
--$N or last;
}
return $X . substr($_[0], $LAST);
}
##################################################
# v2019.12.9
# Returns one or more words from a string.
# The string is treated as a list of words separated
# by whitespace. In this case, a "whitespace" is any
# character whose ASCII value is less than 33. This
# includes new line characters, tab, space, null, etc.
# PTR tells which word to grab starting with 1.
# If PTR is 3, the third word is returned.
# If PTR is not specified, the default value is 1.
# COUNT tells how many words to return. Default is 1.
# When COUNT has a negative value, returns every word
# from PTR all the way to the end of the string.
# The words in the return value will always be separated
# by a space character regardless of how many spaces or
# tabs were between them in the input string.
#
# Usage: STRING = GetWord(STRING, [PTR, [COUNT]])
#
sub GetWord
{
defined $_[0] or return '';
my $LEN = length($_[0]);
my $PTR = defined $_[1] ? $_[1] : 1;
my $COUNT = defined $_[2] ? $_[2] : 1;
return '' if ($LEN == 0 || $COUNT == 0 || $PTR >= $LEN);
my $START = -1;
my $OUTPUT = '';
$PTR > 0 or $PTR = 1;
for (my $i = 0; $i <= $LEN; $i++)
{
if (vec($_[0], $i, 8) > 32)
{
$START >= 0 or $START = $i;
next;
}
if ($START >= 0)
{
if ($PTR-- < 2)
{
length($OUTPUT) == 0 or $OUTPUT .= ' ';
$OUTPUT .= substr($_[0], $START, $i - $START);
last if ($COUNT-- == 1);
}
$START = -1;
}
}
return $OUTPUT;
}
##################################################
#
# This function prints all HREF= website addresses
# from a HTML file. This function requires the
# presence of a global variable called $DATA
# which holds the file contents.
#
sub ListLinks
{
my @A;
my $L = length($DATA);
$L or return @A;
my $U = uc($DATA);
my $COUNT = 0;
for (my $i = 0; $i < $L; $i++)
{
my $P = index($U, '$COUNT";
$COUNT++;
$BIG_REF_LIST .= ' *' . $REF;
}
}
}
}
}
}
##################################################
# v2021.2.21
# Returns the path where this script resides.
# Usage: STRING = SelfPath()
#
sub SelfPath
{
my $SELF = $0;
$SELF =~ tr|\\|/|;
my $PATH = cut($SELF, '/', 0x10010);
if (length($PATH)) { return $PATH; }
$PATH = ($^O =~ /WIN|DOS/i) ? `CD` : (exists($ENV{PWD}) ? $ENV{PWD} : `pwd`);
$PATH =~ tr|\\?*'\"\x00-\x1F|/|d;
return $PATH;
}
##################################################
#
# Terminates the script. In Linux, this function
# pauses the screen before exiting.
# Usage: EXIT([ERRORCODE])
#
sub EXIT
{
my $E = defined $_[0] ? $_[0] : 0;
if (!($^O =~ /WIN|DOS/i))
{
$| = 1;
print "\n\nEXITCODE = $E\n\nPress [ENTER] to exit...";
;
}
exit($E);
}