#!/usr/bin/perl use strict; use warnings; # # This Perl script searches all the HTML files in the # current directory and collects all the links and # displays them in a list. Written by Zsolt in 2021. # ################################################## my $PATH = SelfPath(); my $RECURSIVE = 1; my @NAME; my @HREF; my @ALLFILES; my $BIG_REF_LIST = ''; About(); CheckDIR($PATH); # Collect HTML file names my $OUTPUT_FILE = "$PATH/links.txt"; $OUTPUT_FILE =~ tr|/||s; my $DATA; my $FILE; my $NAMEONLY; my @EXT = qw(HTML HTM XML PDF DOC TXT JS JSP ASP ASX PHP PL CGI JPE JPG JPEG BMP GIF PNG TIF PCX); for (my $i = 0; $i < @ALLFILES; $i++) { $FILE = $ALLFILES[$i]; $NAMEONLY = cut($FILE, '/', 0x10101); $DATA = ReadFile($FILE); $DATA =~ s/\*([ A-Za-z0-9.,:\/\-\+\&\%\_\?\#]+)\<\/A\>//i; if (defined $1) { push(@HREF, $1); push(@NAME, $2); } } my @BIGLIST; for (my $i = 0; $i < @NAME; $i++) { $BIGLIST[$i] = "\r\n\"" . $HREF[$i] . "|" . $NAME[$i] . "\","; } @BIGLIST = sort(@BIGLIST); CreateFile($OUTPUT_FILE, join("", @BIGLIST)); EXIT(); ################################################## # # This function searches STRING to see if any of its # characters match any of the characters of SUBSTR. # Returns the pointer where the first match occurred # in STRING. Returns -1 if no matches were found. # If START is provided, the search will start at a # certain position in STRING. If the start is a negative # value, then the search will start from the end of string. # When INVERT is defined, this function will return the # first non-matching character's position. # # Usage: INTEGER = FindChar(STRING, SUBSTR, [START, [INVERT]]) # sub FindChar { defined $_[0] or return -1; defined $_[1] or return -1; (my $END = length($_[0])) && length($_[1]) or return -1; my $i = defined $_[2] ? $_[2] : 0; my $MATCH = defined $_[3] ? 0 : 1; $i < $END or return -1; my $DIR = 1; if ($i < 0) { $i += $END; $END = $DIR = -1; } $i >= 0 or return -1; for (; $i != $END; $i += $DIR) { if ((index($_[1], substr($_[0], $i, 1)) < 0 ? 0 : 1) == $MATCH) { return $i; } } return -1; } ################################################## # # This function removes all whitespace from before # and after text and returns a new string. This # function removes every character whose ASCII # value is less than 33. This includes tab, space, # null, vertical tab, esc, new lines, etc.. # # Usage: STRING = Trim(STRING) # sub Trim { my $S = defined $_[0] ? $_[0] : ''; my $L = length($S) or return ''; my $START = 0; my $LAST = 0; while ($L--) { if (vec($S, $L, 8) > 32) { $START = $L; $LAST or $LAST = $L + 1; } } return substr($S, $START, $LAST - $START); } ################################################## # # This function returns the OS type as a number. # 1=DOS 2=WINDOWS 3=LINUX 4=OSX 9=OTHER # sub GetOS { my $OS = uc($^O); index($OS, 'LINUX') >= 0 ? 3 : index($OS, 'MSWIN') >= 0 ? 2 : index($OS, 'DOS') >= 0 ? 1 : index($OS, 'DARWIN') >= 0 ? 4 : 9; } ################################################## # Prints the description of this program. # Usage: About() # sub About { my $PTRSIZE = `$^X -V:ptrsize`; $PTRSIZE =~ s/[^0-9]//g; print "\nPerl $] ", ($PTRSIZE << 3), '-bit ', $^O, ', ' . localtime(), "\n"; my $S = ReadFile($0); my $P = 1 + index($S, '# '); my $E = 1 + index($S, '###', $P); $E && $P or die "\nMissing argument.\n"; $S = substr($S, $P, $E - $P); $S =~ tr/#//d; print "\n $0\n\n$S"; } ################################################## # # This function reads the entire contents of a file # in binary mode and returns it as a string. If an # errors occur, an empty string is returned silently. # A second argument will move the file pointer before # reading. And a third argument limits the number # of bytes to read. # Usage: STRING = ReadFile(FILENAME, [START, [LENGTH]]) # sub ReadFile { my $NAME = defined $_[0] ? $_[0] : ''; $NAME =~ tr/\"\0*?|<>//d; # Remove special characters -e $NAME or return ''; -f $NAME or return ''; my $SIZE = -s $NAME; $SIZE or return ''; my $LEN = defined $_[2] ? $_[2] : $SIZE; $LEN > 0 or return ''; local *FH; sysopen(FH, $NAME, 0) or return ''; binmode FH; my $POS = defined $_[1] ? $_[1] : 0; $POS < $SIZE or return ''; $POS < 1 or sysseek(FH, 0, $POS); # Move file ptr my $DATA = ''; sysread(FH, $DATA, $LEN); # Read file close FH; return $DATA; } ################################################## # v2019.6.15 # Just like Trim(), this function can remove spaces # or tabs from before and after STRING but it can also # remove any other character, whatever is found in SUBSTR. # Usage: STRING = TrimChar(STRING, SUBSTR) # sub TrimChar { defined $_[0] or return ''; my $L = length($_[0]); $L or return ''; defined $_[1] or return $_[0]; length($_[1]) or return $_[0]; my $START = 0; my $LAST = 0; while ($L--) { if (index($_[1], substr($_[0], $L, 1)) < 0) { $START = $L; $LAST or $LAST = $L + 1; } } return substr($_[0], $START, $LAST - $START); } ################################################## # # This function reads the contents of a folder and # calls CatchFile() for each file that was found. # Usage: CheckDIR(PATH) # sub CheckDIR { my $PATH = defined $_[0] ? $_[0] : ''; length($PATH) or return; $PATH =~ tr|\\|/|; $PATH .= '/'; $PATH =~ tr|/||s; my $EXT; my @SUBDIRS; my $FULLNAME; local *DIR; opendir(DIR, $PATH) or return; while ((my $NAME = readdir(DIR))) { if (length($NAME) < 3) { $NAME ne '.' && $NAME ne '..' or next; } $FULLNAME = "$PATH$NAME"; # We will display directories first, followed by # symbolic links, then all other special files, # and plain files last. if (-d($FULLNAME)) { push(@SUBDIRS, $FULLNAME); next; } if (-f($FULLNAME)) { $EXT = uc(cut($FULLNAME, '.', 0x10001)); if ($EXT eq 'HTM' || $EXT eq 'HTML') { push(@ALLFILES, $FULLNAME); next; } } } closedir(DIR); $RECURSIVE or return; foreach $PATH (@SUBDIRS) { CheckDIR($PATH); } } ################################################## # v2019.11.24 # Creates and overwrites a file in binary mode. # Returns 1 on success or 0 if something went wrong. # Usage: INTEGER = CreateFile(FILE_NAME, CONTENT) # sub CreateFile { defined $_[0] or return 0; my $F = $_[0]; $F =~ tr/\"\0*?|<>//d; # Remove special characters length($F) or return 0; local *FH; open(FH, ">$F") or return 0; binmode FH; if (defined $_[1] ? length($_[1]) : 0) { print FH $_[1]; } close FH or return 0; return 1; } #################################################################################################### # v2021.2.12 # This function splits string into two parts along # the first occurrence of substring and places the # two resulting parts into $a and $b. If substring # is not found, then both $a and $b will be empty! # # The return value of this function will depend on # the value of the third argument (CMD). Without # the third argument, this function will simply cut # the first instance of substring from the string # and return what's left. # # The CMD argument is a sum of several numbers, and # it should be given as a hexadecimal number. Here's # how it works: # # cut('Hello World!', ' ', 0x001) returns 'World!' # cut('Hello World!', ' ', 0x010) returns 'Hello' # cut('Hello World!', ' ', 0x011) returns 'HelloWorld!' # # The third digit from the right controls what is # returned when the substring is not found. # 0=Return an empty string # 1=Return the original string # # cut('Hello World!', ' ', 0x101) returns 'World!' # cut('Hello World!', 'L', 0x101) returns 'Hello World!' # # To ignore case, we set the 4th digit: # # cut('Hello World!', 'L', 0x1101) returns 'lo World!' # # To search from the end of string, we set the 5th digit: # # cut('Hello World!', 'L', 0x11101) returns 'd!' # # Usage: STRING = cut(STRING, SUBSTR, [CMD]) # sub cut { my $STR = defined $_[0] ? $_[0] : ''; my $SUB = defined $_[1] ? $_[1] : ''; my $CMD = defined $_[2] ? $_[2] : 0x111; my $P = ($CMD & 0x1000) ? (($CMD & 0x10000) ? rindex(uc($STR), $SUB) : index(uc($STR), $SUB)) : (($CMD & 0x10000) ? rindex($STR, $SUB) : index($STR, $SUB)); $a = $b = ''; $P < 0 and return ($CMD & 256) ? $STR : ''; $a = substr($STR, 0, $P); $b = substr($STR, $P + length($SUB)); return ($CMD & 16 ? $a : '') . ($CMD & 1 ? $b : ''); } ################################################## # v2019.12.7 # This function scans string S and replaces the # first N occurrences of string A with string B # and returns a new string. If N is -1 then only # the last instance is replaced. # Usage: STRING = Replace(STRING_S, STRING_A, [STRING_B, [N]]) # sub Replace { # First, we make sure that required arguments are available # and any special scenarios are handled correctly. defined $_[0] or return ''; # Missing arguments? defined $_[1] or return $_[0]; # Missing arguments? my $B = defined $_[2] ? $_[2] : ''; # Replace to --> $B my $N = defined $_[3] ? $_[3] : 0x7FFFFFFF; # Get $N my ($LA, $LB) = (length($_[1]), length($B)); # Get string lengths # The search string must not be an empty string, or we exit. # The string that we search for must not be longer than # the string in which we search. ($N && $LA && $LA <= length($_[0])) or return $_[0]; my ($LAST, $F, $X) = (0, 0, $_[0]); if ($N > 0x7FFFFFFE) { # If N was not provided, then that means we have to # replace every instance, so we'll use regex... my $A = $_[1]; $X =~ s/\Q$A\E/$B/g; return $X; } if ($N < 0) { # If we get here, we must not replace every # instance, and we must go from right to left. $F = length($X); while (($F = rindex($X, $_[1], $F)) >= 0) { substr($X, $F, $LA) = $B; ++$N or last; } return $X; } if ($LA == $LB) { # In this case, output string will be the # same length as the input string. # We must not replace every instance, # and we search from left to right. while (($F = index($X, $_[1], $F)) >= 0) { substr($X, $F, $LA) = $B; $F += $LB; --$N or last; } return $X; } # In this final scenario, the output string will # NOT be the same length as the input string. # We must not replace every instance, # and we search from left to right. # For performance reasons, we build a new string. $X = ''; while (($F = index($_[0], $_[1], $F)) >= 0) { $X .= substr($_[0], $LAST, $F - $LAST); $X .= $B; $F += $LA; $LAST = $F; --$N or last; } return $X . substr($_[0], $LAST); } ################################################## # v2019.12.9 # Returns one or more words from a string. # The string is treated as a list of words separated # by whitespace. In this case, a "whitespace" is any # character whose ASCII value is less than 33. This # includes new line characters, tab, space, null, etc. # PTR tells which word to grab starting with 1. # If PTR is 3, the third word is returned. # If PTR is not specified, the default value is 1. # COUNT tells how many words to return. Default is 1. # When COUNT has a negative value, returns every word # from PTR all the way to the end of the string. # The words in the return value will always be separated # by a space character regardless of how many spaces or # tabs were between them in the input string. # # Usage: STRING = GetWord(STRING, [PTR, [COUNT]]) # sub GetWord { defined $_[0] or return ''; my $LEN = length($_[0]); my $PTR = defined $_[1] ? $_[1] : 1; my $COUNT = defined $_[2] ? $_[2] : 1; return '' if ($LEN == 0 || $COUNT == 0 || $PTR >= $LEN); my $START = -1; my $OUTPUT = ''; $PTR > 0 or $PTR = 1; for (my $i = 0; $i <= $LEN; $i++) { if (vec($_[0], $i, 8) > 32) { $START >= 0 or $START = $i; next; } if ($START >= 0) { if ($PTR-- < 2) { length($OUTPUT) == 0 or $OUTPUT .= ' '; $OUTPUT .= substr($_[0], $START, $i - $START); last if ($COUNT-- == 1); } $START = -1; } } return $OUTPUT; } ################################################## # # This function prints all HREF= website addresses # from a HTML file. This function requires the # presence of a global variable called $DATA # which holds the file contents. # sub ListLinks { my @A; my $L = length($DATA); $L or return @A; my $U = uc($DATA); my $COUNT = 0; for (my $i = 0; $i < $L; $i++) { my $P = index($U, '$COUNT"; $COUNT++; $BIG_REF_LIST .= ' *' . $REF; } } } } } } ################################################## # v2021.2.21 # Returns the path where this script resides. # Usage: STRING = SelfPath() # sub SelfPath { my $SELF = $0; $SELF =~ tr|\\|/|; my $PATH = cut($SELF, '/', 0x10010); if (length($PATH)) { return $PATH; } $PATH = ($^O =~ /WIN|DOS/i) ? `CD` : (exists($ENV{PWD}) ? $ENV{PWD} : `pwd`); $PATH =~ tr|\\?*'\"\x00-\x1F|/|d; return $PATH; } ################################################## # # Terminates the script. In Linux, this function # pauses the screen before exiting. # Usage: EXIT([ERRORCODE]) # sub EXIT { my $E = defined $_[0] ? $_[0] : 0; if (!($^O =~ /WIN|DOS/i)) { $| = 1; print "\n\nEXITCODE = $E\n\nPress [ENTER] to exit..."; ; } exit($E); }