#!/usr/bin/perl use strict; use warnings; # # "Quick & Dirty HTML META TAG EXTRACTOR" # Written by Zsolt on July 31, 2022. (zsnp@juno.com) # # This perl script reads all HTML and HTM files in the current directory # and extracts META TAGS from the HTML header and saves them in a file # called META.HTM in the current directory. # ################################################################### my @CAPTURE = ('*.HTM', '*.HTML'); # Capture these files my @FIND = (''; } unshift(@OUTPUT, "$HEADER"); } else { unshift(@OUTPUT, $HEADER . $BR); } CreateFile($OUTFILE, join($BR, @OUTPUT)); exit; # Goodbye! ################################################## # v2022.7.31 # This function is automatically called by CatchFile() # every time a file's name matches any search patterns # listed in the @CAPTURE. # # Usage: ProcessFile(FULLNAME) <-- Called by CatchFile() # sub ProcessFile { my $FULLNAME = shift; print "\n $FULLNAME"; my $c; my $P; my $MODE = 0; my $WORD = ''; my $DATA = ReadFile($FULLNAME); my $SOURCE = ''; if ($INCLUDE_FILENAMES == 1) { $SOURCE = " --- $FULLNAME"; } if ($INCLUDE_FILENAMES == 2) { $SOURCE = ' --- ' . ShortenPath($CURDIR, $FULLNAME); } while (length($DATA)) { my $P = FindAnyOf($DATA, @FIND); $P >= 0 or last; $DATA = substr($DATA, $P); $P = index($DATA, '>'); if ($P >= 0) { push(@OUTPUT, Trim(CollapseWhitespace(substr($DATA, 0, $P+1))) . $SOURCE); $DATA = substr($DATA, $P); } else { push(@OUTPUT, $DATA . $SOURCE); last; } } } ################################################## # v2022.7.31 # This function is automatically called by CheckDIR() # every time a file is found. This function gets # the full name of the file. # # Usage: CatchFile(FULLNAME) <-- Called by CheckDIR() # sub CatchFile { my $FULLNAME = shift; -s $FULLNAME or return; # File size is zero? return if (FixPath(uc($FULLNAME)) eq FixPath(uc($OUTFILE))); # Is this our own output file? foreach (@CAPTURE) { if (isMatch($FULLNAME, $_)) { ProcessFile($FULLNAME); } } } ################################################## # # This function reads the contents of a folder and calls # CatchFile() for each file that was found. # Returns 0 on success. # Returns a negative value if an error occurred. # # Usage: CheckDIR(PATH) # sub CheckDIR { @_ or return; my $PATH = shift; defined $PATH or return; length($PATH) or return; my $FULLNAME; print "\n\n Reading directory: $PATH\n"; $PATH = FixPath("$PATH/"); opendir(my $DIR, $PATH) or return; while (my $NAME = readdir $DIR) { $FULLNAME = "$PATH$NAME"; if (-d($FULLNAME)) { # Check into subdirectory if RECURSIVE == 1 # Skip directory if its name starts with "." if ($RECURSIVE) { CheckDIR($FULLNAME) unless (vec($NAME, 0, 8) == 46); } next; } CatchFile($FULLNAME); } closedir $DIR; } ################################################## # v2022.7.31 # This function works just like the index() function # except it can compare more than one string. Returns # the index of the closest match or -1 if no matches were found. # # Usage: INTEGER = FindAnyOf(STRING, SUBSTR1, [SUBSTR2...]) # sub FindAnyOf { my @INDEX; my $TEST = uc(shift); while (@_) { push(@INDEX, index($TEST, shift)); } my $MAX = 9999999999; my $FIRST = $MAX; foreach (@INDEX) { if ($_ >= 0 && $FIRST > $_) { $FIRST = $_; } } return ($FIRST == $MAX) ? -1 : $FIRST; } ############################################################## # # This function converts all adjacent whitespace characters to # a single space. In this function, "whitespace" is defined as # a character whose ASCII value is less than 33. (This includes # many special characters such as new line characters, nul, etc.) # A second argument may be supplied to convert to something # other than a space. The second argument must be a number (0-255). # # Usage: STRING = CollapseWhitespace(STRING, [ASCII_VALUE]) # # Example: # CollapseWhitespace("\n\t abc 123 \n") --> " abc 123 " # CollapseWhitespace("\n\t abc 123 \n", 45) --> "-abc-123-" # sub CollapseWhitespace { @_ or return ''; my $T = shift; defined $T or return ''; my $L = length($T); $L or return ''; my $SP = @_ ? $_[0] & 255 : 32; my $c; my $N = 0; # consecutive whitespace counter my $P = 0; # target pointer to overwrite original str $T my $U = 1; # string length will be left unchanged for (my $i = 0; $i < $L; $i++) { $c = vec($T, $i, 8); if ($c < 33) { # When the first "whitespace" is encountered, we # replace it with a SPACE. When the second consecutive # whitespace is encountered, we have to reduce the string. if ($N++) { $U = 0; } else { vec($T, $P++, 8) = $SP; } } else { $U or vec($T, $P, 8) = $c; $N = 0; $P++; } } return $U ? $T : substr($T, 0, $P); } ################################################## # v2020.11.19 # This function removes duplicate lines from an array # by sorting it and comparing each line with case-sensitive # comparison. Returns a new array. # # Usage: NEW_ARRAY = DeleteDuplicates(ARRAY) # sub DeleteDuplicates { my @A = @_; @A > 1 or return @A; @A = sort(@A); my $i = 0; my $j = 1; while ($j < @A) { if ($A[$i] eq $A[$j]) { splice(@A, $j, 1); } else { $i++; $j++; } } return @A; } ################################################## # v2021.3.16 # This function converts a string to HTML-safe # string output. For example, it will convert the # < > signs to < and > # # Usage: STRING = QuoteHTML(STRING) # sub QuoteHTML { my $STR = defined $_[0] ? $_[0] : ''; $STR =~ s/\r\n/\n/g; $STR =~ s/\t/ /g; my $OUTPUT = ''; my ($i, $L, $C, $c) = (0, length($STR), ''); while ($i < $L) { $c = vec($STR, $i++, 8); if ($c == 38) { $C = '&'; } elsif ($c == 60) { $C = '<'; } elsif ($c == 62) { $C = '>'; } elsif ($c == 32) { $C = ($C eq ' ') ? ' ' : ' '; } elsif ($c == 10) { $C = "\n "; } elsif ($c < 32 || $c > 126) { $C = "&#$c;"; } else { $C = chr($c); } $OUTPUT .= $C; } return $OUTPUT; } ################################################## # Usage: STRING = Trim(STRING) - Removes whitespace before and after STRING. sub Trim {defined$_[0]||return'';(my$L=length($_[0]))||return'';my$P=0;while($P<=$L&&vec($_[0],$P++,8)<33){}for($P--;$P<=$L&&vec($_[0],$L--,8)<33;){}substr($_[0],$P,$L-$P+2);} # Usage: STRING = GetCurrentDirectory([DRIVE]) - Returns the current directory. (If a drive letter is provided in the first argument, then it returns the current directory for that drive. This only applies to DOS and Windows where each drive letter can have its own current directory. If no drive letter is provided, then it returns the current directory of the current drive under DOS and Windows.) v2022.7.11 sub GetCurrentDirectory {if($^O=~/DOS|MSWIN/i){my$DRV=defined$_[0]?substr(Trim($_[0]), 0, 2):'';return Trim(`CD $DRV`);}return Trim(`pwd`);} # Usage: STRING = FixPath(PATH) - Changes the file name separator to forward slash or backslash depending on the current OS. Fixes duplicate slashes. v2022.7.12 sub FixPath {my$P=defined$_[0]?$_[0]:'';if($^O=~/DOS|MSWIN/i){$P=~tr#/#\\#;$P=~tr|\\||s;}else{$P=~tr#\\#/#;$P=~tr|/||s;}return$P;} # Usage: INTEGER = FindChar(STRING, SUBSTR) - This function searches STRING to see if any of its characters match any of the characters of SUBSTR. Returns 1 if a match was found; returns 0 otherwise. sub FindChar {defined$_[0]&&defined$_[1]or return 0;length($_[0])||return 0;my$i=length($_[1]);while($i--){index($_[0],substr($_[1],$i,1))<0or return 1;}return 0;} # Usage: INTEGER = isMatch(FILENAME, WILDCARD) - This function returns true if a filename matches a certain wildcard pattern. There may be several question marks in the search pattern, but only one asterisk is allowed! The matching is NOT case sensitive! Example: isMatch("New_Document.txt", "n*.txt") ==> 1 sub isMatch {my$F=defined$_[0]?uc($_[0]):'';my$W=defined$_[1]?uc($_[1]):'';length($F)&&length($W)or return 0;if(FindChar($W.$F,'<|>')){return 0;}if(FindChar($W,'*?')==0){return($F eq $W)?1:0;}return 0 unless(_isMatch($F,$W,1));return _isMatch($F,$W,-1);} # Usage: INTEGER = _isMatch(FILENAME, WILDCARD, DIRECTION) - This function is called by isMatch() to compare two strings until the first asterisk. Returns 1 if both strings match until the first asterisk. This function can start comparing strings starting from the beginning or starting from the end! DIRECTION must be either 1 or -1. sub _isMatch {my$F=shift;my$f;my$LF=length($F)-1;my$W=shift;my$w;my$LW=length($W)-1;my$DIR=shift;my$STOP=$LW;my$START=0;my$FSTART=0;if($DIR<0){$STOP=0;$START=$LW;$FSTART=$LF;}while($START!=$STOP){$w=vec($W,$START,8);$f=vec($F,$FSTART,8);$START+=$DIR;$FSTART+=$DIR;if($w==42){return 1;}else{if($w!=63){($f==$w)or return 0;}}}return 1;} # Usage: STRING = SafeFileName(FILENAME) - Removes illegal characters from a file name. v2022.7.11 sub SafeFileName {my$F=defined$_[0]?$_[0]:'';$F=~tr#<>*%$?\r\n\"\0|##d;return$F;} # Usage: STRING = ReadFile(FILENAME, [START, [LENGTH]]) - Reads an entire file or part of a file in binary mode. Returns the file contents as a string. A second argument will move the file pointer before reading, and a third argument limits the number of bytes to read. v2022.7.11 sub ReadFile {my$F=SafeFileName($_[0]);-e$F||return'';-f$F||return'';my$S=-s$F;$S||return'';my$L=defined$_[2]?$_[2]:$S;$L>0||return'';local*H;sysopen(H,$F,0)||return'';binmode H;my$P=defined$_[1]?$_[1]:0;$P>=0or $P=0;$P<$S||return'';$P<1||sysseek(H,$P,0);my$D='';sysread(H,$D,$L);close H;return$D;} # Usage: INTEGER = CreateFile(FILENAME, [CONTENT]) - Creates and overwrites a file in binary mode. Returns 1 on success or 0 if something went wrong. v2022.07.11 sub CreateFile {my$F=SafeFileName($_[0]);length($F)||return 0;my$L=defined$_[1]?length($_[1]):0;local*H;open(H,">$F")||return 0;binmode H;if($L){print H $_[1];}close H;-e$F||return 0;return(-s$F==$L)?1:0;} # Usage: STRING = MergePath(STRINGs) - Joins two or more words to form a complete path string. Also removes duplicate slashes and resolves .. parental references. v2021.3.1 sub MergePath {my$P='';foreach(@_){$P.='/'.Trim($_);$P=~tr!\x00-\x1F|<">!!d;}$P=substr($P,1);$P=~tr|\\|/|;$P=~tr|/||s;$P=~s/\/\.\//\//;$P=~s/\/[^\/]+\/\.\.//;return$P;} # Usage: STRING = ShortenPath(PATH1, PATH2) - This function returns Path2 as relative to Path1 if both are full paths. If Path2 has the same origin as Path1, the common origin is removed, leaving only the difference. Example: ShortenPath('C:\\WORK', 'C:\\WORK\\2022\\LIST.pdf') => '2022\LIST.pdf' sub ShortenPath { my $P1 = FixPath($_[0] . '/'); my $P2 = FixPath($_[1]); return (index($P2, $P1) == 0) ? substr($P2, length($P1)) : $P2; } # Returns the current local time as a string in the following format: LocalTime() => "Sun Jul 31 2022 1:24p" sub LocalTime { my @D = localtime(defined $_[0] ? $_[0] : time); my $M = substr('JanFebMarAprMayJunJulAugSepOctNovDec', $D[4] * 3, 3); my $W = substr('SunMonTueWedThuFriSat', $D[6] * 3, 3); my $A = $D[2] > 11 ? 'p' : 'a'; $D[2] or $D[2] = 12; $D[2] < 13 or $D[2] -= 12; return "$W $M $D[3] " . sprintf('%.04d %d:%.02d', (1900+$D[5]), $D[2], $D[1]) . $A; } # Usage: About() - Prints the description of this program. sub About { my $PTRSIZE = `$^X -V:ptrsize`; $PTRSIZE =~ s/[^0-9]//g; print "\nPerl $] ", ($PTRSIZE << 3), '-bit ', $^O, ', ' . LocalTime(), "\n"; my $S = ReadFile($0); my $P = 1 + index($S, '# '); my $E = 1 + index($S, '###', $P); $E && $P or die "\nMissing argument.\n"; $S = substr($S, $P, $E - $P); $S =~ tr/#//d; print "\n $0\n\n$S"; } #################################################################