#!/usr/bin/perl -w # # This Perl script reads a Schwab 1099 tax report of # stock/options trades in raw text format and outputs # a list in XLS format which can then be provided to # the accountant who can then simply copy and paste # the data from the Excel file into the tax software. # # Written by Zsolt Nagy-Perge # in February 2021, Pensacola, Fla. # ################################################## # # This program was made with Notepad2 and was tested # with TinyPerl 5.8 running in Windows XP. # ################################################## use strict; use warnings; ################################################## ## ## GLOBAL VARIABLES ## my $MONTHS = 'JanFebMarAprMayJunJulAugSepOctNovDec'; # Here's a list of popular securities (CUSIP=SYMBOL) # downloaded from https://stockzoa.com/ticker/qqq # This list must be separated by spaces, # and it must start and end with a space: # Each CUSIP number must start with the letters 'CUS' my $SYMBOL_LIST = uc(' CUS46090E103=QQQ CUS78462F103=SPY CUS88160R101=TSLA CUS037833100=AAPL CUS594918104=MSFT CUS217204106=CPRT CUS060505104=BAC CUS172967424=C CUS020002101=ALL CUS89417E109=TRV CUS269246401=ETFC CUS808513105=SCHW CUS949746101=WFC CUS617446448=MS CUS38141G104=GS CUS084670702=BRKB CUS315616102=FFIV CUS369604103=GE CUS64110L106=NFLX CUS023135106=AMZN CUS097023105=BA CUS74347B268=SRTY CUS74347G861=SQQQ CUS74347W114=ZSL CUS74347W148=UVXY CUS74347X864=UPRO CUS46428Q109=SLV CUS78463V107=GLD CUS25459W540=TMF CUS464287432=TLT CUS511795106=LAKE CUS30231G102=XOM CUS254687106=DIS CUS244199105=DE CUS539830109=LMT CUS717081103=PFE CUS654106103=NKE CUS45773H201=INO CUS25460E182=SOXS CUS25460G690=SOXS CUS22542D316=UGLD CUS129500104=CAL CUS22542D332=TVIXF CUS708160106=JCPNQ CUS85207U105=S CUS92343V104=VZ CUS90130A200=FOX CUS98986T108=ZNGA CUS36467W109=GME CUS437076102=HD CUS548661107=LOW CUS931142103=WMT CUS438516106=HON CUS17275R102=CSCO CUS191216100=KO CUS375558103=GILD CUS31428X106=FDX CUS461202103=INTU CUS171340102=CHD CUS67066G104=NVDA CUS458140100=INTC CUS007903107=AMD CUS219350105=AMAT CUS219350105=GLW CUS02209S103=MO CUS88579Y101=MMM CUS247361702=DAL CUS477143101=JBLU '); my $CUSIP_DB = ''; my @WORDS; my @GRAND_TOTAL; # Grand Total: SOLD, COST, WASH, PROFIT my $DATA = ''; # An entire text file's contents my $LINE = ''; # Current line or paragraph my $OUTPUT = ''; # output buffer my $PATTERN = ''; # Line content pattern to match my $NAME = ''; # Company Name my $TYPE = ''; # PUT or CALL my $SYMBOL = ''; # Stock Symbol my $CUSIP = ''; # Stock ID No. my $QTY = ''; # Quantity my $DATE_OPEN = ''; # Date when position was opened (Acquired) my $DATE_CLOSE = ''; # Date when position was closed (Sold) my $EXP_DATE = ''; # Option Expiration Date my $STRIKE = ''; # Option Strike Price my $SOLD = ''; # Sale/Proceeds my $COST = ''; # Cost/Basis my $WASH_SALE = ''; # Wash sale my $PROFIT = ''; # Gain or loss my $WITHHELD = ''; # Federal income tax withheld my $DISCOUNT = ''; # Market discount my $DESCRIPTION = ''; # Description of stock or option my $RAW_DATA = ''; # contains a single line of data read from the file my $BLANK = ''; # a space holder for columns that are intentionally left blank my $NO = 0; # line numbers my $TAXYEAR = 2020; # Current tax year my $UNIT = ''; my $WASH = ''; my $NOPAUSE = 0; my $SYMBOL_LIST = ''; my $REGEX_CAPT = ''; ################################################## ## ## PROGRAM BEGINS HERE ## About(); # Print the description of this program PrintDottedLine(); # CREATE OUTPUT FILE HEADER: # The header is going to be the first row in the CSV file. # Each name here corresponds to a variable name in the program, # so if you change the name in the header, you must also modify # the name of that variable everywhere in the program! my $HEADER = "DESCRIPTION, DATE_OPEN, DATE_CLOSE, SOLD, COST, WASH_SALE, PROFIT, BLANK, NO, WITHHELD, DISCOUNT, CUSIP, SYMBOL, NAME, QTY, TYPE, EXP_DATE, STRIKE, PATTERN, RAW_DATA\r\n"; my $TEMPLATE = $HEADER; $TEMPLATE =~ s/NO/0/; # The transaction number heading will be changed to 0 $TEMPLATE =~ s/BLANK//; # Remove blank column headings $TEMPLATE =~ tr/,/|/; # We change all the commas to '|' temporarily $TEMPLATE =~ tr|_| |; # Next, we change all the '_' to spaces $TEMPLATE = TitleCase($TEMPLATE); # Format the first row $HEADER =~ s/[ ,\t]+/|\$/g; # Insert $ signs in front of header labels $HEADER = '$' . $HEADER; # Find all text files in the current directory. my $MYPATH = GetCurrentWorkingDirectory(); my $CUSIP_FILE = FormatPath($MYPATH, 'ALLCUSIP.CSV'); my @FILES = ReadDIR($MYPATH, 'txt text'); # Look for Schwab text files. my $FOUND = 0; print "Searching for Schwab text files...\n\nDirectory of $MYPATH\n"; foreach my $INPUT_FILE (@FILES) { my $FILESIZE = -s $INPUT_FILE; print "\nChecking: ", $INPUT_FILE; $FILESIZE or next; my $PREVIEW = ReadFile($INPUT_FILE, 0, 1000); # Read first 1000 bytes $PREVIEW =~ tr| a-zA-Z||cd; # Convert UTF-8 to plain text if (index($PREVIEW, 'Schwab One') >= 0) { ParseSchwabFile_fromRH($INPUT_FILE); $FOUND++; } } PrintDottedLine(); print($FOUND ? "PROCESSED: $FOUND SCHWAB FILE(s)" : "NONE FOUND!"); PAUSE(); exit; # END OF PROGRAM ################################################## # # This function searches STRING to see if any of its # characters match any of the characters of SUBSTR. # Returns the pointer where the first match occurred # in STRING. Returns -1 if no matches were found. # If START is provided, the search will start at a # certain position in STRING. If the start is a negative # value, then the search will start from the end of string. # When INVERT is defined, this function will return the # first non-matching character's position. # # Usage: INTEGER = FindChar(STRING, SUBSTR, [START, [INVERT]]) # sub FindChar { defined $_[0] or return -1; defined $_[1] or return -1; (my $END = length($_[0])) && length($_[1]) or return -1; my $i = defined $_[2] ? $_[2] : 0; my $MATCH = defined $_[3] ? 0 : 1; $i < $END or return -1; my $DIR = 1; if ($i < 0) { $i += $END; $END = $DIR = -1; } $i >= 0 or return -1; for (; $i != $END; $i += $DIR) { if ((index($_[1], substr($_[0], $i, 1)) < 0 ? 0 : 1) == $MATCH) { return $i; } } return -1; } ################################################## # v2021.3.1 # This function joins two or more paths and returns # a complete path string in localized format. # Usage: STRING = FormatPath(STRINGs...) # sub FormatPath { my $P = ''; foreach (@_) # Trim SPACEs, double quotes, TAB, CR, LF, etc. { $P .= '/' . TrimChar($_, " \"\t\r\n\0\f"); } $P = substr($P, 1); $P =~ tr|\\|/|; # Convert to Linux format $P =~ tr|/||s; # Remove duplicate '//' $P =~ s/\/[^\/]+\/\.\.//; # Resolve '/directory_name/..' if ($^O =~ /DOS|WIN/i) { $P =~ tr|/|\\|; } # Convert to DOS format return $P; } ################################################## # Prints the description of this program. # Usage: About() # sub About { my $PTRSIZE = `$^X -V:ptrsize`; $PTRSIZE =~ s/[^0-9]//g; print "\nPerl $] ", ($PTRSIZE << 3), '-bit ', $^O, ' ' x 20, TimeStamp(), "\n\n$0"; my $S = ReadFile($0, 0, 1000); my $P = 1 + index($S, '# '); my $E = 1 + index($S, '###', $P); $P && $E or return; $S = substr($S, $P, $E - $P); $S =~ tr|#| |; print "\n\n ", RTRIM($S); } ################################################## # # This function reads the entire contents of a file # in binary mode and returns it as a string. If an # errors occur, an empty string is returned silently. # A second argument will move the file pointer before # reading. And a third argument limits the number # of bytes to read. # Usage: STRING = ReadFile(FILENAME, [START, [LENGTH]]) # sub ReadFile { my $NAME = defined $_[0] ? $_[0] : ''; $NAME =~ tr/\"\0*?|<>//d; # Remove special characters -e $NAME or return ''; -f $NAME or return ''; my $SIZE = -s $NAME; $SIZE or return ''; my $LEN = defined $_[2] ? $_[2] : $SIZE; $LEN > 0 or return ''; local *FH; sysopen(FH, $NAME, 0) or return ''; binmode FH; my $POS = defined $_[1] ? $_[1] : 0; $POS < $SIZE or return ''; $POS < 1 or sysseek(FH, 0, $POS); # Move file ptr my $DATA = ''; sysread(FH, $DATA, $LEN); # Read file close FH; return $DATA; } ################################################## # v2019.11.24 # Creates and overwrites a file in binary mode. # Returns 1 on success or 0 if something went wrong. # Usage: INTEGER = CreateFile(FILE_NAME, CONTENT) # sub CreateFile { defined $_[0] or return 0; my $F = $_[0]; $F =~ tr/\"\0*?|<>//d; # Remove special characters length($F) or return 0; local *FH; open(FH, ">$F") or return 0; binmode FH; if (defined $_[1] ? length($_[1]) : 0) { print FH $_[1]; } close FH or return 0; return 1; } #################################################################################################### # v2021.2.12 # This function splits string into two parts along # the first occurrence of substring and places the # two resulting parts into $a and $b. If substring # is not found, then both $a and $b will be empty! # # The return value of this function will depend on # the value of the third argument (CMD). Without # the third argument, this function will simply cut # the first instance of substring from the string # and return what's left. # # The CMD argument is a sum of several numbers, and # it should be given as a hexadecimal number. Here's # how it works: # # cut('Hello World!', ' ', 0x001) returns 'World!' # cut('Hello World!', ' ', 0x010) returns 'Hello' # cut('Hello World!', ' ', 0x011) returns 'HelloWorld!' # # The third digit from the right controls what is # returned when the substring is not found. # 0=Return an empty string # 1=Return the original string # # cut('Hello World!', ' ', 0x101) returns 'World!' # cut('Hello World!', 'L', 0x101) returns 'Hello World!' # # To ignore case, we set the 4th digit: # # cut('Hello World!', 'L', 0x1101) returns 'lo World!' # # To search from the end of string, we set the 5th digit: # # cut('Hello World!', 'L', 0x11101) returns 'd!' # # Usage: STRING = cut(STRING, SUBSTR, [CMD]) # sub cut { my $STR = defined $_[0] ? $_[0] : ''; my $SUB = defined $_[1] ? $_[1] : ''; my $CMD = defined $_[2] ? $_[2] : 0x111; my $P = ($CMD & 0x1000) ? (($CMD & 0x10000) ? rindex(uc($STR), $SUB) : index(uc($STR), $SUB)) : (($CMD & 0x10000) ? rindex($STR, $SUB) : index($STR, $SUB)); $a = $b = ''; $P < 0 and return ($CMD & 256) ? $STR : ''; $a = substr($STR, 0, $P); $b = substr($STR, $P + length($SUB)); return ($CMD & 16 ? $a : '') . ($CMD & 1 ? $b : ''); } ################################################## # v2019.12.7 # This function scans string S and replaces the # first N occurrences of string A with string B # and returns a new string. If N is -1 then only # the last instance is replaced. # Usage: STRING = Replace(STRING_S, STRING_A, [STRING_B, [N]]) # sub Replace { # First, we make sure that required arguments are available # and any special scenarios are handled correctly. defined $_[0] or return ''; # Missing arguments? defined $_[1] or return $_[0]; # Missing arguments? my $B = defined $_[2] ? $_[2] : ''; # Replace to --> $B my $N = defined $_[3] ? $_[3] : 0x7FFFFFFF; # Get $N my ($LA, $LB) = (length($_[1]), length($B)); # Get string lengths # The search string must not be an empty string, or we exit. # The string that we search for must not be longer than # the string in which we search. ($N && $LA && $LA <= length($_[0])) or return $_[0]; my ($LAST, $F, $X) = (0, 0, $_[0]); if ($N > 0x7FFFFFFE) { # If N was not provided, then that means we have to # replace every instance, so we'll use regex... my $A = $_[1]; $X =~ s/\Q$A\E/$B/g; return $X; } if ($N < 0) { # If we get here, we must not replace every # instance, and we must go from right to left. $F = length($X); while (($F = rindex($X, $_[1], $F)) >= 0) { substr($X, $F, $LA) = $B; ++$N or last; } return $X; } if ($LA == $LB) { # In this case, output string will be the # same length as the input string. # We must not replace every instance, # and we search from left to right. while (($F = index($X, $_[1], $F)) >= 0) { substr($X, $F, $LA) = $B; $F += $LB; --$N or last; } return $X; } # In this final scenario, the output string will # NOT be the same length as the input string. # We must not replace every instance, # and we search from left to right. # For performance reasons, we build a new string. $X = ''; while (($F = index($_[0], $_[1], $F)) >= 0) { $X .= substr($_[0], $LAST, $F - $LAST); $X .= $B; $F += $LA; $LAST = $F; --$N or last; } return $X . substr($_[0], $LAST); } ################################################## # v2019.12.9 # Returns one or more words from a string. # The string is treated as a list of words separated # by whitespace. In this case, a "whitespace" is any # character whose ASCII value is less than 33. This # includes new line characters, tab, space, null, etc. # PTR tells which word to grab starting with 1. # If PTR is 3, the third word is returned. # If PTR is not specified, the default value is 1. # COUNT tells how many words to return. Default is 1. # When COUNT has a negative value, returns every word # from PTR all the way to the end of the string. # The words in the return value will always be separated # by a space character regardless of how many spaces or # tabs were between them in the input string. # # Usage: STRING = GetWord(STRING, [PTR, [COUNT]]) # sub GetWord { defined $_[0] or return ''; my $LEN = length($_[0]); my $PTR = defined $_[1] ? $_[1] : 1; my $COUNT = defined $_[2] ? $_[2] : 1; return '' if ($LEN == 0 || $COUNT == 0 || $PTR >= $LEN); my $START = -1; my $OUTPUT = ''; $PTR > 0 or $PTR = 1; for (my $i = 0; $i <= $LEN; $i++) { if (vec($_[0], $i, 8) > 32) { $START >= 0 or $START = $i; next; } if ($START >= 0) { if ($PTR-- < 2) { length($OUTPUT) == 0 or $OUTPUT .= ' '; $OUTPUT .= substr($_[0], $START, $i - $START); last if ($COUNT-- == 1); } $START = -1; } } return $OUTPUT; } ################################################## # # Terminates the script. In Linux, this function # pauses the screen before exiting. # Usage: EXIT([ERRORCODE]) # sub EXIT { my $E = defined $_[0] ? $_[0] : 0; if (!($^O =~ /WIN|DOS/i)) { $| = 1; print "\n\nEXITCODE = $E\n\nPress [ENTER] to exit..."; ; } exit($E); } ################################################## # # This function inserts commas into a number at # every 3 digits and returns a string. # Usage: STRING = Commify(INTEGER) # Copied from www.PerlMonks.org/?node_id=157725 # sub Commify { my $N = reverse $_[0]; $N =~ s/(\d\d\d)(?=\d)(?!\d*\.)/$1,/g; return scalar reverse $N; } ################################################## # Converts all adjacent whitespace characters # to a single space. # Usage: STRING = CollapseSpace(STRING) # sub CollapseSpace { my $STR = defined $_[0] ? $_[0] : ''; $STR =~ s/[ \t\n\r\f\0]+/ /g; return $STR; } ################################################## sub ConvertToDateFormat { my $DATE = defined $_[0] ? $_[0] : ''; # A valid date must contain two forward slashes CountStr($DATE, '/') == 2 or return ''; # Split numbers my @D = GetNumbers($DATE); # A valid date must contain 3 numbers @D == 3 or return ''; # Check for impossible values my $LAST_TWO_DIGITS_OF_TAXYEAR = substr($TAXYEAR, 2, 2); $D[0] <= 12 or return ''; # MONTH $D[1] <= 31 or return ''; # DAY $D[2] == $LAST_TWO_DIGITS_OF_TAXYEAR or return ''; # YEAR # Return date as MM/DD/YYYY return sprintf('%.2d/%.2d/%s', $D[0], $D[1], $TAXYEAR); } ################################################## # v2021.1.17 # This function extracts digits from a string and # returns them in an array. # Example: # GetNumbers("34DX5g") -> ["34", "5"] # GetNumbers("-x39.4") -> ["39", "4"] # Usage: ARRAY = GetNumbers(STRING) # sub GetNumbers { my @N; defined $_[0] or return @N; my ($L, $p, $i, $c) = (length($_[0]), -1); for ($i = 0; $i <= $L; $i++) { $c = vec($_[0], $i, 8); if ($c < 48 || $c > 57) { if ($p >= 0) { push(@N, substr($_[0], $p, $i - $p)); } $p = -1; } elsif ($p < 0) { $p = $i; } } return @N; } ################################################## # Returns 1 if SUBSTRING is found in STRING. # Returns 0 if SUBSTRING is not found. # Usage: INTEGER = Contains(STRING, SUBSTR) sub Contains { defined $_[0] or return 0; length($_[0]) or return 0; defined $_[1] or return 0; length($_[1]) or return 0; return index($_[0], $_[1]) < 0 ? 0 : 1; } ################################################## sub ValidateAmount { my $N = defined $_[0] ? $_[0] : ''; Contains($N, '.') or return -1; Contains($N, '$') or return -1; $N =~ tr|.0-9||cd; SplitAB($N, '.'); length($a) >= 1 or return -1; length($b) == 2 or return -1; return $N; } ################################################## # v2019.11.24 # This function splits STRING into two parts along the # first occurrence of SUBSTR. The two resulting string # segments are stored in $a and $b. The search for # SUBSTR starts at position N. If N is -1, then # starts searching from the end of the string. # Returns 0 if SUBSTR was not found, OR # returns POSITION+1 where SUBSTR was found. # If SUBSTR is not found, the entire input string # will be stored in $a, while $b will be empty. # Usage: FOUND = SplitAB(STRING, SUBSTR, [N]) # sub SplitAB { $a = $b = ''; defined $_[0] or return 0; defined $_[1] or return 0; length($_[1]) or return 0; my ($P, $L) = (0, length($_[0])); $L or return 0; my $N = defined $_[2] ? $_[2] : 0; $N < $L or return 0; if ($N < 0) { $N += $L; $N >= 0 or return 0; $P = rindex($_[0], $_[1], $N); } else { $P = index($_[0], $_[1], $N); } $P >= 0 or return 0; $a = substr($_[0], 0, $P); $b = substr($_[0], $P + length($_[1])); return $P + 1; } ################################################## # Returns 1 if STRING contains any digits. # Returns 0 if STRING does not contain any digits. # Usage: INTEGER = ContainsDigits(STRING) sub ContainsDigits { my $X = defined $_[0] ? $_[0] : ''; return $X =~ /[0-9]/ ? 1 : 0; } ################################################## # v2021.3.13 # Returns 1 if all characters in CHARSET occur in STRING. # Usage: INTEGER = ContainsAllOf(STRING, CHARSET) # sub ContainsAllOf { return ContainsOnly($_[1], $_[0]); } sub DoesntContain { defined $_[0] or return -1; defined $_[1] or return 0; length($_[1]) or return 0; (my $L = length($_[0])) or return -1; while ($L--) { index($_[1], substr($_[0], $L, 1)) >= 0 or return 0; } return 1; } ################################################## # v2021.3.13 # Returns 1 if STRING is strictly made up of characters # found in KNOWNSET. Returns 0 if string contains # one or more characters not found in KNOWNSET. # Returns -1 if STRING is empty or undefined! # Usage: INTEGER = ContainsOnly(STRING, KNOWNSET) # sub ContainsOnly { defined $_[0] or return -1; defined $_[1] or return 0; length($_[1]) or return 0; (my $L = length($_[0])) or return -1; while ($L--) { index($_[1], substr($_[0], $L, 1)) >= 0 or return 0; } return 1; } ################################################## # This function returns the percentage of letters # (A-Z) vs all other characters in a string. # This will be an integer between 0 and 100. # Usage: INTEGER = PercentageOfText(STRING) # sub PercentageOfText { my $X = defined $_[0] ? $_[0] : ''; my $ORIGINAL_LENGTH = length($X); $X =~ tr|a-zA-Z||cd; return int(length($X) / ($ORIGINAL_LENGTH + 1) * 100); } ################################################## # Usage: CLS() - Clears the terminal window. # sub CLS { my $OS = uc($^O); if (index($OS, 'LINUX') >= 0) { print "\x1Bc\x1B[0m\x1B[3J\x1B[H\x1B[2J"; } elsif (index($OS, 'DOS') >= 0) { system('COMMAND.COM /C CLS'); } elsif (index($OS, 'WIN') >= 0) { system('CLS'); } else { print "\x1B[3J"; } # Mac } ################################################## # v2021.2.8 # This function takes any string and merges all # the digits in it into a number. It will ALWAYS # return an integer that is no less than MIN and # no greater than MAX. By default, the MIN # value is -999,999,999,999,999. And MAX # value is +999,999,999,999,999. # Usage: INTEGER = toInt(STRING, [MIN, [MAX]]) # sub toInt { defined $_[0] or return 0; my $MIN = defined $_[1] ? $_[1] : -999999999999999; my $MAX = defined $_[2] ? $_[2] : 999999999999999; my $N = shift; $N = cut($N, '.', 0x110); # Check negative values my $NEG = index($N, '-') < 0 ? 0 : 1; # Remove everything except numbers: $N =~ tr|\x00-\x2F\x3A-\xFF||d; # Remove leading zeros my $i = 0; for (; $i < length($N); $i++) { vec($N, $i, 8) < 49 or last; } if ($i > 0) { $N = substr($N, $i); } length($N) or return 0; # Allow no more than 15 digits. if (length($N) > 15) { $N = substr($N, 0, 15); } # Add sign if ($NEG) { $N = -$N; } # Check MIN value $N = ($N > $MIN) ? $N : $MIN; # Check MAX value return ($N < $MAX) ? $N : $MAX; } ################################################## # v2021.2.23 # This function splits a string along paragraphs. # Each paragraph is to be separated by one blank # line. If a paragraph contains more than two # lines, then it splits those lines as if # each were individual paragraphs. # Usage: ARRAY = SplitParagraphs(STRING) # sub SplitParagraphs { my @A; defined $_[0] or return @A; @A = split(/\n\s*\n/, $_[0]); my $REP = 0; for (my $i = 0; $i < @A; $i++) { if (CountStr($A[$i], "\n") > 1) { $A[$i] =~ tr|\n|\0|; $REP = 1; } } return ($REP) ? split(/\0/, join("\0", @A)) : @A; } ################################################## # This function converts a string that holds a # dollar amount to a number, and performs strict # syntax checking. If the number seems invalid, # then a blank string is returned instead. # Usage: NUMBER = toNumber(STRING) # sub toNumber { defined $_[0] or return ''; my $L = length($_[0]); my $prev; my $c = -1; my $N = ''; my $PAR = 0; my $SIGN = ''; my $DIGIT = 0; my $PLUS_COUNT = 0; my $MINUS_COUNT = 0; my $DIGIT_COUNT = 0; my $DECIMAL_POINT = 0; for (my $i = 0; $i < $L; $i++) { $prev = $c; $c = vec($_[0], $i, 8); $c > 32 or next; # Ignore space \r \n \t \f \0 etc. if ($c == 32 || $c == 9) # Space or tab { # We allow a space or tab anywhere in the number. } elsif ($c == 10 || $c == 13) # New line character { # A new line may mark the end of a number, so # we return whatever we may have up to this point. return $N; } elsif ($c > 47 && $c < 58) # Digit! { # If a number is given as (####.##), then digits # are not allowed after the closing parentheses. $PAR < 3 or return ''; $DIGIT_COUNT++; $DIGIT = 1; $N .= substr($_[0], $i, 1); } elsif ($c == 44) # Comma { # A number cannot start with a comma. $DIGIT_COUNT or return ''; # Commas are not allowed after the decimal point. if ($DECIMAL_POINT) { return ''; } # Two commas cannot be next to each other. $c != $prev or return ''; } elsif ($c == 46) # Decimal point { # Only one decimal point is allowed! ++$DECIMAL_POINT < 2 or return ''; # If we've had no digits yet, then insert a zero. $DIGIT_COUNT or $N .= '0'; $N .= '.'; } elsif ($c == 43) # Plus sign { # Only one plus sign is allowed. ++$PLUS_COUNT < 2 or return ''; # A plus sign may occur only at the beginning # of a number before the decimal point. $DECIMAL_POINT == 0 or return ''; $DIGIT_COUNT == 0 or return ''; # We can't have a plus and a minus sign in the same number. $MINUS_COUNT == 0 or return ''; } elsif ($c == 45) # Minus sign { # Only one minus sign is allowed. ++$MINUS_COUNT < 2 or return ''; # A minus sign may occur only at the beginning # of a number before the decimal point. $DECIMAL_POINT == 0 or return ''; $DIGIT_COUNT == 0 or return ''; # We can't have a plus and a minus sign in the same number. $PLUS_COUNT == 0 or return ''; $SIGN = '-'; } elsif ($c == 40) # ( { $PAR++ == 0 or return ''; } elsif ($c == 41) # ) { $PAR++ == 1 or return ''; $SIGN = '-'; } else { return ''; } } return $SIGN . $N; } ################################################## # v2021.2.13 # This function will sometimes return an integer, # and sometimes a string. Normally, it returns 1 # if string ends with a certain suffix, or 0 otherwise. # This function can also enforce the presence/absence # of the suffix depending on the third argument: # CMD=0x00 : Returns an integer that tells whether the string ends with suffix or not # CMD=0x01 : Returns a string making sure it does end with suffix # CMD=0x02 : Returns a string making sure it doesn't end with suffix # CMD=0x10 : Ignore case # # Usage: X = Suffix(STRING, SUFFIX, [CMD]) # sub Suffix { my $STRING = defined $_[0] ? $_[0] : ''; my $SUFFIX = defined $_[1] ? $_[1] : ''; my $CMD = defined $_[2] ? $_[2] : 0x10; my $LS = length($STRING); my $LX = length($SUFFIX); my $FOUND = ($LS < $LX) ? 0 : 1; if ($FOUND) { my $EXISTING_SUFFIX = substr($STRING, $LS - $LX); if ($CMD & 16) { $FOUND = (uc($EXISTING_SUFFIX) eq uc($SUFFIX)) ? 1 : 0; } else { $FOUND = ($EXISTING_SUFFIX eq $SUFFIX) ? 1 : 0; } } $CMD &= 3; my $WITHOUT_SUFFIX = ($FOUND) ? substr($STRING, 0, $LS - $LX) : ''; if ($CMD & 2) { return $FOUND ? $WITHOUT_SUFFIX : $STRING; } if ($CMD & 1) { return $FOUND ? $WITHOUT_SUFFIX . $SUFFIX : $STRING . $SUFFIX; } return $FOUND; } ################################################## # # This function converts a Schwab 1099 text file # to CSV format. # Usage: ParseSchwabFile(FILENAME) # sub ParseSchwabFile { # Create output file name by changing the extension to .csv my $FILENAME = defined $_[0] ? $_[0] : ''; my $P = rindex($FILENAME, '.'); return if ($P < 0); my $OUTPUT_FILE = substr($FILENAME, 0, $P) . '.csv'; # Read the entire text file. my $DATA = ReadFile($FILENAME); # When the PDF file is saved as text, it's usually saved # in UTF-8 format, so we need to get rid of the junk. # Here we remove everything except letters, numbers, # spaces, line breaks, and a few symbols. $DATA =~ tr|a-z A-Z 0-9 .,:;/*\n\t\'\"\-=+\\()[]{}<>$%&_!? ||cd; $DATA =~ s/[ \t]+/ /g; # Collapse whitespace $DATA =~ tr|a-z|A-Z|; # Convert to all upper case my $SCHWAB_BEGIN_MARKER = 'SHORT-TERM TRANSACTIONS FOR WHICH BASIS IS REPORTED TO THE IRS'; my $SCHWAB_END_MARKER = 'SCHWAB HAS PROVIDED REALIZED GAIN AND LOSS INFORMATION WHENEVER POSSIBLE'; $P = index($DATA, $SCHWAB_BEGIN_MARKER); $P >= 0 or next; my $E = index($DATA, $SCHWAB_END_MARKER); $E = ($E < 0) ? length($DATA) : $E; $DATA = substr($DATA, $P, $E - $P); my $X = 0; $P = 0; my $OUTPUT_LINE_COUNT = 0; my $EXPECT_SUBTOTAL_NEXT = 0; my $MONTH = 'JanFebMarAprMayJunJulAugSepOctNovDec'; my @GRP = SplitParagraphs($DATA); foreach (@GRP) { $CUSIP = $STRIKE = $SYMBOL = $NAME = $TYPE = $DATE_OPEN = $DATE_CLOSE = $EXP_DATE = $QTY = $UNIT = ''; $REGEX_CAPT = $SOLD = $COST = $WASH = $PROFIT = 0; # Any relevant line has at least 50 characters. if (length($_) > 50) { # To find transactions, we use the following criteria: # 1) String must not be a page separator # 2) String must not contain too much text # 3) String must contain at least one '$' sign or a digit $LINE = $_; CountStr($LINE, '-') < 10 or next; my $SHRUNK = $LINE; PercentageOfText($SHRUNK) < 50 or next; (Contains($LINE, '$') || ContainsDigits($LINE)) or next; if (Contains($LINE, 'SECURITY SUBTOTAL')) { next; } # Parse option transaction (Regex #1) # The following regex extracts values from paragraphs that look like this: # # 1CALL APPLE INC $460 EXP 08/2 SC 08/14/20 $ 849.33 $ 780.65 -- $ 68.68 $ 0.00 # AAPL 08/21/2020 460.00 C 08/17/20 -- # # (Remember: The $LINE variable holds a string which contains # a "\n" somewhere in the middle. So, it actually holds two # lines, not just one as the name of the variable might suggest.) # $LINE =~ s/([0-9]+)\s*(PUT|CALL) ([ 0-9A-Z\&\+]+)\$[0-9.]+ [EXP. 0-9\/ SC]*\s+(VARIOUS|([01][0-9]\/[0-3][0-9]\/2[01]))\s*\$\s+([\-0-9.,\(\)]+)\s+\$\s+([\-0-9.,\(\)]+)\s+[ \$\-0-9.,\(\)]+\$\s+([\-0-9.,\(\)]+)\s+\$\s+[0.]+[\r\n\t ]+([A-Z]+)\s+([01][0-9]\/[0-3][0-9]\/20[0123]+)\s+([0-9.]+)\s+[CP]+\s+([01][0-9]\/[0-3][0-9]\/[0123]+)\s+([0-9.,\-\(\)]+)//; if (defined $1) { $REGEX_CAPT = 1; $SYMBOL = $9; $NAME = $3; $QTY = $1; $TYPE = $2; $DATE_OPEN = $4; $DATE_CLOSE = $12; $EXP_DATE = $10; $STRIKE = $11; $SOLD = $6; $COST = $7; $WASH = $13; $PROFIT = $8; } else { # Parse option transaction (Regex #2) # The following regex extracts values from paragraphs that look like this: # # DIA 07/10/2020 260.00 P 1.00 07/06/20 07/06/20 $ 215.35 $ 219.67 -- $ (4.32) # $LINE =~ s/([A-Z]+) ([01][0-9]\/[0-3][0-9]\/2[0123]+) ([0-9.]+) ([PC]) ([0-9.]+) ([01][0-9]\/[0-3][0-9]\/20) ([01][0-9]\/[0-3][0-9]\/20)\s*\$ ([\-()0-9.,]+)\s*\$ ([\-()0-9.,]+)[\$ ]*([\-\(\)0-9.,]+) \$ ([\-\(\)0-9.,]+)//; if (defined $1) { $REGEX_CAPT = 2; $SYMBOL = $1; $NAME = $12; $QTY = $5; $TYPE = $4; $DATE_OPEN = $6; $DATE_CLOSE = $7; $EXP_DATE = $2; $STRIKE = $3; $SOLD = $8; $COST = $9; $WASH = $10; $PROFIT = $11; } else { # Parse stock transaction (Regex #3) # The following regex extracts values from paragraphs that look like this: # # 450 BARCLAYS IPTH SRS B S&P SHRT TRM S 07/07/20 $ 15,028.64 $ 14,728.80 -- $ 299.84 $ 0.00 # 06746P621 / VXX 07/07/20 -- # $LINE =~ s/([0-9.,]+)\s*([A-Z0-9 \-\&]+) S (VARIOUS|[01][0-9]\/[0-3][0-9]\/2[01]) \$ ([0-9\.,\-\(\)]+) \$ ([0-9.,\-\(\)]+)[ \$]+([0-9.,\-\(\)]+) \$ ([0-9.,\-\(\)]+) \$ [0-9.,\-\(\)]+\s*\n([0-9A-Z]{9,9}) \/ ([A-Z\/]+) ([01][0-9]\/[0-3][0-9]\/2[01]) ([0-9.,\-]*)//; if (defined $1) { $REGEX_CAPT = 3; $CUSIP = "CUS$8"; $SYMBOL = $9; $NAME = $2; $QTY = $1; $DATE_OPEN = $3; $DATE_CLOSE = $10; $SOLD = $4; $COST = $5; $WASH = $6; $PROFIT = $7; } else { # Parse stock transaction (Regex #4) # The following regex extracts values from paragraphs that look like this: # # FEDEX CORP 31428X106 14.00 08/24/20 08/25/20 $ 3,021.06 $ 2,984.47 -- $ 36.59 # $LINE =~ s/([A-Z0-9 \-\&\+]+)\s*([0-9A-Z]{9,9}) ([0-9,]+\.00) ([01][0-9]\/[0-3][0-9]\/2[01]) ([01][0-9]\/[0-3][0-9]\/2[01]) \$ ([0-9\.,\-\(\)]+)[ \$]+([0-9.,\-\(\)]+)[ \$]+([0-9.,\-\(\)]+) \$ ([0-9.,\-\(\)]+)[ \r\n\t]*([0-9A-Z\-\+\& ]*)//; if (defined $1) { $REGEX_CAPT = 4; $CUSIP = "CUS$2"; $NAME = "$1 $10"; $QTY = $3; $DATE_OPEN = $4; $DATE_CLOSE = $5; $SOLD = $6; $COST = $7; $WASH = $8; $PROFIT = $9; } else { # Parse stock transaction (Regex #5) # The following regex extracts values from paragraphs that look like this: # # 1,000 DIREXION DAILY XXXREVERSE S S 03/26/20 $ 17,239.72 $ 16,429.00 -- $ 810.72 $ 0.00 # 25460E182 03/26/20 -- # # 100 PROSHARES ULTRAPRO SHXXXREVE S 05/27/20 $ 1,306.98 $ 1,315.00 -- $ (8.02) $ 0.00 # 74347B268 05/28/20 -- # $LINE =~ s/([0-9.,]+)\s*([A-Z \-\&]+)S (VARIOUS|[01][0-9]\/[0-3][0-9]\/2[01]) \$ ([0-9\.,\-\(\)]+) \$ ([0-9.,\-\(\)]+)[ \$]+([0-9.,\-\(\)]+) \$ ([0-9.,\-\(\)]+) \$ [0-9.,\-\(\)]+[ \n]+([0-9][A-Z0-9]+) ([01][0-9]\/[0-3][0-9]\/2[01]) ([0-9.,\-]*)//; if (defined $1) { $REGEX_CAPT = 5; $CUSIP = "CUS$8"; $NAME = $2; $QTY = $1; $DATE_OPEN = $3; $DATE_CLOSE = $9; $SOLD = $4; $COST = $5; $WASH = $6; $PROFIT = $7; } } } } } length($CUSIP) > 3 or $TYPE = index($TYPE, 'C') >= 0 ? 'Call' : 'Put'; $UNIT = ($TYPE =~ /Put|Call/) ? 'contract' : 'share'; $QTY = toInt($QTY); $QTY = "$QTY $UNIT" . ($QTY > 1 ? 's' : ''); my @LIST = ( $REGEX_CAPT, $CUSIP, $SYMBOL, $NAME, $QTY, $TYPE, $DATE_OPEN, $DATE_CLOSE, $EXP_DATE, $STRIKE, $SOLD, $COST, $WASH, $PROFIT); $OUTPUT .= join("\t", @LIST) . "\t$LINE\n"; } } # Write output to file my $HEADER = 'Results
Here is your data ready to be copied into Excel:

\n"; $OUTPUT =~ tr^|^\t^; CreateFile($OUTPUT_FILE, $HEADER . $OUTPUT . $FOOTER); } ################################################## # # This function looks for a 9-byte long code which # is made up of letters and numbers surreounded by # space, tab or newline characters... # When it is found, it is removed from $LINE and # the value is returned. # sub GetCUSIP { my $START = -1; my $CUSIP = ''; my $F = 0; my $L = length($LINE); for (my $i = 0; $i <= $L; $i++) { my $c = vec($LINE, $i, 8); if ($c == 32 || $c == 9 || $c == 10 || $c == 13) { if ($START == $i - 9 && $F == 3) { $CUSIP = substr($LINE, $START, 9); $LINE = substr($LINE, 0, $START) . substr($LINE, $i); return $CUSIP; } $F = 0; $START = -1; } elsif ($c > 47 && $c < 58) { $F |= 1; $START >= 0 or $START = $i; } elsif ($c > 64 && $c < 91) { $F |= 2; $START >= 0 or $START = $i; } else { $F = 0; $START = -1; } } return ''; } ################################################## # v2019.11.25 # Splits a string along numbers and returns an # array of alternating numbers and text. # Usage: ARRAY = SplitNumbers(STRING) # # Example: SplitNumbers('6500 Main St, Miami, FL 33014') ---> # # ('6500', ' Main St, Miami, FL ', '33014') # sub SplitNumbers { defined $_[0] or return (); my ($PTR, $PREV, $LEN, $TYPE, @A) = (0, -1, length($_[0])); $LEN or return (); # Possible values for $PREV: -1=Uninitialized 0=NUMBER 1=TEXT for (my $i = 0; $i < $LEN; $i++) { $TYPE = vec($_[0], $i, 8); $TYPE = $TYPE < 48 || $TYPE > 57; # Is it a number? if ($PREV == !$TYPE) # Same as before? { push(@A, substr($_[0], $PTR, $i-$PTR)); $PTR = $i; } $PREV = $TYPE; } push(@A, substr($_[0], $PTR)); # Process last chunk return @A; } ################################################## # # This function extracts the quantity from the # list and returns it. # sub GetQty { my $QTY = 0; my $UNIT = ''; $LINE =~ s/([0-9]+)\s*(CALL|PUT) /OPT/; if (defined $1) { $QTY = $1; $UNIT = 'contract'; } else { $LINE =~ s/([0-9]+)\s*[A-Z]/STK/; if (defined $1) { $QTY = $1; $UNIT = 'share'; } else { $LINE =~ s/([A-Z]+) ([01][0-9]\/[0-3][0-9]\/202[01]) ([0-9.]+) [PC] ([0-9.]+) ([01][0-9]\/[0-3][0-9]\/20) ([01][0-9]\/[0-3][0-9]\/20) \$ ([\-()0-9.,]+) \$ ([\-()0-9.,]+) ([\-()0-9.,]+) \$ ([\-()0-9.,]+) /OPTION_CHECK/; if (defined $1) { $SYMBOL = $1; $EXP_DATE = $2; $STRIKE = $3; $QTY = $4; $DATE_OPEN = $5; $DATE_CLOSE = $6; $SOLD = $7; $COST = $8; $WASH = $9; $PROFIT = $10; $UNIT = 'contract'; } } } $QTY *= 1; $QTY or return '-'; return "$QTY $UNIT" . ($QTY > 1 ? 's' : ''); } ################################################## # # Grab dates from the line # $EXP_DATE = ''; # $DATE_OPEN = ''; # $DATE_CLOSE = ''; # print "\n@", substr($LINE, 0, 77); # my $DATE = ''; # my @N = GetNumbers($LINE); # $QTY = defined $N[0] ? $N[0] : 0; # $UNIT = "$QTY "; # if ($LINE =~ /CALL|PUT/) # { # $UNIT .= 'contract'; # $LINE =~ s/EXP (\d{1,2}\/\d{1,2}) //; # if (defined $1) { $EXP_DATE = $1; }# # # # Format Option Expiration Date # if (length($EXP_DATE)) # { # my $MO = cut($EXP_DATE, '/', 0x10); # $MO = substr($MONTH, $MO * 3, 3); # $EXP_DATE = $MO . (cut($EXP_DATE, '/', 1) * 1); # } # # # Format Option name # $P = index($LINE, '0.00'); # if ($P >= 0) # { # $LINE = substr($LINE, $P + 5); # $LINE = cut($LINE, '20', 0x111); # } # $DATE_OPEN = cut($LINE, ' ', 1) # } # else # { # $UNIT .= 'share'; # } # $QTY < 2 or $UNIT .= 's';# # # $LINE =~ s/S[C]? ([01][0-9]\/[0-3][0-9]\/2[0-3]) //; # if (defined $1) { $DATE_CLOSE = $1; } # $LINE =~ s/ ([01][0-9]\/[0-3][0-9]\/2[0-3])//; # if (defined $1) { $DATE_OPEN = $1; } # # } # sub DebugTransaction { print "\n\n**** TRANSACTION ****"; my @ITEMS = qw(NO TYPE CUSIP SYMBOL NAME QTY DESCRIPTION STRIKE EXP_DATE DATE_OPEN DATE_CLOSE SOLD COST WASH_SALE PROFIT DISCOUNT WITHHELD PATTERN RAW_DATA); foreach (@ITEMS) { print print "$_=|", eval('$' . $_), "|\n"; } PAUSE(); } ################################################## # This function saves a transaction. # Usage: SaveTransaction() # sub SaveTransaction { $NO++; if ($WASH_SALE eq '...') { $WASH_SALE = '0'; } $EXP_DATE = FormatDate($EXP_DATE); $DATE_OPEN = FormatDate($DATE_OPEN); $DATE_CLOSE = FormatDate($DATE_CLOSE); my $OPTION = $TYPE =~ /PUT|CALL/; if ($OPTION) { $DESCRIPTION = ShortDate($EXP_DATE) . " $SYMBOL $STRIKE " . TitleCase($TYPE); } else { $DESCRIPTION = TruncateStr($NAME, 24) . " $TYPE ($SYMBOL)"; } my $UNIT = ($OPTION) ? ($QTY > 1 ? 'contracts' : 'contract') : 'sh'; $DESCRIPTION = $QTY . ' ' . $UNIT . ' of ' . $DESCRIPTION; $DESCRIPTION =~ s/\s*([0-9,]*)[.0]*\s+/$1 /; $OUTPUT .= eval('"' . $HEADER . '"'); } ################################################## # # This function makes sure that any date that is # MM/DD/YY becomes MM/DD/YYYY by adding '19' or # '20' to it. Years 90-99 will be interpreted as # 1990...1999, while all other dates 2000...2089. # Usage: STRING = FormatDate(STRING) # sub FormatDate { my $DATE = defined $_[0] ? $_[0] : ''; my $L = length($DATE); $L == 8 or return $DATE; my $YR = substr($DATE, 6); $YR = ((vec($YR, 0, 8) > 56) ? '19' : '20') . $YR; return substr($DATE, 0, 6) . $YR; } ################################################## # v2021.3.2 # This function expects a list of words made up # of letters and/or numbers, etc. # It looks at each word and decides what it is. # If it looks like a date, then it writes 'D' # If it looks like a price, then it writes 'P' # If it's a 9-digit CUSIP code, then writes 'C' # If it's a forward slash, then it writes 'F' # If it looks like some English word or # something else, then it writes 'W' # The letter 'A' marks the beginning of each line. # The letter 'Z' marks the end of each line. # As each word is analyzed, a unique pattern # will emerge, which will be returned to the caller. # # Usage: STRING = CreatePattern(ARRAY_OF_STRINGS) # sub CreatePattern { my $PATTERN = 'A'; foreach my $S (@_) # Look at each string one by one { if ($S eq '/') { $PATTERN .= 'F'; } elsif ($S =~ /VARIOUS|Various|[01][0-9]\/[0-3][0-9]\/2[0123]+/) { $PATTERN .= 'D'; } elsif ($S =~ /[0-9A-Z]{9}/ && length($S) == 9) { $PATTERN .= 'C'; } elsif ($S =~ /[0-9,.\-\(\)]+/) { $PATTERN .= 'P'; } elsif ($S =~ /[a-zA-Z.:\/\&\+\-]/) { $PATTERN .= 'W'; } } $PATTERN .= 'Z'; return $PATTERN; } # Usage: INTEGER = CheckStringCharSet(STRING, ALLOWED, REQUIRED, [FORBIDDEN]) sub CheckStringCharSet { my $STRING = defined $_[0] ? $_[0] : ''; my $ALLOWED = defined $_[1] ? $_[1] : ''; my $REQUIRED = defined $_[2] ? $_[2] : ''; my $FORBIDDEN = defined $_[3] ? $_[3] : ''; my $L = length($STRING); for (my $i = 0; $i < $L; $i++) { my $c = substr($STRING, $i, 1); index($FORBIDDEN, $c) < 0 or return 0; if (length($REQUIRED)) { $REQUIRED =~ s/\Q$c//; } } return length($REQUIRED) ? 0 : 1; } ################################################## # v2019.11.23 # Counts how many times SUBSTR occurs in STRING and # returns the number. The search is case sensitive. # Usage: INTEGER = CountStr(STRING, SUBSTR) # sub CountStr { defined $_[0] or return 0; defined $_[1] or return 0; (my $LA = length($_[0])) or return 0; (my $LB = length($_[1])) or return 0; $LA >= $LB or return 0; my $COUNT = 0; for (my $i = 0; $i < $LA; $i += $LB) { $i = index($_[0], $_[1], $i); $i >= 0 or last; $COUNT++; } return $COUNT; } ################################################## # v2021.3.3 # This function cuts a text to a certain length # and adds '...' at the end if it was too long. # Usage: STRING = TruncateStr(STRING) # sub TruncateStr { my $S = defined $_[0] ? Trim($_[0]) : ''; my $MAXLEN = defined $_[1] ? $_[1] : 20; my $SUFFIX = '...'; $MAXLEN > 3 or return $SUFFIX; return ($MAXLEN > length($S)) ? $S : substr($S, 0, $MAXLEN - 3) . $SUFFIX; } ################################################## # v2021.3.3 # This function expects a date in MM/DD/YYYY format # and returns a date in MmmDD format. Example: # ShortDate("03/26/2020") --> "Mar26" # Usage: STRING = ShortDate(STRING) # sub ShortDate { my $DATE = defined $_[0] ? $_[0] : ''; $DATE =~ /([01]*[0-9])\/([0-3]*[0-9])[\/0-9]*/; return (defined $1 && defined $2) ? substr($MONTHS, ($1 * 3), 3) . ($2 * 1) : ''; } ################################################## # v2021.3.3 # Capitalizes the first letter of every word. # Usage: ASCII_STRING = TitleCase(ASCII_STRING) # sub TitleCase { my $S = defined $_[0] ? lc($_[0]) : ''; my $L = length($S); my $LETTERCOUNT = 0; my $WORD_SEPARATORS = " .,:;!?&/\()[]{}<>|-+=\t\n\r\xFF"; for (my $i = 0; $i < $L; $i++) { if (index($WORD_SEPARATORS, substr($S, $i, 1)) >= 0) { $LETTERCOUNT = 0; } elsif ($LETTERCOUNT++ == 0) { my $c = vec($S, $i, 8); if ($c > 96 && $c < 123) # Convert letter to upper case { vec($S, $i, 8) = $c & 223; } } } return $S; } ################################################## # # This function returns the appropriate Excel column # when given a heading label. It does this by looking # for the header label in the $HEADER global variable. # Example: FindColumn('COST') --> 'D' # Usage: STRING = FindColumn(LABEL) # sub FindColumn { my $LABEL = defined $_[0] ? $_[0] : ''; length($LABEL) or return ''; my $P = index(uc($HEADER), uc($LABEL)); return ($P < 0) ? '' : chr(65 + CountStr(substr($HEADER, 0, $P), '|')); } ################################################## # # This function reads the contents of a folder # and returns an array that contains file names # whose extensions match the ones specified in # the second argument. The second argument should # be a string containing extensions separated # by a single space. # Example: ReadDIR('/work/text', 'TXT TEXT HTM') # # Usage: ARRAY = ReadDIR(PATH, [EXTENSIONS]) # sub ReadDIR { my @FILELIST; my $PATH = defined $_[0] ? $_[0] : ''; my $FILTER = defined $_[1] ? $_[1] : ''; my $F = length($FILTER); length($PATH) or return @FILELIST; $PATH .= '/'; # Make sure that path ends with '/' $PATH =~ tr|\\|/|; # Convert path to Linux format $PATH =~ tr|/||s; # Remove double '//' if ($F) { # Format filter $FILTER =~ tr|a-z A-Z 0-9 _ ||cd; # Remove bad characters $FILTER =~ tr|a-z|A-Z|; # Convert to uppercase $FILTER = " $FILTER "; # Add spaces for easy search } my $FULLNAME; local *DIR; opendir(DIR, $PATH) or return @FILELIST; while ((my $NAME = readdir(DIR))) { if (length($NAME) < 3) { $NAME ne '.' && $NAME ne '..' or next; } my $FULLNAME = "$PATH$NAME"; # Ignore subdirectories; just deal with files. if (-f($FULLNAME)) { # Do we return all files or just the ones # that have a certain extension? if ($F) { # Grab the file extension my $P = rindex($NAME, '.'); $P++ > 0 or next; my $EXT = uc(substr($NAME, $P)); # Skip file if its extension doesn't match index($FILTER, " $EXT ") >= 0 or next; } push(@FILELIST, $NAME); # Add file to the list } } closedir(DIR); return @FILELIST; } ################################################## # # This function removes all whitespace from before # and after text and returns a new string. This # function removes every character whose ASCII # value is less than 33. This includes tab, space, # null, vertical tab, esc, new lines, etc.. # # Usage: STRING = Trim(STRING) # sub Trim { my $S = defined $_[0] ? $_[0] : ''; my $L = length($S) or return ''; my $START = 0; my $LAST = 0; while ($L--) { if (vec($S, $L, 8) > 32) { $START = $L; $LAST or $LAST = $L + 1; } } return substr($S, $START, $LAST - $START); } ################################################## # v2019.6.15 # This function is just like Trim() except it # removes characters specified in SUBSTR. # Usage: STRING = TrimChar(STRING, SUBSTR) # sub TrimChar { defined $_[0] or return ''; my $L = length($_[0]); $L or return ''; defined $_[1] or return $_[0]; length($_[1]) or return $_[0]; my $START = 0; my $LAST = 0; while ($L--) { if (index($_[1], substr($_[0], $L, 1)) < 0) { $START = $L; $LAST or $LAST = $L + 1; } } return substr($_[0], $START, $LAST - $START); } ################################################## # Just like the RTRIM$() function in BASIC language, # this function removes whitespace from the right # side of a string. # Usage: STRING = RTRIM(STRING) sub RTRIM { defined $_[0] or return ''; my $L = length($_[0]); while (vec($_[0], --$L, 8) < 33) {} return substr($_[0], 0, $L + 1); } ################################################## # v2021.2.21 # Returns the current working directory. # Usage: STRING = GetCurrentWorkingDirectory() # sub GetCurrentWorkingDirectory { return FormatPath($^O =~ /WIN|DOS/i ? `CD` : (exists($ENV{PWD}) ? $ENV{PWD} : `pwd`)); } ################################################## # This function returns the current date and time # in the following format: Mmm D YYYY HH:MM:SSmm # Usage: STRING = TimeStamp() # sub TimeStamp { my @D = localtime(); my $M = substr($MONTHS, $D[4] * 3, 3); my $A = $D[2] > 11 ? 'pm' : 'am'; $D[2] or $D[2] = 12; $D[2] < 13 or $D[2] -= 12; return sprintf("%s %d %d %d:%.02d:%.02d%s", $M, $D[3], 1900+$D[5], $D[2], $D[1], $D[0], $A); } ################################################## sub PAUSE { # Wait for user to press Enter... $| = 1; print "\n\nPRESS TO EXIT..."; $a = ; } ################################################## sub PrintDottedLine { print "\n\n", '.' x 66, "\n\n"; } ################################################## # Returns the index+1 of the first match. # Usage: INTEGER = IndexOfEither(STRING, SUBSTR, [SUBSTR...]) sub Either { defined $_[0] or return 0; defined $_[1] or return -1; (my $END = length($_[0])) && length($_[1]) or return -1; my $i = defined $_[2] ? $_[2] : 0; my $MATCH = defined $_[3] ? 0 : 1; $i < $END or return -1; my $DIR = 1; if ($i < 0) { $i += $END; $END = $DIR = -1; } $i >= 0 or return -1; for (; $i != $END; $i += $DIR) {} } ################################################## # This function reads an entire Schwab 1099 # text file and converts it to CSV format. # Usage: ParseSchwabFile_fromRH(FILE_NAME) # sub ParseSchwabFile_fromRH { # Create output file name by changing the extension to .csv my $FILENAME = defined $_[0] ? $_[0] : ''; my $P = rindex($FILENAME, '.'); return if ($P < 0); my $OUTPUT_FILE = substr($FILENAME, 0, $P) . '.csv'; # Create the header for the output file $NO = 0; $OUTPUT = $TEMPLATE; $NAME = $SYMBOL = $CUSIP = $TYPE = $QTY = $DATE_OPEN = $DATE_CLOSE = $COST = $SOLD = $WASH_SALE = $PROFIT = $PATTERN = ''; # Read the entire text file. my $DATA = ReadFile($FILENAME); $DATA =~ tr|\x00-\x09\x0B-\x1F\x7F-\xFF||d; # Convert UTF-8 to ASCII $DATA =~ s/[ \t]+/ /g; # Collapse whitespace $DATA =~ tr|$||d; # Remove all dollar signs my @LINES = split(/\n/, $DATA); # Split lines # Read line by line. @GRAND_TOTAL = (0, 0, 0, 0); for (my $i = 0; $i < @LINES; $i++) { $RAW_DATA = Trim($LINES[$i]); # Trim line @WORDS = split(/[\x00-\x20]+/, $RAW_DATA); # Split into words # In this program, we are looking for complex patterns in a # text file, but instead of using mile-long regex patterns # which can be a nightmare to read, we work backwards: # We build a simple letter-based pattern from each line, # and then we extract values when we find a familiar pattern. $PATTERN = CreatePattern(@WORDS); # Categorize each word #$OUTPUT .= "\n" . $PATTERN . '|||||' . $RAW_DATA . "\n"; # SaveTransaction(); #DebugTransaction(); #if ($PATTERN =~ /P/ && CountStr($PATTERN, 'W') < 10) #{ print substr($RAW_DATA, 0, 50), "$PATTERN\n"; } my $W = ''; if ($PATTERN =~ /AZ/) # BLANK LINE { ClearEverything(); } elsif ($PATTERN =~ /AWWWWPPPPPZ/) # GRAND TOTAL { if ($RAW_DATA =~ /TOTAL REALIZED GAIN OR \(LOSS\)/) { $PROFIT = pop(@WORDS); $WASH_SALE = pop(@WORDS); $COST = pop(@WORDS); $SOLD = pop(@WORDS); @GRAND_TOTAL = ($SOLD, $COST, $WASH_SALE, $PROFIT); } } elsif ($PATTERN =~ /A[WPC]+DPPPPPZ/) # TRANSACTION OR SPLIT FIRST LINE { ClearEverything(); # The quantity and the words PUT/CALL are sometimes one word # and sometimes two words, so we temporarily join the first two # words together and extract whatever we need. Then we # put back whatever was left: my $TEMP = shift(@WORDS) . ' ' . shift(@WORDS); $TEMP =~ s/([0-9.,]+)//; if (defined $1) { $QTY = $1; } $TEMP =~ s/()(CALL|PUT)//; if (defined $2) { $TYPE = $2; } unshift(@WORDS, $TEMP); $WITHHELD = pop(@WORDS); # Federal income tax withheld $PROFIT = pop(@WORDS); # Gain or loss $DISCOUNT = pop(@WORDS); # Market discount $COST = pop(@WORDS); # Basis $SOLD = pop(@WORDS); # Proceeds $DATE_OPEN = pop(@WORDS); # Date acquired $NAME = join(' ', @WORDS); # Name of security $NAME = cut($NAME, 'XXX', 0x110); # Remove the word 'XXXREVERSE_SPLITXXX' if (length($CUSIP) < 4) { $CUSIP = GetSymbol_or_CUSIP($SYMBOL); } if (length($SYMBOL) == 0) { $SYMBOL = GetSymbol_or_CUSIP($CUSIP); } # Pattern: APWWWWWWWWDPPPPPZ # Example: 450 BARCLAYS IPTH SRS B S&P SHRT TRM S 07/07/20 $ 15028.64 $ 14728.80 -- $ 299.84 $ 0.00 # # This pattern is always followed by: # # Pattern: ACFWDPZ # Example: 06746P621 / VXX 07/07/20 -- } elsif ($PATTERN =~ /ACFWDPZ/) { $CUSIP = shift(@WORDS); shift(@WORDS); # IGNORE / ($SYMBOL, $DATE_CLOSE, $WASH_SALE) = @WORDS; SaveTransaction(); ClearEverything(); } elsif ($PATTERN =~ /ACDPZ/) # STOCK SPLIT SECOND LINE { $TYPE = 'SPLIT'; ($CUSIP, $DATE_CLOSE, $WASH_SALE) = @WORDS; $SYMBOL = GetSymbol_or_CUSIP($CUSIP); SaveTransaction(); } elsif ($PATTERN =~ /AP[WPDF]+PPPPPZ/) # OPTION TRADE FIRST LINE { ClearEverything(); # The quantity and the words PUT/CALL are sometimes one word # and sometimes two words, so we temporarily join the first two # words together and extract whatever we need. Then we # put back whatever was left: my $TEMP = shift(@WORDS) . shift(@WORDS); $TEMP =~ s/([0-9]+)//; if (defined $1) { $QTY = $1; } $TEMP =~ s/()(CALL|PUT)//; if (defined $2) { $TYPE = $2; } unshift(@WORDS, $TEMP); if (length($TYPE) && length($QTY)) { $WITHHELD = pop(@WORDS); $PROFIT = pop(@WORDS); $DISCOUNT = pop(@WORDS); $COST = pop(@WORDS); $SOLD = pop(@WORDS); $DATE_OPEN = pop(@WORDS); $NAME = join(' ', @WORDS); #print ">>>\n$NAME"; } } elsif ($PATTERN =~ /AWDPWDPZ/) # OPTION TRADE SECOND LINE { my $TEMP; ($SYMBOL, $EXP_DATE, $STRIKE, $TEMP, $DATE_CLOSE, $WASH_SALE) = @WORDS; SaveTransaction(); ClearEverything(); } # AZ # 7 PUT SPDR S&P 500 286 EXP 05/ SC 05/06/20 415.33APWWWPPWPWDPPPPPZ # SPY 05/06/2020 286.00 P 05/06/20 93.34 AWDPWDPZ # AZ # 13 PUT SPDR S&P 500 286 EXP 0 SC 05/06/20 771.31 APWWWPPWPWDPPPPPZ # SPY 05/06/2020 286.00 P 05/06/20 --AWDPWDPZ # AZ # Security Subtotal 1,186.64 1,536.68 -- (256.70)AWWPPPPPZ # 93.34APZ if ($W ne 'W') { $WASH_SALE = 0; } $TYPE =~ tr|a-z|A-Z|; $TYPE =~ tr|a-zA-Z||cd; } # Create footer my $LINE_COUNT = CountStr($OUTPUT, "\n"); # Count number of lines # Add calculated totals $COST = $SOLD = $WASH_SALE = $PROFIT = "=SUM(X2:X$LINE_COUNT)"; my $COLUMN; $COLUMN = FindColumn('COST'); $COST =~ s/X/$COLUMN/g; $COLUMN = FindColumn('SOLD'); $SOLD =~ s/X/$COLUMN/g; $COLUMN = FindColumn('PROFIT'); $PROFIT =~ s/X/$COLUMN/g; $COLUMN = FindColumn('WASH_SALE'); $WASH_SALE =~ s/X/$COLUMN/g; $DESCRIPTION = 'CALCULATED TOTAL >'; $NAME = $QTY = $TYPE = $SYMBOL = $CUSIP = $PATTERN = $DATE_OPEN = $DATE_CLOSE = $STRIKE = $EXP_DATE = ''; $OUTPUT .= eval('"' . $HEADER . '"'); # Add grand total (last line) ($SOLD, $COST, $WASH_SALE, $PROFIT) = @GRAND_TOTAL; $DESCRIPTION = 'GRAND TOTAL >'; $OUTPUT .= eval('"' . $HEADER . '"'); # Save CSV file. $OUTPUT =~ tr/,//d; # Remove all commas that would mess up the columns $OUTPUT =~ tr/|/,/; # Change '|' back to commas CreateFile($OUTPUT_FILE, $OUTPUT); print " Created: $OUTPUT_FILE"; } ################################################## sub ClearEverything { $TYPE = $CUSIP = $SYMBOL = $NAME = $QTY = $DESCRIPTION = $STRIKE = $EXP_DATE = $DATE_OPEN = $DATE_CLOSE = $SOLD = $COST = $WASH_SALE = $PROFIT = $DISCOUNT = $WITHHELD = ''; } ################################################## # # This function receives a 9-letter CUSIP code # and returns the stock symbol. If the symbol # is not found, then it will be left blank. # This function can also work backwards: when # given a stock symbol, it returns the CUSIP. # Usage: STRING = GetSymbol_or_CUSIP(STRING) # sub GetSymbol_or_CUSIP { my $S = defined $_[0] ? uc($_[0]) : ''; length($S) or return ''; # If we can find the CUSIP or symbol within the program, GREAT! if ($SYMBOL_LIST =~ / $S\=CUS([0-9A-Z]+) /) { return $1; } if ($SYMBOL_LIST =~ / ([A-Z]+)\=CUS$S /) { return $1; } # But if we cannot find it, then we need to search the big database. if (length($CUSIP_DB) == 0) { $CUSIP_DB = ReadFile($CUSIP_FILE); } # This should be about 200KB if ($CUSIP_DB =~ /\n([A-Z]+),CUS$S,/) { return $1; } if ($CUSIP_DB =~ /\n$S,CUS([0-9A-Z]+),/) { return $1; } return ''; } ##################################################