#!/usr/bin/perl

#####################################################################
#
#  DELDUP v1.0                                Last Update: 2018.10.16
#
#  This Perl program will try to find and remove duplicate JPG files
#  within a specific directory. It will scan subdirectories too.
#  This script does not require any Perl modules.
#
#  Written by Zsolt N Perry in Oct. 2018, Pensacola, Fla.
#  For questions, comments, feature requests, or bug reports,
#  write to zsnp@juno.com.
#  This Perl script was tested with TinyPerl 5.8 under Windows XP.
#  This file was downloaded from http://www.wzsn.net/perl
#
#               THIS ENTIRE SOURCE CODE IS FREEWARE.
#  If you want to incorporate it or parts of it into your program,
#  just copy and paste whatever you need! There is no need to ask
#  for permission. This software is distributed "AS IS." There is
#  no warranty of any kind. The author will not be held liable for
#  any loss resulting from the use or misuse of this software.
#
#####################################################################

use 5.004;
use strict;
use warnings;
$| = 1;

my $P;
my $DIR = 'C:\\PHOTOS\\2009';

my @FIND = qw(.JPG| .JPE| .JPEG|);

my $RECURSIVE = 1;    # Scan subdirectories too?

my $DEMO_MODE = 0;    # When you change this to 1, the program will not
                      # make any changes to your hard drive even if you
                      # confirm to delete a file. It won't delete it,
                      # even though it says it's done.

my $READ_LIMIT = 70000;  # Compare only this many bytes to boost performance
#
# Why is it enough to compare the first 70000 bytes?
#
# JPG files have a header that often contains the name of the
# camera, the precise date & time the photo was taken and
# sometimes even the GPS location. So, in most cases it's
# enough to compare the JPG headers and ignore the content.
# This is not a foolproof way to compare JPG files, because
# some JPG images do not have such detailed header info.
# Just in case, we're going to compare the first 70000 bytes
# instead of just comparing the headers.
#
# Some JPG files also contain a small version of the entire
# picture, so if the small images are the same, then we can
# assume that two photos are the same. Again, some JPG images
# do not have a small version. So, we can't rely on that.
#
# Lastly, if you try to take 100 shots of the clear blue sky,
# you're going to end up with 100 photos whose content is
# different. If you zoom into each photo, you will see that
# every pixel is slightly different blue. There won't be two
# identical photos even if you can hold the camera still.
# So, it is safe to assume that if there is any difference in
# two JPG images, then that difference is going to show up in
# the details all over the picture, not just near the end
# of the file.
#
# So, if we compare the first 70000 bytes, and they are
# exactly the same, then we have nearly ruled out
# the possibility that the photos are same.
#
# My disclaimer is: This program will not say,
# "These two files are the same." Instead, it will say,
# "These two files SEEM to be the same."
#

my $TOTAL = 0;        # Total number of JPEG files found
my $DUP_FOUND = 0;    # Number of duplicates found
my $DUP_DELETED = 0;  # Number of duplicates deleted
my $MORE_SPACE = 0;   # Freed up space as a result of deleting duplicates
my @ALLFILES;         # This is going to be a gigantic array
my $Y = '';           # User's input

##################################################
########### THE PROGRAM BEGINS HERE #############
                                             ###
About();                                    ###
CheckDIR($DIR);                            ###
FindDup();                                ###
ShowResults();                             ###
exit;                                       ###
                                             ###
############ THE PROGRAM ENDS HERE ##############
##################################################


################################################################# CheckDIR
#
# This function reads the contents of a folder and calls
# CatchFile() for each file that was found.
# Returns 0 on success.
# Returns a negative value if an error occurred.
#
#   Usage: CheckDIR(PATH)
#
sub CheckDIR
{
  @_ or return;
  my $PATH = shift;
  defined $PATH or return;
  length($PATH) or return;
  $PATH =~ tr|/|\\|;   # Change / to \
  print "Reading directory: $PATH\n";
  $PATH = AddSuffix($PATH, '\\');
  my $FULLNAME;
  opendir(my $DIR, $PATH) or return;
  while (my $NAME = readdir $DIR)
  {
    $FULLNAME = "$PATH$NAME";
    if (-d($FULLNAME))
    {
      # Check into subdirectory if RECURSIVE == 1
      # Skip directory if its name starts with "."
      if ($RECURSIVE)
      { CheckDIR($FULLNAME) unless (vec($NAME, 0, 8) == 46); }
      next;
    }
    CatchFile($FULLNAME);
  }
  closedir $DIR;
}
################################################################# CatchFile
#
# This function is automatically called by CheckDIR() every time
# a file is found. This function gets the full name of the file.
#
#   Usage: CatchFile(FULLNAME) <-- Called by CheckDIR()
#
sub CatchFile
{
  my $F = shift;

  # Check file extension to see if it matches
  my $MATCH = 0;
  my $U = uc($F) . '|';
  foreach my $M (@FIND)
  { index($U, $M) < 0 or $MATCH++; }
  $MATCH or return;   # Oops, Not a match!

  # Save file name...
  $ALLFILES[$TOTAL] = sprintf('%.10d%s', (-s($F)), $F);
  $TOTAL++;
}
################################################################# FoundDup
#
# This function is called each time two files have the same length.
# The only argument it receives is the file's number in @ALLFILES.
#
#   Usage: FoundDup(FILEINDEX)  <-- Called by FindDup()
#
sub FoundDup
{
  my $i = shift;
  my $F1 = substr($ALLFILES[$i-1], 26);
  my $F2 = substr($ALLFILES[$i], 26);

  my $MATCH = CompareFiles($F1, $F2, $READ_LIMIT);

  if ($MATCH == 0)
  {
    $DUP_FOUND++;
    if ($Y ne 'D')
    {
      print '>' x 80;
      print "\nThese two photos seem to be the same:\nA)  $F1\nB)  $F2\n\n";
      print "What would you like to do now?\n\nFILE MENU:\n\n  1. Press Enter to skip these files.\n  2. To delete file A, type 'A' and press Enter.\n  3. To delete file B, type 'B' and press Enter.\n  4. To automatically delete the first duplicate every time, type 'D'\n  5. To exit, type 'X' and press Enter.\n\n";
      $Y = <STDIN>;
      $Y = uc(substr(Trim($Y), 0, 1));   # Take the first char
    }
    if ($Y eq 'A') { DeleteFile($F1); }
    if ($Y eq 'B') { DeleteFile($F2); }
    if ($Y eq 'D') { DeleteFile($F1); }
    if ($Y eq 'X') { ShowResults(); exit; }
  }
}
################################################################# FindDup
#
# This function prints the list of JPG files gathered and
# looks for pairs that have the same size.
#
#   Usage: FindDup()
#
sub FindDup
{
  print "Sorting all files...\n";
  # First, we sort the array by file size.
  @ALLFILES = sort(@ALLFILES);

  print "Searching for duplicates...\n";

  my $F;       # Full name
  my $S;       # Current file's size
  my $P = -1;  # Previous file's size
  my $i;
  my $COUNT = 0;

  # Keep files that have the same length.
  for ($i = 0; $i < @ALLFILES; $i++)
  {
    $F = substr($ALLFILES[$i], 10);  # Get full name
    $S = -s($F);                     # Get file size
    if ($S == $P)
    { $COUNT = 0; }
    else
    { if (++$COUNT > 1) { $ALLFILES[$i-1] = ''; } }
    $P = $S;
  }
  if ($COUNT) { $#ALLFILES = $i-2; }

  CompactArray();

  print "Comparing similar files...\n";

  # Calculate hash for files if there are more than
  # 2 files with the same size
  my $HASH;
  for (my $i = 0; $i < @ALLFILES; $i++)
  {
    $F = substr($ALLFILES[$i], 10);  # Get full name
    $S = -s($F);                     # Get file size
    if ($P == $S) { $COUNT++; }
    $HASH = GetFileHash($F, $READ_LIMIT);
    $ALLFILES[$i] = $HASH . $ALLFILES[$i];
    $P = $S;
  }

  @ALLFILES = sort(@ALLFILES);  # Sort by hash

  my @D;       # File's last modified date
  my $P = '';
  $COUNT = 0;
  for (my $i = 0; $i < @ALLFILES; $i++)
  {
    $HASH = substr($ALLFILES[$i], 0, 16); # Get hash
    $F = substr($ALLFILES[$i], 26);       # Get full name
    @D = localtime((stat($F))[9]);        # Get file date
    $S = -s($F);                          # Get file size
    print -r($F) ? 'r' : '-';
    print -w($F) ? 'w' : '-';
    print -x($F) ? 'x' : '-';
    print -f($F) ? 'f' : '-';
    printf(' %.04d-%.02d-%.02d %.02d:%.02d:%.02d', (1900+$D[5]), (1+$D[4]), $D[3], $D[2], $D[1], $D[0]);
    printf(" %.10d $HASH %s\n", $S, $F);
    if ($P eq $HASH) { FoundDup($i); }
    $P = $HASH;
  }
}
################################################################# CompareFiles
#
# This function compares two files, and returns 0 if the
# first N bytes are the same. Returns non-zero otherwise.
# If N is omitted, then it will read and compare the entire file.
# And if both files have equal size and content, then returns 0.
#
#   Usage: INTEGER = CompareFiles(FILE1, FILE2, [N])
#
sub CompareFiles
{
  @_ > 1 or return 1;
  my $F1 = shift;
  my $F2 = shift;
  (defined $F1) && (defined $F2) or return 2;
  (length($F1) && length($F2)) or return 3;
  my $N = @_ ? shift : 9999999999999;

  # Rule out cases when comparison is not necessary
  (-f $F1) or return 4;  # File1 is not a plain file ?
  (-f $F2) or return 5;  # File2 is not a plain file ?
  my $S1 = -s($F1);
  my $S2 = -s($F2);
  if ($S1 == 0 && $S2 == 0) { return 0; }  # Both files zero size?
  if ($S1 == 0 || $S2 == 0) { return 6; }  # One file zero size?
  (-r $F1) or return 7;  # Cannot read File1 ?
  (-r $F2) or return 8;  # Cannot read File2 ?

  # Figure out how many bytes to read
  if ($N > $S1) { $N = $S1 + 1; }
  if ($N > $S2) { $N = $S2 + 1; }
  my $BUFFSIZE = 65536;
  if ($BUFFSIZE > $N) { $BUFFSIZE = $N; }

  # Open files for reading
  open 0, "< $F1" or return 9;
  unless (open(1, "< $F2")) { close(0); return 10; }
  binmode 0;
  binmode 1;

  # Compare every byte
  my $BUFF1; my $R1;
  my $BUFF2; my $R2;
  my $FPOS = 0;
  my $DIFF = 0;
  while ($FPOS < $N)
  {
    $R1 = read(0, $BUFF1, $BUFFSIZE, $FPOS);
    $R2 = read(1, $BUFF2, $BUFFSIZE, $FPOS);
    if ($R1 != $R2) { $DIFF = 11; last; }
    if ($BUFF1 ne $BUFF2) { $DIFF = 12; last; }
    if ($R1 < $BUFFSIZE || $R2 < $BUFFSIZE) { last; }
    if ($FPOS + $BUFFSIZE > $N) { $BUFFSIZE = $N - $FPOS; if ($BUFFSIZE < 1) { last; } }
    $FPOS += $BUFFSIZE;
  }
  close 0;
  close 1;
  return $DIFF;
}
################################################################# GetFileHash
#
# This function generates a hash from the first N bytes of
# a binary file. Always returns a 16-byte hex string.
#
#   Usage: STRING = GetFileHash(FILENAME, [N])
#
sub GetFileHash
{
  my $Z = '0' x 16;         # Hash to return
  @_ or return $Z;
  my $F = shift;
  my $L = 9999999999999;    # Default limit
  my $N = @_ ? shift : $L;  # Get limit
  defined $F or return $Z;
  length($F) or return $Z;  # No filename?
  defined $N or $N = $L;
  length($N) or $N = $L;    # No known limit?
  $N = int($N);
  $N > 0 or return $Z;      # Read zero bytes?
  (-f $F) or return $Z;     # File is not a plain file?
  my $S = -s($F);
  $S > 0 or return $Z;      # File has zero size?
  (-r $F) or return $Z;     # File cannot be read?

  # Figure out how many bytes to read
  my $BUFFSIZE = 65536;
  if ($BUFFSIZE > $N) { $BUFFSIZE = $N; }

  # Set up RND generator
  my $A = 3419.4091596672;
  my $B = 1.19928832973 + $S;
  my $C = 1209.1773850021;
  my $SEED = 2.022173177;

  # Open file for reading
  open 0, "< $F" or return $Z;
  binmode 0;

  # Read every byte
  my $FPOS = 0;
  my $BUFF;
  my $R;
  my $c;
  my $i;
  my $j;
  while ($FPOS < $N)
  {
    $R = read(0, $BUFF, $BUFFSIZE, $FPOS);
    for ($i = 0; $i < $R; $i++)
    {
      $c = vec($BUFF, $i, 8);
      $SEED = ($SEED * $A + $c * 77.0192783563 + $B) % $C;
    }
    if ($R < $BUFFSIZE) { last; }
    if ($FPOS + $BUFFSIZE > $N) { $BUFFSIZE = $N - $FPOS; if ($BUFFSIZE < 1) { last; } }
    $FPOS += $BUFFSIZE;
  }
  close 0;
  for ($j = 0; $j < 16; $j++)
  { vec($Z, $j, 8) = HexCode(($SEED/($j+1))&255); }
  print "$Z $F\n";
  return $Z;
}
################################################################# CompareFile
#
# This function removes one or more files.
# Returns the number of files removed.
#
#   Usage: COUNT = DeleteFile(FILE_LIST)
#
sub DeleteFile
{
  my $S;
  my $N = 0;
  my $DELETED;

  foreach my $F (@_)
  {
    $S = -s($F);
    print "\nDELETING: $F  ($S)  ";
    $DELETED = ($DEMO_MODE == 0) ? unlink($F) : 1;
    if ($DELETED)
    {
      $MORE_SPACE += $S;
      print "DONE.\n\n";
      $DUP_DELETED++;
      $N++;
    }
    else
    { print "FAILED.\n\n"; }
  }
  return $N;
}
################################################################# ShowResults
#
# This function displays the number of files scanned,
# the number of duplicates found and erased, etc.
#
#   Usage: ShowResults()
#
sub ShowResults
{
  print "\n\nDirectory of $DIR";
  print "\nJPEG files         = ". AddCommas($TOTAL);
  print "\nDuplicates found   = ". AddCommas($DUP_FOUND);
  print "\nDuplicates removed = ". AddCommas($DUP_DELETED);
  print "\nSpace freed up     = ". AddCommas($MORE_SPACE) ." bytes\n\n";
}
################################################################# Trim
#
# This function removes newline characters and whitespace and
# all sorts of special characters before and after STRING,
# and it returns a new string.
#
#   Usage: STRING = Trim(STRING)
#
sub Trim
{
  @_ or return '';
  my $T = shift;
  defined $T or return '';
  length($T) or return '';
  my $s = -1;  # start ptr
  my $e = 0;   # end ptr
  for (my $i = 0; $i < length($T); $i++)
  {
    if (vec($T, $i, 8) > 32)
    {
      if ($s < 0) { $s = $i; }
      $e = $i;
    }
  }
  return substr($T, $s, $e - $s + 1);
}
################################################################# AddSuffix
#
# This function makes sure that STRING ends with SUFFIX.
#
#   Usage: STRING = AddSuffix(STRING, SUFFIX)
#
#   Example: AddSuffix('Abcdef', 'def')  ---> 'Abcdef'
#            AddSuffix('Abcdef', 'DEF')  ---> 'AbcdefDEF'
#
sub AddSuffix
{
  @_ or return ''; my $S = shift; defined $S or return ''; my $LS = length($S); $LS or return '';
  @_ or return $S; my $X = shift; defined $X or return $S; my $LX = length($X); $LX or return $S;
  if ($LS >= $LX) { if (substr($S, $LS - $LX, $LX) eq $X) { return $S; } }
  return $S . $X;
}
################################################################# AddCommas
#
# This function inserts commas after every 3rd digit in a big number.
# I copied this function straight from the PerlMonks website:
# https://www.perlmonks.org/?node_id=110137
#
#   Usage: NUMBER_WITH_COMMAS = AddCommas(BIG_NUMBER)
#
#   Example: AddCommas(12345678.2233) --> 12,345,678.2233
#
sub AddCommas
{
  local $_ = shift;
  1 while s/^(-?\d+)(\d{3})/$1,$2/;
  return $_;
}
################################################################# CompactArray
#
# This function removes empty elements from array @ALLFILES.
# Returns the length of @ALLFILES.
#
#   Usage: LENGTH = CompactArray()
#
sub CompactArray
{
  my $j = 0;
  for (my $i = 0; $i < @ALLFILES; $i++)
  {
    length($ALLFILES[$i]) or next;
    $ALLFILES[$j++] = $ALLFILES[$i];
  }
  $#ALLFILES = $j - 1;
  return @ALLFILES;
}
################################################################# HexCode
#
# This function converts a number (0-255) to hex format and
# returns the corresponding ASCII codes as an integer.
#
#   Usage: INTEGER = HexCode(INTEGER)
#
#   Example: HexCode(195) => "C3" => 67 51 => 0x4333
#
sub HexCode
{
  @_ or return 0x3030;
  my $N = $_[0];
  defined $N or return 0x3030;
  length($N) or return 0x3030;
  $N = int($N);
  $N > 0 or return 0x3030;
  $N < 255 or return 0x4646;
  my @XX = (48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 65, 66, 67, 68, 69, 70);
  return ($XX[($N&240)>>4]<<8)+$XX[$N&15];
}
############################################################### ReadFile
#
# This function reads an entire file in raw mode
# and returns the contents in one string.
#
# Usage: STRING = ReadFile(FILE_NAME, [BYTES_TO_READ])
#
sub ReadFile
{
  @_ or return '';
  my $NAME = shift;
  defined $NAME or return '';
  length($NAME) or return '';
  my $L = @_ ? shift : 9999999;
  (-f $NAME) or return '';  # plain file
  (-s $NAME) or return '';  # file size
  my $T;
  open 0, "<$NAME" or return '';
  binmode 0;
  read 0, $T, $L, 0;
  close 0;
  defined $T or return '';
  return $T;
}
############################################################### CENTER
#
# This function will print some text in the center of the
# screen, assuming we are in 80x25 text mode.
#
#   Usage: CENTER(ROWS_TO_SKIP, TEXT)
#
sub CENTER
{
  @_ or return;
  print "\n"x(shift);
  @_ or return;
  my $T = substr(shift, 0, 80);
  my $PADDING = int((80 - length($T)) / 2);
  print ' ' x $PADDING, $T;
  if (length($T) != 80) { print "\n"; }
}
################################################################# About
#
# This function prints the description of this program.
#
# Usage: About()
#
sub About
{
  my $X = ReadFile($0, 512);
  my @Z = split("\n", $X, 24);
  shift @Z;
  CENTER(2, $0);
  foreach my $L (@Z)
  {
    length($L) > 1 or next;
    index($L, '###') < 0 or last;
    CENTER(0, substr($L, 2));
  }
  print "\n\tDIRECTORY  = $DIR\n\n";
  print "\tREAD_LIMIT = $READ_LIMIT\n\n";
  print "\tRECURSIVE  = ", YN($RECURSIVE), "\n\n";
  print "\tDEMO_MODE  = ", YN($DEMO_MODE), "\n\n";
  CENTER(3, '< PRESS ENTER TO START >');
  <STDIN>;
}
################################################################# YN
#
# This function returns either YES or NO depending on the
# input value.
#
#   Usage: STRING = YN(NUMBER)
#
sub YN
{
  return ($_[0] == 0) ? 'NO' : 'YES';
}
#################################################################