#!/usr/bin/perl
#  
#  Copyright (c) 2002 Steve Slaven, All Rights Reserved.
#  
#  This program is free software; you can redistribute it and/or
#  modify it under the terms of the GNU General Public License as
#  published by the Free Software Foundation; either version 2 of
#  the License, or (at your option) any later version.
#  
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#  GNU General Public License for more details.
#  
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 59 Temple Place, Suite 330, Boston,
#  MA 02111-1307 USA
#  

use strict;

use Getopt::Std;
use Digest::MD5 qw( md5_hex );

# Finds potential dup content files based on partial file MD5 hash
#
# %hashes maps each key returned by getmdinfo() to an arrayref of
# { file => $path, size => $bytes } records; recursive_comp() fills it in.
# %o holds the command line options (getmdinfo() reads $o{ s }).
my %hashes;
my %o;

# Defaults
$o{ s } = 1024 * 100; # 100K

# An unrecognised switch forces the help text instead of carrying on with a
# command line that may not mean what the user intended (this tool deletes).
$o{ h } = 1 unless getopts( 'hs:dD', \%o );

$o{ h } = 1 unless scalar( @ARGV ); # Force help on no dirs
die( qq{
partialdupes v1.0
Find duplicate content based on partial file MD5 hashes
Author: Steve Slaven - http://hoopajoo.net

Usage: $0 [-hdD] [-s size] dir [dir ...]

	-h	This help text
	-d	Delete all but largest file
	-D	Always 'y' to delete
	-s	Size of data chunk used to compute MD5 sum (default 100K)

} ) if $o{ h };

# A zero or non-numeric chunk size would hash zero bytes of every file, put
# all files in one group and, with -d, offer nearly everything for deletion.
die( "Invalid -s value '$o{s}': expected a positive whole number of bytes\n" )
	unless $o{ s } =~ /^[0-9]+\z/ && $o{ s } > 0;

# Unbuffered output so the "Delete?" prompt is visible before we block on
# STDIN and progress lines stay in order with anything sent to STDERR.
$| = 1;

# Do it yo!
for my $dir ( @ARGV ) {
	# Drop trailing slashes, but leave a bare "/" alone so the root
	# directory can still be scanned.
	$dir =~ s{/+\z}{} unless $dir =~ m{^/+\z};

	if( -d $dir ) {
		recursive_comp( $dir );
	} else {
		warn( "Skipping '$dir': not a directory\n" );
	}
}

# Identifies the file behind a path as "device:inode", or undef when stat()
# fails or the platform reports no inode number.
my $file_id = sub {
	my( $file ) = @_;
	my @st = stat( $file );
	return( ( @st && $st[ 1 ] ) ? join( ':', @st[ 0, 1 ] ) : undef );
};

# Dump results
my $ans;
for my $key ( sort( keys( %hashes ) ) ) {
	# Largest first: the first entry of a group is the one that is kept,
	# which is what the -d help text promises.
	my @group = sort { $b -> { size } <=> $a -> { size } } @{ $hashes{ $key } };
	next unless scalar( @group ) > 1;

	print "$key:\n";

	my( $kept, $kept_id );
	for my $info ( @group ) {
		my $file = $info -> { file };

		printf( '  %-60s %9.1fK', $file, $info -> { size } / 1024 );
		print "\n";

		# The first (largest) file of the group is never deleted
		unless( defined( $kept ) ) {
			$kept    = $file;
			$kept_id = $file_id -> ( $file );
			next;
		}

		next unless $o{ d };

		# The same file can be listed twice (overlapping directory
		# arguments, symlinked paths, hard links). Removing it would
		# take the kept copy with it, so leave it alone.
		my $id = $file_id -> ( $file );
		if( defined( $id ) && defined( $kept_id ) && $id eq $kept_id ) {
			print "  (same file as '$kept', not deleting)\n";
			next;
		}

		print "Delete? (y/N) ";
		if( $o{ D } ) {
			print "y\n";
			$ans = 'y';
		} else {
			$ans = <STDIN>;
			$ans = '' unless defined( $ans ); # EOF means "no"
			chomp( $ans );
		}

		if( lc( $ans ) eq 'y' ) {
			print "Deleting '$file'\n";
			unlink( $file ) || warn( "Couldn't delete '$file': $!\n" );
		}
	}
}

# recursive_comp( $path )
#
# Walks $path and records every regular file it finds in %hashes: the key is
# whatever getmdinfo() returns for the file, the value is an arrayref of
# { file => $path, size => $bytes } records, so files with the same key end
# up in the same list.
#
# Directories are descended into, skipping every entry whose name starts
# with a dot (which covers '.' and '..'). Paths that do not exist, and paths
# that are neither a regular file nor a directory, are ignored.
#
# Dies if a directory cannot be opened.
sub recursive_comp {
	my $path = shift;

	return unless -e $path;

	if( -f $path ) {
		my $key = getmdinfo( $path );

		# -s yields a false value for empty files; store a plain 0
		push( @{ $hashes{ $key } },
		      { file => $path, size => ( -s $path ) || 0 } );
		return;
	}

	# FIFOs, sockets, device nodes and the like have no content worth
	# comparing, and trying to opendir() them would abort the whole scan.
	return unless -d $path;

	print "Descending: $path\n";
	opendir( my $dh, $path ) || die( "Couldn't open dir '$path': $!" );
	my @entries = grep { !/^\./ } readdir( $dh );

	# Close before recursing so a deep tree does not hold one open
	# directory handle per nesting level.
	closedir( $dh );

	for my $entry ( @entries ) {
		recursive_comp( "$path/$entry" );
	}

	return;
}

# getmdinfo( $path [, $size ] )
#
# Returns the grouping key for the file at $path: the hex MD5 digest of its
# first $size bytes, a dash, and the number of bytes that were actually read
# (less than $size for short files). $size is optional and defaults to the
# value of the -s option, $o{ s }.
#
# Dies if the file cannot be opened or read.
sub getmdinfo {
	my( $path, $size ) = @_;
	$size = $o{ s } unless defined( $size );

	# Three-argument open: the path is taken literally, so names with
	# leading/trailing whitespace or characters such as '>' and '|' are
	# opened as files instead of being interpreted as a mode or command.
	open( my $in, '<', $path ) || die( "Couldn't open '$path' for MD5 info: $!" );

	# Hash the raw bytes, with no line-ending or encoding translation
	binmode( $in );

	# A failed read must not be mistaken for an empty file, otherwise the
	# file would be grouped with (and possibly deleted as) an empty one.
	my $data = '';
	defined( read( $in, $data, $size ) )
		|| die( "Couldn't read '$path' for MD5 info: $!" );
	close( $in );

	# Return the MD5 data
	return( md5_hex( $data ) . '-' . length( $data ) );
}