#!/usr/bin/perl
#
# Copyright (c) 2002 Steve Slaven, All Rights Reserved.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of
# the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston,
# MA 02111-1307 USA
#

use strict;
use warnings;
use Getopt::Std;
use Digest::MD5 qw( md5_hex );

# Finds potential dup content files based on partial file MD5 hash
#
# Every regular file found under the directories named on the command
# line is keyed by the MD5 digest of its first -s bytes (plus the number
# of bytes actually hashed).  Files that share a key are reported as a
# group of potential duplicates; with -d every file of a group except
# the largest one is offered for deletion.

# Key:   "<md5 hex>-<number of bytes hashed>" as returned by getmdinfo()
# Value: arrayref of { file => path, size => size in bytes }
my %hashes;

# Command line options as filled in by getopts()
my %o;

# Defaults
$o{ s } = 1024 * 100; # 100K

# Unbuffered STDOUT so the delete prompt is visible before we block
# waiting for the answer on STDIN
$| = 1;

# An unknown option forces the help text instead of being ignored: a
# typo must not silently start a (possibly destructive) run
getopts( 'hs:dD', \%o ) or $o{ h } = 1;
$o{ h } = 1 unless scalar( @ARGV ); # Force help on no dirs

die( qq{
partialdupes v1.0

Find duplicate content based on partial file MD5 hashes

Author: Steve Slaven - http://hoopajoo.net

Usage: $0 [-hdD] [-s size]
  -h  This help text
  -d  Delete all but largest file
  -D  Always 'y' to delete
  -s  Size of data chunk used to compute MD5 sum (default 100K)

} ) if $o{ h };

# A chunk size of 0 (or a non-numeric one) would hash zero bytes of
# every file, making ALL files look identical - fatal together with -d
die( "Invalid chunk size '$o{ s }': -s needs a positive number of bytes\n" )
    unless $o{ s } =~ /^[1-9][0-9]*\z/;

# Do it yo!
for my $arg ( @ARGV ) {
    # Drop one trailing slash so reported paths don't contain '//'
    ( my $dir = $arg ) =~ s#/$##;
    recursive_comp( $dir ) if -d $dir;
}

# Dump results
for my $key ( sort keys( %hashes ) ) {
    my @group = @{ $hashes{ $key } };
    next unless scalar( @group ) > 1;

    print "$key:\n";

    # Largest file first: it is the one that is kept, as promised by the
    # help text for -d.  The file name is only a tie-break that keeps
    # the order stable between runs.
    my @by_size = sort {
        $b -> { size } <=> $a -> { size }
            or $a -> { file } cmp $b -> { file }
    } @group;

    # Set to 1 in loop to know we're after 1 and with -d
    # will delete all other files (with prompt)
    my $after_one = 0;

    for my $info ( @by_size ) {
        printf( ' %-60s %9.1fK', $info -> { file }, $info -> { size } / 1024 );
        print "\n";

        if( $after_one && $o{ d } ) {
            my $ans;

            print "Delete? (y/N) ";
            if( $o{ D } ) {
                print "y\n";
                $ans = 'y';
            }else{
                $ans = <STDIN>;
                # End of input (e.g. closed STDIN) counts as "no"
                $ans = '' unless defined( $ans );
                chomp( $ans );
            }

            if( lc( $ans ) eq 'y' ) {
                print "Deleting '$info->{file}'\n";
                unlink( $info -> { file } )
                    or warn( "Couldn't delete '$info->{file}': $!\n" );
            }
        }

        $after_one = 1;
    }
}

# recursive_comp( $path )
#
# Records $path in %hashes when it is a regular file, or descends into
# it when it is a directory.  Directory entries whose name starts with
# a dot (including '.' and '..') are skipped.  Paths that do not exist
# and special files (FIFOs, sockets, devices) are silently ignored.
#
# Dies when a directory cannot be opened or (via getmdinfo) when a file
# cannot be read.  Returns nothing useful.
#
# NOTE: the -f / -d tests follow symbolic links and no cycle detection
# is done here.
sub recursive_comp {
    my $path = shift;

    return unless -e $path;

    if ( -f $path ) {
        # Let the magic happen...
        # we take the MD5 digest of the first N bytes and file the
        # fileinfo hashref in %hashes, in an arrayref stored under
        # that digest key
        my $key = getmdinfo( $path );

        # -s yields a false value for empty files, store a real 0
        push( @{ $hashes{ $key } },
              { file => $path, size => ( -s $path ) || 0 } );
    } elsif ( -d $path ) {
        print "Descending: $path\n";

        # Lexical handle: every recursion level gets its own
        opendir( my $dh, $path )
            || die( "Couldn't open dir '$path': $!" );

        # Explicit defined() so an entry named '0' can't end the loop
        while ( defined( my $f = readdir( $dh ) ) ) {
            if ( $f !~ /^\./ ) {
                recursive_comp( "$path/$f" );
            }
        }

        closedir( $dh );
    }

    return;
}

# getmdinfo( $path )
#
# Returns the duplicate-detection key of the file $path: the hex MD5
# digest of its first $o{ s } bytes, a dash, and the number of bytes
# that were actually read (less than $o{ s } for short files).
#
# Dies when the file cannot be opened or read.
sub getmdinfo {
    my $path = shift;
    my $data = '';

    # 3-arg open: the file name is taken literally, so names such as
    # 'foo |' or '>bar' can neither run commands nor clobber files
    open( my $in, '<', $path )
        || die( "Couldn't open '$path' for MD5 info: $!" );

    # Hash the raw bytes, without any newline or encoding translation
    binmode( $in );

    defined( read( $in, $data, $o{ s } ) )
        || die( "Couldn't read '$path' for MD5 info: $!" );
    close( $in );

    # Return the MD5 data
    return( md5_hex( $data ) . '-' . length( $data ) );
}