#!/usr/bin/perl -w

# $Id: findup-md5,v 1.4 1999/11/13 11:04:29 root Exp root $

# Copyright (c) 1999 Mark Summerfield. All Rights Reserved.
# May be used/distributed under the GPL.

# findup-md5: walk the directory trees named on the command line, compute the
# MD5 digest of every non-empty regular file, and print each group of files
# that share a digest.  Symbolic links are ignored and a file reachable through
# several hard links is only counted once.

require 5.006 ;    # three-argument open and lexical filehandles

use strict ;
use integer ;

use Cwd ;
use File::Find ;
use Getopt::Long ;
use MD5 ;

use vars qw( $VERSION ) ;
$VERSION = '1.01' ;

my %Opt ;              # Command line options, filled in by getoptions().
my %FilesByDigest ;    # MD5 hex digest => [ full path, ... ]
my %SeenInode ;        # "dev:ino" => 1 for every file already digested.
my $PrevDir  = '' ;    # Last directory reported in verbose mode.
my $Found    = 0 ;     # Number of duplicate groups printed.
my $Digester = MD5->new() ;
my $startdir = cwd ;

# File::Find changes directory while it walks; go back before bailing out.
$SIG{INT} = sub { chdir $startdir ; die "\nUser aborted.\n" } ;

getoptions() ;

print STDERR "\n-- Reading folders --\n\n" if $Opt{'verbose'} ;

find( \&wanted, @ARGV ) ;

print STDERR "\n-- Duplicate files --\n\n" if $Opt{'verbose'} ;

# Sort
my @Sorted ;
{
    my %Sortby ;    # first (sorted) filename of a group => digest of the group

    while( my( $digest, $names ) = each %FilesByDigest ) {
        # A digest with a single file is not a duplicate.
        next if @{$names} < 2 ;

        # Sort the files that match the same MD5, case-insensitively.
        @{$names} = sort { lc( $a ) cmp lc( $b ) } @{$names} ;
        $Sortby{ $names->[ 0 ] } = $digest ;
    }

    # Sort the MD5's in order of the first file under each MD5
    @Sorted = map  { $Sortby{$_} }
              sort { lc( $a ) cmp lc( $b ) } keys %Sortby ;
}

# Show
foreach my $digest ( @Sorted ) {
    $Found++ ;
    print "MD5=$digest\n" ;
    foreach my $file ( @{ $FilesByDigest{$digest} } ) {
        print "\t$file\n" ;
    }
}

if( $Found ) {
    print STDERR "\n-- Done --\n\n" if $Opt{'verbose'} ;
}
else {
    print STDERR "-- No duplicates found --\n\n" ;
}


# File::Find callback, invoked once per directory entry with the entry's
# basename in $_ and its full path in $File::Find::name.
#
# Records the MD5 digest of every non-empty regular file in %FilesByDigest.
# Symbolic links, zero length files and anything that is not a plain file are
# skipped; an inode that has already been digested (a hard link, or a path
# given twice) is skipped as well.  Files that cannot be opened produce a
# warning and are otherwise ignored.
sub wanted {

    return if -l $_ ;    # Ignore symbolic links

    if( -d $_ ) {
        if( $Opt{'verbose'} ) {
            # \Q...\E: the directory name is data, not a pattern, and may
            # contain regex metacharacters.
            print STDERR "Reading $File::Find::dir\n"
                if $PrevDir !~ /^\Q$File::Find::dir\E/ ;
            $PrevDir = $File::Find::dir ;
        }
        return ;
    }

    # Ignore non-files and zero length files
    return unless -f $_ and -s _ ;

    # Identical dev:ino means the very same file, so there is no need to read
    # it again and it must not be reported as a duplicate of itself.
    my( $dev, $ino ) = ( stat( _ ) )[ 0, 1 ] ;
    my $inode = "$dev:$ino" ;
    return if $SeenInode{$inode} ;

    # Three-argument open: the name comes from the filesystem and must never
    # be interpreted as an open mode or a pipe.
    my $fh ;
    unless( open $fh, '<', $_ ) {
        warn "Failed to read $File::Find::name: $!\n" ;
        return ;
    }
    binmode $fh ;

    $Digester->reset() ;
    $Digester->addfile( $fh ) ;
    close $fh ;

    $SeenInode{$inode} = 1 ;
    push @{ $FilesByDigest{ $Digester->hexdigest() } }, $File::Find::name ;

    return ;
}


# Parse the command line into %Opt.  Shows the help text (and exits) when
# help is requested or when no path is left in @ARGV; dies on a bad option.
sub getoptions {

    # Defaults.
    $Opt{'verbose'} = 0 ;

    Getopt::Long::Configure( 'no_ignore_case' ) ;

    # The first name of each spec is the key stored in %Opt, so 'help' has
    # to come before its 'h' alias for $Opt{'help'} to be set.
    GetOptions( \%Opt,
        'help|h',
        'verbose|v',
        ) or die "findup-md5 -h for help\n" ;

    help() if $Opt{'help'} or not @ARGV ;

    return ;
}


# Print the usage message to STDOUT and exit.
sub help {

    print <<__EOT__ ;
findup-md5 v $VERSION. Copyright (c) Mark Summerfield 1999.
All rights reserved. May be used/distributed under the GPL.

usage: findup-md5 [options] <path> [<path> ...]

Finds exact duplicates using MD5 algorithm, irrespective of filename or date.
Can work across filesystems.

-h --help       Show this help and exit
-v --verbose    Verbose to STDERR [$Opt{'verbose'}]
__EOT__

    exit ;
}

__END__

=head1 NAME

findup-md5 - Finds exact duplicates using MD5 algorithm, irrespective of
filename or date.

=head1 SYNOPSIS

    findup-md5 -v /path1 > /tmp/duplicates.txt

=head1 README

Finds exact duplicates using MD5 algorithm, irrespective of filename or date.

=head1 PREREQUISITES

C<strict>
C<integer>
C<Cwd>
C<File::Find>
C<Getopt::Long>
C<MD5>

=head1 COREQUISITES

=head1 COPYRIGHT

Copyright (c) Mark Summerfield 1999. All Rights Reserved.
May be used/distributed under the GPL.

Email the author with 'findup-md5' in the subject line.

=head1 OSNAMES

Linux

=head1 SCRIPT CATEGORIES

UNIX/System_administration

=cut