#!/usr/bin/perl -w # $Id$ # $Source$ # Generates a list of files in cur. dir which are actually the same # Compares size and MD5 checksum # Handy for cleaning up pictures like this: # list_same -sd # wishlist: # - entering a list of file to check on the commandline # - option to ignore 0 length files # - recurse through directories # - hardlink same files use strict; use Digest::MD5; use Getopt::Std; my %opts = (); my @dirs; &cmdline; my @filelist = &getdir; if ( $opts{m} ) { &mp3_comp(@filelist); } else { my %sizes = &get_sizes(@filelist); &quick_comp(%sizes); } ################################# # functions sub cmdline { getopts('dehlms', \%opts); if ( $opts{h} ) { &help; } map { if ( -d $_ ) { s/\/$//; push @dirs, $_; } else { die "$_ not a directory" } } @ARGV; if ( scalar @dirs == 0 ) { push @dirs, "."; } if ( $opts{l} && ( $opts{e} || $opts{d} || $opts{s} ) ) { print "-l doens't combine with -d, -e or -s\n\n"; &help; } # map { print "$_\n"; } @dirs; } sub getdir { my ( $file, @filelist ); foreach ( @dirs ) { opendir(DIR, $_) or die "can't open $_: $!"; while (defined($file = readdir(DIR))) { if ( -f "$_/$file" ) { push @filelist, "$_/$file"; } } closedir(DIR); } return @filelist; } sub get_sizes(@) { my @filelist = @_; my ( @stat, %sizes ); foreach (@filelist) { @stat = stat $_; push @{$sizes{$stat[7]}}, $_; } return %sizes; } sub quick_comp(%) { my %sizes = @_; my ( $size, %md5s ); foreach $size (keys %sizes) { if (@{$sizes{$size}} > 1) { %md5s = (); foreach (@{$sizes{$size}}) { push @{$md5s{&calc_md5($_)}}, $_; } &output_doubles(%md5s); } } } # same md5 calculation i use in mv_wrap sub calc_md5($) { my $file = shift; my ( $digest, $md5, $FILE ); $md5 = Digest::MD5->new; open $FILE, "<$file" or die "couldn't open file: $!\n"; seek($FILE, 0, 0); $md5->reset; $md5->addfile($FILE); $digest = $md5->hexdigest; close($FILE); return $digest; } sub mp3_comp(@) { my @filelist = @_; my %md5s = (); foreach ( @filelist ) { push @{$md5s{&calc_mp3md5($_)}}, $_; } &output_doubles(%md5s); } sub calc_mp3md5($) { my $file = shift; my ($fh, $off, $size); my $buf = 4096*1024; open($fh, "<$file") or die "Couldn't open file: $!\n"; binmode($fh); seek $fh, 0, 2; # go to end of file my $eof = tell $fh; if ($size = &has_v1_tag($fh)) { $eof -= $size; } $off = 0; if ($size = &has_v2_tag($fh)) { $off = $size; } my $md5 = Digest::MD5->new; while ($off < $eof) { seek $fh, $off, 0; if ($buf > ($eof - $off)) { $buf = $eof - $off; } read $fh, my($bytes), $buf; $md5->add($bytes); $off += $buf; } close $fh; return $md5->hexdigest; } sub has_v1_tag { my $fh = shift; seek $fh, -128, 2; if (<$fh> =~ /^TAG/) { return 128; } return 0; } sub has_v2_tag { my $fh = shift; my ($head, @bytes); my $tagsize = 10; seek $fh, 0, 0; read $fh, $head, 3; if ($head =~ /^ID3/) { read $fh, $head, 3; # skip 3 bytes read $fh, $head, 4; @bytes = reverse unpack 'C4', $head; foreach (0 .. 3) { $tagsize += $bytes[$_] * 128 ** $_; } return $tagsize; } return 0; } sub output_doubles(@) { my %md5s = @_; my ( $key, @files, $i ); my $start = $opts{s} ? 1 : 0; foreach $key (keys %md5s) { if ( @{$md5s{$key}} > 1 ) { @files = sort dirsort @{$md5s{$key}}; for $i ($start .. $#files) { chomp $files[$i]; if ($opts{d}) { print "Deleting: $files[$i]\n"; unlink($files[$i]) or die "Couldn't unlink $files[$i]"; } else { &output("$files[$i]\n"); } } unless($opts{s}) { print "\n"; } } } } # escape output if necessary sub output($) { my $string = shift; if ($opts{e}) { $string = &escape($string); } print "$string"; } sub escape($) { my $string = shift; $string =~ s/([ \&\;\`\'\\\"\|\*\?\~\<\>\^\(\)\[\]\{\}\$\010\013\020\011])/\\${1}/g; return $string; } sub unescape($) { my $string = shift; $string =~ s/\\([ \&\;\`\'\\\"\|\*\?\~\<\>\^\(\)\[\]\{\}\$\010\013\020\011])/${1}/g; return $string; } sub dirsort { my $firstdir = &escape($dirs[0]); my $debuglevel = 0; print "$a $b\n" if $debuglevel == 1; if ( $a =~ /^$firstdir\/[^\/]*$/o && $b =~ /^$firstdir\/[^\/]*$/o ) { print "A $a $b\n" if $debuglevel == 1; return &wardsort; } elsif ( ( $a =~ /^$firstdir\/[^\/]*$/o ) && ! ( $b =~ /^$firstdir\/[^\/]*$/o ) ) { print "B $a $b\n" if $debuglevel == 1; return -1; } elsif ( ! ( $a =~ /^$firstdir\/[^\/]*$/o ) && ( $b =~ /^$firstdir\/[^\/]*$/o ) ) { print "C $a $b\n" if $debuglevel == 1; return 1; } else { print "D $a $b\n" if $debuglevel == 1; return &wardsort; } } sub wardsort { my @a = split(/([0-9]+)/,$a); my @b = split(/([0-9]+)/,$b); foreach my $x (@a) { my $y = shift @b; my $r = ($x =~ /^[0-9]+$/ && $y =~ /^[0-9]+$/) ? ($x <=> $y) : ($x cmp $y); $r && return $r; } return -1 if (@b); 0; } sub help { my $name = $0; $name =~ s/.*\///; print <