diff --git a/list_same/list_same b/list_same/list_same index 8931356..2c98034 100755 --- a/list_same/list_same +++ b/list_same/list_same @@ -6,89 +6,172 @@ # Generates a list of files in cur. dir which are actually the same # Compares size and MD5 checksum # Handy for cleaning up pictures like this: -# list_same|cut -f 1 -d " "|xargs rm +# list_same -es | xargs rm # wishlist: # - entering a list of file to check on the commandline -# - include escapes in output (spaces, brackets, etc) (optional) # - option to ignore 0 length files - -# Changelog: -# 2001-08-30: -# _much_ saner now. only calculate md5s if sizes of -# files are the same -# nearly complete rewrite -# 2001-08-29: -# use MD5; instead of a shell call to md5 -# some cleaning of code - +use strict; use Digest::MD5; use Getopt::Std; -getopts('es'); +my ( $opt_e, $opt_h, $opt_s ); +my %opts = (); +getopts('ehms', \%opts); -opendir(DIR, ".") or die "can't open . $!"; -while (defined($file = readdir(DIR))) { - push @filelist, $file; -} -closedir(DIR); +if ( $opts{h} ) { &help; } -foreach (@filelist) { - if ( -f $_ ) { - push @{$sizes{&get_size($_)}}, $_; - } +my @filelist = &getdir; +if ( $opts{m} ) { + &mp3_comp(@filelist); +} else { + my %sizes = &get_sizes(@filelist); + &quick_comp(%sizes); } -$start = $opt_s ? 1 : 0; - -foreach $size (keys %sizes) { - if (@{$sizes{$size}} > 1) { - %md5s = (); - foreach (@{$sizes{$size}}) { - push @{$md5s{&calc_md5($_)}}, $_; - } - foreach $key (keys %md5s) { - if ( @{$md5s{$key}} > 1 ) { - @files = sort @{$md5s{$key}}; - for $i ($start .. $#files) { - chomp $files[$i]; - &output("$files[$i]\n"); - } - unless($opt_s) { print "\n"; } - } - } - } -} ################################# # functions -sub get_size { - my $file = shift; - my @stat; - @stat = stat $file; - return $stat[7]; +sub getdir { + my ( $file, @filelist ); + opendir(DIR, ".") or die "can't open . $!"; + while (defined($file = readdir(DIR))) { + if ( -f $file ) { + push @filelist, $file; + } + } + closedir(DIR); + return @filelist; +} + +sub get_sizes(@) { + my @filelist = @_; + my ( @stat, %sizes ); + + foreach (@filelist) { + @stat = stat $_; + push @{$sizes{$stat[7]}}, $_; + } + return %sizes; +} + +sub quick_comp(%) { + my %sizes = @_; + my ( $size, %md5s ); + + foreach $size (keys %sizes) { + if (@{$sizes{$size}} > 1) { + %md5s = (); + foreach (@{$sizes{$size}}) { + push @{$md5s{&calc_md5($_)}}, $_; + } + &output_doubles(%md5s); + } + } } # same md5 calculation i use in mv_wrap sub calc_md5($) { - my ($file, $digest); + my ($file, $digest, $md5, $FILE); $file = shift; $md5 = Digest::MD5->new; - open FILE, "<$file" or die "couldn't open file: $!\n"; - seek(FILE, 0, 0); + open $FILE, "<$file" or die "couldn't open file: $!\n"; + seek($FILE, 0, 0); $md5->reset; - $md5->addfile(FILE); + $md5->addfile($FILE); $digest = $md5->hexdigest; - close(FILE); + close($FILE); return $digest; } +sub mp3_comp(@) { + my @filelist = @_; + my %md5s = (); + foreach ( @filelist ) { + push @{$md5s{&calc_mp3md5($_)}}, $_; + } + &output_doubles(%md5s); +} + +sub calc_mp3md5($) { + my $file = shift; + my ($fh, $off, $size); + my $buf = 4096*1024; + + open($fh, "<$file") or die "Couldn't open file: $!\n"; + binmode($fh); + seek $fh, 0, 2; # go to end of file + my $eof = tell $fh; + if ($size = &has_v1_tag($fh)) { + $eof -= $size; + } + + $off = 0; + if ($size = &has_v2_tag($fh)) { + $off = $size; + } + my $md5 = Digest::MD5->new; + while ($off < $eof) { + seek $fh, $off, 0; + if ($buf > ($eof - $off)) { $buf = $eof - $off; } + read $fh, my($bytes), $buf; + $md5->add($bytes); + $off += $buf; + } + close $fh; + return $md5->hexdigest; +} + +sub has_v1_tag { + my $fh = shift; + seek $fh, -128, 2; + if (<$fh> =~ /^TAG/) { + return 128; + } + return 0; +} + +sub has_v2_tag { + my $fh = shift; + my ($head, @bytes); + my $tagsize = 10; + seek $fh, 0, 0; + read $fh, $head, 3; + if ($head =~ /^ID3/) { + read $fh, $head, 3; # skip 3 bytes + read $fh, $head, 4; + @bytes = reverse unpack 'C4', $head; + foreach (0 .. 3) { + $tagsize += $bytes[$_] * 128 ** $_; + } + return $tagsize; + } + return 0; +} + +sub output_doubles(@) { + my %md5s = @_; + my ( $key, @files, $i ); + my $start = $opts{s} ? 1 : 0; + + foreach $key (keys %md5s) { + if ( @{$md5s{$key}} > 1 ) { + @files = sort @{$md5s{$key}}; + for $i ($start .. $#files) { + chomp $files[$i]; + &output("$files[$i]\n"); + } + unless($opts{s}) { print "\n"; } + } + } +} + # escape output if necessary sub output($) { my $string = shift; - if ($opt_e) { + if ($opts{e}) { $string =~ s/\\/\\\\/g; $string =~ s/ /\\ /g; $string =~ s/'/\\'/g; @@ -101,5 +184,22 @@ sub output($) { } else { print "$string"; } - $opt_e = $opt_e; } + +sub help { + my $name = $0; + $name =~ s/.*\///; + print <