#!/usr/bin/perl -w
# $Id$
# $Source$
#
# Generates a list of files in the current directory which are actually
# the same. Compares size and MD5 checksum.
#
# Handy for cleaning up pictures like this:
#   list_same -es | xargs rm
#
# Options (as used below):
#   -e  backslash-escape shell-special characters in the printed names
#   -h  call help()
#   -m  MP3 mode: hash only the bytes between a leading ID3v2 tag and a
#       trailing ID3v1 tag, so files differing only in tags count as equal
#   -s  skip the first file of every group and print no blank separator
#
# wishlist:
# - entering a list of files to check on the commandline
# - option to ignore 0 length files

use strict;
use Digest::MD5;
use Getopt::Std;

my %opts = ();
getopts('ehms', \%opts);

help() if $opts{h};

my @filelist = getdir();

if ( $opts{m} ) {
    mp3_comp(@filelist);
}
else {
    my %sizes = get_sizes(@filelist);
    quick_comp(%sizes);
}

#################################
# functions
#
# None of the subs below carries a prototype any more: every call in this
# script used the &name(...) form, which bypasses prototypes, so they never
# had any effect.

# Returns the names of all plain files in the current directory.
# Dies if the directory cannot be opened.
sub getdir {
    opendir(my $dh, ".") or die "can't open . $!";
    my @files = grep { -f $_ } readdir($dh);
    closedir($dh);
    return @files;
}

# Groups file names by size.
# Takes a list of file names; returns a hash (as a flat list) mapping
# size in bytes => array ref of the names having that size.
# Files that cannot be stat'ed (e.g. removed in the meantime) are reported
# on STDERR and skipped instead of being filed under an undefined size.
sub get_sizes {
    my @filelist = @_;
    my %sizes;
    foreach my $file (@filelist) {
        my @stat = stat $file;
        unless (@stat) {
            warn "can't stat $file: $!\n";
            next;
        }
        push @{ $sizes{ $stat[7] } }, $file;    # field 7 of stat is the size
    }
    return %sizes;
}

# Takes the size => names hash built by get_sizes. For every size shared by
# more than one file, the MD5 of each of those files is computed and the
# resulting digest => names groups are handed to output_doubles.
sub quick_comp {
    my %sizes = @_;
    foreach my $size (keys %sizes) {
        my @same_size = @{ $sizes{$size} };
        next unless @same_size > 1;    # a unique size cannot have a twin

        my %md5s;
        foreach my $file (@same_size) {
            push @{ $md5s{ calc_md5($file) } }, $file;
        }
        output_doubles(%md5s);
    }
}

# same md5 calculation i use in mv_wrap
# Returns the hex MD5 digest of the whole content of $file.
# Dies if the file cannot be opened.
sub calc_md5 {
    my $file = shift;

    # Three-argument open: the name is never parsed for mode characters or
    # surrounding whitespace, so any file name found by readdir works.
    open(my $fh, '<', $file) or die "couldn't open file: $!\n";
    binmode($fh);    # digest the raw bytes on every platform
    my $digest = Digest::MD5->new->addfile($fh)->hexdigest;
    close($fh);
    return $digest;
}

# MP3 mode: groups all given files by the digest returned by calc_mp3md5
# (regardless of file size) and hands the groups to output_doubles.
sub mp3_comp {
    my @filelist = @_;
    my %md5s = ();
    foreach my $file (@filelist) {
        push @{ $md5s{ calc_mp3md5($file) } }, $file;
    }
    output_doubles(%md5s);
}

# Returns the hex MD5 digest of $file with a leading ID3v2 tag and a
# trailing ID3v1 tag (as detected by has_v2_tag / has_v1_tag) left out.
# Dies if the file cannot be opened, positioned or read.
sub calc_mp3md5 {
    my $file  = shift;
    my $chunk = 4096 * 1024;    # read at most 4 MiB per iteration

    open(my $fh, '<', $file) or die "Couldn't open file: $!\n";
    binmode($fh);

    seek($fh, 0, 2);            # go to end of file
    my $eof = tell $fh;
    $eof -= has_v1_tag($fh);    # 128 when a v1 tag is present, else 0
    my $off = has_v2_tag($fh);  # size of the v2 tag, 0 when absent

    my $md5       = Digest::MD5->new;
    my $remaining = $eof - $off;
    if ($remaining > 0) {
        seek($fh, $off, 0) or die "Couldn't seek in file: $!\n";
    }
    while ($remaining > 0) {
        my $want = $remaining < $chunk ? $remaining : $chunk;
        my $got  = read($fh, my $bytes, $want);
        die "Couldn't read file: $!\n" unless defined $got;
        last if $got == 0;      # file got shorter while we were reading
        $md5->add($bytes);
        $remaining -= $got;     # advance by what was actually read
    }
    close $fh;
    return $md5->hexdigest;
}

# Returns 128 if the last 128 bytes of the open handle start with "TAG",
# otherwise 0. Files shorter than 128 bytes (where the seek fails) yield 0.
# Leaves the file position changed.
sub has_v1_tag {
    my $fh = shift;
    seek($fh, -128, 2) or return 0;

    # Read exactly the three marker bytes; a line-oriented read is not
    # meaningful on binary data.
    my $got = read($fh, my $marker, 3);
    return 0 unless defined $got && $got == 3;
    return $marker eq 'TAG' ? 128 : 0;
}

# Returns the total size (10 header bytes plus the encoded size) of a tag
# starting with "ID3" at the beginning of the open handle, or 0 if there is
# none or the 10-byte header is incomplete. Leaves the file position changed.
sub has_v2_tag {
    my $fh = shift;
    seek($fh, 0, 0) or return 0;

    my $got = read($fh, my $header, 10);
    return 0 unless defined $got && $got == 10;
    return 0 unless substr($header, 0, 3) eq 'ID3';

    # Bytes 3..5 are skipped; bytes 6..9 hold the size, most significant
    # byte first, each byte weighted by a power of 128.
    my $tagsize = 0;
    foreach my $byte (unpack 'C4', substr($header, 6, 4)) {
        $tagsize = $tagsize * 128 + $byte;
    }
    return $tagsize + 10;
}

# Takes a digest => names hash and prints, for every digest shared by more
# than one file, the sorted names one per line (via output).
# With -s the first name of each group is left out and no blank line is
# printed after the group; without -s every group ends with a blank line.
sub output_doubles {
    my %md5s  = @_;
    my $start = $opts{s} ? 1 : 0;
    foreach my $key (keys %md5s) {
        next unless @{ $md5s{$key} } > 1;

        my @files = sort @{ $md5s{$key} };
        foreach my $i ($start .. $#files) {
            chomp $files[$i];
            output("$files[$i]\n");
        }
        print "\n" unless $opts{s};
    }
}

# escape output if necessary
# Prints $string; with -e every character of the class below is prefixed
# with a backslash first. The octal escapes in the class are \010 (BS),
# \013 (VT), \020 (DLE) and \011 (TAB); newline is not part of it.
sub output {
    my $string = shift;
    if ($opts{e}) {
        $string =~ s/([ \&\;\`\'\\\"\|\*\?\~\<\>\^\(\)\[\]\{\}\$\010\013\020\011])/\\${1}/g;
    }
    print $string;
}

# NOTE(review): the definition of help is cut off in the reviewed text (it
# ends in the middle of a print statement), so its visible part is
# reproduced unchanged below.
sub help { my $name = $0; $name =~ s/.*\///; print <