publicscripts/list_same/list_same

258 lines
4.9 KiB
Text
Raw Normal View History

#!/usr/bin/perl -w
2002-07-10 09:08:46 +00:00
# $Id$
# $Source$
# Generates a list of files in cur. dir which are actually the same
# Compares size and MD5 checksum
# Handy for cleaning up pictures like this:
# list_same -es | xargs rm
# wishlist:
# - entering a list of file to check on the commandline
2002-07-10 09:08:46 +00:00
# - option to ignore 0 length files
use strict;
2002-07-10 09:08:46 +00:00
use Digest::MD5;
use Getopt::Std;
my %opts = ();
my @dirs;
&cmdline;
my @filelist = &getdir;
if ( $opts{m} ) {
&mp3_comp(@filelist);
} else {
my %sizes = &get_sizes(@filelist);
&quick_comp(%sizes);
}
#################################
# functions
sub cmdline {
2004-04-24 20:19:25 +00:00
getopts('dehms', \%opts);
if ( $opts{h} ) { &help; }
map { if ( -d $_ ) { s/\/$//; push @dirs, $_; } else { die "$_ not a directory" } } @ARGV;
if ( scalar @dirs == 0 ) {
push @dirs, ".";
}
# map { print "$_\n"; } @dirs;
}
sub getdir {
my ( $file, @filelist );
foreach ( @dirs ) {
opendir(DIR, $_) or die "can't open $_: $!";
while (defined($file = readdir(DIR))) {
if ( -f "$_/$file" ) {
push @filelist, "$_/$file";
}
}
closedir(DIR);
}
return @filelist;
}
sub get_sizes(@) {
my @filelist = @_;
my ( @stat, %sizes );
2003-01-23 14:26:11 +00:00
foreach (@filelist) {
@stat = stat $_;
push @{$sizes{$stat[7]}}, $_;
}
return %sizes;
}
sub quick_comp(%) {
my %sizes = @_;
my ( $size, %md5s );
foreach $size (keys %sizes) {
if (@{$sizes{$size}} > 1) {
%md5s = ();
foreach (@{$sizes{$size}}) {
push @{$md5s{&calc_md5($_)}}, $_;
}
&output_doubles(%md5s);
}
}
}
# same md5 calculation i use in mv_wrap
2002-11-14 12:14:13 +00:00
sub calc_md5($) {
2003-01-23 22:38:02 +00:00
my $file = shift;
my ( $digest, $md5, $FILE );
2002-07-10 09:08:46 +00:00
$md5 = Digest::MD5->new;
open $FILE, "<$file" or die "couldn't open file: $!\n";
seek($FILE, 0, 0);
$md5->reset;
$md5->addfile($FILE);
$digest = $md5->hexdigest;
close($FILE);
return $digest;
}
2003-01-23 14:26:11 +00:00
sub mp3_comp(@) {
my @filelist = @_;
my %md5s = ();
foreach ( @filelist ) {
push @{$md5s{&calc_mp3md5($_)}}, $_;
}
&output_doubles(%md5s);
}
sub calc_mp3md5($) {
my $file = shift;
my ($fh, $off, $size);
my $buf = 4096*1024;
open($fh, "<$file") or die "Couldn't open file: $!\n";
binmode($fh);
seek $fh, 0, 2; # go to end of file
my $eof = tell $fh;
if ($size = &has_v1_tag($fh)) {
$eof -= $size;
}
$off = 0;
if ($size = &has_v2_tag($fh)) {
$off = $size;
}
my $md5 = Digest::MD5->new;
while ($off < $eof) {
seek $fh, $off, 0;
if ($buf > ($eof - $off)) { $buf = $eof - $off; }
read $fh, my($bytes), $buf;
$md5->add($bytes);
$off += $buf;
}
close $fh;
return $md5->hexdigest;
}
sub has_v1_tag {
my $fh = shift;
seek $fh, -128, 2;
if (<$fh> =~ /^TAG/) {
return 128;
}
return 0;
}
sub has_v2_tag {
my $fh = shift;
my ($head, @bytes);
my $tagsize = 10;
seek $fh, 0, 0;
read $fh, $head, 3;
if ($head =~ /^ID3/) {
read $fh, $head, 3; # skip 3 bytes
read $fh, $head, 4;
@bytes = reverse unpack 'C4', $head;
foreach (0 .. 3) {
$tagsize += $bytes[$_] * 128 ** $_;
}
return $tagsize;
}
return 0;
}
sub output_doubles(@) {
my %md5s = @_;
my ( $key, @files, $i );
my $start = $opts{s} ? 1 : 0;
foreach $key (keys %md5s) {
if ( @{$md5s{$key}} > 1 ) {
@files = sort dirsort @{$md5s{$key}};
for $i ($start .. $#files) {
chomp $files[$i];
2004-04-24 20:19:25 +00:00
if ($opts{d}) {
print "Deleting: $files[$i]\n";
unlink($files[$i]) or die "Couldn't unlink $files[$i]";
} else {
&output("$files[$i]\n");
}
}
unless($opts{s}) { print "\n"; }
}
}
}
2003-01-23 14:26:11 +00:00
# escape output if necessary
sub output($) {
my $string = shift;
if ($opts{e}) {
$string = &escape($string);
2003-01-23 14:26:11 +00:00
}
2003-01-23 22:38:02 +00:00
print "$string";
2004-04-24 20:19:25 +00:00
}
sub escape($) {
my $string = shift;
$string =~ s/([ \&\;\`\'\\\"\|\*\?\~\<\>\^\(\)\[\]\{\}\$\010\013\020\011])/\\${1}/g;
return $string;
}
sub unescape($) {
my $string = shift;
$string =~ s/\\([ \&\;\`\'\\\"\|\*\?\~\<\>\^\(\)\[\]\{\}\$\010\013\020\011])/${1}/g;
return $string;
}
sub dirsort {
my $firstdir = &escape($dirs[0]);
my $debuglevel = 0;
print "$a $b\n" if $debuglevel == 1;
if ( $a =~ /^$firstdir\/[^\/]*$/o && $b =~ /^$firstdir\/[^\/]*$/o ) {
print "A $a $b\n" if $debuglevel == 1;
return &wardsort;
} elsif ( ( $a =~ /^$firstdir\/[^\/]*$/o ) && ! ( $b =~ /^$firstdir\/[^\/]*$/o ) ) {
print "B $a $b\n" if $debuglevel == 1;
return -1;
} elsif ( ! ( $a =~ /^$firstdir\/[^\/]*$/o ) && ( $b =~ /^$firstdir\/[^\/]*$/o ) ) {
print "C $a $b\n" if $debuglevel == 1;
return 1;
} else {
print "D $a $b\n" if $debuglevel == 1;
return &wardsort;
}
}
sub wardsort {
my @a = split(/([0-9]+)/,$a);
my @b = split(/([0-9]+)/,$b);
foreach my $x (@a)
{
my $y = shift @b;
my $r = ($x =~ /^[0-9]+$/ && $y =~ /^[0-9]+$/) ?
($x <=> $y) : ($x cmp $y);
$r && return $r;
}
return -1 if (@b);
0;
}
sub help {
my $name = $0;
$name =~ s/.*\///;
print <<EOF;
Usage: $name [OPTION] ...
2004-04-24 20:19:25 +00:00
-d delete resulting files
-e escape output filenames with backslashes
-h display this help message
-m mp3 compare, ignores ID3 tags (slow)
-s skip the first entry for doubles
EOF
exit;
}