313 lines
5.9 KiB
Perl
Executable file
313 lines
5.9 KiB
Perl
Executable file
#!/usr/bin/perl -w
|
|
|
|
# Generates a list of files in cur. dir which are actually the same
|
|
# Compares size and MD5 checksum
|
|
# Handy for cleaning up pictures like this:
|
|
# list_same -sd
|
|
|
|
# wishlist:
|
|
# - entering a list of file to check on the commandline
|
|
# - option to ignore 0 length files
|
|
# - recurse through directories
|
|
# - hardlink same files
|
|
|
|
use strict;
|
|
use Digest::MD5;
|
|
use Getopt::Std;
|
|
|
|
my %opts = ();
|
|
my @dirs;
|
|
my @files;
|
|
|
|
&cmdline;
|
|
|
|
my @filelist = &getdir;
|
|
push @filelist, @files;
|
|
|
|
if ( $opts{m} ) {
|
|
&mp3_comp(@filelist);
|
|
} else {
|
|
if ( $opts{f} or $opts{a} ) {
|
|
&get_md5s(@filelist);
|
|
} else {
|
|
my %sizes = &get_sizes(@filelist);
|
|
&quick_comp(%sizes);
|
|
}
|
|
}
|
|
|
|
#################################
|
|
# functions
|
|
|
|
sub cmdline {
|
|
getopts('adefhlms', \%opts);
|
|
if ( $opts{h} ) { &help; }
|
|
|
|
map { if ( -d $_ ) {
|
|
s/\/$//;
|
|
push @dirs, $_;
|
|
} else {
|
|
if ( -f $_ ) {
|
|
push @files, $_;
|
|
} else {
|
|
die "$_ not a directory or file"
|
|
}
|
|
} } @ARGV;
|
|
if ( scalar @dirs == 0 && scalar @files == 0 ) {
|
|
push @dirs, ".";
|
|
}
|
|
|
|
if ( $opts{l} && ( $opts{e} || $opts{d} || $opts{s} ) ) {
|
|
print "-l doens't combine with -d, -e or -s\n\n";
|
|
&help;
|
|
}
|
|
}
|
|
|
|
sub getdir {
|
|
my ( $file, @filelist );
|
|
foreach ( @dirs ) {
|
|
opendir(DIR, $_) or die "can't open $_: $!";
|
|
while (defined($file = readdir(DIR))) {
|
|
if ( -f "$_/$file" ) {
|
|
push @filelist, "$_/$file";
|
|
}
|
|
}
|
|
closedir(DIR);
|
|
}
|
|
return @filelist;
|
|
}
|
|
|
|
sub get_sizes(@) {
|
|
my @filelist = @_;
|
|
my ( @stat, %sizes );
|
|
|
|
foreach (@filelist) {
|
|
@stat = stat $_;
|
|
push @{$sizes{$stat[7]}}, $_;
|
|
}
|
|
return %sizes;
|
|
}
|
|
|
|
sub get_md5s(@) {
|
|
my %md5s;
|
|
foreach (@_) {
|
|
push @{$md5s{&calc_md5($_)}}, $_;
|
|
}
|
|
&output_doubles(%md5s);
|
|
}
|
|
|
|
sub quick_comp(%) {
|
|
my %sizes = @_;
|
|
my ( $size, %md5s );
|
|
|
|
foreach $size (keys %sizes) {
|
|
if (@{$sizes{$size}} > 1) {
|
|
%md5s = ();
|
|
foreach (@{$sizes{$size}}) {
|
|
push @{$md5s{&calc_md5($_)}}, $_;
|
|
}
|
|
&output_doubles(%md5s);
|
|
}
|
|
}
|
|
}
|
|
|
|
# same md5 calculation i use in mv_wrap
|
|
sub calc_md5($) {
|
|
my $file = shift;
|
|
my ( $digest, $fh, $off, $startoff );
|
|
my $buf = 4096*1024;
|
|
my $md5 = Digest::MD5->new;
|
|
$md5 = Digest::MD5->new;
|
|
open($fh, "<$file") or die "Couldn't open file: $!\n";
|
|
binmode($fh);
|
|
seek $fh, 0, 2; # go to end of file
|
|
my $eof = tell $fh;
|
|
|
|
seek($fh, 0, 0);
|
|
if ( $opts{f} ) {
|
|
readline($fh);
|
|
$off = tell $fh;
|
|
} else {
|
|
$off = 0;
|
|
}
|
|
|
|
while ($off < $eof) {
|
|
seek $fh, $off, 0;
|
|
if ($buf > ($eof - $off)) { $buf = $eof - $off; }
|
|
read $fh, my($bytes), $buf;
|
|
$md5->add($bytes);
|
|
$off += $buf;
|
|
}
|
|
close $fh;
|
|
return $md5->hexdigest;
|
|
|
|
|
|
}
|
|
|
|
sub mp3_comp(@) {
|
|
my @filelist = @_;
|
|
my %md5s = ();
|
|
foreach ( @filelist ) {
|
|
push @{$md5s{&calc_mp3md5($_)}}, $_;
|
|
}
|
|
&output_doubles(%md5s);
|
|
}
|
|
|
|
sub calc_mp3md5($) {
|
|
my $file = shift;
|
|
my ($fh, $off, $size);
|
|
my $buf = 4096*1024;
|
|
|
|
open($fh, "<$file") or die "Couldn't open file: $!\n";
|
|
binmode($fh);
|
|
seek $fh, 0, 2; # go to end of file
|
|
my $eof = tell $fh;
|
|
if ($size = &has_v1_tag($fh)) {
|
|
$eof -= $size;
|
|
}
|
|
|
|
$off = 0;
|
|
if ($size = &has_v2_tag($fh)) {
|
|
$off = $size;
|
|
}
|
|
my $md5 = Digest::MD5->new;
|
|
while ($off < $eof) {
|
|
seek $fh, $off, 0;
|
|
if ($buf > ($eof - $off)) { $buf = $eof - $off; }
|
|
read $fh, my($bytes), $buf;
|
|
$md5->add($bytes);
|
|
$off += $buf;
|
|
}
|
|
close $fh;
|
|
return $md5->hexdigest;
|
|
}
|
|
|
|
sub has_v1_tag {
|
|
my $fh = shift;
|
|
seek $fh, -128, 2;
|
|
if (<$fh> =~ /^TAG/) {
|
|
return 128;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
sub has_v2_tag {
|
|
my $fh = shift;
|
|
my ($head, @bytes);
|
|
my $tagsize = 10;
|
|
seek $fh, 0, 0;
|
|
read $fh, $head, 3;
|
|
if ($head =~ /^ID3/) {
|
|
read $fh, $head, 3; # skip 3 bytes
|
|
read $fh, $head, 4;
|
|
@bytes = reverse unpack 'C4', $head;
|
|
foreach (0 .. 3) {
|
|
$tagsize += $bytes[$_] * 128 ** $_;
|
|
}
|
|
return $tagsize;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
sub output_doubles(@) {
|
|
my %md5s = @_;
|
|
my ( $key, @files, $i, $numkeys );
|
|
my $start = $opts{s} ? 1 : 0;
|
|
|
|
if ( $opts{a} ) { $numkeys = 0; } else { $numkeys = 1; }
|
|
|
|
foreach $key (keys %md5s) {
|
|
if ( @{$md5s{$key}} > $numkeys ) {
|
|
if ( scalar @dirs > 0 ) {
|
|
@files = sort dirsort @{$md5s{$key}};
|
|
} else {
|
|
@files = @{$md5s{$key}};
|
|
}
|
|
for $i ($start .. $#files) {
|
|
chomp $files[$i];
|
|
if ($opts{d}) {
|
|
print "Deleting: $files[$i]\n";
|
|
unlink($files[$i]) or die "Couldn't unlink $files[$i]";
|
|
} else {
|
|
&output("$files[$i]\n");
|
|
}
|
|
}
|
|
unless($opts{s}) { print "\n"; }
|
|
}
|
|
}
|
|
}
|
|
|
|
# escape output if necessary
|
|
sub output($) {
|
|
my $string = shift;
|
|
if ($opts{e}) {
|
|
$string = &escape($string);
|
|
}
|
|
print "$string";
|
|
}
|
|
|
|
sub escape($) {
|
|
my $string = shift;
|
|
$string =~ s/([ \&\;\`\'\\\"\|\*\?\~\<\>\^\(\)\[\]\{\}\$\010\013\020\011])/\\${1}/g;
|
|
return $string;
|
|
}
|
|
|
|
sub unescape($) {
|
|
my $string = shift;
|
|
$string =~ s/\\([ \&\;\`\'\\\"\|\*\?\~\<\>\^\(\)\[\]\{\}\$\010\013\020\011])/${1}/g;
|
|
return $string;
|
|
}
|
|
|
|
sub dirsort {
|
|
my $firstdir = &escape($dirs[0]);
|
|
my $debuglevel = 0;
|
|
print "$a $b\n" if $debuglevel == 1;
|
|
if ( $a =~ /^$firstdir\/[^\/]*$/o && $b =~ /^$firstdir\/[^\/]*$/o ) {
|
|
print "A $a $b\n" if $debuglevel == 1;
|
|
return &wardsort;
|
|
} elsif ( ( $a =~ /^$firstdir\/[^\/]*$/o ) && ! ( $b =~ /^$firstdir\/[^\/]*$/o ) ) {
|
|
print "B $a $b\n" if $debuglevel == 1;
|
|
return -1;
|
|
} elsif ( ! ( $a =~ /^$firstdir\/[^\/]*$/o ) && ( $b =~ /^$firstdir\/[^\/]*$/o ) ) {
|
|
print "C $a $b\n" if $debuglevel == 1;
|
|
return 1;
|
|
} else {
|
|
print "D $a $b\n" if $debuglevel == 1;
|
|
return &wardsort;
|
|
}
|
|
}
|
|
|
|
sub wardsort {
|
|
my @a = split(/([0-9]+)/,$a);
|
|
my @b = split(/([0-9]+)/,$b);
|
|
|
|
foreach my $x (@a)
|
|
{
|
|
my $y = shift @b;
|
|
my $r = ($x =~ /^[0-9]+$/ && $y =~ /^[0-9]+$/) ?
|
|
($x <=> $y) : ($x cmp $y);
|
|
$r && return $r;
|
|
}
|
|
return -1 if (@b);
|
|
0;
|
|
}
|
|
|
|
sub help {
|
|
my $name = $0;
|
|
$name =~ s/.*\///;
|
|
print <<EOF;
|
|
|
|
Usage: $name [OPTION] ...
|
|
|
|
-a output all files
|
|
-d delete resulting files
|
|
-e escape output filenames with backslashes
|
|
-f skip first line
|
|
-h display this help message
|
|
-l hardlink resulting files (no change if on
|
|
different filesystems)
|
|
-m mp3 compare, ignores ID3 tags (slow)
|
|
-s skip the first entry for doubles
|
|
|
|
EOF
|
|
exit;
|
|
}
|