#!/usr/bin/perl -w

# Generates a list of files in cur. dir which are actually the same
# Compares size and MD5 checksum
# Handy for cleaning up pictures like this:
# list_same -sd

# wishlist:
#	- entering a list of file to check on the commandline
#	- option to ignore 0 length files
#	- recurse through directories
#	- hardlink same files

use strict;
use Digest::MD5;
use Getopt::Std;

my %opts = ();
my @dirs;
my @files;

&cmdline;

my @filelist = &getdir;
push @filelist, @files;

if ( $opts{m} ) {
	&mp3_comp(@filelist);
} else {
	if ( $opts{f} or $opts{a} ) {
		&get_md5s(@filelist);
	} else {
		my %sizes = &get_sizes(@filelist);
		&quick_comp(%sizes);
	}
}

#################################
# functions

sub cmdline {
	getopts('adefhlms', \%opts);
	if ( $opts{h} ) { &help; }

	map { if ( -d $_ ) {
		s/\/$//;
		push @dirs, $_;
	} else {
		if ( -f $_ ) {
			push @files, $_;
		} else {
			die "$_ not a directory or file"
		}
	} } @ARGV;
	if ( scalar @dirs == 0 && scalar @files == 0 ) {
		push @dirs, ".";
	}

	if ( $opts{l} && ( $opts{e} || $opts{d} || $opts{s} ) ) {
		print "-l doens't combine with -d, -e or -s\n\n";
		&help;
	}
}

sub getdir {
	my ( $file, @filelist );
	foreach ( @dirs ) {
		opendir(DIR, $_) or die "can't open $_: $!";
		while (defined($file = readdir(DIR))) {
			if ( -f "$_/$file" ) {
				push @filelist, "$_/$file";
			}
		}
		closedir(DIR);
	}
	return @filelist;
}

sub get_sizes(@) {
	my @filelist = @_;
	my ( @stat, %sizes );

	foreach (@filelist) {
		@stat = stat $_;
		push @{$sizes{$stat[7]}}, $_;
	}
	return %sizes;
}

sub get_md5s(@) {
	my %md5s;
	foreach (@_) {
		push @{$md5s{&calc_md5($_)}}, $_;
	}
	&output_doubles(%md5s);
}

sub quick_comp(%) {
	my %sizes = @_;
	my ( $size, %md5s );

	foreach $size (keys %sizes) {
		if (@{$sizes{$size}} > 1) {
			%md5s = ();
			foreach (@{$sizes{$size}}) {
				push @{$md5s{&calc_md5($_)}}, $_;
			}
			&output_doubles(%md5s);
		}
	}
}

# same md5 calculation i use in mv_wrap
sub calc_md5($) {
	my $file = shift;
	my ( $digest, $fh, $off, $startoff );
	my $buf = 4096*1024;
	my  $md5 = Digest::MD5->new;
	$md5 = Digest::MD5->new;
	open($fh, "<$file") or die "Couldn't open file: $!\n";
	binmode($fh);
	seek $fh, 0, 2;		# go to end of file
	my $eof = tell $fh;

	seek($fh, 0, 0);
	if ( $opts{f} ) {
		readline($fh);
		$off = tell $fh;
	} else {
		$off = 0;
	}

	while ($off < $eof) {
		seek $fh, $off, 0;
		if ($buf > ($eof - $off)) { $buf = $eof - $off; }
		read $fh, my($bytes), $buf;
		$md5->add($bytes);
		$off += $buf;
	}
	close $fh;
	return $md5->hexdigest;


}

sub mp3_comp(@) {
	my @filelist = @_;
	my %md5s = ();
	foreach ( @filelist ) {
		push @{$md5s{&calc_mp3md5($_)}}, $_;
	}
	&output_doubles(%md5s);
}

sub calc_mp3md5($) {
	my $file = shift;
	my ($fh, $off, $size);
	my $buf = 4096*1024;

	open($fh, "<$file") or die "Couldn't open file: $!\n";
	binmode($fh);
	seek $fh, 0, 2;		# go to end of file
	my $eof = tell $fh;
	if ($size = &has_v1_tag($fh)) {
		$eof -= $size;
	}

	$off = 0;
	if ($size = &has_v2_tag($fh)) {
		$off = $size;
	}
	my  $md5 = Digest::MD5->new;
	while ($off < $eof) {
		seek $fh, $off, 0;
		if ($buf > ($eof - $off)) { $buf = $eof - $off; }
		read $fh, my($bytes), $buf;
		$md5->add($bytes);
		$off += $buf;
	}
	close $fh;
	return $md5->hexdigest;
}

sub has_v1_tag {
	my $fh = shift;
	seek $fh, -128, 2;
	if (<$fh> =~ /^TAG/) {
		return 128;
	}
	return 0;
}

sub has_v2_tag {
	my $fh = shift;
	my ($head, @bytes);
	my $tagsize = 10;
	seek $fh, 0, 0;
	read $fh, $head, 3;
	if ($head =~ /^ID3/) {
		read $fh, $head, 3; # skip 3 bytes
		read $fh, $head, 4;
		@bytes = reverse unpack 'C4', $head;
		foreach (0 .. 3) {
			$tagsize += $bytes[$_] * 128 ** $_;
		}
		return $tagsize;
	}
	return 0;
}

sub output_doubles(@) {
	my %md5s = @_;
	my ( $key, @files, $i, $numkeys );
	my $start = $opts{s} ? 1 : 0;

	if ( $opts{a} ) { $numkeys = 0; } else { $numkeys = 1; }

	foreach $key (keys %md5s) {
		if ( @{$md5s{$key}} > $numkeys ) {
			if ( scalar @dirs > 0 ) {
				@files = sort dirsort @{$md5s{$key}};
			} else {
				@files = @{$md5s{$key}};
			}
			for $i ($start .. $#files) {
				chomp $files[$i];
				if ($opts{d}) {
					print "Deleting: $files[$i]\n";
					unlink($files[$i]) or die "Couldn't unlink $files[$i]";
				} else {
					&output("$files[$i]\n");
				}
			}
			unless($opts{s}) { print "\n"; }
		}
	}
}

# escape output if necessary
sub output($) {
	my $string = shift;
	if ($opts{e}) {
		$string = &escape($string);
	}
	print "$string";
} 

sub escape($) {
	my $string = shift;
	$string =~ s/([ \&\;\`\'\\\"\|\*\?\~\<\>\^\(\)\[\]\{\}\$\010\013\020\011])/\\${1}/g;
	return $string;
}

sub unescape($) {
	my $string = shift;
	$string =~ s/\\([ \&\;\`\'\\\"\|\*\?\~\<\>\^\(\)\[\]\{\}\$\010\013\020\011])/${1}/g;
	return $string;
}

sub dirsort {
	my $firstdir = &escape($dirs[0]);
	my $debuglevel = 0;
	print "$a $b\n" if $debuglevel == 1;
	if ( $a =~ /^$firstdir\/[^\/]*$/o && $b =~ /^$firstdir\/[^\/]*$/o ) {
		print "A $a $b\n" if $debuglevel == 1;
		return &wardsort;
	} elsif ( ( $a =~ /^$firstdir\/[^\/]*$/o ) && ! ( $b =~ /^$firstdir\/[^\/]*$/o ) ) {
		print "B $a $b\n" if $debuglevel == 1;
		return -1;
	} elsif ( ! ( $a =~ /^$firstdir\/[^\/]*$/o ) && ( $b =~ /^$firstdir\/[^\/]*$/o ) ) {
		print "C $a $b\n" if $debuglevel == 1;
		return 1;
	} else {
		print "D $a $b\n" if $debuglevel == 1;
		return &wardsort;
	}
}

sub wardsort {
	my @a = split(/([0-9]+)/,$a);
	my @b = split(/([0-9]+)/,$b);

	foreach my $x (@a)
	{
		my $y = shift @b;
		my $r = ($x =~ /^[0-9]+$/ && $y =~ /^[0-9]+$/) ?
					($x <=> $y) :  ($x cmp $y);
		$r && return $r;
	}
	return -1 if (@b);
	0;
}

sub help {
	my $name = $0;
	$name =~ s/.*\///;
	print <<EOF;

Usage: $name [OPTION] ...

	-a		output all files
	-d		delete resulting files
	-e		escape output filenames with backslashes
	-f		skip first line
	-h		display this help message
	-l		hardlink resulting files (no change if on
			different filesystems)
	-m		mp3 compare, ignores ID3 tags (slow)
	-s		skip the first entry for doubles

EOF
	exit;
}
