On 21.09.24 20:53, fir wrote:
i think if to write a simple comandline program
that remove duplicates in a given folder
[...]
I have had the same problem. My solution was to use extended file attributes together with a file checksum (e.g. sha512sum). I wrote this in Perl (see code below). Because the checksum is kept in the file attributes, I can re-run the program after a while without having to re-calculate the checksums.
So, this solution only works for filesystems that have extended file attributes, but you could also use some simple database (sqlite3?) to map checksums to pathnames.
What I did was to walk through the directory tree and check whether the file being considered already has a checksum in an extended attribute. If not, I calculate the checksum and store it in the extended attribute. I also store the pathname in a hash (remember, this is Perl), keyed by the checksum.
If there is a collision (checksum already in the hash), I remove the new file (and link the new filename to the old file). One could be paranoid and do a byte-by-byte file comparison then.
If I needed to do this in a C program, I'd probably use a GList to store the hash, but otherwise the code logic would be the same.
HTH,
Josef
#! /usr/bin/perl
use warnings;
use strict;
use File::ExtAttr ':all'; # In case of problems, maybe insert "use Scalar:Utils;" in /usr/lib/x86_64-linux-gnu/perl5/5.22/File/ExtAttr.pm
use Digest::SHA;
use File::Find;
use Getopt::Std;
# OPTIONS:
# s: force symlink
# n: don't do the actual removing/linking
# v: be more verbose
# h: print short help
# Command-line switches, filled in by getopts() below.
my %opt = (
    s => undef,    # use symbolic links rather than hard links
    n => undef,    # dry run: only print what would be done
    v => undef,    # verbose: report pathnames and checksums
    h => undef,    # print the usage text and exit
);

# getopts() returns false when it meets a switch it does not know.
# Stop in that case instead of carrying on: a mistyped "-n" must never
# turn an intended dry run into a real, destructive run.
if (!getopts('hnsv', \%opt)) {
    print_usage();
    exit(2);
}
if ($opt{h}) {
    print_usage();
    exit(0);
}

# Maps a SHA512 hex digest to the pathname currently kept for that
# content; read and updated by lndup().
my %file;

# Walk the directories named on the command line, or the current
# directory when none are given.  no_chdir keeps the working directory
# fixed, so $File::Find::name stays valid as a pathname inside lndup().
my @roots = @ARGV ? @ARGV : ('.');
find({ wanted => \&lndup, no_chdir => 1 }, @roots);

# NAME: print_usage
# PURPOSE: To print the short help text to STDERR
# ARGUMENTS: None
# RETURNS: Nothing
sub print_usage {
    print STDERR "usage: lndup [-snvh] [dirname..]\n";
    print STDERR "\t-s: use symlink rather than hard link\n";
    print STDERR "\t-n: don't remove/link, just show what would be done\n";
    print STDERR "\t-v: be more verbose (show pathname and SHA512 sum)\n";
    print STDERR "\t-h: show this text\n";
    return;
}
# NAME: lndup
# PURPOSE: To handle a single file
# ARGUMENTS: None, pathname is taken from $File::Find::name
# RETURNS: Nothing
# NOTE: The SHA512 sum of a file is read from its "SHA512" extended
# attribute, or calculated and stored there if the attribute is missing.
# IF a file with the same sum was already found earlier, AND
# IF both files are NOT the same (device or inode differ)
# THEN one of the two names is replaced by a link to the other:
# a hard link if both are on the same disk and -s was not given,
# otherwise a symbolic link to the absolute path of the kept file.
# The earlier name is the one replaced when it is a symbolic link or
# has fewer hard links than the new one; otherwise the new name is
# replaced.  With -n the command is only printed, nothing is changed.
# Empty files and anything that is not a plain file are skipped.
sub lndup {
    my $pathname = $File::Find::name;

    # -f follows symbolic links, so a symlink to a plain file passes.
    return if !-f $pathname;
    return if !-s $pathname;

    # NOTE(review): a checksum found in the attribute is trusted as-is;
    # nothing here detects that the file was modified after the attribute
    # was written.  Confirm that is acceptable, because a stale value
    # could make two different files look identical.
    my $sha512sum = getfattr($pathname, 'SHA512');
    if (!defined $sha512sum) {
        my $ctx = Digest::SHA->new(512);
        $ctx->addfile($pathname);
        $sha512sum = $ctx->hexdigest;
        print STDERR "$pathname $sha512sum\n" if $opt{v};
        setfattr($pathname, "SHA512", $sha512sum);
    }
    elsif ($opt{v}) {
        print STDERR "Using sha512sum from attributes\n";
    }

    # First file seen with this content: remember it and move on.
    if (!exists $file{$sha512sum}) {
        $file{$sha512sum} = $pathname;
        return;
    }

    # Already the same inode (e.g. linked by an earlier run): nothing to do.
    return if same_file($pathname, $file{$sha512sum});

    my $links1 = (stat($pathname))[3];
    my $links2 = (stat($file{$sha512sum}))[3];

    # Make sure the name that is about to be replaced is $pathname:
    # either the remembered name is a symbolic link, or $pathname has
    # more hard links than it.  swap() exchanges its arguments in place,
    # so the entry in %file is updated as well.
    if (is_symlink($file{$sha512sum}) || $links1 > $links2) {
        print STDERR "Swapping $pathname and $file{$sha512sum}\n" if $opt{v};
        swap($file{$sha512sum}, $pathname);
    }

    # Decide the kind of link before anything is removed; same_disk()
    # needs both names to still exist.
    my $target      = $file{$sha512sum};
    my $use_symlink = $opt{s} || !same_disk($pathname, $target);
    if ($use_symlink) {
        # The pathnames are relative to the current directory, but a
        # symlink's target is resolved relative to the directory holding
        # the link.  Use an absolute target so the link cannot dangle.
        require File::Spec;
        $target = File::Spec->rel2abs($target);
    }

    my $ln = $use_symlink ? 'ln -s' : 'ln';
    print "rm \"$pathname\"; $ln \"$target\" \"$pathname\"\n";
    return if $opt{n};

    # Create the link under a temporary name and rename it over
    # $pathname.  If creating the link fails, $pathname is left untouched
    # instead of having been removed already.
    my $tmp  = "$pathname.lndup.$$";
    my $made = $use_symlink ? symlink($target, $tmp) : link($target, $tmp);
    if (!$made) {
        my $what = $use_symlink ? 'symlink' : 'link';
        print STDERR "Failed to $what($target, $pathname): $!\n";
        return;
    }
    if (!rename($tmp, $pathname)) {
        print STDERR "Failed to replace $pathname: $!\n";
        unlink($tmp);
    }
    return;
}
# NAME: same_disk
# PURPOSE: To check if two files are on the same disk
# ARGUMENTS: pn1, pn2: pathnames of files
# RETURNS: true if files are on the same disk, else false
# NOTE: The check is made by comparing the device numbers of the
# filesystems of the two files.
# If either file cannot be stat()ed the answer is false.
sub same_disk {
    my ($pn1, $pn2) = @_;
    my @s1 = stat($pn1);
    my @s2 = stat($pn2);

    # stat() returns an empty list on failure; without this guard two
    # failures would compare undef == undef and count as "same disk".
    return 0 if !@s1 || !@s2;

    return $s1[0] == $s2[0];
}
# NAME: same_file
# PURPOSE: To check if two files are the same
# ARGUMENTS: pn1, pn2: pathnames of files
# RETURNS: true if files are the same, else false
# NOTE: files are the same if device number AND inode number
# are identical.
# If either file cannot be stat()ed the answer is false.
sub same_file {
    my ($pn1, $pn2) = @_;
    my @s1 = stat($pn1);
    my @s2 = stat($pn2);

    # stat() returns an empty list on failure; without this guard two
    # failures would compare undef == undef and count as "same file".
    return 0 if !@s1 || !@s2;

    return ($s1[0] == $s2[0]) && ($s1[1] == $s2[1]);
}
# NAME: is_symlink
# PURPOSE: To check if a pathname is a symbolic link
# ARGUMENTS: pathname to test
# RETURNS: the result of the -l file test on that pathname
sub is_symlink {
    my $candidate = shift;
    return -l $candidate;
}
# NAME: swap
# PURPOSE: To exchange the values of two variables in place
# ARGUMENTS: the two variables to exchange
# RETURNS: Nothing of interest to the caller
# NOTE: The elements of @_ are aliases of the caller's arguments, so
# assigning to $_[0] and $_[1] changes the caller's variables.
sub swap {
    my ($first, $second) = @_;
    $_[0] = $second;
    $_[1] = $first;
}