-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdupdnp.py
executable file
·97 lines (85 loc) · 3.84 KB
/
dupdnp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/usr/bin/python3
# find /path/to/search/ -type f -not -empty -printf '%p\t%s\n' | ./dupdnp.py
"""Report duplicate files among a "path<TAB>size" listing read from stdin.

Candidates are narrowed in four passes, each one discarding groups that
contain a single file:

1. same size (taken from the listing),
2. same leading bytes (the "header", 1024 or 4096 bytes),
3. same hash of the first 4 MiB (the "fragment"),
4. same hash of the whole file.

By default every path of a duplicate group except the first is printed;
with --all every path is printed and groups are separated by an empty line.
"""
import hashlib
import sys
from collections import defaultdict
from optparse import OptionParser

HEADER_WIDTH = 1024
FRAGMENT_WIDTH = 1024 * 1024 * 4
# Read size used while hashing, so large files are never held in memory whole.
CHUNK_WIDTH = 1024 * 1024


def parse_options(argv=None):
    """Parse the command line (``sys.argv[1:]`` when *argv* is None).

    Exits through ``parser.error`` when both --md5 and --sha1 are given.
    """
    parser = OptionParser(usage='%prog [--all] [--md5|--sha1] [--h4k]')
    parser.add_option('-a', '--all', help='print all files', action='store_true')
    parser.add_option('-4', '--h4k', help='use 4096 bytes per headers', action='store_true')
    parser.add_option('-m', '--md5', help='use md5 instead of xxhash', action='store_true')
    parser.add_option('-s', '--sha1', help='use sha1 instead of md5', action='store_true')
    options, _ = parser.parse_args(argv)
    if options.md5 and options.sha1:
        parser.error('md5 and sha1 are mutually exclusive')
    return options


def select_hash(options):
    """Return the hash constructor matching *options*.

    xxhash's xxh64 is the default; when it is not installed a warning is
    issued and sha1 is used instead.
    """
    if options.md5:
        return hashlib.md5
    if options.sha1:
        return hashlib.sha1
    try:
        # pip install xxhash #--user
        from xxhash import xxh64
    except ImportError:
        from warnings import warn
        warn('switching to sha1', stacklevel=2)
        return hashlib.sha1
    return xxh64


def read_sizes(lines):
    """Group the paths of a 'path\\tsize\\n' listing by size.

    Returns a dict mapping size to a list of paths, keeping only non-zero
    sizes shared by at least two paths.  Blank lines are skipped; a line
    without a tab or with a non-integer size raises ValueError.
    """
    sizes = defaultdict(list)
    for line in lines:
        record = line.rstrip('\n')
        if not record:
            continue
        # split on the LAST tab so a path may itself contain tabs
        path, size = record.rsplit('\t', 1)
        sizes[int(size)].append(path)
    # remove empty files if ever
    sizes.pop(0, None)
    return _drop_singletons(sizes)


def hash_file(path, message, limit=None, chunk_width=CHUNK_WIDTH):
    """Return the digest of the first *limit* bytes of *path*.

    The whole file is hashed when *limit* is None.  Data is fed to the hash
    object in pieces of at most *chunk_width* bytes.
    """
    state = message()
    remaining = limit
    with open(path, 'rb') as data:
        while remaining is None or remaining > 0:
            wanted = chunk_width if remaining is None else min(chunk_width, remaining)
            chunk = data.read(wanted)
            if not chunk:
                break
            state.update(chunk)
            if remaining is not None:
                remaining -= len(chunk)
    return state.digest()


def _drop_singletons(groups):
    """Return a plain dict holding only the groups with more than one path."""
    return {key: paths for key, paths in groups.items() if len(paths) > 1}


def group_by_header(sizes, header_width):
    """Regroup same-size paths by (size, first *header_width* bytes)."""
    headers = defaultdict(list)
    for size, paths in sizes.items():
        for path in paths:
            with open(path, 'rb') as data:
                header = data.read(header_width)
            headers[(size, header)].append(path)
    return _drop_singletons(headers)


def group_by_fragment(headers, message, header_width, fragment_width):
    """Regroup paths by (size, hash of the first *fragment_width* bytes)."""
    fragments = defaultdict(list)
    # files no larger than the header were already read entirely: hash the
    # bytes kept in memory instead of opening the file again
    for (size, header), paths in headers.items():
        if size <= header_width:
            fragments[(size, message(header).digest())].extend(paths)
    for (size, _header), paths in headers.items():
        if size > header_width:
            for path in paths:
                fragment = hash_file(path, message, fragment_width)
                fragments[(size, fragment)].append(path)
    return _drop_singletons(fragments)


def group_by_checksum(fragments, message, fragment_width):
    """Regroup paths by (size, hash of the whole file)."""
    checksums = defaultdict(list)
    # for files no larger than the fragment, the fragment hash already
    # covers the whole content
    for (size, fragment), paths in fragments.items():
        if size <= fragment_width:
            checksums[(size, fragment)].extend(paths)
    for (size, _fragment), paths in fragments.items():
        if size > fragment_width:
            for path in paths:
                checksums[(size, hash_file(path, message))].append(path)
    return _drop_singletons(checksums)


def find_duplicates(sizes, message, header_width=HEADER_WIDTH,
                    fragment_width=FRAGMENT_WIDTH):
    """Return the groups of duplicate paths found among *sizes*.

    *sizes* maps a size to the paths announced with that size (see
    ``read_sizes``); *message* is a hashlib-style constructor.  The result
    is a list of path lists, each list holding at least two paths.
    """
    headers = group_by_header(sizes, header_width)
    fragments = group_by_fragment(headers, message, header_width, fragment_width)
    # free memory: the header bytes are not needed any more
    del headers
    checksums = group_by_checksum(fragments, message, fragment_width)
    return list(checksums.values())


def report_lines(groups, show_all=False):
    """Yield the output lines for *groups*.

    Without *show_all* the first path of every group is left out; with it
    every path is emitted and each group is followed by an empty line.
    """
    for paths in groups:
        if show_all:
            yield from paths
            yield ''
        else:
            yield from paths[1:]


def main(argv=None):
    """Read the listing from stdin and print the duplicates to stdout."""
    options = parse_options(argv)
    message = select_hash(options)
    header_width = HEADER_WIDTH * (4 if options.h4k else 1)
    sizes = read_sizes(sys.stdin)
    groups = find_duplicates(sizes, message, header_width)
    for line in report_lines(groups, options.all):
        print(line)


if __name__ == '__main__':
    main()
# cython3 --embed ./dupdnp.py
# gcc $( python3-config --cflags --libs ) ./dupdnp.c -o ./dupdnp