diff --git a/README.md b/README.md index 54c1413..2c42824 100644 --- a/README.md +++ b/README.md @@ -7,64 +7,72 @@ scorch is a tool to catalog files and their hashes to help in discovering file c ``` usage: scorch [] [] -scorch (Silent CORruption CHecker) is a tool to catalog files and hashes -to help in discovering file corruption, missing files, duplicates, etc. +scorch (Silent CORruption CHecker) is a tool to catalog files, hash +digests, and other metadata to help in discovering file corruption, +missing files, duplicates, etc. positional arguments: - instruction: * add: compute and store hashes for all found files - * append: compute and store for newly found files - * backup: backs up selected database - * restore: restore backed up database - * list-backups: list database backups - * diff-backup: show diff between current & backup DB - * hashes: print available hash functions - * check: check stored hashes against files - * update: update metadata of changed files - * check+update: check and update if new - * cleanup: remove hashes of missing files - * delete: remove hashes for found files - * list-dups: list files w/ dup hashes - * list-missing: list files no longer on filesystem - * list-solo: list files w/ no dup hashes - * list-unhashed: list files not yet hashed - * list: md5sum'ish compatible listing - * in-db: show if hashed files exist in DB - * found-in-db: print files found in DB - * notfound-in-db: print files not found in DB - directory: Directory or file to scan + instruction: * add: compute & store digests for found files + * append: compute & store digests for unhashed files + * backup: backs up selected database + * restore: restore backed up database + * list-backups: list database backups + * diff-backup: show diff between current & backup DB + * hashes: print available hash functions + * check: check stored info against files + * update: update metadata of changed files + * check+update: check and update if new + * cleanup: remove info of missing files + * delete: remove info for found files + * list: md5sum'ish compatible listing + * list-unhashed: list files not yet hashed + * list-missing: list files no longer on filesystem + * list-dups: list files w/ dup digests + * list-solo: list files w/ no dup digests + * list-failed: list files marked failed + * list-changed: list files marked changed + * in-db: show if files exist in DB + * found-in-db: print files found in DB + * notfound-in-db: print files not found in DB + directory: Directory or file to scan. optional arguments: - -d, --db=: File to store hashes and other metadata in. - (default: /var/tmp/scorch/scorch.db) - -v, --verbose: Make `instruction` more verbose. Actual behavior - depends on the instruction. Can be used multiple - times. - -q, --quote: Shell quote/escape filenames when printed. 
- -r, --restrict=: * sticky: restrict scan to files with sticky bit * readonly: restrict scan to readonly files - -f, --fnfilter=: Restrict actions to files which match regex - -F, --negate-fnfilter Negate the fnfilter regex match - -s, --sort=: Sorting routine on input & output (default: natural) - * random: shuffled / random - * natural: human-friendly sort, ascending - * reverse-natural: human-friendly sort, descending - * radix: RADIX sort, ascending - * reverse-radix: RADIX sort, descending - * time: sort by file mtime, ascending - * reverse-time: sort by file mtime, descending - -m, --maxactions=: Max actions to take before exiting (default: maxint) - -M, --maxdata=: Max bytes to process before exiting (default: maxint) - -b, --break-on-error: Any error or hash failure will exit - -D, --diff-fields=: Fields to use to indicate a file has 'changed' and - and should be rehashed. Combine with ','. - (default: size) - * size - * inode - * mtime - * mode - -H, --hash=: Hash algo. Use 'scorch hashes' get available algos. - (default: md5) - -h, --help: Print this message + -d, --db=: File to store digests and other metadata in. See + docs for info. (default: /var/tmp/scorch/scorch.db) + -v, --verbose: Make `instruction` more verbose. Actual behavior + depends on the instruction. Can be used multiple + times. + -q, --quote: Shell quote/escape filenames when printed. + -r, --restrict=: * sticky: restrict scan to files with sticky bit + * readonly: restrict scan to readonly files + -f, --fnfilter=: Restrict actions to files which match regex. + -F, --negate-fnfilter Negate the fnfilter regex match. + -s, --sort=: Sorting routine on input & output. (default: natural) + * random: shuffled / random + * natural: human-friendly sort, ascending + * natural-desc: human-friendly sort, descending + * radix: RADIX sort, ascending + * radix-desc: RADIX sort, descending + * mtime: sort by file mtime, ascending + * mtime-desc: sort by file mtime, descending + * checked: sort by last time checked, ascending + * checked-desc: sort by last time checked, descending + -m, --maxactions=: Max actions before exiting. (default: maxint) + -M, --maxdata=: Max bytes to process before exiting. (default: maxint) + Can use 'K', 'M', 'G', 'T' suffix. + -T, --maxtime=: Max time to process before exiting. (default: maxint) + Can use 's', 'm', 'h', 'd' suffix. + -b, --break-on-error: Any error or digest mismatch will cause an exit. + -D, --diff-fields=: Fields to use to indicate a file has 'changed' (vs. + bitrot / modified) and should be rehashed. + Combine with ','. (default: size) + * size + * inode + * mtime + * mode + -H, --hash=: Hash algo. Use 'scorch hashes' to get available algos. + (default: md5) + -h, --help: Print this message. exit codes: * 0 : success, behavior executed, something found @@ -73,6 +81,7 @@ exit codes: * 4 : hash mismatch * 8 : found * 16 : not found, nothing processed + * 32 : interrupted ``` ### Database @@ -82,14 +91,19 @@ exit codes: The file is simply CSV compressed with gzip. ``` -$ # file, hash digest, size, mode, mtime, inode +$ # file, hash:digest, size, mode, mtime, inode, state, checked $ zcat /var/tmp/scorch/scorch.db -/tmp/files/a,md5:d41d8cd98f00b204e9800998ecf8427e,0,33188,1546377833.3844686,123456 +/tmp/files/a,md5:d41d8cd98f00b204e9800998ecf8427e,0,33188,1546377833.3844686,123456,O,1588895022.6193066 ``` +The 'state' value can be 'U' for unknown, 'C' for changed, 'F' for failed, or 'O' for OK. + +The 'mtime' and 'checked' values are floating point seconds since epoch. 
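+Since the database is just gzip'd CSV it can be inspected with standard
+tools or a few lines of Python. Below is a minimal sketch of reading it
+(the default database path is assumed; rows written by older versions
+may have only 5 or 6 fields, with no 'state' or 'checked'):
+
+```
+import csv
+import gzip
+
+# Full rows: file, hash:digest, size, mode, mtime, inode, state, checked
+with gzip.open('/var/tmp/scorch/scorch.db', 'rt', newline='') as f:
+    for row in csv.reader(f):
+        filepath, digest = row[0], row[1]
+        state = row[6] if len(row) == 8 else 'U'  # older rows lack state
+        print(state, digest, filepath)
+```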
+ + #### --db argument -The `--db` argument is takes more than a path. +The `--db` argument can take more than a path. * /tmp/test/myfiles.db : Full path. Used as is. * /tmp/test : If /tmp/test is a directory -> /tmp/test/scorch.db @@ -101,11 +115,6 @@ The `--db` argument is takes more than a path. If there is no extension then `.db` will be added. -#### Upgrade - -If you're using an older version of scorch with the default database in `/var/tmp/scorch.db` just copy/move the file to `/var/tmp/scorch/scorch.db`. The old format was not compressed but scorch will handle reading it uncompressed and compressing it on write. - - #### Backup / Restore To simplify backing up the scorch database there is a backup command. Without a directory defined it will store the database to the same location as the database. If directories are added to the arguments then the database backup will be stored there. @@ -149,10 +158,16 @@ $ scorch -v -d /tmp/hash.db list-unhashed /tmp/files /tmp/files/d $ scorch -v -d /tmp/hash.db append /tmp/files -1/1 /tmp/files/d: 2b00042f7481c7b056c4b410d28f33cf +1/1 /tmp/files/d: md5:2b00042f7481c7b056c4b410d28f33cf + +$ scorch -d /tmp/hash.db list-dups /tmp/files +md5:d41d8cd98f00b204e9800998ecf8427e /tmp/files/a /tmp/files/b /tmp/files/c $ scorch -v -d /tmp/hash.db list-dups /tmp/files -d41d8cd98f00b204e9800998ecf8427e /tmp/files/a /tmp/files/b /tmp/files/c +md5:d41d8cd98f00b204e9800998ecf8427e + - /tmp/files/a + - /tmp/files/b + - /tmp/files/c $ echo foo > /tmp/files/a $ scorch -v -d /tmp/hash.db check+update /tmp/files @@ -179,7 +194,7 @@ A typical setup would probably be initialized manually by using **add** or **app ``` #!/bin/sh -scorch check+update /tmp/files +scorch -M 128G -T 2h check+update /tmp/files scorch append /tmp/files scorch cleanup /tmp/files ``` @@ -202,7 +217,10 @@ This software is free to use and released under a very liberal license. 
That sai * PayPal: trapexit@spawn.link * Patreon: https://www.patreon.com/trapexit -* Bitcoin (BTC): 12CdMhEPQVmjz3SSynkAEuD5q9JmhTDCZA -* Bitcoin Cash (BCH): 1AjPqZZhu7GVEs6JFPjHmtsvmDL4euzMzp -* Ethereum (ETH): 0x09A166B11fCC127324C7fc5f1B572255b3046E94 -* Litecoin (LTC): LXAsq6yc6zYU3EbcqyWtHBrH1Ypx4GjUjm +* Bitcoin (BTC): 1DfoUd2m5WCxJAMvcFuvDpT4DR2gWX2PWb +* Bitcoin Cash (BCH): qrf257j0l09yxty4kur8dk2uma8p5vntdcpks72l8z +* Ethereum (ETH): 0xb486C0270fF75872Fc51d85879b9c15C380E66CA +* Litecoin (LTC): LW1rvHRPWtm2NUEMhJpP4DjHZY1FaJ1WYs +* Basic Attention Token (BAT): 0xE651d4900B4C305284Da43E2e182e9abE149A87A +* Zcash (ZEC): t1ZwTgmbQF23DJrzqbAmw8kXWvU2xUkkhTt +* Zcoin (XZC): a8L5Vz35KdCQe7Y7urK2pcCGau7JsqZ5Gw diff --git a/scorch b/scorch index c6105a3..fab284e 100755 --- a/scorch +++ b/scorch @@ -20,6 +20,7 @@ import argparse import collections import csv import errno +import fcntl import gzip import hashlib import io @@ -29,9 +30,11 @@ import random import re import shlex import shutil +import signal import stat import sys import tempfile +import time import zlib from collections import namedtuple from datetime import datetime as dt @@ -46,6 +49,7 @@ ERROR_ARG = 2 ERROR_DIGEST_MISMATCH = 4 ERROR_FOUND = 8 ERROR_NOT_FOUND = 16 +ERROR_INTERRUPTED = 32 class Options(object): @@ -56,30 +60,29 @@ class Options(object): filter = None maxactions = sys.maxsize maxdata = sys.maxsize + maxtime = sys.maxsize breakonerror = False - diff_fields = [] + diff_fields = ['size'] class FileInfo(object): - digest = '' - size = 0 - mode = 0 - mtime = 0 - inode = 0 - - def __init__(self,digest,size,mode,mtime,inode): - self.digest = digest - self.size = size - self.mode = mode - self.mtime = mtime - self.inode = inode + def __init__(self,digest,size,mode,mtime,inode,state='U',checked=0): + self.digest = digest + self.size = size + self.mode = mode + self.mtime = mtime + self.inode = inode + self.state = state + self.checked = checked def __str__(self): return str({'digest': self.digest, 'size': self.size, 'mode': self.mode, 'mtime': self.mtime, - 'inode': self.inode}) + 'inode': self.inode, + 'state': self.state, + 'checked': self.checked}) def __eq__(self, rhs): return ((self.digest == rhs.digest) and @@ -182,7 +185,7 @@ def allnamesequal(name): return all(n==name[0] for n in name[1:]) -def commonprefix(paths, sep='/'): +def commonprefix(paths,sep='/'): bydirectorylevels = zip(*[p.split(sep) for p in paths]) return sep.join(x[0] for x in takewhile(allnamesequal, bydirectorylevels)) @@ -205,60 +208,67 @@ digests, and other metadata to help in discovering file corruption, missing files, duplicates, etc. 
positional arguments: - instruction: * add: compute and store hash digests for found files - * append: compute and store for newly found files - * backup: backs up selected database - * restore: restore backed up database - * list-backups: list database backups - * diff-backup: show diff between current & backup DB - * hashes: print available hash functions - * check: check stored info against files - * update: update metadata of changed files - * check+update: check and update if new - * cleanup: remove info of missing files - * delete: remove info for found files - * list-dups: list files w/ dup digests - * list-missing: list files no longer on filesystem - * list-solo: list files w/ no dup digests - * list-unhashed: list files not yet hashed - * list: md5sum'ish compatible listing - * in-db: show if files exist in DB - * found-in-db: print files found in DB - * notfound-in-db: print files not found in DB - directory: Directory or file to scan + instruction: * add: compute & store digests for found files + * append: compute & store digests for unhashed files + * backup: backs up selected database + * restore: restore backed up database + * list-backups: list database backups + * diff-backup: show diff between current & backup DB + * hashes: print available hash functions + * check: check stored info against files + * update: update metadata of changed files + * check+update: check and update if new + * cleanup: remove info of missing files + * delete: remove info for found files + * list: md5sum'ish compatible listing + * list-unhashed: list files not yet hashed + * list-missing: list files no longer on filesystem + * list-dups: list files w/ dup digests + * list-solo: list files w/ no dup digests + * list-failed: list files marked failed + * list-changed: list files marked changed + * in-db: show if files exist in DB + * found-in-db: print files found in DB + * notfound-in-db: print files not found in DB + directory: Directory or file to scan. optional arguments: - -d, --db=: File to store digests and other metadata in. See - docs for info. (default: /var/tmp/scorch/scorch.db) - -v, --verbose: Make `instruction` more verbose. Actual behavior - depends on the instruction. Can be used multiple - times. - -q, --quote: Shell quote/escape filenames when printed. - -r, --restrict=: * sticky: restrict scan to files with sticky bit - * readonly: restrict scan to readonly files - -f, --fnfilter=: Restrict actions to files which match regex. - -F, --negate-fnfilter Negate the fnfilter regex match. - -s, --sort=: Sorting routine on input & output (default: natural) - * random: shuffled / random - * natural: human-friendly sort, ascending - * reverse-natural: human-friendly sort, descending - * radix: RADIX sort, ascending - * reverse-radix: RADIX sort, descending - * time: sort by file mtime, ascending - * reverse-time: sort by file mtime, descending - -m, --maxactions=: Max actions before exiting (default: maxint) - -M, --maxdata=: Max bytes to process before exiting (default: maxint) - -b, --break-on-error: Any error or digest mismatch will cause an exit. - -D, --diff-fields=: Fields to use to indicate a file has 'changed' (vs. - bitrot / modified) and should be rehashed. - Combine with ','. (default: size) - * size - * inode - * mtime - * mode - -H, --hash=: Hash algo. Use 'scorch hashes' get available algos. - (default: md5) - -h, --help: Print this message + -d, --db=: File to store digests and other metadata in. See + docs for info. 
(default: /var/tmp/scorch/scorch.db) + -v, --verbose: Make `instruction` more verbose. Actual behavior + depends on the instruction. Can be used multiple + times. + -q, --quote: Shell quote/escape filenames when printed. + -r, --restrict=: * sticky: restrict scan to files with sticky bit + * readonly: restrict scan to readonly files + -f, --fnfilter=: Restrict actions to files which match regex. + -F, --negate-fnfilter Negate the fnfilter regex match. + -s, --sort=: Sorting routine on input & output. (default: natural) + * random: shuffled / random + * natural: human-friendly sort, ascending + * natural-desc: human-friendly sort, descending + * radix: RADIX sort, ascending + * radix-desc: RADIX sort, descending + * mtime: sort by file mtime, ascending + * mtime-desc: sort by file mtime, descending + * checked: sort by last time checked, ascending + * checked-desc: sort by last time checked, descending + -m, --maxactions=: Max actions before exiting. (default: maxint) + -M, --maxdata=: Max bytes to process before exiting. (default: maxint) + Can use 'K', 'M', 'G', 'T' suffix. + -T, --maxtime=: Max time to process before exiting. (default: maxint) + Can use 's', 'm', 'h', 'd' suffix. + -b, --break-on-error: Any error or digest mismatch will cause an exit. + -D, --diff-fields=: Fields to use to indicate a file has 'changed' (vs. + bitrot / modified) and should be rehashed. + Combine with ','. (default: size) + * size + * inode + * mtime + * mode + -H, --hash=: Hash algo. Use 'scorch hashes' to get available algos. + (default: md5) + -h, --help: Print this message. exit codes: * 0 : success, behavior executed, something found @@ -267,6 +277,7 @@ exit codes: * 4 : hash mismatch * 8 : found * 16 : not found, nothing processed + * 32 : interrupted ''' print(help) @@ -286,6 +297,7 @@ def build_arg_parser(): 'cleanup','list', 'list-unhashed','list-dups', 'list-solo','list-missing', + 'list-failed','list-changed', 'in-db', 'found-in-db','notfound-in-db'], nargs='?') @@ -307,9 +319,12 @@ def build_arg_parser(): parser.add_argument('-q','--quote', action='store_true', default=False) parser.add_argument('-s','--sort', - choices=['none','radix','reverse-radix', - 'natural','reverse-natural','random', - 'time','reverse-time'], + choices=['none', + 'random', + 'radix','radix-desc', + 'natural','natural-desc', + 'mtime','mtime-desc', + 'checked','checked-desc'], default='natural') parser.add_argument('-m','--maxactions', type=int, @@ -317,6 +332,9 @@ parser.add_argument('-M','--maxdata', type=str, default=str(sys.maxsize)) + parser.add_argument('-T','--maxtime', + type=str, + default=str(sys.maxsize)) parser.add_argument('-b','--break-on-error', action='store_true', default=False) @@ -367,6 +385,8 @@ def get_files(basepath,filefilter,db={}): def filter_files(files,filefilter,other=(lambda f: False)): + if filefilter.basepath == '/': + return [(filepath,fi) for (filepath,fi) in files] if filefilter.basepath in files: return files @@ -389,7 +409,7 @@ def get_fileinfo(filepath): mode=st.st_mode, mtime=st.st_mtime, inode=st.st_ino) - except: + except IOError as e: return None @@ -430,6 +450,22 @@ def human_to_bytes(s): return i +def human_to_time(s): + m = s[-1] + if m in ['s','S']: + t = int(s[0:-1]) + elif m in ['m','M']: + t = int(s[0:-1]) * 60 + elif m in ['h','H']: + t = int(s[0:-1]) * 60 * 60 + elif m in ['d','D']: + t = int(s[0:-1]) * 60 * 60 * 24 + else: + t = int(s) + + return t + + def humansize(nbytes): suffixes = ['B','KB','MB','GB','TB','PB','ZB'] rank = int(math.log(nbytes,1024)) if nbytes else 0 @@ -442,13 
+478,13 @@ def humansize(nbytes): return '{}{}'.format(f, suffixes[rank]) -def inst_hashes(opts,path,db,dbadd,dbremove): +def inst_hashes(opts,path,db,dbremove): for hash in available_hashes(): print(hash) return SUCCESS -def inst_add(opts,path,db,dbadd,dbremove): +def inst_add(opts,path,db,dbremove): rv = ERROR_NOT_FOUND err = SUCCESS; filepaths = get_files(path,opts.filter) @@ -456,12 +492,15 @@ def inst_add(opts,path,db,dbremove): actions = 0 processed = 0 + endtime = time.time() + opts.maxtime total = min(opts.maxactions,len(filepaths)) for (filepath,fi) in filepaths: if actions >= opts.maxactions: break if processed >= opts.maxdata: break + if time.time() >= endtime: + break actions += 1 processed += fi.size @@ -470,12 +509,15 @@ def inst_add(opts,path,db,dbremove): print_filepath(filepath,actions,total,opts.quote) try: - fi.digest = hash_file(filepath,opts.hash) - dbadd[filepath] = fi - rv = SUCCESS + fi.digest = hash_file(filepath,opts.hash) + fi.state = 'O' + fi.checked = time.time() if opts.verbose: print(':',fi.digest) + append_to_db(opts.dbpath,filepath,fi) + rv = SUCCESS except (KeyboardInterrupt,SystemExit): + err = err | ERROR_INTERRUPTED break except Exception as e: err = err | ERROR_PROCESSING @@ -487,7 +529,7 @@ def inst_add(opts,path,db,dbremove): return rv | err -def inst_append(opts,path,db,dbadd,dbremote): +def inst_append(opts,path,db,dbremove): rv = ERROR_NOT_FOUND err = SUCCESS filepaths = get_files(path,opts.filter,db) @@ -495,12 +537,15 @@ def inst_append(opts,path,db,dbremove): actions = 0 processed = 0 + endtime = time.time() + opts.maxtime total = min(opts.maxactions,len(filepaths)) for (filepath,fi) in filepaths: if actions >= opts.maxactions: break if processed >= opts.maxdata: break + if time.time() >= endtime: + break actions += 1 processed += fi.size @@ -509,12 +554,15 @@ def inst_append(opts,path,db,dbremove): print_filepath(filepath,actions,total,opts.quote) try: - fi.digest = hash_file(filepath,opts.hash) - dbadd[filepath] = fi - rv = SUCCESS + fi.digest = hash_file(filepath,opts.hash) + fi.state = 'O' + fi.checked = time.time() if opts.verbose: print(':',fi.digest) + append_to_db(opts.dbpath,filepath,fi) + rv = SUCCESS except (KeyboardInterrupt,SystemExit): + err = err | ERROR_INTERRUPTED break except Exception as e: err = err | ERROR_PROCESSING @@ -567,7 +615,7 @@ def different_files(old,new,fields): (('mode' in fields) and (old.mode != new.mode))) -def inst_check(opts,path,db,dbadd,dbremove,update=False): +def inst_check(opts,path,db,dbremove,update=False): rv = ERROR_NOT_FOUND err = SUCCESS filepaths = filter_files(db.items(),opts.filter) @@ -575,12 +623,15 @@ def inst_check(opts,path,db,dbremove,update=False): actions = 0 processed = 0 + endtime = time.time() + opts.maxtime total = min(opts.maxactions,len(filepaths)) for (filepath,oldfi) in filepaths: if actions >= opts.maxactions: break if processed >= opts.maxdata: break + if time.time() >= endtime: + break actions += 1 processed += oldfi.size @@ -604,12 +655,16 @@ def inst_check(opts,path,db,dbremove,update=False): print(' - digest: {} -> '.format(oldfi.digest),end='') if update: - newfi.digest = hash_file(filepath,opts.hash) - print('{}'.format(newfi.digest)) - - dbadd[filepath] = newfi + newfi.digest = hash_file(filepath,opts.hash) + newfi.state = 'O' + newfi.checked = time.time() + print(newfi.digest) + append_to_db(opts.dbpath,filepath,newfi) else: + oldfi.state = 'C' + oldfi.checked = time.time() print('not calculated') + 
append_to_db(opts.dbpath,filepath,oldfi) else: if opts.verbose: print_filepath(filepath,actions,total,opts.quote) @@ -617,6 +672,8 @@ def inst_check(opts,path,db,dbadd,dbremove,update=False): newfi.digest = hash_file(filepath,oldfi.digest) if newfi.digest != oldfi.digest: err = err | ERROR_DIGEST_MISMATCH + oldfi.state = 'F' + oldfi.checked = time.time() if not opts.verbose: print_filepath(filepath,actions,total,opts.quote) print(': FAILED') @@ -624,6 +681,7 @@ def inst_check(opts,path,db,dbadd,dbremove,update=False): for change in changes: print(change) print(' - digest: {} -> {}'.format(oldfi.digest,newfi.digest)) + append_to_db(opts.dbpath,filepath,oldfi) if opts.breakonerror: break elif opts.verbose: @@ -633,11 +691,16 @@ def inst_check(opts,path,db,dbadd,dbremove,update=False): for change in changes: print(change) print(' - digest: {} (unchanged)'.format(oldfi.digest)) - dbadd[filepath] = newfi + newfi.state = 'O' + newfi.checked = time.time() + append_to_db(opts.dbpath,filepath,newfi) else: + oldfi.state = 'O' + oldfi.checked = time.time() print(': OK') - + append_to_db(opts.dbpath,filepath,oldfi) except (KeyboardInterrupt,SystemExit): + err = err | ERROR_INTERRUPTED break except Exception as e: err = err | ERROR_PROCESSING @@ -649,23 +712,26 @@ def inst_check(opts,path,db,dbadd,dbremove,update=False): return rv | err -def inst_check_and_update(opts,path,db,dbadd,dbremove): - return inst_check(opts,path,db,dbadd,dbremove,update=True) +def inst_check_and_update(opts,path,db,dbremove): + return inst_check(opts,path,db,dbremove,update=True) -def inst_update(opts,path,db,dbadd,dbremove): +def inst_update(opts,path,db,dbremove): rv = SUCCESS filepaths = filter_files(db.items(),opts.filter) opts.sort(filepaths) actions = 0 processed = 0 + endtime = time.time() + opts.maxtime total = min(opts.maxactions,len(filepaths)) for (filepath,oldfi) in filepaths: if actions >= opts.maxactions: break if processed >= opts.maxdata: break + if time.time() >= endtime: + break actions += 1 processed += oldfi.size @@ -699,8 +765,11 @@ def inst_update(opts,path,db,dbadd,dbremove): else: print('{} (unchanged)'.format(oldfi.digest)) - dbadd[filepath] = newfi + newfi.state = 'O' + newfi.checked = time.time() + append_to_db(opts.dbpath,filepath,newfi) except (KeyboardInterrupt,SystemExit): + rv = rv | ERROR_INTERRUPTED break except Exception as e: rv = rv | ERROR_PROCESSING @@ -712,7 +781,7 @@ def inst_update(opts,path,db,dbadd,dbremove): return rv -def inst_delete(opts,path,db,dbadd,dbremove): +def inst_delete(opts,path,db,dbremove): rv = ERROR_NOT_FOUND err = SUCCESS filepaths = filter_files(db.items(),opts.filter) @@ -720,12 +789,15 @@ def inst_delete(opts,path,db,dbadd,dbremove): actions = 0 processed = 0 + endtime = time.time() + opts.maxtime total = min(opts.maxactions,len(filepaths)) for (filepath,fi) in filepaths: if actions >= opts.maxactions: break if processed >= opts.maxdata: break + if time.time() >= endtime: + break actions += 1 processed += fi.size @@ -741,7 +813,7 @@ def inst_delete(opts,path,db,dbadd,dbremove): return rv | err -def inst_cleanup(opts,path,db,dbadd,dbremove): +def inst_cleanup(opts,path,db,dbremove): rv = ERROR_NOT_FOUND err = SUCCESS filepaths = filter_files(db.items(),opts.filter,os.path.exists) @@ -749,12 +821,15 @@ def inst_cleanup(opts,path,db,dbadd,dbremove): actions = 0 processed = 0 + endtime = time.time() + opts.maxtime total = min(opts.maxactions,len(filepaths)) for (filepath,fi) in filepaths: if actions >= opts.maxactions: break if processed >= opts.maxdata: break + if 
time.time() >= endtime: + break actions += 1 processed += fi.size @@ -768,7 +843,7 @@ def inst_cleanup(opts,path,db,dbadd,dbremove): return rv | err -def inst_list(opts,path,db,dbadd,dbremove): +def inst_list(opts,path,db,dbremove): rv = ERROR_NOT_FOUND err = SUCCESS filepaths = filter_files(db.items(),opts.filter) @@ -776,19 +851,21 @@ def inst_list(opts,path,db,dbadd,dbremove): actions = 0 processed = 0 + endtime = time.time() + opts.maxtime for (filepath,fi) in filepaths: if actions >= opts.maxactions: break if processed >= opts.maxdata: break + if time.time() >= endtime: + break actions += 1 processed += fi.size rv = SUCCESS if not opts.verbose: - filepath = filepath[len(path)+1:] - filepath = os.path.join('.',filepath) + filepath = os.path.relpath(filepath,path) if opts.quote: filepath = shlex.quote(filepath) @@ -797,7 +874,7 @@ def inst_list(opts,path,db,dbadd,dbremove): return rv | err -def inst_list_unhashed(opts,path,db,dbadd,dbremove): +def inst_list_unhashed(opts,path,db,dbremove): rv = ERROR_NOT_FOUND err = SUCCESS filepaths = get_files(path,opts.filter,db) @@ -805,26 +882,28 @@ def inst_list_unhashed(opts,path,db,dbadd,dbremove): actions = 0 processed = 0 + endtime = time.time() + opts.maxtime for (filepath,fi) in filepaths: if actions >= opts.maxactions: break if processed >= opts.maxdata: break + if time.time() >= endtime: + break actions += 1 processed += fi.size rv = SUCCESS if not opts.verbose: - filepath = filepath[len(path)+1:] - filepath = os.path.join('.',filepath) + filepath = os.path.relpath(filepath,path) print_filepath(filepath,quote=opts.quote,end='\n') return rv | err -def inst_list_dups(opts,path,db,dbadd,dbremove): +def inst_list_dups(opts,path,db,dbremove): rv = ERROR_NOT_FOUND err = SUCCESS hashdb = {} @@ -838,12 +917,15 @@ def inst_list_dups(opts,path,db,dbadd,dbremove): hashdb[fi.digest].append(filepath) actions = 0 + endtime = time.time() + opts.maxtime for (digest,filepaths) in hashdb.items(): if len(filepaths) <= 1: continue if actions >= opts.maxactions: break + if time.time() >= endtime: + break actions += 1 @@ -859,7 +941,7 @@ def inst_list_dups(opts,path,db,dbadd,dbremove): return rv | err -def inst_list_solo(opts,path,db,dbadd,dbremove): +def inst_list_solo(opts,path,db,dbremove): rv = ERROR_NOT_FOUND err = SUCCESS hashdb = {} @@ -873,12 +955,15 @@ def inst_list_solo(opts,path,db,dbadd,dbremove): hashdb[fi.digest].append(filepath) actions = 0 + endtime = time.time() + opts.maxtime for (digest,filepaths) in hashdb.items(): if len(filepaths) > 1: continue if actions >= opts.maxactions: return rv + if time.time() >= endtime: + break actions += 1 @@ -890,7 +975,7 @@ def inst_list_solo(opts,path,db,dbadd,dbremove): return rv | err -def inst_list_missing(opts,path,db,dbadd,dbremove): +def inst_list_missing(opts,path,db,dbremove): rv = ERROR_NOT_FOUND err = SUCCESS filepaths = get_files(path,opts.filter) @@ -898,6 +983,7 @@ def inst_list_missing(opts,path,db,dbadd,dbremove): actions = 0 processed = 0 + endtime = time.time() + opts.maxtime output = [] for (filepath,fi) in db.items(): if commonprefix([path,filepath]) != path: @@ -910,6 +996,8 @@ def inst_list_missing(opts,path,db,dbadd,dbremove): break if processed >= opts.maxdata: break + if time.time() >= endtime: + break actions += 1 processed += fi.size @@ -926,7 +1014,7 @@ def inst_list_missing(opts,path,db,dbadd,dbremove): return rv | err -def inst_in_db(opts,path,db,dbadd,dbremove): +def inst_in_db(opts,path,db,dbremove): rv = SUCCESS sizedb = set() hashdb = {} @@ -939,6 +1027,7 @@ def 
inst_in_db(opts,path,db,dbadd,dbremove): actions = 0 processed = 0 + endtime = time.time() + opts.maxtime filepaths = get_files(path,opts.filter) total = min(opts.maxactions,len(filepaths)) for (filepath,fi) in filepaths: @@ -947,6 +1036,8 @@ def inst_in_db(opts,path,db,dbadd,dbremove): break if processed >= opts.maxdata: break + if time.time() >= endtime: + break actions += 1 processed += fi.size @@ -964,6 +1055,7 @@ def inst_in_db(opts,path,db,dbadd,dbremove): rv = rv | ERROR_FOUND print(': YES') except (KeyboardInterrupt,SystemExit): + rv = rv | ERROR_INTERRUPTED break except Exception as e: rv = rv | ERROR_PROCESSING @@ -972,7 +1064,7 @@ def inst_in_db(opts,path,db,dbadd,dbremove): return rv -def inst_found_in_db(opts,path,db,dbadd,dbremove): +def inst_found_in_db(opts,path,db,dbremove): rv = SUCCESS sizedb = set() hashdb = {} @@ -986,6 +1078,7 @@ def inst_found_in_db(opts,path,db,dbadd,dbremove): actions = 0 processed = 0 + endtime = time.time() + opts.maxtime filepaths = get_files(path,opts.filter) for (filepath,fi) in filepaths: try: @@ -993,6 +1086,8 @@ def inst_found_in_db(opts,path,db,dbadd,dbremove): break if processed >= opts.maxdata: break + if time.time() >= endtime: + break actions += 1 processed += fi.size @@ -1009,13 +1104,13 @@ def inst_found_in_db(opts,path,db,dbadd,dbremove): rv = rv | ERROR_FOUND if opts.verbose in [1,2]: if opts.verbose == 1: - filepath = os.path.join('.',filepath[len(path)+1:]) + filepath = os.path.relpath(filepath,path) if opts.quote: filepath = shlex.quote(filepath) print(filepath) elif opts.verbose in [3,4]: if opts.verbose == 3: - filepath = os.path.join('.',filepath[len(path)+1:]) + filepath = os.path.relpath(filepath,path) if opts.quote: filepath = shlex.quote(filepath) print(digest,filepath) @@ -1023,6 +1118,7 @@ def inst_found_in_db(opts,path,db,dbadd,dbremove): t = tuple([digest,filepath] + hashdb[digest]) writer.writerow(t) except (KeyboardInterrupt,SystemExit): + rv = rv | ERROR_INTERRUPTED break except Exception as e: rv = rv | ERROR_PROCESSING @@ -1031,7 +1127,7 @@ def inst_found_in_db(opts,path,db,dbadd,dbremove): return rv -def inst_notfound_in_db(opts,path,db,dbadd,dbremove): +def inst_notfound_in_db(opts,path,db,dbremove): rv = SUCCESS hashes = set() sizes = set() @@ -1041,6 +1137,7 @@ def inst_notfound_in_db(opts,path,db,dbadd,dbremove): actions = 0 processed = 0 + endtime = time.time() + opts.maxtime filepaths = get_files(path,opts.filter) for (filepath,fi) in filepaths: try: @@ -1048,6 +1145,8 @@ def inst_notfound_in_db(opts,path,db,dbadd,dbremove): break if processed >= opts.maxdata: break + if time.time() >= endtime: + break actions += 1 processed += fi.size @@ -1062,7 +1161,7 @@ def inst_notfound_in_db(opts,path,db,dbadd,dbremove): elif opts.verbose in [1,2]: printpath = filepath if opts.verbose == 1: - printpath = os.path.join('.',filepath[len(path)+1:]) + printpath = os.path.relpath(filepath,path) if opts.quote: printpath = shlex.quote(printpath) @@ -1079,7 +1178,7 @@ def inst_notfound_in_db(opts,path,db,dbadd,dbremove): elif opts.verbose in [3,4]: printpath = filepath if opts.verbose == 3: - printpath = os.path.join('.',filepath[len(path)+1:]) + printpath = os.path.relpath(filepath,path) if opts.quote: printpath = shlex.quote(printpath) @@ -1090,6 +1189,7 @@ def inst_notfound_in_db(opts,path,db,dbadd,dbremove): else: rv = rv | ERROR_FOUND except (KeyboardInterrupt,SystemExit): + rv = rv | ERROR_INTERRUPTED break except Exception as e: rv = rv | ERROR_PROCESSING @@ -1098,30 +1198,29 @@ def 
inst_notfound_in_db(opts,path,db,dbadd,dbremove): return rv -def inst_backup(opts,path,db,dbadd,dbremove): +def inst_backup(opts,path,db,dbremove): timestamp = dt.utcnow().replace(microsecond=0).isoformat() + 'Z' basepath = os.path.dirname(opts.dbpath) filename = os.path.basename(opts.dbpath) filename = '{}.backup_{}'.format(filename,timestamp) tgt_dbpath = os.path.join(path,filename) - tmp_dbpath = None try: - (fd,tmp_dbpath) = tempfile.mkstemp(dir=basepath) - os.close(fd) - shutil.copy2(opts.dbpath,tmp_dbpath) - os.replace(tmp_dbpath,tgt_dbpath) + with tempfile.NamedTemporaryFile(dir=basepath,delete=False) as tgtfile: + with open(opts.dbpath,'rb') as srcfile: + fcntl.flock(srcfile,fcntl.LOCK_EX) + shutil.copyfileobj(srcfile,tgtfile) + fcntl.flock(srcfile,fcntl.LOCK_UN) + os.replace(tgtfile.name,tgt_dbpath) if opts.verbose: print(tgt_dbpath) return SUCCESS except Exception as e: - if tmp_dbpath: - os.remove(tmp_dbpath) print('Error backing up: {} - {}'.format(tgt_dbpath,e),file=sys.stderr) return ERROR_PROCESSING -def inst_restore(opts,path,db,dbadd,dbremove): +def inst_restore(opts,path,db,dbremove): basepath = os.path.dirname(opts.dbpath) tmp_dbpath = None @@ -1138,7 +1237,7 @@ def inst_restore(opts,path,db,dbadd,dbremove): return ERROR_PROCESSING -def inst_list_backups(opts,path,db,dbadd,dbremove): +def inst_list_backups(opts,path,db,dbremove): try: filename = os.path.basename(opts.dbpath) prefix = '{}.backup_'.format(filename) @@ -1162,7 +1261,7 @@ def inst_list_backups(opts,path,db,dbadd,dbremove): return ERROR_PROCESSING -def inst_diff_backup(opts,path,db,dbadd,dbremove): +def inst_diff_backup(opts,path,db,dbremove): backup_db = read_db(path) for (k,v) in db.items(): @@ -1178,6 +1277,60 @@ def inst_diff_backup(opts,path,db,dbadd,dbremove): return SUCCESS +def inst_list_failed(opts,path,db,dbremove): + filepaths = filter_files(db.items(),opts.filter) + opts.sort(filepaths) + + actions = 0 + processed = 0 + endtime = time.time() + opts.maxtime + for (filepath,fi) in filepaths: + if actions >= opts.maxactions: + break + if processed >= opts.maxdata: + break + if time.time() >= endtime: + break + + actions += 1 + processed += fi.size + + if fi.state != 'F': + continue + if opts.quote: + filepath = shlex.quote(filepath) + print(filepath) + + return SUCCESS + + +def inst_list_changed(opts,path,db,dbremove): + filepaths = filter_files(db.items(),opts.filter) + opts.sort(filepaths) + + actions = 0 + processed = 0 + endtime = time.time() + opts.maxtime + for (filepath,fi) in filepaths: + if actions >= opts.maxactions: + break + if processed >= opts.maxdata: + break + if time.time() >= endtime: + break + + actions += 1 + processed += fi.size + + if fi.state != 'C': + continue + if opts.quote: + filepath = shlex.quote(filepath) + print(filepath) + + return SUCCESS + + def is_not_sticky(fi): return not bool(fi.mode & stat.S_ISVTX) @@ -1204,7 +1357,6 @@ def fnfilter_fun(regex,negate): Instruction = namedtuple('Instruction',['fun','needs_dirs','load_db']) - INSTRUCTIONS = { 'hashes': Instruction(inst_hashes,False,False), 'add': Instruction(inst_add,True,False), @@ -1223,6 +1375,8 @@ INSTRUCTIONS = { 'list-dups': Instruction(inst_list_dups,True,True), 'list-solo': Instruction(inst_list_solo,True,True), 'list-missing': Instruction(inst_list_missing,True,True), + 'list-failed': Instruction(inst_list_failed,True,True), + 'list-changed': Instruction(inst_list_changed,True,True), 'in-db': Instruction(inst_in_db,True,True), 'found-in-db': Instruction(inst_found_in_db,True,True), 'notfound-in-db': 
Instruction(inst_notfound_in_db,True,True) @@ -1230,102 +1384,143 @@ INSTRUCTIONS = { def sort_fun(sort): - if sort == 'radix': + if sort == 'random': + return (lambda l: random.shuffle(l)) + elif sort == 'radix': return (lambda l: l.sort()) - elif sort == 'reverse-radix': + elif sort == 'radix-desc': return (lambda l: l.sort(reverse=True)) - elif sort == 'random': - return (lambda l: random.shuffle(l)) elif sort == 'natural': cre = re.compile('(\d+)') sort_key = lambda s: [int(t) if t.isdigit() else t.lower() for t in re.split(cre,s[0])] return (lambda l: l.sort(key=sort_key)) - elif sort == 'reverse-natural': + elif sort == 'natural-desc': cre = re.compile('(\d+)') sort_key = lambda s: [int(t) if t.isdigit() else t.lower() for t in re.split(cre,s[0])] return (lambda l: l.sort(key=sort_key,reverse=True)) - elif sort == 'time': + elif sort == 'mtime': sort_key = lambda s: s[1].mtime return (lambda l: l.sort(key=sort_key)) - elif sort == 'reverse-time': + elif sort == 'mtime-desc': sort_key = lambda s: s[1].mtime return (lambda l: l.sort(key=sort_key,reverse=True)) + elif sort == 'checked': + sort_key = lambda s: s[1].checked + return (lambda l: l.sort(key=sort_key)) + elif sort == 'checked-desc': + sort_key = lambda s: s[1].checked + return (lambda l: l.sort(key=sort_key,reverse=True)) return (lambda l: None) -def open_db(filepath): - try: - f = gzip.open(filepath,'rt',encoding='utf8',errors='surrogateescape',newline='') - f.read(10) - f.seek(0) - return f - except Exception as e: - return open(filepath,'rt',encoding='utf8',errors='surrogateescape',newline='') +def gzip_open_r(filepath): + return gzip.open(filepath, + 'rt', + encoding='utf8', + errors='surrogateescape', + newline='') +def gzip_open_w(filepath): + return gzip.open(filepath, + 'wt', + encoding='utf8', + errors='surrogateescape', + newline='') -def read_db(filepath): +def gzip_open_append(filepath): + return gzip.open(filepath, + 'at', + encoding='utf8', + errors='surrogateescape', + newline='') + + +def read_db_from_fd(f): db = {} - try: - with open_db(filepath) as f: - reader = csv.reader(f,delimiter=',',quotechar='"') - for row in reader: - if len(row) == 5: - (filename,digest,size,mode,mtime) = row - inode = 0 - elif len(row) == 6: - (filename,digest,size,mode,mtime,inode) = row - else: - raise IOError('unknown data layout in scorch database') - - if ':' not in digest: - digest = 'md5:'+digest - db[filename]=FileInfo(digest, - int(size), - int(mode), - float(mtime), - int(inode)) - except (KeyboardInterrupt,SystemExit): - raise - except Exception as e: - msg = 'Error reading scorch DB: {} - {}'.format(filepath,e) - print(msg,file=sys.stderr) + reader = csv.reader(f,delimiter=',',quotechar='"') + for row in reader: + rowlen = len(row) + if rowlen == 8: + (filename,digest,size,mode, + mtime,inode,state,checked) = row + elif rowlen == 6: + (filename,digest,size,mode,mtime,inode) = row + state = 'U' + checked = 0 + elif rowlen == 5: + (filename,digest,size,mode,mtime) = row + inode = 0 + state = 'U' + checked = 0 + else: + raise IOError('unknown data layout in scorch database') + + if ':' not in digest: + digest = 'md5:'+digest + db[filename]=FileInfo(digest, + int(size), + int(mode), + float(mtime), + int(inode), + state, + float(checked)) return db -def write_db_core(filepath,db): - basepath = os.path.dirname(filepath) - os.makedirs(basepath,mode=0o775,exist_ok=True) - (fd,tmpfilepath) = tempfile.mkstemp(dir=basepath) - os.close(fd) +def read_db(filepath): + db = {} try: - with 
gzip.open(tmpfilepath,'wt',encoding='utf8',errors='surrogateescape',newline='') as f: - writer = csv.writer(f,delimiter=',') - for (k,v) in db.items(): - row = (k,v.digest,v.size,v.mode,v.mtime,v.inode) - writer.writerow(row) - os.replace(tmpfilepath,filepath) - except: - os.remove(tmpfilepath) - raise + with gzip_open_r(filepath) as f: + fcntl.flock(f,fcntl.LOCK_EX) + db = read_db_from_fd(f) + fcntl.flock(f,fcntl.LOCK_UN) + except IOError as e: + if e.errno != errno.ENOENT: + raise + return db -def write_db(src_filepath,tgt_filepath,dbadd,dbremove): +def write_db(dbpath,dbremove): try: - db = read_db(src_filepath) - - for k in dbremove: - del db[k] - for (k,v) in dbadd.items(): - db[k] = v; - - write_db_core(tgt_filepath,db) - except (KeyboardInterrupt,SystemExit): - raise + signal.signal(signal.SIGINT,signal.SIG_IGN) + with gzip_open_r(dbpath) as fr: + fcntl.flock(fr,fcntl.LOCK_EX) + db = read_db_from_fd(fr) + for filepath in dbremove: + del db[filepath] + + with gzip_open_w(dbpath) as fw: + writer = csv.writer(fw,delimiter=',') + for (k,v) in db.items(): + row = (k,v.digest,v.size,v.mode, + v.mtime,v.inode,v.state, + v.checked) + writer.writerow(row) + + fcntl.flock(fr,fcntl.LOCK_UN) except Exception as e: - msg = 'Error writing scorch DB: {} - {}'.format(tgt_filepath,e) + msg = 'Error writing scorch DB: {} - {}'.format(dbpath,e) print(msg,file=sys.stderr) + finally: + signal.signal(signal.SIGINT,signal.SIG_DFL) + + +def append_to_db(dbpath,filepath,fi): + try: + signal.signal(signal.SIGINT,signal.SIG_IGN) + with gzip_open_append(dbpath) as f: + fcntl.flock(f,fcntl.LOCK_EX) + writer = csv.writer(f,delimiter=',') + row = (filepath,fi.digest,fi.size,fi.mode, + fi.mtime,fi.inode,fi.state,fi.checked) + writer.writerow(row) + fcntl.flock(f,fcntl.LOCK_UN) + except PermissionError as e: + pass + finally: + signal.signal(signal.SIGINT,signal.SIG_DFL) def process_directories(dirs): @@ -1338,7 +1533,7 @@ def process_directories(dirs): return rv -def process_dbpath(dbpath): +def calculate_dbpath(dbpath): if dbpath[0] == '/': pass elif '/' in dbpath: @@ -1358,6 +1553,18 @@ def calculate_dbpath(dbpath): return dbpath +def mtime(filepath): + try: + return os.lstat(filepath).st_mtime + except OSError: + return 0 + + +def check_db(dbpath): + if not os.access(dbpath,os.W_OK): + print('WARNING: unable to write to database -',dbpath,file=sys.stderr) + + def main(): sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors="surrogateescape", @@ -1383,8 +1590,9 @@ def main(): opts.sort = sort_fun(args.sort) opts.maxactions = args.maxactions opts.maxdata = human_to_bytes(args.maxdata) + opts.maxtime = human_to_time(args.maxtime) opts.breakonerror = args.break_on_error - opts.dbpath = process_dbpath(args.db) + opts.dbpath = calculate_dbpath(args.db) opts.diff_fields = args.diff_fields.split(',') if not args.dir: @@ -1400,37 +1608,36 @@ def main(): directories = process_directories(args.dir) load_db = INSTRUCTIONS[args.inst].load_db + check_db(opts.dbpath) + rv = SUCCESS try: for directory in directories: db = {} - dbadd = {} dbremove = [] + db_mtime = mtime(opts.dbpath) if load_db: db = read_db(opts.dbpath) opts.filter = FileFilter(basepath=directory, fnfilter=fnfilter, fifilter=fifilter) - try: - rv = rv | func(opts,directory,db,dbadd,dbremove) + rv = rv | func(opts,directory,db,dbremove) except (KeyboardInterrupt,SystemExit): - pass + rv = rv | ERROR_INTERRUPTED - if len(dbadd) or len(dbremove): - write_db(opts.dbpath,opts.dbpath,dbadd,dbremove) + if (len(dbremove) or (db_mtime != mtime(opts.dbpath))): + 
write_db(opts.dbpath,dbremove) if opts.breakonerror and rv: break except (KeyboardInterrupt,SystemExit): - pass + rv = rv | ERROR_INTERRUPTED except IOError as e: if e.errno != errno.EPIPE: - rv = rv | ERROR_PROCESSING - print('General processing error:',e,file=sys.stderr) + raise except Exception as e: - rv = rv | ERROR_PROCESSING raise sys.exit(rv)