dupechecker.dupechecker
"""Find duplicate files by size and content, optionally deleting extra copies."""

import argparse
import filecmp
import time
from concurrent.futures import ThreadPoolExecutor
from copy import deepcopy

from griddle import griddy
from noiftimer import Timer
from pathier import Pathier
from printbuddies import Spinner
from younotyou import younotyou


def find_dupes(paths: list[Pathier]) -> list[list[Pathier]]:
    """Return a list of lists for duplicate files in `paths`.

    Works on a deep copy of `paths`, so the caller's list is not modified.
    Files are compared by content (`filecmp.cmp` with `shallow=False`).
    Files that match nothing else are left out of the result.
    """
    remaining = deepcopy(paths)
    matching_sets = []
    while remaining:
        comparee = remaining.pop()
        matching_files = [
            file for file in remaining if filecmp.cmp(comparee, file, False)
        ]
        if not matching_files:
            continue
        # Matched files already belong to a set; don't compare them again.
        for file in matching_files:
            remaining.remove(file)
        matching_sets.append([comparee, *matching_files])
    return matching_sets


def group_by_size(paths: list[Pathier]) -> list[list[Pathier]]:
    """Returns a list of lists where each sublist is a list of files that have the same size."""
    sizes = {}
    for path in paths:
        sizes.setdefault(path.size, []).append(path)
    return list(sizes.values())


def delete_wizard(matches: list[list[Pathier]]):
    """Ask which file to keep for each set and delete the others.

    For every set in `matches` the files are listed with 1-based numbers.
    Entering a listed number keeps that file and deletes the rest of the set.
    An empty answer skips the set. Any other answer is rejected and the
    prompt is repeated, so a typo can never delete every copy.
    """
    print()
    print("Enter the corresponding number of the file to keep.")
    print(
        "Press 'Enter' without giving a number to skip deleting any files for the given set."
    )
    print()
    for match in matches:
        map_ = {str(i): file for i, file in enumerate(match, 1)}
        options = "\n".join(f"({i}) {file}" for i, file in map_.items()) + "\n"
        print(options)
        prompt = f"Enter number of file to keep ({', '.join(map_)}): "
        keeper = input(prompt).strip()
        # Only a listed number may trigger deletion; otherwise ask again.
        while keeper and keeper not in map_:
            print(f"'{keeper}' is not one of the listed numbers.")
            keeper = input(prompt).strip()
        if keeper:
            for num, file in map_.items():
                if num != keeper:
                    file.delete()
        print()


def autodelete(matches: list[list[Pathier]]):
    """Keep one of each set in `matches` and delete the others.

    The last file of each set is the one kept. `matches` and its sublists
    are not modified; an empty set is ignored.
    """
    for match in matches:
        for file in match[:-1]:
            file.delete()


def dupechecker(paths: list[Pathier]) -> list[list[Pathier]]:
    """Return the sets of duplicate files found in `paths`.

    `paths` is split with `group_by_size` and every group is passed to
    `find_dupes` on a thread pool while a spinner is displayed.
    """
    grouped_paths = group_by_size(paths)
    with Spinner() as spinner, ThreadPoolExecutor() as exc:
        threads = [exc.submit(find_dupes, group) for group in grouped_paths]
        # Poll instead of blocking so the spinner keeps animating.
        while not all(thread.done() for thread in threads):
            spinner.display()
            time.sleep(0.025)
        matches = [dupes for thread in threads for dupes in thread.result()]
    return matches


def get_args() -> argparse.Namespace:
    """Parse the command line and resolve it to the files to check.

    After parsing, `args.paths` is replaced by the list of files found in
    the given directories (searched recursively with `-r`), minus anything
    excluded by the `-i` patterns.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-r",
        "--recursive",
        action="store_true",
        help=""" Glob files to compare recursively. """,
    )

    parser.add_argument(
        "-i",
        "--ignores",
        type=str,
        nargs="*",
        default=[],
        help=""" Ignore files matching these patterns.
        e.g. `dupechecker -i *.wav` will compare all files in the current working directory except .wav files.""",
    )

    parser.add_argument(
        "-d",
        "--delete_dupes",
        action="store_true",
        help=""" After finding duplicates, delete all but one copy.
        For each set of duplicates, the tool will ask you to enter the number corresponding to the copy you want to keep.
        Pressing 'enter' without entering a number will skip that set without deleting anything.""",
    )

    parser.add_argument(
        "-ad",
        "--autodelete",
        action="store_true",
        help=""" Automatically decide which file to keep and which to delete from each set of duplicate files instead of asking which to keep. """,
    )

    parser.add_argument(
        "-ns",
        "--no_show",
        action="store_true",
        help=""" Don't show printout of matching files. """,
    )

    parser.add_argument(
        "paths",
        type=str,
        default=[Pathier.cwd()],
        nargs="*",
        help=""" The paths to compare files in. """,
    )

    args = parser.parse_args()
    if args.paths != [Pathier.cwd()]:
        args.paths = [Pathier(path) for path in args.paths]
    files = []
    print("Gathering files...")
    for path in args.paths:
        # "*" plus `is_file()` also picks up files without an extension and
        # leaves out directories, which a "*.*" pattern gets wrong both ways.
        candidates = path.rglob("*") if args.recursive else path.glob("*")
        files.extend(file for file in candidates if file.is_file())
    args.paths = [
        Pathier(path)
        for path in younotyou(
            [str(file) for file in files], exclude_patterns=args.ignores
        )
    ]
    print(f"Checking {len(args.paths)} files...")

    return args


def main(args: argparse.Namespace | None = None):
    """Run the duplicate check, print the results and optionally delete.

    `args` is obtained from `get_args()` when not supplied. With
    `delete_dupes` the interactive wizard runs, otherwise with `autodelete`
    duplicates are removed automatically; the freed size is then printed.
    """
    print()
    if not args:
        args = get_args()
    timer = Timer().start()
    matches = dupechecker(args.paths)
    timer.stop()
    if not matches:
        print("No duplicates detected.")
        return
    print(f"Found {len(matches)} duplicate sets of files in {timer.elapsed_str}.")
    if not args.no_show:
        print(griddy([["\n".join([str(file) for file in match])] for match in matches]))
    if args.delete_dupes or args.autodelete:

        def total_size() -> int:
            # `size` is read as an attribute, the same way `group_by_size`
            # reads it.
            # NOTE(review): assumes `Pathier.size` is a property - confirm
            # against the installed pathier version.
            return sum(path.size for path in args.paths)

        start_size = total_size()
        if args.delete_dupes:
            delete_wizard(matches)
        else:
            autodelete(matches)
        deleted_size = start_size - total_size()
        print(f"Deleted {Pathier.format_bytes(deleted_size)}.")


if __name__ == "__main__":
    main(get_args())
def
find_dupes( paths: list[pathier.pathier.Pathier]) -> list[list[pathier.pathier.Pathier]]:
def find_dupes(paths: list[Pathier]) -> list[list[Pathier]]:
    """Group the files in `paths` whose contents are identical.

    Works on a deep copy of `paths`, so the caller's list is not modified.
    Files are compared by content (`filecmp.cmp` with `shallow=False`).

    Returns one list per set of identical files. Files that match nothing
    else are left out of the result.
    """
    remaining = deepcopy(paths)
    duplicate_sets = []
    while remaining:
        candidate = remaining.pop()
        twins = [
            other for other in remaining if filecmp.cmp(candidate, other, False)
        ]
        if not twins:
            continue
        # Matched files already belong to a set; don't compare them again.
        for twin in twins:
            remaining.remove(twin)
        duplicate_sets.append([candidate, *twins])
    return duplicate_sets
Return a list of lists for duplicate files in `paths`.
def
group_by_size( paths: list[pathier.pathier.Pathier]) -> list[list[pathier.pathier.Pathier]]:
def group_by_size(paths: list[Pathier]) -> list[list[Pathier]]:
    """Bucket `paths` by file size.

    Each returned sublist holds the paths that share one `size` value, in
    the order they were encountered.
    """
    buckets = {}
    for path in paths:
        buckets.setdefault(path.size, []).append(path)
    return list(buckets.values())
Returns a list of lists where each sublist is a list of files that have the same size.
def
delete_wizard(matches: list[list[pathier.pathier.Pathier]]):
def delete_wizard(matches: list[list[Pathier]]):
    """Ask which file to keep for each set and delete the others.

    For every set in `matches` the files are listed with 1-based numbers.
    Entering a listed number keeps that file and deletes the rest of the set.
    An empty answer skips the set. Any other answer is rejected and the
    prompt is repeated, so a typo can never delete every copy.
    """
    print()
    print("Enter the corresponding number of the file to keep.")
    print(
        "Press 'Enter' without giving a number to skip deleting any files for the given set."
    )
    print()
    for match in matches:
        map_ = {str(i): file for i, file in enumerate(match, 1)}
        options = "\n".join(f"({i}) {file}" for i, file in map_.items()) + "\n"
        print(options)
        prompt = f"Enter number of file to keep ({', '.join(map_)}): "
        keeper = input(prompt).strip()
        # Only a listed number may trigger deletion; otherwise ask again.
        while keeper and keeper not in map_:
            print(f"'{keeper}' is not one of the listed numbers.")
            keeper = input(prompt).strip()
        if keeper:
            for num, file in map_.items():
                if num != keeper:
                    file.delete()
        print()
Ask which file to keep for each set.
def
autodelete(matches: list[list[pathier.pathier.Pathier]]):
def autodelete(matches: list[list[Pathier]]):
    """Keep one of each set in `matches` and delete the others.

    The last file of each set is the one kept. `matches` and its sublists
    are not modified; an empty set is ignored.
    """
    for match in matches:
        # Slice instead of `pop()` so the caller's lists stay intact.
        for file in match[:-1]:
            file.delete()
Keep one of each set in `matches` and delete the others.
def
dupechecker( paths: list[pathier.pathier.Pathier]) -> list[list[pathier.pathier.Pathier]]:
def dupechecker(paths: list[Pathier]) -> list[list[Pathier]]:
    """Return the sets of duplicate files found in `paths`.

    `paths` is split with `group_by_size` and every group is passed to
    `find_dupes` on a thread pool while a spinner is displayed. The sets
    from all groups are returned as one combined list.
    """
    size_groups = group_by_size(paths)
    with Spinner() as spinner, ThreadPoolExecutor() as pool:
        futures = [pool.submit(find_dupes, group) for group in size_groups]
        # Poll instead of blocking so the spinner keeps animating.
        while not all(future.done() for future in futures):
            spinner.display()
            time.sleep(0.025)
        matches = [dupes for future in futures for dupes in future.result()]
    return matches
def
get_args() -> argparse.Namespace:
def get_args() -> argparse.Namespace:
    """Parse the command line and resolve it to the files to check.

    After parsing, `args.paths` is replaced by the list of files found in
    the given directories (searched recursively with `-r`), minus anything
    excluded by the `-i` patterns.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-r",
        "--recursive",
        action="store_true",
        help=""" Glob files to compare recursively. """,
    )

    parser.add_argument(
        "-i",
        "--ignores",
        type=str,
        nargs="*",
        default=[],
        help=""" Ignore files matching these patterns.
        e.g. `dupechecker -i *.wav` will compare all files in the current working directory except .wav files.""",
    )

    parser.add_argument(
        "-d",
        "--delete_dupes",
        action="store_true",
        help=""" After finding duplicates, delete all but one copy.
        For each set of duplicates, the tool will ask you to enter the number corresponding to the copy you want to keep.
        Pressing 'enter' without entering a number will skip that set without deleting anything.""",
    )

    parser.add_argument(
        "-ad",
        "--autodelete",
        action="store_true",
        help=""" Automatically decide which file to keep and which to delete from each set of duplicate files instead of asking which to keep. """,
    )

    parser.add_argument(
        "-ns",
        "--no_show",
        action="store_true",
        help=""" Don't show printout of matching files. """,
    )

    parser.add_argument(
        "paths",
        type=str,
        default=[Pathier.cwd()],
        nargs="*",
        help=""" The paths to compare files in. """,
    )

    args = parser.parse_args()
    # The default is already a list of `Pathier`; only user input needs converting.
    if args.paths != [Pathier.cwd()]:
        args.paths = [Pathier(path) for path in args.paths]
    files = []
    print("Gathering files...")
    for path in args.paths:
        # "*" plus `is_file()` also picks up files without an extension and
        # leaves out directories, which a "*.*" pattern gets wrong both ways.
        candidates = path.rglob("*") if args.recursive else path.glob("*")
        files.extend(file for file in candidates if file.is_file())
    args.paths = [
        Pathier(path)
        for path in younotyou(
            [str(file) for file in files], exclude_patterns=args.ignores
        )
    ]
    print(f"Checking {len(args.paths)} files...")

    return args
def
main(args: argparse.Namespace | None = None):
def main(args: argparse.Namespace | None = None):
    """Run the duplicate check, print the results and optionally delete.

    `args` is obtained from `get_args()` when not supplied. With
    `delete_dupes` the interactive wizard runs, otherwise with `autodelete`
    duplicates are removed automatically; the freed size is then printed.
    """
    print()
    if not args:
        args = get_args()
    timer = Timer().start()
    matches = dupechecker(args.paths)
    timer.stop()
    if not matches:
        print("No duplicates detected.")
        return
    print(f"Found {len(matches)} duplicate sets of files in {timer.elapsed_str}.")
    if not args.no_show:
        print(griddy([["\n".join([str(file) for file in match])] for match in matches]))
    if args.delete_dupes or args.autodelete:

        def total_size() -> int:
            # `size` is read as an attribute, the same way `group_by_size`
            # reads it; calling it would fail if it is a property.
            # NOTE(review): assumes `Pathier.size` is a property - confirm
            # against the installed pathier version.
            return sum(path.size for path in args.paths)

        start_size = total_size()
        if args.delete_dupes:
            delete_wizard(matches)
        else:
            autodelete(matches)
        deleted_size = start_size - total_size()
        print(f"Deleted {Pathier.format_bytes(deleted_size)}.")