dupechecker.dupechecker

  1import argparse
  2import filecmp
  3import time
  4from concurrent.futures import ThreadPoolExecutor
  5from copy import deepcopy
  6
  7from griddle import griddy
  8from noiftimer import Timer
  9from pathier import Pathier
 10from printbuddies import Spinner
 11from younotyou import younotyou
 12
 13
 14def find_dupes(paths: list[Pathier]) -> list[list[Pathier]]:
 15    """Return a list of lists for duplicate files in `paths`."""
 16    matching_sets = []
 17    paths = deepcopy(paths)
 18    while len(paths) > 0:
 19        comparee = paths.pop()
 20        matching_files = [file for file in paths if filecmp.cmp(comparee, file, False)]
 21        if matching_files:
 22            [paths.pop(paths.index(file)) for file in matching_files]
 23            matching_files.insert(0, comparee)
 24            matching_sets.append(matching_files)
 25    return matching_sets
 26
 27
 28def group_by_size(paths: list[Pathier]) -> list[list[Pathier]]:
 29    """Returns a list of lists where each sublist is a list of files that have the same size."""
 30    sizes = {}
 31    for path in paths:
 32        size = path.size()
 33        if size in sizes:
 34            sizes[size].append(path)
 35        else:
 36            sizes[size] = [path]
 37    return list(sizes.values())
 38
 39
 40def delete_wizard(matches: list[list[Pathier]]):
 41    """Ask which file to keep for each set."""
 42    print()
 43    print("Enter the corresponding number of the file to keep.")
 44    print(
 45        "Press 'Enter' without giving a number to skip deleting any files for the given set."
 46    )
 47    print()
 48    for match in matches:
 49        map_ = {str(i): file for i, file in enumerate(match, 1)}
 50        options = "\n".join(f"({i}) {file}" for i, file in map_.items()) + "\n"
 51        print(options)
 52        keeper = input(f"Enter number of file to keep ({', '.join(map_.keys())}): ")
 53        if keeper:
 54            [map_[num].delete() for num in map_ if num != keeper]
 55        print()
 56
 57
 58def autodelete(matches: list[list[Pathier]]):
 59    """Keep one of each set in `matches` and delete the others."""
 60    for match in matches:
 61        match.pop()
 62        [file.delete() for file in match]
 63
 64
 65def dupechecker(paths: list[Pathier]) -> list[list[Pathier]]:
 66    grouped_paths = group_by_size(paths)
 67    matches = []
 68    with Spinner() as spinner:
 69        with ThreadPoolExecutor() as exc:
 70            threads = [exc.submit(find_dupes, paths) for paths in grouped_paths]
 71            while any(not thread.done() for thread in threads):
 72                spinner.display()
 73                time.sleep(0.025)
 74            for thread in threads:
 75                matches.extend(thread.result())
 76    return matches
 77
 78
 79def get_args() -> argparse.Namespace:
 80    parser = argparse.ArgumentParser()
 81
 82    parser.add_argument(
 83        "-r",
 84        "--recursive",
 85        action="store_true",
 86        help=""" Glob files to compare recursively. """,
 87    )
 88
 89    parser.add_argument(
 90        "-i",
 91        "--ignores",
 92        type=str,
 93        nargs="*",
 94        default=[],
 95        help=""" Ignore files matching these patterns.
 96        e.g. `dupechecker -i *.wav` will compare all files in the current working directory except .wav files.""",
 97    )
 98
 99    parser.add_argument(
100        "-d",
101        "--delete_dupes",
102        action="store_true",
103        help=""" After finding duplicates, delete all but one copy.
104        For each set of duplicates, the tool will ask you to enter the number corresponding to the copy you want to keep.
105        Pressing 'enter' without entering a number will skip that set without deleting anything.""",
106    )
107
108    parser.add_argument(
109        "-ad",
110        "--autodelete",
111        action="store_true",
112        help=""" Automatically decide which file to keep and which to delete from each set of duplicate files instead of asking which to keep. """,
113    )
114
115    parser.add_argument(
116        "-ns",
117        "--no_show",
118        action="store_true",
119        help=""" Don't show printout of matching files. """,
120    )
121
122    parser.add_argument(
123        "paths",
124        type=str,
125        default=[Pathier.cwd()],
126        nargs="*",
127        help=""" The paths to compare files in. """,
128    )
129
130    args = parser.parse_args()
131    if not args.paths == [Pathier.cwd()]:
132        args.paths = [Pathier(path) for path in args.paths]
133    files = []
134    print("Gathering files...")
135    for path in args.paths:
136        files.extend(
137            list(path.rglob("*.*")) if args.recursive else list(path.glob("*.*"))
138        )
139    args.paths = [
140        Pathier(path)
141        for path in younotyou(
142            [str(file) for file in files], exclude_patterns=args.ignores
143        )
144    ]
145    print(f"Checking {len(args.paths)} files...")
146
147    return args
148
149
def main(args: argparse.Namespace | None = None):
    """Find duplicate files, report them and optionally delete the extra copies.

    `args` is the namespace produced by `get_args`; when it is omitted the
    command line is parsed here. The duplicate sets are printed unless
    `no_show` is set. With `delete_dupes` the interactive `delete_wizard`
    runs, otherwise with `autodelete` the extras are removed without asking;
    `delete_dupes` takes precedence when both are set.
    """
    print()
    if args is None:
        args = get_args()
    timer = Timer().start()
    matches = dupechecker(args.paths)
    timer.stop()
    if not matches:
        print("No duplicates detected.")
        return
    print(f"Found {len(matches)} duplicate sets of files in {timer.elapsed_str}.")
    if not args.no_show:
        print(
            griddy([["\n".join([str(file) for file in match])] for match in matches])
        )
    if not (args.delete_dupes or args.autodelete):
        return

    def total_size():
        # Files removed by the deletion step are skipped explicitly rather than
        # relying on how `size()` reports a path that no longer exists.
        return sum(path.size() for path in args.paths if path.exists())  # type: ignore

    start_size = total_size()
    if args.delete_dupes:
        delete_wizard(matches)
    else:
        autodelete(matches)
    # The space reclaimed is the difference in total size before and after.
    deleted_size = start_size - total_size()
    print(f"Deleted {Pathier.format_size(deleted_size)}.")
173
174
if __name__ == "__main__":
    # Script entry point: parse the command line, then run the checker.
    cli_args = get_args()
    main(cli_args)
def find_dupes( paths: list[pathier.pathier.Pathier]) -> list[list[pathier.pathier.Pathier]]:
def find_dupes(paths: list[Pathier]) -> list[list[Pathier]]:
    """Return a list of lists for duplicate files in `paths`.

    Files are compared by content with `filecmp.cmp(..., shallow=False)`.
    Each returned sublist holds two or more paths that compared equal; a file
    with no match does not appear in the result. `paths` itself is not modified.

    The file taken as the reference for a set comes first in its sublist,
    followed by its matches in the order they appear in `paths`.
    """
    # A shallow copy is enough: only the list is consumed, the paths are untouched.
    remaining = list(paths)
    matching_sets = []
    while remaining:
        comparee = remaining.pop()
        matching_files = []
        unmatched = []
        # Single pass that splits the remaining files into matches and leftovers.
        for file in remaining:
            if filecmp.cmp(comparee, file, shallow=False):
                matching_files.append(file)
            else:
                unmatched.append(file)
        if matching_files:
            matching_sets.append([comparee, *matching_files])
        # Matched files are settled; only the leftovers need further comparison.
        remaining = unmatched
    return matching_sets

Return a list of lists for duplicate files in paths.

def group_by_size( paths: list[pathier.pathier.Pathier]) -> list[list[pathier.pathier.Pathier]]:
def group_by_size(paths: list[Pathier]) -> list[list[Pathier]]:
    """Returns a list of lists where each sublist is a list of files that have the same size.

    Groups are ordered by the first appearance of their size in `paths`, and
    files keep their input order inside a group. A file whose size is unique
    still yields a group of its own with a single element.
    """
    sizes = {}
    for path in paths:
        # `setdefault` creates the bucket the first time a size is seen, which
        # replaces the separate membership test and second lookup.
        sizes.setdefault(path.size(), []).append(path)
    return list(sizes.values())

Returns a list of lists where each sublist is a list of files that have the same size.

def delete_wizard(matches: list[list[pathier.pathier.Pathier]]):
def delete_wizard(matches: list[list[Pathier]]):
    """Ask which file to keep for each set.

    For every set in `matches` the files are listed with 1-based numbers and
    the user is asked for the number of the file to keep; every other file in
    that set is deleted. An empty answer skips the set without deleting anything.

    An answer that is not one of the listed numbers is rejected and the prompt
    is shown again. Without this check a mistyped answer would match none of
    the files, so the whole set would be deleted, including the copy that was
    meant to be kept.
    """
    print()
    print("Enter the corresponding number of the file to keep.")
    print(
        "Press 'Enter' without giving a number to skip deleting any files for the given set."
    )
    print()
    for match in matches:
        map_ = {str(i): file for i, file in enumerate(match, 1)}
        options = "\n".join(f"({i}) {file}" for i, file in map_.items()) + "\n"
        print(options)
        prompt = f"Enter number of file to keep ({', '.join(map_)}): "
        # Surrounding whitespace is ignored so " 2" selects file 2 and a
        # whitespace-only answer counts as skipping the set.
        keeper = input(prompt).strip()
        while keeper and keeper not in map_:
            print(f"'{keeper}' is not one of the listed numbers.")
            keeper = input(prompt).strip()
        if keeper:
            for num, file in map_.items():
                if num != keeper:
                    file.delete()
        print()

Ask which file to keep for each set.

def autodelete(matches: list[list[pathier.pathier.Pathier]]):
def autodelete(matches: list[list[Pathier]]):
    """Keep one of each set in `matches` and delete the others.

    The last file of every set is the one that is kept. The sets themselves
    are not modified, and an empty set is ignored.
    """
    for match in matches:
        # Slice rather than `pop()` so the caller's list keeps all its entries.
        for file in match[:-1]:
            file.delete()

Keep one of each set in matches and delete the others.

def dupechecker( paths: list[pathier.pathier.Pathier]) -> list[list[pathier.pathier.Pathier]]:
def dupechecker(paths: list[Pathier]) -> list[list[Pathier]]:
    """Return the sets of duplicate files found in `paths`.

    Files are first bucketed with `group_by_size`; every bucket is then handed
    to `find_dupes` on a thread pool, and a spinner is displayed while the
    comparisons are running. The result is the concatenation of what
    `find_dupes` returned for each bucket.
    """
    # A file that is alone in its size bucket has nothing to be compared with,
    # so only buckets holding at least two files are submitted.
    candidates = [group for group in group_by_size(paths) if len(group) > 1]
    matches = []
    with Spinner() as spinner, ThreadPoolExecutor() as exc:
        threads = [exc.submit(find_dupes, group) for group in candidates]
        # Poll instead of blocking so the spinner keeps animating.
        while any(not thread.done() for thread in threads):
            spinner.display()
            time.sleep(0.025)
        for thread in threads:
            matches.extend(thread.result())
    return matches
def get_args() -> argparse.Namespace:
 80def get_args() -> argparse.Namespace:
 81    parser = argparse.ArgumentParser()
 82
 83    parser.add_argument(
 84        "-r",
 85        "--recursive",
 86        action="store_true",
 87        help=""" Glob files to compare recursively. """,
 88    )
 89
 90    parser.add_argument(
 91        "-i",
 92        "--ignores",
 93        type=str,
 94        nargs="*",
 95        default=[],
 96        help=""" Ignore files matching these patterns.
 97        e.g. `dupechecker -i *.wav` will compare all files in the current working directory except .wav files.""",
 98    )
 99
100    parser.add_argument(
101        "-d",
102        "--delete_dupes",
103        action="store_true",
104        help=""" After finding duplicates, delete all but one copy.
105        For each set of duplicates, the tool will ask you to enter the number corresponding to the copy you want to keep.
106        Pressing 'enter' without entering a number will skip that set without deleting anything.""",
107    )
108
109    parser.add_argument(
110        "-ad",
111        "--autodelete",
112        action="store_true",
113        help=""" Automatically decide which file to keep and which to delete from each set of duplicate files instead of asking which to keep. """,
114    )
115
116    parser.add_argument(
117        "-ns",
118        "--no_show",
119        action="store_true",
120        help=""" Don't show printout of matching files. """,
121    )
122
123    parser.add_argument(
124        "paths",
125        type=str,
126        default=[Pathier.cwd()],
127        nargs="*",
128        help=""" The paths to compare files in. """,
129    )
130
131    args = parser.parse_args()
132    if not args.paths == [Pathier.cwd()]:
133        args.paths = [Pathier(path) for path in args.paths]
134    files = []
135    print("Gathering files...")
136    for path in args.paths:
137        files.extend(
138            list(path.rglob("*.*")) if args.recursive else list(path.glob("*.*"))
139        )
140    args.paths = [
141        Pathier(path)
142        for path in younotyou(
143            [str(file) for file in files], exclude_patterns=args.ignores
144        )
145    ]
146    print(f"Checking {len(args.paths)} files...")
147
148    return args
def main(args: argparse.Namespace | None = None):
def main(args: argparse.Namespace | None = None):
    """Find duplicate files, report them and optionally delete the extra copies.

    `args` is the namespace produced by `get_args`; when it is omitted the
    command line is parsed here. The duplicate sets are printed unless
    `no_show` is set. With `delete_dupes` the interactive `delete_wizard`
    runs, otherwise with `autodelete` the extras are removed without asking;
    `delete_dupes` takes precedence when both are set.
    """
    print()
    if args is None:
        args = get_args()
    timer = Timer().start()
    matches = dupechecker(args.paths)
    timer.stop()
    if not matches:
        print("No duplicates detected.")
        return
    print(f"Found {len(matches)} duplicate sets of files in {timer.elapsed_str}.")
    if not args.no_show:
        print(
            griddy([["\n".join([str(file) for file in match])] for match in matches])
        )
    if not (args.delete_dupes or args.autodelete):
        return

    def total_size():
        # Files removed by the deletion step are skipped explicitly rather than
        # relying on how `size()` reports a path that no longer exists.
        return sum(path.size() for path in args.paths if path.exists())  # type: ignore

    start_size = total_size()
    if args.delete_dupes:
        delete_wizard(matches)
    else:
        autodelete(matches)
    # The space reclaimed is the difference in total size before and after.
    deleted_size = start_size - total_size()
    print(f"Deleted {Pathier.format_size(deleted_size)}.")