dupechecker.dupechecker

  1import argparse
  2import filecmp
  3import time
  4from concurrent.futures import ThreadPoolExecutor
  5from copy import deepcopy
  6
  7from griddle import griddy
  8from pathier import Pathier
  9from printbuddies import Spinner
 10from younotyou import younotyou
 11
 12
 13def get_duplicates(paths: list[Pathier]) -> list[list[Pathier]]:
 14    """Return a list of lists for duplicate files in `paths`."""
 15    matching_sets = []
 16    while len(paths) > 0:
 17        comparee = paths.pop()
 18        matching_files = [file for file in paths if filecmp.cmp(comparee, file, False)]
 19        if matching_files:
 20            [paths.pop(paths.index(file)) for file in matching_files]
 21            matching_files.insert(0, comparee)
 22            matching_sets.append(matching_files)
 23    return matching_sets
 24
 25
 26def get_args() -> argparse.Namespace:
 27    parser = argparse.ArgumentParser()
 28
 29    parser.add_argument(
 30        "-r",
 31        "--recursive",
 32        action="store_true",
 33        help=""" Glob files to compare recursively. """,
 34    )
 35
 36    parser.add_argument(
 37        "-i",
 38        "--ignores",
 39        type=str,
 40        nargs="*",
 41        default=[],
 42        help=""" Ignore files matching these patterns.
 43        e.g. `dupechecker -i *.wav` will compare all files in the current working directory except .wav files.""",
 44    )
 45
 46    parser.add_argument(
 47        "-d",
 48        "--delete_dupes",
 49        action="store_true",
 50        help=""" After finding duplicates, delete all but one copy.
 51        For each set of duplicates, the tool will ask you to enter the number corresponding to the copy you want to keep.
 52        Pressing 'enter' without entering a number will skip that set without deleting anything.""",
 53    )
 54
 55    parser.add_argument(
 56        "-ad",
 57        "--autodelete",
 58        action="store_true",
 59        help=""" Automatically decide which file to keep and which to delete from each set of duplicate files instead of asking which to keep. """,
 60    )
 61
 62    parser.add_argument(
 63        "-ns",
 64        "--no_show",
 65        action="store_true",
 66        help=""" Don't show printout of matching files. """,
 67    )
 68
 69    parser.add_argument(
 70        "paths",
 71        type=str,
 72        default=[Pathier.cwd()],
 73        nargs="*",
 74        help=""" The paths to compare files in. """,
 75    )
 76
 77    args = parser.parse_args()
 78    if not args.paths == [Pathier.cwd()]:
 79        args.paths = [Pathier(path) for path in args.paths]
 80    files = []
 81    print("Gathering files...")
 82    for path in args.paths:
 83        files.extend(
 84            list(path.rglob("*.*")) if args.recursive else list(path.glob("*.*"))
 85        )
 86    args.paths = [
 87        Pathier(path)
 88        for path in younotyou(
 89            [str(file) for file in files], exclude_patterns=args.ignores
 90        )
 91    ]
 92    print(f"Comparing {len(args.paths)} files...")
 93
 94    return args
 95
 96
 97def delete_wizard(matches: list[list[Pathier]]):
 98    """Ask which file to keep for each set."""
 99    print()
100    print("Enter the corresponding number of the file to keep.")
101    print(
102        "Press 'Enter' without giving a number to skip deleting any files for the given set."
103    )
104    print()
105    for match in matches:
106        map_ = {str(i): file for i, file in enumerate(match, 1)}
107        options = "\n".join(f"({i}) {file}" for i, file in map_.items()) + "\n"
108        print(options)
109        keeper = input(f"Enter number of file to keep ({', '.join(map_.keys())}): ")
110        if keeper:
111            [map_[num].delete() for num in map_ if num != keeper]
112        print()
113
114
def autodelete(matches: list[list[Pathier]]):
    """Keep one of each set in `matches` and delete the others.

    The last file of every set is the one that is kept.
    The sets themselves are not modified, and an empty set is ignored.
    """
    for match in matches:
        # Slice instead of `pop()` so the caller's lists stay intact.
        for file in match[:-1]:
            file.delete()
120
121
def dupechecker(args: argparse.Namespace | None = None):
    """Find duplicate files and optionally delete the extra copies.

    `args` is the namespace produced by `get_args()`; when it is omitted the
    command line is parsed here. The duplicate search runs in a worker thread
    while a spinner is displayed. Found sets are printed unless `no_show` is
    set, and `delete_dupes` / `autodelete` select interactive or automatic
    deletion (`delete_dupes` wins when both are given).
    """
    print()
    if args is None:
        args = get_args()
    # Spinner frames: the rotating character moves to the right, then the
    # same frames are replayed in reverse to bring it back.
    frames = [
        ch.rjust(i + j)
        for i in range(1, 25, 3)
        for j, ch in enumerate(["/", "-", "\\"])
    ]
    frames += frames[::-1]
    with Spinner(frames) as spinner:
        with ThreadPoolExecutor() as exc:
            # A copy is handed over so the search cannot alter `args.paths`,
            # which is still needed below for the size totals.
            thread = exc.submit(get_duplicates, deepcopy(args.paths))
            while not thread.done():
                spinner.display()
                time.sleep(0.025)
            matches = thread.result()
    if matches:
        print(f"Found {len(matches)} duplicate sets of files.")
        if not args.no_show:
            print(
                griddy(
                    [["\n".join([str(file) for file in match])] for match in matches]
                )
            )
        if args.delete_dupes or args.autodelete:

            def total_size() -> int:
                # Paths that no longer exist are skipped explicitly so the
                # total does not depend on what `size()` reports for them.
                return sum(path.size() for path in args.paths if path.exists())  # type: ignore

            start_size = total_size()
            if args.delete_dupes:
                delete_wizard(matches)
            else:
                autodelete(matches)
            deleted_size = start_size - total_size()
            print(f"Deleted {Pathier.format_size(deleted_size)}.")
    else:
        print("No duplicates detected.")
155
156
if __name__ == "__main__":
    # Parse the command line first, then run the check with the result.
    cli_args = get_args()
    dupechecker(cli_args)
def get_duplicates(paths: list[pathier.pathier.Pathier]) -> list[list[pathier.pathier.Pathier]]:
def get_duplicates(paths: list[Pathier]) -> list[list[Pathier]]:
    """Return a list of lists for duplicate files in `paths`.

    Files are compared by content (`filecmp.cmp` with `shallow=False`).
    Each inner list holds two or more files whose contents are identical;
    files without any match are left out of the result.

    `paths` itself is not modified.
    """
    # Work on a copy so the caller's list is left intact.
    remaining = list(paths)
    matching_sets = []
    while remaining:
        comparee = remaining.pop()
        matching_files, unmatched = [], []
        for file in remaining:
            if filecmp.cmp(comparee, file, shallow=False):
                matching_files.append(file)
            else:
                unmatched.append(file)
        if matching_files:
            matching_sets.append([comparee, *matching_files])
            # Matched files already belong to a set, so only the
            # unmatched ones need to be compared any further.
            remaining = unmatched
    return matching_sets

Return a list of lists for duplicate files in paths.

def get_args() -> argparse.Namespace:
def get_args() -> argparse.Namespace:
    """Parse the command line and gather the files to compare.

    After parsing, `args.paths` is replaced with the files globbed from the
    given paths (recursively when `-r` is passed), minus anything matching
    one of the `--ignores` patterns.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-r",
        "--recursive",
        action="store_true",
        help=""" Glob files to compare recursively. """,
    )

    parser.add_argument(
        "-i",
        "--ignores",
        type=str,
        nargs="*",
        default=[],
        help=""" Ignore files matching these patterns.
        e.g. `dupechecker -i *.wav` will compare all files in the current working directory except .wav files.""",
    )

    parser.add_argument(
        "-d",
        "--delete_dupes",
        action="store_true",
        help=""" After finding duplicates, delete all but one copy.
        For each set of duplicates, the tool will ask you to enter the number corresponding to the copy you want to keep.
        Pressing 'enter' without entering a number will skip that set without deleting anything.""",
    )

    parser.add_argument(
        "-ad",
        "--autodelete",
        action="store_true",
        help=""" Automatically decide which file to keep and which to delete from each set of duplicate files instead of asking which to keep. """,
    )

    parser.add_argument(
        "-ns",
        "--no_show",
        action="store_true",
        help=""" Don't show printout of matching files. """,
    )

    parser.add_argument(
        "paths",
        type=str,
        default=[Pathier.cwd()],
        nargs="*",
        help=""" The paths to compare files in. """,
    )

    args = parser.parse_args()
    # Values given on the command line arrive as `str` while the default is
    # already a `Pathier`; converting unconditionally covers both cases.
    search_paths = [Pathier(path) for path in args.paths]
    files = []
    print("Gathering files...")
    for path in search_paths:
        # NOTE(review): "*.*" only matches names containing a dot, so files
        # without an extension are never compared - confirm this is intended.
        files.extend(path.rglob("*.*") if args.recursive else path.glob("*.*"))
    # The same file can be gathered more than once (a path given twice, or
    # nested paths combined with `-r`). It would then be reported as a
    # duplicate of itself and the delete modes could remove its only copy,
    # so keep just the first entry for every resolved location.
    unique_files = {}
    for file in files:
        unique_files.setdefault(file.resolve(), file)
    args.paths = [
        Pathier(path)
        for path in younotyou(
            [str(file) for file in unique_files.values()],
            exclude_patterns=args.ignores,
        )
    ]
    print(f"Comparing {len(args.paths)} files...")

    return args
def delete_wizard(matches: list[list[pathier.pathier.Pathier]]):
 98def delete_wizard(matches: list[list[Pathier]]):
 99    """Ask which file to keep for each set."""
100    print()
101    print("Enter the corresponding number of the file to keep.")
102    print(
103        "Press 'Enter' without giving a number to skip deleting any files for the given set."
104    )
105    print()
106    for match in matches:
107        map_ = {str(i): file for i, file in enumerate(match, 1)}
108        options = "\n".join(f"({i}) {file}" for i, file in map_.items()) + "\n"
109        print(options)
110        keeper = input(f"Enter number of file to keep ({', '.join(map_.keys())}): ")
111        if keeper:
112            [map_[num].delete() for num in map_ if num != keeper]
113        print()

Ask which file to keep for each set.

def autodelete(matches: list[list[pathier.pathier.Pathier]]):
def autodelete(matches: list[list[Pathier]]):
    """Keep one of each set in `matches` and delete the others.

    The last file of every set is the one that is kept.
    The sets themselves are not modified, and an empty set is ignored.
    """
    for match in matches:
        # Slice instead of `pop()` so the caller's lists stay intact.
        for file in match[:-1]:
            file.delete()

Keep one of each set in matches and delete the others.

def dupechecker(args: argparse.Namespace | None = None):
def dupechecker(args: argparse.Namespace | None = None):
    """Find duplicate files and optionally delete the extra copies.

    `args` is the namespace produced by `get_args()`; when it is omitted the
    command line is parsed here. The duplicate search runs in a worker thread
    while a spinner is displayed. Found sets are printed unless `no_show` is
    set, and `delete_dupes` / `autodelete` select interactive or automatic
    deletion (`delete_dupes` wins when both are given).
    """
    print()
    if args is None:
        args = get_args()
    # Spinner frames: the rotating character moves to the right, then the
    # same frames are replayed in reverse to bring it back.
    frames = [
        ch.rjust(i + j)
        for i in range(1, 25, 3)
        for j, ch in enumerate(["/", "-", "\\"])
    ]
    frames += frames[::-1]
    with Spinner(frames) as spinner:
        with ThreadPoolExecutor() as exc:
            # A copy is handed over so the search cannot alter `args.paths`,
            # which is still needed below for the size totals.
            thread = exc.submit(get_duplicates, deepcopy(args.paths))
            while not thread.done():
                spinner.display()
                time.sleep(0.025)
            matches = thread.result()
    if matches:
        print(f"Found {len(matches)} duplicate sets of files.")
        if not args.no_show:
            print(
                griddy(
                    [["\n".join([str(file) for file in match])] for match in matches]
                )
            )
        if args.delete_dupes or args.autodelete:

            def total_size() -> int:
                # Paths that no longer exist are skipped explicitly so the
                # total does not depend on what `size()` reports for them.
                return sum(path.size() for path in args.paths if path.exists())  # type: ignore

            start_size = total_size()
            if args.delete_dupes:
                delete_wizard(matches)
            else:
                autodelete(matches)
            deleted_size = start_size - total_size()
            print(f"Deleted {Pathier.format_size(deleted_size)}.")
    else:
        print("No duplicates detected.")