Coverage for e2xgrader/exchange/hash_utils.py: 98%
43 statements
« prev ^ index » next coverage.py v7.4.2, created at 2024-03-14 13:22 +0100
« prev ^ index » next coverage.py v7.4.2, created at 2024-03-14 13:22 +0100
1import fnmatch
2import hashlib
3import os
4from typing import List, Tuple
def compute_hashcode_of_file(filename, method="sha1") -> str:
    """
    Return the hexadecimal digest of a file's contents.

    The file is opened in binary mode and consumed in 4096-byte blocks, so it
    is never held in memory as a whole.

    Args:
        filename (str): Path of the file to hash.
        method (str, optional): Name of the hash algorithm, either "md5" or
            "sha1". Defaults to "sha1".

    Returns:
        str: The hex digest of the file contents.

    Raises:
        ValueError: If `method` is neither "md5" nor "sha1".
    """
    # Reject unknown algorithms up front, before the file is opened.
    if method not in ("md5", "sha1"):
        raise ValueError("Currently only the methods md5 and sha1 are supported!")
    digest = hashlib.md5() if method == "md5" else hashlib.sha1()

    with open(filename, "rb") as stream:
        while True:
            block = stream.read(4096)
            if block == b"":
                # An empty read signals end of file.
                break
            digest.update(block)

    return digest.hexdigest()
def _path_components(path):
    """Split a path into its components, dropping empty and "." parts."""
    return [
        part
        for part in os.path.normpath(path).split(os.sep)
        if part not in ("", os.curdir)
    ]


def _matches_excluded(components, run, anchored):
    """Tell whether `run` occurs in `components` (only at the start if anchored)."""
    width = len(run)
    if width == 0 or width > len(components):
        return False
    if anchored:
        return components[:width] == run
    return any(
        components[start : start + width] == run
        for start in range(len(components) - width + 1)
    )


def hash_files_in_directory(
    directory,
    method="sha1",
    exclude_files=None,
    exclude_subfolders=None,
) -> List[Tuple[str, str]]:
    """
    Hashes all files in a directory using the specified method.

    The directory is walked recursively. Excluded subfolders are matched on
    whole path components relative to `directory`, so excluding "data" skips
    "data/" and "nested/data/" but neither "database/" nor a `directory`
    whose own path happens to contain "data". Files below an excluded
    subfolder are skipped as well.

    Args:
        directory (str): The directory path to hash files in.
        method (str, optional): The hashing method to use. Defaults to "sha1".
        exclude_files (List[str], optional): List of file names or fnmatch-style
            patterns to exclude from hashing. Patterns are matched against the
            bare file name, not the path. Defaults to None.
        exclude_subfolders (List[str], optional): List of subfolder names (or
            relative paths such as "a/b") to exclude from hashing at any depth.
            An absolute path is interpreted relative to `directory` and only
            matches that exact location. Defaults to None.

    Returns:
        List[Tuple[str, str]]: A list of tuples containing the relative file paths and their
            corresponding hash codes, sorted by relative path.

    Raises:
        ValueError: If `method` is rejected by `compute_hashcode_of_file`.

    Examples:
        >>> hash_files_in_directory("path/to/directory")
        [
            ("file1.txt", "hashcode1"),
            ("file2.txt", "hashcode2"),
            ("notebook.ipynb", "hashcode3"),
            (".ipynb_checkpoints/notebook-checkpoint.ipynb", "hashcode4"),
            ...
        ]
        >>> hash_files_in_directory(
                "path/to/directory",
                exclude_files=["*.txt"],
                exclude_subfolders=[".ipynb_checkpoints"]
            )
        [
            ("notebook.ipynb", "hashcode3"),
            ...
        ]
    """
    # Materialise the patterns once so any iterable can be re-scanned per file.
    file_patterns = [] if exclude_files is None else list(exclude_files)

    # Pre-compute (components, anchored) for every excluded subfolder.
    excluded_runs = []
    for subfolder in exclude_subfolders or []:
        anchored = os.path.isabs(subfolder)
        if anchored:
            try:
                subfolder = os.path.relpath(subfolder, start=directory)
            except ValueError:
                # relpath fails across Windows drives; such a path cannot lie
                # inside `directory`, so it can never match.
                continue
        excluded_runs.append((_path_components(subfolder), anchored))

    hashes = []
    for root, dirs, files in os.walk(directory):
        # Compare against the path below `directory` only, component by
        # component, instead of a substring test on the full path.
        root_parts = _path_components(os.path.relpath(root, start=directory))
        if any(
            _matches_excluded(root_parts, run, anchored)
            for run, anchored in excluded_runs
        ):
            # Everything below an excluded folder is excluded too, so there is
            # no need to descend any further.
            dirs[:] = []
            continue

        for file in files:
            if any(fnmatch.fnmatch(file, pattern) for pattern in file_patterns):
                continue
            filename = os.path.join(root, file)
            hashes.append(
                (
                    os.path.relpath(filename, start=directory),
                    compute_hashcode_of_file(filename, method),
                )
            )

    return sorted(hashes)
def generate_directory_hash_file(
    directory,
    method="sha1",
    exclude_files=None,
    exclude_subfolders=None,
    output_file="hashes.txt",
):
    """
    Creates a file containing the hash codes of all files in a directory.

    The output file is written into `directory`, one "<hashcode> <relative path>"
    entry per line. Its own name is added to the exclusion patterns so that it
    is not hashed itself.

    Args:
        directory (str): The directory path to hash files in.
        method (str, optional): The hashing method to use. Defaults to "sha1".
        exclude_files (List[str], optional): List of file names or patterns to exclude from hashing.
            The list passed in is not modified. Defaults to None.
        exclude_subfolders (List[str], optional): List of subfolder names to exclude from hashing.
            Defaults to None.
        output_file (str, optional): The name of the file to write the hash codes to.
            Defaults to "hashes.txt".
    """
    # Work on a copy: appending to the caller's list would leak `output_file`
    # into it and make it grow on every call.
    patterns = [] if exclude_files is None else list(exclude_files)
    patterns.append(output_file)

    hashes = hash_files_in_directory(directory, method, patterns, exclude_subfolders)
    formatted_hashes = "\n".join(
        f"{hashcode} {filename}" for filename, hashcode in hashes
    )
    with open(os.path.join(directory, output_file), "w") as f:
        f.write(formatted_hashes)
def truncate_hashcode(hashcode, number_of_chunks=3, chunk_size=4):
    """
    Truncate a hash code into a more readable format.

    The first `number_of_chunks * chunk_size` characters of the hash code are
    split into chunks of `chunk_size` characters joined by dashes, e.g.
    "da39a3ee5e6b..." becomes "da39-a3ee-5e6b". If the hash code is shorter
    than requested, only the available characters are used and no dangling
    dashes are produced.

    Args:
        hashcode (str): The hash code to truncate.
        number_of_chunks (int, optional): The number of chunks to split the hash code into.
            Defaults to 3.
        chunk_size (int, optional): The size of each chunk. Defaults to 4.

    Returns:
        str: The truncated hash code.

    Raises:
        ValueError: If `chunk_size` is 0 (it is used as the step of a range).
    """
    chunks = (
        hashcode[start : start + chunk_size]
        for start in range(0, number_of_chunks * chunk_size, chunk_size)
    )
    # Skip empty chunks so a short hash code does not end in stray dashes.
    return "-".join(chunk for chunk in chunks if chunk)