Coverage for src/artemis_sg/img_downloader.py: 87%
84 statements
« prev ^ index » next coverage.py v7.3.1, created at 2023-10-12 17:31 -0700
1#!/usr/bin/env python
3import json
4import logging
5import os
6import tempfile
8import isbnlib
9import puremagic
10import requests
11from rich.console import Console
12from rich.text import Text
14from artemis_sg.config import CFG
# Base name of this file without its extension; used to prefix log messages.
_script_name = os.path.basename(__file__)
MODULE = os.path.splitext(_script_name)[0]

# Shared rich console for user-facing output.
console = Console()
class ImgDownloader:
    """Download images by URL and store them under ISBN-derived file names."""

    def is_image(self, path):
        """Check given filepath to see if it is an image.

        Returns the extension type reported by puremagic when it is ".jpg"
        or ".png"; returns None for any other type or when puremagic cannot
        identify the file.
        """
        namespace = f"{type(self).__name__}.{self.is_image.__name__}"
        try:
            kind = puremagic.from_file(path)
        except (puremagic.main.PureError, ValueError):
            logging.warning(f"{namespace}: non-image file found")
            return None
        return kind if kind in (".jpg", ".png") else None

    def download(self, image_dict, target_dir=""):
        """Download every URL in image_dict into target_dir.

        image_dict maps a key to an iterable of image URLs.  The key is
        converted to an ISBN-13 when isbnlib can do so, otherwise it is used
        as-is.  The first image of a key is stored as "<isbn>.jpg" and later
        ones as "<isbn>-<index>.jpg"; a file identified as PNG is renamed
        with a ".png" suffix, and a file that is neither JPEG nor PNG is
        removed.

        When target_dir is empty a temporary directory is created; otherwise
        target_dir is created if it does not exist yet.  Errors raised by
        requests.get are not caught here.

        Returns the directory the images were written to.
        """
        namespace = f"{type(self).__name__}.{self.download.__name__}"

        if not target_dir:
            target_dir = tempfile.mkdtemp(prefix="ImgDownloader-")
            logging.warning(f"{namespace}: Creating target directory at {target_dir}")
        # makedirs instead of mkdir so a nested target path can be created.
        os.makedirs(target_dir, exist_ok=True)

        for key in image_dict:
            # The ISBN depends only on the key, so resolve it once per key.
            isbn = isbnlib.to_isbn13(key) or key
            for i, url in enumerate(image_dict[key]):
                suffix = "" if i == 0 else f"-{i}"
                image_path = os.path.join(target_dir, f"{isbn}{suffix}.jpg")

                # Reuse a file that is already present and is a valid image.
                fmt = self.is_image(image_path) if os.path.isfile(image_path) else None
                if fmt is None:
                    logging.debug(f"{namespace}: Downloading '{url}' to '{target_dir}'")
                    # Fetch before opening the file so a failed request does
                    # not leave an empty file behind.
                    response = requests.get(url, timeout=10)
                    with open(image_path, "wb") as fp:
                        fp.write(response.content)
                    fmt = self.is_image(image_path)

                # Validate the file and name it in accordance with its type.
                if fmt == ".png":
                    png_path = os.path.splitext(image_path)[0] + ".png"
                    # os.replace overwrites an existing target on every
                    # platform; os.rename fails on Windows in that case.
                    os.replace(image_path, png_path)
                    image_path = png_path
                elif fmt is None:
                    os.remove(image_path)
                    logging.warning(
                        f"{namespace}: Skipping unsupported file type in '{url}'"
                    )
                    # Nothing was kept, so do not report it as saved.
                    continue
                logging.info(f"{namespace}: Saved '{image_path}'")

        return target_dir
def main():
    """Download the images referenced by the scraped datafile.

    Reads the scraped datafile path and the images directory from CFG,
    collects the "image_urls" list of every item in the datafile, downloads
    them with ImgDownloader and prints the destination directory.
    """
    scraped_datafile = CFG["asg"]["data"]["file"]["scraped"]
    saved_images_dir = CFG["asg"]["data"]["dir"]["images"]

    dloader = ImgDownloader()

    def get_json_data_from_file(datafile):
        """Return the parsed JSON content of datafile.

        Returns an empty dict when the file is missing or is not valid JSON.
        """
        namespace = f"{MODULE}.main.{get_json_data_from_file.__name__}"
        try:
            # The with-statement closes the file; no explicit close needed.
            with open(datafile, encoding="utf-8") as filepointer:
                return json.load(filepointer)
        except FileNotFoundError:
            logging.error(f"{namespace}: Datafile '{datafile}' not found")
            return {}
        except json.decoder.JSONDecodeError:
            logging.error(
                f"{namespace}: Datafile '{datafile}' did not contain valid JSON"
            )
            return {}

    def get_image_url_dict(data):
        """Map each key of data to its list of image URLs.

        An item without an "image_urls" entry maps to an empty list.
        """
        return {key: item.get("image_urls", []) for key, item in data.items()}

    scraped_data = get_json_data_from_file(scraped_datafile)
    img_dict = get_image_url_dict(scraped_data)
    dest = dloader.download(img_dict, saved_images_dir)
    dest_text = Text(f"Images downloaded to {dest}.")
    dest_text.stylize("green")
    console.print(dest_text)
# Run the downloader only when this file is executed as a script.
if __name__ == "__main__":
    main()