Coverage for src/artemis_sg/img_downloader.py: 66%
79 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-10-05 09:33 -0700
« prev ^ index » next coverage.py v7.3.2, created at 2023-10-05 09:33 -0700
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
4import json
5import logging
6import os
7import tempfile
9import isbnlib
10import puremagic
11import requests
13from artemis_sg.config import CFG
# Bare module name (no directory, no extension) derived from this file's path;
# main() uses it as the prefix of its log messages.
MODULE = os.path.splitext(os.path.split(__file__)[1])[0]
18class ImgDownloader:
19 def is_image(self, path):
20 """Check given filepath to see if it is an image.
21 If so, return extension type, else return None."""
22 try:
23 kind = puremagic.from_file(path)
24 except puremagic.main.PureError:
25 kind = None
26 if kind not in [".jpg", ".png"]:
27 kind = None
28 return kind
30 def download(self, image_dict, target_dir=""):
31 namespace = f"{type(self).__name__}.{self.download.__name__}"
33 if not target_dir:
34 target_dir = tempfile.mkdtemp(prefix="ImgDownloader-")
35 logging.warning(f"{namespace}: Creating target directory at {target_dir}")
36 if not os.path.isdir(target_dir): 36 ↛ 37line 36 didn't jump to line 37, because the condition on line 36 was never true
37 os.mkdir(target_dir)
39 for key in image_dict:
40 for i, url in enumerate(image_dict[key]):
41 isbn = isbnlib.to_isbn13(key)
42 if not isbn:
43 isbn = key
44 if i == 0:
45 suffix = ""
46 else:
47 suffix = f"-{i}"
48 image = f"{isbn}{suffix}.jpg"
49 image_path = os.path.join(target_dir, image)
50 if not os.path.isfile(image_path) or not self.is_image(image_path):
51 logging.debug(f"{namespace}: Downloading '{url}' to '{target_dir}'")
52 with open(image_path, "wb") as fp:
53 r = requests.get(url)
54 fp.write(r.content)
56 # validate file and name it in accordance with its type
57 fmt = self.is_image(image_path)
58 if fmt == ".jpg":
59 pass
60 elif fmt == ".png":
61 # rename file with png suffix
62 old_path = image_path
63 image_path = os.path.splitext(old_path)[0] + ".png"
64 os.rename(old_path, image_path)
65 else:
66 os.remove(image_path)
67 logging.warning(
68 f"{namespace}: Skipping unsupported file type in '{url}'"
69 )
70 logging.info(f"{namespace}: Saved '{image_path}")
72 return target_dir
def main():
    """Download all images referenced by the scraped datafile.

    The datafile path and the image directory are read from
    ``CFG["asg"]["data"]``.  The datafile is expected to be a JSON object
    whose values carry an ``"image_urls"`` list.  The directory the images
    ended up in is printed when the download finishes.
    """
    scraped_datafile = CFG["asg"]["data"]["file"]["scraped"]
    saved_images_dir = CFG["asg"]["data"]["dir"]["images"]

    dloader = ImgDownloader()

    def get_json_data_from_file(datafile):
        """Return the parsed JSON in *datafile*, or ``{}`` if it is unusable."""
        namespace = f"{MODULE}.main.{get_json_data_from_file.__name__}"
        try:
            # The ``with`` block closes the file; no explicit close() needed.
            with open(datafile, encoding="utf-8") as filepointer:
                return json.load(filepointer)
        except FileNotFoundError:
            logging.error(f"{namespace}: Datafile '{datafile}' not found")
        except json.decoder.JSONDecodeError:
            logging.error(
                f"{namespace}: Datafile '{datafile}' did not contain valid JSON"
            )
        return {}

    def get_image_url_dict(data):
        """Map each key of *data* to its list of image URLs."""
        # A record without "image_urls" contributes no URLs instead of
        # aborting the whole run with a KeyError.
        return {key: record.get("image_urls", []) for key, record in data.items()}

    scraped_data = get_json_data_from_file(scraped_datafile)
    img_dict = get_image_url_dict(scraped_data)
    dest = dloader.download(img_dict, saved_images_dir)
    print(f"Images downloaded to {dest}.")
# Run the downloader only when executed as a script, not when imported.
if __name__ == "__main__":
    main()