Coverage for src/artemis_sg/img_downloader.py: 66%

79 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-10-05 09:33 -0700

1#!/usr/bin/env python 

2# -*- coding: utf-8 -*- 

3 

4import json 

5import logging 

6import os 

7import tempfile 

8 

9import isbnlib 

10import puremagic 

11import requests 

12 

13from artemis_sg.config import CFG 

14 

# Bare name of this module (file name without directory or extension);
# used as the prefix of log-message namespaces.
MODULE = os.path.basename(os.path.splitext(__file__)[0])

16 

17 

class ImgDownloader:
    """Download images and save them under ISBN-based file names."""

    # Seconds to wait on the remote server before abandoning a request.
    # Without a timeout a single stalled host would block the whole batch.
    REQUEST_TIMEOUT = 30

    def is_image(self, path):
        """Check given filepath to see if it is an image.

        Returns the extension reported by puremagic (".jpg" or ".png") when
        the file is a supported image type, otherwise None.
        """
        try:
            kind = puremagic.from_file(path)
        except (puremagic.main.PureError, ValueError):
            # PureError: no known signature matched.
            # ValueError: puremagic rejects empty input.
            return None
        return kind if kind in (".jpg", ".png") else None

    def download(self, image_dict, target_dir=""):
        """Download every URL in image_dict into target_dir.

        image_dict maps a key (ideally an ISBN) to a sequence of image URLs.
        The first image of a key is saved as "<isbn>.<ext>", subsequent ones
        as "<isbn>-<n>.<ext>"; keys that cannot be converted to ISBN-13 are
        used verbatim.  Files that already exist as valid images are not
        fetched again.  URLs that cannot be fetched or that do not yield a
        supported image are skipped with a warning.

        When target_dir is empty a temporary directory is created.  Returns
        the directory the images were written to.
        """
        namespace = f"{type(self).__name__}.{self.download.__name__}"

        if not target_dir:
            target_dir = tempfile.mkdtemp(prefix="ImgDownloader-")
            logging.warning(f"{namespace}: Creating target directory at {target_dir}")
        # makedirs also creates missing parent directories.
        os.makedirs(target_dir, exist_ok=True)

        for key in image_dict:
            # The ISBN depends only on the key, so resolve it once per key.
            isbn = isbnlib.to_isbn13(key) or key
            for i, url in enumerate(image_dict[key]):
                suffix = f"-{i}" if i else ""
                stem = os.path.join(target_dir, f"{isbn}{suffix}")

                image_path = self._find_existing(stem)
                if image_path is None:
                    image_path = stem + ".jpg"
                    logging.debug(f"{namespace}: Downloading '{url}' to '{target_dir}'")
                    if not self._fetch(url, image_path, namespace):
                        continue

                # validate file and name it in accordance with its type
                fmt = self.is_image(image_path)
                if fmt is None:
                    os.remove(image_path)
                    logging.warning(
                        f"{namespace}: Skipping unsupported file type in '{url}'"
                    )
                    continue
                final_path = stem + fmt
                if final_path != image_path:
                    # os.replace overwrites an existing target on every
                    # platform, whereas os.rename fails on Windows.
                    os.replace(image_path, final_path)
                logging.info(f"{namespace}: Saved '{final_path}'")

        return target_dir

    def _find_existing(self, stem):
        """Return the path of a valid, previously saved image for stem, else None."""
        for ext in (".jpg", ".png"):
            candidate = stem + ext
            if os.path.isfile(candidate) and self.is_image(candidate):
                return candidate
        return None

    def _fetch(self, url, image_path, namespace):
        """Fetch url and write its body to image_path; return True on success."""
        # Request first, open the file afterwards: a failed request must not
        # leave an empty file behind.
        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        except requests.exceptions.RequestException as err:
            logging.warning(f"{namespace}: Unable to download '{url}': {err}")
            return False
        with open(image_path, "wb") as fp:
            fp.write(response.content)
        return True

73 

74 

def main():
    """Download all images referenced by the scraped data file.

    The scraped-data file path and the image directory are read from CFG.
    Each top-level key of the JSON data is mapped to its "image_urls" list
    and handed to ImgDownloader.download().  The destination directory is
    printed when the run finishes.
    """
    scraped_datafile = CFG["asg"]["data"]["file"]["scraped"]
    saved_images_dir = CFG["asg"]["data"]["dir"]["images"]

    def get_json_data_from_file(datafile):
        """Return the parsed JSON content of datafile, or {} if unusable."""
        namespace = f"{MODULE}.main.{get_json_data_from_file.__name__}"
        try:
            # JSON is UTF-8 by specification; do not rely on the locale default.
            with open(datafile, encoding="utf-8") as filepointer:
                return json.load(filepointer)
        except FileNotFoundError:
            logging.error(f"{namespace}: Datafile '{datafile}' not found")
        except json.decoder.JSONDecodeError:
            logging.error(
                f"{namespace}: Datafile '{datafile}' did not contain valid JSON"
            )
        return {}

    def get_image_url_dict(data):
        """Map each key of data to its list of image URLs."""
        # Records without an "image_urls" entry contribute no downloads
        # instead of aborting the whole run with a KeyError.
        return {key: data[key].get("image_urls", []) for key in data}

    scraped_data = get_json_data_from_file(scraped_datafile)
    img_dict = get_image_url_dict(scraped_data)
    # download() creates the directory itself when it does not exist yet.
    dest = ImgDownloader().download(img_dict, saved_images_dir)
    print(f"Images downloaded to {dest}.")

109 

110 

# Run the downloader only when this file is executed as a script.
if __name__ == "__main__":
    main()