Coverage for src/artemis_sg/img_downloader.py: 86%

93 statements  

« prev     ^ index     » next       coverage.py v7.3.1, created at 2024-03-06 08:01 -0800

1#!/usr/bin/env python 

2 

3import json 

4import logging 

5import os 

6import tempfile 

7 

8import isbnlib 

9import puremagic 

10import requests 

11from rich.console import Console 

12from rich.text import Text 

13 

14from artemis_sg.config import CFG 

15 

# Name of this module: the file's basename with its extension stripped.
# Used as a prefix in log messages emitted from main().
MODULE = os.path.splitext(os.path.basename(__file__))[0]

# Module-level rich Console that main() prints its status message to.
console = Console()

18 

19 

class ImgDownloader:
    """Download images from URLs into a directory and validate the results.

    Every downloaded file is inspected with ``puremagic``; only ".jpg" and
    ".png" files no larger than ``MAX_FILESIZE`` bytes are kept.
    """

    # constants
    MAX_FILESIZE = 1048576  # 1 MB

    def is_image(self, path):
        """Check given filepath to see if it is an image.

        Args:
            path: Path of the file to inspect.

        Returns:
            The extension reported by ``puremagic.from_file`` when it is
            ".jpg" or ".png"; otherwise None.
        """
        namespace = f"{type(self).__name__}.{self.is_image.__name__}"
        try:
            kind = puremagic.from_file(path)
        except (puremagic.main.PureError, ValueError):
            logging.warning(f"{namespace}: non-image file found")
            return None
        return kind if kind in (".jpg", ".png") else None

    def download(self, image_dict, target_dir=""):
        """Download every URL in *image_dict* into *target_dir*.

        Files are named ``<isbn>.jpg`` for the first URL of a key and
        ``<isbn>-<i>.jpg`` for the following ones, where ``<isbn>`` is the
        ISBN-13 form of the key when ``isbnlib`` can produce one and the key
        itself otherwise.  PNG payloads are renamed with a ".png" suffix.
        A URL is skipped when a valid image already exists at its ".jpg"
        path, when the request fails, when the payload is not a JPEG/PNG,
        or when it exceeds ``MAX_FILESIZE``.

        Args:
            image_dict: Mapping of key (ISBN or other identifier) to an
                iterable of image URLs.
            target_dir: Destination directory.  Created (including missing
                parents) when absent; a temporary directory is created when
                this is empty.

        Returns:
            The directory the images were written to.
        """
        namespace = f"{type(self).__name__}.{self.download.__name__}"

        if not target_dir:
            target_dir = tempfile.mkdtemp(prefix="ImgDownloader-")
            logging.warning(f"{namespace}: Creating target directory at {target_dir}")
        # makedirs rather than mkdir so a nested, not-yet-existing path works.
        os.makedirs(target_dir, exist_ok=True)

        for key, urls in image_dict.items():
            # The ISBN depends only on the key, so resolve it once per key.
            isbn = isbnlib.to_isbn13(key) or key
            for i, url in enumerate(urls):
                suffix = "" if i == 0 else f"-{i}"
                image_path = os.path.join(target_dir, f"{isbn}{suffix}.jpg")
                if os.path.isfile(image_path) and self.is_image(image_path):
                    continue  # a valid image is already in place

                logging.debug(f"{namespace}: Downloading '{url}' to '{target_dir}'")
                if not self._fetch(url, image_path, namespace):
                    continue
                kept_path = self._validate(image_path, url, namespace)
                # Only report a save when the file survived validation.
                if kept_path:
                    logging.info(f"{namespace}: Saved '{kept_path}'")

        return target_dir

    def _fetch(self, url, image_path, namespace):
        """Write the response body of *url* to *image_path*.

        The request is made before the file is opened so that a failed
        request does not leave an empty file behind.

        Returns:
            True when the body was written, False when the request failed.
        """
        try:
            response = requests.get(url, timeout=10)
        except requests.RequestException as err:
            # One unreachable URL should not abort the rest of the batch.
            logging.warning(f"{namespace}: Skipping '{url}', request failed: {err}")
            return False
        with open(image_path, "wb") as fp:
            fp.write(response.content)
        return True

    def _validate(self, image_path, url, namespace):
        """Check type and size of a freshly downloaded file.

        Renames PNG files to carry a ".png" suffix and removes files that
        are of an unsupported type or larger than ``MAX_FILESIZE``.

        Returns:
            The final path of the kept file, or None when it was removed.
        """
        fmt = self.is_image(image_path)
        if fmt == ".png":
            # name the file in accordance with its type
            png_path = os.path.splitext(image_path)[0] + ".png"
            if os.path.isfile(png_path):
                logging.warning(
                    f"{namespace}: Overwriting existing file "
                    f"'{png_path}'."
                )
            os.replace(image_path, png_path)
            image_path = png_path
        elif fmt != ".jpg":
            os.remove(image_path)
            logging.warning(
                f"{namespace}: Skipping unsupported file type in '{url}'"
            )
            return None

        if os.path.getsize(image_path) > self.MAX_FILESIZE:
            os.remove(image_path)
            logging.warning(
                f"{namespace}: Skipping file too large at '{url}'"
            )
            return None
        return image_path

91 

92 

def main():
    """Download the images referenced by the scraped datafile.

    Reads the scraped-data file path and the image directory from ``CFG``,
    collects the ``image_urls`` list of every record in the datafile,
    passes them to ``ImgDownloader.download`` and prints the directory the
    images were downloaded to.
    """
    scraped_datafile = CFG["asg"]["data"]["file"]["scraped"]
    saved_images_dir = CFG["asg"]["data"]["dir"]["images"]

    dloader = ImgDownloader()

    def get_json_data_from_file(datafile):
        """Return the parsed JSON content of *datafile*, or {} on failure."""
        namespace = f"{MODULE}.main.{get_json_data_from_file.__name__}"
        try:
            # The with-block closes the file; no explicit close() is needed.
            with open(datafile) as filepointer:
                return json.load(filepointer)
        except FileNotFoundError:
            logging.error(f"{namespace}: Datafile '{datafile}' not found")
        except json.decoder.JSONDecodeError:
            logging.error(
                f"{namespace}: Datafile '{datafile}' did not contain valid JSON"
            )
        return {}

    def get_image_url_dict(data):
        """Map each key of *data* to its record's list of image URLs."""
        # A record without "image_urls" contributes an empty list instead of
        # raising KeyError and aborting the whole run.
        return {key: record.get("image_urls") or [] for key, record in data.items()}

    scraped_data = get_json_data_from_file(scraped_datafile)
    img_dict = get_image_url_dict(scraped_data)
    # download() creates saved_images_dir itself when it does not exist.
    dest = dloader.download(img_dict, saved_images_dir)
    dest_text = Text(f"Images downloaded to {dest}.")
    dest_text.stylize("green")
    console.print(dest_text)

129 

130 

# Run the downloader when this file is executed as a script.
if __name__ == "__main__":
    main()