Coverage for src/artemis_sg/img_downloader.py: 87%

84 statements  

« prev     ^ index     » next       coverage.py v7.3.1, created at 2023-10-12 17:31 -0700

1#!/usr/bin/env python 

2 

3import json 

4import logging 

5import os 

6import tempfile 

7 

8import isbnlib 

9import puremagic 

10import requests 

11from rich.console import Console 

12from rich.text import Text 

13 

14from artemis_sg.config import CFG 

15 

# Base name of this file without its extension; used to build the namespace
# prefix of log messages emitted from module-level code.
MODULE = os.path.splitext(os.path.basename(__file__))[0]

# Shared rich console used to print user-facing status output.
console = Console()

18 

19 

class ImgDownloader:
    """Download images by URL and store them under ISBN-based file names."""

    def is_image(self, path):
        """Check given filepath to see if it is a supported image.

        Args:
            path: Path of the file to inspect.

        Returns:
            The detected extension (".jpg" or ".png") when the file is a
            supported image, otherwise None.
        """
        namespace = f"{type(self).__name__}.{self.is_image.__name__}"
        try:
            kind = puremagic.from_file(path)
        except (puremagic.main.PureError, ValueError):
            logging.warning(f"{namespace}: non-image file found")
            return None
        # Anything other than JPEG/PNG is treated as "not an image".
        return kind if kind in (".jpg", ".png") else None

    def download(self, image_dict, target_dir=""):
        """Download every URL in image_dict into target_dir.

        Files are named "<isbn>.jpg" for the first URL of a key and
        "<isbn>-<i>.jpg" for the following ones, where <isbn> is the ISBN-13
        form of the key when isbnlib can produce one and the raw key otherwise.
        A download detected as PNG is renamed with a ".png" extension; a
        download that is neither JPEG nor PNG is deleted.

        Args:
            image_dict: Mapping of key -> iterable of image URLs.
            target_dir: Destination directory. When empty, a temporary
                directory is created and used instead.

        Returns:
            The directory the images were written to.

        Raises:
            Whatever ``requests.get`` raises on a failed request, and
            ``OSError`` on file-system failures; neither is caught here.
        """
        namespace = f"{type(self).__name__}.{self.download.__name__}"

        if not target_dir:
            target_dir = tempfile.mkdtemp(prefix="ImgDownloader-")
            logging.warning(f"{namespace}: Creating target directory at {target_dir}")
        # makedirs (rather than mkdir) so a nested, not-yet-existing path works.
        os.makedirs(target_dir, exist_ok=True)

        for key, urls in image_dict.items():
            # The ISBN depends only on the key, so compute it once per key.
            isbn = isbnlib.to_isbn13(key) or key
            for i, url in enumerate(urls):
                suffix = "" if i == 0 else f"-{i}"
                image_path = os.path.join(target_dir, f"{isbn}{suffix}.jpg")

                # Skip URLs whose target file is already a valid image.
                if os.path.isfile(image_path) and self.is_image(image_path):
                    continue

                logging.debug(f"{namespace}: Downloading '{url}' to '{target_dir}'")
                # Fetch before opening the file so that a failed request does
                # not leave an empty file behind.
                response = requests.get(url, timeout=10)
                with open(image_path, "wb") as fp:
                    fp.write(response.content)

                # Validate the file and name it in accordance with its type.
                fmt = self.is_image(image_path)
                if fmt == ".png":
                    jpg_path = image_path
                    image_path = os.path.splitext(jpg_path)[0] + ".png"
                    # replace() also overwrites an existing target on Windows.
                    os.replace(jpg_path, image_path)
                elif fmt != ".jpg":
                    os.remove(image_path)
                    logging.warning(
                        f"{namespace}: Skipping unsupported file type in '{url}'"
                    )
                    # Nothing was kept, so do not report the file as saved.
                    continue
                logging.info(f"{namespace}: Saved '{image_path}'")

        return target_dir

74 

75 

def main():
    """Download all images referenced by the scraped datafile.

    Reads the scraped datafile path and the image directory from CFG, loads
    the scraped JSON data, collects the image URLs of every record, downloads
    them with ImgDownloader and prints the destination directory.
    """
    scraped_datafile = CFG["asg"]["data"]["file"]["scraped"]
    saved_images_dir = CFG["asg"]["data"]["dir"]["images"]

    dloader = ImgDownloader()

    def get_json_data_from_file(datafile):
        """Return the parsed JSON content of datafile, or {} when it is unusable."""
        namespace = f"{MODULE}.main.{get_json_data_from_file.__name__}"
        try:
            # The context manager closes the file; no explicit close() needed.
            with open(datafile) as filepointer:
                return json.load(filepointer)
        except FileNotFoundError:
            logging.error(f"{namespace}: Datafile '{datafile}' not found")
            return {}
        except json.decoder.JSONDecodeError:
            logging.error(
                f"{namespace}: Datafile '{datafile}' did not contain valid JSON"
            )
            return {}

    def get_image_url_dict(data):
        """Map every key of data to its list of image URLs.

        Records without an "image_urls" entry map to an empty list so that a
        single incomplete record does not abort the whole run.
        """
        return {key: record.get("image_urls", []) for key, record in data.items()}

    scraped_data = get_json_data_from_file(scraped_datafile)
    img_dict = get_image_url_dict(scraped_data)
    dest = dloader.download(img_dict, saved_images_dir)
    dest_text = Text(f"Images downloaded to {dest}.")
    dest_text.stylize("green")
    console.print(dest_text)

112 

113 

# Run the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()