Coverage for src/tyora/session.py: 90%

71 statements  

« prev     ^ index     » next       coverage.py v7.5.1, created at 2024-05-24 14:35 -0400

1import importlib.metadata 

2import logging 

3import os 

4import sys 

5from typing import AnyStr, Optional 

6from urllib.parse import urljoin 

7 

8import html5lib 

9import requests 

10from requests_toolbelt import user_agent 

11 

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Resolve the installed distribution's version; fall back to a placeholder
# when the "tyora" package metadata is not available.
try:
    __version__ = importlib.metadata.version("tyora")
except importlib.metadata.PackageNotFoundError:
    __version__ = "unknown"

18 

19 

class MoocfiCsesSession(requests.Session):
    """A ``requests.Session`` that can log into the site at ``base_url``.

    The session stores the credentials it is given and performs the login
    by scraping the site's HTML: it looks for the ``a.account`` link on the
    ``list`` page, follows it to the login page, fills in the first form
    found there and submits it.
    """

    def __init__(
        self,
        username: str,
        password: str,
        base_url: str,
        cookies: Optional[dict] = None,
        *args,
        **kwargs,
    ):
        """Create the session.

        Args:
            username: Login name; also used to detect a logged-in state.
            password: Password submitted with the login form.
            base_url: URL that the relative ``list`` path is resolved against.
            cookies: Optional cookies to preload into the session's cookie jar.
            *args: Passed through to ``requests.Session.__init__``.
            **kwargs: Passed through to ``requests.Session.__init__``.
        """
        super().__init__(*args, **kwargs)

        self.username = username
        self.password = password
        self.base_url = base_url

        if cookies:
            self.cookies.update(cookies)

        # User-Agent is built from the running script's basename and the
        # package version resolved at import time.
        self.headers.update(
            {"User-Agent": user_agent(os.path.basename(sys.argv[0]), __version__)}
        )

    @staticmethod
    def _log_response(res: requests.Response) -> None:
        """Log the URL, status code and body of *res* at debug level."""
        logger.debug(
            "url: %s, status: %s\nhtml:\n%s", res.url, res.status_code, res.text
        )

    @property
    def is_logged_in(self) -> bool:
        """Whether the ``list`` page shows ``username`` in its account link.

        Performs a GET request on every access.

        Raises:
            requests.HTTPError: If the ``list`` page returns an error status.
        """
        res = self.get(urljoin(self.base_url, "list"))
        res.raise_for_status()
        login_link = find_link(res.text, './/a[@class="account"]')
        # Both a missing link (empty dict) and an anchor without text
        # collapse to "", which cannot contain a non-empty username.
        login_text = login_link.get("text") or ""
        return self.username in login_text

    def login(self) -> None:
        """Log into the site using webscraping.

        Steps:
        - checks if already logged in
        - retrieves base URL
        - finds and retrieves login URL
        - finds and submits login form
        - checks if logged in

        Raises:
            ValueError: If the login link or login form cannot be found, or
                if the session is still not logged in after submitting.
            requests.HTTPError: If the ``list`` page returns an error status.
        """
        if self.is_logged_in:
            return

        res = self.get(urljoin(self.base_url, "list"))
        res.raise_for_status()
        login_link = find_link(res.text, './/a[@class="account"]')
        # Check the href itself: an anchor without one would otherwise make
        # urljoin() fall back to the list page URL.
        login_href = login_link.get("href")
        if not login_href:
            self._log_response(res)
            raise ValueError("Failed to find login url")
        login_url = urljoin(res.url, login_href)

        res = self.get(login_url, headers={"referer": res.url})
        login_form = parse_form(res.text, ".//form")
        if not login_form:
            self._log_response(res)
            raise ValueError("Failed to find login form")
        # "_action" is bookkeeping added by parse_form, not a form field.
        # A missing action (None) makes urljoin() return the page URL itself.
        action = login_form.pop("_action")

        login_form["session[login]"] = self.username
        login_form["session[password]"] = self.password

        post_res = self.post(
            url=urljoin(res.url, action),
            headers={"referer": res.url},
            data=login_form,
        )

        if not self.is_logged_in:
            # Log the response to the submission, not the earlier form page.
            self._log_response(post_res)
            raise ValueError("Login failed")

100 

101 

def find_link(html: AnyStr, xpath: str) -> dict[str, Optional[str]]:
    """Find the first element matching *xpath* in *html* and describe it.

    Returns:
        An empty dict when nothing matches. Otherwise a dict with two keys:
        ``"href"`` (the element's ``href`` attribute, or ``None`` if absent)
        and ``"text"`` (the element's ``.text``, which may be ``None``).
    """
    document = html5lib.parse(html, namespaceHTMLElements=False)
    match = document.find(xpath)
    if match is None:
        return {}

    return {"href": match.get("href"), "text": match.text}

113 

114 

def parse_form(html: AnyStr, xpath: str = ".//form") -> dict:
    """Extract the action and input values of the first form matching *xpath*.

    Args:
        html: HTML document to parse.
        xpath: Path expression locating the form element.

    Returns:
        An empty dict when no form matches. Otherwise a dict mapping each
        named ``<input>`` to its ``value`` (``""`` when the attribute is
        missing), plus the key ``"_action"`` holding the form's ``action``
        attribute (``None`` if absent). An input that is itself named
        ``_action`` overwrites that entry.
    """
    form_element = html5lib.parse(html, namespaceHTMLElements=False).find(xpath)
    if form_element is None:
        return {}

    form_data = {"_action": form_element.get("action")}
    for form_input in form_element.iter("input"):
        form_key = form_input.get("name")
        if not form_key:
            # Controls without a name are not part of a form submission;
            # storing them under "" would post a bogus empty-named field.
            continue
        form_data[form_key] = form_input.get("value") or ""

    return form_data

125 

126 return form_data