Coverage for src/m6rclib/metaphor_parser.py: 100%

216 statements  

« prev     ^ index     » next       coverage.py v7.6.1, created at 2024-11-13 18:02 +0000

1# Copyright 2024 M6R Ltd. 

2# 

3# Licensed under the Apache License, Version 2.0 (the "License"); 

4# you may not use this file except in compliance with the License. 

5# You may obtain a copy of the License at 

6# 

7# http://www.apache.org/licenses/LICENSE-2.0 

8# 

9# Unless required by applicable law or agreed to in writing, software 

10# distributed under the License is distributed on an "AS IS" BASIS, 

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

12# See the License for the specific language governing permissions and 

13# limitations under the License. 

14 

15import glob 

16import os 

17from pathlib import Path 

18 

19from typing import List, Set, Optional, Union 

20 

21from .metaphor_token import Token, TokenType 

22from .embed_lexer import EmbedLexer 

23from .metaphor_lexer import MetaphorLexer 

24from .metaphor_ast_node import MetaphorASTNode, MetaphorASTNodeType 

25 

class MetaphorParserFileAlreadyUsedError(Exception):
    """
    Signal that a file was requested again after it had already been loaded.

    Attributes:
        filename (str): The file name that was requested a second time.
        token (Token): The token supplied by the raiser alongside the file name.
    """
    def __init__(self, filename: str, token: Token) -> None:
        # Record the context first, then hand the formatted text to Exception.
        self.filename: str = filename
        self.token: Token = token
        description = f"The file '{filename}' has already been used."
        super().__init__(description)

32 

33 

class MetaphorParserSyntaxError(Exception):
    """
    Describe a single syntax error together with its source location.

    Attributes:
        message (str): Description of the problem.
        filename (str): File in which the problem was found.
        line (int): Line number of the problem.
        column (int): Column number of the problem.
        input_text (str): Source text associated with the problem.
    """
    def __init__(self, message: str, filename: str, line: int, column: int, input_text: str) -> None:
        self.message: str = message
        self.filename: str = filename
        self.line: int = line
        self.column: int = column
        self.input_text: str = input_text

        # NOTE(review): the summary ends with a dangling ", " and does not
        # include input_text; it is reproduced unchanged here.
        summary = f"{message}: file: {filename}, line {line}, column {column}, "
        super().__init__(summary)

43 

44 

class MetaphorParserError(Exception):
    """
    Aggregate exception carrying every syntax error gathered by the parser.

    Attributes:
        errors (List[MetaphorParserSyntaxError]): The individual syntax errors.
    """
    def __init__(self, message: str, errors: List[MetaphorParserSyntaxError]) -> None:
        # Keep a reference to the caller's list (no copy is made).
        self.errors: List[MetaphorParserSyntaxError] = errors
        super().__init__(message)

50 

51 

class MetaphorParser:
    """
    Parser class to process tokens and build an Abstract Syntax Tree (AST).

    Attributes:
        syntax_tree (MetaphorASTNode): The root node of the AST.
        parse_errors (List[MetaphorParserSyntaxError]): List of syntax errors encountered during parsing.
        lexers (List[Union[MetaphorLexer, EmbedLexer]]): Stack of lexers used for parsing multiple
            files; the last entry is the one currently being read.
        previously_seen_files (Set[str]): Set of canonical filenames already processed.
        search_paths (List[str]): List of paths to search for included files.
        current_token (Optional[Token]): The most recent token read from a lexer.
    """
    def __init__(self) -> None:
        self.syntax_tree: MetaphorASTNode = MetaphorASTNode(MetaphorASTNodeType.ROOT, "")
        self.parse_errors: List[MetaphorParserSyntaxError] = []
        self.lexers: List[Union[MetaphorLexer, EmbedLexer]] = []
        self.previously_seen_files: Set[str] = set()
        self.search_paths: List[str] = []
        self.current_token: Optional[Token] = None

    def _insert_preamble_text(self, text: str) -> None:
        """Attach one line of preamble text to the root of the AST."""
        self.syntax_tree.attach_child(MetaphorASTNode(MetaphorASTNodeType.TEXT, text))

    @staticmethod
    def _preamble_lines() -> List[str]:
        """Return the fixed explanatory preamble, one list entry per output line."""
        return [
            "The following is written in a language called Metaphor.",
            "",
            "Metaphor has the structure of a document tree with branches and leaves being prefixed",
            "by the keywords \"Role:\", \"Context:\" or \"Action:\". Each of these indicates the",
            "start of a new block of information.",
            "",
            "Blocks have an optional section name that will immediately follow them on the same line.",
            "If this is missing then the section name is not defined.",
            "",
            "After a keyword line there may be one or more lines of text that will describe the purpose",
            "of that block. A block may also include one or more optional child blocks inside them and",
            "that further clarify their parent block.",
            "",
            "The indentation of the blocks indicates where in the tree the pieces appear. For example a",
            "\"Context:\" indented by 8 spaces is a child of the context above it that is indented by 4",
            "spaces. One indented 12 spaces would be a child of the block above it that is indented by",
            "8 spaces.",
            "",
            "If you are presented with code or document fragments inside a block delimited by 3",
            "backticks then please pay close attention to the indentation level of the opening set of",
            "backticks. Please remove this amount of whitespace from the start of each line of the",
            "enclosed text. In the following example, even though \"text line 1\" is indented by",
            "4 spaces, you should remove these 4 spaces because the backticks are also indented by",
            "4 spaces. You should also remove 4 spaces from \"text line 2\" because of this",
            "backtick indentation, but leave the remaining 2 spaces:",
            # The example must really carry the 4/4/6/4-space indentation the
            # text above describes, otherwise the instructions contradict it.
            "    ```plaintext",
            "    text line 1",
            "      text line 2",
            # The trailing comma matters: without it Python concatenates this
            # literal with the next one and the blank separator line is lost.
            "    ```",
            "",
            "If a \"Role:\" block exists then this is the role you should fulfil.",
            "",
            "\"Context:\" blocks provide context necessary to understand what you will be asked to do.",
            "",
            "An \"Action:\" block describes the task I would like you to do.",
            "",
            "When you process the actions please carefully ensure you do all of them accurately. These",
            "need to fulfil all the details described in the \"Context:\". Ensure you complete all the",
            "elements and do not include any placeholders.",
            "",
        ]

    def _generate_preamble(self) -> None:
        """Attach every preamble line to the root of the AST as a text node."""
        for text in self._preamble_lines():
            self._insert_preamble_text(text)

    def parse(self, input_text: str, filename: str, search_paths: List[str]) -> MetaphorASTNode:
        """
        Parse an input string and construct the AST.

        Args:
            input_text (str): The text to be parsed.
            filename (str): The name of the file being parsed.
            search_paths (List[str]): List of paths to search for included files.

        Returns:
            MetaphorASTNode: The root node of the AST. Its children are the preamble text
                nodes followed by the top-level blocks in the order they were parsed.

        Raises:
            MetaphorParserError: If any syntax error was recorded, if an included or
                embedded file cannot be read, or if a file is included more than once.
        """
        self.search_paths = search_paths

        try:
            self.lexers.append(MetaphorLexer(input_text, filename))
            self._generate_preamble()

            # Each top-level keyword maps to its display name and block parser.
            top_level = {
                TokenType.ACTION: ("Action", self._parse_action),
                TokenType.CONTEXT: ("Context", self._parse_context),
                TokenType.ROLE: ("Role", self._parse_role),
            }
            seen_block_types = set()

            while True:
                token = self.get_next_token()
                if token.type == TokenType.END_OF_FILE:
                    if self.parse_errors:
                        raise MetaphorParserError("parser error", self.parse_errors)

                    return self.syntax_tree

                handler = top_level.get(token.type)
                if handler is None:
                    self._record_syntax_error(token, f"Unexpected token: {token.value} at top level")
                    continue

                keyword, parse_block = handler
                if token.type in seen_block_types:
                    self._record_syntax_error(token, f"'{keyword}' already defined")

                # A duplicate block is still parsed and attached so that any
                # errors inside it are reported too.
                self.syntax_tree.attach_child(parse_block(token))
                seen_block_types.add(token.type)
        except FileNotFoundError as e:
            err_token = self.current_token
            if err_token is None:
                # No token has been read yet, so there is no location to report.
                error = MetaphorParserSyntaxError(f"{e}", filename, 0, 0, "")
            else:
                error = MetaphorParserSyntaxError(
                    f"{e}", err_token.filename, err_token.line, err_token.column, err_token.input
                )

            self.parse_errors.append(error)
            raise MetaphorParserError("parser error", self.parse_errors) from e
        except MetaphorParserFileAlreadyUsedError as e:
            self.parse_errors.append(MetaphorParserSyntaxError(
                f"The file '{e.filename}' has already been used",
                e.token.filename,
                e.token.line,
                e.token.column,
                e.token.input
            ))
            raise MetaphorParserError("parser error", self.parse_errors) from e

    def parse_file(self, filename: str, search_paths: List[str]) -> MetaphorASTNode:
        """
        Parse a file and construct the AST.

        Args:
            filename (str): The path to the file to be parsed.
            search_paths (List[str]): List of paths to search for included files.

        Returns:
            MetaphorASTNode: The root node of the AST.

        Raises:
            MetaphorParserError: If the file cannot be read or there are syntax errors
                during parsing.
            MetaphorParserFileAlreadyUsedError: If this parser has already loaded
                `filename`.
        """
        self._check_file_not_loaded(filename)

        try:
            input_text = self._read_file(filename)
        except FileNotFoundError as e:
            self.parse_errors.append(MetaphorParserSyntaxError(
                f"{e}", "", 0, 0, ""
            ))
            raise MetaphorParserError("parser error", self.parse_errors) from e

        # parse() already reports every failure as MetaphorParserError, so it
        # needs no further wrapping here.
        return self.parse(input_text, filename, search_paths)

    def get_next_token(self) -> Token:
        """
        Get the next token from the active lexer.

        Include and Embed tokens are consumed here: they push new lexers onto the
        stack and are never returned. A lexer that reaches its end is popped and
        reading resumes from the one beneath it. Once the stack is empty a
        synthetic END_OF_FILE token is returned.
        """
        while self.lexers:
            lexer = self.lexers[-1]
            token = lexer.get_next_token()
            self.current_token = token

            if token.type == TokenType.INCLUDE:
                self._parse_include()
            elif token.type == TokenType.EMBED:
                self._parse_embed()
            elif token.type == TokenType.END_OF_FILE:
                self.lexers.pop()
            else:
                return token

        return Token(TokenType.END_OF_FILE, "", "", "", 0, 0)

    def _record_syntax_error(self, token: Token, message: str) -> None:
        """Record a syntax error at the token's location; parsing continues."""
        error = MetaphorParserSyntaxError(
            message, token.filename, token.line, token.column, token.input
        )
        self.parse_errors.append(error)

    def _find_file_path(self, filename: str) -> str:
        """
        Try to find a valid path for a file, given all the search path options.

        Returns:
            str: `filename` itself if it exists, otherwise the first existing
                search-path candidate.

        Raises:
            FileNotFoundError: If no candidate exists.
        """
        if Path(filename).exists():
            return filename

        # If we don't have an absolute path then we can try search paths.
        if not os.path.isabs(filename):
            for path in self.search_paths:
                try_name = os.path.join(path, filename)
                if Path(try_name).exists():
                    return try_name

        raise FileNotFoundError(f"File not found: {filename}")

    def _read_file(self, filename: str) -> str:
        """
        Read file content into memory as UTF-8 text.

        Raises:
            FileNotFoundError: For every OS-level failure. Callers rely on this
                single exception type to convert read failures into parser errors.
        """
        try:
            with open(filename, 'r', encoding='utf-8') as file:
                return file.read()
        except FileNotFoundError as e:
            raise FileNotFoundError(f"File not found: {filename}") from e
        except PermissionError as e:
            raise FileNotFoundError(f"You do not have permission to access: {filename}") from e
        except IsADirectoryError as e:
            raise FileNotFoundError(f"Is a directory: {filename}") from e
        except OSError as e:
            raise FileNotFoundError(f"OS error: {e}") from e

    def _check_file_not_loaded(self, filename: str) -> None:
        """
        Check we have not already loaded a file, then remember it.

        Raises:
            MetaphorParserFileAlreadyUsedError: If the canonical path of
                `filename` has been seen before.
        """
        canonical_filename = os.path.realpath(filename)
        if canonical_filename in self.previously_seen_files:
            raise MetaphorParserFileAlreadyUsedError(filename, self.current_token)

        self.previously_seen_files.add(canonical_filename)

    def _parse_text(self, token: Token) -> MetaphorASTNode:
        """Parse a text block."""
        return MetaphorASTNode(MetaphorASTNodeType.TEXT, token.value)

    def _parse_block_label(self, token: Token, keyword: str) -> str:
        """
        Consume the optional label and the indent that open a keyword block.

        Args:
            token (Token): The keyword token; errors are reported at its location.
            keyword (str): Display name of the block used in error messages.

        Returns:
            str: The label text, or "" when the block has no label.
        """
        label_name = ""

        init_token = self.get_next_token()
        if init_token.type == TokenType.KEYWORD_TEXT:
            label_name = init_token.value
            indent_token = self.get_next_token()
            if indent_token.type != TokenType.INDENT:
                self._record_syntax_error(
                    token,
                    f"Expected indent after keyword description for '{keyword}' block"
                )
        elif init_token.type != TokenType.INDENT:
            self._record_syntax_error(token, f"Expected description or indent for '{keyword}' block")

        return label_name

    def _parse_text_only_block(
        self, token: Token, keyword: str, node_type: MetaphorASTNodeType
    ) -> MetaphorASTNode:
        """Parse a block whose body may contain only text lines."""
        node = MetaphorASTNode(node_type, self._parse_block_label(token, keyword))

        while True:
            token = self.get_next_token()
            if token.type == TokenType.TEXT:
                node.attach_child(self._parse_text(token))
            elif token.type in (TokenType.OUTDENT, TokenType.END_OF_FILE):
                return node
            else:
                self._record_syntax_error(
                    token,
                    f"Unexpected token: {token.value} in '{keyword}' block"
                )

    def _parse_action(self, token: Token) -> MetaphorASTNode:
        """Parse an action block and construct its AST node."""
        return self._parse_text_only_block(token, "Action", MetaphorASTNodeType.ACTION)

    def _parse_context(self, token: Token) -> MetaphorASTNode:
        """
        Parse a Context block.

        A Context block may hold text followed by nested Context blocks; text
        that appears after a nested block is recorded as an error but is still
        attached to the node.
        """
        context_node = MetaphorASTNode(
            MetaphorASTNodeType.CONTEXT, self._parse_block_label(token, "Context")
        )
        seen_child_context = False

        while True:
            token = self.get_next_token()
            if token.type == TokenType.TEXT:
                if seen_child_context:
                    self._record_syntax_error(token, "Text must come first in a 'Context' block")

                context_node.attach_child(self._parse_text(token))
            elif token.type == TokenType.CONTEXT:
                context_node.attach_child(self._parse_context(token))
                seen_child_context = True
            elif token.type in (TokenType.OUTDENT, TokenType.END_OF_FILE):
                return context_node
            else:
                self._record_syntax_error(token, f"Unexpected token: {token.value} in 'Context' block")

    def _parse_role(self, token: Token) -> MetaphorASTNode:
        """Parse a Role block."""
        return self._parse_text_only_block(token, "Role", MetaphorASTNodeType.ROLE)

    def _parse_include(self) -> None:
        """
        Parse an Include block and load the included file.

        Raises:
            FileNotFoundError: If the file cannot be located or read.
            MetaphorParserFileAlreadyUsedError: If the file was already loaded.
        """
        token_next = self.get_next_token()
        if token_next.type != TokenType.KEYWORD_TEXT:
            self._record_syntax_error(token_next, "Expected file name for 'Include'")
            return

        filename = token_next.value

        # Resolve through the search paths first, so the duplicate check sees
        # the file actually opened rather than the name as written.
        try_file = self._find_file_path(filename)
        self._check_file_not_loaded(try_file)
        input_text = self._read_file(try_file)
        self.lexers.append(MetaphorLexer(input_text, try_file))

    def _parse_embed(self) -> None:
        """
        Parse an Embed block and load the embedded file(s).

        The argument is a glob pattern; a pattern containing "**/" is matched
        recursively. Matching files are embedded in sorted path order.

        Raises:
            FileNotFoundError: If a matched path cannot be read.
        """
        token_next = self.get_next_token()
        if token_next.type != TokenType.KEYWORD_TEXT:
            self._record_syntax_error(token_next, "Expected file name or wildcard match for 'Embed'")
            return

        match = token_next.value
        recurse = "**/" in match

        files = glob.glob(match, recursive=recurse)
        if not files:
            self._record_syntax_error(token_next, f"{match} does not match any files for 'Embed'")
            return

        # glob order depends on the filesystem and the lexer stack is LIFO, so
        # push in descending order to have files read in ascending order.
        for path in sorted(files, reverse=True):
            input_text = self._read_file(path)
            self.lexers.append(EmbedLexer(input_text, path))

409 input_text = self._read_file(file) 

410 self.lexers.append(EmbedLexer(input_text, file))