Coverage for ezdag/layers.py: 92.6%

216 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-29 15:59 -0700

1# Copyright (C) 2020 Patrick Godwin 

2# 

3# This Source Code Form is subject to the terms of the Mozilla Public License, v2.0. 

4# If a copy of the MPL was not distributed with this file, You can obtain one at 

5# <https://mozilla.org/MPL/2.0/>. 

6# 

7# SPDX-License-Identifier: MPL-2.0 

8 

9from __future__ import annotations 

10 

11import itertools 

12import os 

13import re 

14import shutil 

15import warnings 

16from collections.abc import Iterable 

17from dataclasses import dataclass, field 

18from typing import Any, Dict, List, Optional, Tuple, Union 

19 

20# disable warnings when condor config source is not found 

21with warnings.catch_warnings(): 

22 warnings.simplefilter("ignore", UserWarning) 

23 import htcondor 

24 from htcondor import dags 

25 

26from . import path 

27from .format import format_submit_description 

28from .options import PROTECTED_CONDOR_VARS, Argument, Literal, Option 

29 

30 

@dataclass
class Layer:
    """Defines a single layer (or set of related jobs) in an HTCondor DAG.

    Stores submit configuration for a set of nodes as well as
    providing functionality to determine the parent-child
    relationships between nodes.

    Parameters
    ----------
    executable
        The path of the executable to run.
    name
        The human-readable name of this node. Defaults to the basename
        of the executable if not given.
    universe
        The execution environment for a job. Defaults to 'vanilla'.
    log_dir
        The directory in which logs will be written to. Defaults to ./logs.
    retries
        The number of retries given for a job. Defaults to 3.
    transfer_files
        Whether to leverage Condor file transfer for moving around
        files. On by default.
    submit_description
        The submit descriptors representing this set of jobs.
    requirements
        The submit descriptors representing this set of jobs.
        Deprecated in favor of submit_description to avoid confusion,
        as 'requirements' refers to a specific submit descriptor.
        This option will be removed in a future release.
    nodes
        The nodes representing the layer. Nodes can be passed upon
        instantiation or added to the layer after the fact via
        Layer.append(node), Layer.extend(nodes), or Layer += node.

    Attributes
    ----------
    inputs
        Maps each input name to the list of ``argument`` values collected
        from every node appended to this layer.
    outputs
        Maps each output name to the list of ``argument`` values collected
        from every node appended to this layer.

    """

    executable: str
    name: str = ""
    universe: str = "vanilla"
    log_dir: str = "logs"
    retries: int = 3
    transfer_files: bool = True
    requirements: dict = field(default_factory=dict)
    submit_description: Union[dict, htcondor.Submit] = field(default_factory=dict)
    nodes: list = field(default_factory=list)
    inputs: dict = field(init=False, default_factory=dict)
    outputs: dict = field(init=False, default_factory=dict)

    def __post_init__(self) -> None:
        """Fill in defaults and register any nodes passed at construction."""
        if not self.name:
            self.name = os.path.basename(self.executable)
        if self.requirements:
            self.submit_description.update(self.requirements)
            warnings.warn(
                "requirements has been deprecated in favor of submit_description "
                "to avoid confusion and will be removed in a future release",
                DeprecationWarning,
                stacklevel=2,
            )

        # append() pushes onto self.nodes, so the nodes given at construction
        # must be detached first; iterating self.nodes while appending to it
        # would never terminate.
        initial_nodes = list(self.nodes)
        self.nodes = []
        self.extend(initial_nodes)

    def config(
        self,
        formatter: Optional[dags.NodeNameFormatter] = None,
    ) -> Dict[str, Any]:
        """Generates a layer configuration.

        This configuration can be passed directly into an
        htcondor.dags.NodeLayer if desired.

        Parameters
        ----------
        formatter : htcondor.dags.NodeNameFormatter
            Defines how the node names are defined and formatted. Defaults to a
            hex-based formatter with 5 digits.

        Returns
        -------
        dict
            The layer's name, submit description, per-node variables and
            number of retries.

        """
        # check that nodes are valid
        self.validate()

        # update submit description with defaults + other layer configuration
        submit_description = self._update_submit_defaults(self.submit_description)

        if not formatter:
            formatter = HexFormatter()
        return {
            "name": self.name,
            "submit_description": submit_description,
            "vars": self._vars(formatter),
            "retries": self.retries,
        }

    def append(self, node: Node) -> None:
        """Append a node to this layer, recording its inputs and outputs."""
        assert isinstance(node.inputs, list)
        assert isinstance(node.outputs, list)
        for input_ in node.inputs:
            self.inputs.setdefault(input_.name, []).append(input_.argument)
        for output in node.outputs:
            self.outputs.setdefault(output.name, []).append(output.argument)
        self.nodes.append(node)

    def extend(self, nodes: Iterable[Node]) -> None:
        """Append multiple nodes to this layer."""
        for node in nodes:
            self.append(node)

    def __iadd__(self, nodes: Union[Node, Iterable[Node]]) -> Layer:
        """Support ``layer += node`` and ``layer += [node, ...]``."""
        if isinstance(nodes, Iterable):
            self.extend(nodes)
        else:
            self.append(nodes)
        return self

    def new(self) -> Layer:
        """Create an identical layer without any nodes attached."""
        # NOTE: submit_description is passed through as-is, so the new layer
        # refers to the same object as this one.
        return self.__class__(
            executable=self.executable,
            name=self.name,
            universe=self.universe,
            log_dir=self.log_dir,
            retries=self.retries,
            transfer_files=self.transfer_files,
            submit_description=self.submit_description,
        )

    def validate(self) -> None:
        """Ensure all nodes in this layer are consistent with each other.

        Raises
        ------
        AssertionError
            If the layer has no nodes, or if any node's argument, input,
            output or variable names differ from those of the first node.
        ValueError
            If a node variable uses a protected condor name.

        """
        assert self.nodes, "at least one node must be connected to this layer"

        reference = self.nodes[0]
        # every node after the first (including the last one) is compared
        # against the first node
        others = self.nodes[1:]

        # check arg and input/output names across nodes are equal
        for attr in ("arguments", "inputs", "outputs"):
            expected = [arg.name for arg in getattr(reference, attr)]
            for node in others:
                found = [arg.name for arg in getattr(node, attr)]
                assert expected == found, (
                    f"nodes in layer {self.name} have inconsistent {attr} names"
                )

        # check meta-parameters (equality, name validity)
        variables = list(reference.variables.keys())
        for node in others:
            assert variables == list(node.variables.keys()), (
                f"nodes in layer {self.name} have inconsistent variable names"
            )
        for var in variables:
            if var in PROTECTED_CONDOR_VARS:
                msg = f"{var} is a protected condor name for node {self.name}"
                raise ValueError(msg)

    def command(self, node: Node, *, readjust_paths: bool = True) -> str:
        """Given a node, return the command that would be run.

        Parameters
        ----------
        node : Node
            The node to return the command for.
        readjust_paths : bool
            Determines whether path locations are readjusted based on
            the command that would be run on the node's execute point.
            This only has an effect if using file transfer. Default is True.

        Returns
        -------
        str
            The executable followed by its fully substituted arguments.

        """
        # turn condor-style $(name) placeholders into str.format {name} fields
        args = re.sub(r"\$\((\w+?)\)", r"{\1}", self._arguments())

        # extract node variables
        adjust = self.transfer_files and readjust_paths
        node_vars = {arg.condor_name: arg.vars() for arg in node.arguments}
        for arg in node.inputs:
            if adjust:
                node_vars[arg.condor_name] = arg.vars(basename=path.is_abs_or_url)
            else:
                node_vars[arg.condor_name] = arg.vars()
        for arg in node.outputs:
            node_vars[arg.condor_name] = arg.vars(basename=adjust)
        return self.executable + " " + args.format(**node_vars)

    @property
    def has_dependencies(self) -> bool:
        """Check if any of the nodes in this layer have dependencies."""
        return any(node.requires for node in self.nodes)

    def _arguments(self) -> str:
        """Build the submit 'arguments' string from the first node's layout."""
        first = self.nodes[0]
        args = [
            arg.vars() if arg.static else f"$({arg.condor_name})"
            for arg in first.arguments
        ]
        io_args = []
        io_opts = []
        for arg in itertools.chain(first.inputs, first.outputs):
            if arg.static:
                basename = path.is_abs_or_url if self.transfer_files else False
                io_args.append(arg.vars(basename=basename))
            elif not arg.suppress:
                if isinstance(arg, Argument):
                    io_args.append(f"$({arg.condor_name})")
                else:
                    io_opts.append(f"$({arg.condor_name})")
        # plain arguments first, then I/O options, then positional I/O arguments
        return " ".join(itertools.chain(args, io_opts, io_args))

    def _inputs(self) -> str:
        """Build the comma-separated list of input files to transfer."""
        return ",".join(
            arg.files() if arg.static else f"$(input_{arg.condor_name})"
            for arg in self.nodes[0].inputs
        )

    def _outputs(self) -> str:
        """Build the comma-separated list of output files to transfer."""
        return ",".join(
            arg.files(basename=path.is_abs_or_url)
            if arg.static
            else f"$(output_{arg.condor_name})"
            for arg in self.nodes[0].outputs
        )

    def _output_remaps(self) -> str:
        """Build the semicolon-separated list of output file remaps."""
        return ";".join(
            arg.remaps() if arg.static else f"$(output_{arg.condor_name}_remap)"
            for arg in self.nodes[0].outputs
        )

    def _vars(self, formatter: dags.NodeNameFormatter) -> List[Dict[str, str]]:
        """Build the per-node variables, one dictionary for each node."""
        allvars = []
        for i, node in enumerate(self.nodes):
            nodevars = {
                "nodename": formatter.generate(self.name, i),
                "log_dir": self.log_dir,
                **node.variables,
            }

            # add arguments which aren't suppressed
            for arg in node.arguments:
                if not arg.static and not arg.suppress:
                    nodevars[arg.condor_name] = arg.vars()

            # then add arguments defined as 'inputs'. if file transfer is enabled,
            # also define the $(input_{arg}) variable containing the files
            for arg in node.inputs:
                if not arg.static:
                    if not arg.suppress:
                        # adjust file location for input files if they are
                        # absolute paths. condor will transfer the file
                        # /path/to/file.txt to the job's current working
                        # directory, so arguments should point to file.txt
                        basename = path.is_abs_or_url if self.transfer_files else False
                        nodevars[arg.condor_name] = arg.vars(basename=basename)
                    if self.transfer_files:
                        nodevars[f"input_{arg.condor_name}"] = arg.files()

            # finally, add arguments defined as 'outputs'. if file transfer is
            # enabled, also define the $(output_{arg}) variable containing the
            # files. if the argument is not suppressed, some extra hoops are done
            # with remaps to ensure that files are also saved to the right
            # place. the main problem is that when jobs are submitted, the
            # directory structure is present in the submit node but not the
            # execute node, so when a job tries to create a file assuming the
            # directories are there, the job fails. this gets around the issue
            # by writing the files to the root directory then remaps them so
            # they get stored in the right place after the job completes and
            # files are transferred back
            for arg in node.outputs:
                if not arg.static:
                    if not arg.suppress:
                        basename = path.is_abs_or_url if self.transfer_files else False
                        nodevars[arg.condor_name] = arg.vars(basename=basename)
                    if self.transfer_files:
                        nodevars[f"output_{arg.condor_name}"] = arg.files(
                            basename=path.is_abs_or_url
                        )
                        nodevars[f"output_{arg.condor_name}_remap"] = arg.remaps()
            allvars.append(nodevars)

        return allvars

    def _update_submit_defaults(
        self, submit_description: Union[dict, htcondor.Submit]
    ) -> htcondor.Submit:
        """Merge the given submit description with layer-derived defaults."""
        # resolve executable path
        if os.path.exists(self.executable):
            executable = self.executable
        elif found_exec := shutil.which(self.executable):
            executable = found_exec
        else:
            warnings.warn(
                f"executable {self.executable} not found for layer {self.name}, "
                "this may be a failure mode during job submission",
                stacklevel=4,
            )
            executable = self.executable

        # add base submit opts + additional submit descriptors; entries in
        # submit_description override the base options defined here
        universe = submit_description.get("universe", self.universe)
        submit: Dict[str, Any] = {
            "universe": universe,
            "executable": executable,
            "arguments": self._arguments(),
            **submit_description,
        }

        # file submit opts
        if self.transfer_files:
            inputs = self._inputs()
            outputs = self._outputs()
            output_remaps = self._output_remaps()

            if inputs or outputs:
                submit.setdefault("should_transfer_files", "YES")
                submit.setdefault("when_to_transfer_output", "ON_SUCCESS")
                submit.setdefault("success_exit_code", 0)
                submit["preserve_relative_paths"] = True
            if inputs:
                submit["transfer_input_files"] = inputs
            if outputs:
                submit["transfer_output_files"] = outputs
                submit["transfer_output_remaps"] = f'"{output_remaps}"'

        # log submit opts
        submit.setdefault("output", "$(log_dir)/$(nodename)-$(cluster)-$(process).out")
        submit.setdefault("error", "$(log_dir)/$(nodename)-$(cluster)-$(process).err")

        # extra boilerplate submit opts
        submit.setdefault("notification", "never")

        return htcondor.Submit(format_submit_description(submit))

363 

364 

@dataclass
class Node:
    """Defines a single node (or job) in an HTCondor DAG.

    Holds the non-I/O arguments of a job together with the inputs it
    consumes and the outputs it creates.

    Parameters
    ----------
    arguments
        The arguments the node uses which aren't I/O related.
    inputs
        The arguments the node takes as inputs.
    outputs
        The arguments the node takes as outputs.
    variables
        Meta parameters that can be used within the submit description.

    """

    arguments: Union[Argument, Option, list] = field(default_factory=list)
    inputs: Union[Argument, Option, list] = field(default_factory=list)
    outputs: Union[Argument, Option, list] = field(default_factory=list)
    variables: dict = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Normalize arguments, inputs and outputs into lists of wrapped items."""
        for attr in ("arguments", "inputs", "outputs"):
            value = getattr(self, attr)
            # a lone Argument/Option is promoted to a one-element list
            if isinstance(value, (Argument, Option)):
                value = [value]
            # wrap string literals
            setattr(self, attr, [_wrap_string_literal(item) for item in value])

    @property
    def requires(self) -> List[str]:
        """
        Returns
        -------
        list
            The inputs this node explicitly depends on to run.

        """
        assert isinstance(self.inputs, list)
        tracked = (input_.args() for input_ in self.inputs if input_.track)
        return list(itertools.chain.from_iterable(tracked))

    @property
    def provides(self) -> List[str]:
        """
        Returns
        -------
        list
            The outputs this node provides when it completes.

        """
        assert isinstance(self.outputs, list)
        tracked = (output.args() for output in self.outputs if output.track)
        return list(itertools.chain.from_iterable(tracked))

430 

431 

class HexFormatter(dags.SimpleFormatter):
    """A hex-based node formatter that produces names like LayerName:0000C."""

    def __init__(
        self, separator: str = ":", index_format: str = "{:05X}", offset: int = 0
    ) -> None:
        """Store the formatting options.

        Parameters
        ----------
        separator
            The string placed between the layer name and the node index.
        index_format
            The format string applied to the node index. Defaults to
            zero-padded, 5-digit uppercase hexadecimal.
        offset
            The offset subtracted from the parsed index in parse().

        """
        self.separator = separator
        self.index_format = index_format
        self.offset = offset

    def parse(self, node_name: str) -> Tuple[str, int]:
        """Split a node name into its layer name and node index.

        Parameters
        ----------
        node_name
            A node name of the form <layer><separator><hex index>.

        Returns
        -------
        tuple
            The layer name and the index, with the offset removed.

        Raises
        ------
        ValueError
            If the separator is missing or the index is not valid hexadecimal.

        """
        # split on the last separator only, so layer names that themselves
        # contain the separator still parse
        layer, hex_index = node_name.rsplit(self.separator, 1)
        index = int(hex_index, 16)
        return layer, index - self.offset

446 

447 

def _wrap_string_literal(
    argument: Union[str, int, float, Argument, Option],
) -> Union[Literal, Argument, Option]:
    """Return Argument/Option instances untouched; wrap anything else in a Literal."""
    already_wrapped = isinstance(argument, (Argument, Option))
    return argument if already_wrapped else Literal(argument)