Coverage for ezdag/layers.py: 92.6%
216 statements
« prev ^ index » next coverage.py v7.6.4, created at 2024-10-29 15:59 -0700
1# Copyright (C) 2020 Patrick Godwin
2#
3# This Source Code Form is subject to the terms of the Mozilla Public License, v2.0.
4# If a copy of the MPL was not distributed with this file, You can obtain one at
5# <https://mozilla.org/MPL/2.0/>.
6#
7# SPDX-License-Identifier: MPL-2.0
9from __future__ import annotations
11import itertools
12import os
13import re
14import shutil
15import warnings
16from collections.abc import Iterable
17from dataclasses import dataclass, field
18from typing import Any, Dict, List, Optional, Tuple, Union
20# disable warnings when condor config source is not found
21with warnings.catch_warnings():
22 warnings.simplefilter("ignore", UserWarning)
23 import htcondor
24 from htcondor import dags
26from . import path
27from .format import format_submit_description
28from .options import PROTECTED_CONDOR_VARS, Argument, Literal, Option
@dataclass
class Layer:
    """Defines a single layer (or set of related jobs) in an HTCondor DAG.

    Stores submit configuration for a set of nodes as well as
    providing functionality to determine the parent-child
    relationships between nodes.

    Parameters
    ----------
    executable
        The path of the executable to run.
    name
        The human-readable name of this node. Defaults to the basename
        of the executable if not given.
    universe
        The execution environment for a job. Defaults to 'vanilla'.
    log_dir
        The directory in which logs will be written to. Defaults to ./logs.
    retries
        The number of retries given for a job. Defaults to 3.
    transfer_files
        Whether to leverage Condor file transfer for moving around
        files. On by default.
    submit_description
        The submit descriptors representing this set of jobs.
    requirements
        The submit descriptors representing this set of jobs.
        Deprecated in favor of submit_description to avoid confusion,
        as 'requirements' refers to a specific submit descriptor.
        This option will be removed in a future release.
    nodes
        The nodes representing the layer. Nodes can be passed upon
        instantiation or added to the layer after the fact via
        Layer.append(node), Layer.extend(nodes), or Layer += node.

    """

    executable: str
    name: str = ""
    universe: str = "vanilla"
    log_dir: str = "logs"
    retries: int = 3
    transfer_files: bool = True
    requirements: dict = field(default_factory=dict)
    submit_description: Union[dict, htcondor.Submit] = field(default_factory=dict)
    nodes: list = field(default_factory=list)
    # layer-level I/O registries: argument name -> list of per-node arguments,
    # populated as nodes are appended
    inputs: dict = field(init=False, default_factory=dict)
    outputs: dict = field(init=False, default_factory=dict)

    def __post_init__(self) -> None:
        if not self.name:
            self.name = os.path.basename(self.executable)
        if self.requirements:
            self.submit_description.update(self.requirements)
            warnings.warn(
                "requirements has been deprecated in favor of submit_description "
                "to avoid confusion and will be removed in a future release",
                DeprecationWarning,
                stacklevel=2,
            )
        # re-register any nodes passed at construction through append() so
        # that the layer-level inputs/outputs registries get populated.
        # reset the list first: extend() would otherwise append to the very
        # list it is iterating over and never terminate
        nodes, self.nodes = self.nodes, []
        self.extend(nodes)

    def config(
        self,
        formatter: Optional[dags.NodeNameFormatter] = None,
    ) -> Dict[str, Any]:
        """Generates a layer configuration.

        This configuration can be passed directly into an
        htcondor.dags.NodeLayer if desired.

        Parameters
        ----------
        formatter : htcondor.dags.NodeNameFormatter
            Defines how the node names are defined and formatted. Defaults to a
            hex-based formatter with 5 digits.

        """
        # check that nodes are valid
        self.validate()

        # update submit description with defaults + other layer configuration
        submit_description = self._update_submit_defaults(self.submit_description)

        if not formatter:
            formatter = HexFormatter()
        return {
            "name": self.name,
            "submit_description": submit_description,
            "vars": self._vars(formatter),
            "retries": self.retries,
        }

    def append(self, node: Node) -> None:
        """Append a node to this layer."""
        assert isinstance(node.inputs, list)
        assert isinstance(node.outputs, list)
        # track the node's I/O in the layer-level registries so that
        # inter-layer dependencies can be resolved later
        for input_ in node.inputs:
            self.inputs.setdefault(input_.name, []).append(input_.argument)
        for output in node.outputs:
            self.outputs.setdefault(output.name, []).append(output.argument)
        self.nodes.append(node)

    def extend(self, nodes: Iterable[Node]) -> None:
        """Append multiple nodes to this layer."""
        for node in nodes:
            self.append(node)

    def __iadd__(self, nodes) -> Layer:
        if isinstance(nodes, Iterable):
            self.extend(nodes)
        else:
            self.append(nodes)
        return self

    def new(self) -> Layer:
        """Create an identical layer without any nodes attached."""
        # requirements is intentionally omitted: it has already been merged
        # into submit_description by __post_init__
        return self.__class__(
            executable=self.executable,
            name=self.name,
            universe=self.universe,
            log_dir=self.log_dir,
            retries=self.retries,
            transfer_files=self.transfer_files,
            submit_description=self.submit_description,
        )

    def validate(self) -> None:
        """Ensure all nodes in this layer are consistent with each other."""
        assert self.nodes, "at least one node must be connected to this layer"

        # check arg names across nodes are equal. compare every node after
        # the first against node 0 (nodes[:-1] would skip the last node)
        args = [arg.name for arg in self.nodes[0].arguments]
        for node in self.nodes[1:]:
            assert args == [arg.name for arg in node.arguments]

        # check input/output names across nodes are equal
        inputs = [arg.name for arg in self.nodes[0].inputs]
        for node in self.nodes[1:]:
            assert inputs == [arg.name for arg in node.inputs]
        outputs = [arg.name for arg in self.nodes[0].outputs]
        for node in self.nodes[1:]:
            assert outputs == [arg.name for arg in node.outputs]

        # check meta-parameters (equality, name validity)
        variables = list(self.nodes[0].variables.keys())
        for node in self.nodes[1:]:
            assert variables == list(node.variables.keys())
        for var in variables:
            if var in PROTECTED_CONDOR_VARS:
                msg = f"{var} is a protected condor name for node {self.name}"
                raise ValueError(msg)

    def command(self, node, *, readjust_paths: bool = True):
        """Given a node, return the command that would be run.

        Parameters
        ----------
        node : Node
            The node to return the command for.
        readjust_paths : bool
            Determines whether path locations are readjusted based on
            the command that would be run on the node's execute point.
            This only has an effect if using file transfer. Default is True.

        """
        # turn condor-style $(var) placeholders into str.format {var} fields
        args = re.sub(r"\$\((\w+?)\)", r"{\1}", self._arguments())
        # extract node variables
        node_vars = {arg.condor_name: arg.vars() for arg in node.arguments}
        for arg in node.inputs:
            if self.transfer_files and readjust_paths:
                node_vars[arg.condor_name] = arg.vars(basename=path.is_abs_or_url)
            else:
                node_vars[arg.condor_name] = arg.vars()
        for arg in node.outputs:
            basename = readjust_paths and self.transfer_files
            node_vars[arg.condor_name] = arg.vars(basename=basename)
        return self.executable + " " + args.format(**node_vars)

    @property
    def has_dependencies(self) -> bool:
        """Check if any of the nodes in this layer have dependencies."""
        return any(node.requires for node in self.nodes)

    def _arguments(self) -> str:
        """Build the submit-file 'arguments' template from the first node."""
        args = []
        for arg in self.nodes[0].arguments:
            args.append(arg.vars() if arg.static else f"$({arg.condor_name})")
        io_args = []
        io_opts = []
        for arg in itertools.chain(self.nodes[0].inputs, self.nodes[0].outputs):
            if arg.static:
                basename = path.is_abs_or_url if self.transfer_files else False
                io_args.append(arg.vars(basename=basename))
            elif not arg.suppress:
                if isinstance(arg, Argument):
                    io_args.append(f"$({arg.condor_name})")
                else:
                    io_opts.append(f"$({arg.condor_name})")
        return " ".join(itertools.chain(args, io_opts, io_args))

    def _inputs(self) -> str:
        """Build the transfer_input_files template from the first node."""
        inputs = []
        for arg in self.nodes[0].inputs:
            if arg.static:
                inputs.append(arg.files())
            else:
                inputs.append(f"$(input_{arg.condor_name})")
        return ",".join(inputs)

    def _outputs(self) -> str:
        """Build the transfer_output_files template from the first node."""
        outputs = []
        for arg in self.nodes[0].outputs:
            if arg.static:
                outputs.append(arg.files(basename=path.is_abs_or_url))
            else:
                outputs.append(f"$(output_{arg.condor_name})")
        return ",".join(outputs)

    def _output_remaps(self) -> str:
        """Build the transfer_output_remaps template from the first node."""
        remaps = []
        for arg in self.nodes[0].outputs:
            if arg.static:
                remaps.append(arg.remaps())
            else:
                remaps.append(f"$(output_{arg.condor_name}_remap)")
        return ";".join(remaps)

    def _vars(self, formatter: dags.NodeNameFormatter) -> List[Dict[str, str]]:
        """Generate the per-node VARS dictionaries for the DAG layer."""
        allvars = []
        for i, node in enumerate(self.nodes):
            nodevars = {
                "nodename": formatter.generate(self.name, i),
                "log_dir": self.log_dir,
                **node.variables,
            }

            # add arguments which aren't suppressed
            for arg in node.arguments:
                if not arg.static and not arg.suppress:
                    nodevars[arg.condor_name] = arg.vars()

            # then add arguments defined as 'inputs'. if file transfer is enabled,
            # also define the $(input_{arg}) variable containing the files
            for arg in node.inputs:
                if not arg.static:
                    if not arg.suppress:
                        basename = path.is_abs_or_url if self.transfer_files else False
                        nodevars[arg.condor_name] = arg.vars(basename=basename)
                    if self.transfer_files:
                        # adjust file location for input files if they are
                        # absolute paths. condor will transfer the file
                        # /path/to/file.txt to the job's current working
                        # directory, so arguments should point to file.txt
                        nodevars[f"input_{arg.condor_name}"] = arg.files()

            # finally, add arguments defined as 'outputs'. if file transfer is
            # enabled, also define the $(output_{arg}) variable containing the
            # files. if argument if not suppressed, some extra hoops are done
            # with remaps to ensure that files are also saved to the right
            # place. the main problem is that when jobs are submitted, the
            # directory structure is present in the submit node but not the
            # execute node, so when a job tries to create a file assuming the
            # directories are there, the job fails. this gets around the issue
            # by writing the files to the root directory then remaps them so
            # they get stored in the right place after the job completes and
            # files are transferred back
            for arg in node.outputs:
                if not arg.static:
                    if not arg.suppress:
                        basename = path.is_abs_or_url if self.transfer_files else False
                        nodevars[arg.condor_name] = arg.vars(basename=basename)
                    if self.transfer_files:
                        nodevars[f"output_{arg.condor_name}"] = arg.files(
                            basename=path.is_abs_or_url
                        )
                        nodevars[f"output_{arg.condor_name}_remap"] = arg.remaps()
            allvars.append(nodevars)

        return allvars

    def _update_submit_defaults(
        self, submit_description: Union[dict, htcondor.Submit]
    ) -> htcondor.Submit:
        """Fill in layer-level defaults and return a finalized Submit object."""
        # resolve executable path
        if os.path.exists(self.executable):
            executable = self.executable
        elif found_exec := shutil.which(self.executable):
            executable = found_exec
        else:
            warnings.warn(
                f"executable {self.executable} not found for layer {self.name}, "
                "this may be a failure mode during job submission",
                stacklevel=4,
            )
            executable = self.executable

        # add base submit opts + additional submit descriptors
        universe = submit_description.get("universe", self.universe)
        submit: Dict[str, Any] = {
            "universe": universe,
            "executable": executable,
            "arguments": self._arguments(),
            **submit_description,
        }

        # file submit opts
        if self.transfer_files:
            inputs = self._inputs()
            outputs = self._outputs()
            output_remaps = self._output_remaps()

            if inputs or outputs:
                submit.setdefault("should_transfer_files", "YES")
                submit.setdefault("when_to_transfer_output", "ON_SUCCESS")
                submit.setdefault("success_exit_code", 0)
                submit["preserve_relative_paths"] = True
            if inputs:
                submit["transfer_input_files"] = inputs
            if outputs:
                submit["transfer_output_files"] = outputs
                submit["transfer_output_remaps"] = f'"{output_remaps}"'

        # log submit opts
        submit.setdefault("output", "$(log_dir)/$(nodename)-$(cluster)-$(process).out")
        submit.setdefault("error", "$(log_dir)/$(nodename)-$(cluster)-$(process).err")

        # extra boilerplate submit opts
        submit.setdefault("notification", "never")

        return htcondor.Submit(format_submit_description(submit))
@dataclass
class Node:
    """Defines a single node (or job) in an HTCondor DAG.

    Captures the arguments used within a job as well as any inputs
    and outputs the job uses or creates.

    Parameters
    ----------
    arguments
        The arguments the node uses which aren't I/O related.
    inputs
        The arguments the node takes as inputs.
    outputs
        The arguments the node takes as outputs.
    variables
        Meta parameters that can be used within the submit description.

    """

    arguments: Union[Argument, Option, list] = field(default_factory=list)
    inputs: Union[Argument, Option, list] = field(default_factory=list)
    outputs: Union[Argument, Option, list] = field(default_factory=list)
    variables: dict = field(default_factory=dict)

    def __post_init__(self) -> None:
        # normalize each argument-like attribute: promote a bare
        # Argument/Option to a one-element list, then wrap any plain
        # values (strings, numbers) as Literals
        for attr in ("arguments", "inputs", "outputs"):
            value = getattr(self, attr)
            if isinstance(value, (Argument, Option)):
                value = [value]
            setattr(self, attr, [_wrap_string_literal(item) for item in value])

    @property
    def requires(self) -> List[str]:
        """
        Returns
        -------
        list
            The inputs this node explicitly depends on to run.

        """
        assert isinstance(self.inputs, list)
        tracked = (input_.args() for input_ in self.inputs if input_.track)
        return list(itertools.chain.from_iterable(tracked))

    @property
    def provides(self) -> List[str]:
        """
        Returns
        -------
        list
            The outputs this node provides when it completes.

        """
        assert isinstance(self.outputs, list)
        tracked = (output.args() for output in self.outputs if output.track)
        return list(itertools.chain.from_iterable(tracked))
class HexFormatter(dags.SimpleFormatter):
    """A hex-based node formatter that produces names like LayerName:0000C."""

    def __init__(
        self, separator: str = ":", index_format: str = "{:05X}", offset: int = 0
    ) -> None:
        self.separator = separator
        self.index_format = index_format
        self.offset = offset

    def parse(self, node_name: str) -> Tuple[str, int]:
        """Split a formatted node name back into (layer name, node index).

        Splits on the *last* occurrence of the separator so that layer
        names which themselves contain the separator are handled correctly
        (a plain split would raise ValueError for such names).
        """
        layer, hex_index = node_name.rsplit(self.separator, 1)
        index = int(hex_index, 16)
        return layer, index - self.offset
def _wrap_string_literal(
    argument: Union[str, int, float, Argument, Option],
) -> Union[Literal, Argument, Option]:
    """Wraps a string literal, passing other arguments unchanged."""
    is_arg = isinstance(argument, (Argument, Option))
    return argument if is_arg else Literal(argument)