Coverage for ezdag/dags.py: 73.3%
131 statements
« prev ^ index » next — coverage.py v7.6.4, created at 2024-10-29 15:59 -0700
# Copyright (C) 2020 Patrick Godwin
#
# This Source Code Form is subject to the terms of the Mozilla Public License, v2.0.
# If a copy of the MPL was not distributed with this file, You can obtain one at
# <https://mozilla.org/MPL/2.0/>.
#
# SPDX-License-Identifier: MPL-2.0
9from __future__ import annotations
11import os
12import re
13import warnings
14from collections import Counter, defaultdict
15from pathlib import Path
17# disable warnings when condor config source is not found
18with warnings.catch_warnings():
19 warnings.simplefilter("ignore", UserWarning)
20 import htcondor
21 from htcondor import dags
23from .layers import HexFormatter, Layer
25LayerKey = tuple[str, int]
class DAG(dags.DAG):
    """Defines a DAGMan workflow including the execution graph and configuration.

    Parameters
    ----------
    name : str
        The name of the DAG workflow, used for files written to disk and for
        DAG submission when calling write() and submit(). Defaults to "workflow".
    formatter : htcondor.dags.NodeNameFormatter
        Defines how the node names are defined and formatted. Defaults to a
        hex-based formatter with 5 digits.
    *args
        Any positional arguments that htcondor.dags.DAG accepts
    **kwargs
        Any keyword arguments that htcondor.dags.DAG accepts

    """

    def __init__(
        self,
        name: str = "workflow",
        formatter: dags.NodeNameFormatter | None = None,
        *args,
        **kwargs,
    ) -> None:
        super().__init__(*args, **kwargs)
        self.name = name
        # htcondor node layers keyed by (layer name, occurrence index)
        self._node_layers: dict[LayerKey, dags.NodeLayer] = {}
        # node layers in attach order, used when writing out the shell script
        self._ordered_layers: list[dags.NodeLayer] = []
        # ezdag layers keyed by (layer name, occurrence index)
        self._layers: dict[LayerKey, Layer] = {}
        # maps a data product to the (layer key, node index) that provides it
        # NOTE(review): keys come from node.provides, so the key type here
        # looks like it should be the product's type, not LayerKey -- confirm
        self._provides: dict[LayerKey, tuple[LayerKey, int]] = {}
        if formatter:
            self.formatter = formatter
        else:
            self.formatter = HexFormatter()
        # path of the written DAG file; set once the DAG is written to disk
        self._dag_path: str | None = None
        # counts layers per name so duplicate names get unique suffixes
        self._layer_count: Counter[str] = Counter()

    def attach(self, layer: Layer) -> None:
        """Attach a layer of related job nodes to this DAG.

        Parent-child relationships with previously attached layers are
        inferred by matching this layer's node inputs (node.requires)
        against data products registered by earlier layers (node.provides).

        Parameters
        ----------
        layer
            The layer to attach.

        """
        key = (layer.name, self._layer_count[layer.name])
        self._layers[key] = layer
        self._layer_count[layer.name] += 1

        # layer naming - append number for duplicate layers
        layer_config = layer.config(self.formatter)
        if self._layer_count[layer.name] > 1:
            layer_config["name"] = f"{layer.name}{self._layer_count[layer.name]}"

        # determine parent-child relationships and connect accordingly
        all_edges = defaultdict(set)
        if layer.has_dependencies:
            # determine edges
            for child_idx, node in enumerate(layer.nodes):
                for input_ in node.requires:
                    if input_ in self._provides:
                        parent_name, parent_idx = self._provides[input_]
                        all_edges[parent_name].add((parent_idx, child_idx))

            # dependencies declared but none resolved within this DAG:
            # attach as a root layer instead
            if not all_edges:
                node_layer = self.layer(**layer_config)
                self._node_layers[key] = node_layer
                self._ordered_layers.append(node_layer)

            # determine edge type and connect; the first parent creates the
            # layer, subsequent parents only add edges to it
            for num, (parent, edges) in enumerate(all_edges.items()):
                edge = self._get_edge_type(parent, key, edges)
                if num == 0:
                    node_layer = self._node_layers[parent].child_layer(
                        **layer_config, edge=edge
                    )
                    self._node_layers[key] = node_layer
                    self._ordered_layers.append(node_layer)
                else:
                    self._node_layers[key].add_parents(
                        self._node_layers[parent], edge=edge
                    )

        else:
            node_layer = self.layer(**layer_config)
            self._node_layers[key] = node_layer
            self._ordered_layers.append(node_layer)

        # register any data products the layer provides
        for idx, node in enumerate(layer.nodes):
            for output in node.provides:
                self._provides[output] = (key, idx)

    def create_log_dir(self, log_dir: Path = Path("logs")) -> None:
        """Create the log directory where job logs are stored.

        Deprecated: log directories are created automatically upon DAG
        generation.

        Parameters
        ----------
        log_dir : Path
            The directory to create logs in. Defaults to ./logs.

        """
        warnings.warn(
            "create_log_dir has been deprecated in favor of automatically "
            "creating log directories upon DAG generation. this method "
            "will be removed in a future release",
            DeprecationWarning,
            stacklevel=2,
        )
        os.makedirs(log_dir, exist_ok=True)

    def write(self, path: Path | None = None, *, write_script: bool = False) -> None:
        """Write out the given DAG to the given directory.

        This includes the DAG description file itself, as well as any
        associated submit descriptions and log directories.

        Also optionally writes out the list of commands for each node, which
        represents commands that would be run on the execute point, after
        taking into account file location changes where the job would be run if
        file transfer is enabled.

        Parameters
        ----------
        path : Path
            The directory to write the DAG files to. Defaults to the current working
            directory.
        write_script : bool
            Also write out the list of commands for each node to disk. Defaults
            to false.

        """
        if not path:
            path = Path.cwd()

        dag_file = f"{self.name}.dag"
        self._write_dag(dag_file, path=path)
        self._dag_path = str(path / dag_file)
        if write_script:
            self._write_script(f"{self.name}.sh", path=path)

    def submit(
        self, path: Path | None = None, *, write_script: bool = False, **kwargs
    ) -> htcondor.SubmitResult:
        """Submit the DAG via HTCondor.

        If the DAG has not already been written to disk, do so as well.
        This is equivalent to calling write() prior to submission, making
        use of the `path` and `write_script` arguments for doing so. See
        DAG.write for more information.

        Parameters
        ----------
        path : Path
            The directory to write the DAG files to. Defaults to the current working
            directory.
        write_script : bool
            Also write out the list of commands for each node to disk. Defaults
            to false.
        **kwargs
            Any keyword arguments that `condor_submit_dag` accepts. See
            [htcondor.Submit.from_dag](https://htcondor.readthedocs.io/en/latest/apis/python-bindings/api/htcondor.html#htcondor.Submit.from_dag)
            for more information.

        Returns
        -------
        htcondor.SubmitResult
            The submit result containing the cluster ID and ClassAd of the
            submitted DAG.

        """
        if not path:
            path = Path.cwd()

        # write DAG to disk if not already done; write() records self._dag_path
        if not self._dag_path:
            self.write(path, write_script=write_script)

        # submit the DAG
        submit_kwargs = {"UseDagDir": True, **kwargs}
        dag_submit = htcondor.Submit.from_dag(self._dag_path, submit_kwargs)
        return htcondor.Schedd().submit(dag_submit)

    def write_dag(self, filename: str, path: Path | None = None, **kwargs) -> None:
        """Write out the given DAG to the given directory.

        Deprecated in favor of DAG.write.

        This includes the DAG description file itself, as well as any
        associated submit descriptions and log directories.

        Parameters
        ----------
        filename : str
            The name of the DAG description file itself, e.g. my_dag.dag.
        path : Path
            The directory to write the DAG files to. Defaults to the current working
            directory.
        **kwargs
            Any other keyword arguments that htcondor.dags.write_dag accepts

        """
        warnings.warn(
            "write_dag has been deprecated in favor of write. "
            "this method will be removed in a future release",
            DeprecationWarning,
            stacklevel=2,
        )
        if not path:
            path = Path.cwd()
        self._write_dag(filename, path, **kwargs)

    def write_script(
        self,
        filename: str,
        path: Path | None = None,
    ) -> None:
        """Write out the list of commands for each node to the given directory.

        Deprecated in favor of DAG.write.

        This represents commands that would be run on the execute point, after
        taking into account file location changes where the job would be run if
        file transfer is enabled.

        Parameters
        ----------
        filename : str
            The name of the script file itself, e.g. my_dag.sh.
        path : Path
            The directory to write the script file to. Defaults to the current working
            directory.

        """
        # FIX: the warning previously named write_dag instead of write_script
        warnings.warn(
            "write_script has been deprecated in favor of write. "
            "this method will be removed in a future release",
            DeprecationWarning,
            stacklevel=2,
        )
        if not path:
            path = Path.cwd()
        self._write_script(filename, path)

    def _write_dag(self, filename: str, path: Path | None = None, **kwargs) -> None:
        """Write out the given DAG to the given directory.

        This includes the DAG description file itself, as well as any
        associated submit descriptions and log directories.

        Parameters
        ----------
        filename : str
            The name of the DAG description file itself, e.g. my_dag.dag.
        path : Path
            The directory to write the DAG files to. Defaults to the current working
            directory.
        **kwargs
            Any other keyword arguments that htcondor.dags.write_dag accepts

        """
        if not path:
            path = Path.cwd()

        # create log directories; relative log dirs are resolved against the
        # layer's initialdir (falling back to the DAG output directory)
        for key, layer in self._layers.items():
            if os.path.isabs(layer.log_dir):
                log_path = Path(layer.log_dir)
            else:
                submit = self._node_layers[key].submit_description
                initialdir = Path(submit.get("initialdir", str(path)))
                log_path = initialdir / layer.log_dir
            os.makedirs(log_path, exist_ok=True)

        # create DAG and submit files
        dags.write_dag(
            self,
            path,
            dag_file_name=filename,
            node_name_formatter=self.formatter,
            **kwargs,
        )

    def _write_script(
        self,
        filename: str,
        path: Path | None = None,
    ) -> None:
        """Write out the list of commands for each node to the given directory.

        This represents commands that would be run on the execute point, after
        taking into account file location changes where the job would be run if
        file transfer is enabled.

        Parameters
        ----------
        filename : str
            The name of the script file itself, e.g. my_dag.sh.
        path : Path
            The directory to write the script file to. Defaults to the current working
            directory.

        """
        if not path:
            path = Path.cwd()

        with open(path / filename, "w") as f:
            # traverse DAG in breadth-first order
            for layer in self._ordered_layers:
                # grab relevant submit args, format $(arg) to {arg}
                executable = layer.submit_description["executable"]
                args = layer.submit_description["arguments"]
                args = re.sub(r"\$\((\w+)\)", r"{\1}", args)

                # evaluate vars for each node in layer, write to disk
                for idx, node_vars in enumerate(layer.vars):
                    node_name = self.formatter.generate(layer.name, idx)
                    print(f"# Job {node_name}", file=f)
                    print(executable + " " + args.format(**node_vars) + "\n", file=f)

    def _get_edge_type(self, parent_name, child_name, edges) -> dags.BaseEdge:
        """Determine the most compact edge type representing the given edges.

        Parameters
        ----------
        parent_name : LayerKey
            Key of the parent layer.
        child_name : LayerKey
            Key of the child layer.
        edges : set[tuple[int, int]]
            Explicit (parent index, child index) connections.

        Returns
        -------
        dags.BaseEdge
            ManyToMany or OneToOne when the edges match those patterns
            exactly, otherwise an explicit EdgeConnector.

        """
        parent = self._layers[parent_name]
        child = self._layers[child_name]
        edges = sorted(edges)

        # check special cases, defaulting to explicit edge connections via indices
        # FIX: full bipartite connectivity has (parents * children) edges, not
        # (parents + children); the old sum test could wrongly collapse a
        # partial edge set into a full many-to-many connection
        if len(edges) == (len(parent.nodes) * len(child.nodes)):
            return dags.ManyToMany()
        # one-to-one requires the *full* diagonal; a subset of i == i edges
        # must fall through to the explicit connector or extra dependencies
        # would be silently introduced
        if (
            len(parent.nodes) == len(child.nodes)
            and len(edges) == len(parent.nodes)
            and all(parent_idx == child_idx for parent_idx, child_idx in edges)
        ):
            return dags.OneToOne()
        return EdgeConnector(edges)
class EdgeConnector(dags.BaseEdge):
    """Edge joining individual nodes across two layers via an explicit mapping.

    Parameters
    ----------
    indices
        Iterable of (parent index, child index) pairs to connect.

    """

    def __init__(self, indices) -> None:
        # explicit (parent index, child index) pairs defining the connections
        self.indices = indices

    def get_edges(self, parent, child, join_factory):
        """Yield one (parent,) -> (child,) connection per stored index pair."""
        for pair in self.indices:
            src, dst = pair
            yield (src,), (dst,)
def write_dag(
    dag: dags.DAG,
    dag_dir: Path | None = None,
    formatter: dags.NodeNameFormatter | None = None,
    **kwargs,
) -> Path:
    """Write out the given DAG to the given directory.

    Deprecated in favor of DAG.write.

    This includes the DAG description file itself, as well as any associated
    submit descriptions.

    Parameters
    ----------
    dag : DAG
        The DAG to write.
    dag_dir : Path
        The directory to write the DAG files to. Defaults to the current working
        directory.
    formatter : htcondor.dags.NodeNameFormatter
        Defines how the node names are defined and formatted. Defaults to a
        hex-based formatter with 5 digits.
    **kwargs
        Any other keyword arguments that htcondor.dags.write_dag accepts

    """
    warnings.warn(
        "write_dag has been deprecated in favor of DAG.write. "
        "this method will be removed in a future release",
        DeprecationWarning,
        stacklevel=2,
    )
    target_dir = dag_dir if dag_dir else Path.cwd()
    name_formatter = formatter if formatter else HexFormatter()
    return dags.write_dag(dag, target_dir, node_name_formatter=name_formatter, **kwargs)