Module bytewax.inputs
Helpers to let you quickly define epoch / batching semantics.
Use these to wrap an existing iterator which yields items.
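For example, a minimal sketch, mirroring the doctests below, that wraps a plain list so each item advances the epoch:

from bytewax import Dataflow, run
from bytewax.inputs import fully_ordered

flow = Dataflow()
flow.capture()

# Each helper turns an iterator of items into (epoch, item) pairs
# that run() can consume.
out = run(flow, fully_ordered(["a", "b", "c"]))
assert sorted(out) == [(0, "a"), (1, "b"), (2, "c")]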
Source code
"""Helpers to let you quickly define epoch / batching semantics.
Use these to wrap an existing iterator which yields items.
"""
import datetime
import heapq
from .bytewax import AdvanceTo, Emit
from dataclasses import dataclass
from typing import Any, Callable, Iterable, Optional, Tuple
def yield_epochs(fn: Callable):
"""A decorator function to unwrap an iterator of [epoch, item]
into successive `AdvanceTo` and `Emit` classes with the
contents of the iterator.
Use this when you have an input_builder function that returns
a generator of (epoch, item) to be used with `cluster_main` or
`spawn_cluster`:
>>> from bytewax import Dataflow, cluster_main
>>> from bytewax.inputs import yield_epochs, fully_ordered
>>> flow = Dataflow()
>>> flow.capture()
>>> @yield_epochs
... def input_builder(i, n):
... return fully_ordered(["a", "b", "c"])
>>> cluster_main(flow, input_builder, lambda i, n: print, [], 0, 1)
(0, 'a')
(1, 'b')
(2, 'c')
"""
def inner_fn(worker_index, worker_count):
gen = fn(worker_index, worker_count)
for (epoch, item) in gen:
yield AdvanceTo(epoch)
yield Emit(item)
return inner_fn
def single_batch(wrap_iter: Iterable) -> Iterable[Tuple[int, Any]]:
"""All input items are part of the same epoch.
Use this for non-streaming-style batch processing.
>>> from bytewax import Dataflow, run
>>> flow = Dataflow()
>>> flow.capture()
>>> out = run(flow, single_batch(["a", "b", "c"]))
>>> sorted(out)
[(0, 'a'), (0, 'b'), (0, 'c')]
Args:
wrap_iter: Existing input iterable of just items.
Yields:
Tuples of `(epoch, item)`.
"""
for item in wrap_iter:
yield (0, item)
def tumbling_epoch(
wrap_iter: Iterable,
epoch_length: Any,
time_getter: Callable[[Any], Any] = lambda _: datetime.datetime.now(),
epoch_start_time: Any = None,
epoch_start: int = 0,
) -> Iterable[Tuple[int, Any]]:
"""All inputs within a tumbling window are part of the same epoch.
The time of the first item will be used as the start of epoch
0. Out-of-order items will cause issues as Bytewax requires
inputs to dataflows to be in epoch order. See
`bytewax.inputs.fully_ordered()`.
>>> from bytewax import Dataflow, run
>>> items = [
... {
... "timestamp": datetime.datetime(2022, 2, 22, 1, 2, 3),
... "value": "a",
... },
... {
... "timestamp": datetime.datetime(2022, 2, 22, 1, 2, 4),
... "value": "b",
... },
... {
... "timestamp": datetime.datetime(2022, 2, 22, 1, 2, 8),
... "value": "c",
... },
... ]
>>> flow = Dataflow()
>>> flow.map(lambda item: item["value"])
>>> flow.capture()
>>> out = run(flow, tumbling_epoch(
... items,
... datetime.timedelta(seconds=2),
... lambda item: item["timestamp"],
... ))
>>> sorted(out)
[(0, 'a'), (0, 'b'), (2, 'c')]
By default, "ingestion time" is used, so you don't need to
specify a way to access the timestamp in each item.
>>> import pytest; pytest.skip("Figure out sleep in test.")
>>> items = [
... "a", # sleep(4)
... "b", # sleep(1)
... "c",
... ]
>>> list(tumbling_epoch(items, datetime.timedelta(seconds=2)))
[(0, 'a'), (2, 'b'), (2, 'c')]
Args:
wrap_iter: Existing input iterable of just items.
epoch_length: Length of each epoch window.
time_getter: Function that returns a timestamp given an
item. Defaults to current wall time.
epoch_start_time: The timestamp that should correspond to
the start of the 0th epoch. Otherwise defaults to the time
found on the first item.
epoch_start: The integer value to start counting epochs from.
This can be used for continuity during processing.
Yields:
Tuples of `(epoch, item)`.
"""
for item in wrap_iter:
time = time_getter(item)
if epoch_start_time is None:
epoch_start_time = time
epoch = epoch_start
else:
epoch = int((time - epoch_start_time) / epoch_length) + epoch_start
yield (epoch, item)
def fully_ordered(wrap_iter: Iterable) -> Iterable[Tuple[int, Any]]:
"""Each input item increments the epoch.
Be careful using this in high-volume streams with many workers, as
the worker overhead grows as epochs become more finely grained.
>>> from bytewax import Dataflow, run
>>> flow = Dataflow()
>>> flow.capture()
>>> out = run(flow, fully_ordered(["a", "b", "c"]))
>>> sorted(out)
[(0, 'a'), (1, 'b'), (2, 'c')]
Args:
wrap_iter: Existing input iterable of just items.
Yields:
Tuples of `(epoch, item)`.
"""
epoch = 0
for item in wrap_iter:
yield (epoch, item)
epoch += 1
@dataclass
class _HeapItem:
"""Wrapper class which holds pairs of time and item for implementing
`sorted_window()`.
We need some class that has an ordering only based on the time.
"""
time: Any
item: Any
def __lt__(self, other):
"""Compare just by timestamp. Ignore the item."""
return self.time < other.time
def sorted_window(
wrap_iter: Iterable,
window_length: Any,
time_getter: Callable[[Any], Any],
on_drop: Optional[Callable[[Any], None]] = None,
) -> Iterable[Tuple[int, Any]]:
"""Sort a iterator to be increasing by some timestamp.
To support a possibly infinite iterator, store a limited sorted
buffer of items and only emit things downstream once a certain
window of time has passed, as indicated by the timestamp on new
items.
New input items which are older than those already emitted will be
dropped to maintain sorted output.
The window length needs to be tuned for how "out of order" your
input data is and how much data you're willing to drop: already
perfectly ordered input data can have a window of zero and nothing
will be dropped. Completely reversed input data needs a window
that is the difference between the oldest and youngest timestamp
to ensure nothing will be dropped.
>>> from bytewax import Dataflow, run
>>> items = [
... {
... "timestamp": datetime.datetime(2022, 2, 22, 1, 2, 4),
... "value": "c",
... },
... {
... "timestamp": datetime.datetime(2022, 2, 22, 1, 2, 3),
... "value": "b",
... },
... {
... "timestamp": datetime.datetime(2022, 2, 22, 1, 2, 0),
... "value": "a",
... },
... ]
>>> sorted_items = list(
... sorted_window(
... items,
... datetime.timedelta(seconds=2),
... lambda item: item["timestamp"],
... )
... )
>>> sorted_items
[{'timestamp': datetime.datetime(2022, 2, 22, 1, 2, 3), 'value': 'b'},
{'timestamp': datetime.datetime(2022, 2, 22, 1, 2, 4), 'value': 'c'}]
You could imagine using it with `tumbling_epoch()` to feed
in-order, bucketed data into your dataflow.
>>> flow = Dataflow()
>>> flow.map(lambda item: item["value"])
>>> flow.capture()
>>> out = run(flow, tumbling_epoch(
... sorted_items,
... datetime.timedelta(seconds=0.5),
... lambda item: item["timestamp"],
... ))
>>> sorted(out)
[(0, 'b'), (2, 'c')]
Args:
wrap_iter: Existing input iterable.
window_length: Buffering duration. Values will be emitted once
this amount of time has passed.
time_getter: Function to call to produce a timestamp for each
value.
on_drop: Function to call with each dropped item. E.g. log or
increment metrics on drop events to refine your window
length.
Yields:
Values in increasing timestamp order.
"""
sorted_buffer = []
newest_time = None
drop_older_than = None
def is_too_late(time):
return drop_older_than is not None and time <= drop_older_than
def is_newest_item(time):
return newest_time is None or time > newest_time
def emit_all(emit_older_than):
while len(sorted_buffer) > 0 and sorted_buffer[0].time <= emit_older_than:
sort_item = heapq.heappop(sorted_buffer)
yield sort_item.item
for item in wrap_iter:
time = time_getter(item)
if is_too_late(time):
if on_drop:
on_drop(item)
else:
heapq.heappush(sorted_buffer, _HeapItem(time, item))
if is_newest_item(time):
newest_time = time
drop_older_than = time - window_length
yield from emit_all(drop_older_than)
yield from emit_all(newest_time)
Functions
def fully_ordered(wrap_iter: Iterable) -> Iterable[Tuple[int, Any]]

Each input item increments the epoch.

Be careful using this in high-volume streams with many workers, as the worker overhead grows as epochs become more finely grained.

>>> from bytewax import Dataflow, run
>>> flow = Dataflow()
>>> flow.capture()
>>> out = run(flow, fully_ordered(["a", "b", "c"]))
>>> sorted(out)
[(0, 'a'), (1, 'b'), (2, 'c')]

Args:
    wrap_iter: Existing input iterable of just items.

Yields:
    Tuples of `(epoch, item)`.
def single_batch(wrap_iter: Iterable) -> Iterable[Tuple[int, Any]]

All input items are part of the same epoch.

Use this for non-streaming-style batch processing.

>>> from bytewax import Dataflow, run
>>> flow = Dataflow()
>>> flow.capture()
>>> out = run(flow, single_batch(["a", "b", "c"]))
>>> sorted(out)
[(0, 'a'), (0, 'b'), (0, 'c')]

Args:
    wrap_iter: Existing input iterable of just items.

Yields:
    Tuples of `(epoch, item)`.
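A hedged sketch of a more realistic use: streaming a whole file through a dataflow as one epoch-0 batch (the `lines.txt` path is illustrative):

from bytewax import Dataflow, run
from bytewax.inputs import single_batch

flow = Dataflow()
flow.map(str.strip)  # strip trailing newlines from each line
flow.capture()

# A file object is itself an iterator of lines, so it can be
# wrapped directly; every line lands in epoch 0.
with open("lines.txt") as f:
    out = run(flow, single_batch(f))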
def sorted_window(wrap_iter: Iterable, window_length: Any, time_getter: Callable[[Any], Any], on_drop: Optional[Callable[[Any], None]] = None) -> Iterable[Tuple[int, Any]]

Sort an iterator to be increasing by some timestamp.

To support a possibly infinite iterator, store a limited sorted buffer of items and only emit things downstream once a certain window of time has passed, as indicated by the timestamp on new items.

New input items which are older than those already emitted will be dropped to maintain sorted output.

The window length needs to be tuned for how "out of order" your input data is and how much data you're willing to drop: already perfectly ordered input data can have a window of zero and nothing will be dropped. Completely reversed input data needs a window that is the difference between the oldest and youngest timestamp to ensure nothing will be dropped.

>>> from bytewax import Dataflow, run
>>> items = [
...     {
...         "timestamp": datetime.datetime(2022, 2, 22, 1, 2, 4),
...         "value": "c",
...     },
...     {
...         "timestamp": datetime.datetime(2022, 2, 22, 1, 2, 3),
...         "value": "b",
...     },
...     {
...         "timestamp": datetime.datetime(2022, 2, 22, 1, 2, 0),
...         "value": "a",
...     },
... ]
>>> sorted_items = list(
...     sorted_window(
...         items,
...         datetime.timedelta(seconds=2),
...         lambda item: item["timestamp"],
...     )
... )
>>> sorted_items
[{'timestamp': datetime.datetime(2022, 2, 22, 1, 2, 3), 'value': 'b'},
{'timestamp': datetime.datetime(2022, 2, 22, 1, 2, 4), 'value': 'c'}]

You could imagine using it with `tumbling_epoch()` to feed in-order, bucketed data into your dataflow.

>>> flow = Dataflow()
>>> flow.map(lambda item: item["value"])
>>> flow.capture()
>>> out = run(flow, tumbling_epoch(
...     sorted_items,
...     datetime.timedelta(seconds=0.5),
...     lambda item: item["timestamp"],
... ))
>>> sorted(out)
[(0, 'b'), (2, 'c')]

Args:
    wrap_iter: Existing input iterable.
    window_length: Buffering duration. Values will be emitted once this amount of time has passed.
    time_getter: Function to call to produce a timestamp for each value.
    on_drop: Function to call with each dropped item. E.g. log or increment metrics on drop events to refine your window length.

Yields:
    Values in increasing timestamp order.
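The `on_drop` callback is the natural hook for measuring how lossy your chosen window is. A minimal sketch, assuming an out-of-order `events` iterable of `{"timestamp": ...}` dicts (the counter and logging are illustrative, not part of the API):

import datetime
import logging

from bytewax.inputs import sorted_window

dropped_count = 0

def log_drop(item):
    # Count and log each item sorted_window discards so the
    # window length can be tuned upward if drops are too frequent.
    global dropped_count
    dropped_count += 1
    logging.warning("dropped out-of-order item: %r", item)

in_order = sorted_window(
    events,  # assumed: an out-of-order iterable of timestamped dicts
    datetime.timedelta(seconds=2),
    lambda item: item["timestamp"],
    on_drop=log_drop,
)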
def tumbling_epoch(wrap_iter: Iterable, epoch_length: Any, time_getter: Callable[[Any], Any] = lambda _: datetime.datetime.now(), epoch_start_time: Any = None, epoch_start: int = 0) -> Iterable[Tuple[int, Any]]

All inputs within a tumbling window are part of the same epoch.

The time of the first item will be used as the start of epoch 0. Out-of-order items will cause issues as Bytewax requires inputs to dataflows to be in epoch order. See `fully_ordered()`.

>>> from bytewax import Dataflow, run
>>> items = [
...     {
...         "timestamp": datetime.datetime(2022, 2, 22, 1, 2, 3),
...         "value": "a",
...     },
...     {
...         "timestamp": datetime.datetime(2022, 2, 22, 1, 2, 4),
...         "value": "b",
...     },
...     {
...         "timestamp": datetime.datetime(2022, 2, 22, 1, 2, 8),
...         "value": "c",
...     },
... ]
>>> flow = Dataflow()
>>> flow.map(lambda item: item["value"])
>>> flow.capture()
>>> out = run(flow, tumbling_epoch(
...     items,
...     datetime.timedelta(seconds=2),
...     lambda item: item["timestamp"],
... ))
>>> sorted(out)
[(0, 'a'), (0, 'b'), (2, 'c')]

By default, "ingestion time" is used, so you don't need to specify a way to access the timestamp in each item.

>>> import pytest; pytest.skip("Figure out sleep in test.")
>>> items = [
...     "a",  # sleep(4)
...     "b",  # sleep(1)
...     "c",
... ]
>>> list(tumbling_epoch(items, datetime.timedelta(seconds=2)))
[(0, 'a'), (2, 'b'), (2, 'c')]

Args:
    wrap_iter: Existing input iterable of just items.
    epoch_length: Length of each epoch window.
    time_getter: Function that returns a timestamp given an item. Defaults to current wall time.
    epoch_start_time: The timestamp that should correspond to the start of the 0th epoch. Otherwise defaults to the time found on the first item.
    epoch_start: The integer value to start counting epochs from. This can be used for continuity during processing.

Yields:
    Tuples of `(epoch, item)`.
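Internally the epoch is computed as `int((time - epoch_start_time) / epoch_length) + epoch_start`, which is what makes the last two arguments useful for continuity: if a previous run ended at a known epoch boundary, pass that boundary and epoch number so a restarted run keeps the same numbering. A hedged sketch with illustrative values:

import datetime

from bytewax.inputs import tumbling_epoch

# Assume a previous run's epoch 41 began at 12:00:00 with
# 2-second epochs; resuming with these arguments makes an item
# timestamped 12:00:01 land in epoch 41, 12:00:02 in epoch 42, etc.
epochs = tumbling_epoch(
    items,  # assumed: timestamped dicts as in the doctest above
    datetime.timedelta(seconds=2),
    lambda item: item["timestamp"],
    epoch_start_time=datetime.datetime(2022, 2, 22, 12, 0, 0),
    epoch_start=41,
)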
def yield_epochs(fn: Callable)

A decorator to unwrap an iterator of `(epoch, item)` pairs into successive `AdvanceTo` and `Emit` objects carrying the contents of the iterator.

Use this when you have an input_builder function that returns a generator of `(epoch, item)` to be used with `cluster_main` or `spawn_cluster`:

>>> from bytewax import Dataflow, cluster_main
>>> from bytewax.inputs import yield_epochs, fully_ordered
>>> flow = Dataflow()
>>> flow.capture()
>>> @yield_epochs
... def input_builder(i, n):
...     return fully_ordered(["a", "b", "c"])
>>> cluster_main(flow, input_builder, lambda i, n: print, [], 0, 1)
(0, 'a')
(1, 'b')
(2, 'c')
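Putting the helpers together: a sketch of an input_builder for `cluster_main` that first sorts a slightly out-of-order stream, then buckets it into 2-second epochs, with `yield_epochs` producing the `AdvanceTo`/`Emit` sequence (`read_events()` is a hypothetical source of timestamped dicts):

import datetime

from bytewax.inputs import sorted_window, tumbling_epoch, yield_epochs

@yield_epochs
def input_builder(worker_index, worker_count):
    # read_events() is a stand-in for a real source yielding
    # {"timestamp": ..., "value": ...} dicts.
    in_order = sorted_window(
        read_events(),
        datetime.timedelta(seconds=2),
        lambda item: item["timestamp"],
    )
    return tumbling_epoch(
        in_order,
        datetime.timedelta(seconds=2),
        lambda item: item["timestamp"],
    )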