---
title: core
keywords: fastai
sidebar: home_sidebar
summary: "Unpack and load the [AMASS][] dataset for training with a PyTorch iterator."
description: "Unpack and load the [AMASS][] dataset for training with a PyTorch iterator."
nb_path: "00_core.ipynb"
---
{% raw %}
{% endraw %}

Unpack Tar Files

A console script that unpacks all the tar files found in a specified directory into another directory, then creates a symlink so the unpacked data can be found later.

Checksum Directories

Checksum directories so that tar files are only unpacked when the target directory either doesn't exist or has been incorrectly unpacked.

It would probably be sufficient to check if the target directory exists, but this is more thorough.

{% raw %}

md5_update_from_file[source]

md5_update_from_file(filename:Union[str, Path], hash:HASH)

{% endraw %} {% raw %}

md5_file[source]

md5_file(filename:Union[str, Path])

{% endraw %} {% raw %}

md5_update_from_dir[source]

md5_update_from_dir(directory:Union[str, Path], hash:HASH)

{% endraw %} {% raw %}

md5_dir[source]

md5_dir(directory:Union[str, Path])

{% endraw %} {% raw %}
{% endraw %} {% raw %}
{% endraw %}
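
The implementations of these checksum helpers aren't shown on this page. Here is a minimal sketch consistent with the signatures above, assuming entries are hashed in sorted order so the directory digest is deterministic (the real implementations may differ in detail):

{% raw %}
import hashlib
from pathlib import Path
from typing import Union


def md5_update_from_file(filename: Union[str, Path], hash):
    # stream the file through the hash in chunks to avoid loading it all into memory
    with open(filename, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash.update(chunk)
    return hash


def md5_file(filename: Union[str, Path]):
    return md5_update_from_file(filename, hashlib.md5()).hexdigest()


def md5_update_from_dir(directory: Union[str, Path], hash):
    # sort entries so the digest doesn't depend on filesystem ordering
    for path in sorted(Path(directory).iterdir(), key=lambda p: str(p)):
        hash.update(path.name.encode())
        if path.is_file():
            hash = md5_update_from_file(path, hash)
        elif path.is_dir():
            hash = md5_update_from_dir(path, hash)
    return hash


def md5_dir(directory: Union[str, Path]):
    return md5_update_from_dir(directory, hashlib.md5()).hexdigest()
{% endraw %}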

Parallel Unpacking with Joblib

Unpacks tar files in multiple jobs to speed up unpacking the dataset.

{% raw %}

lazy_unpack[source]

lazy_unpack(tarpath, outdir)

{% endraw %} {% raw %}

unpack_body_models[source]

unpack_body_models(tardir, outdir, n_jobs=1, verify=False, verbose=False)

{% endraw %} {% raw %}

fast_amass_unpack[source]

fast_amass_unpack()

{% endraw %} {% raw %}
import json
import argparse
import functools
import gzip
import math
import os
import random
import warnings
from pathlib import Path
from shutil import unpack_archive

# numpy and torch are used by the loading functions and Dataset class defined below
import numpy as np
import torch
import torch.utils.data as tudata

import joblib
from tqdm.auto import tqdm
from llamass.tqdm import ProgressParallel


def lazy_unpack(tarpath, outdir):
    # check if this has already been unpacked by looking for a hash file
    tarpath, outdir = Path(tarpath), Path(outdir)
    unpacks_to = hashes[tarpath.name]["unpacks_to"]
    hashpath = outdir / Path(unpacks_to + ".hash")
    # if the hash file exists and it's correct then assume the directory is correctly unpacked
    if hashpath.exists():
        with open(hashpath) as f:
            h = f.read()  # read the stored hash
        if h == hashes[tarpath.name]["hash"]:
            return None
    # if there's no stored hash, or it doesn't match, unpack the tar file
    unpack_archive(tarpath, outdir)
    # calculate the hash of the unpacked directory and check it matches the expected value
    h = md5_dir(outdir / unpacks_to)
    _h = hashes[tarpath.name]["hash"]
    assert h == _h, f"Directory {outdir/unpacks_to} hash {h} != {_h}"
    # save the calculated hash
    with open(hashpath, "w") as f:
        f.write(h)


def unpack_body_models(tardir, outdir, n_jobs=1, verify=False, verbose=False):
    tar_root, _, tarfiles = [x for x in os.walk(tardir)][0]
    tarfiles = [x for x in tarfiles if "tar" in x.split(".")]
    tarpaths = [os.path.join(tar_root, tar) for tar in tarfiles]
    for tarpath in tarpaths:
        if verbose:
            print(f"{tarpath} extracting to {outdir}")
    unpack = lazy_unpack if verify else unpack_archive
    ProgressParallel(n_jobs=n_jobs)(
        (joblib.delayed(unpack)(tarpath, outdir) for tarpath in tarpaths),
        total=len(tarpaths),
    )


def fast_amass_unpack():
    parser = argparse.ArgumentParser(
        description="Unpack all the body model tar files in a directory to a target directory"
    )
    parser.add_argument(
        "tardir",
        type=str,
        help="Directory containing tar.bz2 body model files",
    )
    parser.add_argument(
        "outdir",
        type=str,
        help="Output directory",
    )
    parser.add_argument(
        "--verify",
        action="store_true",
        help="Verify the output by calculating a checksum, "
        "ensures that each tar file will only be unpacked once.",
    )
    parser.add_argument(
        "-n",
        default=1,
        type=int,
        help="Number of jobs to run the tar unpacking with",
    )
    args = parser.parse_args()
    unpack_body_models(args.tardir, args.outdir, n_jobs=args.n, verify=args.verify)
{% endraw %}
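
For reference, a minimal sketch of driving the unpacking from Python rather than through the console script (the paths below are placeholders, and the import assumes these functions are exported as `llamass.core`):

{% raw %}
from llamass.core import unpack_body_models

# unpack every tar file found in the first directory into the second,
# checksumming the results so a second run skips archives that are already unpacked
unpack_body_models(
    "path/to/downloaded/tars/",  # placeholder: directory of AMASS tar.bz2 files
    "path/to/unpacked/amass/",   # placeholder: target directory
    n_jobs=4,
    verify=True,
)
{% endraw %}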

Test unpacking the sample data always yields the same result:

{% raw %}
import tempfile
import hashlib

# https://stackoverflow.com/a/3431838/6937913
def md5(fname):
    hash_md5 = hashlib.md5()
    with open(fname, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()


md5sums = {
    "amass_sample.npz": "d0b546b3619c8579ade39e3a8ccdc4e2",
    "dmpl_sample.npz": "576bb76b2a6328dc5c276c4150c466f0",
}

with tempfile.TemporaryDirectory() as tmpdirname:
    unpack_body_models("sample_data/", tmpdirname, 8)
    for r, d, f in os.walk(tmpdirname):
        npz_files = [x for x in f if "npz" in x.split(".")]
        npz_paths = [os.path.join(tmpdirname, r, x) for x in npz_files]
    _md5sums = {os.path.split(fpath)[-1]: md5(fpath) for fpath in npz_paths}

for k in md5sums:
    assert md5sums[k] == _md5sums[k]
{% endraw %}

Test that verify=True works as expected. We can redefine hashes here for testing without breaking the exported library, because this cell isn't exported by nbdev.

{% raw %}
import time

hashes = {
    "sample.tar.bz2": {
        "unpacks_to": "sample",
        "hash": "b5a86fe22ed2799d79101a532eb0ff27",
    }
}

with tempfile.TemporaryDirectory() as tmpdirname:
    start = time.time()
    unpack_body_models("sample_data/", tmpdirname, 8, verify=True)
    unpacking_time = time.time() - start
    start = time.time()
    unpack_body_models("sample_data/", tmpdirname, 8, verify=True)
    skip_time = time.time() - start
    assert unpacking_time > skip_time
{% endraw %}

Loading Functions

Load the pose data directly from the npz files after unpacking.

Based on the AMASS tutorial notebooks, I would like to iterate over the dataset using a PyTorch Dataloader.

Steps to load:

1. Index all of the npz files in the AMASS directory
2. Iterate through all of them in sequence
    1. Load the `npz` file
        1. Cut out an acceptable motion sequence in the center of each file (typically the middle 80% of the motion sequence)
        2. _Optionally_ shuffle the dataset
    2. Iterate over this sequence along the first dimension
        1. When running with num_workers > 0, give each worker a different random set of npz files to load
{% raw %}
{% endraw %}

Looking at the sample data:

{% raw %}
with tempfile.TemporaryDirectory() as tmpdirname:
    unpack_body_models("sample_data/", tmpdirname, 8)
    for r, d, f in os.walk(tmpdirname):
        npz_files = [x for x in f if "npz" in x.split(".")]
        npz_paths = [os.path.join(tmpdirname, r, x) for x in npz_files]
    for npz_path in npz_paths:
        cdata = np.load(npz_path)
        print(npz_path)
        print("  ", [k for k in cdata.keys()])
        print("  ", [(k, cdata[k].shape) for k in cdata.keys()])
/tmp/tmpos4dxm6z/sample/subdir/amass_sample.npz
   ['poses', 'gender', 'mocap_framerate', 'betas', 'marker_data', 'dmpls', 'marker_labels', 'trans']
   [('poses', (601, 156)), ('gender', ()), ('mocap_framerate', ()), ('betas', (16,)), ('marker_data', (601, 85, 3)), ('dmpls', (601, 8)), ('marker_labels', (85,)), ('trans', (601, 3))]
/tmp/tmpos4dxm6z/sample/subdir/dmpl_sample.npz
   ['poses', 'gender', 'mocap_framerate', 'betas', 'marker_data', 'dmpls', 'marker_labels', 'trans']
   [('poses', (235, 156)), ('gender', ()), ('mocap_framerate', ()), ('betas', (16,)), ('marker_data', (235, 67, 3)), ('dmpls', (235, 8)), ('marker_labels', (67,)), ('trans', (235, 3))]
{% endraw %}

The AMASS dataset is composed of 14,096 .npz archives (at time of writing). The size of archives varies over two orders of magnitude, between 0.1MB and 10MB.

A histogram of the file sizes in AMASS
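
A minimal sketch of how such a histogram could be produced, assuming AMASS has been unpacked to `amass_dir` and that matplotlib is available (it isn't used elsewhere on this page):

{% raw %}
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path

amass_dir = "path/to/unpacked/amass/"  # placeholder
# file sizes in MB for every npz archive in the unpacked dataset
sizes_mb = np.array([p.stat().st_size for p in Path(amass_dir).rglob("*.npz")]) / 1e6
plt.hist(np.log10(sizes_mb), bins=50)
plt.xlabel("log10(npz file size in MB)")
plt.ylabel("count")
plt.show()
{% endraw %}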

Other statistics we might want to know:

  • Length of motion sequence in each of these files
  • ...

What do the fields mean?

Screenshot quote from the AMASS paper

AMASS npz files contain five fields of interest ('poses', 'gender', 'betas', 'dmpls', 'trans'). What do they mean?

  • poses are SMPLH vectors, which are a representation of pose based on SMPL with additional information about the positions of the hands. What are SMPLH vectors composed of?
    • 52 joints, each represented with 3 parameters, 22 for the body and 30 for the hands
    • Encoded with 3 rotational degrees of freedom as angle-axis rotation vectors
  • gender is the reported gender of the actor (it's not clear if MPI has used their gender classifier here)
  • betas are "identity-dependent shape parameters"
  • dmpls are soft tissue deformations described in the original SMPL paper
  • trans is, I think, the $\gamma$ 3D parameter representing the translation of the root coordinate system; it is required to describe the pose and should probably be concatenated to the pose vector, as described in the AMASS paper (see the sketch below)
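
As a sketch of that last point, concatenating the root translation onto each frame's pose vector could look like the following (the path is a placeholder for any unpacked AMASS motion file; this is an illustration, not the canonical AMASS preprocessing):

{% raw %}
import numpy as np

npz_path = "path/to/an/unpacked/amass_motion.npz"  # placeholder
cdata = np.load(npz_path)
poses = cdata["poses"]     # (frames, 156) axis-angle joint rotations
trans = cdata["trans"]     # (frames, 3) root translation
# one row per frame: joint rotations followed by the root translation
pose_with_root = np.concatenate([poses, trans], axis=-1)  # (frames, 159)
{% endraw %}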

npz File Iterator

Iterates over all the paths of all the npz files in AMASS.

{% raw %}

npz_paths[source]

npz_paths(npz_directory)

{% endraw %} {% raw %}
def npz_paths(npz_directory):
    # walk the unpacked directory (following symlinks) and yield every npz path,
    # skipping shape.npz files since they don't contain pose sequences
    npz_directory = Path(npz_directory).resolve()
    for r, d, f in os.walk(npz_directory, followlinks=True):
        for fname in f:
            if "npz" == fname.split(".")[-1] and fname != "shape.npz":
                yield os.path.join(npz_directory, r, fname)
{% endraw %} {% raw %}
with tempfile.TemporaryDirectory() as tmpdirname:
    unpack_body_models("sample_data/", tmpdirname, 8)
    for npz_path in npz_paths(tmpdirname):
        assert Path(npz_path).exists()
{% endraw %} {% raw %}
from contextlib import contextmanager

@contextmanager
def symlink(target, source):
    source.symlink_to(target)
    try:
        yield source
    finally:
        source.unlink()

with tempfile.TemporaryDirectory() as tmpdirname:
    unpack_body_models("sample_data/", tmpdirname, 8)
    tmpdirname = Path(tmpdirname)
    with symlink(tmpdirname/'sample', Path('sym')) as symlink_loc:
        for npz_path in npz_paths(symlink_loc):
            assert Path(npz_path).exists(), npz_path
{% endraw %}

Inferring Dataset Size

A function to calculate dataset size, with the result stored in this package.

The result of this calculation is stored in this package as npz_file_lens.json.gz. The dataset loader will try to load this file or recreate it itself, so you can skip the recomputation by copying the file into the directory where you have unpacked the data.
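
For example, a minimal sketch of copying the precomputed length file into an unpacked AMASS directory so the loader doesn't recompute it. This assumes the file ships alongside the installed `llamass` package; the actual packaging layout may differ:

{% raw %}
import shutil
from pathlib import Path

import llamass

# assumption: the precomputed length file sits next to the installed package
lens_file = Path(llamass.__file__).parent / "npz_file_lens.json.gz"
shutil.copy(lens_file, "path/to/unpacked/amass/")  # placeholder destination
{% endraw %}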

{% raw %}

npz_len[source]

npz_len(npz_path, strict=True)

{% endraw %} {% raw %}

npz_lens[source]

npz_lens(unpacked_directory, n_jobs, strict=True)

{% endraw %} {% raw %}

save_lens[source]

save_lens(save_path, npz_file_lens)

{% endraw %} {% raw %}
def npz_len(npz_path, strict=True):
    cdata = np.load(npz_path)
    h = md5_file(npz_path)
    dirs = [v['unpacks_to'] for v in hashes.values()]
    if strict:
        m = []
        for p in Path(npz_path).parents:
            m += [d for d in dirs if p.name == d]
        assert len(m) == 1, f"Subdir of {npz_path} contains {len(m)} of {dirs}"
        subdir = m[0]
    else:
        subdir = Path(npz_path).parts[-2]
    return subdir, h, cdata["poses"].shape[0]

def npz_lens(unpacked_directory, n_jobs, strict=True):
    paths = [p for p in npz_paths(unpacked_directory)]
    return ProgressParallel(n_jobs=n_jobs)(
        [joblib.delayed(npz_len)(npz_path, strict=strict) for npz_path in paths], total=len(paths)
    )

def save_lens(save_path, npz_file_lens):
    with gzip.open(save_path, "wt") as f:
        f.write(json.dumps(npz_file_lens))

#npz_file_lens = npz_lens('/nobackup/gngdb/repos/amass/data', 10)
#save_lens('npz_file_lens.json.gz', npz_file_lens)
{% endraw %} {% raw %}
!du -hs npz_file_lens.json.gz
399K	npz_file_lens.json.gz
{% endraw %}

Viable Indexes

For every npz file I need to pull out the viable indexes:

{% raw %}

keep_slice[source]

keep_slice(n, keep)

{% endraw %} {% raw %}

viable_slice[source]

viable_slice(cdata, keep)

Inspects a dictionary loaded from .npz numpy dumps and creates a slice of the viable indexes. args:

- `cdata`: dictionary containing keys:
    ['poses', 'gender', 'mocap_framerate', 'betas',
     'marker_data', 'dmpls', 'marker_labels', 'trans']
- `keep`: ratio of the file to keep, between zero and 1.,
    drops leading and trailing ends of the arrays

returns:

- viable: slice that can access frames in the arrays:
    cdata['poses'], cdata['marker_data'], cdata['dmpls'], cdata['trans']
{% endraw %} {% raw %}
def keep_slice(n, keep):
    drop = (1.0 - keep) / 2.0
    return slice(int(n * drop), int(n * keep + n * drop))


def viable_slice(cdata, keep):
    """
    Inspects a dictionary loaded from `.npz` numpy dumps
    and creates a slice of the viable indexes.
    args:

        - `cdata`: dictionary containing keys:
            ['poses', 'gender', 'mocap_framerate', 'betas',
             'marker_data', 'dmpls', 'marker_labels', 'trans']
        - `keep`: ratio of the file to keep, between zero and 1.,
            drops leading and trailing ends of the arrays

    returns:

        - viable: slice that can access frames in the arrays:
            cdata['poses'], cdata['marker_data'], cdata['dmpls'], cdata['trans']
    """
    assert (
        keep > 0.0 and keep <= 1.0
    ), "Proportion of array to keep must be between zero and one"
    n = cdata["poses"].shape[0]
    return keep_slice(n, keep)
{% endraw %} {% raw %}
with tempfile.TemporaryDirectory() as tmpdirname:
    unpack_body_models("sample_data/", tmpdirname, 8)
    for npz_path in npz_paths(tmpdirname):
        cdata = np.load(npz_path)
        print(npz_path)
        print("  ", viable_slice(cdata, 0.8))
/tmp/tmpas2sa5g4/sample/subdir/amass_sample.npz
   slice(60, 540, None)
/tmp/tmpas2sa5g4/sample/subdir/dmpl_sample.npz
   slice(23, 211, None)
{% endraw %}

npz Contents Iterator

Loads an .npz file and iterates over the examples within.

{% raw %}

npz_contents[source]

npz_contents(npz_path, clip_length, overlapping, keep=0.8, keys=('poses', 'dmpls', 'trans', 'betas', 'gender'), shuffle=False, seed=None)

{% endraw %} {% raw %}
def npz_contents(
    npz_path,
    clip_length,
    overlapping,
    keep=0.8,
    keys=("poses", "dmpls", "trans", "betas", "gender"),
    shuffle=False,
    seed=None,
):
    # cache this because we will often be accessing the same file multiple times
    cdata = np.load(npz_path)

    # slice of viable indices
    viable = viable_slice(cdata, keep)

    # slice iterator
    # every time the file is opened the non-overlapping slices will be the same
    # this may not be preferred, but loading overlapping means a lot of repetitive data
    def clip_slices(viable, clip_length, overlapping):
        step = 1 if overlapping else clip_length
        for i in range(viable.start, viable.stop, step):
            if i + clip_length < viable.stop:
                yield slice(i, i + clip_length)

    # buffer the slices so they can be shuffled
    buf_clip_slices = [s for s in clip_slices(viable, clip_length, overlapping)]
    if shuffle:
        # this will be correlated over workers,
        # the seed should be drawn from a torch Generator and passed in
        seed = seed if seed else random.randint(0, int(1e6))
        random.Random(seed).shuffle(buf_clip_slices)

    # iterate over slices
    for s in buf_clip_slices:
        data = {}
        # unpack and enforce data type
        to_load = [k for k in ("poses", "dmpls", "trans") if k in keys]
        for k in to_load:
            data[k] = cdata[k][s].astype(np.float32)
        if "betas" in keys:
            r = s.stop - s.start
            data["betas"] = np.repeat(
                cdata["betas"][np.newaxis].astype(np.float32), repeats=r, axis=0
            )
        if "gender" in keys:

            def gender_to_int(g):
                # casting gender to integer will raise a warning in future
                g = str(g.astype(str))
                return {"male": -1, "neutral": 0, "female": 1}[g]

            data["gender"] = np.array(
                [gender_to_int(cdata["gender"]) for _ in range(s.start, s.stop)]
            )
        yield data
{% endraw %} {% raw %}
def test_load_npz(clip_length, overlapping):
    with tempfile.TemporaryDirectory() as tmpdirname:
        unpack_body_models("sample_data/", tmpdirname, 8)
        for npz_path in npz_paths(tmpdirname):
            for data in npz_contents(npz_path, clip_length, overlapping):
                print([(k, data[k].shape) for k in data])
                for k in data:
                    assert data[k].shape[0] == clip_length
                break


test_load_npz(1, False)
test_load_npz(3, False)
test_load_npz(3, True)
[('poses', (1, 156)), ('dmpls', (1, 8)), ('trans', (1, 3)), ('betas', (1, 16)), ('gender', (1,))]
[('poses', (1, 156)), ('dmpls', (1, 8)), ('trans', (1, 3)), ('betas', (1, 16)), ('gender', (1,))]
[('poses', (3, 156)), ('dmpls', (3, 8)), ('trans', (3, 3)), ('betas', (3, 16)), ('gender', (3,))]
[('poses', (3, 156)), ('dmpls', (3, 8)), ('trans', (3, 3)), ('betas', (3, 16)), ('gender', (3,))]
[('poses', (3, 156)), ('dmpls', (3, 8)), ('trans', (3, 3)), ('betas', (3, 16)), ('gender', (3,))]
[('poses', (3, 156)), ('dmpls', (3, 8)), ('trans', (3, 3)), ('betas', (3, 16)), ('gender', (3,))]
{% endraw %}

PyTorch Dataset Class

An iterable-style PyTorch Dataset class that iterates over all of the .npz files storing motion sequences in a directory.

{% raw %}
[p for p in Path('/nobackup/gngdb/repos/amass/data').glob('*') if p.is_dir()]
[Path('/nobackup/gngdb/repos/amass/data/val'),
 Path('/nobackup/gngdb/repos/amass/data/test'),
 Path('/nobackup/gngdb/repos/amass/data/train'),
 Path('/nobackup/gngdb/repos/amass/data/train_subset'),
 Path('/nobackup/gngdb/repos/amass/data/val_subset')]
{% endraw %} {% raw %}

class AMASS[source]

AMASS(*args, **kwds) :: IterableDataset

An iterable Dataset.

All datasets that represent an iterable of data samples should subclass it. Such form of datasets is particularly useful when data come from a stream.

All subclasses should overwrite :meth:__iter__, which would return an iterator of samples in this dataset.

When a subclass is used with :class:~torch.utils.data.DataLoader, each item in the dataset will be yielded from the :class:~torch.utils.data.DataLoader iterator. When :attr:num_workers > 0, each worker process will have a different copy of the dataset object, so it is often desired to configure each copy independently to avoid having duplicate data returned from the workers. :func:~torch.utils.data.get_worker_info, when called in a worker process, returns information about the worker. It can be used in either the dataset's :meth:__iter__ method or the :class:~torch.utils.data.DataLoader 's :attr:worker_init_fn option to modify each copy's behavior.

Example 1: splitting workload across all workers in :meth:__iter__::

>>> class MyIterableDataset(torch.utils.data.IterableDataset):
...     def __init__(self, start, end):
...         super(MyIterableDataset).__init__()
...         assert end > start, "this example code only works with end >= start"
...         self.start = start
...         self.end = end
...
...     def __iter__(self):
...         worker_info = torch.utils.data.get_worker_info()
...         if worker_info is None:  # single-process data loading, return the full iterator
...             iter_start = self.start
...             iter_end = self.end
...         else:  # in a worker process
...             # split workload
...             per_worker = int(math.ceil((self.end - self.start) / float(worker_info.num_workers)))
...             worker_id = worker_info.id
...             iter_start = self.start + worker_id * per_worker
...             iter_end = min(iter_start + per_worker, self.end)
...         return iter(range(iter_start, iter_end))
...
>>> # should give same set of data as range(3, 7), i.e., [3, 4, 5, 6].
>>> ds = MyIterableDataset(start=3, end=7)

>>> # Single-process loading
>>> print(list(torch.utils.data.DataLoader(ds, num_workers=0)))
[3, 4, 5, 6]

>>> # Mult-process loading with two worker processes
>>> # Worker 0 fetched [3, 4].  Worker 1 fetched [5, 6].
>>> print(list(torch.utils.data.DataLoader(ds, num_workers=2)))
[3, 5, 4, 6]

>>> # With even more workers
>>> print(list(torch.utils.data.DataLoader(ds, num_workers=20)))
[3, 4, 5, 6]

Example 2: splitting workload across all workers using :attr:worker_init_fn::

>>> class MyIterableDataset(torch.utils.data.IterableDataset):
...     def __init__(self, start, end):
...         super(MyIterableDataset).__init__()
...         assert end > start, "this example code only works with end >= start"
...         self.start = start
...         self.end = end
...
...     def __iter__(self):
...         return iter(range(self.start, self.end))
...
>>> # should give same set of data as range(3, 7), i.e., [3, 4, 5, 6].
>>> ds = MyIterableDataset(start=3, end=7)

>>> # Single-process loading
>>> print(list(torch.utils.data.DataLoader(ds, num_workers=0)))
[3, 4, 5, 6]
>>>
>>> # Directly doing multi-process loading yields duplicate data
>>> print(list(torch.utils.data.DataLoader(ds, num_workers=2)))
[3, 3, 4, 4, 5, 5, 6, 6]

>>> # Define a [`worker_init_fn`](/llamass/core.html#worker_init_fn) that configures each dataset copy differently
>>> def worker_init_fn(worker_id):
...     worker_info = torch.utils.data.get_worker_info()
...     dataset = worker_info.dataset  # the dataset copy in this worker process
...     overall_start = dataset.start
...     overall_end = dataset.end
...     # configure the dataset to only process the split workload
...     per_worker = int(math.ceil((overall_end - overall_start) / float(worker_info.num_workers)))
...     worker_id = worker_info.id
...     dataset.start = overall_start + worker_id * per_worker
...     dataset.end = min(dataset.start + per_worker, overall_end)
...

>>> # Mult-process loading with the custom [`worker_init_fn`](/llamass/core.html#worker_init_fn)
>>> # Worker 0 fetched [3, 4].  Worker 1 fetched [5, 6].
>>> print(list(torch.utils.data.DataLoader(ds, num_workers=2, worker_init_fn=worker_init_fn)))
[3, 5, 4, 6]

>>> # With even more workers
>>> print(list(torch.utils.data.DataLoader(ds, num_workers=20, worker_init_fn=worker_init_fn)))
[3, 4, 5, 6]
{% endraw %} {% raw %}
class AMASS(tudata.IterableDataset):
    def __init__(
        self,
        amass_location,
        clip_length,
        overlapping,
        keep=0.8,
        transform=None,
        data_keys=("poses", "dmpls", "trans", "betas", "gender"),
        file_list_seed=0,
        shuffle=False,
        seed=None,
        strict=True
    ):
        assert clip_length > 0 and type(clip_length) is int
        self.transform = transform
        self.data_keys = data_keys
        self.amass_location = amass_location
        # these should be shuffled but pull shuffle argument out of dataloader worker arguments
        self._npz_paths = [npz_path for npz_path in npz_paths(amass_location)]
        random.Random(file_list_seed).shuffle(self._npz_paths)
        self._npz_paths = tuple(self._npz_paths)
        self.npz_paths = self._npz_paths
        self.clip_length = clip_length
        self.overlapping = overlapping
        self.keep = keep
        self.shuffle = shuffle
        self.seed = seed if seed else random.randint(0, int(1e6))
        self.strict = strict

    def infer_len(self, n_jobs=4):
        # uses known dimensions of the npz files in the AMASS dataset to infer the length
        # with clip_length and overlapping settings stored
        lenfile = Path(self.amass_location) / Path("npz_file_lens.json.gz")
        # try to load file
        if lenfile.exists():
            with gzip.open(lenfile, "rt") as f:
                self.npz_lens = json.load(f)
                def filter_lens(npz_lens):
                    # filter out file length information to only existing dirs
                    datasets = [p.name for p in Path(self.amass_location).glob('*') if p.is_dir()]
                    return [(p, h, l) for p, h, l in npz_lens
                            if p in datasets]
                self.npz_lens = filter_lens(self.npz_lens)
        else:  # if it's not there, recompute it and create the file
            print(f'Inspecting {len(self.npz_paths)} files to determine dataset length'
                  f', saving the result to {lenfile}')
            self.npz_lens = npz_lens(self.amass_location, n_jobs, strict=self.strict)
            save_lens(lenfile, self.npz_lens)

        # using stored lengths to infer the total dataset length
        def lenslice(s):
            if self.overlapping:
                return (s.stop - s.start) - (self.clip_length - 1)
            else:
                return math.floor((s.stop - s.start) / self.clip_length)

        N = 0
        for p, h, l in self.npz_lens:
            s = keep_slice(l, keep=self.keep)
            N += lenslice(s)

        return N

    def __len__(self):
        if hasattr(self, "N"):
            return self.N
        else:
            self.N = self.infer_len()
            return self.N

    def __iter__(self):
        if self.shuffle:
            self.npz_paths = list(self.npz_paths)
            random.Random(self.seed).shuffle(self.npz_paths)
        for npz_path in self.npz_paths:
            for data in npz_contents(
                npz_path,
                self.clip_length,
                self.overlapping,
                keys=self.data_keys,
                keep=self.keep,
                shuffle=self.shuffle,
                seed=self.seed,
            ):
                self.seed += 1  # increment to vary shuffle over files
                # apply the transform if one was given, otherwise yield raw numpy arrays
                yield {k: self.transform(data[k]) if self.transform else data[k] for k in data}
{% endraw %}

Test I can load some data with this Dataset:

{% raw %}
with tempfile.TemporaryDirectory() as tmpdirname:
    unpack_body_models("sample_data/", tmpdirname, 8)
    amass = AMASS(tmpdirname, overlapping=False, clip_length=1, transform=torch.tensor)
    for data in amass:
        for k in data:
            print(k, data[k].shape)
            assert type(data[k]) is torch.Tensor
        break
    print(len(amass))
poses torch.Size([1, 156])
dmpls torch.Size([1, 8])
trans torch.Size([1, 3])
betas torch.Size([1, 16])
gender torch.Size([1])
Inspecting 2 files to determine dataset length, saving the result to /tmp/tmpleow1ugq/npz_file_lens.json.gz
668
{% endraw %}

Test it works in a DataLoader to make batches:

{% raw %}
with tempfile.TemporaryDirectory() as tmpdirname:
    unpack_body_models("sample_data/", tmpdirname, 8)
    amass = AMASS(tmpdirname, overlapping=False, clip_length=1, transform=torch.tensor)
    amasstrain = tudata.DataLoader(amass, batch_size=4)
    for i, data in enumerate(amasstrain):
        for k in data:
            print(k, data[k].shape)
        assert data["poses"].size(0) == 4, f'{data["poses"].size()}'
        break
poses torch.Size([4, 1, 156])
dmpls torch.Size([4, 1, 8])
trans torch.Size([4, 1, 3])
betas torch.Size([4, 1, 16])
gender torch.Size([4, 1])
{% endraw %}

Multi-process Data Loading

To work with num_workers > 0 I'm going to pass a different set of npz files to each worker using a worker_init_fn.

The following worker_init_fn must always be used when num_workers > 0, or data will be duplicated across workers. To simplify this, I am providing a DataLoader subclass that bakes this worker_init_fn in.

{% raw %}

worker_init_fn[source]

worker_init_fn(worker_id)

{% endraw %} {% raw %}

class IterableLoader[source]

IterableLoader(*args, **kwds) :: DataLoader

Data loader. Combines a dataset and a sampler, and provides an iterable over the given dataset.

The :class:~torch.utils.data.DataLoader supports both map-style and iterable-style datasets with single- or multi-process loading, customizing loading order and optional automatic batching (collation) and memory pinning.

See :py:mod:torch.utils.data documentation page for more details.

Args:

  • dataset (Dataset): dataset from which to load the data.
  • batch_size (int, optional): how many samples per batch to load (default: 1).
  • shuffle (bool, optional): set to True to have the data reshuffled at every epoch (default: False).
  • sampler (Sampler or Iterable, optional): defines the strategy to draw samples from the dataset. Can be any Iterable with __len__ implemented. If specified, :attr:shuffle must not be specified.
  • batch_sampler (Sampler or Iterable, optional): like :attr:sampler, but returns a batch of indices at a time. Mutually exclusive with :attr:batch_size, :attr:shuffle, :attr:sampler, and :attr:drop_last.
  • num_workers (int, optional): how many subprocesses to use for data loading. 0 means that the data will be loaded in the main process. (default: 0)
  • collate_fn (callable, optional): merges a list of samples to form a mini-batch of Tensor(s). Used when using batched loading from a map-style dataset.
  • pin_memory (bool, optional): If True, the data loader will copy Tensors into CUDA pinned memory before returning them. If your data elements are a custom type, or your :attr:collate_fn returns a batch that is a custom type, see the example below.
  • drop_last (bool, optional): set to True to drop the last incomplete batch, if the dataset size is not divisible by the batch size. If False and the size of dataset is not divisible by the batch size, then the last batch will be smaller. (default: False)
  • timeout (numeric, optional): if positive, the timeout value for collecting a batch from workers. Should always be non-negative. (default: 0)
  • worker_init_fn (callable, optional): If not None, this will be called on each worker subprocess with the worker id (an int in [0, num_workers - 1]) as input, after seeding and before data loading. (default: None)
  • generator (torch.Generator, optional): If not None, this RNG will be used by RandomSampler to generate random indexes and multiprocessing to generate base_seed for workers. (default: None)
  • prefetch_factor (int, optional, keyword-only arg): Number of samples loaded in advance by each worker. 2 means there will be a total of 2 * num_workers samples prefetched across all workers. (default: 2)
  • persistent_workers (bool, optional): If True, the data loader will not shutdown the worker processes after a dataset has been consumed once. This allows to maintain the workers Dataset instances alive. (default: False)

.. warning:: If the spawn start method is used, :attr:worker_init_fn cannot be an unpicklable object, e.g., a lambda function. See :ref:multiprocessing-best-practices on more details related to multiprocessing in PyTorch.

.. warning:: len(dataloader) heuristic is based on the length of the sampler used. When :attr:dataset is an :class:~torch.utils.data.IterableDataset, it instead returns an estimate based on len(dataset) / batch_size, with proper rounding depending on :attr:drop_last, regardless of multi-process loading configurations. This represents the best guess PyTorch can make because PyTorch trusts user :attr:dataset code in correctly handling multi-process loading to avoid duplicate data.

         However, if sharding results in multiple workers having incomplete last batches,
         this estimate can still be inaccurate, because (1) an otherwise complete batch can
         be broken into multiple ones and (2) more than one batch worth of samples can be
         dropped when :attr:`drop_last` is set. Unfortunately, PyTorch can not detect such
         cases in general.

         See `Dataset Types`_ for more details on these two types of datasets and how
         :class:`~torch.utils.data.IterableDataset` interacts with
         `Multi-process data loading`_.

.. warning:: See :ref:reproducibility, and :ref:dataloader-workers-random-seed, and :ref:data-loading-randomness notes for random seed related questions.

{% endraw %} {% raw %}
def worker_init_fn(worker_id):
    worker_info = torch.utils.data.get_worker_info()

    # slice up the file list among workers
    dataset = worker_info.dataset
    overall_npz_paths = dataset._npz_paths
    n = len(overall_npz_paths)
    assert n >= worker_info.num_workers, (
        "Every worker must get at least one file:"
        f" {n} files for {worker_info.num_workers} workers"
    )
    # use ceil so no trailing files are dropped when n isn't divisible by num_workers
    per_worker = int(math.ceil(n / float(worker_info.num_workers)))
    start = worker_info.id * per_worker
    stop = min(start + per_worker, n)
    dataset.npz_paths = overall_npz_paths[start:stop]

    # give each worker a different seed
    dataset.seed = dataset.seed + worker_info.seed

class IterableLoader(tudata.DataLoader):
    def __init__(self, *args, **kwargs):
        kwargs['worker_init_fn'] = worker_init_fn
        super().__init__(*args, **kwargs)
{% endraw %} {% raw %}
def test_dataloader():
    with tempfile.TemporaryDirectory() as tmpdirname:
        unpack_body_models("sample_data/", tmpdirname, 8)
        amass = AMASS(
            tmpdirname, overlapping=False, clip_length=1, transform=torch.tensor
        )
        amasstrain = tudata.DataLoader(
            amass, batch_size=4, worker_init_fn=worker_init_fn, num_workers=2
        )
        for i, data in enumerate(amasstrain):
            for k in data:
                print(k, data[k].shape)
            assert data["poses"].size(0) == 4, f'{data["poses"].size()}'
            break
        amasstrain = IterableLoader(
            amass, batch_size=4, num_workers=2
        )
        for i, data in enumerate(amasstrain):
            for k in data:
                print(k, data[k].shape)
            assert data["poses"].size(0) == 4, f'{data["poses"].size()}'
            break

test_dataloader()
poses torch.Size([4, 1, 156])
dmpls torch.Size([4, 1, 8])
trans torch.Size([4, 1, 3])
betas torch.Size([4, 1, 16])
gender torch.Size([4, 1])
poses torch.Size([4, 1, 156])
dmpls torch.Size([4, 1, 8])
trans torch.Size([4, 1, 3])
betas torch.Size([4, 1, 16])
gender torch.Size([4, 1])
{% endraw %} {% raw %}
def test_runtime(unpacked_dir, batch_size, num_workers):
    amass = AMASS(
        unpacked_dir, overlapping=False, clip_length=1, transform=torch.tensor, seed=0
    )
    amasstrain = tudata.DataLoader(
        amass, batch_size=batch_size, worker_init_fn=worker_init_fn, num_workers=num_workers
    )
    start = time.time()
    i = 0
    for data in tqdm(amasstrain):
        i += 1
        if i > 100:
            break
    elapsed = time.time() - start
    total_hours = ((elapsed / i) * len(amasstrain)) / (60 ** 2)
    return elapsed, elapsed / i, total_hours


# test_runtime('/nobackup/gngdb/repos/amass/data', 256, 8)
# test_runtime('/scratch/gobi1/gngdb/amass', 256, 8)
{% endraw %}

Rough results from testing runtime on the full dataset on my workstation, varying `num_workers`:

  • Batch size 32:
    • num_workers=0: 260ms/batch, 35 hours per epoch
    • num_workers=2: 91ms/batch, 12 hours per epoch
    • num_workers=4: 58ms/batch, 7 hours 54 minutes per epoch
    • num_workers=8: 29ms/batch, 4 hours per epoch
    • num_workers=12 (the number of cores): 3 hours 45 minutes per epoch
  • Batch size 256:
    • num_workers=0: 910ms/batch, 16 hours per epoch
    • num_workers=2: 816ms/batch, 14 hours per epoch
    • num_workers=4: 457ms/batch, 8 hours per epoch
    • num_workers=8: 235ms/batch, 4 hours per epoch

Shuffling

PyTorch DataLoaders don't support shuffling an IterableDataset because they assume the data arrives as an IID stream. For this problem, that means shuffling has to be implemented elsewhere.

There are two parts to shuffle:

  • The indexes accessing the arrays in each file
  • The list of files to access

The first is easy and can be an option to the iterator over each file. It doesn't affect how each worker operates because no two workers should ever touch the same file.

The second is more difficult because each worker has a different list of files. It's also important that the global list of files is in a random order, because some files are larger than others and randomising the order ensures that each worker receives approximately the same number of examples. However, every worker initialises its own copy of the dataset, so each copy has to agree on the same global list of files. The simplest way to ensure this is to shuffle the list of files at initialisation with a shared random seed, as the sketch below illustrates.
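
A tiny sketch of why a shared seed works: two independent dataset copies that shuffle the same file list with the same `file_list_seed` end up with identical global orderings, which `worker_init_fn` can then slice up without overlap (the file names here are hypothetical):

{% raw %}
import random

files = [f"file_{i:02d}.npz" for i in range(10)]  # hypothetical file names

# each worker process constructs its own copy of the dataset...
worker_a, worker_b = list(files), list(files)
# ...but shuffles with the same shared seed, e.g. file_list_seed=0
random.Random(0).shuffle(worker_a)
random.Random(0).shuffle(worker_b)

assert worker_a == worker_b  # both copies agree on the global order of files
{% endraw %}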

{% raw %}
def test_shuffling(num_workers):
    with tempfile.TemporaryDirectory() as tmpdirname:
        unpack_body_models("sample_data/", tmpdirname, 8)
        amass = AMASS(
            tmpdirname,
            overlapping=False,
            clip_length=1,
            transform=torch.tensor,
            shuffle=True,
            seed=0,
        )
        amasstrain = tudata.DataLoader(
            amass, batch_size=4, worker_init_fn=worker_init_fn, num_workers=num_workers
        )
        for i, data in enumerate(amasstrain):
            _data = data["poses"]
            break
        # second epoch shouldn't produce the same minibatch
        for i, data in enumerate(amasstrain):
            data = data["poses"]
            data, _data = data.numpy(), _data.numpy()
            assert not np.allclose(data, _data)
            break


for num_workers in range(3):
    test_shuffling(num_workers)
{% endraw %}

Train/test Splits

Console utility to split directories in a standard way

The original preprocessing script splits directories the following way:

{% raw %}
original_amass_splits = {
    'val' : ['HumanEva', 'MPI_HDM05', 'SFU', 'MPI_mosh'],
    'test': ['Transitions_mocap', 'SSM_synced'],
    'train': ['CMU', 'MPI_Limits', 'TotalCapture', 'Eyes_Japan_Dataset', 'KIT', 'BML', 'EKUT', 'TCD_handMocap', 'ACCAD']
}
{% endraw %}

This doesn't allocate all of the datasets composing AMASS to a split:

Datasets not allocated to a split: {'BMLmovi', 'BioMotionLab_NTroje', 'DanceDB', 'BMLhandball', 'DFaust_67'}

Obtained by evaluating the following cell on an unpacked directory of AMASS data:

{% raw %}
def unallocated_splits(unpacked_dir):
    data_path = Path(unpacked_dir)
    all_subdirs = {f.name for f in data_path.iterdir() if f.is_dir()}
    unallocated = all_subdirs - {d for k in original_amass_splits for d in original_amass_splits[k]}
    print(f'Datasets not allocated to a split: {unallocated}')
# unallocated_splits('/nobackup/gngdb/repos/amass/data')
{% endraw %}

I'm going to include all the newer datasets in the training set.

{% raw %}
amass_splits = {
    'val' : ['HumanEva', 'MPI_HDM05', 'SFU', 'MPI_mosh'],
    'test': ['Transitions_mocap', 'SSM_synced'],
    'train': ['CMU', 'MPI_Limits', 'TotalCapture', 'Eyes_Japan_Dataset',
              'KIT', 'BML', 'EKUT', 'TCD_handMocap', 'ACCAD',
              'BMLmovi', 'BioMotionLab_NTroje', 'DanceDB', 'BMLhandball', 'DFaust_67']
}
{% endraw %}

These functions move the dataset directories into subdirectories that define the splits, or undo that change.

{% raw %}

move_dirs_into_splits[source]

move_dirs_into_splits(amass_loc, splits, undo=False)

{% endraw %} {% raw %}
def move_dirs_into_splits(amass_loc, splits, undo=False):
    amass_loc = Path(amass_loc)
    for k in splits:
        split_dir = amass_loc / Path(k)
        if not split_dir.exists():
            os.mkdir(split_dir)
        for d in splits[k]:
            d = Path(d)
            t = split_dir/d
            f = amass_loc/d
            try:
                if undo:
                    os.rename(t, f)
                else:
                    os.rename(f, t)
            except FileNotFoundError:
                warnings.warn(f'Could not find {d} for {k} split')


move_dirs_out_of_splits = functools.partial(move_dirs_into_splits, undo=True)
{% endraw %} {% raw %}

console_split_dirs[source]

console_split_dirs()

{% endraw %} {% raw %}
def console_split_dirs():
    parser = argparse.ArgumentParser(
        description="Split AMASS Dataset subdirs into train/val/test"
    )
    parser.add_argument(
        "amassloc",
        type=str,
        help="Location where AMASS has been unpacked",
    )
    parser.add_argument(
        "--undo",
        action="store_true",
        help="Undo move into subdirectories, put them all back in the root AMASS location",
    )
    args = parser.parse_args()
    move_dirs_into_splits(args.amassloc, amass_splits, undo=args.undo)
{% endraw %}
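
A minimal sketch of using these helpers programmatically rather than through the console script (the path is a placeholder):

{% raw %}
amass_loc = "path/to/unpacked/amass/"  # placeholder

# move each dataset directory into train/val/test subdirectories...
move_dirs_into_splits(amass_loc, amass_splits)

# ...and move them back into the root AMASS directory afterwards
move_dirs_out_of_splits(amass_loc, amass_splits)
{% endraw %}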

SPL Train/Validation/Test

{% raw %}
import urllib.request

train_url = "https://raw.githubusercontent.com/eth-ait/spl/master/preprocessing/training_fnames.txt"
val_url = "https://raw.githubusercontent.com/eth-ait/spl/master/preprocessing/validation_fnames.txt"
test_url = "https://raw.githubusercontent.com/eth-ait/spl/master/preprocessing/test_fnames.txt"

splits = {}
for url in [train_url, val_url, test_url]:
    split = url.split("/")[-1].split("_")[0]
    with urllib.request.urlopen(url) as f:
        file_list = f.read().decode('UTF-8').split("\n")
    splits[split] = file_list

for split, file_list in splits.items():
    print(split, " = ", set(f.split("/")[0] for f in file_list))
training  =  {'', 'CMU', 'Transition', 'ACCAD', 'SSM', 'BioMotion', 'JointLimit', 'HEva', 'MIXAMO', 'HDM05', 'CMU_Kitchen', 'Eyes'}
validation  =  {'', 'CMU', 'Transition', 'ACCAD', 'SSM', 'BioMotion', 'JointLimit', 'MIXAMO', 'HDM05', 'Eyes'}
test  =  {'', 'CMU', 'Transition', 'ACCAD', 'BioMotion', 'JointLimit', 'HEva', 'MIXAMO', 'HDM05', 'CMU_Kitchen', 'Eyes'}
{% endraw %} {% raw %}
spl_splits = dict(
    training  =  {'CMU_Kitchen', 'Eyes', 'HEva', '', 'MIXAMO', 'Transition', 'CMU', 'SSM', 'BioMotion', 'JointLimit', 'ACCAD', 'HDM05'},
    validation  =  {'Eyes', '', 'MIXAMO', 'Transition', 'CMU', 'SSM', 'BioMotion', 'JointLimit', 'ACCAD', 'HDM05'},
    test  =  {'CMU_Kitchen', 'Eyes', 'HEva', '', 'MIXAMO', 'Transition', 'CMU', 'BioMotion', 'JointLimit', 'ACCAD', 'HDM05'}
)
{% endraw %} {% raw %}
# iterate over every subdirectory in AMASS and open the json files containing information on file sizes
amass_loc = "/nobackup/gngdb/repos/amass/data/"
metadata = {}
for d in Path(amass_loc).iterdir():
    if d.is_dir():
        npz_len_file = d / "npz_file_lens.json.gz"
        with gzip.open(npz_len_file) as f:
            metadata[d.stem] = json.load(f)
{% endraw %} {% raw %}
for splits in [['train_subset', 'val_subset'], ['val', 'test', 'train']]:
    print(splits)
    total = sum(v for k in splits for n, h, v in metadata[k])
    for k in splits:
        subtotal = sum(v for n, h, v in metadata[k])
        print("  ", k, f"{subtotal}/{total} = {100.*subtotal/total:.2f}%")
['train_subset', 'val_subset']
   train_subset 41220/102240 = 40.32%
   val_subset 61020/102240 = 59.68%
['val', 'test', 'train']
   val 1320126/19775902 = 6.68%
   test 117679/19775902 = 0.60%
   train 18338097/19775902 = 92.73%
{% endraw %}