---
title: core
keywords: fastai
sidebar: home_sidebar
summary: "Unpack and load the [AMASS][] dataset for training with a PyTorch iterator."
description: "Unpack and load the [AMASS][] dataset for training with a PyTorch iterator."
nb_path: "00_core.ipynb"
---
import json
import argparse
import functools
import gzip
import math
import os
import random
import warnings
from pathlib import Path
from shutil import unpack_archive
import joblib
import numpy as np
import torch
import torch.utils.data as tudata
from tqdm.auto import tqdm
from llamass.tqdm import ProgressParallel
def lazy_unpack(tarpath, outdir):
# check if this has already been unpacked by looking for hash file
tarpath, outdir = Path(tarpath), Path(outdir)
unpacks_to = hashes[tarpath.name]["unpacks_to"]
hashpath = outdir / Path(unpacks_to + ".hash")
# if the hash exists and it's correct then assume the directory is correctly unpacked
    if hashpath.exists():
        with open(hashpath) as f:
            h = f.read()  # read stored hash
        if h == hashes[tarpath.name]["hash"]:  # hashes: tar filename -> expected md5, defined elsewhere
            return None
    # if there's no stored hash or it doesn't match, unpack the tar file
    unpack_archive(tarpath, outdir)
# calculate the hash of the unpacked directory and check it's the same
h = md5_dir(outdir / unpacks_to)
_h = hashes[tarpath.name]["hash"]
assert h == _h, f"Directory {outdir/unpacks_to} hash {h} != {_h}"
# save the calculated hash
with open(hashpath, "w") as f:
f.write(h)
def unpack_body_models(tardir, outdir, n_jobs=1, verify=False, verbose=False):
tar_root, _, tarfiles = [x for x in os.walk(tardir)][0]
tarfiles = [x for x in tarfiles if "tar" in x.split(".")]
tarpaths = [os.path.join(tar_root, tar) for tar in tarfiles]
for tarpath in tarpaths:
if verbose:
print(f"{tarpath} extracting to {outdir}")
unpack = lazy_unpack if verify else unpack_archive
ProgressParallel(n_jobs=n_jobs)(
(joblib.delayed(unpack)(tarpath, outdir) for tarpath in tarpaths),
total=len(tarpaths),
)
def fast_amass_unpack():
parser = argparse.ArgumentParser(
description="Unpack all the body model tar files in a directory to a target directory"
)
parser.add_argument(
"tardir",
type=str,
help="Directory containing tar.bz2 body model files",
)
parser.add_argument(
"outdir",
type=str,
help="Output directory",
)
parser.add_argument(
"--verify",
action="store_true",
help="Verify the output by calculating a checksum, "
"ensures that each tar file will only be unpacked once.",
)
parser.add_argument(
"-n",
default=1,
type=int,
help="Number of jobs to run the tar unpacking with",
)
args = parser.parse_args()
unpack_body_models(args.tardir, args.outdir, n_jobs=args.n, verify=args.verify)
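For reference, the console script above is a thin wrapper around `unpack_body_models`; a minimal sketch of the equivalent call from Python (the paths are placeholders):

```python
# Unpack every tar.bz2 in a directory of downloaded AMASS archives
unpack_body_models(
    "path/to/amass_tars",  # directory containing the tar.bz2 body model files
    "path/to/unpacked",    # output directory
    n_jobs=4,              # unpack archives in parallel
    verify=True,           # checksum results so finished archives are skipped on re-runs
)
```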
Test that unpacking the sample data always yields the same result:
import tempfile
import hashlib
# https://stackoverflow.com/a/3431838/6937913
def md5(fname):
hash_md5 = hashlib.md5()
with open(fname, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
hash_md5.update(chunk)
return hash_md5.hexdigest()
md5sums = {
"amass_sample.npz": "d0b546b3619c8579ade39e3a8ccdc4e2",
"dmpl_sample.npz": "576bb76b2a6328dc5c276c4150c466f0",
}
with tempfile.TemporaryDirectory() as tmpdirname:
unpack_body_models("sample_data/", tmpdirname, 8)
for r, d, f in os.walk(tmpdirname):
npz_files = [x for x in f if "npz" in x.split(".")]
npz_paths = [os.path.join(tmpdirname, r, x) for x in npz_files]
_md5sums = {os.path.split(fpath)[-1]: md5(fpath) for fpath in npz_paths}
for k in md5sums:
assert md5sums[k] == _md5sums[k]
Testing that `verify=True` works as expected. `hashes` can be redefined here for testing without breaking the exported library, because this cell doesn't get exported by nbdev.
import time
hashes = {
"sample.tar.bz2": {
"unpacks_to": "sample",
"hash": "b5a86fe22ed2799d79101a532eb0ff27",
}
}
with tempfile.TemporaryDirectory() as tmpdirname:
start = time.time()
unpack_body_models("sample_data/", tmpdirname, 8, verify=True)
unpacking_time = time.time() - start
start = time.time()
unpack_body_models("sample_data/", tmpdirname, 8, verify=True)
skip_time = time.time() - start
assert unpacking_time > skip_time
Load the pose data directly from the `npz` files after unpacking.

Based on the AMASS tutorial notebooks, I would like to iterate over the dataset using a PyTorch DataLoader.

Steps to load:

1. Index all of the `npz` files in the AMASS directory
2. Iterate through all of them in sequence
    1. Load the `npz` file
        1. Cut out the acceptable motion sequence in the center of each file (typically the middle 80% of the motion sequence)
        2. _Optionally_ shuffle the dataset
    2. Iterate over this sequence along the first dimension
3. If `num_workers > 0`, give each worker a different random set of `npz` files to load

A minimal sketch of this loop follows the exploration below. Looking at the sample data:
with tempfile.TemporaryDirectory() as tmpdirname:
unpack_body_models("sample_data/", tmpdirname, 8)
for r, d, f in os.walk(tmpdirname):
npz_files = [x for x in f if "npz" in x.split(".")]
npz_paths = [os.path.join(tmpdirname, r, x) for x in npz_files]
for npz_path in npz_paths:
cdata = np.load(npz_path)
print(npz_path)
print(" ", [k for k in cdata.keys()])
print(" ", [(k, cdata[k].shape) for k in cdata.keys()])
The AMASS dataset is composed of 14,096 `.npz` archives (at the time of writing). The size of the archives varies over two orders of magnitude, between 0.1MB and 10MB.

Other statistics we might want to know:

AMASS `npz` files contain 5 fields (`'poses', 'gender', 'betas', 'dmpls', 'trans'`), what do they mean?

- `poses` are SMPLH vectors, which are a representation of pose based on SMPL with additional information about the positions of the hands. What are SMPLH vectors composed of?
- `gender` is the reported gender of the actor (it's not clear if MPI has used their gender classifier here)
- `betas` are "identity-dependent shape parameters"
- `dmpls` are soft tissue deformations described in the original SMPL paper
- `trans`: I think this is the $\gamma$ 3D parameter representing the translation of the root coordinate system; it is required to describe the pose and should probably be concatenated to the pose vector as described in the AMASS paper (a sketch of this concatenation follows the `npz_paths` helper below)

def npz_paths(npz_directory):
npz_directory = Path(npz_directory).resolve()
npz_paths = []
for r, d, f in os.walk(npz_directory, followlinks=True):
for fname in f:
if "npz" == fname.split(".")[-1] and fname != "shape.npz":
yield os.path.join(npz_directory, r, fname)
with tempfile.TemporaryDirectory() as tmpdirname:
unpack_body_models("sample_data/", tmpdirname, 8)
for npz_path in npz_paths(tmpdirname):
assert Path(npz_path).exists()
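As noted in the field descriptions above, `trans` can be concatenated onto the pose vector to include the root translation. A hedged sketch of that concatenation (whether this matches the AMASS paper's exact convention is not verified here, and the loader below does not do this for you):

```python
def pose_with_translation(npz_path):
    # stack the per-frame root translation onto the SMPLH pose vector
    cdata = np.load(npz_path)
    poses, trans = cdata["poses"], cdata["trans"]   # (N, 156) and (N, 3) in AMASS
    return np.concatenate([poses, trans], axis=-1)  # (N, 159)
```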
from contextlib import contextmanager
@contextmanager
def symlink(target, source):
source.symlink_to(target)
try:
yield source
finally:
source.unlink()
with tempfile.TemporaryDirectory() as tmpdirname:
unpack_body_models("sample_data/", tmpdirname, 8)
tmpdirname = Path(tmpdirname)
with symlink(tmpdirname/'sample', Path('sym')) as symlink_loc:
for npz_path in npz_paths(symlink_loc):
assert Path(npz_path).exists(), npz_path
A function to calculate the dataset size. The result of this calculation is stored in this package; the dataset loader will try to load this file or recreate it itself, so you can skip that step by copying the file into the directory where you have unpacked the data.
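For example, if you already have the packaged `npz_file_lens.json.gz`, copying it into the unpacked directory looks like this (the destination path is a placeholder; `AMASS.infer_len` below looks for this filename at the root of the unpacked data):

```python
import shutil

# put the precomputed length index next to the unpacked data so the loader finds it
shutil.copy("npz_file_lens.json.gz", "path/to/unpacked_amass/npz_file_lens.json.gz")
```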
def npz_len(npz_path, strict=True):
cdata = np.load(npz_path)
h = md5_file(npz_path)
dirs = [hashes[h]['unpacks_to'] for h in hashes]
if strict:
m = []
for p in Path(npz_path).parents:
m += [d for d in dirs if p.name == d]
assert len(m) == 1, f"Subdir of {npz_path} contains {len(m)} of {dirs}"
subdir = m[0]
else:
subdir = Path(npz_path).parts[-2]
return subdir, h, cdata["poses"].shape[0]
def npz_lens(unpacked_directory, n_jobs, strict=True):
paths = [p for p in npz_paths(unpacked_directory)]
return ProgressParallel(n_jobs=n_jobs)(
[joblib.delayed(npz_len)(npz_path, strict=strict) for npz_path in paths], total=len(paths)
)
def save_lens(save_path, npz_file_lens):
with gzip.open(save_path, "wt") as f:
f.write(json.dumps(npz_file_lens))
#npz_file_lens = npz_lens('/nobackup/gngdb/repos/amass/data', 10)
#save_lens('npz_file_lens.json.gz', npz_file_lens)
!du -hs npz_file_lens.json.gz
def keep_slice(n, keep):
drop = (1.0 - keep) / 2.0
return slice(int(n * drop), int(n * keep + n * drop))
def viable_slice(cdata, keep):
"""
Inspects a dictionary loaded from `.npz` numpy dumps
and creates a slice of the viable indexes.
args:
- `cdata`: dictionary containing keys:
['poses', 'gender', 'mocap_framerate', 'betas',
'marker_data', 'dmpls', 'marker_labels', 'trans']
- `keep`: ratio of the file to keep, between zero and 1.,
drops leading and trailing ends of the arrays
returns:
- viable: slice that can access frames in the arrays:
cdata['poses'], cdata['marker_data'], cdata['dmpls'], cdata['trans']
"""
assert (
keep > 0.0 and keep <= 1.0
), "Proportion of array to keep must be between zero and one"
n = cdata["poses"].shape[0]
return keep_slice(n, keep)
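A quick worked example of these two helpers: with `keep=0.8`, a 100-frame sequence drops 10 frames from each end, leaving the slice `10:90`:

```python
# keep=0.8 drops 10% of the frames at each end of the sequence
assert keep_slice(100, 0.8) == slice(10, 90)
# the second dimension of `poses` doesn't matter here
assert viable_slice({"poses": np.zeros((100, 156))}, keep=0.8) == slice(10, 90)
```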
with tempfile.TemporaryDirectory() as tmpdirname:
unpack_body_models("sample_data/", tmpdirname, 8)
for npz_path in npz_paths(tmpdirname):
cdata = np.load(npz_path)
print(npz_path)
print(" ", viable_slice(cdata, 0.8))
def npz_contents(
npz_path,
clip_length,
overlapping,
keep=0.8,
keys=("poses", "dmpls", "trans", "betas", "gender"),
shuffle=False,
seed=None,
):
# cache this because we will often be accessing the same file multiple times
cdata = np.load(npz_path)
# slice of viable indices
viable = viable_slice(cdata, keep)
# slice iterator
# every time the file is opened the non-overlapping slices will be the same
# this may not be preferred, but loading overlapping means a lot of repetitive data
def clip_slices(viable, clip_length, overlapping):
i = 0
step = 1 if overlapping else clip_length
for i in range(viable.start, viable.stop, step):
            if i + clip_length <= viable.stop:  # <= so the final full-length clip is included
yield slice(i, i + clip_length)
# buffer the iterator and shuffle here, when implementing that
buf_clip_slices = [s for s in clip_slices(viable, clip_length, overlapping)]
if shuffle:
        # this will be correlated over workers;
        # ideally the seed should be drawn from a torch Generator and passed in
        seed = seed if seed is not None else random.randint(0, int(1e6))
random.Random(seed).shuffle(buf_clip_slices)
# iterate over slices
for s in buf_clip_slices:
data = {}
# unpack and enforce data type
to_load = [k for k in ("poses", "dmpls", "trans") if k in keys]
for k in to_load:
data[k] = cdata[k][s].astype(np.float32)
if "betas" in keys:
r = s.stop - s.start
data["betas"] = np.repeat(
cdata["betas"][np.newaxis].astype(np.float32), repeats=r, axis=0
)
if "gender" in keys:
def gender_to_int(g):
# casting gender to integer will raise a warning in future
g = str(g.astype(str))
return {"male": -1, "neutral": 0, "female": 1}[g]
data["gender"] = np.array(
[gender_to_int(cdata["gender"]) for _ in range(s.start, s.stop)]
)
yield data
def test_load_npz(clip_length, overlapping):
with tempfile.TemporaryDirectory() as tmpdirname:
unpack_body_models("sample_data/", tmpdirname, 8)
for npz_path in npz_paths(tmpdirname):
for data in npz_contents(npz_path, clip_length, overlapping):
print([(k, data[k].shape) for k in data])
for k in data:
assert data[k].shape[0] == clip_length
break
test_load_npz(1, False)
test_load_npz(3, False)
test_load_npz(3, True)
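To make the clip counting concrete: a viable slice of N frames yields floor(N / L) clips of length L when non-overlapping, and N - L + 1 clips when overlapping. This is the same arithmetic `lenslice` uses in `AMASS.infer_len` below, so the dataset length can be computed without opening every file. A quick check with example numbers:

```python
N, L = 80, 3        # e.g. 80 viable frames, clips of 3 frames
print(N // L)       # 26 non-overlapping clips
print(N - L + 1)    # 78 overlapping clips
```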
[p for p in Path('/nobackup/gngdb/repos/amass/data').glob('*') if p.is_dir()]
class AMASS(tudata.IterableDataset):
def __init__(
self,
amass_location,
clip_length,
overlapping,
keep=0.8,
transform=None,
data_keys=("poses", "dmpls", "trans", "betas", "gender"),
file_list_seed=0,
shuffle=False,
seed=None,
strict=True
):
assert clip_length > 0 and type(clip_length) is int
self.transform = transform
self.data_keys = data_keys
self.amass_location = amass_location
# these should be shuffled but pull shuffle argument out of dataloader worker arguments
self._npz_paths = [npz_path for npz_path in npz_paths(amass_location)]
random.Random(file_list_seed).shuffle(self._npz_paths)
self._npz_paths = tuple(self._npz_paths)
self.npz_paths = self._npz_paths
self.clip_length = clip_length
self.overlapping = overlapping
self.keep = keep
self.shuffle = shuffle
        self.seed = seed if seed is not None else random.randint(0, int(1e6))
self.strict = strict
def infer_len(self, n_jobs=4):
# uses known dimensions of the npz files in the AMASS dataset to infer the length
# with clip_length and overlapping settings stored
lenfile = Path(self.amass_location) / Path("npz_file_lens.json.gz")
# try to load file
if lenfile.exists():
with gzip.open(lenfile, "rt") as f:
self.npz_lens = json.load(f)
def filter_lens(npz_lens):
# filter out file length information to only existing dirs
datasets = [p.name for p in Path(self.amass_location).glob('*') if p.is_dir()]
return [(p, h, l) for p, h, l in npz_lens
if p in datasets]
self.npz_lens = filter_lens(self.npz_lens)
else: # if it's not there, recompute it and create the file
print(f'Inspecting {len(self.npz_paths)} files to determine dataset length'
f', saving the result to {lenfile}')
self.npz_lens = npz_lens(self.amass_location, n_jobs, strict=self.strict)
save_lens(lenfile, self.npz_lens)
# using stored lengths to infer the total dataset length
def lenslice(s):
if self.overlapping:
return (s.stop - s.start) - (self.clip_length - 1)
else:
return math.floor((s.stop - s.start) / self.clip_length)
N = 0
for p, h, l in self.npz_lens:
s = keep_slice(l, keep=self.keep)
N += lenslice(s)
return N
def __len__(self):
if hasattr(self, "N"):
return self.N
else:
self.N = self.infer_len()
return self.N
def __iter__(self):
if self.shuffle:
self.npz_paths = list(self.npz_paths)
random.Random(self.seed).shuffle(self.npz_paths)
for npz_path in self.npz_paths:
for data in npz_contents(
npz_path,
self.clip_length,
self.overlapping,
keys=self.data_keys,
keep=self.keep,
shuffle=self.shuffle,
seed=self.seed,
):
self.seed += 1 # increment to vary shuffle over files
                yield {k: self.transform(data[k]) if self.transform else data[k] for k in data}
Test that I can load some data with this Dataset:
with tempfile.TemporaryDirectory() as tmpdirname:
unpack_body_models("sample_data/", tmpdirname, 8)
amass = AMASS(tmpdirname, overlapping=False, clip_length=1, transform=torch.tensor)
for data in amass:
for k in data:
print(k, data[k].shape)
assert type(data[k]) is torch.Tensor
break
print(len(amass))
Test that it works in a DataLoader to make batches:
with tempfile.TemporaryDirectory() as tmpdirname:
unpack_body_models("sample_data/", tmpdirname, 8)
amass = AMASS(tmpdirname, overlapping=False, clip_length=1, transform=torch.tensor)
amasstrain = tudata.DataLoader(amass, batch_size=4)
for i, data in enumerate(amasstrain):
for k in data:
print(k, data[k].shape)
assert data["poses"].size(0) == 4, f'{data["poses"].size()}'
break
To work with `num_workers > 0` I'm going to pass a different set of `npz` files to each worker using a `worker_init_fn`.

The following `worker_init_fn` must always be used when using `num_workers > 0` or data will be duplicated. To simplify this, I am providing a DataLoader class that bakes this `worker_init_fn` in when `num_workers > 0`.
def worker_init_fn(worker_id):
worker_info = torch.utils.data.get_worker_info()
# slice up dataset among workers
dataset = worker_info.dataset
overall_npz_paths = dataset._npz_paths
step = int(len(overall_npz_paths) / float(worker_info.num_workers))
n = len(overall_npz_paths)
assert n >= worker_info.num_workers, (
"Every worker must get at least one file:" f" {worker_info.num_workers} > {n}"
)
start, stop = 0, n
for worker_idx, i in enumerate(range(start, stop, step)):
if worker_idx == worker_info.id:
worker_slice = slice(i, min(i + step, n + 1))
dataset.npz_paths = overall_npz_paths[worker_slice]
    # set each worker's seed
dataset.seed = dataset.seed + worker_info.seed
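To illustrate the partitioning above, here is a standalone sketch of the same `step`/`slice` arithmetic, run outside of any DataLoader worker (the file names are placeholders):

```python
# Mirrors the slicing arithmetic in worker_init_fn for 8 files split across 2 workers
files = [f"file_{i}.npz" for i in range(8)]
num_workers = 2
step = int(len(files) / float(num_workers))  # 4 files per worker
for worker_idx, start in enumerate(range(0, len(files), step)):
    print(worker_idx, files[start:min(start + step, len(files) + 1)])
# worker 0 gets file_0..file_3, worker 1 gets file_4..file_7
```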
class IterableLoader(tudata.DataLoader):
def __init__(self, *args, **kwargs):
kwargs['worker_init_fn'] = worker_init_fn
super().__init__(*args, **kwargs)
def test_dataloader():
with tempfile.TemporaryDirectory() as tmpdirname:
unpack_body_models("sample_data/", tmpdirname, 8)
amass = AMASS(
tmpdirname, overlapping=False, clip_length=1, transform=torch.tensor
)
amasstrain = tudata.DataLoader(
amass, batch_size=4, worker_init_fn=worker_init_fn, num_workers=2
)
for i, data in enumerate(amasstrain):
for k in data:
print(k, data[k].shape)
assert data["poses"].size(0) == 4, f'{data["poses"].size()}'
break
amasstrain = IterableLoader(
amass, batch_size=4, num_workers=2
)
for i, data in enumerate(amasstrain):
for k in data:
print(k, data[k].shape)
assert data["poses"].size(0) == 4, f'{data["poses"].size()}'
break
test_dataloader()
def test_runtime(unpacked_dir, batch_size, num_workers):
amass = AMASS(
unpacked_dir, overlapping=False, clip_length=1, transform=torch.tensor, seed=0
)
    amasstrain = tudata.DataLoader(
amass, batch_size=batch_size, worker_init_fn=worker_init_fn, num_workers=num_workers
)
start = time.time()
i = 0
for data in tqdm(amasstrain):
i += 1
if i > 100:
break
elapsed = time.time() - start
total_hours = ((elapsed / i) * len(amasstrain)) / (60 ** 2)
return elapsed, elapsed / i, total_hours
# test_runtime('/nobackup/gngdb/repos/amass/data', 256, 8)
# test_runtime('/scratch/gobi1/gngdb/amass', 256, 8)
Rough results from testing the runtime on the full dataset on my workstation:
PyTorch DataLoaders don't support shuffling an `IterableDataset` because the data is assumed to arrive as an IID stream. For this problem, that means the shuffling has to be implemented elsewhere.

There are two parts to shuffle:

1. The order of clips within each file
2. The order of the files themselves

The first is easy and can be an option to the iterator over each file. It doesn't affect how each worker operates because no two workers should ever touch the same file.

The second is more difficult because each worker has a different list of files. It's also important that the order of the global list of files is random, because some files are much larger than others and randomising the order means each worker receives approximately the same number of examples. However, every worker initialises a separate dataset, so each dataset has to have access to the same list of files. I think the best way to ensure this is to use a shared random seed to shuffle the list of files at initialisation.
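A small sketch of the shared-seed idea: two independently constructed copies of the file list end up in the same shuffled order, so per-worker slices of that order never overlap (file names here are placeholders):

```python
import random

files = [f"file_{i}.npz" for i in range(6)]  # stand-in for the global npz file list
order_a, order_b = list(files), list(files)  # e.g. the copies held by two workers' datasets
random.Random(0).shuffle(order_a)            # same seed in each dataset...
random.Random(0).shuffle(order_b)
assert order_a == order_b                    # ...gives an identical global order
```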
def test_shuffling(num_workers):
with tempfile.TemporaryDirectory() as tmpdirname:
unpack_body_models("sample_data/", tmpdirname, 8)
amass = AMASS(
tmpdirname,
overlapping=False,
clip_length=1,
transform=torch.tensor,
shuffle=True,
seed=0,
)
amasstrain = tudata.DataLoader(
amass, batch_size=4, worker_init_fn=worker_init_fn, num_workers=num_workers
)
for i, data in enumerate(amasstrain):
_data = data["poses"]
break
# second epoch shouldn't produce the same minibatch
for i, data in enumerate(amasstrain):
data = data["poses"]
data, _data = data.numpy(), _data.numpy()
assert not np.allclose(data, _data)
break
for num_workers in range(3):
test_shuffling(num_workers)
Console utility to split directories in a standard way
The original preprocessing script splits directories the following way:
original_amass_splits = {
'val' : ['HumanEva', 'MPI_HDM05', 'SFU', 'MPI_mosh'],
'test': ['Transitions_mocap', 'SSM_synced'],
'train': ['CMU', 'MPI_Limits', 'TotalCapture', 'Eyes_Japan_Dataset', 'KIT', 'BML', 'EKUT', 'TCD_handMocap', 'ACCAD']
}
This doesn't allocate all of the datasets composing AMASS to a split:
Datasets not allocated to a split: {'BMLmovi', 'BioMotionLab_NTroje', 'DanceDB', 'BMLhandball', 'DFaust_67'}
Obtained by evaluating the following cell on an unpacked directory of AMASS data:
def unallocated_splits(unpacked_dir):
data_path = Path(unpacked_dir)
all_subdirs = {f.name for f in data_path.iterdir() if f.is_dir()}
unallocated = all_subdirs - {d for k in amass_splits for d in amass_splits[k]}
print(f'Datasets not allocated to a split: {unallocated}')
# unallocated_splits('/nobackup/gngdb/repos/amass/data')
I'm going to include all the newer datasets in the training set.
amass_splits = {
'val' : ['HumanEva', 'MPI_HDM05', 'SFU', 'MPI_mosh'],
'test': ['Transitions_mocap', 'SSM_synced'],
'train': ['CMU', 'MPI_Limits', 'TotalCapture', 'Eyes_Japan_Dataset',
'KIT', 'BML', 'EKUT', 'TCD_handMocap', 'ACCAD',
'BMLmovi', 'BioMotionLab_NTroje', 'DanceDB', 'BMLhandball', 'DFaust_67']
}
These functions move the directories into subdirectories that define the splits, or undo this change.
def move_dirs_into_splits(amass_loc, splits, undo=False):
amass_loc = Path(amass_loc)
    for k in splits:
        split_dir = amass_loc / Path(k)
        if not split_dir.exists():
            os.mkdir(split_dir)
        for d in splits[k]:
d = Path(d)
t = split_dir/d
f = amass_loc/d
try:
if undo:
os.rename(t, f)
else:
os.rename(f, t)
except FileNotFoundError:
warnings.warn(f'Could not find {d} for {k} split')
move_dirs_out_of_splits = functools.partial(move_dirs_into_splits, undo=True)
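A usage sketch (the path is a placeholder for wherever AMASS was unpacked):

```python
# Move the unpacked dataset directories into train/val/test subdirectories...
move_dirs_into_splits("path/to/unpacked_amass", amass_splits)
# ...and put them back in the root location afterwards
move_dirs_out_of_splits("path/to/unpacked_amass", amass_splits)
```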
def console_split_dirs():
parser = argparse.ArgumentParser(
description="Split AMASS Dataset subdirs into train/val/test"
)
parser.add_argument(
"amassloc",
type=str,
help="Location where AMASS has been unpacked",
)
parser.add_argument(
"--undo",
action="store_true",
help="Undo move into subdirectories, put them all back in the root AMASS location",
)
args = parser.parse_args()
move_dirs_into_splits(args.amassloc, amass_splits, undo=args.undo)
import urllib.request
train_url = "https://raw.githubusercontent.com/eth-ait/spl/master/preprocessing/training_fnames.txt"
val_url = "https://raw.githubusercontent.com/eth-ait/spl/master/preprocessing/validation_fnames.txt"
test_url = "https://raw.githubusercontent.com/eth-ait/spl/master/preprocessing/test_fnames.txt"
splits = {}
for url in [train_url, val_url, test_url]:
split = url.split("/")[-1].split("_")[0]
with urllib.request.urlopen(url) as f:
file_list = f.read().decode('UTF-8').split("\n")
splits[split] = file_list
for split, file_list in splits.items():
print(split, " = ", set(f.split("/")[0] for f in file_list))
spl_splits = dict(
training = {'CMU_Kitchen', 'Eyes', 'HEva', '', 'MIXAMO', 'Transition', 'CMU', 'SSM', 'BioMotion', 'JointLimit', 'ACCAD', 'HDM05'},
validation = {'Eyes', '', 'MIXAMO', 'Transition', 'CMU', 'SSM', 'BioMotion', 'JointLimit', 'ACCAD', 'HDM05'},
test = {'CMU_Kitchen', 'Eyes', 'HEva', '', 'MIXAMO', 'Transition', 'CMU', 'BioMotion', 'JointLimit', 'ACCAD', 'HDM05'}
)
# iterate over every subdirectory in AMASS and open the json files containing information on file sizes
amass_loc = "/nobackup/gngdb/repos/amass/data/"
metadata = {}
for d in Path(amass_loc).iterdir():
if d.is_dir():
npz_len_file = d / "npz_file_lens.json.gz"
with gzip.open(npz_len_file) as f:
metadata[d.stem] = json.load(f)
for splits in [['train_subset', 'val_subset'], ['val', 'test', 'train']]:
print(splits)
total = sum(v for k in splits for n, h, v in metadata[k])
for k in splits:
subtotal = sum(v for n, h, v in metadata[k])
print(" ", k, f"{subtotal}/{total} = {100.*subtotal/total:.2f}%")