---
title: Feature Statistics
keywords: fastai
sidebar: home_sidebar
summary: "Gathering information about the statistics of features in the entire dataset and each component dataset."
description: "Gathering information about the statistics of features in the entire dataset and each component dataset."
nb_path: "02_features.ipynb"
---
I'm going to put some figures in here about the feature statistics. In particular, I want to check that no feature has zero variance.
The features are the numpy arrays named `poses` in each `.npz` archive. Each array is a sequence of 3D Euler angles referring to both hand and body joints.
'root_orient': data['poses'][:, :3], # controls the global root orientation
'pose_body': data['poses'][:, 3:66], # controls the body
'pose_hand': data['poses'][:, 66:], # controls finger articulation
I do not know which joints are at what index in this pose vector.
I have concatenated the global body translation array to this because together they are sufficient to describe the pose:
'trans': data['trans'], # controls global body position
import json
import gzip
import joblib
import numpy as np
import llamass.core
from llamass.tqdm import ProgressParallel
def calculate_stats(npz_path):
    """Compute per-feature statistics for a single ``.npz`` archive.

    The ``poses`` and ``trans`` arrays stored in the archive are joined
    along axis 1 into one feature matrix, and the statistics are taken
    over axis 0 (one value per feature column).

    Args:
        npz_path: Path of the ``.npz`` archive to read.

    Returns:
        A tuple ``(npz_path, (mean, std, n_rows))`` where ``mean`` and
        ``std`` are 1D arrays with one entry per feature column and
        ``n_rows`` is the number of rows in the feature matrix.
    """
    # ``np.load`` on an .npz archive returns an ``NpzFile`` that keeps the
    # underlying file open; use it as a context manager so the handle is
    # released even when this function is mapped over many files.
    with np.load(npz_path) as data:
        pose = np.concatenate([data['poses'], data['trans']], axis=1)
    return npz_path, (np.mean(pose, 0), np.std(pose, 0), pose.shape[0])
def stats_for_paths(unpacked_directory, n_jobs):
    """Run ``calculate_stats`` on every path found under a directory.

    Args:
        unpacked_directory: Passed to ``llamass.core.npz_paths`` to
            enumerate the files to process (presumably the root of the
            unpacked dataset -- confirm against ``llamass.core``).
        n_jobs: Forwarded to ``ProgressParallel`` as ``n_jobs``.

    Returns:
        Whatever ``ProgressParallel`` returns for the submitted jobs;
        each job is one ``calculate_stats(npz_path)`` call.
    """
    runner = ProgressParallel(n_jobs=n_jobs)
    # Materialise the paths up front so the total is known for the runner.
    npz_files = list(llamass.core.npz_paths(unpacked_directory))
    jobs = []
    for npz_file in npz_files:
        jobs.append(joblib.delayed(calculate_stats)(npz_file))
    return runner(jobs, total=len(npz_files))
def jsonize(stats):
    """Convert per-file statistics into plain Python containers.

    Accepts entries shaped like the output of ``calculate_stats``, i.e.
    ``(path, (mean, std, n_rows))``, as well as the two-element form
    ``(path, (mean, std))``. The previous implementation only unpacked
    two elements and raised ``ValueError`` on the three-element form.

    Args:
        stats: Iterable of ``(path, values)`` pairs, where ``values`` is
            ``(mean, std)`` or ``(mean, std, n_rows)`` and ``mean`` /
            ``std`` are array-likes.

    Returns:
        A list of ``(path, (mean_list, std_list))`` tuples, with an
        ``int`` row count appended to the inner tuple when the input
        entry provided one. ``path`` is passed through unchanged.
    """
    converted = []
    for path, values in stats:
        mean, std, *count = values
        # ``tolist`` yields built-in floats regardless of the array dtype,
        # so the result can be handed to ``json.dumps`` directly.
        entry = (np.asarray(mean).tolist(), np.asarray(std).tolist())
        entry += tuple(int(n) for n in count)
        converted.append((path, entry))
    return converted
#amass_loc = '/nobackup/gngdb/repos/amass/data'
#stats = stats_for_paths(amass_loc, 8)
import os
from pathlib import Path
import matplotlib.pyplot as plt
# Select matplotlib's 'ggplot' style sheet for the figures created below.
plt.style.use('ggplot')
def plot_stats(stats, savedir='.'):
    """Save one histogram figure per feature summarising per-file statistics.

    For every feature index a two-panel figure is written to
    ``savedir/feature_{i:03d}.png``: the left panel is a histogram of the
    per-file means, the right panel a histogram of the per-file standard
    deviations with a log-scaled y axis. Both histograms weight each file
    by its row count.

    Args:
        stats: Iterable of ``(path, (mean, std, n_rows))`` entries, where
            ``mean`` and ``std`` are 1D arrays with one value per feature.
        savedir: Directory the images are written to. It is created
            (including parents) if it does not exist.

    Returns:
        None. If ``stats`` is empty nothing is written.
    """
    # Materialise first so generators are supported and emptiness can be
    # checked; with no entries there is nothing to stack or plot.
    stats = list(stats)
    if not stats:
        return

    out_dir = Path(savedir)
    # ``savefig`` does not create missing directories.
    out_dir.mkdir(parents=True, exist_ok=True)

    per_file = [file_stats for _, file_stats in stats]
    means, stds, weights = (np.stack(column) for column in zip(*per_file))

    # iterate over feature indexes
    for i in range(means.shape[1]):
        fig, (mean_ax, std_ax) = plt.subplots(nrows=1, ncols=2, figsize=(8, 5))

        mean_ax.hist(means[:, i], bins=32, weights=weights)
        mean_ax.set_xlabel('Mean')
        mean_ax.set_title(f'Feature {i} Means in Files')

        std_ax.hist(stds[:, i], bins=32, weights=weights)
        std_ax.set_xlabel('Standard Deviation')
        std_ax.set_yscale('log')
        std_ax.set_title(f'Feature {i} Sigmas in Files')

        fig.tight_layout()
        fig.savefig(out_dir / f'feature_{i:03d}.png')
        # Close this specific figure rather than whichever one happens to
        # be "current", so figures never accumulate across iterations.
        plt.close(fig)
# Output directory passed to the (currently disabled) plot_stats call below.
savedir = Path('images/features')
# NOTE(review): directory creation and plotting are left commented out,
# presumably because they need `stats` from a local dataset run -- confirm.
#if not savedir.exists():
# os.mkdir(savedir)
#plot_stats(stats, savedir=savedir)
from IPython.display import Image, display
Plots are not displayed here to avoid repo bloat (every time I regenerate them it'd add 3MB to the git history). They are in this gist instead. The following cell displays the images hosted in the gist.
# Base URL of the raw gist files; the path includes a fixed revision hash.
url = "https://gist.githubusercontent.com/gngdb/680d0a7df321c79338e568fbba62667a/raw/9da5501be42850b36c467943a3ab9c3c2a440b12/"
# Display one image per feature index, named with the `feature_{i:03d}.png`
# pattern that plot_stats uses when saving.
# NOTE(review): 159 is hard-coded; presumably the number of feature columns
# (poses plus trans) -- confirm against the data.
for i in range(159):
image_url = url + f"feature_{i:03d}.png"
display(Image(image_url))