---
title: Feature Statistics
keywords: fastai
sidebar: home_sidebar
summary: "Gathering information about the statistics of features in the entire dataset and each component dataset."
description: "Gathering information about the statistics of features in the entire dataset and each component dataset."
nb_path: "02_features.ipynb"
---
{% raw %}
{% endraw %}

I'm going to put some figures in here about the feature statistics. In particular, I want to check that no feature has zero variance.

The features are the numpy arrays named `poses` in each .npz archive. Each one is a sequence of 3D Euler angles referring to both hand and body joints.

'root_orient': data['poses'][:, :3],   # controls the global root orientation
'pose_body':   data['poses'][:, 3:66], # controls the body
'pose_hand':   data['poses'][:, 66:],  # controls finger articulation

I do not know which joints are at what index in this pose vector.

I have concatenated the global body translation array to this because together they are sufficient to describe the pose:

'trans': data['trans'],                # controls global body position
{% raw %}

calculate_stats[source]

calculate_stats(npz_path)

{% endraw %} {% raw %}

stats_for_paths[source]

stats_for_paths(unpacked_directory, n_jobs)

{% endraw %} {% raw %}

jsonize[source]

jsonize(stats)

{% endraw %} {% raw %}
import json
import gzip
import joblib
import numpy as np
import llamass.core
from llamass.tqdm import ProgressParallel


def calculate_stats(npz_path):
    """Compute per-feature mean and standard deviation for one .npz archive.

    The ``poses`` and ``trans`` arrays stored in the archive are joined
    column-wise (``axis=1``), so both must be 2-D with the same number of
    rows.

    Args:
        npz_path: Path of the ``.npz`` archive to read.

    Returns:
        A tuple ``(npz_path, (mean, std, n_rows))`` where ``mean`` and ``std``
        are 1-D arrays computed over axis 0 of the joined array (``np.std``
        with its default ``ddof=0``) and ``n_rows`` is the number of rows.
    """
    # ``np.load`` on an .npz returns a lazy archive object that keeps the file
    # open; the ``with`` block closes it so that mapping this function over
    # many files does not accumulate open file descriptors.
    with np.load(npz_path) as data:
        pose = np.concatenate([data['poses'], data['trans']], axis=1)
    return npz_path, (np.mean(pose, 0), np.std(pose, 0), pose.shape[0])

def stats_for_paths(unpacked_directory, n_jobs):
    """Run ``calculate_stats`` over every path found under a directory.

    Args:
        unpacked_directory: Passed to ``llamass.core.npz_paths`` to enumerate
            the archives to process.
        n_jobs: Forwarded to ``ProgressParallel`` as ``n_jobs``.

    Returns:
        Whatever ``ProgressParallel`` returns for the submitted jobs —
        presumably a list of ``calculate_stats`` results, one per path
        (confirm against ``llamass.tqdm.ProgressParallel``).
    """
    # Materialise the paths up front because the count is needed for ``total``.
    npz_files = list(llamass.core.npz_paths(unpacked_directory))
    runner = ProgressParallel(n_jobs=n_jobs)
    jobs = [joblib.delayed(calculate_stats)(npz_file) for npz_file in npz_files]
    return runner(jobs, total=len(npz_files))

def jsonize(stats):
    """Convert per-file statistics into plain Python containers.

    Args:
        stats: Iterable of ``(path, (mean, std))`` or
            ``(path, (mean, std, n))`` entries, the latter being the shape
            produced by ``calculate_stats``.

    Returns:
        A list of ``(path, (mean_list, std_list[, n]))`` tuples in which the
        arrays have been replaced by lists of built-in numbers. ``path`` is
        passed through unchanged.
    """
    converted = []
    for path, file_stats in stats:
        # Accept the trailing row count emitted by ``calculate_stats``; the
        # previous two-name unpacking raised ValueError on such entries.
        mean, std, *counts = file_stats
        converted.append((
            path,
            (
                # ``tolist`` yields built-in floats rather than NumPy scalars.
                np.asarray(mean).tolist(),
                np.asarray(std).tolist(),
                *(int(count) for count in counts),
            ),
        ))
    return converted

#amass_loc = '/nobackup/gngdb/repos/amass/data'
#stats = stats_for_paths(amass_loc, 8)
{% endraw %} {% raw %}
import os
from pathlib import Path
import matplotlib.pyplot as plt
# Select matplotlib's 'ggplot' style sheet for the figures created below.
plt.style.use('ggplot')

def plot_stats(stats, savedir='.'):
    """Save one histogram figure per feature summarising per-file statistics.

    For every feature index a two-panel figure is written to
    ``savedir/feature_NNN.png``: the left panel is a histogram of that
    feature's per-file means, the right panel a histogram (log-scaled y axis)
    of its per-file standard deviations. Both histograms are weighted by the
    third element of each entry (the row count reported for that file).

    Args:
        stats: Iterable of ``(path, (mean, std, n))`` entries, where ``mean``
            and ``std`` are 1-D arrays of equal length and ``n`` is a number.
        savedir: Directory the images are written to. It is created, including
            missing parents, if it does not exist.

    Raises:
        ValueError: If ``stats`` is empty (raised by ``np.stack``).
    """
    # Create the output directory first so ``savefig`` cannot fail on a
    # missing (possibly nested) path such as 'images/features'.
    savedir = Path(savedir)
    savedir.mkdir(parents=True, exist_ok=True)

    means, stds, weights = [], [], []
    for _path, (m, s, n) in stats:
        means.append(m)
        stds.append(s)
        weights.append(n)
    means, stds, weights = np.stack(means), np.stack(stds), np.stack(weights)

    # Rows index files, columns index features: iterate over the features.
    for i in range(means.shape[1]):
        path = savedir / f'feature_{i:03d}.png'
        m, s = means[:, i], stds[:, i]
        fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(8, 5))
        axes[0].hist(m, bins=32, weights=weights)
        axes[0].set_xlabel('Mean')
        axes[0].set_title(f'Feature {i} Means in Files')
        axes[1].hist(s, bins=32, weights=weights)
        axes[1].set_xlabel('Standard Deviation')
        axes[1].set_yscale('log')
        axes[1].set_title(f'Feature {i} Sigmas in Files')
        fig.tight_layout()
        fig.savefig(path)
        # Close this specific figure so figures do not accumulate in memory
        # across the loop, independent of which figure is "current".
        plt.close(fig)
# Directory for the per-feature histogram images; the lines that create it
# and generate the plots are left commented out below.
savedir = Path('images/features')
#if not savedir.exists():
#    os.mkdir(savedir)
#plot_stats(stats, savedir=savedir)
{% endraw %} {% raw %}
from IPython.display import Image, display
{% endraw %}

Plots are not stored in this repository to avoid repo bloat (every time I regenerate them it'd add 3MB to the git history). They are in this gist instead. The following cell displays the images hosted in the gist.

{% raw %}
# Base URL of the raw files in the gist; the hash in the path presumably pins
# one specific revision — confirm before regenerating the plots.
url = "https://gist.githubusercontent.com/gngdb/680d0a7df321c79338e568fbba62667a/raw/9da5501be42850b36c467943a3ab9c3c2a440b12/"
# One image per feature index. NOTE(review): 159 is presumably the total
# number of feature columns (poses plus trans) — confirm against the data.
for i in range(159):
    # Image names use the zero-padded 'feature_NNN.png' pattern.
    image_url = url + f"feature_{i:03d}.png"
    display(Image(image_url))
{% endraw %}