Source code for synthetic.fetch_data.utk

"""Downlaod and manages train / test split for UTKFaces dataset."""

from sklearn.model_selection import StratifiedKFold
import aia_fairness.dataset_processing as dp

from . import split

def load(sensitive=None, k=0):
    """Load the UTKFaces dataset and return one cross-validation split.

    The formatted data are read through ``dp.load_format``. If that read
    fails, the dataset is fetched with ``dp.load_utk`` and the read is
    retried once.

    :param sensitive: (Optional, default=None) List of sensitive attributes
        to include next to the features. The sensitive attributes are
        "sex" and "race". ``None`` or an empty list returns only the
        features and labels.
    :type sensitive: list of str
    :param k: (Optional, default=0) Cross-validation step in {0,1,2,3,4}.
    :type k: int
    :return: Train and test split numpy.ndarray in a dictionary shaped as
        ``{"train": {...}, "test": {...}}``. Each part holds ``"x"``,
        ``"y"`` and one entry per requested sensitive attribute.
    :rtype: dict
    :raises ValueError: If ``k`` is not one of the available fold indices.
    """
    # The documented folds are {0,1,2,3,4}; make the fold count explicit so
    # the valid range of ``k`` does not depend on a library default.
    n_splits = 5
    if k not in range(n_splits):
        # Fail early with a clear message, before any read or download.
        raise ValueError(
            f"k must be one of {list(range(n_splits))}, got {k!r}"
        )

    # Normalise so that None, [] and () all mean "no sensitive attribute".
    sensitive = [] if sensitive is None else list(sensitive)

    def read_columns():
        """Collect x, y and the requested sensitive columns in one dict."""
        columns = {}
        if not sensitive:
            # Only x and y are needed; "sex" is just the attribute used to
            # satisfy the ``load_format`` call.
            tmp = dp.load_format("UTK", "sex")
            columns["x"] = tmp["x"]
            columns["y"] = tmp["y"]
            return columns
        for attribute in sensitive:
            tmp = dp.load_format("UTK", attribute)
            # NOTE(review): x and y are overwritten on every pass, so this
            # presumes they are the same whichever attribute is requested.
            columns["x"] = tmp["x"]
            columns["y"] = tmp["y"]
            columns[attribute] = tmp["z"]
        return columns

    try:
        data = read_columns()
    except Exception:
        # The exact error raised when the data are missing is not visible
        # here, so any ordinary exception triggers a download and a single
        # retry. KeyboardInterrupt / SystemExit are no longer swallowed.
        print("Downloading UTk")
        dp.load_utk()
        data = read_columns()

    skf = StratifiedKFold(n_splits=n_splits, random_state=1234, shuffle=True)
    for i, (train, test) in enumerate(skf.split(data["x"], data["y"])):
        if i == k:
            # Keep the indices of the requested fold and stop iterating.
            break

    # Every loaded column (x, y and each sensitive attribute) is split with
    # the same indices so that rows stay aligned across columns.
    return {
        "train": {name: column[train] for name, column in data.items()},
        "test": {name: column[test] for name, column in data.items()},
    }