Source code for synthetic_aia_mia.fetch_data.utk
"""Download and manage the train / test split for the UTKFace dataset."""
from cv2 import pyrDown, pyrUp
from sklearn.model_selection import StratifiedKFold
import aia_fairness.dataset_processing as dp
import numpy as np
from PIL import Image
from pathlib import Path
import os
import kaggle
from zipfile import ZipFile
import copy
import matplotlib.pyplot as plt
from . import split
[docs]
class StorageData:
    """A dataset structure that loads images from storage.

    On initialisation the directory
    ``<cwd>/data_raw/UTK/utkface_aligned_cropped/crop_part1`` is listed and
    every file name is parsed to obtain the label and the attributes.  When
    that directory does not exist, the ``jangedoo/utkface-new`` Kaggle
    dataset is downloaded and unzipped first.  Only paths and labels are kept
    in memory; the pixels are read from disk in :meth:`__getitem__`.

    The instance exposes four numpy arrays of equal length:

    * ``x``: path of each retained image file,
    * ``y``: 1 when the age parsed from the file name is above 50, else 0,
    * ``sex``: integer parsed from the second ``_``-separated field,
    * ``race``: integer parsed from the third field (only 0 and 1 are kept).
    """

    def __init__(self):
        path = Path(os.getcwd(), "data_raw", "UTK")
        os.makedirs(path, exist_ok=True)

        # The download is skipped when the extracted image folder is already
        # present, so a pre-downloaded copy of UTK can be used as is.
        imgpath = Path(path, "utkface_aligned_cropped", "crop_part1")
        if not os.path.exists(imgpath):
            kaggle.api.authenticate()
            kaggle.api.dataset_download_files("jangedoo/utkface-new", path)
            with ZipFile(Path(path, "utkface-new.zip")) as z:
                z.extractall(path=path)

        # os.listdir returns entries in arbitrary order; sorting keeps the
        # sample order (and therefore any seeded split of it) reproducible.
        files = sorted(os.listdir(imgpath))

        # Parse file name to obtain labels and attributes
        paths, labels, races, sexes = [], [], [], []
        for file in files:
            age, sex, race = self._parse_filename(file)
            # Only the samples whose race code is 0 or 1 are retained.
            if race in (0, 1):
                paths.append(Path(imgpath, file))
                labels.append(age)
                races.append(race)
                sexes.append(sex)

        self.x = np.array(paths)
        self.y = np.array(labels)
        self.race = np.array(races)
        self.sex = np.array(sexes)

    @staticmethod
    def _parse_filename(name):
        """Parse a ``<age>_<sex>_<race>_<rest>`` file name.

        :param name: File name (without directory).
        :type name: str
        :return: ``(y, sex, race)`` where ``y`` is 1 if age > 50 else 0.
            ``race`` is set to 3 when the race field is missing or is not an
            integer, which makes the caller discard the sample.
        :rtype: tuple of int
        :raises ValueError: If the age or sex field is not an integer.
        """
        fields = name.split("_")
        if len(fields) < 2:
            raise ValueError(f"Unexpected UTK file name: {name!r}")
        y = int(int(fields[0]) > 50)
        sex = int(fields[1])

        # The race field must be closed by a third underscore.  Names with a
        # field missing (e.g. "39_1_<date>.jpg") have the date in that
        # position and must not be mistaken for a race code.
        race = 3
        if len(fields) > 3:
            try:
                race = int(fields[2])
            except ValueError:
                race = 3
        return y, sex, race

    def extraction(self, idx):
        """Return a new dataset restricted to the samples at ``idx``.

        The current instance is left untouched.  ``__init__`` is not run
        again, so the directory is neither listed nor downloaded a second
        time.

        :param idx: Positions of the samples to keep.
        :type idx: sequence of int
        :return: Dataset holding only the selected samples.
        :rtype: StorageData
        """
        # dtype=int keeps an empty selection usable as an index array.
        idx = np.asarray(idx, dtype=int)
        subset = copy.copy(self)
        subset.x = self.x[idx]
        subset.y = self.y[idx]
        subset.race = self.race[idx]
        subset.sex = self.sex[idx]
        return subset

    def __getitem__(self, i):
        """return ith element in the following order : image, y, sex, race.

        The image is read from disk, reduced by two successive ``pyrDown``
        calls, its axis 2 is moved to position 0 and the values are divided
        by 255.
        """
        # The context manager closes the file once the pixels are copied.
        with Image.open(self.x[i]) as img:
            x = np.asarray(img)
        x = pyrDown(pyrDown(x))
        # assumes a 3-D (height, width, channel) image -- TODO confirm that
        # no single-channel file is present in the folder.
        x = np.moveaxis(x, 2, 0)
        x = x.astype(float) / 255.0
        return x, self.y[i], self.sex[i], self.race[i]

    def __len__(self):
        """Return the number of retained samples."""
        return len(self.y)

    def __str__(self):
        """Return a short summary instead of ``None`` (which breaks ``str``)."""
        return f"{type(self).__name__}(n={len(self)})"
[docs]
def load(k=0, p=1):
    """Load UTK in a dictionary with train and test.

    The data is split with a shuffled, seeded ``StratifiedKFold`` stratified
    on ``data.y``; fold ``k`` provides the train and test indices.  Each of
    the two index arrays is then cut down to its first ``int(p * len)``
    entries.

    :param k: Validation step in {0,1,2,3,4}.
    :type k: int
    :param p: Proportion of data used in [0,1].
    :type p: float
    :return: Dictionary containing train and test.
    :rtype: Dictionary of StorageData
    :raises ValueError: If ``k`` is not a valid fold number.
    """
    skf = StratifiedKFold(random_state=123, shuffle=True)

    # Reject a bad fold number up front: otherwise the loop below never
    # matches and the function fails later on unbound local variables.
    # Checked before building the dataset, which may trigger a download.
    n_splits = skf.get_n_splits()
    if k not in range(n_splits):
        raise ValueError(
            f"k must be an integer in [0, {n_splits - 1}], got {k!r}"
        )

    data = StorageData()
    for i, (tmp_train, tmp_test) in enumerate(skf.split(data, data.y)):
        if i == k:
            train = tmp_train[:int(p * len(tmp_train))]
            test = tmp_test[:int(p * len(tmp_test))]
            # The remaining folds are not needed.
            break

    data_split = {}
    data_split["train"] = data.extraction(train)
    data_split["test"] = data.extraction(test)
    return data_split