Source code for datasetsdefer.synthetic_data

import logging
import sys

import numpy as np
import torch
import torch.distributions as D

sys.path.append('../')
from .basedataset import BaseDataset
from networks.linear_net import Linear_net_sig
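# `Linear_net_sig` is imported from networks/linear_net.py and its definition is
# not shown on this page. From how it is used below -- called on a batch of
# d-dimensional inputs to produce scores in [0, 1] that are rounded to hard 0/1
# decisions -- a minimal sketch would be a single linear layer followed by a
# sigmoid. This is an assumption for illustration, not the actual definition:
#
#     class Linear_net_sig(torch.nn.Module):
#         def __init__(self, d):
#             super().__init__()
#             self.fc = torch.nn.Linear(d, 1)
#
#         def forward(self, x):
#             return torch.sigmoid(self.fc(x))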

[docs]class SyntheticData(BaseDataset):
    """Synthetic dataset introduced in our work."""

    def __init__(self, train_samples=1000, test_samples=1000,
                 data_distribution="mix_of_guassians", d=10, mean_scale=1,
                 expert_deferred_error=0, expert_nondeferred_error=0.5,
                 machine_nondeferred_error=0, num_of_guassians=10,
                 val_split=0.1, batch_size=1000, transforms=None):
        '''
        train_samples: number of training samples
        test_samples: number of test samples
        data_distribution: distribution of the data, "mix_of_guassians" or "uniform"
        d: dimension of the data
        mean_scale: scale of the means of the Gaussians, or of the uniform range
        expert_deferred_error: error rate of the expert on deferred points
        expert_nondeferred_error: error rate of the expert on non-deferred points
        machine_nondeferred_error: error rate of the machine on non-deferred points
        num_of_guassians: number of components in the Gaussian mixture
        val_split: fraction of the training samples used for validation
        batch_size: batch size for the data loaders
        transforms: optional transforms (stored, but not applied by this class)
        '''
        self.val_split = val_split
        self.batch_size = batch_size
        self.train_split = 1 - val_split
        self.transforms = transforms
        self.train_samples = train_samples
        self.test_samples = test_samples
        self.total_samples = train_samples + test_samples
        self.data_distribution = data_distribution
        self.d = d
        self.n_dataset = 2
        self.mean_scale = mean_scale
        self.expert_deferred_error = expert_deferred_error
        self.expert_nondeferred_error = expert_nondeferred_error
        self.machine_nondeferred_error = machine_nondeferred_error
        self.num_of_guassians = num_of_guassians
        self.generate_data()
[docs]    def generate_data(self):
        if self.data_distribution == "uniform":
            data_x = torch.rand((self.total_samples, self.d)) * self.mean_scale
        else:
            mix = D.Categorical(torch.ones(self.num_of_guassians,))
            comp = D.Independent(
                D.Normal(torch.randn(self.num_of_guassians, self.d),
                         torch.rand(self.num_of_guassians, self.d)), 1)
            gmm = torch.distributions.mixture_same_family.MixtureSameFamily(mix, comp)
            data_x = gmm.sample((self.total_samples,)) * self.mean_scale

        # sample a random linear rejector that defers between 20% and 80% of the time
        mean_rej_prop = 0
        while not (0.2 <= mean_rej_prop <= 0.8):
            net_rej_opt = Linear_net_sig(self.d)
            with torch.no_grad():
                outputs = net_rej_opt(data_x)
                predicted = torch.round(outputs.data)
                mean_rej_prop = np.mean([predicted[i][0] for i in range(len(predicted))])
        # rejector predictions on x
        with torch.no_grad():
            outputs = net_rej_opt(data_x)
            predicted = torch.round(outputs.data)
            opt_rej_preds = [predicted[i][0] for i in range(len(predicted))]

        # sample a random linear classifier that predicts 1 on at least 20% and at
        # most 80% of the non-deferred points
        mean_class_prop = 0
        while not (0.2 <= mean_class_prop <= 0.8):
            net_mach_opt = Linear_net_sig(self.d)
            with torch.no_grad():
                outputs = net_mach_opt(data_x)
                predicted = torch.round(outputs.data)
                predicted_class = [predicted[i][0] * (1 - opt_rej_preds[i])
                                   for i in range(len(predicted))]
                mean_class_prop = np.sum(predicted_class) / (len(opt_rej_preds) - np.sum(opt_rej_preds))
        # classifier predictions on x
        with torch.no_grad():
            outputs = net_mach_opt(data_x)
            predicted = torch.round(outputs.data)
            opt_mach_preds = [predicted[i][0] for i in range(len(predicted))]

        # start from random labels
        data_y = torch.randint(low=0, high=2, size=(self.total_samples,))
        # make labels consistent with net_mach_opt on the non-deferred side, up to
        # the specified machine error rate
        for i in range(len(data_y)):
            if opt_rej_preds[i] == 0:
                coin = np.random.binomial(1, 1 - self.machine_nondeferred_error, 1)[0]
                if coin == 1:
                    data_y[i] = opt_mach_preds[i]
        # make the expert (1 - expert_deferred_error) accurate on the deferred side
        # and (1 - expert_nondeferred_error) accurate otherwise
        human_predictions = [0] * len(data_y)
        for i in range(len(data_y)):
            if opt_rej_preds[i] == 1:
                coin = np.random.binomial(1, 1 - self.expert_deferred_error, 1)[0]
            else:
                coin = np.random.binomial(1, 1 - self.expert_nondeferred_error, 1)[0]
            if coin == 1:
                human_predictions[i] = data_y[i]
            else:
                human_predictions[i] = 1 - data_y[i]
        human_predictions = torch.tensor(human_predictions)

        # split into train, val, test
        train_size = int(self.train_samples * self.train_split)
        val_size = int(self.train_samples * self.val_split)
        test_size = len(data_x) - train_size - val_size  # = self.test_samples (up to rounding)
        self.train_x, self.val_x, self.test_x = torch.utils.data.random_split(
            data_x, [train_size, val_size, test_size],
            generator=torch.Generator().manual_seed(42))
        self.train_y, self.val_y, self.test_y = torch.utils.data.random_split(
            data_y, [train_size, val_size, test_size],
            generator=torch.Generator().manual_seed(42))
        self.train_h, self.val_h, self.test_h = torch.utils.data.random_split(
            human_predictions, [train_size, val_size, test_size],
            generator=torch.Generator().manual_seed(42))
        logging.info("train size: %d", len(self.train_x))
        logging.info("val size: %d", len(self.val_x))
        logging.info("test size: %d", len(self.test_x))
        self.data_train = torch.utils.data.TensorDataset(
            self.train_x.dataset.data[self.train_x.indices],
            self.train_y.dataset.data[self.train_y.indices],
            self.train_h.dataset.data[self.train_h.indices])
        self.data_val = torch.utils.data.TensorDataset(
            self.val_x.dataset.data[self.val_x.indices],
            self.val_y.dataset.data[self.val_y.indices],
            self.val_h.dataset.data[self.val_h.indices])
        self.data_test = torch.utils.data.TensorDataset(
            self.test_x.dataset.data[self.test_x.indices],
            self.test_y.dataset.data[self.test_y.indices],
            self.test_h.dataset.data[self.test_h.indices])
        self.data_train_loader = torch.utils.data.DataLoader(
            self.data_train, batch_size=self.batch_size, shuffle=True)
        self.data_val_loader = torch.utils.data.DataLoader(
            self.data_val, batch_size=self.batch_size, shuffle=True)
        self.data_test_loader = torch.utils.data.DataLoader(
            self.data_test, batch_size=self.batch_size, shuffle=True)

        # sanity check: compute the error of the optimal rejector-classifier pair
        error_optimal_ = 0
        for i in range(len(data_y)):
            if opt_rej_preds[i] == 1:
                error_optimal_ += human_predictions[i] != data_y[i]
            else:
                error_optimal_ += opt_mach_preds[i] != data_y[i]
        error_optimal_ = error_optimal_ / len(data_y)
        self.error_optimal = error_optimal_
        logging.info(f'Data optimal: Accuracy Train {100 - 100 * error_optimal_:.3f} '
                     f'with rej {mean_rej_prop * 100:.1f}')
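# Example usage (a minimal sketch, not part of the original module): build the
# dataset and read one batch of (x, y, human prediction) triples from the
# training loader. The keyword values are illustrative; run as
# `python -m datasetsdefer.synthetic_data` so the relative import resolves.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    dataset = SyntheticData(train_samples=1000, test_samples=1000, d=10,
                            expert_deferred_error=0, expert_nondeferred_error=0.5,
                            machine_nondeferred_error=0)
    batch_x, batch_y, batch_h = next(iter(dataset.data_train_loader))
    print(batch_x.shape, batch_y.shape, batch_h.shape)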