src.slune.savers.csv
from typing import List, Optional, Tuple
import os
import pandas as pd
from slune.utils import find_directory_path, get_all_paths, get_numeric_equiv
from slune.base import BaseSaver, BaseLogger
import random
import time


class SaverCsv(BaseSaver):
    """ Saves the results of each run in a CSV file in a hierarchy of directories.

    Each directory is named after a parameter - value pair in the form "--parameter_name=value",
    so the path to a csv file encodes the configuration under which its results were obtained.
    For example, results obtained with "learning_rate"=0.01 and "batch_size"=32 are stored as
    "--learning_rate=0.01/--batch_size=32/results_<n>.csv". Results are read back by searching
    for the directory that matches the requested parameters and reading the csv file(s) inside.

    Directories are created in the order the parameters are given, ie.
    ["--learning_rate=0.01", "--batch_size=32"] -> "--learning_rate=0.01/--batch_size=32",
    except that when existing directories in the root already match a subset of the
    parameters, the tree is grown from the deepest matching directory.

    Handles parallel runs trying to create the same directories by waiting a random time
    (under 1 second) before creating the directory; directory creation itself tolerates
    the directory already existing, so losing the race is harmless.

    Attributes:
        - root_dir (str): Path to the root directory where we will store the csv files.
        - current_path (str): Path to the csv file where we will store the results for the current run.
    """

    def __init__(self, logger_instance: BaseLogger, params: List[str] = None, root_dir: Optional[str] = os.path.join('.', 'tuning_results')):
        """ Initialises the csv saver.

        Args:
            - logger_instance (BaseLogger): Instance of a logger class that inherits from BaseLogger.
            - params (list, optional): Parameters used, in form ["--parameter_name=parameter_value", ...],
                default is None. If None, we will create a path using the parameters given in the log.
            - root_dir (str, optional): Path to the root directory where we will store the csv files,
                default is './tuning_results'.
        """
        super(SaverCsv, self).__init__(logger_instance)
        self.root_dir = root_dir
        if params is not None:  # identity check is the idiomatic None test (PEP 8)
            self.current_path = self.get_path(params)

    def strip_params(self, params: List[str]) -> List[str]:
        """ Strips the parameter values, leaving only the parameter names.

        ie. ["--parameter_name=parameter_value", ...] -> ["--parameter_name", ...]
        Also gets rid of surrounding blank spaces on each name.

        Args:
            - params (list of str): Parameters in form ["--parameter_name=parameter_value", ...].

        Returns:
            - stripped_params (list of str): Parameter names in form ["--parameter_name", ...].
        """
        return [p.split('=')[0].strip() for p in params]

    def get_match(self, params: List[str]) -> str:
        """ Searches the root directory for a path that matches the parameters given.

        If only partial matches are found, returns the deepest matching directory (the one
        with the most parameters matching) with the missing parameters appended. If no
        matches are found, creates a path from the parameters in the order they are given,
        ie. ["--learning_rate=0.01", "--batch_size=32"] -> "--learning_rate=0.01/--batch_size=32".

        Args:
            - params (list of str): Arguments used, in form ["--argument_name=argument_value", ...].

        Returns:
            - match (str): Path to the directory that matches the parameters given.

        Raises:
            - ValueError: If the same parameter name appears more than once in params.
        """
        # Strip each param down to "--name=" so directories match regardless of value.
        stripped_params = [p.split('=')[0].strip() + '=' for p in params]
        if len(set(stripped_params)) != len(stripped_params):
            raise ValueError(f"Duplicate parameters found in {stripped_params}")
        # Deepest existing directory chain matching a subset of the parameters.
        match = find_directory_path(stripped_params, root_directory=self.root_dir)
        # Add on missing parameters.
        if match == self.root_dir:
            match = os.path.join(*stripped_params)
        else:
            # NOTE(review): substring test — safe only while no parameter name is a
            # suffix of another (eg. "--lr=" would also "match" inside "--mylr=").
            missing_params = [p for p in stripped_params if p not in match]
            if missing_params:
                match = os.path.join(match, *missing_params)
        # Strip the root directory off the front only, so a root-like substring
        # appearing deeper in the path is left alone.
        if match.startswith(self.root_dir):
            match = match[len(self.root_dir):]
        if match.startswith(os.path.sep):
            match = match[1:]
        # Now we add back in the values we stripped out, keeping the matched ordering.
        match = match.split(os.path.sep)
        match = [[p for p in params if m in p][0] for m in match]
        # Reuse an existing path with numerically equivalent values if one exists (eg. 0.1 vs 0.10).
        return get_numeric_equiv(os.path.join(*match), root_directory=self.root_dir)

    def get_path(self, params: List[str]) -> str:
        """ Creates a path for a new results csv file using the parameters.

        First finds (via get_match) the directory matching the parameters, then picks a
        file name "results_<n>.csv" where <n> is one greater than the largest run number
        already present in that directory (0 if the directory or files don't exist yet).

        Args:
            - params (list of str): Arguments used, in form ["--argument_name=argument_value", ...].

        Returns:
            - csv_file_path (str): Path to the csv file where we will store the results for the current run.

        Raises:
            - ValueError: If the directory contains a csv file that doesn't start with "results_".
        """
        # Check if root directory exists, if not create it.
        if not os.path.exists(self.root_dir):
            # Stagger parallel processes; exist_ok makes losing the race harmless.
            time.sleep(random.random())
            os.makedirs(self.root_dir, exist_ok=True)
        # Get path of directory where we should store our csv of results.
        dir_path = self.get_match(params)
        if not os.path.exists(dir_path):
            csv_file_number = 0
        else:
            csv_files = [f for f in os.listdir(dir_path) if f.endswith('.csv')]
            if csv_files:
                run_numbers = []
                for f in csv_files:
                    if not f.startswith('results_'):
                        raise ValueError('Found csv file in directory that doesn\'t start with "results_"')
                    run_numbers.append(int(f.split('_')[1][:-4]))
                # Compare run numbers numerically: a lexicographic max would pick
                # "results_9.csv" over "results_10.csv" and overwrite existing results.
                csv_file_number = max(run_numbers) + 1
            else:
                csv_file_number = 0
        # Path for a new csv file where we can later store results.
        return os.path.join(dir_path, f'results_{csv_file_number}.csv')

    def save_collated_from_results(self, results: pd.DataFrame):
        """ Saves results to the current csv file.

        If the csv file already exists, we append the collated results from the logger to
        the end of the csv file. If the csv file does not exist, we create it and save the
        results to it.

        Args:
            - results (pd.DataFrame): Data frame containing the results to be saved.

        TODO:
            - Could be making too many assumptions about the format in which we get the
              results from the logger, should be able to work with any logger. We should
              only be assuming that we are saving results to a csv file.
        """
        # os.path.dirname keeps the leading separator on absolute paths, which a
        # split/join round-trip would silently drop.
        dir_path = os.path.dirname(self.current_path)
        if dir_path and not os.path.exists(dir_path):
            # Stagger parallel processes; exist_ok makes losing the race harmless.
            time.sleep(random.random())
            os.makedirs(dir_path, exist_ok=True)
        if os.path.exists(self.current_path):
            # Append by reading the existing rows and rewriting the whole file.
            results = pd.concat([pd.read_csv(self.current_path), results])
            results.to_csv(self.current_path, mode='w', index=False)
        else:
            results.to_csv(self.current_path, index=False)

    def save_collated(self):
        """ Saves the logger's collated results to the current csv file. """
        self.save_collated_from_results(self.logger.results)

    def read(self, params: List[str], metric_name: str, select_by: str = 'max', avg: bool = True) -> Tuple[List[str], float]:
        """ Finds the min/max value of a metric from all csv files in the root directory that match the parameters given.

        Args:
            - params (list of str): Parameters used, in form ["--parameter_name=parameter_value", ...].
            - metric_name (string): Name of the metric to be read.
            - select_by (string, optional): How to select the 'best' value for the metric from a log file, 'min' or 'max'.
            - avg (bool, optional): Whether to average the metric over all runs, default is True.

        Returns:
            - best_params (list of str): Arguments used to get the 'best' value of the metric (determined by select_by).
            - best_value (float): Best value of the metric (determined by select_by).

        Raises:
            - ValueError: If no paths match params, or select_by is not 'min' or 'max'.
        """
        # Get all paths that match the parameters given.
        paths = get_all_paths(params, root_directory=self.root_dir)
        if not paths:
            raise ValueError(f"No paths found matching {params}")
        values = {}
        if avg:
            # Group run files by their parameter directory and average the metric over the runs.
            paths_same_params = set([os.path.join(*p.split(os.path.sep)[:-1]) for p in paths])
            for path in paths_same_params:
                runs = get_all_paths(path.split(os.path.sep), root_directory=self.root_dir)
                cumsum = 0
                for r in runs:
                    df = pd.read_csv(r)
                    # read_log is assumed to be provided by BaseSaver — TODO confirm.
                    cumsum += self.read_log(df, metric_name, select_by)
                values[path] = cumsum / len(runs)
        else:
            for path in paths:
                df = pd.read_csv(path)
                values[os.path.join(*path.split(os.path.sep)[:-1])] = self.read_log(df, metric_name, select_by)
        # Get the key of the min/max value.
        if select_by == 'min':
            best_params = min(values, key=values.get)
        elif select_by == 'max':
            best_params = max(values, key=values.get)
        else:
            raise ValueError(f"select_by must be 'min' or 'max', got {select_by}")
        best_value = values[best_params]
        # Strip the root directory prefix (prefix only, mirroring get_match) and split
        # the remaining path back into a list of arguments.
        if best_params.startswith(self.root_dir):
            best_params = best_params[len(self.root_dir):]
        if best_params.startswith(os.path.sep):
            best_params = best_params[1:]
        best_params = best_params.split(os.path.sep)
        return best_params, best_value

    def exists(self, params: List[str]) -> int:
        """ Checks how many runs already exist in storage for the given parameters.

        Args:
            - params (list of str): Parameters used, in form ["--parameter_name=parameter_value", ...].

        Returns:
            - num_runs (int): Number of runs that exist in storage for the given parameters.
        """
        return len(get_all_paths(params, root_directory=self.root_dir))

    def get_current_path(self) -> str:
        """ Getter function for the current_path attribute.

        Returns:
            - current_path (str): Path to the csv file where we will store the results for the current run.
        """
        return self.current_path
class SaverCsv(BaseSaver):
    """ Saves the results of each run in a CSV file in hierarchy of directories.

    Each directory is named after a parameter - value pair in the form "--parameter_name=value",
    so the path to a csv file records the configuration under which the results were obtained.
    For example, results obtained with "learning_rate"=0.01 and "batch_size"=32 live under
    "--learning_rate=0.01/--batch_size=32/". Results are read back by searching for the
    directory that matches the parameters we want and reading the csv file(s) in it.

    Directories are created in the order the parameters are given, ie.
    ["--learning_rate=0.01", "--batch_size=32"] -> "--learning_rate=0.01/--batch_size=32",
    except that when existing directories in the root match a subset of the parameters,
    the tree is grown from the deepest matching directory.

    Handles parallel runs trying to create the same directories by waiting a random time
    (under 1 second) before creating the directory. Should work pretty well in practice,
    however, may occasionally fail depending on the number of jobs launched at the same time.

    Attributes:
        - root_dir (str): Path to the root directory where we will store the csv files.
        - current_path (str): Path to the csv file where we will store the results for the current run.

    """

    def __init__(self, logger_instance: BaseLogger, params: List[str] = None, root_dir: Optional[str] = os.path.join('.', 'tuning_results')):
        """ Initialises the csv saver.

        Args:
            - logger_instance (BaseLogger): Instance of a logger class that inherits from BaseLogger.
            - params (list, optional): Parameters used, in form ["--parameter_name=parameter_value", ...],
                default is None. If None, we will create a path using the parameters given in the log.
            - root_dir (str, optional): Path to the root directory where we will store the csv files,
                default is './tuning_results'.

        """

        super(SaverCsv, self).__init__(logger_instance)
        self.root_dir = root_dir
        # NOTE(review): PEP 8 prefers `params is not None` for None checks.
        if params != None:
            self.current_path = self.get_path(params)

    def strip_params(self, params: List[str]) -> List[str]:
        """ Strips the parameter values, leaving only the parameter names.

        ie. ["--parameter_name=parameter_value", ...] -> ["--parameter_name", ...]
        (the '=' is removed along with the value). Also gets rid of blank spaces.

        Args:
            - params (list of str): Parameters in form ["--parameter_name=parameter_value", ...].

        Returns:
            - stripped_params (list of str): Parameter names in form ["--parameter_name", ...].

        """

        stripped_params = [p.split('=')[0].strip() for p in params]
        return stripped_params

    def get_match(self, params: List[str]) -> str:
        """ Searches the root directory for a path that matches the parameters given.

        If only partial matches are found, returns the deepest matching directory (the one
        with the most parameters matching) with the missing parameters appended. If no
        matches are found, creates a path using the parameters in the order they are given,
        ie. ["--learning_rate=0.01", "--batch_size=32"] -> "--learning_rate=0.01/--batch_size=32".

        Args:
            - params (list of str): Arguments used, in form ["--argument_name=argument_value", ...].

        Returns:
            - match (str): Path to the directory that matches the parameters given.

        Raises:
            - ValueError: If a parameter name appears more than once in params.

        """

        # First check if there is a directory with path matching some subset of the arguments
        stripped_params = [p.split('=')[0].strip() +'=' for p in params] # Strip the params of whitespace and everything after the '='
        if len(set(stripped_params)) != len(stripped_params):
            raise ValueError(f"Duplicate parameters found in {stripped_params}")
        match = find_directory_path(stripped_params, root_directory=self.root_dir)
        # Add on missing parameters
        if match == self.root_dir:
            match = os.path.join(*stripped_params)
        else:
            # NOTE(review): substring test — "--lr=" would also "match" inside "--mylr=";
            # safe only while no parameter name is a suffix of another.
            missing_params = [p for p in stripped_params if p not in match]
            if missing_params != []:
                match = [match] + missing_params
                match = os.path.join(*match)
        # Take the root directory out of the match
        # NOTE(review): str.replace removes the root string anywhere in the path, not just
        # as a leading prefix — a prefix strip would be safer.
        match = match.replace(self.root_dir, '')
        if match.startswith(os.path.sep):
            match = match[1:]
        # Now we add back in the values we stripped out
        match = match.split(os.path.sep)
        match = [[p for p in params if m in p][0] for m in match]
        # Check if there is an existing path with the same numerical values, if so use that instead
        match = get_numeric_equiv(os.path.join(*match), root_directory=self.root_dir)
        return match

    def get_path(self, params: List[str]) -> str:
        """ Creates a path for a new results csv file using the parameters.

        First checks for existing paths in the root directory that match the parameters
        (see get_match). If the resulting directory already holds results files, the run
        number in the file name is incremented, eg. an existing "results_0.csv" leads to
        "results_1.csv" being used.

        Args:
            - params (list of str): Arguments used, in form ["--argument_name=argument_value", ...].

        Returns:
            - csv_file_path (str): Path to the csv file where we will store the results for the current run.

        Raises:
            - ValueError: If the directory contains a csv file that doesn't start with "results_".

        """

        # Check if root directory exists, if not create it
        if not os.path.exists(self.root_dir):
            time.sleep(random.random()) # Wait a random amount of time under 1 second to avoid multiple processes creating the same directory
            os.makedirs(self.root_dir)
        # Get path of directory where we should store our csv of results
        dir_path = self.get_match(params)
        # Check if directory exists, if not create it
        if not os.path.exists(dir_path):
            csv_file_number = 0
        # If it does exist, check if there is already a csv file with results,
        # if there is find the name of the last csv file and increment the number
        else:
            csv_files = [f for f in os.listdir(dir_path) if f.endswith('.csv')]
            if len(csv_files) > 0:
                # NOTE(review): max() here is lexicographic, so "results_9.csv" beats
                # "results_10.csv" — numbering can collide once a directory has 10+ runs.
                last_csv_file = max(csv_files)
                # Check that the last csv file starts with "results_"
                if not last_csv_file.startswith('results_'):
                    raise ValueError('Found csv file in directory that doesn\'t start with "results_"')
                csv_file_number = int(last_csv_file.split('_')[1][:-4]) + 1
            else:
                csv_file_number = 0
        # Create path name for a new csv file where we can later store results
        csv_file_path = os.path.join(dir_path, f'results_{csv_file_number}.csv')
        return csv_file_path

    def save_collated_from_results(self, results: pd.DataFrame):
        """ Saves results to csv file.

        If the csv file already exists, we append the collated results from the logger to
        the end of the csv file. If the csv file does not exist, we create it and save the
        results to it.

        Args:
            - results (pd.DataFrame): Data frame containing the results to be saved.

        TODO:
            - Could be making too many assumptions about the format in which we get the
              results from the logger, should be able to work with any logger. We should
              only be assuming that we are saving results to a csv file.

        """

        # If path does not exist, create it
        # Remove the csv file name from the path
        # NOTE(review): splitting on os.path.sep and re-joining drops a leading separator,
        # so this mis-handles absolute paths; os.path.dirname would be safer.
        dir_path = self.current_path.split(os.path.sep)[:-1]
        dir_path = os.path.join(*dir_path)
        if not os.path.exists(dir_path):
            time.sleep(random.random()) # Wait a random amount of time under 1 second to avoid multiple processes creating the same directory
            os.makedirs(dir_path)
        # If csv file already exists, append results to the end
        if os.path.exists(self.current_path):
            results = pd.concat([pd.read_csv(self.current_path), results])
            results.to_csv(self.current_path, mode='w', index=False)
        # If csv file does not exist, create it
        else:
            results.to_csv(self.current_path, index=False)

    def save_collated(self):
        """ Saves the logger's collated results to the current csv file. """

        self.save_collated_from_results(self.logger.results)

    def read(self, params: List[str], metric_name: str, select_by: str ='max', avg: bool =True) -> (List[str], float):
        """ Finds the min/max value of a metric from all csv files in the root directory that match the parameters given.

        Args:
            - params (list of str): Contains the parameters used, in form ["--parameter_name=parameter_value", ...].
            - metric_name (string): Name of the metric to be read.
            - select_by (string, optional): How to select the 'best' value for the metric from a log file, currently can select by 'min' or 'max'.
            - avg (bool, optional): Whether to average the metric over all runs, default is True.

        Returns:
            - best_params (list of str): Contains the arguments used to get the 'best' value of the metric (determined by select_by).
            - best_value (float): Best value of the metric (determined by select_by).

        Raises:
            - ValueError: If no paths match params, or if select_by is not 'min' or 'max'.

        """

        # Get all paths that match the parameters given
        paths = get_all_paths(params, root_directory=self.root_dir)
        if paths == []:
            raise ValueError(f"No paths found matching {params}")
        # Read the metric from each path
        values = {}
        # Do averaging for different runs of same params if avg is True, otherwise just read the metric from each path
        if avg:
            # Group run files by their parameter directory (path without the file name).
            paths_same_params = set([os.path.join(*p.split(os.path.sep)[:-1]) for p in paths])
            for path in paths_same_params:
                runs = get_all_paths(path.split(os.path.sep), root_directory=self.root_dir)
                cumsum = 0
                for r in runs:
                    df = pd.read_csv(r)
                    # read_log is assumed to be provided by BaseSaver — TODO confirm.
                    cumsum += self.read_log(df, metric_name, select_by)
                avg_of_runs = cumsum / len(runs)
                values[path] = avg_of_runs
        else:
            for path in paths:
                df = pd.read_csv(path)
                values[os.path.join(*path.split(os.path.sep)[:-1])] = self.read_log(df, metric_name, select_by)
        # Get the key of the min/max value
        if select_by == 'min':
            best_params = min(values, key=values.get)
        elif select_by == 'max':
            best_params = max(values, key=values.get)
        else:
            raise ValueError(f"select_by must be 'min' or 'max', got {select_by}")
        # Find the best value of the metric from the key
        best_value = values[best_params]
        # Format the path into a list of arguments
        # NOTE(review): replace() strips the root anywhere in the path, not just the prefix.
        best_params = best_params.replace(self.root_dir, '')
        if best_params.startswith(os.path.sep):
            best_params = best_params[1:]
        best_params = best_params.split(os.path.sep)
        return best_params, best_value

    def exists(self, params: List[str]) -> int:
        """ Checks if results already exist in storage.

        Args:
            - params (list of str): Contains the parameters used, in form ["--parameter_name=parameter_value", ...].

        Returns:
            - num_runs (int): Number of runs that exist in storage for the given parameters.

        """

        # Get all paths that match the parameters given
        paths = get_all_paths(params, root_directory=self.root_dir)
        return len(paths)

    def get_current_path(self) -> str:
        """ Getter function for the current_path attribute.

        Returns:
            - current_path (str): Path to the csv file where we will store the results for the current run.

        """

        return self.current_path
Saves the results of each run in a CSV file in hierarchy of directories.
Each directory is named after a parameter - value pair in the form "--parameter_name=value". The paths to csv files then define the configuration under which the results were obtained, for example if we only have one parameter "learning_rate" with value 0.01 used to obtain the results, to save those results we would create a directory named "--learning_rate=0.01" and save the results in a csv file in that directory.
If we have multiple parameters, for example "learning_rate" with value 0.01 and "batch_size" with value 32, we would create a directory named "--learning_rate=0.01" with a subdirectory named "--batch_size=32", and save the results in a csv file in that subdirectory.
We use this structure to then read the results from the csv files by searching for the directory that matches the parameters we want, and then reading the csv file in that directory.
The order in which we create the directories is determined by the order in which the parameters are given, so if we are given ["--learning_rate=0.01", "--batch_size=32"] we would create the directories in the following order: "--learning_rate=0.01/--batch_size=32".
The directory structure generated will also depend on existing directories in the root directory, if there are existing directories in the root directory that match some subset of the parameters given, we will create the directory tree from the deepest matching directory.
For example, if we only have the following path in the root directory: "--learning_rate=0.01/--batch_size=32" and we are given the parameters ["--learning_rate=0.01", "--batch_size=32", "--num_epochs=10"], we will create the path: "--learning_rate=0.01/--batch_size=32/--num_epochs=10". On the other hand, if we are given the parameters ["--learning_rate=0.02", "--num_epochs=10", "--batch_size=32"], we will create the path: "--learning_rate=0.02/--batch_size=32/--num_epochs=10".
Handles parallel runs trying to create the same directories by waiting a random time (under 1 second) before creating the directory. Should work pretty well in practice, however, may occasionally fail depending on the number of jobs launched at the same time.
Attributes:
- root_dir (str): Path to the root directory where we will store the csv files.
- current_path (str): Path to the csv file where we will store the results for the current run.
def __init__(self, logger_instance: BaseLogger, params: List[str] = None, root_dir: Optional[str] = os.path.join('.', 'tuning_results')):
    """ Initialises the csv saver.

    Args:
        - logger_instance (BaseLogger): Instance of a logger class that inherits from BaseLogger.
        - params (list, optional): Parameters used, in form ["--parameter_name=parameter_value", ...],
            default is None. If None, we will create a path using the parameters given in the log.
        - root_dir (str, optional): Path to the root directory where we will store the csv files,
            default is './tuning_results'.
    """
    super(SaverCsv, self).__init__(logger_instance)
    self.root_dir = root_dir
    # Identity check is the idiomatic None test (PEP 8), instead of `!= None`.
    if params is not None:
        self.current_path = self.get_path(params)
Initialises the csv saver.
Arguments:
- logger_instance (BaseLogger): Instance of a logger class that inherits from BaseLogger.
- params (list, optional): List of strings containing the parameters used, in form ["--parameter_name=parameter_value", ...], default is None. If None, we will create a path using the parameters given in the log.
- root_dir (str, optional): Path to the root directory where we will store the csv files, default is './tuning_results'.
def strip_params(self, params: List[str]) -> List[str]:
    """ Returns the parameter names with their values removed.

    Each "--parameter_name=parameter_value" entry becomes "--parameter_name"
    (the '=' and value are dropped); surrounding blank spaces on the name are
    stripped as well.

    Args:
        - params (list of str): Parameters in form ["--parameter_name=parameter_value", ...].

    Returns:
        - stripped_params (list of str): Parameter names in form ["--parameter_name", ...].
    """
    names = []
    for param in params:
        name, _, _ = param.partition('=')
        names.append(name.strip())
    return names
Strips the parameter values.
Strips the parameter values from the list of parameters given, ie. ["--parameter_name=parameter_value", ...] -> ["--parameter_name", ...] (the '=' is removed along with the value).
Also gets rid of blank spaces.
Arguments:
- params (list of str): List of strings containing the parameters used, in form ["--parameter_name=parameter_value", ...].
Returns:
- stripped_params (list of str): List of strings containing the parameter names, in form ["--parameter_name", ...].
def get_match(self, params: List[str]) -> str:
    """ Searches the root directory for a path that matches the parameters given.

    If only partial matches are found, returns the deepest matching directory (the one
    with the most parameters matching) with the missing parameters appended. If no
    matches are found, creates a path using the parameters in the order they are given,
    ie. ["--learning_rate=0.01", "--batch_size=32"] -> "--learning_rate=0.01/--batch_size=32".

    Args:
        - params (list of str): Arguments used, in form ["--argument_name=argument_value", ...].

    Returns:
        - match (str): Path to the directory that matches the parameters given.

    Raises:
        - ValueError: If a parameter name appears more than once in params.
    """

    # First check if there is a directory with path matching some subset of the arguments
    stripped_params = [p.split('=')[0].strip() +'=' for p in params] # Strip the params of whitespace and everything after the '='
    if len(set(stripped_params)) != len(stripped_params):
        raise ValueError(f"Duplicate parameters found in {stripped_params}")
    match = find_directory_path(stripped_params, root_directory=self.root_dir)
    # Add on missing parameters
    if match == self.root_dir:
        match = os.path.join(*stripped_params)
    else:
        # NOTE(review): substring test — "--lr=" would also "match" inside "--mylr=";
        # safe only while no parameter name is a suffix of another.
        missing_params = [p for p in stripped_params if p not in match]
        if missing_params != []:
            match = [match] + missing_params
            match = os.path.join(*match)
    # Take the root directory out of the match
    # NOTE(review): str.replace removes the root string anywhere in the path, not just
    # as a leading prefix — a prefix strip would be safer.
    match = match.replace(self.root_dir, '')
    if match.startswith(os.path.sep):
        match = match[1:]
    # Now we add back in the values we stripped out
    match = match.split(os.path.sep)
    match = [[p for p in params if m in p][0] for m in match]
    # Check if there is an existing path with the same numerical values, if so use that instead
    match = get_numeric_equiv(os.path.join(*match), root_directory=self.root_dir)
    return match
Searches the root directory for a path that matches the parameters given.
If only partial matches are found, returns the deepest matching directory with the missing parameters appended. By deepest we mean the directory with the most parameters matching. If no matches are found creates a path using the parameters. Creates path using parameters in the order they are given, ie. ["--learning_rate=0.01", "--batch_size=32"] -> "--learning_rate=0.01/--batch_size=32".
If we find a partial match, we add the missing parameters to the end of the path, ie. if we have the path "--learning_rate=0.01" in the root and are given the parameters ["--learning_rate=0.01", "--batch_size=32"], we will create the path "--learning_rate=0.01/--batch_size=32".
Args:
- params (list of str): List of strings containing the arguments used, in form ["--argument_name=argument_value", ...].
Returns:
- match (str): Path to the directory that matches the parameters given.
132 def get_path(self, params: List[str]) -> str: 133 """ Creates a path using the parameters. 134 135 Does this by first checking for existing paths in the root directory that match the parameters given. 136 137 Check get_match for how we create the path, 138 once we have the path we check if there is already a csv file with results in that path, 139 if there is we increment the number of the results file name that we will use. 140 141 For example if we get back the path "--learning_rate=0.01/--batch_size=32", 142 and there exists a csv file named "results_0.csv" in the final directory, 143 we will name our csv file "results_1.csv". 144 145 Args: 146 - params (list of str): List of strings containing the arguments used, in form ["--argument_name=argument_value", ...]. 147 148 Returns: 149 - csv_file_path (str): Path to the csv file where we will store the results for the current run. 150 151 """ 152 153 # Check if root directory exists, if not create it 154 if not os.path.exists(self.root_dir): 155 time.sleep(random.random()) # Wait a random amount of time under 1 second to avoid multiple processes creating the same directory 156 os.makedirs(self.root_dir) 157 # Get path of directory where we should store our csv of results 158 dir_path = self.get_match(params) 159 # Check if directory exists, if not create it 160 if not os.path.exists(dir_path): 161 csv_file_number = 0 162 # If it does exist, check if there is already a csv file with results, 163 # if there is find the name of the last csv file and increment the number 164 else: 165 csv_files = [f for f in os.listdir(dir_path) if f.endswith('.csv')] 166 if len(csv_files) > 0: 167 last_csv_file = max(csv_files) 168 # Check that the last csv file starts with "results_" 169 if not last_csv_file.startswith('results_'): 170 raise ValueError('Found csv file in directory that doesn\'t start with "results_"') 171 csv_file_number = int(last_csv_file.split('_')[1][:-4]) + 1 172 else: 173 csv_file_number = 0 174 # Create 
path name for a new csv file where we can later store results 175 csv_file_path = os.path.join(dir_path, f'results_{csv_file_number}.csv') 176 return csv_file_path
Creates a path using the parameters.
Does this by first checking for existing paths in the root directory that match the parameters given.
Check get_match for how we create the path, once we have the path we check if there is already a csv file with results in that path, if there is we increment the number of the results file name that we will use.
For example if we get back the path "--learning_rate=0.01/--batch_size=32", and there exists a csv file named "results_0.csv" in the final directory, we will name our csv file "results_1.csv".
Args:
- params (list of str): List of strings containing the arguments used, in form ["--argument_name=argument_value", ...].
Returns:
- csv_file_path (str): Path to the csv file where we will store the results for the current run.
178 def save_collated_from_results(self, results: pd.DataFrame): 179 """ Saves results to csv file. 180 181 If the csv file already exists, 182 we append the collated results from the logger to the end of the csv file. 183 If the csv file does not exist, 184 we create it and save the results to it. 185 186 Args: 187 - results (pd.DataFrame): Data frame containing the results to be saved. 188 189 TODO: 190 - Could be making to many assumptions about the format in which we get the results from the logger, 191 should be able to work with any logger. 192 We should only be assuming that we are saving results to a csv file. 193 194 """ 195 196 # If path does not exist, create it 197 # Remove the csv file name from the path 198 dir_path = self.current_path.split(os.path.sep)[:-1] 199 dir_path = os.path.join(*dir_path) 200 if not os.path.exists(dir_path): 201 time.sleep(random.random()) # Wait a random amount of time under 1 second to avoid multiple processes creating the same directory 202 os.makedirs(dir_path) 203 # If csv file already exists, append results to the end 204 if os.path.exists(self.current_path): 205 results = pd.concat([pd.read_csv(self.current_path), results]) 206 results.to_csv(self.current_path, mode='w', index=False) 207 # If csv file does not exist, create it 208 else: 209 results.to_csv(self.current_path, index=False)
Saves results to csv file.
If the csv file already exists, we append the collated results from the logger to the end of the csv file. If the csv file does not exist, we create it and save the results to it.
Args:
- results (pd.DataFrame): Data frame containing the results to be saved.
TODO: - Could be making too many assumptions about the format in which we get the results from the logger; should be able to work with any logger. We should only be assuming that we are saving results to a csv file.
211 def save_collated(self): 212 """ Saves results to csv file. """ 213 214 self.save_collated_from_results(self.logger.results)
Saves results to csv file.
    def read(self, params: List[str], metric_name: str, select_by: str ='max', avg: bool =True) -> (List[str], float):
        """ Finds the min/max value of a metric from all csv files in the root directory that match the parameters given.

        Args:
            - params (list of str): Contains the parameters used, in form ["--parameter_name=parameter_value", ...].
            - metric_name (string): Name of the metric to be read.
            - select_by (string, optional): How to select the 'best' value for the metric from a log file, currently can select by 'min' or 'max'.
            - avg (bool, optional): Whether to average the metric over all runs, default is True.

        Returns:
            - best_params (list of str): Contains the arguments used to get the 'best' value of the metric (determined by select_by).
            - best_value (float): Best value of the metric (determined by select_by).

        Raises:
            - ValueError: If no stored paths match params, or if select_by is not 'min'/'max'.
              NOTE(review): select_by is only validated *after* every csv has been
              read, so a typo'd select_by still pays the full read cost first.

        """

        # Get all paths that match the parameters given (project helper; presumably
        # returns csv-file paths under root_dir — TODO confirm)
        paths = get_all_paths(params, root_directory=self.root_dir)
        if paths == []:
            raise ValueError(f"No paths found matching {params}")
        # Read the metric from each path; keyed by the run's directory (params) path
        values = {}
        # Do averaging for different runs of same params if avg is True, otherwise just read the metric from each path
        if avg:
            # Group runs by their parent directory (the parameter configuration).
            # NOTE(review): split/rejoin on os.path.sep drops a leading separator,
            # so absolute root_dir values may not round-trip — verify.
            paths_same_params = set([os.path.join(*p.split(os.path.sep)[:-1]) for p in paths])
            for path in paths_same_params:
                # All results_*.csv files under this configuration directory
                runs = get_all_paths(path.split(os.path.sep), root_directory=self.root_dir)
                cumsum = 0
                for r in runs:
                    df = pd.read_csv(r)
                    # read_log (inherited from BaseSaver) extracts the best value
                    # of metric_name from one log according to select_by
                    cumsum += self.read_log(df, metric_name, select_by)
                avg_of_runs = cumsum / len(runs)
                values[path] = avg_of_runs
        else:
            for path in paths:
                df = pd.read_csv(path)
                # Later runs of the same configuration overwrite earlier ones here
                values[os.path.join(*path.split(os.path.sep)[:-1])] = self.read_log(df, metric_name, select_by)
        # Get the key of the min/max value
        if select_by == 'min':
            best_params = min(values, key=values.get)
        elif select_by == 'max':
            best_params = max(values, key=values.get)
        else:
            raise ValueError(f"select_by must be 'min' or 'max', got {select_by}")
        # Find the best value of the metric from the key
        best_value = values[best_params]
        # Format the path into a list of arguments
        best_params = best_params.replace(self.root_dir, '')
        if best_params.startswith(os.path.sep):
            best_params = best_params[1:]
        best_params = best_params.split(os.path.sep)
        return best_params, best_value
Finds the min/max value of a metric from all csv files in the root directory that match the parameters given.
Args:
- params (list of str): Contains the parameters used, in form ["--parameter_name=parameter_value", ...].
- metric_name (string): Name of the metric to be read.
- select_by (string, optional): How to select the 'best' value for the metric from a log file, currently can select by 'min' or 'max'.
- avg (bool, optional): Whether to average the metric over all runs, default is True.
Returns:
- best_params (list of str): Contains the arguments used to get the 'best' value of the metric (determined by select_by).
- best_value (float): Best value of the metric (determined by select_by).
268 def exists(self, params: List[str]) -> int: 269 """ Checks if results already exist in storage. 270 271 Args: 272 - params (list of str): Contains the parameters used, in form ["--parameter_name=parameter_value", ...]. 273 274 Returns: 275 - num_runs (int): Number of runs that exist in storage for the given parameters. 276 277 """ 278 279 # Get all paths that match the parameters given 280 paths = get_all_paths(params, root_directory=self.root_dir) 281 return len(paths)
Checks if results already exist in storage.
Args:
- params (list of str): Contains the parameters used, in form ["--parameter_name=parameter_value", ...].
Returns:
- num_runs (int): Number of runs that exist in storage for the given parameters.
283 def get_current_path(self) -> str: 284 """ Getter function for the current_path attribute. 285 286 Returns: 287 - current_path (str): Path to the csv file where we will store the results for the current run. 288 289 """ 290 291 return self.current_path
Getter function for the current_path attribute.
Returns:
- current_path (str): Path to the csv file where we will store the results for the current run.
Inherited Members
- slune.base.BaseSaver
- logger
- log
- read_log