src.slune.savers.csv

from typing import List, Optional, Tuple
import os
import random
import time

import pandas as pd

from slune.base import BaseLogger, BaseSaver
from slune.utils import find_directory_path, get_all_paths, get_numeric_equiv
  8
  9class SaverCsv(BaseSaver):
 10    """ Saves the results of each run in a CSV file in hierarchy of directories.
 11     
 12    Each directory is named after a parameter - value pair in the form "--parameter_name=value".
 13    The paths to csv files then define the configuration under which the results were obtained,
 14    for example if we only have one parameter "learning_rate" with value 0.01 used to obtain the results,
 15    to save those results we would create a directory named "--learning_rate=0.01" and save the results in a csv file in that directory.
 16
 17    If we have multiple parameters, for example "learning_rate" with value 0.01 and "batch_size" with value 32,
 18    we would create a directory named "--learning_rate=0.01" with a subdirectory named "--batch_size=32",
 19    and save the results in a csv file in that subdirectory.
 20
 21    We use this structure to then read the results from the csv files by searching for the directory that matches the parameters we want,
 22    and then reading the csv file in that directory.
 23
 24    The order in which we create the directories is determined by the order in which the parameters are given,
 25    so if we are given ["--learning_rate=0.01", "--batch_size=32"] we would create the directories in the following order:
 26    "--learning_rate=0.01/--batch_size=32".
 27
 28    The directory structure generated will also depend on existing directories in the root directory,
 29    if there are existing directories in the root directory that match some subset of the parameters given,
 30    we will create the directory tree from the deepest matching directory.
 31
 32    For example if we only have the following path in the root directory:
 33    "--learning_rate=0.01/--batch_size=32"
 34    and we are given the parameters ["--learning_rate=0.01", "--batch_size=32", "--num_epochs=10"],
 35    we will create the path:
 36    "--learning_rate=0.01/--batch_size=32/--num_epochs=10".
 37    on the other hand if we are given the parameters ["--learning_rate=0.02", "--num_epochs=10", "--batch_size=32"],
 38    we will create the path:
 39    "--learning_rate=0.02/--batch_size=32/--num_epochs=10".
 40
 41    Handles parallel runs trying to create the same directories by waiting a random time (under 1 second) before creating the directory.
 42    Should work pretty well in practice, however, may occasionally fail depending on the number of jobs launched at the same time. 
 43
 44    Attributes:
 45        - root_dir (str): Path to the root directory where we will store the csv files.
 46        - current_path (str): Path to the csv file where we will store the results for the current run.
 47
 48    """
 49
 50    def __init__(self, logger_instance: BaseLogger, params: List[str] = None, root_dir: Optional[str] = os.path.join('.', 'tuning_results')):
 51        """ Initialises the csv saver. 
 52
 53        Args:
 54            - logger_instance (BaseLogger): Instance of a logger class that inherits from BaseLogger.
 55            - params (list, optional): List of strings containing the parameters used, in form ["--parameter_name=parameter_value", ...], default is None.
 56                If None, we will create a path using the parameters given in the log.
 57            - root_dir (str, optional): Path to the root directory where we will store the csv files, default is './tuning_results'.
 58        
 59        """
 60
 61        super(SaverCsv, self).__init__(logger_instance)
 62        self.root_dir = root_dir
 63        if params != None:
 64            self.current_path = self.get_path(params)
 65    
 66    def strip_params(self, params: List[str]) -> List[str]:
 67        """ Strips the parameter values.
 68
 69        Strips the parameter values from the list of parameters given,
 70        ie. ["--parameter_name=parameter_value", ...] -> ["--parameter_name=", ...]
 71
 72        Also gets rid of blank spaces.
 73
 74        Args:
 75            - params (list of str): List of strings containing the parameters used, in form ["--parameter_name=parameter_value", ...].
 76
 77        Returns:
 78            - stripped_params (list of str): List of strings containing the parameters used, in form ["--parameter_name=", ...].
 79
 80        """
 81
 82        stripped_params = [p.split('=')[0].strip() for p in params]
 83        return stripped_params
 84
 85    def get_match(self, params: List[str]) -> str:
 86        """ Searches the root directory for a path that matches the parameters given.
 87
 88        If only partial matches are found, returns the deepest matching directory with the missing parameters appended.
 89        By deepest we mean the directory with the most parameters matching.
 90        If no matches are found creates a path using the parameters.
 91        Creates path using parameters in the order they are given, 
 92        ie. ["--learning_rate=0.01", "--batch_size=32"] -> "--learning_rate=0.01/--batch_size=32".
 93
 94        If we find a partial match, we add the missing parameters to the end of the path,
 95        ie. if we have the path "--learning_rate=0.01" in the root 
 96        and are given the parameters ["--learning_rate=0.01", "--batch_size=32"],
 97        we will create the path "--learning_rate=0.01/--batch_size=32".
 98
 99        Args:
100            - params (list of str): List of strings containing the arguments used, in form ["--argument_name=argument_value", ...].
101
102        Returns:
103            - match (str): Path to the directory that matches the parameters given.
104
105        """
106
107        # First check if there is a directory with path matching some subset of the arguments
108        stripped_params = [p.split('=')[0].strip() +'=' for p in params] # Strip the params of whitespace and everything after the '='
109        if len(set(stripped_params)) != len(stripped_params):
110            raise ValueError(f"Duplicate parameters found in {stripped_params}")
111        match = find_directory_path(stripped_params, root_directory=self.root_dir)
112        # Add on missing parameters
113        if match == self.root_dir:
114            match = os.path.join(*stripped_params)
115        else:
116            missing_params = [p for p in stripped_params if p not in match]
117            if missing_params != []:
118                match = [match] + missing_params
119                match = os.path.join(*match)
120        # Take the root directory out of the match
121        match = match.replace(self.root_dir, '')
122        if match.startswith(os.path.sep):
123            match = match[1:]
124        # Now we add back in the values we stripped out
125        match = match.split(os.path.sep)
126        match = [[p for p in params if m in p][0] for m in match]
127        # Check if there is an existing path with the same numerical values, if so use that instead
128        match = get_numeric_equiv(os.path.join(*match), root_directory=self.root_dir)
129        return match
130
131    def get_path(self, params: List[str]) -> str:
132        """ Creates a path using the parameters.
133        
134        Does this by first checking for existing paths in the root directory that match the parameters given.
135
136        Check get_match for how we create the path, 
137        once we have the path we check if there is already a csv file with results in that path,
138        if there is we increment the number of the results file name that we will use.
139
140        For example if we get back the path "--learning_rate=0.01/--batch_size=32",
141        and there exists a csv file named "results_0.csv" in the final directory,
142        we will name our csv file "results_1.csv".
143
144        Args:
145            - params (list of str): List of strings containing the arguments used, in form ["--argument_name=argument_value", ...].
146
147        Returns:
148            - csv_file_path (str): Path to the csv file where we will store the results for the current run.
149
150        """
151
152        # Check if root directory exists, if not create it
153        if not os.path.exists(self.root_dir):
154            time.sleep(random.random()) # Wait a random amount of time under 1 second to avoid multiple processes creating the same directory
155            os.makedirs(self.root_dir)
156        # Get path of directory where we should store our csv of results
157        dir_path = self.get_match(params)
158        # Check if directory exists, if not create it
159        if not os.path.exists(dir_path):
160            csv_file_number = 0
161        # If it does exist, check if there is already a csv file with results,
162        # if there is find the name of the last csv file and increment the number
163        else:
164            csv_files = [f for f in os.listdir(dir_path) if f.endswith('.csv')]
165            if len(csv_files) > 0:
166                last_csv_file = max(csv_files)
167                # Check that the last csv file starts with "results_"
168                if not last_csv_file.startswith('results_'):
169                    raise ValueError('Found csv file in directory that doesn\'t start with "results_"')
170                csv_file_number = int(last_csv_file.split('_')[1][:-4]) + 1
171            else:
172                csv_file_number = 0
173        # Create path name for a new csv file where we can later store results
174        csv_file_path = os.path.join(dir_path, f'results_{csv_file_number}.csv')
175        return csv_file_path
176
177    def save_collated_from_results(self, results: pd.DataFrame):
178        """ Saves results to csv file.
179        
180        If the csv file already exists, 
181        we append the collated results from the logger to the end of the csv file.
182        If the csv file does not exist,
183        we create it and save the results to it.
184
185        Args:
186            - results (pd.DataFrame): Data frame containing the results to be saved.
187
188        TODO: 
189            - Could be making to many assumptions about the format in which we get the results from the logger,
190            should be able to work with any logger.
191            We should only be assuming that we are saving results to a csv file. 
192
193        """
194
195        # If path does not exist, create it
196        # Remove the csv file name from the path
197        dir_path = self.current_path.split(os.path.sep)[:-1]
198        dir_path = os.path.join(*dir_path)
199        if not os.path.exists(dir_path):
200            time.sleep(random.random()) # Wait a random amount of time under 1 second to avoid multiple processes creating the same directory
201            os.makedirs(dir_path)
202        # If csv file already exists, append results to the end
203        if os.path.exists(self.current_path):
204            results = pd.concat([pd.read_csv(self.current_path), results])
205            results.to_csv(self.current_path, mode='w', index=False)
206        # If csv file does not exist, create it
207        else:
208            results.to_csv(self.current_path, index=False)
209
210    def save_collated(self):
211        """ Saves results to csv file. """
212
213        self.save_collated_from_results(self.logger.results)
214        
215    def read(self, params: List[str], metric_name: str, select_by: str ='max', avg: bool =True) -> (List[str], float):
216        """ Finds the min/max value of a metric from all csv files in the root directory that match the parameters given.
217
218        Args:
219            - params (list of str): Contains the parameters used, in form ["--parameter_name=parameter_value", ...].
220            - metric_name (string): Name of the metric to be read.
221            - select_by (string, optional): How to select the 'best' value for the metric from a log file, currently can select by 'min' or 'max'.
222            - avg (bool, optional): Whether to average the metric over all runs, default is True.
223
224        Returns:
225            - best_params (list of str): Contains the arguments used to get the 'best' value of the metric (determined by select_by).
226            - best_value (float): Best value of the metric (determined by select_by).
227
228        """
229
230        #  Get all paths that match the parameters given
231        paths = get_all_paths(params, root_directory=self.root_dir)
232        if paths == []:
233            raise ValueError(f"No paths found matching {params}")
234        # Read the metric from each path
235        values = {}
236        # Do averaging for different runs of same params if avg is True, otherwise just read the metric from each path
237        if avg:
238            paths_same_params = set([os.path.join(*p.split(os.path.sep)[:-1]) for p in paths])
239            for path in paths_same_params:
240                runs = get_all_paths(path.split(os.path.sep), root_directory=self.root_dir)
241                cumsum = 0
242                for r in runs:
243                    df = pd.read_csv(r)
244                    cumsum += self.read_log(df, metric_name, select_by)
245                avg_of_runs = cumsum / len(runs)
246                values[path] = avg_of_runs
247        else:
248            for path in paths:
249                df = pd.read_csv(path)
250                values[os.path.join(*path.split(os.path.sep)[:-1])] = self.read_log(df, metric_name, select_by)
251        # Get the key of the min/max value
252        if select_by == 'min':
253            best_params = min(values, key=values.get)
254        elif select_by == 'max':
255            best_params = max(values, key=values.get)
256        else:
257            raise ValueError(f"select_by must be 'min' or 'max', got {select_by}")
258        # Find the best value of the metric from the key
259        best_value = values[best_params]
260        # Format the path into a list of arguments
261        best_params = best_params.replace(self.root_dir, '')
262        if best_params.startswith(os.path.sep):
263            best_params = best_params[1:]
264        best_params = best_params.split(os.path.sep)
265        return best_params, best_value       
266
267    def exists(self, params: List[str]) -> int:
268        """ Checks if results already exist in storage.
269
270        Args:
271            - params (list of str): Contains the parameters used, in form ["--parameter_name=parameter_value", ...].
272
273        Returns:
274            - num_runs (int): Number of runs that exist in storage for the given parameters.
275
276        """
277
278        #  Get all paths that match the parameters given
279        paths = get_all_paths(params, root_directory=self.root_dir)
280        return len(paths)
281
282    def get_current_path(self) -> str:
283        """ Getter function for the current_path attribute. 
284        
285        Returns:
286            - current_path (str): Path to the csv file where we will store the results for the current run.
287        
288        """
289
290        return self.current_path
class SaverCsv(BaseSaver):
    """ Saves the results of each run in a CSV file in hierarchy of directories.

    Each directory is named after a parameter - value pair in the form "--parameter_name=value".
    The paths to csv files then define the configuration under which the results were obtained,
    for example if we only have one parameter "learning_rate" with value 0.01 used to obtain the results,
    to save those results we would create a directory named "--learning_rate=0.01" and save the results in a csv file in that directory.

    If we have multiple parameters, for example "learning_rate" with value 0.01 and "batch_size" with value 32,
    we would create a directory named "--learning_rate=0.01" with a subdirectory named "--batch_size=32",
    and save the results in a csv file in that subdirectory.

    We use this structure to then read the results from the csv files by searching for the directory that matches the parameters we want,
    and then reading the csv file in that directory.

    The order in which we create the directories is determined by the order in which the parameters are given,
    so if we are given ["--learning_rate=0.01", "--batch_size=32"] we would create the directories in the following order:
    "--learning_rate=0.01/--batch_size=32".

    The directory structure generated will also depend on existing directories in the root directory,
    if there are existing directories in the root directory that match some subset of the parameters given,
    we will create the directory tree from the deepest matching directory.

    For example if we only have the following path in the root directory:
    "--learning_rate=0.01/--batch_size=32"
    and we are given the parameters ["--learning_rate=0.01", "--batch_size=32", "--num_epochs=10"],
    we will create the path:
    "--learning_rate=0.01/--batch_size=32/--num_epochs=10".
    On the other hand, if we are given the parameters ["--learning_rate=0.02", "--num_epochs=10", "--batch_size=32"],
    we will create the path:
    "--learning_rate=0.02/--batch_size=32/--num_epochs=10".

    Handles parallel runs trying to create the same directories by waiting a random time (under 1 second) before creating the directory.
    Should work pretty well in practice, however, may occasionally fail depending on the number of jobs launched at the same time.

    Attributes:
        - root_dir (str): Path to the root directory where we will store the csv files.
        - current_path (str): Path to the csv file where we will store the results for the current run,
            or None until the parameters for the run are known.

    """

    def __init__(self, logger_instance: BaseLogger, params: List[str] = None, root_dir: Optional[str] = os.path.join('.', 'tuning_results')):
        """ Initialises the csv saver.

        Args:
            - logger_instance (BaseLogger): Instance of a logger class that inherits from BaseLogger.
            - params (list, optional): List of strings containing the parameters used, in form ["--parameter_name=parameter_value", ...], default is None.
                If None, we will create a path using the parameters given in the log.
            - root_dir (str, optional): Path to the root directory where we will store the csv files, default is './tuning_results'.

        """

        super(SaverCsv, self).__init__(logger_instance)
        self.root_dir = root_dir
        # Keep the attribute defined even when params is None so that
        # get_current_path never raises AttributeError.
        self.current_path = None
        if params is not None:
            self.current_path = self.get_path(params)

    def strip_params(self, params: List[str]) -> List[str]:
        """ Strips the parameter values.

        Strips the parameter values from the list of parameters given,
        ie. ["--parameter_name=parameter_value", ...] -> ["--parameter_name", ...]

        Also gets rid of surrounding blank spaces.

        Args:
            - params (list of str): List of strings containing the parameters used, in form ["--parameter_name=parameter_value", ...].

        Returns:
            - stripped_params (list of str): List of strings containing the parameter names only, in form ["--parameter_name", ...].

        """

        return [p.split('=')[0].strip() for p in params]

    def get_match(self, params: List[str]) -> str:
        """ Searches the root directory for a path that matches the parameters given.

        If only partial matches are found, returns the deepest matching directory with the missing parameters appended.
        By deepest we mean the directory with the most parameters matching.
        If no matches are found creates a path using the parameters.
        Creates path using parameters in the order they are given,
        ie. ["--learning_rate=0.01", "--batch_size=32"] -> "--learning_rate=0.01/--batch_size=32".

        If we find a partial match, we add the missing parameters to the end of the path,
        ie. if we have the path "--learning_rate=0.01" in the root
        and are given the parameters ["--learning_rate=0.01", "--batch_size=32"],
        we will create the path "--learning_rate=0.01/--batch_size=32".

        Args:
            - params (list of str): List of strings containing the arguments used, in form ["--argument_name=argument_value", ...].

        Returns:
            - match (str): Path to the directory that matches the parameters given.

        Raises:
            - ValueError: If the same parameter name appears more than once in params.

        """

        # Strip the values so we can match on parameter names alone; keep the
        # trailing '=' so e.g. '--lr=' cannot accidentally match '--lr_decay='.
        stripped_params = [p + '=' for p in self.strip_params(params)]
        if len(set(stripped_params)) != len(stripped_params):
            raise ValueError(f"Duplicate parameters found in {stripped_params}")
        match = find_directory_path(stripped_params, root_directory=self.root_dir)
        # Append the parameters that were not part of the deepest existing match
        if match == self.root_dir:
            match = os.path.join(*stripped_params)
        else:
            missing_params = [p for p in stripped_params if p not in match]
            if missing_params:
                match = os.path.join(match, *missing_params)
        # Remove the root directory prefix only (a plain str.replace could also
        # clobber a directory deeper in the path that contains the root name)
        if match.startswith(self.root_dir):
            match = match[len(self.root_dir):]
        if match.startswith(os.path.sep):
            match = match[1:]
        # Now we add back in the values we stripped out
        match = match.split(os.path.sep)
        match = [[p for p in params if m in p][0] for m in match]
        # Check if there is an existing path with the same numerical values, if so use that instead
        match = get_numeric_equiv(os.path.join(*match), root_directory=self.root_dir)
        return match

    def get_path(self, params: List[str]) -> str:
        """ Creates a path using the parameters.

        Does this by first checking for existing paths in the root directory that match the parameters given.

        Check get_match for how we create the path,
        once we have the path we check if there is already a csv file with results in that path,
        if there is we increment the number of the results file name that we will use.

        For example if we get back the path "--learning_rate=0.01/--batch_size=32",
        and there exists a csv file named "results_0.csv" in the final directory,
        we will name our csv file "results_1.csv".

        Args:
            - params (list of str): List of strings containing the arguments used, in form ["--argument_name=argument_value", ...].

        Returns:
            - csv_file_path (str): Path to the csv file where we will store the results for the current run.

        Raises:
            - ValueError: If the directory contains a csv file whose name does not start with "results_".

        """

        # Check if root directory exists, if not create it.
        # The random sleep staggers parallel processes racing to create it.
        if not os.path.exists(self.root_dir):
            time.sleep(random.random())
            os.makedirs(self.root_dir, exist_ok=True)
        # Get path of directory where we should store our csv of results
        dir_path = self.get_match(params)
        csv_file_number = 0
        if os.path.exists(dir_path):
            csv_files = [f for f in os.listdir(dir_path) if f.endswith('.csv')]
            for f in csv_files:
                if not f.startswith('results_'):
                    raise ValueError('Found csv file in directory that doesn\'t start with "results_"')
            if csv_files:
                # Compare run numbers numerically: a lexicographic max() on the
                # file names would rank 'results_9.csv' above 'results_10.csv'.
                csv_file_number = max(int(f[len('results_'):-len('.csv')]) for f in csv_files) + 1
        # Create path name for a new csv file where we can later store results
        return os.path.join(dir_path, f'results_{csv_file_number}.csv')

    def save_collated_from_results(self, results: pd.DataFrame):
        """ Saves results to csv file.

        If the csv file already exists,
        we append the collated results from the logger to the end of the csv file.
        If the csv file does not exist,
        we create it and save the results to it.

        Args:
            - results (pd.DataFrame): Data frame containing the results to be saved.

        TODO:
            - Could be making too many assumptions about the format in which we get the results from the logger,
            should be able to work with any logger.
            We should only be assuming that we are saving results to a csv file.

        """

        # Create the directory for the csv file if it does not exist yet.
        # dirname (rather than split/join) keeps a leading separator intact
        # when current_path is absolute.
        dir_path = os.path.dirname(self.current_path)
        if not os.path.exists(dir_path):
            time.sleep(random.random())  # Stagger parallel processes racing to create the directory
            os.makedirs(dir_path, exist_ok=True)
        # If csv file already exists, append results to the end before writing
        if os.path.exists(self.current_path):
            results = pd.concat([pd.read_csv(self.current_path), results])
        results.to_csv(self.current_path, mode='w', index=False)

    def save_collated(self):
        """ Saves results to csv file. """

        self.save_collated_from_results(self.logger.results)

    def read(self, params: List[str], metric_name: str, select_by: str = 'max', avg: bool = True) -> Tuple[List[str], float]:
        """ Finds the min/max value of a metric from all csv files in the root directory that match the parameters given.

        Args:
            - params (list of str): Contains the parameters used, in form ["--parameter_name=parameter_value", ...].
            - metric_name (string): Name of the metric to be read.
            - select_by (string, optional): How to select the 'best' value for the metric from a log file, currently can select by 'min' or 'max'.
            - avg (bool, optional): Whether to average the metric over all runs, default is True.

        Returns:
            - best_params (list of str): Contains the arguments used to get the 'best' value of the metric (determined by select_by).
            - best_value (float): Best value of the metric (determined by select_by).

        Raises:
            - ValueError: If select_by is not 'min' or 'max', or if no paths match the parameters given.

        """

        # Validate select_by up front, before any files are read
        if select_by not in ('min', 'max'):
            raise ValueError(f"select_by must be 'min' or 'max', got {select_by}")
        # Get all paths that match the parameters given
        paths = get_all_paths(params, root_directory=self.root_dir)
        if not paths:
            raise ValueError(f"No paths found matching {params}")
        # Read the metric from each path
        values = {}
        # Do averaging for different runs of same params if avg is True, otherwise just read the metric from each path
        if avg:
            paths_same_params = set(os.path.join(*p.split(os.path.sep)[:-1]) for p in paths)
            for path in paths_same_params:
                runs = get_all_paths(path.split(os.path.sep), root_directory=self.root_dir)
                total = 0
                for r in runs:
                    df = pd.read_csv(r)
                    total += self.read_log(df, metric_name, select_by)
                values[path] = total / len(runs)
        else:
            for path in paths:
                df = pd.read_csv(path)
                values[os.path.join(*path.split(os.path.sep)[:-1])] = self.read_log(df, metric_name, select_by)
        # Get the key of the min/max value
        if select_by == 'min':
            best_params = min(values, key=values.get)
        else:
            best_params = max(values, key=values.get)
        # Find the best value of the metric from the key
        best_value = values[best_params]
        # Format the path into a list of arguments (strip the root prefix only)
        if best_params.startswith(self.root_dir):
            best_params = best_params[len(self.root_dir):]
        if best_params.startswith(os.path.sep):
            best_params = best_params[1:]
        best_params = best_params.split(os.path.sep)
        return best_params, best_value

    def exists(self, params: List[str]) -> int:
        """ Checks if results already exist in storage.

        Args:
            - params (list of str): Contains the parameters used, in form ["--parameter_name=parameter_value", ...].

        Returns:
            - num_runs (int): Number of runs that exist in storage for the given parameters.

        """

        # Get all paths that match the parameters given
        paths = get_all_paths(params, root_directory=self.root_dir)
        return len(paths)

    def get_current_path(self) -> str:
        """ Getter function for the current_path attribute.

        Returns:
            - current_path (str): Path to the csv file where we will store the results for the current run,
                or None if no parameters have been given yet.

        """

        return self.current_path

Saves the results of each run in a CSV file in hierarchy of directories.

Each directory is named after a parameter - value pair in the form "--parameter_name=value". The paths to csv files then define the configuration under which the results were obtained, for example if we only have one parameter "learning_rate" with value 0.01 used to obtain the results, to save those results we would create a directory named "--learning_rate=0.01" and save the results in a csv file in that directory.

If we have multiple parameters, for example "learning_rate" with value 0.01 and "batch_size" with value 32, we would create a directory named "--learning_rate=0.01" with a subdirectory named "--batch_size=32", and save the results in a csv file in that subdirectory.

We use this structure to then read the results from the csv files by searching for the directory that matches the parameters we want, and then reading the csv file in that directory.

The order in which we create the directories is determined by the order in which the parameters are given, so if we are given ["--learning_rate=0.01", "--batch_size=32"] we would create the directories in the following order: "--learning_rate=0.01/--batch_size=32".

The directory structure generated will also depend on existing directories in the root directory, if there are existing directories in the root directory that match some subset of the parameters given, we will create the directory tree from the deepest matching directory.

For example if we only have the following path in the root directory: "--learning_rate=0.01/--batch_size=32" and we are given the parameters ["--learning_rate=0.01", "--batch_size=32", "--num_epochs=10"], we will create the path: "--learning_rate=0.01/--batch_size=32/--num_epochs=10". On the other hand, if we are given the parameters ["--learning_rate=0.02", "--num_epochs=10", "--batch_size=32"], we will create the path: "--learning_rate=0.02/--batch_size=32/--num_epochs=10".

Handles parallel runs trying to create the same directories by waiting a random time (under 1 second) before creating the directory. Should work pretty well in practice, however, may occasionally fail depending on the number of jobs launched at the same time.

Attributes:
  • root_dir (str): Path to the root directory where we will store the csv files.
  • current_path (str): Path to the csv file where we will store the results for the current run.
SaverCsv(logger_instance: slune.base.BaseLogger, params: List[str] = None, root_dir: Optional[str] = './tuning_results')
51    def __init__(self, logger_instance: BaseLogger, params: List[str] = None, root_dir: Optional[str] = os.path.join('.', 'tuning_results')):
52        """ Initialises the csv saver. 
53
54        Args:
55            - logger_instance (BaseLogger): Instance of a logger class that inherits from BaseLogger.
56            - params (list, optional): List of strings containing the parameters used, in form ["--parameter_name=parameter_value", ...], default is None.
57                If None, we will create a path using the parameters given in the log.
58            - root_dir (str, optional): Path to the root directory where we will store the csv files, default is './tuning_results'.
59        
60        """
61
62        super(SaverCsv, self).__init__(logger_instance)
63        self.root_dir = root_dir
64        if params != None:
65            self.current_path = self.get_path(params)

Initialises the csv saver.

Arguments:
  • - logger_instance (BaseLogger): Instance of a logger class that inherits from BaseLogger.
  • - params (list, optional): List of strings containing the parameters used, in form ["--parameter_name=parameter_value", ...], default is None. If None, we will create a path using the parameters given in the log.
  • - root_dir (str, optional): Path to the root directory where we will store the csv files, default is './tuning_results'.
root_dir
def strip_params(self, params: List[str]) -> List[str]:
67    def strip_params(self, params: List[str]) -> List[str]:
68        """ Strips the parameter values.
69
70        Strips the parameter values from the list of parameters given,
71        ie. ["--parameter_name=parameter_value", ...] -> ["--parameter_name=", ...]
72
73        Also gets rid of blank spaces.
74
75        Args:
76            - params (list of str): List of strings containing the parameters used, in form ["--parameter_name=parameter_value", ...].
77
78        Returns:
79            - stripped_params (list of str): List of strings containing the parameters used, in form ["--parameter_name=", ...].
80
81        """
82
83        stripped_params = [p.split('=')[0].strip() for p in params]
84        return stripped_params

Strips the parameter values.

Strips the parameter values from the list of parameters given, ie. ["--parameter_name=parameter_value", ...] -> ["--parameter_name", ...] (everything from the '=' onwards is removed).

Also gets rid of blank spaces.

Arguments:
  • - params (list of str): List of strings containing the parameters used, in form ["--parameter_name=parameter_value", ...].
Returns:
  • stripped_params (list of str): List of strings containing the parameter names only, in form ["--parameter_name", ...].
def get_match(self, params: List[str]) -> str:
 86    def get_match(self, params: List[str]) -> str:
 87        """ Searches the root directory for a path that matches the parameters given.
 88
 89        If only partial matches are found, returns the deepest matching directory with the missing parameters appended.
 90        By deepest we mean the directory with the most parameters matching.
 91        If no matches are found creates a path using the parameters.
 92        Creates path using parameters in the order they are given, 
 93        ie. ["--learning_rate=0.01", "--batch_size=32"] -> "--learning_rate=0.01/--batch_size=32".
 94
 95        If we find a partial match, we add the missing parameters to the end of the path,
 96        ie. if we have the path "--learning_rate=0.01" in the root 
 97        and are given the parameters ["--learning_rate=0.01", "--batch_size=32"],
 98        we will create the path "--learning_rate=0.01/--batch_size=32".
 99
100        Args:
101            - params (list of str): List of strings containing the arguments used, in form ["--argument_name=argument_value", ...].
102
103        Returns:
104            - match (str): Path to the directory that matches the parameters given.
105
106        """
107
108        # First check if there is a directory with path matching some subset of the arguments
109        stripped_params = [p.split('=')[0].strip() +'=' for p in params] # Strip the params of whitespace and everything after the '='
110        if len(set(stripped_params)) != len(stripped_params):
111            raise ValueError(f"Duplicate parameters found in {stripped_params}")
112        match = find_directory_path(stripped_params, root_directory=self.root_dir)
113        # Add on missing parameters
114        if match == self.root_dir:
115            match = os.path.join(*stripped_params)
116        else:
117            missing_params = [p for p in stripped_params if p not in match]
118            if missing_params != []:
119                match = [match] + missing_params
120                match = os.path.join(*match)
121        # Take the root directory out of the match
122        match = match.replace(self.root_dir, '')
123        if match.startswith(os.path.sep):
124            match = match[1:]
125        # Now we add back in the values we stripped out
126        match = match.split(os.path.sep)
127        match = [[p for p in params if m in p][0] for m in match]
128        # Check if there is an existing path with the same numerical values, if so use that instead
129        match = get_numeric_equiv(os.path.join(*match), root_directory=self.root_dir)
130        return match

Searches the root directory for a path that matches the parameters given.

If only partial matches are found, returns the deepest matching directory with the missing parameters appended. By deepest we mean the directory with the most parameters matching. If no matches are found creates a path using the parameters. Creates path using parameters in the order they are given, ie. ["--learning_rate=0.01", "--batch_size=32"] -> "--learning_rate=0.01/--batch_size=32".

If we find a partial match, we add the missing parameters to the end of the path, ie. if we have the path "--learning_rate=0.01" in the root and are given the parameters ["--learning_rate=0.01", "--batch_size=32"], we will create the path "--learning_rate=0.01/--batch_size=32".

Arguments:
  • - params (list of str): List of strings containing the arguments used, in form ["--argument_name=argument_value", ...].
Returns:
  • match (str): Path to the directory that matches the parameters given.
def get_path(self, params: List[str]) -> str:
132    def get_path(self, params: List[str]) -> str:
133        """ Creates a path using the parameters.
134        
135        Does this by first checking for existing paths in the root directory that match the parameters given.
136
137        Check get_match for how we create the path, 
138        once we have the path we check if there is already a csv file with results in that path,
139        if there is we increment the number of the results file name that we will use.
140
141        For example if we get back the path "--learning_rate=0.01/--batch_size=32",
142        and there exists a csv file named "results_0.csv" in the final directory,
143        we will name our csv file "results_1.csv".
144
145        Args:
146            - params (list of str): List of strings containing the arguments used, in form ["--argument_name=argument_value", ...].
147
148        Returns:
149            - csv_file_path (str): Path to the csv file where we will store the results for the current run.
150
151        """
152
153        # Check if root directory exists, if not create it
154        if not os.path.exists(self.root_dir):
155            time.sleep(random.random()) # Wait a random amount of time under 1 second to avoid multiple processes creating the same directory
156            os.makedirs(self.root_dir)
157        # Get path of directory where we should store our csv of results
158        dir_path = self.get_match(params)
159        # Check if directory exists, if not create it
160        if not os.path.exists(dir_path):
161            csv_file_number = 0
162        # If it does exist, check if there is already a csv file with results,
163        # if there is find the name of the last csv file and increment the number
164        else:
165            csv_files = [f for f in os.listdir(dir_path) if f.endswith('.csv')]
166            if len(csv_files) > 0:
167                last_csv_file = max(csv_files)
168                # Check that the last csv file starts with "results_"
169                if not last_csv_file.startswith('results_'):
170                    raise ValueError('Found csv file in directory that doesn\'t start with "results_"')
171                csv_file_number = int(last_csv_file.split('_')[1][:-4]) + 1
172            else:
173                csv_file_number = 0
174        # Create path name for a new csv file where we can later store results
175        csv_file_path = os.path.join(dir_path, f'results_{csv_file_number}.csv')
176        return csv_file_path

Creates a path using the parameters.

Does this by first checking for existing paths in the root directory that match the parameters given.

Check get_match for how we create the path, once we have the path we check if there is already a csv file with results in that path, if there is we increment the number of the results file name that we will use.

For example if we get back the path "--learning_rate=0.01/--batch_size=32", and there exists a csv file named "results_0.csv" in the final directory, we will name our csv file "results_1.csv".

Arguments:
  • - params (list of str): List of strings containing the arguments used, in form ["--argument_name=argument_value", ...].
Returns:
  • csv_file_path (str): Path to the csv file where we will store the results for the current run.
def save_collated_from_results(self, results: pandas.core.frame.DataFrame):
178    def save_collated_from_results(self, results: pd.DataFrame):
179        """ Saves results to csv file.
180        
181        If the csv file already exists, 
182        we append the collated results from the logger to the end of the csv file.
183        If the csv file does not exist,
184        we create it and save the results to it.
185
186        Args:
187            - results (pd.DataFrame): Data frame containing the results to be saved.
188
189        TODO: 
190            - Could be making to many assumptions about the format in which we get the results from the logger,
191            should be able to work with any logger.
192            We should only be assuming that we are saving results to a csv file. 
193
194        """
195
196        # If path does not exist, create it
197        # Remove the csv file name from the path
198        dir_path = self.current_path.split(os.path.sep)[:-1]
199        dir_path = os.path.join(*dir_path)
200        if not os.path.exists(dir_path):
201            time.sleep(random.random()) # Wait a random amount of time under 1 second to avoid multiple processes creating the same directory
202            os.makedirs(dir_path)
203        # If csv file already exists, append results to the end
204        if os.path.exists(self.current_path):
205            results = pd.concat([pd.read_csv(self.current_path), results])
206            results.to_csv(self.current_path, mode='w', index=False)
207        # If csv file does not exist, create it
208        else:
209            results.to_csv(self.current_path, index=False)

Saves results to csv file.

If the csv file already exists, we append the collated results from the logger to the end of the csv file. If the csv file does not exist, we create it and save the results to it.

Arguments:
  • - results (pd.DataFrame): Data frame containing the results to be saved.

TODO: - Could be making too many assumptions about the format in which we get the results from the logger; should be able to work with any logger. We should only be assuming that we are saving results to a csv file.

def save_collated(self):
211    def save_collated(self):
212        """ Saves results to csv file. """
213
214        self.save_collated_from_results(self.logger.results)

Saves results to csv file.

def read( self, params: List[str], metric_name: str, select_by: str = 'max', avg: bool = True) -> (typing.List[str], <class 'float'>):
216    def read(self, params: List[str], metric_name: str, select_by: str ='max', avg: bool =True) -> (List[str], float):
217        """ Finds the min/max value of a metric from all csv files in the root directory that match the parameters given.
218
219        Args:
220            - params (list of str): Contains the parameters used, in form ["--parameter_name=parameter_value", ...].
221            - metric_name (string): Name of the metric to be read.
222            - select_by (string, optional): How to select the 'best' value for the metric from a log file, currently can select by 'min' or 'max'.
223            - avg (bool, optional): Whether to average the metric over all runs, default is True.
224
225        Returns:
226            - best_params (list of str): Contains the arguments used to get the 'best' value of the metric (determined by select_by).
227            - best_value (float): Best value of the metric (determined by select_by).
228
229        """
230
231        #  Get all paths that match the parameters given
232        paths = get_all_paths(params, root_directory=self.root_dir)
233        if paths == []:
234            raise ValueError(f"No paths found matching {params}")
235        # Read the metric from each path
236        values = {}
237        # Do averaging for different runs of same params if avg is True, otherwise just read the metric from each path
238        if avg:
239            paths_same_params = set([os.path.join(*p.split(os.path.sep)[:-1]) for p in paths])
240            for path in paths_same_params:
241                runs = get_all_paths(path.split(os.path.sep), root_directory=self.root_dir)
242                cumsum = 0
243                for r in runs:
244                    df = pd.read_csv(r)
245                    cumsum += self.read_log(df, metric_name, select_by)
246                avg_of_runs = cumsum / len(runs)
247                values[path] = avg_of_runs
248        else:
249            for path in paths:
250                df = pd.read_csv(path)
251                values[os.path.join(*path.split(os.path.sep)[:-1])] = self.read_log(df, metric_name, select_by)
252        # Get the key of the min/max value
253        if select_by == 'min':
254            best_params = min(values, key=values.get)
255        elif select_by == 'max':
256            best_params = max(values, key=values.get)
257        else:
258            raise ValueError(f"select_by must be 'min' or 'max', got {select_by}")
259        # Find the best value of the metric from the key
260        best_value = values[best_params]
261        # Format the path into a list of arguments
262        best_params = best_params.replace(self.root_dir, '')
263        if best_params.startswith(os.path.sep):
264            best_params = best_params[1:]
265        best_params = best_params.split(os.path.sep)
266        return best_params, best_value       

Finds the min/max value of a metric from all csv files in the root directory that match the parameters given.

Arguments:
  • - params (list of str): Contains the parameters used, in form ["--parameter_name=parameter_value", ...].
  • - metric_name (string): Name of the metric to be read.
  • - select_by (string, optional): How to select the 'best' value for the metric from a log file, currently can select by 'min' or 'max'.
  • - avg (bool, optional): Whether to average the metric over all runs, default is True.
Returns:
  • best_params (list of str): Contains the arguments used to get the 'best' value of the metric (determined by select_by).
  • best_value (float): Best value of the metric (determined by select_by).
def exists(self, params: List[str]) -> int:
268    def exists(self, params: List[str]) -> int:
269        """ Checks if results already exist in storage.
270
271        Args:
272            - params (list of str): Contains the parameters used, in form ["--parameter_name=parameter_value", ...].
273
274        Returns:
275            - num_runs (int): Number of runs that exist in storage for the given parameters.
276
277        """
278
279        #  Get all paths that match the parameters given
280        paths = get_all_paths(params, root_directory=self.root_dir)
281        return len(paths)

Checks if results already exist in storage.

Arguments:
  • - params (list of str): Contains the parameters used, in form ["--parameter_name=parameter_value", ...].
Returns:
  • num_runs (int): Number of runs that exist in storage for the given parameters.
def get_current_path(self) -> str:
283    def get_current_path(self) -> str:
284        """ Getter function for the current_path attribute. 
285        
286        Returns:
287            - current_path (str): Path to the csv file where we will store the results for the current run.
288        
289        """
290
291        return self.current_path

Getter function for the current_path attribute.

Returns:
  • current_path (str): Path to the csv file where we will store the results for the current run.
Inherited Members
slune.base.BaseSaver
logger
log
read_log