gaitsetpy.dataset.utils

This file contains the utility functions to download and extract the datasets. Supported datasets: Daphnet and HAR-UP (the MobiFall and Arduous functions are placeholders).

Maintainer: @aharshit123456

  1'''
  2    This file contains the utility functions to download and extract the datasets.
  3    Supported datasets:
  4    - Daphnet
  5    
  6Maintainer: @aharshit123456
  7'''
  8
  9## imports
 10import os
 11import requests
 12import zipfile
 13import tarfile
 14import json
 15import pandas as pd
 16import numpy as np
 17from glob import glob
 18
 19#################################################################################
 20############################## DATASET DOWNLOAD #################################
 21#################################################################################
 22
 23def download_dataset(dataset_name, data_dir):
 24    """Download the dataset."""
 25    if dataset_name == "daphnet":
 26        download_daphnet_data(data_dir)
 27    elif dataset_name == "mobifall":
 28        download_mobifall_data(data_dir)
 29    elif dataset_name == "arduous":
 30        download_arduous_data(data_dir)
 31    elif dataset_name == "harup":
 32        download_harup_data(data_dir)
 33    elif dataset_name == "physionet":
 34        # PhysioNet dataset is handled by the PhysioNetLoader itself
 35        pass
 36    else:
 37        raise ValueError(f"Dataset {dataset_name} not supported.")
 38    
 39
 40def download_daphnet_data(data_dir):
 41    """Download the Daphnet dataset.
 42    
 43    This function downloads the Daphnet Freezing of Gait dataset from the UCI Machine Learning Repository.
 44    It shows a progress bar during download and handles various potential errors.
 45    If the file already exists in the specified directory, it skips the download.
 46    
 47    Args:
 48        data_dir (str): Directory where the dataset will be downloaded
 49        
 50    Returns:
 51        str: Path to the downloaded file
 52        
 53    Raises:
 54        ConnectionError: If unable to connect to the download URL
 55        IOError: If unable to create or write to the download directory/file
 56        Exception: For other unexpected errors during download
 57    """
 58    import os
 59    import requests
 60    from tqdm import tqdm
 61    
 62    url = "https://archive.ics.uci.edu/static/public/245/daphnet+freezing+of+gait.zip"
 63    file_path = os.path.join(data_dir, "daphnet.zip")
 64    
 65    # Check if file already exists
 66    if os.path.exists(file_path) and os.path.getsize(file_path) > 0:
 67        print(f"Dataset already exists at: {file_path}")
 68        return file_path
 69    
 70    try:
 71        # Create directory if it doesn't exist
 72        os.makedirs(data_dir, exist_ok=True)
 73        print(f"Downloading Daphnet dataset to: {file_path}")
 74        
 75        # Send a HEAD request first to get the file size
 76        response = requests.head(url)
 77        total_size = int(response.headers.get('content-length', 0))
 78        
 79        # Start the download with progress bar
 80        response = requests.get(url, stream=True)
 81        response.raise_for_status()  # Raise an exception for bad status codes
 82        
 83        # Initialize progress bar
 84        progress_bar = tqdm(
 85            total=total_size,
 86            unit='iB',
 87            unit_scale=True,
 88            desc='Download Progress'
 89        )
 90        
 91        # Write the file with progress updates
 92        with open(file_path, "wb") as file:
 93            for chunk in response.iter_content(chunk_size=8192):
 94                if chunk:
 95                    size = file.write(chunk)
 96                    progress_bar.update(size)
 97        
 98        progress_bar.close()
 99        
100        # Verify download completed successfully
101        if os.path.getsize(file_path) > 0:
102            print(f"Download completed successfully! File saved to: {file_path}")
103            return file_path
104        else:
105            raise IOError("Downloaded file is empty")
106            
107    except requests.exceptions.RequestException as e:
108        print(f"Error connecting to download URL: {e}")
109        if os.path.exists(file_path):
110            os.remove(file_path)  # Clean up partial download
111        raise ConnectionError(f"Failed to download dataset: {e}")
112        
113    except IOError as e:
114        print(f"Error writing download file: {e}")
115        if os.path.exists(file_path):
116            os.remove(file_path)  # Clean up partial download
117        raise IOError(f"Failed to save dataset: {e}")
118        
119    except Exception as e:
120        print(f"Unexpected error during download: {e}")
121        if os.path.exists(file_path):
122            os.remove(file_path)  # Clean up partial download
123        raise Exception(f"Download failed: {e}")
124
def download_mobifall_data(data_dir):
    """Download the MobiFall dataset.

    Placeholder: no download is implemented, so ``data_dir`` is ignored
    and the call returns ``None``.
    """
    return None
128
def download_arduous_data(data_dir):
    """Download the Arduous dataset.

    Placeholder: no download is implemented, so ``data_dir`` is ignored
    and the call returns ``None``.
    """
    return None
132
133
134#################################################################################
135############################## EXTRACT DOWNLOAD #################################
136#################################################################################
137
def extract_dataset(dataset_name, data_dir):
    """Extract the named dataset inside ``data_dir``.

    Picks the dataset-specific extractor for ``dataset_name`` and runs it
    on ``data_dir``. Always returns ``None``.

    Raises:
        ValueError: If ``dataset_name`` is not a supported dataset name.
    """
    if dataset_name == "daphnet":
        extractor = extract_daphnet_data
    elif dataset_name == "mobifall":
        extractor = extract_mobifall_data
    elif dataset_name == "arduous":
        extractor = extract_arduous_data
    elif dataset_name == "harup":
        extractor = extract_harup_data
    elif dataset_name == "physionet":
        # PhysioNet dataset is handled by the PhysioNetLoader itself
        return None
    else:
        raise ValueError(f"Dataset {dataset_name} not supported.")

    extractor(data_dir)
    return None
153    
154
def extract_daphnet_data(data_dir):
    """Extract the Daphnet dataset.

    Opens ``daphnet.zip`` located in ``data_dir`` and unpacks all of its
    members into that same directory.
    """
    archive_path = os.path.join(data_dir, "daphnet.zip")
    archive = zipfile.ZipFile(archive_path, "r")
    try:
        archive.extractall(data_dir)
    finally:
        # Mirrors leaving a ``with`` block: always release the file handle.
        archive.close()
160
def extract_mobifall_data(data_dir):
    """Extract the MobiFall dataset.

    Placeholder: no extraction is implemented, so ``data_dir`` is ignored
    and the call returns ``None``.
    """
    return None
164
def extract_arduous_data(data_dir):
    """Extract the Arduous dataset.

    Placeholder: no extraction is implemented, so ``data_dir`` is ignored
    and the call returns ``None``.
    """
    return None
168
169
170#################################################################################
171############################ OTHER UTILS DOWNLOAD ###############################
172#################################################################################
173
174
def sliding_window(data, window_size, step_size):
    """Cut ``data`` into consecutive slices of ``window_size`` items.

    Window ``k`` is ``data[k * step_size : k * step_size + window_size]``.
    The number of windows is ``(len(data) - window_size) // step_size + 1``,
    so with a positive step any tail too short for a full window is
    dropped, and an input shorter than ``window_size`` yields ``[]``.

    Args:
        data: Any sliceable object that supports ``len()``.
        window_size (int): Number of items per window.
        step_size (int): Offset between the starts of adjacent windows.

    Returns:
        list: The slices, in order of increasing start offset.
    """
    window_count = (len(data) - window_size) // step_size + 1
    return [
        data[k * step_size:k * step_size + window_size]
        for k in range(window_count)
    ]
183
def download_harup_data(data_dir):
    """
    Download the HAR-UP dataset.

    Interactively asks (via ``input``) whether to download the HAR-UP ZIP
    automatically, open the download page for a manual download, or skip.
    Returns immediately if ``<data_dir>/DataSet`` already exists.

    With option 1, a valid ``HAR-UP_Dataset.zip`` already present in
    ``data_dir`` (e.g. saved by hand after option 2) is extracted as-is
    instead of being downloaded again, and is never deleted on failure.
    Only a ZIP fetched by this call is removed when something goes wrong.

    Args:
        data_dir (str): Directory where the dataset will be downloaded

    Returns:
        str: Path to the extracted dataset directory or None if not performed
    """
    import os
    import webbrowser
    import zipfile

    # Create directory if it doesn't exist
    os.makedirs(data_dir, exist_ok=True)

    # Define file paths
    zip_filename = "HAR-UP_Dataset.zip"
    zip_path = os.path.join(data_dir, zip_filename)
    dataset_dir = os.path.join(data_dir, "DataSet")

    # Check if dataset directory already exists
    if os.path.exists(dataset_dir):
        print(f"HAR-UP dataset directory already exists at: {dataset_dir}")
        return dataset_dir

    # Direct download URL from Google Drive (update if needed)
    url = "https://drive.usercontent.google.com/download?id=1Y2MSUijPcB7--PcGoAKhGeqI8GxKK0Pm&export=download&authuser=0"
    print("\n" + "="*80)
    print("HAR-UP DATASET DOWNLOAD")
    print("="*80)
    print("The HAR-UP dataset can be downloaded automatically or manually.")
    print("\nOptions:")
    print("1. Automatic download (recommended)")
    print("2. Manual download")
    print("3. Skip download (if you already have the dataset elsewhere)")

    # Surrounding whitespace in the answer should not make it invalid.
    choice = input("\nEnter your choice (1-3): ").strip()

    if choice == "1":
        # Tracks whether the ZIP on disk was written by this call, so a
        # manually supplied archive is never deleted by the error handler.
        downloaded_here = False
        try:
            if zipfile.is_zipfile(zip_path):
                print(f"\nFound existing HAR-UP dataset ZIP at: {zip_path}")
            else:
                # Third-party imports are deferred to the only branch that
                # needs them.
                import requests
                from tqdm import tqdm

                print(f"\nDownloading HAR-UP dataset ZIP to: {zip_path}")
                print("This may take some time depending on your internet connection...")
                downloaded_here = True
                # The timeout keeps a stalled server from hanging the call.
                with requests.get(url, stream=True, timeout=30) as response:
                    response.raise_for_status()  # Raise an exception for bad status codes
                    total_size = int(response.headers.get('content-length', 0))
                    with open(zip_path, "wb") as file:
                        # Context manager closes the bar even on failure.
                        with tqdm(total=total_size or None, unit='iB', unit_scale=True, desc='Download Progress') as progress_bar:
                            for chunk in response.iter_content(chunk_size=8192):
                                if chunk:
                                    progress_bar.update(file.write(chunk))
                if os.path.getsize(zip_path) == 0:
                    raise IOError("Downloaded file is empty")
                print(f"Download completed successfully! File saved to: {zip_path}")

            print("\nExtracting the downloaded ZIP file...")
            with zipfile.ZipFile(zip_path, "r") as zip_ref:
                zip_ref.extractall(data_dir)
            # Check for DataSet folder
            if not os.path.exists(dataset_dir):
                import shutil
                # Sometimes the zip may contain a top-level folder, e.g., HAR-UP_Dataset/DataSet/...
                for entry in os.listdir(data_dir):
                    entry_path = os.path.join(data_dir, entry)
                    if os.path.isdir(entry_path) and os.path.exists(os.path.join(entry_path, "DataSet")):
                        shutil.move(os.path.join(entry_path, "DataSet"), dataset_dir)
                        break
            if os.path.exists(dataset_dir):
                print(f"Extraction complete. DataSet directory at: {dataset_dir}")
                return dataset_dir
            print("Extraction failed: DataSet directory not found after extraction.")
            return None
        except Exception as e:
            print(f"\nError during download: {e}")
            print("\nPlease try the manual download option instead.")
            if downloaded_here and os.path.exists(zip_path):
                os.remove(zip_path)  # Clean up partial download
            return None

    elif choice == "2":
        print("\nOpening the HAR-UP dataset download page in your browser...")
        print("Please download the ZIP file and save it to the following location:")
        print(f"  {zip_path}")
        webbrowser.open("https://sites.google.com/up.edu.mx/har-up/download")
        print("\nAfter downloading, please ensure the ZIP file is named 'HAR-UP_Dataset.zip' and placed in your data directory.")
        print("Then, rerun this function or choose option 1 to extract.")
        return None

    elif choice == "3":
        print("\nSkipping download. Please ensure the dataset is available at:")
        print(f"  {os.path.join(data_dir, 'DataSet')}")
        return None

    else:
        print("\nInvalid choice. Please run again and select a valid option.")
        return None
289
290
def extract_harup_data(data_dir):
    """
    Extract the HAR-UP dataset zip file if not already extracted.

    Does nothing (beyond printing a message) when ``<data_dir>/DataSet``
    already exists or when ``<data_dir>/HAR-UP_Dataset.zip`` is missing.
    Otherwise the archive is unpacked into ``data_dir``. If the archive
    wraps its content in a top-level folder (``<folder>/DataSet/...``), the
    nested ``DataSet`` directory is moved up to ``<data_dir>/DataSet``.

    Args:
        data_dir (str): Directory holding ``HAR-UP_Dataset.zip``

    Returns:
        None
    """
    import shutil
    import zipfile

    dataset_dir = os.path.join(data_dir, "DataSet")
    if os.path.exists(dataset_dir):
        print(f"HAR-UP dataset already extracted at: {dataset_dir}")
        return
    zip_path = os.path.join(data_dir, "HAR-UP_Dataset.zip")
    if not os.path.exists(zip_path):
        print(f"HAR-UP zip file not found at: {zip_path}")
        print("Please run download_harup_data first.")
        return
    print(f"Extracting HAR-UP dataset zip to: {data_dir}")
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(data_dir)

    # The archive may nest DataSet one level down; lift it to the location
    # the "already extracted" check above looks at.
    if not os.path.exists(dataset_dir):
        for entry in os.listdir(data_dir):
            nested_dir = os.path.join(data_dir, entry, "DataSet")
            if os.path.isdir(nested_dir):
                shutil.move(nested_dir, dataset_dir)
                break
    print("Extraction complete.")
def download_dataset(dataset_name, data_dir):
def download_dataset(dataset_name, data_dir):
    """Download the named dataset into ``data_dir``.

    Dispatches to the dataset-specific downloader and hands back whatever
    that downloader returns, instead of discarding it.

    Args:
        dataset_name (str): One of "daphnet", "mobifall", "arduous",
            "harup" or "physionet".
        data_dir (str): Directory the dataset should be downloaded into.

    Returns:
        The return value of the selected downloader; ``None`` for
        "physionet", which is not downloaded here.

    Raises:
        ValueError: If ``dataset_name`` is not one of the names above.
    """
    if dataset_name == "daphnet":
        return download_daphnet_data(data_dir)
    if dataset_name == "mobifall":
        return download_mobifall_data(data_dir)
    if dataset_name == "arduous":
        return download_arduous_data(data_dir)
    if dataset_name == "harup":
        return download_harup_data(data_dir)
    if dataset_name == "physionet":
        # PhysioNet dataset is handled by the PhysioNetLoader itself
        return None
    raise ValueError(f"Dataset {dataset_name} not supported.")

Download the dataset.

def download_daphnet_data(data_dir):
 41def download_daphnet_data(data_dir):
 42    """Download the Daphnet dataset.
 43    
 44    This function downloads the Daphnet Freezing of Gait dataset from the UCI Machine Learning Repository.
 45    It shows a progress bar during download and handles various potential errors.
 46    If the file already exists in the specified directory, it skips the download.
 47    
 48    Args:
 49        data_dir (str): Directory where the dataset will be downloaded
 50        
 51    Returns:
 52        str: Path to the downloaded file
 53        
 54    Raises:
 55        ConnectionError: If unable to connect to the download URL
 56        IOError: If unable to create or write to the download directory/file
 57        Exception: For other unexpected errors during download
 58    """
 59    import os
 60    import requests
 61    from tqdm import tqdm
 62    
 63    url = "https://archive.ics.uci.edu/static/public/245/daphnet+freezing+of+gait.zip"
 64    file_path = os.path.join(data_dir, "daphnet.zip")
 65    
 66    # Check if file already exists
 67    if os.path.exists(file_path) and os.path.getsize(file_path) > 0:
 68        print(f"Dataset already exists at: {file_path}")
 69        return file_path
 70    
 71    try:
 72        # Create directory if it doesn't exist
 73        os.makedirs(data_dir, exist_ok=True)
 74        print(f"Downloading Daphnet dataset to: {file_path}")
 75        
 76        # Send a HEAD request first to get the file size
 77        response = requests.head(url)
 78        total_size = int(response.headers.get('content-length', 0))
 79        
 80        # Start the download with progress bar
 81        response = requests.get(url, stream=True)
 82        response.raise_for_status()  # Raise an exception for bad status codes
 83        
 84        # Initialize progress bar
 85        progress_bar = tqdm(
 86            total=total_size,
 87            unit='iB',
 88            unit_scale=True,
 89            desc='Download Progress'
 90        )
 91        
 92        # Write the file with progress updates
 93        with open(file_path, "wb") as file:
 94            for chunk in response.iter_content(chunk_size=8192):
 95                if chunk:
 96                    size = file.write(chunk)
 97                    progress_bar.update(size)
 98        
 99        progress_bar.close()
100        
101        # Verify download completed successfully
102        if os.path.getsize(file_path) > 0:
103            print(f"Download completed successfully! File saved to: {file_path}")
104            return file_path
105        else:
106            raise IOError("Downloaded file is empty")
107            
108    except requests.exceptions.RequestException as e:
109        print(f"Error connecting to download URL: {e}")
110        if os.path.exists(file_path):
111            os.remove(file_path)  # Clean up partial download
112        raise ConnectionError(f"Failed to download dataset: {e}")
113        
114    except IOError as e:
115        print(f"Error writing download file: {e}")
116        if os.path.exists(file_path):
117            os.remove(file_path)  # Clean up partial download
118        raise IOError(f"Failed to save dataset: {e}")
119        
120    except Exception as e:
121        print(f"Unexpected error during download: {e}")
122        if os.path.exists(file_path):
123            os.remove(file_path)  # Clean up partial download
124        raise Exception(f"Download failed: {e}")

Download the Daphnet dataset.

This function downloads the Daphnet Freezing of Gait dataset from the UCI Machine Learning Repository. It shows a progress bar during download and handles various potential errors. If the file already exists in the specified directory, it skips the download.

Args: data_dir (str): Directory where the dataset will be downloaded

Returns: str: Path to the downloaded file

Raises: ConnectionError: If unable to connect to the download URL. IOError: If unable to create or write to the download directory/file. Exception: For other unexpected errors during download.

def download_mobifall_data(data_dir):
def download_mobifall_data(data_dir):
    """Download the MobiFall dataset.

    Placeholder: no download is implemented, so ``data_dir`` is ignored
    and the call returns ``None``.
    """
    return None

Download the MobiFall dataset.

def download_arduous_data(data_dir):
def download_arduous_data(data_dir):
    """Download the Arduous dataset.

    Placeholder: no download is implemented, so ``data_dir`` is ignored
    and the call returns ``None``.
    """
    return None

Download the Arduous dataset.

def extract_dataset(dataset_name, data_dir):
def extract_dataset(dataset_name, data_dir):
    """Extract the named dataset inside ``data_dir``.

    Picks the dataset-specific extractor for ``dataset_name`` and runs it
    on ``data_dir``. Always returns ``None``.

    Raises:
        ValueError: If ``dataset_name`` is not a supported dataset name.
    """
    if dataset_name == "daphnet":
        extractor = extract_daphnet_data
    elif dataset_name == "mobifall":
        extractor = extract_mobifall_data
    elif dataset_name == "arduous":
        extractor = extract_arduous_data
    elif dataset_name == "harup":
        extractor = extract_harup_data
    elif dataset_name == "physionet":
        # PhysioNet dataset is handled by the PhysioNetLoader itself
        return None
    else:
        raise ValueError(f"Dataset {dataset_name} not supported.")

    extractor(data_dir)
    return None

Extract the dataset.

def extract_daphnet_data(data_dir):
def extract_daphnet_data(data_dir):
    """Extract the Daphnet dataset.

    Opens ``daphnet.zip`` located in ``data_dir`` and unpacks all of its
    members into that same directory.
    """
    archive_path = os.path.join(data_dir, "daphnet.zip")
    archive = zipfile.ZipFile(archive_path, "r")
    try:
        archive.extractall(data_dir)
    finally:
        # Mirrors leaving a ``with`` block: always release the file handle.
        archive.close()

Extract the Daphnet dataset.

def extract_mobifall_data(data_dir):
def extract_mobifall_data(data_dir):
    """Extract the MobiFall dataset.

    Placeholder: no extraction is implemented, so ``data_dir`` is ignored
    and the call returns ``None``.
    """
    return None

Extract the MobiFall dataset.

def extract_arduous_data(data_dir):
def extract_arduous_data(data_dir):
    """Extract the Arduous dataset.

    Placeholder: no extraction is implemented, so ``data_dir`` is ignored
    and the call returns ``None``.
    """
    return None

Extract the Arduous dataset.

def sliding_window(data, window_size, step_size):
def sliding_window(data, window_size, step_size):
    """Cut ``data`` into consecutive slices of ``window_size`` items.

    Window ``k`` is ``data[k * step_size : k * step_size + window_size]``.
    The number of windows is ``(len(data) - window_size) // step_size + 1``,
    so with a positive step any tail too short for a full window is
    dropped, and an input shorter than ``window_size`` yields ``[]``.

    Args:
        data: Any sliceable object that supports ``len()``.
        window_size (int): Number of items per window.
        step_size (int): Offset between the starts of adjacent windows.

    Returns:
        list: The slices, in order of increasing start offset.
    """
    window_count = (len(data) - window_size) // step_size + 1
    return [
        data[k * step_size:k * step_size + window_size]
        for k in range(window_count)
    ]
def download_harup_data(data_dir):
def download_harup_data(data_dir):
    """
    Download the HAR-UP dataset.

    Interactively asks (via ``input``) whether to download the HAR-UP ZIP
    automatically, open the download page for a manual download, or skip.
    Returns immediately if ``<data_dir>/DataSet`` already exists.

    With option 1, a valid ``HAR-UP_Dataset.zip`` already present in
    ``data_dir`` (e.g. saved by hand after option 2) is extracted as-is
    instead of being downloaded again, and is never deleted on failure.
    Only a ZIP fetched by this call is removed when something goes wrong.

    Args:
        data_dir (str): Directory where the dataset will be downloaded

    Returns:
        str: Path to the extracted dataset directory or None if not performed
    """
    import os
    import webbrowser
    import zipfile

    # Create directory if it doesn't exist
    os.makedirs(data_dir, exist_ok=True)

    # Define file paths
    zip_filename = "HAR-UP_Dataset.zip"
    zip_path = os.path.join(data_dir, zip_filename)
    dataset_dir = os.path.join(data_dir, "DataSet")

    # Check if dataset directory already exists
    if os.path.exists(dataset_dir):
        print(f"HAR-UP dataset directory already exists at: {dataset_dir}")
        return dataset_dir

    # Direct download URL from Google Drive (update if needed)
    url = "https://drive.usercontent.google.com/download?id=1Y2MSUijPcB7--PcGoAKhGeqI8GxKK0Pm&export=download&authuser=0"
    print("\n" + "="*80)
    print("HAR-UP DATASET DOWNLOAD")
    print("="*80)
    print("The HAR-UP dataset can be downloaded automatically or manually.")
    print("\nOptions:")
    print("1. Automatic download (recommended)")
    print("2. Manual download")
    print("3. Skip download (if you already have the dataset elsewhere)")

    # Surrounding whitespace in the answer should not make it invalid.
    choice = input("\nEnter your choice (1-3): ").strip()

    if choice == "1":
        # Tracks whether the ZIP on disk was written by this call, so a
        # manually supplied archive is never deleted by the error handler.
        downloaded_here = False
        try:
            if zipfile.is_zipfile(zip_path):
                print(f"\nFound existing HAR-UP dataset ZIP at: {zip_path}")
            else:
                # Third-party imports are deferred to the only branch that
                # needs them.
                import requests
                from tqdm import tqdm

                print(f"\nDownloading HAR-UP dataset ZIP to: {zip_path}")
                print("This may take some time depending on your internet connection...")
                downloaded_here = True
                # The timeout keeps a stalled server from hanging the call.
                with requests.get(url, stream=True, timeout=30) as response:
                    response.raise_for_status()  # Raise an exception for bad status codes
                    total_size = int(response.headers.get('content-length', 0))
                    with open(zip_path, "wb") as file:
                        # Context manager closes the bar even on failure.
                        with tqdm(total=total_size or None, unit='iB', unit_scale=True, desc='Download Progress') as progress_bar:
                            for chunk in response.iter_content(chunk_size=8192):
                                if chunk:
                                    progress_bar.update(file.write(chunk))
                if os.path.getsize(zip_path) == 0:
                    raise IOError("Downloaded file is empty")
                print(f"Download completed successfully! File saved to: {zip_path}")

            print("\nExtracting the downloaded ZIP file...")
            with zipfile.ZipFile(zip_path, "r") as zip_ref:
                zip_ref.extractall(data_dir)
            # Check for DataSet folder
            if not os.path.exists(dataset_dir):
                import shutil
                # Sometimes the zip may contain a top-level folder, e.g., HAR-UP_Dataset/DataSet/...
                for entry in os.listdir(data_dir):
                    entry_path = os.path.join(data_dir, entry)
                    if os.path.isdir(entry_path) and os.path.exists(os.path.join(entry_path, "DataSet")):
                        shutil.move(os.path.join(entry_path, "DataSet"), dataset_dir)
                        break
            if os.path.exists(dataset_dir):
                print(f"Extraction complete. DataSet directory at: {dataset_dir}")
                return dataset_dir
            print("Extraction failed: DataSet directory not found after extraction.")
            return None
        except Exception as e:
            print(f"\nError during download: {e}")
            print("\nPlease try the manual download option instead.")
            if downloaded_here and os.path.exists(zip_path):
                os.remove(zip_path)  # Clean up partial download
            return None

    elif choice == "2":
        print("\nOpening the HAR-UP dataset download page in your browser...")
        print("Please download the ZIP file and save it to the following location:")
        print(f"  {zip_path}")
        webbrowser.open("https://sites.google.com/up.edu.mx/har-up/download")
        print("\nAfter downloading, please ensure the ZIP file is named 'HAR-UP_Dataset.zip' and placed in your data directory.")
        print("Then, rerun this function or choose option 1 to extract.")
        return None

    elif choice == "3":
        print("\nSkipping download. Please ensure the dataset is available at:")
        print(f"  {os.path.join(data_dir, 'DataSet')}")
        return None

    else:
        print("\nInvalid choice. Please run again and select a valid option.")
        return None

Download the HAR-UP dataset.

This function provides instructions for downloading the HAR-UP dataset and offers an option to download it directly from Google Drive as a ZIP file.

Args: data_dir (str): Directory where the dataset will be downloaded

Returns: str: Path to the extracted dataset directory or None if not performed

def extract_harup_data(data_dir):
def extract_harup_data(data_dir):
    """
    Extract the HAR-UP dataset zip file if not already extracted.

    Does nothing (beyond printing a message) when ``<data_dir>/DataSet``
    already exists or when ``<data_dir>/HAR-UP_Dataset.zip`` is missing.
    Otherwise the archive is unpacked into ``data_dir``. If the archive
    wraps its content in a top-level folder (``<folder>/DataSet/...``), the
    nested ``DataSet`` directory is moved up to ``<data_dir>/DataSet``.

    Args:
        data_dir (str): Directory holding ``HAR-UP_Dataset.zip``

    Returns:
        None
    """
    import shutil
    import zipfile

    dataset_dir = os.path.join(data_dir, "DataSet")
    if os.path.exists(dataset_dir):
        print(f"HAR-UP dataset already extracted at: {dataset_dir}")
        return
    zip_path = os.path.join(data_dir, "HAR-UP_Dataset.zip")
    if not os.path.exists(zip_path):
        print(f"HAR-UP zip file not found at: {zip_path}")
        print("Please run download_harup_data first.")
        return
    print(f"Extracting HAR-UP dataset zip to: {data_dir}")
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(data_dir)

    # The archive may nest DataSet one level down; lift it to the location
    # the "already extracted" check above looks at.
    if not os.path.exists(dataset_dir):
        for entry in os.listdir(data_dir):
            nested_dir = os.path.join(data_dir, entry, "DataSet")
            if os.path.isdir(nested_dir):
                shutil.move(nested_dir, dataset_dir)
                break
    print("Extraction complete.")

Extract the HAR-UP dataset zip file if not already extracted.