import pandas as pd
from sqlalchemy import text
from scripts.base import my_eng
from .net_helper import *
from .net_manager_queries import (DELETE_DATASET, INSERT_DATASET,
SELECT_ALL_DATASET_NAMES)
class DataSet:
    """
    A named CSV dataset that can be uploaded to, deleted from, and listed
    out of the 'DataSets' database table.

    Instances hold the dataset name, the path of the CSV file and the
    engine used to execute SQL statements.
    """

    def __init__(self, name: str, path: str, engine=None):
        """
        Initialize a DataSet object.

        :param name: The name of the dataset.
        :type name: str
        :param path: The file path of the dataset CSV file.
        :type path: str
        :param engine: Object used to execute SQL statements. When ``None``
            the module-level ``my_eng`` is used.
        """
        self.name = name
        self.path = path
        self.engine = engine if engine is not None else my_eng

    def validate(self):
        """
        Validate name and path attributes.

        :raises TypeError: If name or path is not a string.
        """
        if not isinstance(self.name, str):
            raise TypeError("name argument must be a string")
        if not isinstance(self.path, str):
            raise TypeError("path argument must be a string")

    @staticmethod
    def _build_insert_query(columns, rendered_values):
        """
        Build one INSERT statement for the 'DataSets' table.

        The dataset name and node id are emitted as the bind parameters
        ``:dataset_name`` and ``:node_id`` so they are never spliced into
        the SQL text. Using ``join`` keeps the statement well-formed even
        when ``columns`` is empty.

        :param columns: Column names taken from the CSV header.
        :param rendered_values: One already-rendered SQL literal per column,
            in the same order as ``columns``.
        :returns: The SQL text of the INSERT statement.
        :rtype: str
        """
        column_list = ",".join(["DataSetName", "nodeId", *columns])
        value_list = ",".join(
            [":dataset_name", ":node_id", *(f"{value}" for value in rendered_values)]
        )
        return f"INSERT INTO DataSets({column_list}) VALUES ({value_list})"

    def upload_dataset(self, NodeId=-1):
        """
        Upload the dataset to the database.

        Every row of the CSV file is inserted with its own INSERT statement.

        :param NodeId: The NodeId associated with the dataset.
        :type NodeId: int

        Notes:
        - Assumes the file at self.path is a valid CSV file.
        - The dataset is uploaded to the 'DataSets' table in the database.
        - The dataset name and NodeId are passed as bind parameters.
        """
        data_df = pd.read_csv(self.path)
        # Column names are captured before process_eicu runs, as in the
        # original code.
        # NOTE(review): if process_eicu adds or drops columns this list goes
        # stale - confirm against the helper.
        columns = data_df.columns.tolist()
        data_df = process_eicu(data_df)

        # Bound once: these values are identical for every inserted row.
        bind_params = {"dataset_name": self.name, "node_id": int(NodeId)}

        for _, row in data_df.iterrows():
            # NOTE(review): cell values are still inlined through is_str,
            # which is defined outside this view; presumably it quotes and
            # escapes string cells - verify.
            rendered = [is_str(data_df, row, column) for column in columns]
            query = self._build_insert_query(columns, rendered)
            self.engine.execute(text(query), bind_params)

    def delete_dataset(self):
        """
        Delete the dataset from the database.

        Notes:
        - Assumes the dataset name is unique in the 'DataSets' table.
        """
        self.engine.execute(text(DELETE_DATASET), {"name": self.name})

    def update_data(self):
        """
        Update the data in the dataset.

        Not implemented yet.
        """
        pass

    @staticmethod
    def list_alldatasets(engine):
        """
        List all dataset names from the 'DataSets' table.

        :param engine: Connectable handed to ``pandas.read_sql``.
        :returns: A DataFrame containing the names of all datasets in the 'DataSets' table.
        :rtype: pd.DataFrame
        """
        return pd.read_sql(text(SELECT_ALL_DATASET_NAMES), engine)