!pip install flowrunner
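In this flow we define the following steps: create_data, transformation_function_1, transformation_function_2, append_data and show_data.
Note: the last step, show_data, has a return at the end; this is important later, when we look at the flow's data_store.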
# -*- coding: utf-8 -*-
import pandas as pd
from flowrunner import BaseFlow, end, start, step
class ExamplePandas(BaseFlow):
    """Example flow: build two small DataFrames, stamp each with a snapshot
    date, concatenate them and show the result.

    Step order, as declared through the ``next`` argument of each ``@step``:
    ``create_data`` -> ``transformation_function_1`` and
    ``transformation_function_2`` -> ``append_data`` -> ``show_data``.
    """

    @start
    @step(next=["transformation_function_1", "transformation_function_2"])
    def create_data(self):
        """
        In this method we create the dataset we are going to use. In real use cases,
        you'll have to read from a source (csv, parquet, etc)
        For this example we create two dataframes for students ranked by marks scored
        for when they attempted the example on 1st January 2023 and 12th March 2023
        After creating the dataset we pass it to the next methods
        - transformation_function_1
        - transformation_function_2
        """
        data1 = {"Name": ["Hermione", "Harry", "Ron"], "marks": [100, 85, 75]}
        data2 = {"Name": ["Hermione", "Ron", "Harry"], "marks": [100, 90, 80]}
        # Both frames share the same rank labels as their index.
        df1 = pd.DataFrame(data1, index=["rank1", "rank2", "rank3"])
        df2 = pd.DataFrame(data2, index=["rank1", "rank2", "rank3"])
        # Stored on the instance so the downstream steps can read them.
        self.input_data_1 = df1
        self.input_data_2 = df2

    @step(next=["append_data"])
    def transformation_function_1(self):
        """
        Here we add a snapshot_date to the input dataframe of 2023-03-12
        """
        # NOTE(review): create_data's description lists 1st January first, yet
        # input_data_1 is stamped 2023-03-12 here -- confirm the intended pairing.
        # Work on a copy: DataFrame.insert mutates in place, so without the copy
        # self.input_data_1 would also gain the column and a second call to this
        # step would raise ValueError (column already exists).
        transformed_df = self.input_data_1.copy()
        transformed_df.insert(1, "snapshot_date", "2023-03-12")
        self.transformed_df_1 = transformed_df

    @step(next=["append_data"])
    def transformation_function_2(self):
        """
        Here we add a snapshot_date to the input dataframe of 2023-01-01
        """
        # Copy first so the raw input frame is left untouched (see
        # transformation_function_1 for the reasoning).
        transformed_df = self.input_data_2.copy()
        transformed_df.insert(1, "snapshot_date", "2023-01-01")
        self.transformed_df_2 = transformed_df

    @step(next=["show_data"])
    def append_data(self):
        """
        Here we append the two dataframes together
        """
        # Row-wise concatenation; the original index labels are kept, so
        # rank1..rank3 appear once per snapshot in the result.
        self.final_df = pd.concat([self.transformed_df_1, self.transformed_df_2])

    @end
    @step
    def show_data(self):
        """
        Here we show the new final dataframe of aggregated data. However, in real use cases it would
        be more likely to write the data to some final layer/format
        """
        print(self.final_df)
        # This is the only step that returns a value.
        return self.final_df
example = ExamplePandas() # create an instance of the flow; the calls below (show, display, run, print) all operate on it
Use `show()` to show what our flow will do
example.show()
DEBUG | Validating flow for ExamplePandas(data_store={}, param_store={}) | 2023-03-25 23:24:59 | flowrunner.system.logger | 4884 DEBUG | Show flow for ExamplePandas(data_store={}, param_store={}) | 2023-03-25 23:24:59 | flowrunner.system.logger | 4884
create_data This method we create the dataset we are going use. In real use cases, you'll have to read from a source (csv, parquet, etc) For this example we create two dataframes for students ranked by marked scored for when they attempted the example on 1st January 2023 and 12th March 2023 After creating the dataset we pass it to the next methods - transformation_function_1 - transformation_function_2 Next=transformation_function_1, transformation_function_2 transformation_function_1 Here we add a snapshot_date to the input dataframe of 2023-03-12 Next=append_data transformation_function_2 Here we add a snapshot_date to the input dataframe of 2023-01-01 Next=append_data append_data Here we append the two dataframe together Next=show_data show_data Here we show the new final dataframe of aggregated data. However in real use cases. It would be more likely to write the data to some final layer/format
# NOTE(review): presumably renders the flow graphically (no text output is
# recorded for this call) -- confirm against the flowrunner documentation.
example.display()
Run the flow
example.run()
DEBUG | Validating flow for ExamplePandas(data_store={}, param_store={}) | 2023-03-25 23:25:53 | flowrunner.system.logger | 4884 WARNING | Validation will raise InvalidFlowException if invalid Flow found | 2023-03-25 23:25:53 | flowrunner.system.logger | 4884 DEBUG | Running flow for ExamplePandas(data_store={}, param_store={}) | 2023-03-25 23:25:53 | flowrunner.system.logger | 4884
Name snapshot_date marks rank1 Hermione 2023-03-12 100 rank2 Harry 2023-03-12 85 rank3 Ron 2023-03-12 75 rank1 Hermione 2023-01-01 100 rank2 Ron 2023-01-01 90 rank3 Harry 2023-01-01 80
`print()` the object to see the output of each function
- `data_store`: contains the output of each function, in case there was a return
- `param_store`: used to pass reusable parameters to each flow
print(example)
ExamplePandas(data_store={'create_data': None, 'transformation_function_1': None, 'transformation_function_2': None, 'append_data': None, 'show_data': Name snapshot_date marks rank1 Hermione 2023-03-12 100 rank2 Harry 2023-03-12 85 rank3 Ron 2023-03-12 75 rank1 Hermione 2023-01-01 100 rank2 Ron 2023-01-01 90 rank3 Harry 2023-01-01 80}, param_store={})