3D Alexandria Database¶
[41]:
import json
import os
import logging
from glob import glob
import shutil
import time
import numpy as np
from pyarrow import compute as pc
from parquetdb.core.parquetdb import LoadConfig, NormalizeConfig
from parquetdb.utils.general_utils import timeit
from parquetdb import ParquetDB, config
from parquetdb.utils.external_utils import download_alexandria_3d_database
import matplotlib.pyplot as plt
Setup¶
Setup data directories¶
[42]:
base_dir = os.path.join(config.data_dir, "external", "alexandria", "AlexandriaDB")
benchmark_dir = os.path.join(config.data_dir, "benchmarks", "alexandria")
Download the database¶
Let's download the database.
[43]:
def download_alexandria_database(base_dir, from_scratch=False):
print("Starting task: download_alexandria_database")
if from_scratch and os.path.exists(base_dir):
print(f"Removing existing directory: {base_dir}")
shutil.rmtree(base_dir, ignore_errors=True)
# Here we download the database and save it to the data directory
output_dir = os.path.join(config.data_dir, "external", "alexandria")
alexandria_dir = download_alexandria_3d_database(output_dir, n_cores=8)
print("Done with task: download_alexandria_database")
print("-" * 200)
return alexandria_dir
alexandria_dir = download_alexandria_database(base_dir, from_scratch=False)
print(alexandria_dir)
Starting task: download_alexandria_database
Database downloaded already. Skipping download.
Done with task: download_alexandria_database
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Z:\data\parquetdb\data\external\alexandria\uncompressed
Creating the database¶
[46]:
db = ParquetDB(db_path=os.path.join(base_dir, "alexandria_3D"))
print(db)
============================================================
PARQUETDB SUMMARY
============================================================
Database path: Z:\data\parquetdb\data\external\alexandria\AlexandriaDB\alexandria_3D
• Number of columns: 1
• Number of rows: 0
• Number of files: 1
• Number of rows per file: [0]
• Number of row groups per file: [1]
• Serialized metadata size per file: [312] Bytes
############################################################
METADATA
############################################################
############################################################
COLUMN DETAILS
############################################################
• Columns:
- id
Let's define some dictionaries to store benchmark results.
[47]:
benchmark_dict = {
"create_times": [],
"json_load_times": [],
"n_rows_per_file": [],
}
task_benchmark_dict = {"task_names": [], "task_times": []}
Inputting the data¶
Here we iterate over the JSON files and insert their contents into the database, recording how long each JSON file takes to load, how many materials it contains, and how long the corresponding insert takes.
This dataset is rather large, so you may need to choose the normalization parameters according to how much RAM you have. If you run out of memory, tweak the batch_size, batch_readahead, and fragment_readahead parameters.
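For example, on a machine with limited RAM you might start from a more conservative configuration than the one used below; the exact values here are only a sketch and should be tuned to your hardware:
[ ]:
low_memory_config = NormalizeConfig(
    load_format="batches",  # stream batches instead of materializing whole tables
    batch_readahead=1,  # buffer fewer batches ahead of time
    fragment_readahead=1,  # buffer fewer files ahead of time
    batch_size=10000,  # smaller batches lower the peak memory usage
    max_rows_per_file=100000,
    max_rows_per_group=100000,
)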
[48]:
def create_database_if_empty(db, alexandria_dir, normalize_config=NormalizeConfig()):
"""_summary_
Parameters
----------
db : ParquetDB Instance
The database instance to create the dataset on.
alexandria_dir : str
The directory containing the json files to input into the database.
Returns
-------
_type_
_description_
"""
print("Starting task: create_database_if_empty")
start_time = time.time()
json_load_times = []
create_times = []
n_materials_per_file = []
if db.is_empty():
print("The dataset does not exist. Creating it.")
json_files = glob(os.path.join(alexandria_dir, "*.json"))
for json_file in json_files[:]:
start_time = time.time()
with open(json_file, "r") as f:
data = json.load(f)
json_load_time = time.time() - start_time
base_name = os.path.basename(json_file)
n_materials = len(data["entries"])
print(f"Processing file: {base_name}")
print(f"Number of materials: {n_materials}")
try:
# Since we are importing a lot of data, it is best
# to normalize the database afterwards
start_time = time.time()
db.create(
data["entries"],
normalize_dataset=False,
normalize_config=normalize_config,
)
create_time = time.time() - start_time
create_times.append(create_time)
n_materials_per_file.append(n_materials)
json_load_times.append(json_load_time)
except Exception as e:
print(e)
data = None
print(f"Time taken to create dataset: {time.time() - start_time}")
print("-" * 100)
print("Done with task: create_database_if_empty")
print("-" * 200)
return json_load_times, create_times, n_materials_per_file
normalize_config = NormalizeConfig(
load_format="batches",  # Use the batch generator to normalize
batch_readahead=4,  # Number of batches to load into memory ahead of time; affects RAM usage
fragment_readahead=2,  # Number of files to load into memory ahead of time; affects RAM usage
batch_size=100000,  # Batch size to use when normalizing; affects RAM usage
max_rows_per_file=100000,  # Max number of rows per parquet file
max_rows_per_group=100000,  # Max number of rows per parquet row group
)
json_load_times, create_times, n_materials_per_file = create_database_if_empty(
db, alexandria_dir, normalize_config=normalize_config
)
benchmark_dict["create_times"] = create_times
benchmark_dict["json_load_times"] = json_load_times
benchmark_dict["n_rows_per_file"] = n_materials_per_file
Starting task: create_database_if_empty
The dataset does not exist. Creating it.
Processing file: alexandria_001.json
Number of materials: 100000
Time taken to create dataset: 9.207536935806274
----------------------------------------------------------------------------------------------------
Processing file: alexandria_002.json
Number of materials: 100000
Time taken to create dataset: 9.03726840019226
----------------------------------------------------------------------------------------------------
Processing file: alexandria_003.json
Number of materials: 100000
Time taken to create dataset: 7.272280693054199
----------------------------------------------------------------------------------------------------
Processing file: alexandria_004.json
Number of materials: 100000
Time taken to create dataset: 6.3968799114227295
----------------------------------------------------------------------------------------------------
Processing file: alexandria_005.json
Number of materials: 100000
Time taken to create dataset: 9.731169700622559
----------------------------------------------------------------------------------------------------
Processing file: alexandria_006.json
Number of materials: 100000
Time taken to create dataset: 7.279998540878296
----------------------------------------------------------------------------------------------------
Processing file: alexandria_007.json
Number of materials: 100000
Time taken to create dataset: 7.452792167663574
----------------------------------------------------------------------------------------------------
Processing file: alexandria_008.json
Number of materials: 100000
Time taken to create dataset: 6.5389344692230225
----------------------------------------------------------------------------------------------------
Processing file: alexandria_009.json
Number of materials: 100000
Time taken to create dataset: 12.014537334442139
----------------------------------------------------------------------------------------------------
Processing file: alexandria_010.json
Number of materials: 100000
Time taken to create dataset: 7.618973016738892
----------------------------------------------------------------------------------------------------
Processing file: alexandria_011.json
Number of materials: 100000
Time taken to create dataset: 8.03697943687439
----------------------------------------------------------------------------------------------------
Processing file: alexandria_012.json
Number of materials: 100000
Time taken to create dataset: 6.7988600730896
----------------------------------------------------------------------------------------------------
Processing file: alexandria_013.json
Number of materials: 100000
Time taken to create dataset: 6.983482122421265
----------------------------------------------------------------------------------------------------
Processing file: alexandria_014.json
Number of materials: 100000
Time taken to create dataset: 7.620621919631958
----------------------------------------------------------------------------------------------------
Processing file: alexandria_015.json
Number of materials: 100000
Time taken to create dataset: 7.09317684173584
----------------------------------------------------------------------------------------------------
Processing file: alexandria_016.json
Number of materials: 100000
Time taken to create dataset: 7.633606433868408
----------------------------------------------------------------------------------------------------
Processing file: alexandria_017.json
Number of materials: 100000
Time taken to create dataset: 7.403124570846558
----------------------------------------------------------------------------------------------------
Processing file: alexandria_018.json
Number of materials: 100000
Time taken to create dataset: 7.627993822097778
----------------------------------------------------------------------------------------------------
Processing file: alexandria_019.json
Number of materials: 100000
Time taken to create dataset: 7.117793560028076
----------------------------------------------------------------------------------------------------
Processing file: alexandria_020.json
Number of materials: 100000
Time taken to create dataset: 7.894531011581421
----------------------------------------------------------------------------------------------------
Processing file: alexandria_021.json
Number of materials: 100000
Time taken to create dataset: 6.83626651763916
----------------------------------------------------------------------------------------------------
Processing file: alexandria_022.json
Number of materials: 100000
Time taken to create dataset: 6.791656732559204
----------------------------------------------------------------------------------------------------
Processing file: alexandria_023.json
Number of materials: 100000
Time taken to create dataset: 6.708625555038452
----------------------------------------------------------------------------------------------------
Processing file: alexandria_024.json
Number of materials: 100000
Time taken to create dataset: 8.351858377456665
----------------------------------------------------------------------------------------------------
Processing file: alexandria_025.json
Number of materials: 100000
Time taken to create dataset: 7.344949007034302
----------------------------------------------------------------------------------------------------
Processing file: alexandria_026.json
Number of materials: 100000
Time taken to create dataset: 27.566998958587646
----------------------------------------------------------------------------------------------------
Processing file: alexandria_027.json
Number of materials: 100000
Time taken to create dataset: 7.999422073364258
----------------------------------------------------------------------------------------------------
Processing file: alexandria_028.json
Number of materials: 100000
Time taken to create dataset: 8.236054182052612
----------------------------------------------------------------------------------------------------
Processing file: alexandria_029.json
Number of materials: 100000
Time taken to create dataset: 8.136978387832642
----------------------------------------------------------------------------------------------------
Processing file: alexandria_030.json
Number of materials: 100000
Time taken to create dataset: 7.855477571487427
----------------------------------------------------------------------------------------------------
Processing file: alexandria_031.json
Number of materials: 100000
Time taken to create dataset: 7.7916247844696045
----------------------------------------------------------------------------------------------------
Processing file: alexandria_032.json
Number of materials: 100000
Time taken to create dataset: 6.601974010467529
----------------------------------------------------------------------------------------------------
Processing file: alexandria_033.json
Number of materials: 100000
Time taken to create dataset: 6.663681983947754
----------------------------------------------------------------------------------------------------
Processing file: alexandria_034.json
Number of materials: 100000
Time taken to create dataset: 7.94143009185791
----------------------------------------------------------------------------------------------------
Processing file: alexandria_035.json
Number of materials: 100000
Time taken to create dataset: 7.299615859985352
----------------------------------------------------------------------------------------------------
Processing file: alexandria_036.json
Number of materials: 100000
Time taken to create dataset: 6.486094951629639
----------------------------------------------------------------------------------------------------
Processing file: alexandria_037.json
Number of materials: 100000
Time taken to create dataset: 8.335729360580444
----------------------------------------------------------------------------------------------------
Processing file: alexandria_038.json
Number of materials: 100000
Time taken to create dataset: 6.945127248764038
----------------------------------------------------------------------------------------------------
Processing file: alexandria_039.json
Number of materials: 100000
Time taken to create dataset: 7.50490403175354
----------------------------------------------------------------------------------------------------
Processing file: alexandria_040.json
Number of materials: 100000
Time taken to create dataset: 7.409191131591797
----------------------------------------------------------------------------------------------------
Processing file: alexandria_041.json
Number of materials: 100000
Time taken to create dataset: 7.5053534507751465
----------------------------------------------------------------------------------------------------
Processing file: alexandria_042.json
Number of materials: 100000
Time taken to create dataset: 7.061246871948242
----------------------------------------------------------------------------------------------------
Processing file: alexandria_043.json
Number of materials: 100000
Time taken to create dataset: 6.405843496322632
----------------------------------------------------------------------------------------------------
Processing file: alexandria_044.json
Number of materials: 89295
Time taken to create dataset: 5.768043756484985
----------------------------------------------------------------------------------------------------
Done with task: create_database_if_empty
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Normalizing the dataset¶
It is best practice to normalize the dataset after all the data has been added. This will optimize the performance of the database.
First let’s see how the data is distributed in the row groups.
[49]:
summary = db.summary(show_row_group_metadata=True)
print(summary)
============================================================
PARQUETDB SUMMARY
============================================================
Database path: Z:\data\parquetdb\data\external\alexandria\AlexandriaDB\alexandria_3D
• Number of columns: 128
• Number of rows: 4389295
• Number of files: 44
• Number of rows per file: [100000, 100000, 100000, 100000, 100000, 100000, 100000, 100000, 100000, 100000, 100000, 100000, 100000, 100000, 100000, 100000, 100000, 100000, 100000, 100000, 100000, 100000, 100000, 100000, 100000, 100000, 100000, 100000, 100000, 100000, 100000, 100000, 100000, 100000, 100000, 100000, 100000, 100000, 89295, 100000, 100000, 100000, 100000, 100000]
• Number of row groups per file: [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 4, 4, 4, 4, 4]
• Number of rows per row group per file:
- alexandria_3D_0.parquet:
- Row group 0: 32768 rows
- Row group 1: 32768 rows
- Row group 2: 32768 rows
- Row group 3: 1696 rows
- alexandria_3D_1.parquet:
- Row group 0: 32768 rows
- Row group 1: 32768 rows
- Row group 2: 32768 rows
- Row group 3: 1696 rows
- alexandria_3D_10.parquet:
- Row group 0: 32768 rows
- Row group 1: 32768 rows
- Row group 2: 32768 rows
- Row group 3: 1696 rows
- alexandria_3D_11.parquet:
- Row group 0: 32768 rows
- Row group 1: 32768 rows
- Row group 2: 32768 rows
- Row group 3: 1696 rows
- alexandria_3D_12.parquet:
- Row group 0: 32768 rows
- Row group 1: 32768 rows
- Row group 2: 32768 rows
- Row group 3: 1696 rows
- alexandria_3D_13.parquet:
- Row group 0: 32768 rows
- Row group 1: 32768 rows
- Row group 2: 32768 rows
- Row group 3: 1696 rows
- alexandria_3D_14.parquet:
- Row group 0: 32768 rows
- Row group 1: 32768 rows
- Row group 2: 32768 rows
- Row group 3: 1696 rows
- alexandria_3D_15.parquet:
- Row group 0: 32768 rows
- Row group 1: 32768 rows
- Row group 2: 32768 rows
- Row group 3: 1696 rows
- alexandria_3D_16.parquet:
- Row group 0: 32768 rows
- Row group 1: 32768 rows
- Row group 2: 32768 rows
- Row group 3: 1696 rows
- alexandria_3D_17.parquet:
- Row group 0: 32768 rows
- Row group 1: 32768 rows
- Row group 2: 32768 rows
- Row group 3: 1696 rows
- alexandria_3D_18.parquet:
- Row group 0: 32768 rows
- Row group 1: 32768 rows
- Row group 2: 32768 rows
- Row group 3: 1696 rows
- alexandria_3D_19.parquet:
- Row group 0: 32768 rows
- Row group 1: 32768 rows
- Row group 2: 32768 rows
- Row group 3: 1696 rows
- alexandria_3D_2.parquet:
- Row group 0: 32768 rows
- Row group 1: 32768 rows
- Row group 2: 32768 rows
- Row group 3: 1696 rows
- alexandria_3D_20.parquet:
- Row group 0: 32768 rows
- Row group 1: 32768 rows
- Row group 2: 32768 rows
- Row group 3: 1696 rows
- alexandria_3D_21.parquet:
- Row group 0: 32768 rows
- Row group 1: 32768 rows
- Row group 2: 32768 rows
- Row group 3: 1696 rows
- alexandria_3D_22.parquet:
- Row group 0: 32768 rows
- Row group 1: 32768 rows
- Row group 2: 32768 rows
- Row group 3: 1696 rows
- alexandria_3D_23.parquet:
- Row group 0: 32768 rows
- Row group 1: 32768 rows
- Row group 2: 32768 rows
- Row group 3: 1696 rows
- alexandria_3D_24.parquet:
- Row group 0: 32768 rows
- Row group 1: 32768 rows
- Row group 2: 32768 rows
- Row group 3: 1696 rows
- alexandria_3D_25.parquet:
- Row group 0: 100000 rows
- alexandria_3D_26.parquet:
- Row group 0: 100000 rows
- alexandria_3D_27.parquet:
- Row group 0: 100000 rows
- alexandria_3D_28.parquet:
- Row group 0: 100000 rows
- alexandria_3D_29.parquet:
- Row group 0: 100000 rows
- alexandria_3D_3.parquet:
- Row group 0: 32768 rows
- Row group 1: 32768 rows
- Row group 2: 32768 rows
- Row group 3: 1696 rows
- alexandria_3D_30.parquet:
- Row group 0: 100000 rows
- alexandria_3D_31.parquet:
- Row group 0: 100000 rows
- alexandria_3D_32.parquet:
- Row group 0: 100000 rows
- alexandria_3D_33.parquet:
- Row group 0: 100000 rows
- alexandria_3D_34.parquet:
- Row group 0: 100000 rows
- alexandria_3D_35.parquet:
- Row group 0: 100000 rows
- alexandria_3D_36.parquet:
- Row group 0: 100000 rows
- alexandria_3D_37.parquet:
- Row group 0: 100000 rows
- alexandria_3D_38.parquet:
- Row group 0: 100000 rows
- alexandria_3D_39.parquet:
- Row group 0: 100000 rows
- alexandria_3D_4.parquet:
- Row group 0: 32768 rows
- Row group 1: 32768 rows
- Row group 2: 32768 rows
- Row group 3: 1696 rows
- alexandria_3D_40.parquet:
- Row group 0: 100000 rows
- alexandria_3D_41.parquet:
- Row group 0: 100000 rows
- alexandria_3D_42.parquet:
- Row group 0: 100000 rows
- alexandria_3D_43.parquet:
- Row group 0: 89295 rows
- alexandria_3D_5.parquet:
- Row group 0: 32768 rows
- Row group 1: 32768 rows
- Row group 2: 32768 rows
- Row group 3: 1696 rows
- alexandria_3D_6.parquet:
- Row group 0: 32768 rows
- Row group 1: 32768 rows
- Row group 2: 32768 rows
- Row group 3: 1696 rows
- alexandria_3D_7.parquet:
- Row group 0: 32768 rows
- Row group 1: 32768 rows
- Row group 2: 32768 rows
- Row group 3: 1696 rows
- alexandria_3D_8.parquet:
- Row group 0: 32768 rows
- Row group 1: 32768 rows
- Row group 2: 32768 rows
- Row group 3: 1696 rows
- alexandria_3D_9.parquet:
- Row group 0: 32768 rows
- Row group 1: 32768 rows
- Row group 2: 32768 rows
- Row group 3: 1696 rows
• Serialized metadata size per file: [74101, 77444, 77856, 77696, 78981, 76117, 74681, 75387, 77761, 77151, 73789, 78065, 76137, 75769, 77500, 72899, 79079, 77137, 30983, 31079, 31034, 31038, 31521, 75599, 31495, 30344, 31060, 31337, 31354, 31177, 31163, 31191, 30452, 30605, 75367, 30766, 31097, 30715, 30329, 74876, 76781, 78952, 78372, 77480] Bytes
############################################################
METADATA
############################################################
############################################################
COLUMN DETAILS
############################################################
Let’s check the file sizes of the Parquet files.
[50]:
def list_file_sizes(directory, in_MB=True):
"""Lists the size of each file in a directory.
Args:
directory: The path to the directory.
in_MB: If True, report sizes in megabytes instead of bytes.
Returns:
A dict mapping filename to file size.
"""
file_sizes = {}
for filename in os.listdir(directory):
file_path = os.path.join(directory, filename)
if os.path.isfile(file_path):
file_size = os.path.getsize(file_path)
if in_MB:
file_size = file_size / (1024 * 1024)
file_sizes[filename] = file_size
return file_sizes
file_sizes = list_file_sizes(db.db_path)
for file, size in file_sizes.items():
print(f"{file}: {size} MB")
alexandria_3D_0.parquet: 63.217278480529785 MB
alexandria_3D_1.parquet: 66.39281272888184 MB
alexandria_3D_10.parquet: 67.79165077209473 MB
alexandria_3D_11.parquet: 76.40940952301025 MB
alexandria_3D_12.parquet: 64.72634792327881 MB
alexandria_3D_13.parquet: 61.073683738708496 MB
alexandria_3D_14.parquet: 59.82237529754639 MB
alexandria_3D_15.parquet: 59.66804218292236 MB
alexandria_3D_16.parquet: 79.35534191131592 MB
alexandria_3D_17.parquet: 65.30797958374023 MB
alexandria_3D_18.parquet: 53.305752754211426 MB
alexandria_3D_19.parquet: 67.49996185302734 MB
alexandria_3D_2.parquet: 80.63626098632812 MB
alexandria_3D_20.parquet: 60.81020164489746 MB
alexandria_3D_21.parquet: 71.9999008178711 MB
alexandria_3D_22.parquet: 57.005184173583984 MB
alexandria_3D_23.parquet: 76.82518005371094 MB
alexandria_3D_24.parquet: 69.84022235870361 MB
alexandria_3D_25.parquet: 73.2688102722168 MB
alexandria_3D_26.parquet: 70.60673236846924 MB
alexandria_3D_27.parquet: 79.58608531951904 MB
alexandria_3D_28.parquet: 79.32598686218262 MB
alexandria_3D_29.parquet: 72.01288318634033 MB
alexandria_3D_3.parquet: 58.68510627746582 MB
alexandria_3D_30.parquet: 68.60778427124023 MB
alexandria_3D_31.parquet: 53.06931400299072 MB
alexandria_3D_32.parquet: 53.88803291320801 MB
alexandria_3D_33.parquet: 73.62857913970947 MB
alexandria_3D_34.parquet: 63.31537055969238 MB
alexandria_3D_35.parquet: 51.56621837615967 MB
alexandria_3D_36.parquet: 80.69254112243652 MB
alexandria_3D_37.parquet: 59.53824043273926 MB
alexandria_3D_38.parquet: 64.71589279174805 MB
alexandria_3D_39.parquet: 66.9590711593628 MB
alexandria_3D_4.parquet: 58.02934169769287 MB
alexandria_3D_40.parquet: 66.78332233428955 MB
alexandria_3D_41.parquet: 61.57388782501221 MB
alexandria_3D_42.parquet: 52.7365026473999 MB
alexandria_3D_43.parquet: 46.13230228424072 MB
alexandria_3D_5.parquet: 72.92802429199219 MB
alexandria_3D_6.parquet: 66.0786542892456 MB
alexandria_3D_7.parquet: 71.52147197723389 MB
alexandria_3D_8.parquet: 68.42291927337646 MB
alexandria_3D_9.parquet: 73.67225074768066 MB
Let’s also check the size of the row groups.
[51]:
row_group_metadata_per_file = db.get_parquet_file_row_group_metadata_per_file(
as_dict=True
)
row_group_size_per_file = {}
sum_row_group_size = 0
num_row_groups = 0
for file, row_group_metadata in row_group_metadata_per_file.items():
print(f"{file}")
row_group_size_per_file[file] = {}
for row_group, metadata in row_group_metadata.items():
row_group_size_per_file[file][row_group] = metadata["total_byte_size"] / (
1024 * 1024
)
sum_row_group_size += row_group_size_per_file[file][row_group]
num_row_groups += 1
print(f" {row_group}: {row_group_size_per_file[file][row_group]} MB")
print(f"Average row group size: {sum_row_group_size/num_row_groups} MB")
alexandria_3D_0.parquet
0: 33.88526916503906 MB
1: 28.67838764190674 MB
2: 26.826565742492676 MB
3: 1.0584754943847656 MB
alexandria_3D_1.parquet
0: 23.262088775634766 MB
1: 34.650197982788086 MB
2: 36.09531307220459 MB
3: 1.8985786437988281 MB
alexandria_3D_10.parquet
0: 28.755844116210938 MB
1: 31.349617958068848 MB
2: 33.02553176879883 MB
3: 1.9675817489624023 MB
alexandria_3D_11.parquet
0: 48.49870491027832 MB
1: 29.866143226623535 MB
2: 28.96322250366211 MB
3: 1.456308364868164 MB
alexandria_3D_12.parquet
0: 35.41706848144531 MB
1: 27.881027221679688 MB
2: 27.899497985839844 MB
3: 1.2672271728515625 MB
alexandria_3D_13.parquet
0: 29.536629676818848 MB
1: 28.44379234313965 MB
2: 27.21775722503662 MB
3: 1.3715496063232422 MB
alexandria_3D_14.parquet
0: 32.75925064086914 MB
1: 25.701278686523438 MB
2: 24.80800437927246 MB
3: 1.0945501327514648 MB
alexandria_3D_15.parquet
0: 28.813363075256348 MB
1: 24.955053329467773 MB
2: 27.343029022216797 MB
3: 2.002194404602051 MB
alexandria_3D_16.parquet
0: 38.87716007232666 MB
1: 44.16834259033203 MB
2: 30.809864044189453 MB
3: 1.4979143142700195 MB
alexandria_3D_17.parquet
0: 31.438282012939453 MB
1: 29.75895881652832 MB
2: 31.682196617126465 MB
3: 1.9960041046142578 MB
alexandria_3D_18.parquet
0: 26.820265769958496 MB
1: 21.677895545959473 MB
2: 24.17652416229248 MB
3: 1.1478967666625977 MB
alexandria_3D_19.parquet
0: 30.522936820983887 MB
1: 37.90152645111084 MB
2: 27.28691577911377 MB
3: 1.362447738647461 MB
alexandria_3D_2.parquet
0: 39.840396881103516 MB
1: 35.17914295196533 MB
2: 37.78933525085449 MB
3: 1.4440546035766602 MB
alexandria_3D_20.parquet
0: 28.389055252075195 MB
1: 26.526031494140625 MB
2: 30.0849027633667 MB
3: 1.1245336532592773 MB
alexandria_3D_21.parquet
0: 26.3947114944458 MB
1: 39.86369228363037 MB
2: 34.58688449859619 MB
3: 1.5055503845214844 MB
alexandria_3D_22.parquet
0: 30.08759593963623 MB
1: 24.752853393554688 MB
2: 23.644336700439453 MB
3: 1.2023286819458008 MB
alexandria_3D_23.parquet
0: 33.71604251861572 MB
1: 43.66508960723877 MB
2: 33.232436180114746 MB
3: 1.5115013122558594 MB
alexandria_3D_24.parquet
0: 33.69458198547363 MB
1: 29.915438652038574 MB
2: 33.283854484558105 MB
3: 2.087203025817871 MB
alexandria_3D_25.parquet
0: 113.63787937164307 MB
alexandria_3D_26.parquet
0: 110.95186138153076 MB
alexandria_3D_27.parquet
0: 121.27318477630615 MB
alexandria_3D_28.parquet
0: 120.9677734375 MB
alexandria_3D_29.parquet
0: 114.41764545440674 MB
alexandria_3D_3.parquet
0: 27.135764122009277 MB
1: 26.59963893890381 MB
2: 28.526336669921875 MB
3: 1.7803220748901367 MB
alexandria_3D_30.parquet
0: 111.70868110656738 MB
alexandria_3D_31.parquet
0: 87.3005952835083 MB
alexandria_3D_32.parquet
0: 88.21440029144287 MB
alexandria_3D_33.parquet
0: 115.94622802734375 MB
alexandria_3D_34.parquet
0: 102.83384799957275 MB
alexandria_3D_35.parquet
0: 86.06189823150635 MB
alexandria_3D_36.parquet
0: 120.09446239471436 MB
alexandria_3D_37.parquet
0: 95.56644344329834 MB
alexandria_3D_38.parquet
0: 106.56142330169678 MB
alexandria_3D_39.parquet
0: 106.53310775756836 MB
alexandria_3D_4.parquet
0: 26.864155769348145 MB
1: 30.033101081848145 MB
2: 24.01892852783203 MB
3: 1.1299266815185547 MB
alexandria_3D_40.parquet
0: 107.71124458312988 MB
alexandria_3D_41.parquet
0: 98.69949913024902 MB
alexandria_3D_42.parquet
0: 86.55539226531982 MB
alexandria_3D_43.parquet
0: 76.38001537322998 MB
alexandria_3D_5.parquet
0: 30.619487762451172 MB
1: 40.44719314575195 MB
2: 31.88987159729004 MB
3: 1.137807846069336 MB
alexandria_3D_6.parquet
0: 28.624879837036133 MB
1: 27.89090061187744 MB
2: 35.02172374725342 MB
3: 1.4424219131469727 MB
alexandria_3D_7.parquet
0: 43.56060314178467 MB
1: 27.994236946105957 MB
2: 29.434123039245605 MB
3: 1.6086492538452148 MB
alexandria_3D_8.parquet
0: 28.378860473632812 MB
1: 33.01034450531006 MB
2: 35.88528060913086 MB
3: 1.7302274703979492 MB
alexandria_3D_9.parquet
0: 42.97386932373047 MB
1: 33.387916564941406 MB
2: 26.984159469604492 MB
3: 1.7877273559570312 MB
Average row group size: 36.577393547827455 MB
For optimal performance we should aim for roughly 2GB per file and about 200-500MB per row group.
Currently, a row group with 32,768 rows is ~30MB. To reach 200MB we need about 200MB/30MB ≈ 6.7 times as many rows per group, or roughly 200,000 rows.
If each row group is about 200MB at 200,000 rows, then a 2GB file holds 2GB/200MB = 10 row groups, or about 2,000,000 rows per file.
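For reference, the arithmetic behind these targets can be written out explicitly (a back-of-the-envelope sketch using the averages reported above):
[ ]:
avg_rows_per_group = 32768  # rows in a typical row group above
avg_group_size_mb = 30  # approximate size of such a row group, from the summary above
target_group_size_mb = 200  # desired row group size
target_file_size_mb = 2048  # desired file size (~2GB)
scale = target_group_size_mb / avg_group_size_mb  # ~6.7x more rows per group
rows_per_group = int(avg_rows_per_group * scale)  # ~218,000; rounded down to 200,000 below
groups_per_file = target_file_size_mb // target_group_size_mb  # ~10 row groups per file
rows_per_file = 200000 * groups_per_file  # ~2,000,000 rows per file
print(rows_per_group, groups_per_file, rows_per_file)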
[52]:
def normalize_dataset(db, normalize_config=NormalizeConfig()):
task_name = "normalize_dataset"
print("Starting task: normalize_dataset")
db.normalize(normalize_config=normalize_config)
print("Done with task: normalize_dataset")
print("-" * 200)
return task_name
normalize_config = NormalizeConfig(
load_format="batches",  # Use the batch generator to normalize
batch_readahead=4,  # Number of batches to load into memory ahead of time; affects RAM usage
fragment_readahead=1,  # Number of files to load into memory ahead of time; affects RAM usage
batch_size=10000,  # Batch size to use when normalizing; affects RAM usage
max_rows_per_file=2000000,  # Max number of rows per parquet file
max_rows_per_group=200000,  # Max number of rows per parquet row group
min_rows_per_group=200000,  # Min number of rows per parquet row group
)
normalize_dataset(db, normalize_config=normalize_config)
print(db.summary(show_row_group_metadata=True))
Starting task: normalize_dataset
Done with task: normalize_dataset
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
============================================================
PARQUETDB SUMMARY
============================================================
Database path: Z:\data\parquetdb\data\external\alexandria\AlexandriaDB\alexandria_3D
• Number of columns: 128
• Number of rows: 4389295
• Number of files: 3
• Number of rows per file: [2000000, 2000000, 389295]
• Number of row groups per file: [10, 10, 2]
• Number of rows per row group per file:
- alexandria_3D_0.parquet:
- Row group 0: 200000 rows
- Row group 1: 200000 rows
- Row group 2: 200000 rows
- Row group 3: 200000 rows
- Row group 4: 200000 rows
- Row group 5: 200000 rows
- Row group 6: 200000 rows
- Row group 7: 200000 rows
- Row group 8: 200000 rows
- Row group 9: 200000 rows
- alexandria_3D_1.parquet:
- Row group 0: 200000 rows
- Row group 1: 200000 rows
- Row group 2: 200000 rows
- Row group 3: 200000 rows
- Row group 4: 200000 rows
- Row group 5: 200000 rows
- Row group 6: 200000 rows
- Row group 7: 200000 rows
- Row group 8: 200000 rows
- Row group 9: 200000 rows
- alexandria_3D_2.parquet:
- Row group 0: 200000 rows
- Row group 1: 189295 rows
• Serialized metadata size per file: [180510, 178802, 48142] Bytes
############################################################
METADATA
############################################################
############################################################
COLUMN DETAILS
############################################################
Let’s check the file sizes and row group sizes again.
[53]:
file_sizes = list_file_sizes(db.db_path)
for file, size in file_sizes.items():
print(f"{file}: {size} MB")
row_group_metadata_per_file = db.get_parquet_file_row_group_metadata_per_file(
as_dict=True
)
row_group_size_per_file = {}
sum_row_group_size = 0
num_row_groups = 0
for file, row_group_metadata in row_group_metadata_per_file.items():
print(f"{file}")
row_group_size_per_file[file] = {}
for row_group, metadata in row_group_metadata.items():
row_group_size_per_file[file][row_group] = metadata["total_byte_size"] / (
1024 * 1024
)
sum_row_group_size += row_group_size_per_file[file][row_group]
num_row_groups += 1
print(f" {row_group}: {row_group_size_per_file[file][row_group]} MB")
print(f"Average row group size: {sum_row_group_size/num_row_groups} MB")
alexandria_3D_0.parquet: 1261.1049909591675 MB
alexandria_3D_1.parquet: 1251.4470014572144 MB
alexandria_3D_2.parquet: 254.18889236450195 MB
alexandria_3D_0.parquet
0: 203.45607089996338 MB
1: 217.69878768920898 MB
2: 197.38053512573242 MB
3: 187.29719257354736 MB
4: 227.97186374664307 MB
5: 192.6089096069336 MB
6: 216.6963243484497 MB
7: 198.1317253112793 MB
8: 226.02880668640137 MB
9: 226.7027463912964 MB
alexandria_3D_1.parquet
0: 244.55265712738037 MB
1: 210.91666316986084 MB
2: 204.90382862091064 MB
3: 204.14434432983398 MB
4: 192.36827659606934 MB
5: 217.43347454071045 MB
6: 215.3526430130005 MB
7: 201.5632438659668 MB
8: 189.40411949157715 MB
9: 200.09713077545166 MB
alexandria_3D_2.parquet
0: 208.74722480773926 MB
1: 206.02736282348633 MB
Average row group size: 208.6129059791565 MB
Here we see that the row groups are about 200MB, but the files are only around 1.2GB. The files come out a little smaller than targeted because, with fewer files, more of the common metadata is stored together. In this case it is worth trying to store all the data in a single file.
[54]:
normalize_config = NormalizeConfig(
load_format="batches",  # Use the batch generator to normalize
batch_readahead=4,  # Number of batches to load into memory ahead of time; affects RAM usage
fragment_readahead=1,  # Number of files to load into memory ahead of time; affects RAM usage
batch_size=10000,  # Batch size to use when normalizing; affects RAM usage
max_rows_per_file=5000000,  # Max number of rows per parquet file
max_rows_per_group=200000,  # Max number of rows per parquet row group
min_rows_per_group=200000,  # Min number of rows per parquet row group
)
normalize_dataset(db, normalize_config=normalize_config)
print(db.summary(show_row_group_metadata=True))
file_sizes = list_file_sizes(db.db_path)
for file, size in file_sizes.items():
print(f"{file}: {size} MB")
Starting task: normalize_dataset
Done with task: normalize_dataset
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
============================================================
PARQUETDB SUMMARY
============================================================
Database path: Z:\data\parquetdb\data\external\alexandria\AlexandriaDB\alexandria_3D
• Number of columns: 128
• Number of rows: 4389295
• Number of files: 1
• Number of rows per file: [4389295]
• Number of row groups per file: [22]
• Number of rows per row group per file:
- alexandria_3D_0.parquet:
- Row group 0: 200000 rows
- Row group 1: 200000 rows
- Row group 2: 200000 rows
- Row group 3: 200000 rows
- Row group 4: 200000 rows
- Row group 5: 200000 rows
- Row group 6: 200000 rows
- Row group 7: 200000 rows
- Row group 8: 200000 rows
- Row group 9: 200000 rows
- Row group 10: 200000 rows
- Row group 11: 200000 rows
- Row group 12: 200000 rows
- Row group 13: 200000 rows
- Row group 14: 200000 rows
- Row group 15: 200000 rows
- Row group 16: 200000 rows
- Row group 17: 200000 rows
- Row group 18: 200000 rows
- Row group 19: 200000 rows
- Row group 20: 200000 rows
- Row group 21: 189295 rows
• Serialized metadata size per file: [377883] Bytes
############################################################
METADATA
############################################################
############################################################
COLUMN DETAILS
############################################################
alexandria_3D_0.parquet: 2767.1490955352783 MB
Basic Operations¶
In this section we are going to test the performance of ParquetDB for basic operations.
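Each operation below is timed with the same pattern: take a timestamp, run the task, and append the task name and elapsed time to task_benchmark_dict. A small helper like the following sketch (not part of ParquetDB) could be used to avoid repeating that boilerplate:
[ ]:
def run_and_record(task_fn, db, benchmark=task_benchmark_dict):
    """Run a task function, record its name and elapsed time, and return both."""
    start = time.time()
    task_name = task_fn(db)
    elapsed = time.time() - start
    benchmark["task_names"].append(task_name)
    benchmark["task_times"].append(elapsed)
    return task_name, elapsed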
Reading a single column¶
If we wanted a single property for all materials in Alexandria and the data were in JSON format, we would have to iterate over and read all the JSON files to collect it.
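For comparison, the JSON route would look roughly like the sketch below. It assumes each file holds an "entries" list, as in the loading code above, and that each entry carries the property of interest (here an assumed "energy" key):
[ ]:
def collect_property_from_json(alexandria_dir, key="energy"):
    """Collect a single property from every entry in every JSON file (slow)."""
    values = []
    for json_file in glob(os.path.join(alexandria_dir, "*.json")):
        with open(json_file, "r") as f:
            data = json.load(f)
        values.extend(entry.get(key) for entry in data["entries"])
    return values
With ParquetDB, reading a single column touches only that column's data: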
[55]:
total_time_to_read_from_json = sum(benchmark_dict["json_load_times"])
def read_single_column(db):
task_name = "read_single_column"
print("Starting task: read_single_column")
table = db.read(columns=["id"], load_format="table")
print(table.shape)
print("-" * 200)
return task_name
start_time = time.time()
task_name = read_single_column(db)
read_single_column_time = time.time() - start_time
task_benchmark_dict["task_names"].append(task_name)
task_benchmark_dict["task_times"].append(read_single_column_time)
print(f"Time to read from parquetdb: {read_single_column_time:.2f} seconds")
print(f"Time to read from json: {total_time_to_read_from_json:.2f} seconds")
Starting task: read_single_column
(4389295, 1)
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Time to read from parquetdb: 0.06 seconds
Time to read from json: 589.19 seconds
Reading specific ids¶
Now, if we wanted specific records, the minimum time to do this in JSON format would be the time needed to iterate over all the JSON files.
In ParquetDB, it is much simpler and less memory intensive to read specific ids.
[ ]:
def read_specific_ids(db):
task_name = "read_specific_ids"
print("Starting task: read_specific_ids")
table = db.read(
ids=[
0,
10,
100,
1000,
10000,
100000,
1000000,
], # Controls which rows we want to read
load_format="table", # Controls the output format. The options are 'table', 'batches', `dataset`.
)
df = table.to_pandas() # Converts the table to a pandas dataframe
print(df["id"])
print(df.head())
print(df.shape)
print(f"Data : {df.iloc[0]['data.spg']}")
print(list(df.columns))
print("Done with task: read_specific_ids")
print("-" * 200)
return task_name
start_time = time.time()
task_name = read_specific_ids(db)
read_specific_ids_time = time.time() - start_time
task_benchmark_dict["task_names"].append(task_name)
task_benchmark_dict["task_times"].append(read_specific_ids_time)
print(f"Time to read from parquetdb: {read_specific_ids_time:.2f} seconds")
print(f"Time to read from json: {total_time_to_read_from_json:.2f} seconds")
Starting task: read_specific_ids
0 10000
1 0
2 10
3 100
4 1000
5 100000
6 1000000
Name: id, dtype: int64
@class @module composition.Ac \
0 ComputedStructureEntry pymatgen.entries.computed_entries 2.0
1 ComputedStructureEntry pymatgen.entries.computed_entries 1.0
2 ComputedStructureEntry pymatgen.entries.computed_entries 1.0
3 ComputedStructureEntry pymatgen.entries.computed_entries 1.0
4 ComputedStructureEntry pymatgen.entries.computed_entries 2.0
composition.Ag composition.Al composition.Ar composition.As \
0 NaN NaN NaN NaN
1 NaN NaN NaN NaN
2 NaN NaN NaN NaN
3 NaN NaN NaN NaN
4 NaN NaN NaN NaN
composition.Au composition.B composition.Ba ... structure.lattice.a \
0 NaN NaN NaN ... 4.829086
1 NaN NaN NaN ... 10.091510
2 NaN NaN NaN ... 15.519920
3 NaN NaN NaN ... 5.789011
4 NaN NaN NaN ... 8.695520
structure.lattice.alpha structure.lattice.b structure.lattice.beta \
0 90.000000 4.829086 90.000000
1 109.471217 10.091510 109.471222
2 89.997963 11.000739 89.996986
3 111.411199 8.109282 99.469696
4 69.601810 7.709933 55.506047
structure.lattice.c structure.lattice.gamma \
0 9.745281 90.000000
1 10.091511 109.471219
2 5.192681 45.191179
3 8.109282 99.469696
4 7.709933 55.506037
structure.lattice.matrix structure.lattice.pbc \
0 [4.82908586, 0.0, 0.0, 0.0, 4.82908586, 0.0, 0... [True, True, True]
1 [9.51436671, 2e-08, -3.36383678, -4.75718337, ... [True, True, True]
2 [15.51989271, -0.00721914, 0.02799947, 7.75631... [True, True, True]
3 [5.78373142, -0.13705652, -0.20571227, -1.1463... [True, True, True]
4 [7.92465233, 0.98010515, 3.44257929, 3.2101699... [True, True, True]
structure.lattice.volume structure.sites
0 227.260645 [{'abc': [0.0, 0.5, 0.2399718], 'label': 'Ac',...
1 791.127799 [{'abc': [0.0, 0.0, 0.0], 'label': 'Ac', 'prop...
2 628.973408 [{'abc': [3e-06, 3e-06, 0.0], 'label': 'Ac', '...
3 338.968680 [{'abc': [0.5, 0.0, 0.0], 'label': 'Ac', 'prop...
4 350.818535 [{'abc': [0.83221258, 0.94982224, 0.94982224],...
[5 rows x 128 columns]
(7, 128)
[dtype('O'), dtype('int64'), dtype('float64')]
Data : 129
['@class', '@module', 'composition.Ac', 'composition.Ag', 'composition.Al', 'composition.Ar', 'composition.As', 'composition.Au', 'composition.B', 'composition.Ba', 'composition.Be', 'composition.Bi', 'composition.Br', 'composition.C', 'composition.Ca', 'composition.Cd', 'composition.Ce', 'composition.Cl', 'composition.Co', 'composition.Cr', 'composition.Cs', 'composition.Cu', 'composition.Dy', 'composition.Er', 'composition.Eu', 'composition.F', 'composition.Fe', 'composition.Ga', 'composition.Gd', 'composition.Ge', 'composition.H', 'composition.He', 'composition.Hf', 'composition.Hg', 'composition.Ho', 'composition.I', 'composition.In', 'composition.Ir', 'composition.K', 'composition.Kr', 'composition.La', 'composition.Li', 'composition.Lu', 'composition.Mg', 'composition.Mn', 'composition.Mo', 'composition.N', 'composition.Na', 'composition.Nb', 'composition.Nd', 'composition.Ne', 'composition.Ni', 'composition.Np', 'composition.O', 'composition.Os', 'composition.P', 'composition.Pa', 'composition.Pb', 'composition.Pd', 'composition.Pm', 'composition.Pr', 'composition.Pt', 'composition.Pu', 'composition.Rb', 'composition.Re', 'composition.Rh', 'composition.Ru', 'composition.S', 'composition.Sb', 'composition.Sc', 'composition.Se', 'composition.Si', 'composition.Sm', 'composition.Sn', 'composition.Sr', 'composition.Ta', 'composition.Tb', 'composition.Tc', 'composition.Te', 'composition.Th', 'composition.Ti', 'composition.Tl', 'composition.Tm', 'composition.U', 'composition.V', 'composition.W', 'composition.Xe', 'composition.Y', 'composition.Yb', 'composition.Zn', 'composition.Zr', 'correction', 'data.band_gap_dir', 'data.band_gap_ind', 'data.decomposition', 'data.dos_ef', 'data.e_above_hull', 'data.e_form', 'data.e_phase_separation', 'data.elements', 'data.energy_corrected', 'data.energy_total', 'data.formula', 'data.location', 'data.mat_id', 'data.nsites', 'data.prototype_id', 'data.spg', 'data.stress', 'data.total_mag', 'energy', 'energy_adjustments', 'entry_id', 'id', 'parameters.dummy_field', 'structure.@class', 'structure.@module', 'structure.charge', 'structure.lattice.a', 'structure.lattice.alpha', 'structure.lattice.b', 'structure.lattice.beta', 'structure.lattice.c', 'structure.lattice.gamma', 'structure.lattice.matrix', 'structure.lattice.pbc', 'structure.lattice.volume', 'structure.sites']
Done with task: read_specific_ids
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Time to read from parquetdb: 4.36 seconds
Time to read from json: 589.19 seconds
Finding the minimum and maximum of the energy¶
[57]:
def read_energy_min_max(db):
task_name = "read_energy_min_max"
print("Starting task: read_energy_min_max")
table = db.read(columns=["energy"], load_format="table")
print(table.shape)
result = pc.min_max(table["energy"])
# The result will be a struct with 'min' and 'max' fields
min_value = result["min"].as_py()
max_value = result["max"].as_py()
print(f"Min: {min_value}, Max: {max_value}")
print("Done with task: read_energy_min_max")
print("-" * 200)
return task_name
start_time = time.time()
task_name = read_energy_min_max(db)
read_energy_min_max_time = time.time() - start_time
task_benchmark_dict["task_names"].append(task_name)
task_benchmark_dict["task_times"].append(read_energy_min_max_time)
print(f"Time to read from parquetdb: {read_energy_min_max_time:.2f} seconds")
print(f"Time to read from json: {total_time_to_read_from_json:.2f} seconds")
Starting task: read_energy_min_max
(4389295, 1)
Min: -1496.5922219, Max: -0.003981
Done with task: read_energy_min_max
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Time to read from parquetdb: 0.11 seconds
Time to read from json: 589.19 seconds
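The same pattern extends to other PyArrow aggregations; for example, a quick sketch computing the mean energy over the same column:
[ ]:
table = db.read(columns=["energy"], load_format="table")
mean_energy = pc.mean(table["energy"]).as_py()  # pyarrow.compute aggregation over the column
print(f"Mean energy: {mean_energy}")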
Reading records filtered by energy above -1.0¶
[58]:
def read_filtered_energy_above_minus_one(db):
"""Read records filtered by energy above -1.0 and track timing."""
task_name = "read_filtered_energy_above_-1"
print("Starting task: read_filtered_energy_above_minus_one")
table = db.read(
columns=["id", "energy"],
filters=[pc.field("energy") > -1.0],
load_format="table",
)
df = table.to_pandas() # Converts the table to a pandas dataframe
print(df.head())
print(df.shape)
print("Done with task: read_filtered_energy_above_minus_one")
print("-" * 200)
return task_name
start_time = time.time()
task_name = read_filtered_energy_above_minus_one(db)
read_filtered_energy_above_minus_one_time = time.time() - start_time
task_benchmark_dict["task_names"].append(task_name)
task_benchmark_dict["task_times"].append(read_filtered_energy_above_minus_one_time)
print(
f"Time to read from parquetdb: {read_filtered_energy_above_minus_one_time:.2f} seconds"
)
print(f"Time to read from json: {total_time_to_read_from_json:.2f} seconds")
Starting task: read_filtered_energy_above_minus_one
id energy
0 123136 -0.063105
1 123137 -0.125970
2 403318 -0.972671
3 570682 -0.907343
4 570683 -0.901483
(46, 2)
Done with task: read_filtered_energy_above_minus_one
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Time to read from parquetdb: 0.04 seconds
Time to read from json: 589.19 seconds
Reading records filtered by spg 204¶
[59]:
def read_filtered_spg_204(db):
task_name = "read_filtered_spg_204_table"
print("Starting task: read_filtered_spg_204")
table = db.read(
columns=["id", "data.spg"],
filters=[pc.field("data.spg") == 204],
load_format="table",
)
df = table.to_pandas() # Converts the table to a pandas dataframe
print(df.head())
print(df.shape)
print("Done with task: read_filtered_spg_204")
print("-" * 200)
return task_name
start_time = time.time()
task_name = read_filtered_spg_204(db)
read_filtered_spg_204_time = time.time() - start_time
task_benchmark_dict["task_names"].append(task_name)
task_benchmark_dict["task_times"].append(read_filtered_spg_204_time)
print(f"Time to read from parquetdb: {read_filtered_spg_204_time:.2f} seconds")
print(f"Time to read from json: {total_time_to_read_from_json:.2f} seconds")
Starting task: read_filtered_spg_204
id data.spg
0 10113 204
1 10125 204
2 10126 204
3 10133 204
4 10140 204
(7240, 2)
Done with task: read_filtered_spg_204
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Time to read from parquetdb: 0.05 seconds
Time to read from json: 589.19 seconds
Reading records filtered by spg in batches¶
[60]:
def read_filtered_spg_batches(db):
task_name = "read_filtered_spg_batches"
print("Starting task: read_filtered_spg_batches")
generator = db.read(
load_format="batches",
batch_size=1000,
load_config=LoadConfig(
batch_readahead=10,
fragment_readahead=2,
fragment_scan_options=None,
use_threads=True,
memory_pool=None,
),
columns=["id", "data.spg"],
filters=[pc.field("data.spg") == 204],
)
batch_count = 0
num_rows = 0
for table in generator:
df = table.to_pandas()
num_rows += table.num_rows
batch_count += 1
print(f"Total number of rows: {num_rows}, Batches: {batch_count}")
print("Done with task: read_filtered_spg_batches")
print("-" * 200)
return task_name
start_time = time.time()
task_name = read_filtered_spg_batches(db)
read_filtered_spg_batches_time = time.time() - start_time
task_benchmark_dict["task_names"].append(task_name)
task_benchmark_dict["task_times"].append(read_filtered_spg_batches_time)
print(f"Time to read from parquetdb: {read_filtered_spg_batches_time:.2f} seconds")
print(f"Time to read from json: {total_time_to_read_from_json:.2f} seconds")
Starting task: read_filtered_spg_batches
Total number of rows: 7240, Batches: 4390
Done with task: read_filtered_spg_batches
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Time to read from parquetdb: 0.87 seconds
Time to read from json: 589.19 seconds
Reading lattice matrix for space group 204¶
[61]:
def read_lattice_matrix_spg_204(db):
task_name = "read_lattice_matrix_spg_204"
print("Starting task: read_lattice_matrix_spg_204")
table = db.read(
columns=["structure.lattice.matrix"], filters=[pc.field("data.spg") == 204]
)
lattice = table["structure.lattice.matrix"].combine_chunks().to_numpy_ndarray()
print(lattice.shape)
print("Done with task: read_lattice_matrix_spg_204")
print("-" * 200)
return task_name
start_time = time.time()
task_name = read_lattice_matrix_spg_204(db)
read_lattice_matrix_spg_204_time = time.time() - start_time
task_benchmark_dict["task_names"].append(task_name)
task_benchmark_dict["task_times"].append(read_lattice_matrix_spg_204_time)
print(f"Time to read from parquetdb: {read_lattice_matrix_spg_204_time:.2f} seconds")
print(f"Time to read from json: {total_time_to_read_from_json:.2f} seconds")
Starting task: read_lattice_matrix_spg_204
(7240, 3, 3)
Done with task: read_lattice_matrix_spg_204
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Time to read from parquetdb: 0.22 seconds
Time to read from json: 589.19 seconds
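Because the lattice matrices come back as a plain NumPy array, they can be fed directly into vectorized NumPy operations. As a sketch, the cell below recomputes the cell volumes of the space group 204 structures from the determinant of each lattice matrix:
[ ]:
table = db.read(
    columns=["structure.lattice.matrix"], filters=[pc.field("data.spg") == 204]
)
lattice = table["structure.lattice.matrix"].combine_chunks().to_numpy_ndarray()
volumes = np.abs(np.linalg.det(lattice))  # one volume per structure
print(volumes.shape, volumes.min(), volumes.max())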
Reading nested column selection¶
[62]:
def read_nested_column_selection(db):
task_name = "read_nested_column_selection"
print("Starting task: read_nested_column_selection")
table = db.read(columns=["id", "structure.sites"], load_format="table")
print(table.shape)
print(table["structure.sites"].type)
print(table["structure.sites"].combine_chunks().type)
print("Done with task: read_nested_column_selection")
print("-" * 200)
return task_name
start_time = time.time()
task_name = read_nested_column_selection(db)
read_nested_column_selection_time = time.time() - start_time
task_benchmark_dict["task_names"].append(task_name)
task_benchmark_dict["task_times"].append(read_nested_column_selection_time)
print(f"Time to read from parquetdb: {read_nested_column_selection_time:.2f} seconds")
print(f"Time to read from json: {total_time_to_read_from_json:.2f} seconds")
Starting task: read_nested_column_selection
(4389295, 2)
list<element: struct<abc: list<element: double>, label: string, properties: struct<charge: double, forces: list<element: double>, magmom: double>, species: list<element: struct<element: string, occu: int64>>, xyz: list<element: double>>>
list<element: struct<abc: list<element: double>, label: string, properties: struct<charge: double, forces: list<element: double>, magmom: double>, species: list<element: struct<element: string, occu: int64>>, xyz: list<element: double>>>
Done with task: read_nested_column_selection
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Time to read from parquetdb: 4.58 seconds
Time to read from json: 589.19 seconds
Nested structure into class¶
[63]:
def read_nested_structure_into_class(db):
# By default, the database flattens nested structures for storage.
# However, we provide an option to rebuild the nested structure. This will create a new dataset in {dataset_name}_nested.
# After the new dataset has been created, the query parameters are applied to it.
task_name = "read_nested_structure_into_class"
print("Starting task: read_nested_structure_into_class")
table = db.read(
columns=[
"id",
"structure",
"data",
], # Instead of using the flatten syntax, we can use the nested syntax
ids=[0, 1000000],
load_format="table",
rebuild_nested_struct=True, # When set to True to rebuild the nested structure
rebuild_nested_from_scratch=False, # When set to True, the nested structure will be rebuilt from scratch
normalize_config=NormalizeConfig(
load_format="batches",
batch_readahead=2,
fragment_readahead=1,
batch_size=10000,
max_rows_per_file=5000000,
min_rows_per_group=200000,
max_rows_per_group=200000,
),
)
print(table.shape)
print(table["data"].type)
print("structure type")
print(table["structure"].type)
try:
from pymatgen.core.structure import Structure
structure = Structure.from_dict(
table["structure"].combine_chunks().to_pylist()[0]
)
print(structure)
except Exception as e:
print(e)
print("Done with task: read_nested_structure_into_class")
print("-" * 200)
return task_name
start_time = time.time()
read_nested_structure_into_class(db)
nested_structure_time = time.time() - start_time
print(f"Time to read from parquetdb: {nested_structure_time:.2f} seconds")
print(f"Time to read from json: {total_time_to_read_from_json:.2f} seconds")
Starting task: read_nested_structure_into_class
(2, 3)
struct<band_gap_dir: double, band_gap_ind: double, decomposition: string, dos_ef: double, e_above_hull: double, e_form: double, e_phase_separation: double, elements: list<element: string>, energy_corrected: double, energy_total: double, formula: string, location: string, mat_id: string, nsites: int64, prototype_id: string, spg: int64, stress: extension<arrow.fixed_shape_tensor[value_type=double, shape=[3,3]]>, total_mag: double>
structure type
struct<@class: string, @module: string, charge: int64, lattice: struct<a: double, alpha: double, b: double, beta: double, c: double, gamma: double, matrix: extension<arrow.fixed_shape_tensor[value_type=double, shape=[3,3]]>, pbc: extension<arrow.fixed_shape_tensor[value_type=bool, shape=[3]]>, volume: double>, sites: list<element: struct<abc: list<element: double>, label: string, properties: struct<charge: double, forces: list<element: double>, magmom: double>, species: list<element: struct<element: string, occu: int64>>, xyz: list<element: double>>>>
Full Formula (Ac1 Pr12 Ho7)
Reduced Formula: AcPr12Ho7
abc : 10.091510 10.091510 10.091511
angles: 109.471217 109.471222 109.471219
pbc : True True True
Sites (20)
# SP a b c charge forces magmom
--- ---- -------- -------- -------- -------- --------------------------------------- --------
0 Ac 0 0 0 8.076 [0.0, -0.0, -0.0] -0
1 Pr 0.476206 0.707375 0.768831 8.873 [0.0022184, 0.00231648, -0.00126846] -0
2 Pr 0.938545 0.707375 0.231169 8.873 [0.00045645, 0.00231648, -0.00251434] -0
3 Pr 0.523794 0.292625 0.231169 8.873 [-0.0022184, -0.00231648, 0.00126846] -0
4 Pr 0.061455 0.292625 0.768831 8.873 [-0.00045645, -0.00231648, 0.00251434] -0
5 Pr 0.768831 0.061455 0.292625 8.873 [0.00177791, -0.00155354, -0.00251434] -0
6 Pr 0.768831 0.476206 0.707375 8.873 [-0.00089694, 0.00307943, 0.00126846] -0
7 Pr 0.231169 0.938545 0.707375 8.873 [-0.00177791, 0.00155354, 0.00251434] -0
8 Pr 0.231169 0.523794 0.292625 8.873 [0.00089694, -0.00307943, -0.00126846] -0
9 Pr 0.707375 0.768831 0.476206 8.873 [0.00223436, 0.00076295, 0.00251434] -0
10 Pr 0.292625 0.231169 0.523794 8.873 [-0.00223436, -0.00076295, -0.00251434] -0
11 Pr 0.707375 0.231169 0.938545 8.873 [0.00311533, -0.00076295, 0.00126846] -0
12 Pr 0.292625 0.768831 0.061455 8.873 [-0.00311533, 0.00076295, -0.00126846] -0
13 Ho 0.5 0.5 0 7.546 [-0.0, -0.0, -0.0] 0
14 Ho 0 0.5 0.5 7.546 [0.0, -0.0, -0.0] 0
15 Ho 0.5 0 0.5 7.546 [-0.0, -0.0, -0.0] 0
16 Ho 0.5 0.5 0.5 8.703 [-0.0, -0.0, -0.0] 0
17 Ho 0 0 0.5 8.703 [-0.0, -0.0, -0.0] 0
18 Ho 0.5 0 0 8.703 [-0.0, -0.0, -0.0] 0
19 Ho 0 0.5 0 8.703 [-0.0, -0.0, -0.0] 0
Done with task: read_nested_structure_into_class
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Time to read from parquetdb: 52.64 seconds
Time to read from json: 589.19 seconds
In the previous cell the operation takes ~60 seconds; this is due to reconstructing the nested structure. Further queries will be faster, as the nested structure has already been built.
[64]:
start_time = time.time()
task_name = read_nested_structure_into_class(db)
nested_structure_time = time.time() - start_time
task_benchmark_dict["task_names"].append(task_name)
task_benchmark_dict["task_times"].append(nested_structure_time)
print(f"Time to read from parquetdb: {nested_structure_time:.2f} seconds")
print(f"Time to read from json: {total_time_to_read_from_json:.2f} seconds")
Starting task: read_nested_structure_into_class
(2, 3)
struct<band_gap_dir: double, band_gap_ind: double, decomposition: string, dos_ef: double, e_above_hull: double, e_form: double, e_phase_separation: double, elements: list<element: string>, energy_corrected: double, energy_total: double, formula: string, location: string, mat_id: string, nsites: int64, prototype_id: string, spg: int64, stress: extension<arrow.fixed_shape_tensor[value_type=double, shape=[3,3]]>, total_mag: double>
structure type
struct<@class: string, @module: string, charge: int64, lattice: struct<a: double, alpha: double, b: double, beta: double, c: double, gamma: double, matrix: extension<arrow.fixed_shape_tensor[value_type=double, shape=[3,3]]>, pbc: extension<arrow.fixed_shape_tensor[value_type=bool, shape=[3]]>, volume: double>, sites: list<element: struct<abc: list<element: double>, label: string, properties: struct<charge: double, forces: list<element: double>, magmom: double>, species: list<element: struct<element: string, occu: int64>>, xyz: list<element: double>>>>
Full Formula (Ac1 Pr12 Ho7)
Reduced Formula: AcPr12Ho7
abc : 10.091510 10.091510 10.091511
angles: 109.471217 109.471222 109.471219
pbc : True True True
Sites (20)
# SP a b c charge forces magmom
--- ---- -------- -------- -------- -------- --------------------------------------- --------
0 Ac 0 0 0 8.076 [0.0, -0.0, -0.0] -0
1 Pr 0.476206 0.707375 0.768831 8.873 [0.0022184, 0.00231648, -0.00126846] -0
2 Pr 0.938545 0.707375 0.231169 8.873 [0.00045645, 0.00231648, -0.00251434] -0
3 Pr 0.523794 0.292625 0.231169 8.873 [-0.0022184, -0.00231648, 0.00126846] -0
4 Pr 0.061455 0.292625 0.768831 8.873 [-0.00045645, -0.00231648, 0.00251434] -0
5 Pr 0.768831 0.061455 0.292625 8.873 [0.00177791, -0.00155354, -0.00251434] -0
6 Pr 0.768831 0.476206 0.707375 8.873 [-0.00089694, 0.00307943, 0.00126846] -0
7 Pr 0.231169 0.938545 0.707375 8.873 [-0.00177791, 0.00155354, 0.00251434] -0
8 Pr 0.231169 0.523794 0.292625 8.873 [0.00089694, -0.00307943, -0.00126846] -0
9 Pr 0.707375 0.768831 0.476206 8.873 [0.00223436, 0.00076295, 0.00251434] -0
10 Pr 0.292625 0.231169 0.523794 8.873 [-0.00223436, -0.00076295, -0.00251434] -0
11 Pr 0.707375 0.231169 0.938545 8.873 [0.00311533, -0.00076295, 0.00126846] -0
12 Pr 0.292625 0.768831 0.061455 8.873 [-0.00311533, 0.00076295, -0.00126846] -0
13 Ho 0.5 0.5 0 7.546 [-0.0, -0.0, -0.0] 0
14 Ho 0 0.5 0.5 7.546 [0.0, -0.0, -0.0] 0
15 Ho 0.5 0 0.5 7.546 [-0.0, -0.0, -0.0] 0
16 Ho 0.5 0.5 0.5 8.703 [-0.0, -0.0, -0.0] 0
17 Ho 0 0 0.5 8.703 [-0.0, -0.0, -0.0] 0
18 Ho 0.5 0 0 8.703 [-0.0, -0.0, -0.0] 0
19 Ho 0 0.5 0 8.703 [-0.0, -0.0, -0.0] 0
Done with task: read_nested_structure_into_class
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Time to read from parquetdb: 3.75 seconds
Time to read from json: 589.19 seconds
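With the nested dataset already built, converting several records into pymatgen Structure objects follows the same pattern as in the function above; a minimal sketch (assuming pymatgen is installed, as in the cells above):
[ ]:
from pymatgen.core.structure import Structure
table = db.read(
    columns=["id", "structure"],
    ids=[0, 1000000],
    load_format="table",
    rebuild_nested_struct=True,  # reuses the nested dataset built above
)
structures = [
    Structure.from_dict(struct_dict)
    for struct_dict in table["structure"].combine_chunks().to_pylist()
]
print(len(structures), structures[0].composition.reduced_formula)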
Plotting times¶
[65]:
import matplotlib.pyplot as plt
from parquetdb.utils import matplotlib_utils
from matplotlib import rcParams
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
%matplotlib inline
[ ]:
xlabel_size = 16
ylabel_size = 16
title_size = 16
xtick_size = 14
ytick_size = 12
inset_xtick_size = 10
inset_ytick_size = 10
inset_title_size = 12
labels = task_benchmark_dict["task_names"]
times = task_benchmark_dict["task_times"]
# Create the main plot with numbered labels and an inset showing the same data on a log scale
fig, ax = plt.subplots(figsize=(10, 6))
# Number the labels
numbered_labels = [f"{i+1}. {label}" for i, label in enumerate(labels)]
# matplotlib_utils.set_palette('Cavalcanti1')
matplotlib_utils.set_palette("Darjeeling1_alt")
# matplotlib_utils.set_palette('Zissou1')
# matplotlib_utils.set_palette('AsteroidCity1')
# matplotlib_utils.set_palette('BottleRocket2')
colors = rcParams["axes.prop_cycle"].by_key()["color"]
# Main horizontal bar plot
# ax.barh(numbered_labels, times, color="#59b9de")
ax.barh(numbered_labels, times, color=colors[: len(times)])
ax.set_xlabel("Total Time (seconds)", fontsize=xlabel_size)
ax.set_ylabel("Operations", fontsize=ylabel_size)
ax.tick_params(axis="x", labelsize=xtick_size)
ax.tick_params(axis="y", labelsize=ytick_size)
ax.set_title(
"Total Time for Various Operations on dataset with 4.8 million rows",
fontsize=title_size,
)
# Inset plot with log scale and just the numbers
# ax_inset = inset_axes(ax, width="40%", height="30%", loc="center right")
ax_inset = inset_axes(
ax,
width="30%",
height="30%",
loc="center right",
bbox_to_anchor=(-0.05, -0.1, 1, 1),
bbox_transform=ax.transAxes,
)
# ax_inset.barh(range(1, len(labels) + 1), times, color="#e52207")
ax_inset.barh(range(1, len(labels) + 1), times, color=colors[: len(times)])
ax_inset.set_xscale("log")
ax_inset.set_yticks(range(1, len(labels) + 1)) # Show just the numbers
ax_inset.set_yticklabels(range(1, len(labels) + 1), fontsize=inset_ytick_size)
ax_inset.set_title("Log Scale", fontsize=inset_title_size)
# Adjust layout and show the plot
plt.tight_layout()
plt.show()
C:\Users\lllang\AppData\Local\Temp\ipykernel_35236\2220293953.py:54: UserWarning: This figure includes Axes that are not compatible with tight_layout, so results might be incorrect.
plt.tight_layout()
[Figure: horizontal bar chart of total time per operation, with a log-scale inset.]