"""Convert database to folder tree."""
from asr.core import command, argument, option, ASRResult
from asr.utils import timed_print
from pathlib import Path
from datetime import datetime
[docs]
def make_folder_tree(*, folders, chunks,
copy,
patterns,
atomsfile,
update_tree):
"""Write folder tree to disk."""
from os import makedirs, link
from ase.io import write
from asr.core import write_json
import importlib
from fnmatch import fnmatch
nfolders = len(folders)
for i, (rowid, (folder, row)) in enumerate(folders.items()):
now = datetime.now()
percentage_completed = (i + 1) / nfolders * 100
timed_print(f'{now:%H:%M:%S} {i + 1}/{nfolders} '
f'{percentage_completed:.1f}%', wait=30)
if chunks > 1:
chunkno = i % chunks
parts = list(Path(folder).parts)
parts[0] += str(chunkno)
folder = str(Path().joinpath(*parts))
folder = Path(folder)
if not update_tree and atomsfile:
if not folder.is_dir():
makedirs(folder)
write(folder / atomsfile, row.toatoms())
for filename, results in row.data.items():
for pattern in patterns:
if fnmatch(filename, pattern):
break
else:
continue
if not folder.is_dir():
if not update_tree:
makedirs(folder)
else:
continue
if (folder / filename).is_file() and not update_tree:
continue
# We treat json differently
if filename.endswith('.json'):
write_json(folder / filename, results)
# Unpack any extra files
files = results.get('__files__', {})
for extrafile, content in files.items():
if '__tofile__' in content:
# TODO: This should _really_ be handled differently.
tofile = content.pop('__tofile__')
mod, func = tofile.split('@')
write_func = getattr(importlib.import_module(mod),
func)
write_func(folder / extrafile, content)
elif filename in {'__links__', '__children__'}:
pass
else:
path = results.get('pointer')
srcfile = Path(path).resolve()
if not srcfile.is_file():
print(f'Cannot locate source file: {path}')
continue
destfile = folder / Path(filename)
if destfile.is_file():
continue
if copy:
try:
link(str(srcfile), str(destfile))
except OSError:
destfile.write_bytes(srcfile.read_bytes())
else:
destfile.symlink_to(srcfile)
[docs]
def make_folder_dict(rows, tree_structure):
"""Return a dictionary where key=uid and value=(folder, row)."""
import spglib
folders = {}
folderlist = []
err = []
nc = 0
child_uids = {}
for row in rows:
identifier = row.get('uid', row.id)
children = row.data.get('__children__')
if children:
for path, child_uid in children.items():
if child_uid in child_uids:
existing_path = child_uids[child_uid]['path']
assert (existing_path.startswith(path)
or path.startswith(existing_path))
if path.startswith(existing_path):
continue
child_uids[child_uid] = {'path': path, 'parentuid': identifier}
for row in rows:
identifier = row.get('uid', row.id)
if identifier in child_uids:
folders[identifier] = (None, row)
continue
atoms = row.toatoms()
formula = atoms.symbols.formula
st = atoms.symbols.formula.stoichiometry()[0]
cell = (atoms.cell.array,
atoms.get_scaled_positions(),
atoms.numbers)
stoi = atoms.symbols.formula.stoichiometry()
st = stoi[0]
reduced_formula = stoi[1]
dataset = spglib.get_symmetry_dataset(cell, symprec=1e-3,
angle_tolerance=0.1)
sg = dataset['number']
w = '-'.join(sorted(set(dataset['wyckoffs'])))
if 'magstate' in row:
magstate = row.magstate.lower()
else:
magstate = None
# Add a unique identifier
folder = tree_structure.format(stoi=st, spg=sg, wyck=w,
reduced_formula=reduced_formula,
formula=formula,
mag=magstate,
row=row)
assert folder not in folderlist, f'Collision in folder: {folder}!'
folderlist.append(folder)
folders[identifier] = (folder, row)
for child_uid, links in child_uids.items():
parent_uid = links['parentuid']
if child_uid not in folders:
print(f'Parent (uid={parent_uid}) has unknown child '
f'(uid={child_uid}).')
continue
parentfolder = folders[parent_uid][0]
childfolder = str(Path().joinpath(parentfolder, links['path']))
folders[child_uid] = (childfolder, folders[child_uid][1])
print(f'Number of collisions: {nc}')
for er in err:
print(er)
return folders
@command('asr.database.totree',
save_results_file=False)
@argument('database', nargs=1, type=str)
@option('--run/--dry-run', is_flag=True)
@option('-s', '--selection', help='ASE-DB selection', type=str)
@option('-t', '--tree-structure', type=str)
@option('--sort', help='Sort the generated materials '
'(only useful when dividing chunking tree)', type=str)
@option('--copy/--no-copy', is_flag=True, help='Copy pointer tagged files')
@option('--atomsfile',
help="Filename to unpack atomic structure to. "
"By default, don't write atoms file.",
type=str)
@option('-c', '--chunks', metavar='N', help='Divide the tree into N chunks',
type=int)
@option('--patterns',
help="Comma separated patterns. Only unpack files matching patterns",
type=str)
@option('--update-tree', is_flag=True,
help='Update results files in existing folder tree.')
def main(database: str, run: bool = False, selection: str = '',
tree_structure: str = (
'tree/{stoi}/{reduced_formula:abc}/{row.uid}'
),
sort: str = None, atomsfile: str = None,
chunks: int = 1, copy: bool = False,
patterns: str = '*', update_tree: bool = False) -> ASRResult:
"""Unpack an ASE database to a tree of folders.
This setup recipe can unpack an ASE database to into folders
that have a tree like structure where directory names can be
given by the material parameters such stoichiometry, spacegroup
number for example: stoichiometry/spacegroup/formula.
The specific tree structure is given by the --tree-structure
option which can be customized according to the following table
* {stoi}: Material stoichiometry
* {spg}: Material spacegroup number
* {formula}: Chemical formula. A possible variant is {formula:metal}
in which case the formula will be sorted by metal atoms
* {reduced_formula}: Reduced chemical formula. Like {formula}
except the formula has been reduced, i.e., Mo2S4 -> MoS2.
* {wyck}: Unique wyckoff positions. The unique alphabetically
sorted Wyckoff positions.
Examples
--------
For all these examples, suppose you have a database named "database.db".
Unpack database using default parameters:
$ asr run "database.totree database.db --run"
Don't actually unpack the database but do a dry-run:
$ asr run "database.totree database.db"
Only select a part of the database to unpack:
$ asr run "database.totree database.db --selection natoms<3 --run"
Set custom folder tree-structure:
$ asr run "database.totree database.db
... --tree-structure tree/{stoi}/{spg}/{formula:metal} --run"
Divide the tree into 2 chunks (in case the study of the materials)
is divided between 2 people). Also sort after number of atoms,
so computationally expensive materials are divided evenly:
$ asr run "database.totree database.db --sort natoms --chunks 2 --run"
"""
from pathlib import Path
from ase.db import connect
if selection:
print(f'Selecting {selection}')
if sort:
print(f'Sorting after {sort}')
assert Path(database).exists(), f'file: {database} doesn\'t exist'
db = connect(database)
rows = list(db.select(selection, sort=sort))
patterns = patterns.split(',')
folders = make_folder_dict(rows, tree_structure)
if not run:
print(f'Would (at most) make {len(folders)} folders')
if chunks > 1:
print(f'Would divide these folders into {chunks} chunks')
print('The first 10 folders would be')
for rowid, folder in list(folders.items())[:10]:
print(f' {folder[0]}')
print(' ...')
print('To run the command use the --run option')
return
make_folder_tree(folders=folders, chunks=chunks,
atomsfile=atomsfile, copy=copy,
patterns=patterns,
update_tree=update_tree)
if __name__ == '__main__':
main.cli()