Source code for lodstorage.sync

"""
Created on 2023-12-27

@author: wf
"""

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional

from tabulate import tabulate


@dataclass
class SyncPair:
    """
    Two lists of records (a "left" and a "right" data source) that are
    to be compared for synchronization.

    Attributes:
        title (str): The title of the synchronization pair.
        l_name (str): Name of the left data source (e.g., 'local').
        r_name (str): Name of the right data source (e.g., 'wikidata').
        l_data (List[Dict[str, Any]]): The records of the left data source.
        r_data (List[Dict[str, Any]]): The records of the right data source.
        l_key (str): The field of the left records used as the unique
            identifier for synchronization.
        r_key (str): The field of the right records used as the unique
            identifier for synchronization.
        l_pkey (Optional[str]): The primary key field of the left data
            source; falls back to l_key when not given.
        r_pkey (Optional[str]): The primary key field of the right data
            source; falls back to r_key when not given.
        l_by_pkey (Dict[str, Dict[str, Any]]): Left records indexed by
            their primary key value (computed, not an init argument).
        r_by_pkey (Dict[str, Dict[str, Any]]): Right records indexed by
            their primary key value (computed, not an init argument).

    Example usage:
        l_data = [{'id_l': '1', 'value': 'a'}, {'id_l': '2', 'value': 'b'}]
        r_data = [{'id_r': '2', 'value': 'b'}, {'id_r': '3', 'value': 'c'}]
        pair = SyncPair("Title", "local", "wikidata", l_data, r_data, 'id_l', 'id_r')
        sync = Sync(pair)
        print(sync.status_table())
    """

    title: str
    l_name: str
    r_name: str
    l_data: List[Dict[str, Any]]
    r_data: List[Dict[str, Any]]
    l_key: str
    r_key: str
    l_pkey: Optional[str] = None
    r_pkey: Optional[str] = None
    # lookup tables for quick access by primary key, filled in __post_init__
    l_by_pkey: Dict[str, Dict[str, Any]] = field(init=False)
    r_by_pkey: Dict[str, Dict[str, Any]] = field(init=False)

    @staticmethod
    def _index_records(records, pkey_field):
        """
        Map primary key value -> record for all records that have pkey_field.

        Records without the field are skipped; when several records share
        a primary key value the last one wins.
        """
        index = {}
        for record in records:
            if pkey_field in record:
                index[record[pkey_field]] = record
        return index

    def __post_init__(self):
        """
        Default the primary key fields to the sync key fields and build
        the primary key lookup tables for both sides.
        """
        # the sync key doubles as primary key unless one was given explicitly
        if self.l_pkey is None:
            self.l_pkey = self.l_key
        if self.r_pkey is None:
            self.r_pkey = self.r_key
        self.l_by_pkey = self._index_records(self.l_data, self.l_pkey)
        self.r_by_pkey = self._index_records(self.r_data, self.r_pkey)
class Sync:
    """
    A class to help with synchronization between two sets of data,
    each represented as a list of dictionaries.

    The synchronization state is kept in ``sync_dict`` which maps a
    direction symbol to a set of key values:

    - "←": keys present in the right data source only
    - "↔": keys present in both data sources
    - "→": keys present in the left data source only
    """

    def __init__(self, pair: "SyncPair"):
        """
        Initialize the Sync class with the given Synchronization Pair.

        Args:
            pair (SyncPair): the left/right data sources to compare
        """
        self.pair = pair
        self.sync_dict = self._create_sync_dict()
        self.directions = ["←", "↔", "→"]
        # accepted aliases for addressing the left resp. right data source
        self.sides = {"left": ["←", "l", "left"], "right": ["→", "r", "right"]}

    def handle_direction_error(self, direction: str):
        """
        Report an invalid synchronization direction.

        Args:
            direction (str): the offending direction

        Raises:
            ValueError: always, with a message listing the valid directions.
        """
        invalid_direction_msg = (
            f"Invalid direction '{direction}'. Use {', '.join(self.directions)}."
        )
        raise ValueError(invalid_direction_msg)

    def handle_side_error(self, side: str):
        """
        Report an invalid side specification.

        Args:
            side (str): the offending side

        Raises:
            ValueError: always, with a message listing the valid side aliases.
        """
        invalid_side_msg = f"Invalid side '{side}'. Use {', '.join(self.sides['left'])} for left or {', '.join(self.sides['right'])} for right."
        raise ValueError(invalid_side_msg)

    def _resolve_side(self, side: str) -> Optional[str]:
        """
        Map a side alias to its canonical name in self.sides.

        Args:
            side (str): one of the aliases listed in self.sides

        Returns:
            Optional[str]: "left" or "right"; None only if
            handle_side_error returns instead of raising.

        Raises:
            ValueError: if side is not a known alias.
        """
        for canonical, aliases in self.sides.items():
            if side in aliases:
                return canonical
        self.handle_side_error(side)
        return None

    def _create_sync_dict(self) -> dict:
        """
        Create a dictionary representing the synchronization state
        between left and right data sources.

        Records that do not contain the key field of their side are ignored.

        Returns:
            dict: direction symbol -> set of key values
        """
        l_keys = {d[self.pair.l_key] for d in self.pair.l_data if self.pair.l_key in d}
        r_keys = {d[self.pair.r_key] for d in self.pair.r_data if self.pair.r_key in d}
        sync_dict = {
            "←": r_keys - l_keys,  # Present in right but not in left
            "↔": l_keys.intersection(r_keys),  # Present in both
            "→": l_keys - r_keys,  # Present in left but not in right
        }
        return sync_dict

    def get_record_by_pkey(self, side: str, pkey: str) -> Optional[Dict[str, Any]]:
        """
        Retrieves a record by primary key from the data source of the given side.

        Args:
            side (str): The side of data source, "←","l" or "left" for left
                and "→","r" or "right" for right.
            pkey (str): The primary key of the record to retrieve.

        Returns:
            Optional[Dict[str, Any]]: The record if found, otherwise None.

        Raises:
            ValueError: If the provided side is invalid.
        """
        resolved = self._resolve_side(side)
        if resolved == "left":
            return self.pair.l_by_pkey.get(pkey)
        if resolved == "right":
            return self.pair.r_by_pkey.get(pkey)
        return None

    def get_record_by_key(self, side: str, key: str) -> Optional[Dict[str, Any]]:
        """
        Retrieves the first record with the given unique key from the
        data source of the given side.

        Args:
            side (str): The side of data source, "←","l" or "left" for left
                and "→","r" or "right" for right.
            key (str): The unique key of the record to retrieve.

        Returns:
            Optional[Dict[str, Any]]: The record if found, otherwise None.

        Raises:
            ValueError: If the provided side is invalid.
        """
        resolved = self._resolve_side(side)
        if resolved == "left":
            records, key_field = self.pair.l_data, self.pair.l_key
        elif resolved == "right":
            records, key_field = self.pair.r_data, self.pair.r_key
        else:
            return None
        # records lacking the key field are skipped rather than raising
        # KeyError, matching how _create_sync_dict treats them
        return next(
            (
                item
                for item in records
                if key_field in item and item[key_field] == key
            ),
            None,
        )

    def get_keys(self, direction: str) -> set:
        """
        Get the keys for a given direction of synchronization.

        Args:
            direction (str): one of the keys of sync_dict ("←", "↔", "→")

        Returns:
            set: the key values recorded for that direction

        Raises:
            ValueError: If the provided direction is invalid.
        """
        if direction not in self.sync_dict:
            self.handle_direction_error(direction)
        return self.sync_dict[direction]

    def status_table(self, tablefmt: str = "grid") -> str:
        """
        Create a table representing the synchronization status.

        Args:
            tablefmt (str): the tabulate table format to use

        Returns:
            str: one row per direction with the number of keys and its
            share of all keys as a percentage
        """
        total_records = sum(len(keys) for keys in self.sync_dict.values())
        if total_records == 0:  # Avoid division by zero
            total_records = 1

        table_data = []
        for direction, keys in self.sync_dict.items():
            num_records = len(keys)
            percentage = (num_records / total_records) * 100
            table_data.append(
                {
                    "left": self.pair.l_name,
                    "↔": direction,
                    "right": self.pair.r_name,
                    "#": num_records,
                    "%": f"{percentage:7.2f}%",
                }
            )

        markup = tabulate(
            table_data,
            headers="keys",
            tablefmt=tablefmt,
            colalign=("right", "center", "left", "right", "right"),
        )
        return markup