Source code for zink.zink

# zink.py
from .pipeline import Pseudonymizer
import functools
import os
from zink.utils.paths import get_default_mapping_path

# Shared module-level Pseudonymizer, created once at import time so that its
# cache is preserved across calls. The convenience functions in this module
# use it whenever no custom extractor/merger/replacer is supplied and
# use_json_mapping is left enabled; otherwise they build a fresh instance.
_global_instance = Pseudonymizer()

def redact(
    text,
    categories=None,
    placeholder=None,
    use_cache=True,
    use_json_mapping=True,
    extractor=None,
    merger=None,
    replacer=None,
    # Concurrency-related / advanced options:
    auto_parallel=False,
    chunk_size=1000,
    max_workers=4,
    numbered_entities=False,  # Off by default for compatibility
):
    """
    Module-level shortcut for ``Pseudonymizer.redact``.

    The shared global instance is used (so its cache is reused) when no
    custom ``extractor``, ``merger`` or ``replacer`` is given and
    ``use_json_mapping`` is truthy. In every other case a fresh
    ``Pseudonymizer`` is built from those four settings for this call only.

    ``text``, ``categories``, ``placeholder``, ``use_cache``,
    ``auto_parallel``, ``chunk_size``, ``max_workers`` and
    ``numbered_entities`` are forwarded unchanged to the chosen instance's
    ``redact`` method, whose result is returned as-is.
    """
    needs_private_instance = (
        extractor is not None
        or merger is not None
        or replacer is not None
        or not use_json_mapping
    )

    if needs_private_instance:
        # Custom components or disabled JSON mapping: do not touch the shared
        # instance, build a dedicated one instead.
        engine = Pseudonymizer(
            use_json_mapping=use_json_mapping,
            extractor=extractor,
            merger=merger,
            replacer=replacer,
        )
    else:
        engine = _global_instance

    return engine.redact(
        text=text,
        categories=categories,
        placeholder=placeholder,
        use_cache=use_cache,
        auto_parallel=auto_parallel,
        chunk_size=chunk_size,
        max_workers=max_workers,
        numbered_entities=numbered_entities,
    )
def replace(
    text,
    categories=None,
    user_replacements=None,
    ensure_consistency=True,
    use_cache=True,
    use_json_mapping=True,
    extractor=None,
    merger=None,
    replacer=None,
    auto_parallel=False,
    chunk_size=1000,
    max_workers=4,
):
    """
    Module-level shortcut for ``Pseudonymizer.replace``.

    Uses the shared global instance (and therefore its cache) when
    ``extractor``, ``merger`` and ``replacer`` are all ``None`` and
    ``use_json_mapping`` is truthy; otherwise a new ``Pseudonymizer`` is
    constructed from those settings. All remaining arguments are passed
    through by keyword to the instance's ``replace`` method and its result is
    returned unchanged.
    """
    # Arguments handed to Pseudonymizer.replace regardless of which instance
    # ends up serving the call.
    forwarded = {
        "text": text,
        "categories": categories,
        "user_replacements": user_replacements,
        "ensure_consistency": ensure_consistency,
        "use_cache": use_cache,
        "auto_parallel": auto_parallel,
        "chunk_size": chunk_size,
        "max_workers": max_workers,
    }

    all_default_components = extractor is None and merger is None and replacer is None
    if all_default_components and use_json_mapping:
        return _global_instance.replace(**forwarded)

    dedicated = Pseudonymizer(
        use_json_mapping=use_json_mapping,
        extractor=extractor,
        merger=merger,
        replacer=replacer,
    )
    return dedicated.replace(**forwarded)
def replace_with_my_data(
    text,
    categories=None,
    user_replacements=None,
    ensure_consistency=True,
    use_json_mapping=True,
    extractor=None,
    merger=None,
    replacer=None,
    # No use_cache option here (user-defined replacements are usually not
    # cached), but the concurrency knobs are still exposed:
    auto_parallel=False,
    chunk_size=1000,
    max_workers=4,
):
    """
    Module-level shortcut for ``Pseudonymizer.replace_with_my_data``.

    The shared global instance handles the call when no custom ``extractor``,
    ``merger`` or ``replacer`` is supplied and ``use_json_mapping`` is truthy;
    otherwise a fresh ``Pseudonymizer`` is created from those settings.
    ``text``, ``categories``, ``user_replacements``, ``ensure_consistency``,
    ``auto_parallel``, ``chunk_size`` and ``max_workers`` are forwarded by
    keyword, and the method's result is returned as-is.
    """
    customised = any(
        component is not None for component in (extractor, merger, replacer)
    )

    if customised or not use_json_mapping:
        handler = Pseudonymizer(
            use_json_mapping=use_json_mapping,
            extractor=extractor,
            merger=merger,
            replacer=replacer,
        )
    else:
        handler = _global_instance

    return handler.replace_with_my_data(
        text=text,
        categories=categories,
        user_replacements=user_replacements,
        ensure_consistency=ensure_consistency,
        auto_parallel=auto_parallel,
        chunk_size=chunk_size,
        max_workers=max_workers,
    )
def _shield_positional_index(func, name, args):
    """Return the index in ``args`` bound to parameter ``name`` of ``func``, or None."""
    import inspect

    try:
        parameters = inspect.signature(func).parameters.values()
    except (TypeError, ValueError):
        # Some callables expose no introspectable signature.
        return None

    positional_kinds = (
        inspect.Parameter.POSITIONAL_ONLY,
        inspect.Parameter.POSITIONAL_OR_KEYWORD,
    )
    for index, parameter in enumerate(parameters):
        if parameter.kind not in positional_kinds:
            # From *args / keyword-only onwards, positions no longer map to names.
            break
        if parameter.name == name:
            return index if index < len(args) else None
    return None


def _shield_reidentify(text, reid_map):
    """Replace every pseudonym in ``text`` with its original value in one pass."""
    import re

    # Longest first so that e.g. "x_10" is not clobbered by a match on "x_1".
    pseudonyms = sorted((p for p in reid_map if p), key=len, reverse=True)
    if not pseudonyms:
        return text

    pattern = re.compile("|".join(re.escape(p) for p in pseudonyms))
    # A function replacement keeps the originals literal (no template escapes)
    # and a single pass never re-substitutes text that was just restored.
    return pattern.sub(lambda match: reid_map[match.group(0)], text)


def shield(target_arg, labels=None, **zink_kwargs):
    """
    A decorator that provides a full anonymization/re-identification "shield"
    for a function call.

    It anonymizes a specific input argument, calls the decorated function, and
    then automatically re-identifies the function's string output.

    Args:
        target_arg (str or int): The name (str) or position (int) of the input
            argument to anonymize. A name is resolved whether the argument is
            passed by keyword or positionally; a position may be negative.
        labels (tuple or list): The entity labels to anonymize. Required.
        **zink_kwargs: Additional keyword arguments for the underlying
            redact function.

    Returns:
        A decorator. The decorated function returns the re-identified string
        when the wrapped function returns a ``str``; any other return value is
        passed through untouched.

    Raises:
        ValueError: If ``labels`` is None, or (at call time) if the target
            argument cannot be found in the call.
        TypeError: At call time, if the target argument is not a string.
    """
    if labels is None:
        raise ValueError("The 'labels' argument is required for the shield decorator.")

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # 1. Locate the original text among the call's arguments.
            passed_by_keyword = isinstance(target_arg, str) and target_arg in kwargs
            position = None
            if passed_by_keyword:
                original_text = kwargs[target_arg]
            else:
                if isinstance(target_arg, str):
                    # Named argument supplied positionally: map name -> index.
                    position = _shield_positional_index(func, target_arg, args)
                elif isinstance(target_arg, int) and -len(args) <= target_arg < len(args):
                    position = target_arg
                if position is None:
                    raise ValueError(f"Argument '{target_arg}' not found in function call.")
                original_text = args[position]

            if not isinstance(original_text, str):
                raise TypeError(f"Target argument '{target_arg}' must be a string.")

            # 2. Anonymize the input and build the re-identification map.
            #    `numbered_entities` must be True for re-identification to work.
            result_obj = redact(
                original_text,
                categories=labels,
                numbered_entities=True,
                **zink_kwargs
            )
            anonymized_text = result_obj.anonymized_text
            reid_map = {item.pseudonym: item.original for item in result_obj.replacements}

            # 3. Swap the anonymized text into the arguments.
            if passed_by_keyword:
                kwargs[target_arg] = anonymized_text
            else:
                patched_args = list(args)
                patched_args[position] = anonymized_text
                args = tuple(patched_args)

            # 4. Call the wrapped function (e.g., the LLM) with the safe data.
            anonymized_response = func(*args, **kwargs)

            # 5. Non-string results cannot contain placeholders; return as-is.
            if not isinstance(anonymized_response, str):
                return anonymized_response

            # 6. Restore the original values in the output.
            return _shield_reidentify(anonymized_response, reid_map)

        return wrapper

    return decorator
def where_mapping_file():
    """Return the persistent mapping file's path, as given by get_default_mapping_path()."""
    mapping_path = get_default_mapping_path()
    return mapping_path
def refresh_mapping_file():
    """
    Deletes the persistent mapping file if it exists.

    The path comes from get_default_mapping_path(). A missing file is not an
    error; any other OSError raised by os.remove (e.g. permission problems)
    propagates to the caller.
    """
    path = get_default_mapping_path()
    try:
        # Remove directly instead of checking os.path.exists() first: the file
        # could vanish between the check and the removal.
        os.remove(path)
    except FileNotFoundError:
        pass
def prep(text, words):
    """
    Prepares text for redaction by wrapping specified words in asterisks.
    These words will be excluded from redaction.

    Matching is case-insensitive and whole-word: "cat" is not wrapped inside
    "category". Each match is replaced by ``*word*`` using the word exactly as
    it was supplied in ``words``.

    Args:
        text (str): The text to prepare.
        words: The words or phrases to wrap. A single string is treated as one
            word. Empty entries are ignored.

    Returns:
        str: ``text`` with every occurrence of the given words wrapped in
        asterisks, or ``text`` unchanged when there is nothing to wrap.
    """
    if not words:
        return text

    import re

    if isinstance(words, str):
        # A bare string would otherwise be iterated character by character.
        words = [words]

    # Longest first, so that a phrase wins over a shorter word it contains.
    # Empty entries are dropped: they would match at every word boundary.
    candidates = sorted((word for word in words if word), key=len, reverse=True)
    if not candidates:
        return text

    word_char = re.compile(r"\w")

    def alternative(word):
        # Only anchor with \b on the sides where the word itself starts/ends
        # with a word character; "\b" next to punctuation (e.g. "C++") would
        # demand an adjacent word character and almost never match.
        lead = r"\b" if word_char.match(word[0]) else ""
        trail = r"\b" if word_char.match(word[-1]) else ""
        return f"({lead}{re.escape(word)}{trail})"

    # One alternation, one pass: text that has already been wrapped is never
    # rescanned, so overlapping or duplicate words cannot be wrapped twice.
    pattern = re.compile(
        "|".join(alternative(word) for word in candidates),
        flags=re.IGNORECASE,
    )

    # Each alternative is its own group, so lastindex identifies the word.
    # A function replacement keeps backslashes in the word literal instead of
    # having them parsed as replacement-template escapes.
    return pattern.sub(lambda match: f"*{candidates[match.lastindex - 1]}*", text)