cmoncrawl.processor.pipeline.streamer.BaseStreamerFile

class cmoncrawl.processor.pipeline.streamer.BaseStreamerFile(root: Path, max_directory_size: int, max_file_size: int, extension: str, directory_prefix: str = 'directory_', max_retries: int = 3)

Abstract class that defines the basic functionality of a file streamer.

__init__(root: Path, max_directory_size: int, max_file_size: int, extension: str, directory_prefix: str = 'directory_', max_retries: int = 3)

Methods

__init__(root, max_directory_size, ...[, ...])

clean_up()

get_file_name(metadata)

metadata_to_string(extracted_data)

stream(extracted_data, metadata)