cmoncrawl.processor.pipeline.streamer.StreamerFileHTML

class cmoncrawl.processor.pipeline.streamer.StreamerFileHTML(root: Path, max_directory_size: int)
__init__(root: Path, max_directory_size: int)

Methods

__init__(root, max_directory_size)

clean_up()

get_file_name(metadata)

metadata_to_string(extracted_data)

stream(extracted_data, metadata)