cmoncrawl.processor.pipeline.downloader.DownloaderDummy#

class cmoncrawl.processor.pipeline.downloader.DownloaderDummy(files: List[Path], url: Optional[str] = None, date: Optional[datetime] = None)#

Dummy downloader for testing. It doesn’t download anything; instead, it returns the files passed to the constructor and extracts metadata from those files.

__init__(files: List[Path], url: Optional[str] = None, date: Optional[datetime] = None)#

Methods

__init__(files[, url, date])

download(domain_record)

extract_url(content)

extract_year(file_path)

mine_metadata(content, file_path)