cmoncrawl.processor.pipeline.extractor.BaseExtractor

class cmoncrawl.processor.pipeline.extractor.BaseExtractor(encoding: Optional[str] = None)
__init__(encoding: Optional[str] = None)

Methods

__init__([encoding])

extract(response, metadata)

extract_soup(soup, metadata)

filter_raw(response, metadata)

filter_soup(soup, metadata)

preprocess(response, metadata)