cmoncrawl.processor.pipeline.extractor.HTMLExtractor

class cmoncrawl.processor.pipeline.extractor.HTMLExtractor(filter_non_ok: bool = True)

Dummy extractor which simply extracts the HTML.

__init__(filter_non_ok: bool = True)

Methods

__init__([filter_non_ok])

extract(response, metadata)

extract_soup(soup, metadata)

filter_raw(response, metadata)

filter_soup(soup, metadata)

preprocess(response, metadata)