Processor.App.ArticleUtils.article_extractor.ArticleExtractor

class Processor.App.ArticleUtils.article_extractor.ArticleExtractor(header_css_dict: Dict[str, str], header_extract_dict: Dict[str, Union[Callable[[Any], Any], List[Callable[[Any], Any]]]], article_css_dict: Dict[str, str], article_extract_dict: Dict[str, Union[Callable[[Any], Any], List[Callable[[Any], Any]]]], article_css_selector: str, filter_must_exist: List[str] = [], filter_must_not_exist: List[str] = [], filter_allowed_domain_prefixes: Optional[List[str]] = None)
__init__(header_css_dict: Dict[str, str], header_extract_dict: Dict[str, Union[Callable[[Any], Any], List[Callable[[Any], Any]]]], article_css_dict: Dict[str, str], article_extract_dict: Dict[str, Union[Callable[[Any], Any], List[Callable[[Any], Any]]]], article_css_selector: str, filter_must_exist: List[str] = [], filter_must_not_exist: List[str] = [], filter_allowed_domain_prefixes: Optional[List[str]] = None)

Methods

__init__(header_css_dict, ...[, ...])

article_extract(soup, metadata)

check_required(extracted_dict, metadata)

custom_extract(soup, metadata)

custom_filter_raw(response, metadata)

custom_filter_soup(soup, metadata)

extract(response, metadata)

extract_soup(soup, metadata)

filter_raw(response, metadata)

filter_soup(soup, metadata)

preprocess(response, metadata)

Attributes

ENCODING

SINCE

TO