# Source code for scrapple.utils.config

"""
scrapple.utils.config
~~~~~~~~~~~~~~~~~~~~~

Functions related to traversing the configuration file
"""

from __future__ import print_function
from colorama import init, Fore, Back

# Initialise colorama so the Fore/Back colour constants used below
# render correctly on the console.
init()


def traverse_next(page, next, results):
    """
    Recursive generator to traverse through the next attribute and \
    crawl through the links to be followed.

    :param page: The current page being parsed
    :param next: The next attribute of the current scraping dict
        (NOTE: shadows the ``next`` builtin; kept for interface
        compatibility with existing callers)
    :param results: The current extracted content, stored in a dict
    :return: The extracted content, through a generator
    """
    for link in page.extract_links(next['follow_link']):
        print(Back.YELLOW + Fore.BLUE + "Loading page ", link.url + Back.RESET + Fore.RESET)
        # Work on a copy so sibling links do not see each other's fields.
        r = results.copy()
        # Default to an empty list: a config level without a 'data' key
        # would otherwise make the for-loop iterate over None (TypeError).
        for attribute in next['scraping'].get('data', []):
            if attribute['field'] != "":
                print("\nExtracting", attribute['field'], "attribute", sep=' ')
                r[attribute['field']] = link.extract_content(
                    attribute['selector'],
                    attribute['attr'],
                    attribute['default']
                )
        if not next['scraping'].get('next'):
            # Leaf level: this record is complete, hand it back.
            yield r
        else:
            # Recurse one level deeper for every follow-up specification.
            for next2 in next['scraping'].get('next'):
                for result in traverse_next(link, next2, r):
                    yield result
def get_fields(config):
    """
    Recursive generator that yields the field names in the config file

    :param config: The configuration file that contains the specification of the extractor
    :return: The field names in the config file, through a generator
    """
    scraping = config['scraping']
    # Yield the non-empty field names declared at this level.
    for entry in scraping['data']:
        name = entry['field']
        if name != '':
            yield name
    # Then descend into each follow-up specification, if any.
    for child in scraping.get('next', []):
        for name in get_fields(child):
            yield name
def extract_fieldnames(config):
    """
    Function to return a list of unique field names from the config file

    :param config: The configuration file that contains the specification of the extractor
    :return: A list of field names from the config file
    """
    fields = []
    seen = {}  # base field name -> occurrences so far
    for x in get_fields(config):
        count = seen.get(x, 0) + 1
        seen[x] = count
        # First occurrence keeps the bare name; later duplicates get a
        # numeric suffix. Counting per base name (rather than the old
        # fields.count(x), which never saw the suffixed entries) keeps
        # the suffix increasing, so a field appearing three times yields
        # x, x_2, x_3 instead of the duplicated x, x_2, x_2.
        fields.append(x if count == 1 else x + '_' + str(count))
    return fields