Source code for scrapy_redis.dupefilter

import hashlib
import json
import logging
import time

from scrapy.dupefilters import BaseDupeFilter
from scrapy.utils.python import to_unicode
from w3lib.url import canonicalize_url

from . import defaults
from .connection import get_redis_from_settings

logger = logging.getLogger(__name__)


# TODO: Rename class to RedisDupeFilter.
class RFPDupeFilter(BaseDupeFilter):
    """Redis-based request duplicates filter.

    This class can also be used with Scrapy's default scheduler.

    """

    logger = logger

    def __init__(self, server, key, debug=False):
        """Initialize the duplicates filter.

        Parameters
        ----------
        server : redis.StrictRedis
            The redis server instance.
        key : str
            Redis key where fingerprints are stored.
        debug : bool, optional
            Whether to log filtered requests.

        """
        self.server = server
        self.key = key
        self.debug = debug
        self.logdupes = True
    @classmethod
    def from_settings(cls, settings):
        """Returns an instance from given settings.

        By default this uses the key ``dupefilter:<timestamp>``. When using the
        ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as
        the scheduler needs to include the spider name in the key.

        Parameters
        ----------
        settings : scrapy.settings.Settings

        Returns
        -------
        RFPDupeFilter
            A RFPDupeFilter instance.

        """
        server = get_redis_from_settings(settings)
        # XXX: This creates a one-time key, needed to use this class as a
        # standalone dupefilter with Scrapy's default scheduler. If Scrapy
        # passed the spider to the open() method, this wouldn't be needed.
        # TODO: Use SCRAPY_JOB env as default and fallback to timestamp.
        key = defaults.DUPEFILTER_KEY % {"timestamp": int(time.time())}
        debug = settings.getbool("DUPEFILTER_DEBUG")
        return cls(server, key=key, debug=debug)
    @classmethod
    def from_crawler(cls, crawler):
        """Returns instance from crawler.

        Parameters
        ----------
        crawler : scrapy.crawler.Crawler

        Returns
        -------
        RFPDupeFilter
            Instance of RFPDupeFilter.

        """
        return cls.from_settings(crawler.settings)
    def request_seen(self, request):
        """Returns True if request was already seen.

        Parameters
        ----------
        request : scrapy.http.Request

        Returns
        -------
        bool

        """
        fp = self.request_fingerprint(request)
        # SADD returns the number of values added: zero if the fingerprint
        # already exists in the set.
        added = self.server.sadd(self.key, fp)
        return added == 0
    def request_fingerprint(self, request):
        """Returns a fingerprint for a given request.

        Parameters
        ----------
        request : scrapy.http.Request

        Returns
        -------
        str

        """
        fingerprint_data = {
            "method": to_unicode(request.method),
            "url": canonicalize_url(request.url),
            "body": (request.body or b"").hex(),
        }
        fingerprint_json = json.dumps(fingerprint_data, sort_keys=True)
        return hashlib.sha1(fingerprint_json.encode()).hexdigest()
    @classmethod
    def from_spider(cls, spider):
        """Returns an instance from a spider, keyed by the spider name.

        Parameters
        ----------
        spider : scrapy.spiders.Spider

        Returns
        -------
        RFPDupeFilter
            Instance of RFPDupeFilter.

        """
        settings = spider.settings
        server = get_redis_from_settings(settings)
        dupefilter_key = settings.get(
            "SCHEDULER_DUPEFILTER_KEY", defaults.SCHEDULER_DUPEFILTER_KEY
        )
        key = dupefilter_key % {"spider": spider.name}
        debug = settings.getbool("DUPEFILTER_DEBUG")
        return cls(server, key=key, debug=debug)
    def close(self, reason=""):
        """Delete data on close. Called by Scrapy's scheduler.

        Parameters
        ----------
        reason : str, optional

        """
        self.clear()
    def clear(self):
        """Clears fingerprints data."""
        self.server.delete(self.key)
    def log(self, request, spider):
        """Logs given request.

        Parameters
        ----------
        request : scrapy.http.Request
        spider : scrapy.spiders.Spider

        """
        if self.debug:
            msg = "Filtered duplicate request: %(request)s"
            self.logger.debug(msg, {"request": request}, extra={"spider": spider})
        elif self.logdupes:
            msg = (
                "Filtered duplicate request %(request)s"
                " - no more duplicates will be shown"
                " (see DUPEFILTER_DEBUG to show all duplicates)"
            )
            self.logger.debug(msg, {"request": request}, extra={"spider": spider})
            self.logdupes = False
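
# ---------------------------------------------------------------------------
# Configuration sketch (not part of this module): settings a project might put
# in its settings.py to use RFPDupeFilter as a standalone dupefilter with
# Scrapy's default scheduler, as the class docstring describes.
# DUPEFILTER_CLASS and DUPEFILTER_DEBUG are standard Scrapy settings; the
# Redis URL value is only an illustrative assumption.
#
#     DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
#     DUPEFILTER_DEBUG = True  # log every filtered duplicate, not just the first
#     REDIS_URL = "redis://localhost:6379/0"
#
# With these settings, from_settings() builds a one-time "dupefilter:<timestamp>"
# key, so fingerprints are not shared between runs. When
# scrapy_redis.scheduler.Scheduler is used instead, from_spider() keys the
# fingerprint set by the spider name (SCHEDULER_DUPEFILTER_KEY), so all runs of
# the same spider share one fingerprint set.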
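
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of scrapy_redis): how request_seen() and
# request_fingerprint() behave together. Two URLs that differ only in query
# parameter order canonicalize to the same URL, hash to the same SHA1
# fingerprint, and the second SADD therefore returns 0 (already seen).
# Assumes a Redis server is reachable; the URL and key below are examples only.
if __name__ == "__main__":
    import redis
    from scrapy.http import Request

    server = redis.StrictRedis.from_url("redis://localhost:6379/0")
    dupefilter = RFPDupeFilter(server, key="dupefilter:example", debug=True)

    first = Request("http://example.com/?a=1&b=2")
    second = Request("http://example.com/?b=2&a=1")  # same URL, reordered params

    print(dupefilter.request_seen(first))   # False: fingerprint newly added
    print(dupefilter.request_seen(second))  # True: canonical URL already seen
    dupefilter.clear()                      # delete the example key from Redis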