# Source code for scrapy_redis.dupefilter
import hashlib
import json
import logging
import time
from scrapy.dupefilters import BaseDupeFilter
from scrapy.utils.python import to_unicode
from w3lib.url import canonicalize_url
from . import defaults
from .connection import get_redis_from_settings
logger = logging.getLogger(__name__)
# TODO: Rename class to RedisDupeFilter.
class RFPDupeFilter(BaseDupeFilter):
"""Redis-based request duplicates filter.
    This class can also be used with Scrapy's default scheduler.
"""
logger = logger
def __init__(self, server, key, debug=False):
"""Initialize the duplicates filter.
Parameters
----------
server : redis.StrictRedis
The redis server instance.
key : str
            Redis key where to store the fingerprints.
debug : bool, optional
Whether to log filtered requests.
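
        Examples
        --------
        A minimal construction sketch (the key name is made up for the
        example; ``redis`` clients connect lazily, so no server is
        contacted until a fingerprint is stored):

        >>> import redis
        >>> df = RFPDupeFilter(redis.StrictRedis(), key="dupefilter:demo")
        >>> df.debug
        False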
"""
self.server = server
self.key = key
self.debug = debug
self.logdupes = True
@classmethod
def from_settings(cls, settings):
"""Returns an instance from given settings.
This uses by default the key ``dupefilter:<timestamp>``. When using the
``scrapy_redis.scheduler.Scheduler`` class, this method is not used as
it needs to pass the spider name in the key.
Parameters
----------
settings : scrapy.settings.Settings
Returns
-------
RFPDupeFilter
            An RFPDupeFilter instance.
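
        Examples
        --------
        A minimal sketch; the ``redis`` client is created lazily, so no
        server is contacted here, and the key embeds the current timestamp:

        >>> from scrapy.settings import Settings
        >>> df = RFPDupeFilter.from_settings(Settings())
        >>> df.key.startswith("dupefilter:")
        True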
"""
server = get_redis_from_settings(settings)
        # XXX: This creates a one-time key, which is needed to support using
        # this class as a standalone dupefilter with Scrapy's default
        # scheduler. If Scrapy passed the spider to the open() method, this
        # wouldn't be needed.
# TODO: Use SCRAPY_JOB env as default and fallback to timestamp.
key = defaults.DUPEFILTER_KEY % {"timestamp": int(time.time())}
debug = settings.getbool("DUPEFILTER_DEBUG")
return cls(server, key=key, debug=debug)
@classmethod
def from_crawler(cls, crawler):
"""Returns instance from crawler.
Parameters
----------
crawler : scrapy.crawler.Crawler
Returns
-------
RFPDupeFilter
Instance of RFPDupeFilter.
"""
return cls.from_settings(crawler.settings)
def request_seen(self, request):
"""Returns True if request was already seen.
Parameters
----------
request : scrapy.http.Request
Returns
-------
bool
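
        Examples
        --------
        A minimal sketch, assuming a running Redis server on localhost and
        a made-up key name (the Redis-backed calls are skipped under
        doctest for that reason). The first call stores the fingerprint,
        the second finds it:

        >>> import redis
        >>> from scrapy import Request
        >>> df = RFPDupeFilter(redis.StrictRedis(), key="dupefilter:demo")
        >>> df.request_seen(Request("https://example.com"))  # doctest: +SKIP
        False
        >>> df.request_seen(Request("https://example.com"))  # doctest: +SKIP
        True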
"""
fp = self.request_fingerprint(request)
        # This returns the number of values added; zero if the fingerprint
        # already exists in the set.
added = self.server.sadd(self.key, fp)
return added == 0
def request_fingerprint(self, request):
"""Returns a fingerprint for a given request.
Parameters
----------
request : scrapy.http.Request
Returns
-------
str
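
        Examples
        --------
        A runnable sketch: ``canonicalize_url`` normalizes query-parameter
        order, so equivalent URLs produce the same fingerprint. ``server``
        is unused by this method, hence ``None`` suffices here:

        >>> from scrapy import Request
        >>> df = RFPDupeFilter(server=None, key="unused")
        >>> a = df.request_fingerprint(Request("https://example.com/?a=1&b=2"))
        >>> b = df.request_fingerprint(Request("https://example.com/?b=2&a=1"))
        >>> a == b
        True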
"""
fingerprint_data = {
"method": to_unicode(request.method),
"url": canonicalize_url(request.url),
"body": (request.body or b"").hex(),
}
fingerprint_json = json.dumps(fingerprint_data, sort_keys=True)
return hashlib.sha1(fingerprint_json.encode()).hexdigest()
@classmethod
    def from_spider(cls, spider):
        """Returns an instance from the given spider.

        The fingerprint key is derived from the spider name via the
        ``SCHEDULER_DUPEFILTER_KEY`` template setting.

        Parameters
        ----------
        spider : scrapy.spiders.Spider

        Returns
        -------
        RFPDupeFilter
            Instance keyed by the spider name.
        """
settings = spider.settings
server = get_redis_from_settings(settings)
dupefilter_key = settings.get(
"SCHEDULER_DUPEFILTER_KEY", defaults.SCHEDULER_DUPEFILTER_KEY
)
key = dupefilter_key % {"spider": spider.name}
debug = settings.getbool("DUPEFILTER_DEBUG")
return cls(server, key=key, debug=debug)
def close(self, reason=""):
"""Delete data on close. Called by Scrapy's scheduler.
Parameters
----------
reason : str, optional
"""
self.clear()
def clear(self):
"""Clears fingerprints data."""
self.server.delete(self.key)
def log(self, request, spider):
"""Logs given request.
Parameters
----------
request : scrapy.http.Request
spider : scrapy.spiders.Spider
"""
if self.debug:
msg = "Filtered duplicate request: %(request)s"
self.logger.debug(msg, {"request": request}, extra={"spider": spider})
elif self.logdupes:
msg = (
"Filtered duplicate request %(request)s"
" - no more duplicates will be shown"
" (see DUPEFILTER_DEBUG to show all duplicates)"
)
self.logger.debug(msg, {"request": request}, extra={"spider": spider})
self.logdupes = False
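
# Usage sketch (hypothetical project settings, not part of this module):
# to run this filter standalone with Scrapy's default scheduler, point
# DUPEFILTER_CLASS at it in settings.py and tell scrapy-redis where the
# Redis server lives, e.g.:
#
#   DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
#   REDIS_URL = "redis://localhost:6379/0"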