Scrapy deduplicates requests (so the same page is not crawled twice) through RFPDupeFilter.
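
The filter is enabled by default: every request that goes through the scheduler is checked against it, unless the request opts out with dont_filter=True. A quick sketch (the URL is only a placeholder):

from scrapy import Request

# Checked against the dupe filter (default behaviour).
r1 = Request('http://example.com/page')

# Bypasses the dupe filter entirely, even if the URL was already seen.
r2 = Request('http://example.com/page', dont_filter=True)
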
RFPDupeFilter source code

import logging
import os

from scrapy.utils.job import job_dir
from scrapy.utils.request import referer_str, request_fingerprint

# BaseDupeFilter is defined earlier in the same module (scrapy/dupefilters.py).

class RFPDupeFilter(BaseDupeFilter):
    """Request Fingerprint duplicates filter"""

    def __init__(self, path=None, debug=False):
        self.file = None
        self.fingerprints = set()
        self.logdupes = True
        self.debug = debug
        self.logger = logging.getLogger(__name__)
        if path:
            self.file = open(os.path.join(path, 'requests.seen'), 'a+')
            self.file.seek(0)
            self.fingerprints.update(x.rstrip() for x in self.file)

    @classmethod
    def from_settings(cls, settings):
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(job_dir(settings), debug)

    def request_seen(self, request):
        fp = self.request_fingerprint(request)
        if fp in self.fingerprints:
            return True
        self.fingerprints.add(fp)
        if self.file:
            self.file.write(fp + os.linesep)

    def request_fingerprint(self, request):
        return request_fingerprint(request)

    def close(self, reason):
        if self.file:
            self.file.close()

    def log(self, request, spider):
        if self.debug:
            msg = "Filtered duplicate request: %(request)s (referer: %(referer)s)"
            args = {'request': request, 'referer': referer_str(request)}
            self.logger.debug(msg, args, extra={'spider': spider})
        elif self.logdupes:
            msg = ("Filtered duplicate request: %(request)s"
                   " - no more duplicates will be shown"
                   " (see DUPEFILTER_DEBUG to show all duplicates)")
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
            self.logdupes = False

        spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
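
Note how persistence is wired up here: from_settings passes job_dir(settings) in as path, and job_dir simply returns the directory named by the JOBDIR setting. When a job directory is set, the fingerprints written to requests.seen are reloaded on the next run, so a resumed crawl does not refetch pages it has already seen. A minimal settings.py sketch (the directory name is an arbitrary example):

# settings.py
# With a job directory, the dupe filter persists its fingerprints to
# <JOBDIR>/requests.seen and reloads them when the crawl is resumed.
JOBDIR = 'crawls/run-1'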

RFPDupeFilter actually does its filtering through request_fingerprint, which is implemented as follows:

import hashlib
import weakref

from w3lib.url import canonicalize_url

# One fingerprint cache per Request object, keyed by the include_headers tuple.
_fingerprint_cache = weakref.WeakKeyDictionary()


def request_fingerprint(request, include_headers=None):
    if include_headers:
        include_headers = tuple([h.lower() for h in sorted(include_headers)])
    cache = _fingerprint_cache.setdefault(request, {})
    if include_headers not in cache:
        fp = hashlib.sha1()
        fp.update(request.method)
        fp.update(canonicalize_url(request.url))
        fp.update(request.body or '')
        if include_headers:
            for hdr in include_headers:
                if hdr in request.headers:
                    fp.update(hdr)
                    for v in request.headers.getlist(hdr):
                        fp.update(v)
        cache[include_headers] = fp.hexdigest()
    return cache[include_headers]
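
To see what actually ends up in the hash, a quick sketch (assuming request_fingerprint is imported from scrapy.utils.request, where the function above lives): canonicalize_url sorts the query string, so the same parameters in a different order yield the same fingerprint, while a different method or body yields a different one.

from scrapy import Request
from scrapy.utils.request import request_fingerprint

# Same query parameters in a different order: canonicalize_url sorts them,
# so both requests share a single fingerprint.
r1 = Request('http://example.com/item?id=1&page=2')
r2 = Request('http://example.com/item?page=2&id=1')
assert request_fingerprint(r1) == request_fingerprint(r2)

# A POST with a body hashes differently from the plain GET above.
r3 = Request('http://example.com/item?id=1&page=2', method='POST', body='x=1')
assert request_fingerprint(r3) != request_fingerprint(r1)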

As you can see, the dedup fingerprint is sha1(method + canonicalized URL + body), plus any headers explicitly listed in include_headers (none by default). Because all of these fields go into the hash, requests that target the same page but differ in method, body or query parameters are not treated as duplicates, so in practice the filter may remove fewer duplicates than you expect.
If you want to compute the dedup fingerprint yourself, you have to implement your own filter and configure Scrapy to use it. The filter below deduplicates by URL alone:

from scrapy.dupefilters import RFPDupeFilter


class SeenURLFilter(RFPDupeFilter):
    """A dupe filter that considers only the URL."""

    def __init__(self, path=None, debug=False):
        # Accept debug as well, because the inherited from_settings() passes it in.
        self.urls_seen = set()
        RFPDupeFilter.__init__(self, path, debug)

    def request_seen(self, request):
        if request.url in self.urls_seen:
            return True
        self.urls_seen.add(request.url)

Don't forget to enable it in settings.py:

DUPEFILTER_CLASS = 'scraper.custom_filters.SeenURLFilter'
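
An alternative sketch (the class name URLFingerprintFilter is made up here): instead of overriding request_seen, you can override request_fingerprint on RFPDupeFilter; dedup is then still by URL only, but the parent's requests.seen persistence keeps working.

import hashlib

from scrapy.dupefilters import RFPDupeFilter


class URLFingerprintFilter(RFPDupeFilter):
    """Dedupe by URL only, keeping RFPDupeFilter's file-based persistence."""

    def request_fingerprint(self, request):
        # Hash just the URL; method, body and headers are ignored.
        return hashlib.sha1(request.url.encode('utf8')).hexdigest()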

https://piaosanlang.gitbooks.io/spiders/09day/section9.4.html
