How to rotate proxies in scrapy spiders?

To rotate proxies in Scrapy, a request middleware (called a downloader middleware in Scrapy) can be used.

Request (downloader) middlewares are Scrapy components that intercept outgoing requests and incoming responses.
So, we can create a middleware that intercepts every outgoing request and assigns it a random proxy:

# middlewares.py
import random
from scrapy import signals


class ProxyRotationMiddleware:
    """Scrapy middleware that assigns a randomly chosen proxy to every request."""

    def __init__(self, proxies):
        # pool of proxy URLs to draw from
        self.proxies = proxies

    @classmethod
    def from_crawler(cls, crawler):
        """build the middleware from the PROXIES list defined in settings.py

        Raises ValueError when the PROXIES setting is missing or empty.
        """
        configured = crawler.settings.get('PROXIES', [])
        if configured:
            return cls(configured)
        raise ValueError('No proxies found in settings. Please provide a list of proxies in the PROXIES setting.')

    def process_request(self, request, spider):
        """attach a randomly chosen proxy to the outgoing request"""
        request.meta['proxy'] = chosen = random.choice(self.proxies)
        spider.logger.debug(f'Using proxy: {chosen}')

# settings.py
# Scrapy reads downloader middlewares from DOWNLOADER_MIDDLEWARES
# (there is no plain MIDDLEWARES setting).
DOWNLOADER_MIDDLEWARES = {
    # ...
    # Use a priority below 750 so this middleware's process_request runs before
    # Scrapy's built-in HttpProxyMiddleware (750), which then handles the
    # proxy URL set here, including any user:password credentials.
    'myproject.middlewares.ProxyRotationMiddleware': 745,
    # ...
}
# Pool of proxy URLs the middleware picks from.
PROXIES = [
    "http://111.22.22.33:8000",
    "http://user:password@111.22.22.33:8000",
]

This basic proxy rotation middleware will automatically attach a random proxy from the proxy pool to each outgoing request.

However, picking proxies purely at random is not always the most efficient way to rotate them when the proxy pool consists of proxies of varying quality. Some proxies perform better than others, so another approach is to take proxy performance into account using weighted randomization:

# middlewares.py
import random
from scrapy import signals


class ProxyRotationMiddleware:
    """Scrapy middleware that rotates proxies using weighted random selection.

    Per-proxy statistics (use count and a banned flag) are kept in
    ``proxy_stats``. Less-used proxies receive a higher selection weight and
    proxies currently marked as banned are heavily down-weighted.
    """

    # response status codes treated as a sign that the proxy is banned;
    # add any other status codes that indicate a ban
    BAN_STATUS_CODES = (403, 429)
    # weight multiplier applied to proxies that are marked as banned
    BANNED_WEIGHT_FACTOR = 0.1

    def __init__(self, proxies):
        self.proxies = proxies
        self.proxy_stats = {proxy: {"used": 0, "banned": False} for proxy in proxies}

    @classmethod
    def from_crawler(cls, crawler):
        """retrieve proxy list from the settings.py PROXIES variable

        Raises ValueError when the PROXIES setting is missing or empty.
        """
        proxies = crawler.settings.get('PROXIES', [])
        if not proxies:
            raise ValueError('No proxies found in settings. Please provide a list of proxies in the PROXIES setting.')
        return cls(proxies)

    def process_request(self, request, spider):
        """attach weighted random proxy to each request"""
        proxy = self._select_proxy()
        request.meta['proxy'] = proxy
        self.proxy_stats[proxy]["used"] += 1
        spider.logger.debug(f'Using proxy: {proxy}')

    def process_response(self, request, response, spider):
        """inspect every response and record proxy performance

        Marks the proxy as banned on a ban status code and clears the flag
        again once the proxy returns any other status. Always returns the
        response unchanged.
        """
        stats = self.proxy_stats.get(request.meta.get('proxy'))
        if stats is None:
            # request carried no proxy, or one this middleware does not track
            return response
        proxy = request.meta['proxy']
        if response.status in self.BAN_STATUS_CODES:
            stats["banned"] = True
            spider.logger.warning(f'Proxy {proxy} is banned, status code: {response.status}')
        elif stats["banned"]:
            # only a previously banned proxy can "recover"; clear the flag so
            # it regains its full selection weight
            stats["banned"] = False
            spider.logger.info(f'Proxy {proxy} has recovered, status code: {response.status}')
        return response

    def _select_proxy(self):
        """select weighted random proxy based on proxy stats"""
        total_used = sum(stats["used"] for stats in self.proxy_stats.values())
        weights = [
            # example: calculate weight by use count and whether proxy is considered to be banned;
            # the +1 terms keep the weight finite and positive for unused proxies
            ((total_used + 1) / (stats["used"] + 1)) * (self.BANNED_WEIGHT_FACTOR if stats["banned"] else 1)
            for stats in self.proxy_stats.values()
        ]
        return random.choices(list(self.proxy_stats), weights=weights, k=1)[0]

Proxy rotation can get very complicated when scraping websites that use anti-scraping protection. To scale up your Scrapy scrapers, check out the Scrapfly API, which integrates with Scrapy directly through the Scrapy SDK and is capable of bypassing anti-scraping protections!

Provided by Scrapfly

This knowledgebase is provided by Scrapfly data APIs, check us out! 👇