To rotate proxies in Scrapy, a downloader middleware can be used.
Downloader middlewares are Scrapy extensions that intercept outgoing requests and incoming responses.
So, we can create a middleware that intercepts outgoing requests and sets a random proxy:
# middlewares.py
import random
from scrapy import signals
class ProxyRotationMiddleware:
    """Downloader middleware that attaches a random proxy to every request.

    The proxy pool is read from the ``PROXIES`` setting when the middleware
    is built through ``from_crawler``.
    """

    def __init__(self, proxies):
        """Store the proxy pool.

        Args:
            proxies: iterable of proxy URLs to choose from.
        """
        # Copy so later mutation of the caller's list does not alter the pool.
        self.proxies = list(proxies)

    @classmethod
    def from_crawler(cls, crawler):
        """Retrieve the proxy list from the settings.py PROXIES variable.

        Raises:
            ValueError: if the PROXIES setting is missing or empty.
        """
        # getlist() returns a list even when the setting was supplied as a
        # comma-separated string (e.g. on the command line).
        proxies = crawler.settings.getlist('PROXIES')
        if not proxies:
            raise ValueError('No proxies found in settings. Please provide a list of proxies in the PROXIES setting.')
        return cls(proxies)

    def process_request(self, request, spider):
        """Pick a random proxy for every request.

        Returns None implicitly, so the request continues through the
        remaining middlewares.
        """
        proxy = random.choice(self.proxies)
        request.meta['proxy'] = proxy
        spider.logger.debug('Using proxy: %s', proxy)
# settings.py
# Scrapy reads downloader middlewares from DOWNLOADER_MIDDLEWARES; a setting
# named plain MIDDLEWARES is ignored and the middleware would never run.
DOWNLOADER_MIDDLEWARES = {
    # ...
    # NOTE(review): Scrapy's built-in HttpProxyMiddleware also defaults to
    # order 750 - confirm the relative order you need between the two.
    'myproject.middlewares.ProxyRotationMiddleware': 750,
    # ...
}
# Pool of proxy URLs the middleware chooses from (credentials are optional).
PROXIES = [
    "http://111.22.22.33:8000",
    "http://user:password@111.22.22.33:8000",
]
This basic proxy rotation middleware will automatically attach a random proxy from the proxy pool to each outgoing request.
However, purely random selection is not always the most efficient way to rotate proxies when the proxy pool consists of proxies of varying quality. Some proxies can perform better than others and vice versa, so another approach is to take proxy performance into account using weighted randomization:
# middlewares.py
import random
from scrapy import signals
class ProxyRotationMiddleware:
    """Downloader middleware that rotates proxies using weighted random choice.

    Each proxy's weight drops the more often it has been used and is cut
    further while the proxy is considered banned, so less-used and healthy
    proxies are preferred without ever fully excluding any proxy.
    """

    # Response status codes treated as "this proxy is banned".
    # Add any other status codes that indicate a ban.
    BAN_STATUS_CODES = (403, 429)
    # Multiplier applied to the weight of a proxy that is currently banned.
    BANNED_WEIGHT_FACTOR = 0.1
    # Private meta key remembering which pool entry was chosen. Other
    # middlewares may rewrite request.meta['proxy'] (for example to strip
    # credentials), which would no longer match a key in proxy_stats.
    PROXY_META_KEY = 'rotation_proxy'

    def __init__(self, proxies):
        """Store the proxy pool and initialise per-proxy statistics.

        Args:
            proxies: iterable of proxy URLs to choose from.
        """
        self.proxies = list(proxies)
        self.proxy_stats = {
            proxy: {"used": 0, "banned": False} for proxy in self.proxies
        }

    @classmethod
    def from_crawler(cls, crawler):
        """Retrieve the proxy list from the settings.py PROXIES variable.

        Raises:
            ValueError: if the PROXIES setting is missing or empty.
        """
        # getlist() returns a list even when the setting was supplied as a
        # comma-separated string (e.g. on the command line).
        proxies = crawler.settings.getlist('PROXIES')
        if not proxies:
            raise ValueError('No proxies found in settings. Please provide a list of proxies in the PROXIES setting.')
        return cls(proxies)

    def process_request(self, request, spider):
        """Attach a weighted random proxy to each request.

        Returns None implicitly, so the request continues through the
        remaining middlewares.
        """
        proxy = self._select_proxy()
        request.meta['proxy'] = proxy
        request.meta[self.PROXY_META_KEY] = proxy
        self.proxy_stats[proxy]["used"] += 1
        spider.logger.debug('Using proxy: %s', proxy)

    def process_response(self, request, response, spider):
        """Inspect every response and record proxy performance.

        A ban status code marks the proxy as banned; any other status clears
        the flag again. The response is always returned unchanged.
        """
        proxy = request.meta.get(self.PROXY_META_KEY, request.meta.get('proxy'))
        stats = self.proxy_stats.get(proxy)
        if stats is None:
            # The request did not go through one of our proxies (or the meta
            # value was changed elsewhere): there is nothing to record.
            return response
        if response.status in self.BAN_STATUS_CODES:
            stats["banned"] = True
            spider.logger.warning('Proxy %s is banned, status code: %s', proxy, response.status)
        elif stats["banned"]:
            # Only report a recovery when the flag actually flips, so healthy
            # proxies do not produce a log line on every response.
            stats["banned"] = False
            spider.logger.info('Proxy %s has recovered, status code: %s', proxy, response.status)
        return response

    def _select_proxy(self):
        """Select a weighted random proxy based on proxy stats."""
        total_used = sum(stats["used"] for stats in self.proxy_stats.values())
        weights = [
            # Weight shrinks with the proxy's own use count and is reduced
            # further while the proxy is flagged as banned.
            ((total_used + 1) / (stats["used"] + 1))
            * (self.BANNED_WEIGHT_FACTOR if stats["banned"] else 1)
            for stats in self.proxy_stats.values()
        ]
        return random.choices(list(self.proxy_stats), weights=weights, k=1)[0]
Proxy rotation can get very complicated when scraping websites that use anti-scraping protection, so for scaling up your Scrapy scrapers check out the Scrapfly API, which integrates with Scrapy directly through the Scrapy SDK and is capable of bypassing anti-scraping protections!