There are several ways to add headers in Scrapy spiders. This can be done manually for each request:
class MySpider(scrapy.Spider):
    def parse(self, response):
        # Per-request headers: the headers dict applies to this request only.
        yield scrapy.Request(..., headers={"x-token": "123"})
However, to automatically add the same headers to every outgoing Scrapy request, the DEFAULT_REQUEST_HEADERS
setting can be used:
# settings.py
# Headers added to every outgoing request by default; headers set on an
# individual Request override these.
DEFAULT_REQUEST_HEADERS = {
    "User-Agent": "my awesome scrapy robot",
}
In case more complex logic is needed — such as adding headers only to some requests, or rotating a random User-Agent header — a downloader middleware is the best option:
# middlewares.py
import random


class RandomUserAgentMiddleware:
    """Downloader middleware that attaches a randomly chosen User-Agent
    to outgoing requests, drawn from the USER_AGENTS setting."""

    def __init__(self, user_agents):
        # Pool of user-agent strings to pick from on each request.
        self.user_agents = user_agents

    @classmethod
    def from_crawler(cls, crawler):
        """retrieve user agent list from settings.USER_AGENTS"""
        agents = crawler.settings.get('USER_AGENTS', [])
        if not agents:
            # Fail fast at startup rather than on the first request.
            raise ValueError('No user agents found in settings. Please provide a list of user agents in the USER_AGENTS setting.')
        return cls(agents)

    def process_request(self, request, spider):
        """attach random user agent to every outgoing request"""
        chosen = random.choice(self.user_agents)
        # setdefault: do not clobber a User-Agent already set on the request.
        request.headers.setdefault('User-Agent', chosen)
        spider.logger.debug(f'Using User-Agent: {chosen}')
# settings.py
# Register the middleware under DOWNLOADER_MIDDLEWARES — Scrapy has no plain
# "MIDDLEWARES" setting; a middleware that implements process_request is a
# downloader middleware.
# NOTE(review): at priority 760 this runs after Scrapy's built-in
# UserAgentMiddleware (500), which may have already set User-Agent —
# with setdefault the random UA would then be skipped; confirm the intended
# ordering for your project.
DOWNLOADER_MIDDLEWARES = {
    # ...
    'myproject.middlewares.RandomUserAgentMiddleware': 760,
    # ...
}

# Pool of User-Agent strings the middleware picks from at random.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
    # ...
]
Note that if you're using Scrapfly's Scrapy SDK, some headers — like the User-Agent string — are automatically managed by the smart anti-blocking API.