How to Scrape StockX e-commerce Data with Python
In this first entry in our fashion data web scraping series we'll be taking a look at StockX.com - a marketplace that treats apparel as stocks and how to scrape it all.
With the recent news of Twitter closing its API to new developers, we decided to write a tutorial on how to scrape Twitter.
In this tutorial, we'll be using nothing but Python to retrieve Twitter data such as:
We'll cover Twitter web scraping using two popular methods: Playwright browser automation library and Twitter's secret graphQL API. Let's dive in!
Twitter is a major announcement hub where people and companies publish their announcements. This is a great opportunity to use Twitter to follow industry trends. For example, stock market or crypto market targets could be scraped to predict the future price of a stock or crypto.
Twitter is also a great source of data for sentiment analysis. You can use Twitter to find out what people think about a certain topic or brand. This is useful for market research, product development, and brand awareness.
So, if we can scrape Twitter data with Python we can have access to this valuable public information for free!
In this tutorial we'll approach Twitter scraping in three ways:
We'll be working with both JSON and HTML response data. So, we'll be using parsel to parse HTML and jmespath for JSON.
All of these libraries are available for free and can be installed via pip install
terminal command:
$ pip install httpx playwright parsel jmespath scrapfly-sdk
First, let's start with the easy method of using Playwright and Python to retrieve tweet data.
Twitter is a complicated javascript web application that requires javascript to work. So, for tweet scraping we'll be using Playwright browser automation library.
If you're unfamiliar with web scraping using Playwright see our introduction tutorial and example project
Our Playwright-based Twitter scraper in Python should look something like this:
parsel.Selector
Following these steps our scraper would look like this:
from parsel import Selector
from playwright.sync_api import sync_playwright
from playwright.sync_api._generated import Page
def parse_tweets(selector: Selector):
    """
    Parse tweets from pages containing tweets like:
    - tweet page
    - search page
    - reply page
    - homepage

    Every tweet is located through its <article data-testid="tweet"> box and
    its details are read mostly through data-testid attributes.

    Args:
        selector: parsel Selector built from the rendered page HTML.

    Returns:
        List of tweet dicts in page order (on a tweet page the 1st tweet is
        the main tweet and the rest are replies). Fields whose value was not
        found (None) are left out of each dict.
    """
    results = []
    # select all tweets on the page as individual boxes
    tweets = selector.xpath("//article[@data-testid='tweet']")
    for i, tweet in enumerate(tweets):
        # using data-testid attribute we can get tweet details:
        found = {
            "text": "".join(tweet.xpath(".//*[@data-testid='tweetText']//text()").getall()),
            "username": tweet.xpath(".//*[@data-testid='User-Names']/div[1]//text()").get(),
            "handle": tweet.xpath(".//*[@data-testid='User-Names']/div[2]//text()").get(),
            "datetime": tweet.xpath(".//time/@datetime").get(),
            "verified": bool(tweet.xpath(".//svg[@data-testid='icon-verified']")),
            "url": tweet.xpath(".//time/../@href").get(),
            "image": tweet.xpath(".//*[@data-testid='tweetPhoto']/img/@src").get(),
            "video": tweet.xpath(".//video/@src").get(),
            "video_thumb": tweet.xpath(".//video/@poster").get(),
            "likes": tweet.xpath(".//*[@data-testid='like']//text()").get(),
            "retweets": tweet.xpath(".//*[@data-testid='retweet']//text()").get(),
            "replies": tweet.xpath(".//*[@data-testid='reply']//text()").get(),
            # raw string so "\d" is a regex class, not a string escape
            "views": (tweet.xpath(".//*[contains(@aria-label,'Views')]").re(r"(\d+) Views") or [None])[0],
        }
        # main tweet (not a reply): its counters are looked up through
        # dedicated links/labels instead of the data-testid buttons
        if i == 0:
            main_tweet_queries = {
                "views": './/span[contains(text(),"Views")]/../preceding-sibling::div//text()',
                # the quote tweets link ("retweets/with_comments") also contains
                # "retweets", so it is excluded explicitly
                "retweets": './/a[contains(@href,"retweets") and not(contains(@href,"with_comments"))]//text()',
                "quote_tweets": './/a[contains(@href,"retweets/with_comments")]//text()',
                "likes": './/a[contains(@href,"likes")]//text()',
            }
            for key, query in main_tweet_queries.items():
                value = tweet.xpath(query).get()
                # keep the value found above when this lookup matches nothing
                # (e.g. the first tweet of a search page)
                if value is not None:
                    found[key] = value
        results.append({k: v for k, v in found.items() if v is not None})
    return results
def scrape_tweet(url: str, page: Page):
    """
    Scrape tweet and replies from tweet page like:
    https://twitter.com/Scrapfly_dev/status/1587431468141318146

    Navigates the given Playwright page to the url, waits for the first
    tweet box to appear and returns the result of parse_tweets() on the
    rendered HTML.
    """
    page.goto(url)
    # block until at least one tweet box is present on the page
    page.wait_for_selector("//article[@data-testid='tweet']")
    # hand the final page HTML over to the parser
    rendered = Selector(page.content())
    return parse_tweets(rendered)
# example run:
with sync_playwright() as pw:
    # start browser and open a new tab:
    browser = pw.chromium.launch(headless=False)
    page = browser.new_page(viewport={"width": 1920, "height": 1080})
    # scrape tweet and replies:
    tweet_and_replies = scrape_tweet("https://twitter.com/Scrapfly_dev/status/1587431468141318146", page)
    print(tweet_and_replies)
    # release the browser explicitly once scraping is done
    browser.close()
from parsel import Selector
from scrapfly import ScrapflyClient, ScrapeConfig
scrapfly = ScrapflyClient(key="YOUR SCRAPFLY KEY")
def parse_tweets(selector: Selector):
    """
    Parse tweets from pages containing tweets like:
    - tweet page
    - search page
    - reply page
    - homepage

    Every tweet is located through its <article data-testid="tweet"> box and
    its details are read mostly through data-testid attributes.

    Args:
        selector: parsel Selector built from the rendered page HTML.

    Returns:
        List of tweet dicts in page order (on a tweet page the 1st tweet is
        the main tweet and the rest are replies). Fields whose value was not
        found (None) are left out of each dict.
    """
    results = []
    # select all tweets on the page as individual boxes
    tweets = selector.xpath("//article[@data-testid='tweet']")
    for i, tweet in enumerate(tweets):
        # using data-testid attribute we can get tweet details:
        found = {
            "text": "".join(tweet.xpath(".//*[@data-testid='tweetText']//text()").getall()),
            "username": tweet.xpath(".//*[@data-testid='User-Names']/div[1]//text()").get(),
            "handle": tweet.xpath(".//*[@data-testid='User-Names']/div[2]//text()").get(),
            "datetime": tweet.xpath(".//time/@datetime").get(),
            "verified": bool(tweet.xpath(".//svg[@data-testid='icon-verified']")),
            "url": tweet.xpath(".//time/../@href").get(),
            "image": tweet.xpath(".//*[@data-testid='tweetPhoto']/img/@src").get(),
            "video": tweet.xpath(".//video/@src").get(),
            "video_thumb": tweet.xpath(".//video/@poster").get(),
            "likes": tweet.xpath(".//*[@data-testid='like']//text()").get(),
            "retweets": tweet.xpath(".//*[@data-testid='retweet']//text()").get(),
            "replies": tweet.xpath(".//*[@data-testid='reply']//text()").get(),
            # raw string so "\d" is a regex class, not a string escape
            "views": (tweet.xpath(".//*[contains(@aria-label,'Views')]").re(r"(\d+) Views") or [None])[0],
        }
        # main tweet (not a reply): its counters are looked up through
        # dedicated links/labels instead of the data-testid buttons
        if i == 0:
            main_tweet_queries = {
                "views": './/span[contains(text(),"Views")]/../preceding-sibling::div//text()',
                # the quote tweets link ("retweets/with_comments") also contains
                # "retweets", so it is excluded explicitly
                "retweets": './/a[contains(@href,"retweets") and not(contains(@href,"with_comments"))]//text()',
                "quote_tweets": './/a[contains(@href,"retweets/with_comments")]//text()',
                "likes": './/a[contains(@href,"likes")]//text()',
            }
            for key, query in main_tweet_queries.items():
                value = tweet.xpath(query).get()
                # keep the value found above when this lookup matches nothing
                # (e.g. the first tweet of a search page)
                if value is not None:
                    found[key] = value
        results.append({k: v for k, v in found.items() if v is not None})
    return results
def scrape_tweet(url: str):
    """
    Scrape tweet and replies from tweet page like:
    https://twitter.com/Scrapfly_dev/status/1587431468141318146

    The page is requested through the ScrapFly client with javascript
    rendering enabled and the US country option; the response selector is
    passed to parse_tweets().
    """
    config = ScrapeConfig(url=url, country="US", render_js=True)
    response = scrapfly.scrape(config)
    return parse_tweets(response.selector)
tweet_and_replies = scrape_tweet("https://twitter.com/Google/status/1622686179077357573")
print(tweet_and_replies)
[
{
"text": "AI can help people, businesses and communities unlock their potential. Here\u2019s the latest on how we\u2019re building on our advancements in large language models, including an experimental conversational AI service and new AI-powered features in Search \u2193",
"username": "Google",
"handle": "@Google",
"datetime": "2023-02-06T19:58:27.000Z",
"verified": true,
"url": "/Google/status/1622686179077357573",
"likes": "837",
"retweets": "229",
"views": "207.5K",
"quote_tweets": "58"
},
{
"text": "And what about site owners?\n\nAre you gonna deprecate the websites presence from the search since google is owning this content?",
"username": "Divyanshu Bajpai ",
"handle": "@DivySEO",
"datetime": "2023-02-06T20:01:22.000Z",
"verified": true,
"url": "/DivySEO/status/1622686912187142146",
"likes": "7",
"replies": "1",
"views": "775"
},
...
]
Above, we're using sync_playwright
to start a headless Chrome browser and page.goto
to navigate to the tweet page. Then we're waiting for the page to load using page.wait_for_selector()
and retrieve the final HTML source using page.content()
method.
To parse the final HTML we're using parsel.Selector
and CSS selectors and XPath to extract tweet details and replies. For this, we relied on Twitter's testing markup data-testid
attribute which can be used to easily find data elements of the tweets.
🧙♂️ Twitter is a bandwidth-intensive web application so when web scraping it using Playwright we should block unneeded resources like images, videos and analytic components. For that see how to block resources in playwright
Next, let's take a look at how we can use this approach to scrape Twitter search for popular tweets and Twitter users.
Twitter is known for its powerful search engine and it's a great place to find popular tweets and users.
To scrape Twitter search we'll use Playwright as well. Our process looks very similar to our previous scraper:
parsel.Selector
For this example, we'll cover two search endpoints:
For tweet parsing we'll be re-using parse_tweets()
function from our previous example. For people parsing we'll write a new but similar parse_profiles()
function:
from parsel import Selector
from playwright.sync_api import sync_playwright
from playwright.sync_api._generated import Page
def parse_profiles(sel: Selector):
    """
    Parse profile preview data from Twitter profile search.

    Args:
        sel: parsel Selector of a page containing
            <div data-testid="UserCell"> profile boxes.

    Returns:
        List of dicts with "name", "handle", "bio", "url" and "image" keys,
        one per user cell. "name" and "handle" are None when the matching
        link text is missing from a cell.
    """
    profiles = []
    for profile in sel.xpath("//div[@data-testid='UserCell']"):
        name = profile.xpath(".//a[not(@tabindex=-1)]//text()").get()
        handle = profile.xpath(".//a[@tabindex=-1]//text()").get()
        profiles.append(
            {
                # .get() returns None on no match, so strip only real text
                "name": name.strip() if name is not None else None,
                "handle": handle.strip() if handle is not None else None,
                "bio": "".join(profile.xpath("(.//div[@dir='auto'])[last()]//text()").getall()),
                "url": profile.xpath(".//a/@href").get(),
                "image": profile.xpath(".//img/@src").get(),
            }
        )
    return profiles
def scrape_top_search(query: str, page: Page):
    """
    Scrape top Twitter search page for featured tweets.

    Args:
        query: raw (not yet URL-encoded) search text.
        page: Playwright page used for navigation.

    Returns:
        Result of parse_tweets() on the rendered search page.
    """
    from urllib.parse import quote  # local import keeps the snippet self-contained

    # percent-encode the query so spaces, "#", "&" etc. cannot break the URL
    page.goto(f"https://twitter.com/search?q={quote(query, safe='')}&src=typed_query")
    page.wait_for_selector("//article[@data-testid='tweet']")  # wait for content to load
    tweets = parse_tweets(Selector(page.content()))
    return tweets
def scrape_people_search(query: str, page: Page):
    """
    Scrape people search Twitter page for related users.

    Args:
        query: raw (not yet URL-encoded) search text.
        page: Playwright page used for navigation.

    Returns:
        Result of parse_profiles() on the rendered people search page.
    """
    from urllib.parse import quote  # local import keeps the snippet self-contained

    # percent-encode the query so spaces, "#", "&" etc. cannot break the URL
    page.goto(f"https://twitter.com/search?q={quote(query, safe='')}&src=typed_query&f=user")
    page.wait_for_selector("//div[@data-testid='UserCell']")  # wait for content to load
    profiles = parse_profiles(Selector(page.content()))
    return profiles
# example run: scrape both search endpoints with a single browser tab
with sync_playwright() as pw:
    browser = pw.chromium.launch(headless=False)
    page = browser.new_page(viewport={"width": 1920, "height": 1080})
    top_tweet_search = scrape_top_search("google", page)
    people_tweet_search = scrape_people_search("google", page)
    # show the scraped data instead of silently discarding it
    print(top_tweet_search)
    print(people_tweet_search)
    # release the browser explicitly once scraping is done
    browser.close()
import json
from parsel import Selector
from playwright.sync_api import sync_playwright
from playwright.sync_api._generated import Page
from snippet1 import parse_tweets # we covered tweet parsing in previous code snippet!
from scrapfly import ScrapflyClient, ScrapeConfig
scrapfly = ScrapflyClient(key="YOUR SCRAPFLY KEY")
def parse_profiles(sel: Selector):
    """
    Parse profile preview data from Twitter profile search.

    Args:
        sel: parsel Selector of a page containing
            <div data-testid="UserCell"> profile boxes.

    Returns:
        List of dicts with "name", "handle", "bio", "url" and "image" keys,
        one per user cell. "name" and "handle" are None when the matching
        link text is missing from a cell.
    """
    profiles = []
    for profile in sel.xpath("//div[@data-testid='UserCell']"):
        name = profile.xpath(".//a[not(@tabindex=-1)]//text()").get()
        handle = profile.xpath(".//a[@tabindex=-1]//text()").get()
        profiles.append(
            {
                # .get() returns None on no match, so strip only real text
                "name": name.strip() if name is not None else None,
                "handle": handle.strip() if handle is not None else None,
                "bio": "".join(profile.xpath("(.//div[@dir='auto'])[last()]//text()").getall()),
                "url": profile.xpath(".//a/@href").get(),
                "image": profile.xpath(".//img/@src").get(),
            }
        )
    return profiles
def scrape_top_search(query: str):
    """
    Scrape top Twitter search page for featured tweets.

    Args:
        query: raw (not yet URL-encoded) search text.

    Returns:
        Result of parse_tweets() on the page rendered through ScrapFly.
    """
    from urllib.parse import quote  # local import keeps the snippet self-contained

    result = scrapfly.scrape(
        ScrapeConfig(
            # percent-encode the query so spaces, "#", "&" etc. cannot break the URL
            url=f"https://twitter.com/search?q={quote(query, safe='')}&src=typed_query",
            country="US",
            render_js=True,
        )
    )
    return parse_tweets(result.selector)
def scrape_people_search(query: str):
    """
    Scrape people search Twitter page for related users.

    Args:
        query: raw (not yet URL-encoded) search text.

    Returns:
        Result of parse_profiles() on the page rendered through ScrapFly.
    """
    from urllib.parse import quote  # local import keeps the snippet self-contained

    result = scrapfly.scrape(
        ScrapeConfig(
            # percent-encode the query so spaces, "#", "&" etc. cannot break the URL
            url=f"https://twitter.com/search?q={quote(query, safe='')}&src=typed_query&f=user",
            country="US",
            render_js=True,
        )
    )
    return parse_profiles(result.selector)
if __name__ == "__main__":
top_tweet_search = scrape_top_search("google")
print(json.dumps(top_tweet_search, indent=2))
people_tweet_search = scrape_people_search("google")
print(json.dumps(people_tweet_search, indent=2))
Top tweet search:
[
{
"text": "The best AI tools you need to know about!\n\n#ai #ChatGPT #Google",
"username": "Ishan Sharma",
"handle": "@Ishansharma7390",
"datetime": "2023-02-07T11:32:28.000Z",
"verified": false,
"url": "/Ishansharma7390/status/1622921232646635520",
"video": "https://video.twimg.com/ext_tw_video/1622921135833890817/pu/vid/720x1280/L5GkN9mSB8WPdxDz.mp4?tag=12",
"video_thumb": "https://pbs.twimg.com/ext_tw_video_thumb/1622921135833890817/pu/img/c2XpGEntF_wfHDXA.jpg",
"likes": "667",
"retweets": "171",
"replies": "26",
"views": "21982"
},
...
]
Related people/user search:
[
{
"name": "Google",
"handle": "@Google",
"bio": "#HeyGoogle",
"url": "/Google",
"image": "https://pbs.twimg.com/profile_images/1605297940242669568/q8-vPggS_normal.jpg"
},
{
"name": "Google AI",
"handle": "@GoogleAI",
"bio": "Google AI is focused on bringing the benefits of AI to everyone. In conducting and applying our research, we advance the state-of-the-art in many domains.",
"url": "/GoogleAI",
"image": "https://pbs.twimg.com/profile_images/993649592422907904/yD7LkqU2_normal.jpg"
},
...
]
Above, we use the same scraping technique we used before, except that this time we're also parsing user profiles. Again, we are relying on data-testid
attribute to find the data elements we need. It's a reliable way to parse Twitter's HTML as these attributes are used for testing purposes by Twitter developers.
Using Playwright to scrape is an accessible and easy way to scrape Tweet and profile data though browsers can be inefficient and slow. So, to scrape Twitter profile data next, we'll take a look at an alternative approach and scrape Twitter's internal graphql API.
To scrape profiles we'll be using Twitter's backend API which is powered by graphQL.
This approach is a bit more complicated than our Playwright approach but it's much faster. This will help us to retrieve thousands of Twitter profiles without wasting time on waiting for pages to load.
To start, let's take a look at how Twitter profile page works by using the browser's developer tools which can be accessed in most modern browsers using the F12
button. For our examples, we'll be using Chrome.
First thing to note is that when we're connecting to the profile page Twitter is making hundreds of backend API requests to generate the page. One of these requests is the GraphQL request that contains profile data:
One of these requests is being made to /graphql/lhB3zXD3M7e-VfBkR-5A8g/UserByScreenName
endpoint which returns a JSON response with profile data. However, this endpoint requires authentication headers that we need to generate first:
Authentication
requires Bearer
token which is hardcoded by Twitter so we can just copy it into our scraper.
x-guest-token
is an anonymous login identifier so we first must generate it by doing a soft registration request. We'll do this by sending a request to Twitter's API endpoint /1.1/guest/activate.json
which will return a JSON response with guest_token
:
import httpx
def get_guest_token(auth: str):
    """
    Register guest token for auth key.

    Args:
        auth: bearer token, accepted with or without the leading "Bearer "
            scheme prefix.

    Returns:
        The "guest_token" value of the JSON response from the
        /1.1/guest/activate.json endpoint.
    """
    # accept both "AAAA..." and "Bearer AAAA..." so that the authorization
    # header never ends up as "Bearer Bearer ..."
    bearer_prefix = "Bearer "
    if auth.startswith(bearer_prefix):
        auth = auth[len(bearer_prefix):]
    headers_pre = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "accept-encoding": "gzip",
        "accept-language": "en-US,en;q=0.5",
        "connection": "keep-alive",
        "authorization": f"Bearer {auth}",
    }
    # the headers (including authorization) must actually be sent with the request
    result = httpx.post("https://api.twitter.com/1.1/guest/activate.json", headers=headers_pre)
    guest_token = result.json()["guest_token"]  # '1622833653452804096'
    return guest_token
authorization = "Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw"
print(get_guest_token(authorization))
import json
from scrapfly import ScrapflyClient, ScrapeConfig
scrapfly = ScrapflyClient(key="YOUR SCRAPFLY KEY")
# raw token only: get_guest_token() prepends the "Bearer " scheme itself
AUTH_TOKEN = "AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw"
def get_guest_token():
    """
    Register guest token for auth key.

    Sends a POST request through the ScrapFly client to the
    /1.1/guest/activate.json endpoint with AUTH_TOKEN in the authorization
    header and returns the "guest_token" value of the JSON response.
    """
    activation_request = ScrapeConfig(
        url="https://api.twitter.com/1.1/guest/activate.json",
        headers={
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "accept-encoding": "gzip",
            "accept-language": "en-US,en;q=0.5",
            "connection": "keep-alive",
            "authorization": f"Bearer {AUTH_TOKEN}",
        },
        method="POST",
    )
    response = scrapfly.scrape(activation_request)
    # response body is JSON, e.g. {"guest_token": "1622833653452804096"}
    return json.loads(response.content)["guest_token"]
print(get_guest_token())
Now that we have our guest_token
we can use it to make a request to /graphql/lhB3zXD3M7e-VfBkR-5A8g/UserByScreenName
endpoint to get user profile data. Let's see how our full scraper would look now:
import httpx
AUTH_TOKEN = "AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw"
# create HTTP client with browser-like user-agent:
client = httpx.Client(headers={
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
})
def get_guest_token():
    """
    Register guest token for auth key.

    Sends a POST request with httpx to the /1.1/guest/activate.json endpoint
    with AUTH_TOKEN in the authorization header and returns the
    "guest_token" value of the JSON response.
    """
    activation_url = "https://api.twitter.com/1.1/guest/activate.json"
    request_headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "accept-encoding": "gzip",
        "accept-language": "en-US,en;q=0.5",
        "connection": "keep-alive",
        "authorization": f"Bearer {AUTH_TOKEN}",
    }
    response = httpx.post(activation_url, headers=request_headers)
    # response body is JSON, e.g. {"guest_token": "1622833653452804096"}
    return response.json()["guest_token"]
# get_guest_token() takes no arguments - it reads AUTH_TOKEN itself
GUEST_TOKEN = get_guest_token()
def scrape_user(handle: str):
    """
    Scrape a Twitter profile through the UserByScreenName graphql endpoint.

    The handle is placed into the pre-encoded "variables" query parameter.
    The request is sent with the shared httpx client, carrying AUTH_TOKEN and
    GUEST_TOKEN in its headers. Returns the ['data']['user']['result'] part
    of the JSON response.
    """
    profile_url = f"https://api.twitter.com/graphql/lhB3zXD3M7e-VfBkR-5A8g/UserByScreenName?variables=%7B%22screen_name%22%3A%22{handle}%22%2C%22withSafetyModeUserFields%22%3Atrue%2C%22withSuperFollowsUserFields%22%3Atrue%7D&features=%7B%22responsive_web_twitter_blue_verified_badge_is_enabled%22%3Atrue%2C%22responsive_web_graphql_exclude_directive_enabled%22%3Afalse%2C%22verified_phone_label_enabled%22%3Afalse%2C%22responsive_web_graphql_skip_user_profile_image_extensions_enabled%22%3Afalse%2C%22responsive_web_graphql_timeline_navigation_enabled%22%3Atrue%7D"
    request_headers = {
        "authority": "api.twitter.com",
        "authorization": f"Bearer {AUTH_TOKEN}",
        "content-type": "application/json",
        "origin": "https://twitter.com",
        "referer": "https://twitter.com/",
        "x-guest-token": GUEST_TOKEN,
        "x-twitter-active-user": "yes",
        "x-twitter-client-language": "en",
    }
    api_response = client.get(profile_url, headers=request_headers)
    payload = api_response.json()
    return payload['data']['user']['result']
# example use:
profile = scrape_user("Scrapfly_dev")
print(profile)
import json
from scrapfly import ScrapflyClient, ScrapeConfig
AUTH_TOKEN = "AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw"
scrapfly = ScrapflyClient(key="YOUR SCRAPFLY KEY")
def get_guest_token():
    """
    Register guest token for auth key.

    Sends a POST request through the ScrapFly client to the
    /1.1/guest/activate.json endpoint with AUTH_TOKEN in the authorization
    header and returns the "guest_token" value of the JSON response.
    """
    activation_request = ScrapeConfig(
        url="https://api.twitter.com/1.1/guest/activate.json",
        headers={
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "accept-encoding": "gzip",
            "accept-language": "en-US,en;q=0.5",
            "connection": "keep-alive",
            "authorization": f"Bearer {AUTH_TOKEN}",
        },
        method="POST",
    )
    response = scrapfly.scrape(activation_request)
    # response body is JSON, e.g. {"guest_token": "1622833653452804096"}
    return json.loads(response.content)["guest_token"]
GUEST_TOKEN = get_guest_token()
def scrape_user(handle: str):
    """
    Scrape a Twitter profile through the UserByScreenName graphql endpoint.

    The handle is placed into the pre-encoded "variables" query parameter.
    The request is sent through the ScrapFly client with the US country
    option, carrying AUTH_TOKEN and GUEST_TOKEN in its headers. Returns the
    ['data']['user']['result'] part of the JSON response.
    """
    profile_url = f"https://api.twitter.com/graphql/lhB3zXD3M7e-VfBkR-5A8g/UserByScreenName?variables=%7B%22screen_name%22%3A%22{handle}%22%2C%22withSafetyModeUserFields%22%3Atrue%2C%22withSuperFollowsUserFields%22%3Atrue%7D&features=%7B%22responsive_web_twitter_blue_verified_badge_is_enabled%22%3Atrue%2C%22responsive_web_graphql_exclude_directive_enabled%22%3Afalse%2C%22verified_phone_label_enabled%22%3Afalse%2C%22responsive_web_graphql_skip_user_profile_image_extensions_enabled%22%3Afalse%2C%22responsive_web_graphql_timeline_navigation_enabled%22%3Atrue%7D"
    request_headers = {
        "authority": "api.twitter.com",
        "authorization": f"Bearer {AUTH_TOKEN}",
        "content-type": "application/json",
        "origin": "https://twitter.com",
        "referer": "https://twitter.com/",
        "x-guest-token": GUEST_TOKEN,
        "x-twitter-active-user": "yes",
        "x-twitter-client-language": "en",
    }
    config = ScrapeConfig(url=profile_url, country="US", headers=request_headers)
    api_response = scrapfly.scrape(config)
    payload = json.loads(api_response.content)
    return payload['data']['user']['result']
# example use:
profile = scrape_user("Scrapfly_dev")
print(json.dumps(profile, indent=2))
{
"__typename": "User",
"id": "VXNlcjoxMzEwNjIzMDgxMzAwNDAyMTc4",
"rest_id": "1310623081300402178",
"affiliates_highlighted_label": {},
"is_blue_verified": false,
"legacy": {
"protected": false,
"created_at": "Mon Sep 28 16:51:22 +0000 2020",
"default_profile": true,
"default_profile_image": false,
"description": "Web Scraping API - turn any website into a database!\n\nScrapFly allows you to quickly achieve your data goals without web scraping challenges and errors.",
"entities": {
"description": {
"urls": []
},
"url": {
"urls": [
{
"display_url": "scrapfly.io",
"expanded_url": "https://scrapfly.io",
"url": "https://t.co/1Is3k6KzyM",
"indices": [
0,
23
]
}
]
}
},
"fast_followers_count": 0,
"favourites_count": 12,
"followers_count": 149,
"friends_count": 1000,
"has_custom_timelines": true,
"is_translator": false,
"listed_count": 2,
"location": "Paris",
"media_count": 1,
"name": "Scrapfly",
"normal_followers_count": 149,
"pinned_tweet_ids_str": [],
"possibly_sensitive": false,
"profile_banner_extensions": {
"mediaColor": {
"r": {
"ok": {
"palette": [
{
"percentage": 86.68,
"rgb": {
"blue": 255,
"green": 255,
"red": 255
}
},
{
"percentage": 13.19,
"rgb": {
"blue": 200,
"green": 147,
"red": 64
}
},
{
"percentage": 0.13,
"rgb": {
"blue": 163,
"green": 145,
"red": 135
}
}
]
}
}
}
},
"profile_banner_url": "https://pbs.twimg.com/profile_banners/1310623081300402178/1601320645",
"profile_image_extensions": {
"mediaColor": {
"r": {
"ok": {
"palette": [
{
"percentage": 86.04,
"rgb": {
"blue": 255,
"green": 255,
"red": 255
}
},
{
"percentage": 12.65,
"rgb": {
"blue": 242,
"green": 182,
"red": 104
}
},
{
"percentage": 1.32,
"rgb": {
"blue": 250,
"green": 182,
"red": 96
}
}
]
}
}
}
},
"profile_image_url_https": "https://pbs.twimg.com/profile_images/1310658795715076098/XedZDwC7_normal.jpg",
"profile_interstitial_type": "",
"screen_name": "Scrapfly_dev",
"statuses_count": 23,
"translator_type": "none",
"url": "https://t.co/1Is3k6KzyM",
"verified": false,
"withheld_in_countries": []
},
"has_nft_avatar": false,
"super_follow_eligible": false,
"super_followed_by": false,
"super_following": false,
"legacy_extended_profile": {},
"is_profile_translatable": false,
"verification_info": {},
"business_account": {}
}
Above, we used httpx
to establish an HTTP client and collect authentication and guest tokens which we later can use to scrape Twitter Graphql endpoints for user profile data. Note that data format, while extensive, is not very clean and requires some parsing to get the proper dataset. For more on JSON data cleanup see:
Introduction to using JMESPath to parse and restructure JSON datasets in web scraping and Python
If we start scraping Twitter at scale we start to quickly run into blocking as Twitter doesn't allow automated requests and will block a scraper's IP address after extended use.
To get around this we can use proxies and web scraping APIs such as ScrapFly.
ScrapFly service does the heavy lifting for you!
ScrapFly is essentially a HTTP proxy on steroids which provides web scraping abstraction features like:
And much more. To use ScrapFly with Python we can take advantage of Python SDK:
from scrapfly import ScrapflyClient, ScrapeConfig
scrapfly = ScrapflyClient(key="YOUR SCRAPFLY KEY")
result = scrapfly.scrape(ScrapeConfig(
"https://twitter.com/Scrapfly_dev",
# we can enable features like:
# cloud headless browser use
render_js=True,
# anti scraping protection bypass
asp=True,
# screenshot taking
screenshots={"all": "fullpage"},
# proxy country selection
country="US",
))
For more on using ScrapFly to scrape Twitter see the Full Scraper Code section.
To wrap up this Python Twitter scraper let's take a look at some frequently asked questions regarding web scraping Twitter:
Yes, all of the data on Twitter is available publicly so it's perfectly legal to scrape. However, note that some tweets can contain copyrighted material like images or videos and scraping them can be illegal (though scraping the URLs is perfectly fine).
Twitter is a complex javascript-heavy website and is hostile to web scraping so it's easy to get blocked. To avoid this you can use ScrapFly which provides anti scraping technology bypass and proxy rotation. Alternatively, see our article on how to avoid web scraper blocking.
Twitter uses a Graphql API for some of its endpoints which can be discovered using browser's devtools. To scrape these endpoints we need to generate guest tokens which then can be used to request graphql endpoints. For more see the profile scraping section
If you're using browser automation tools like Playwright (used in this article) then you can block images and unnecessary resources to save bandwidth and speed up scraping.
In this tutorial, we've taken a look at how to scrape data from Twitter using Python in a few different ways.
We've started with an easy approach of scraping tweets using Playwright - a browser automation library for Python. We used it to start a browser, navigate to the tweet page, wait for all of the complex javascript to load and then retrieve the fully rendered HTML. To parse the retrieved data we used CSS Selectors and XPath query languages.
Then, we've taken a look at an alternative approach of scraping Twitter's secret API endpoints using Python and httpx
. This required us to collect authentication tokens which we then could use for exclusive graphql endpoints to scrape Twitter user profiles. To parse scraped JSON data we used JMESPath to cleanup the overall user dataset.
Finally, to avoid blocking we've taken a look at ScrapFly web scraping API which provides a simple way to scrape Twitter at scale using proxies and anti scraping technology bypassing. Try out ScrapFly for free!
Here's the full Twitter web scraper code with ScrapFly integration we covered in this tutorial:
💙 This code should only be used as a reference. To scrape data from Twitter at scale you'll need some error handling, logging and retrying logic etc.
import json
import os
from parsel import Selector
from scrapfly import ScrapflyClient, ScrapeConfig
scrapfly = ScrapflyClient(key="YOUR SCRAPFLY KEY")
def parse_tweets(selector: Selector):
    """
    Parse tweets from pages containing tweets like:
    - tweet page
    - search page
    - reply page
    - homepage

    Every tweet is located through its <article data-testid="tweet"> box and
    its details are read mostly through data-testid attributes.

    Args:
        selector: parsel Selector built from the rendered page HTML.

    Returns:
        List of tweet dicts in page order (on a tweet page the 1st tweet is
        the main tweet and the rest are replies). Fields whose value was not
        found (None) are left out of each dict.
    """
    results = []
    # select all tweets on the page as individual boxes
    tweets = selector.xpath("//article[@data-testid='tweet']")
    for i, tweet in enumerate(tweets):
        # using data-testid attribute we can get tweet details:
        found = {
            "text": "".join(tweet.xpath(".//*[@data-testid='tweetText']//text()").getall()),
            "username": tweet.xpath(".//*[@data-testid='User-Names']/div[1]//text()").get(),
            "handle": tweet.xpath(".//*[@data-testid='User-Names']/div[2]//text()").get(),
            "datetime": tweet.xpath(".//time/@datetime").get(),
            "verified": bool(tweet.xpath(".//svg[@data-testid='icon-verified']")),
            "url": tweet.xpath(".//time/../@href").get(),
            "image": tweet.xpath(".//*[@data-testid='tweetPhoto']/img/@src").get(),
            "video": tweet.xpath(".//video/@src").get(),
            "video_thumb": tweet.xpath(".//video/@poster").get(),
            "likes": tweet.xpath(".//*[@data-testid='like']//text()").get(),
            "retweets": tweet.xpath(".//*[@data-testid='retweet']//text()").get(),
            "replies": tweet.xpath(".//*[@data-testid='reply']//text()").get(),
            # raw string so "\d" is a regex class, not a string escape
            "views": (tweet.xpath(".//*[contains(@aria-label,'Views')]").re(r"(\d+) Views") or [None])[0],
        }
        # main tweet (not a reply): its counters are looked up through
        # dedicated links/labels instead of the data-testid buttons
        if i == 0:
            main_tweet_queries = {
                "views": './/span[contains(text(),"Views")]/../preceding-sibling::div//text()',
                # the quote tweets link ("retweets/with_comments") also contains
                # "retweets", so it is excluded explicitly
                "retweets": './/a[contains(@href,"retweets") and not(contains(@href,"with_comments"))]//text()',
                "quote_tweets": './/a[contains(@href,"retweets/with_comments")]//text()',
                "likes": './/a[contains(@href,"likes")]//text()',
            }
            for key, query in main_tweet_queries.items():
                value = tweet.xpath(query).get()
                # keep the value found above when this lookup matches nothing
                # (e.g. the first tweet of a search page)
                if value is not None:
                    found[key] = value
        results.append({k: v for k, v in found.items() if v is not None})
    return results
def scrape_tweet(url: str):
    """
    Scrape tweet and replies from tweet page like:
    https://twitter.com/Scrapfly_dev/status/1587431468141318146

    The page is requested through the ScrapFly client with javascript
    rendering enabled and the US country option; the response selector is
    passed to parse_tweets().
    """
    config = ScrapeConfig(url=url, country="US", render_js=True)
    response = scrapfly.scrape(config)
    return parse_tweets(response.selector)
def parse_profiles(sel: Selector):
    """
    Parse profile preview data from Twitter profile search.

    Args:
        sel: parsel Selector of a page containing
            <div data-testid="UserCell"> profile boxes.

    Returns:
        List of dicts with "name", "handle", "bio", "url" and "image" keys,
        one per user cell. "name" and "handle" are None when the matching
        link text is missing from a cell.
    """
    profiles = []
    for profile in sel.xpath("//div[@data-testid='UserCell']"):
        name = profile.xpath(".//a[not(@tabindex=-1)]//text()").get()
        handle = profile.xpath(".//a[@tabindex=-1]//text()").get()
        profiles.append(
            {
                # .get() returns None on no match, so strip only real text
                "name": name.strip() if name is not None else None,
                "handle": handle.strip() if handle is not None else None,
                "bio": "".join(profile.xpath("(.//div[@dir='auto'])[last()]//text()").getall()),
                "url": profile.xpath(".//a/@href").get(),
                "image": profile.xpath(".//img/@src").get(),
            }
        )
    return profiles
# -------------------------------------
# SEARCH scraping
# -------------------------------------
def scrape_top_search(query: str):
    """
    Scrape top Twitter search page for featured tweets.

    Args:
        query: raw (not yet URL-encoded) search text.

    Returns:
        Result of parse_tweets() on the page rendered through ScrapFly.
    """
    from urllib.parse import quote  # local import keeps the snippet self-contained

    result = scrapfly.scrape(
        ScrapeConfig(
            # percent-encode the query so spaces, "#", "&" etc. cannot break the URL
            url=f"https://twitter.com/search?q={quote(query, safe='')}&src=typed_query",
            country="US",
            render_js=True,
        )
    )
    return parse_tweets(result.selector)
def scrape_people_search(query: str):
    """
    Scrape people search Twitter page for related users.

    Args:
        query: raw (not yet URL-encoded) search text.

    Returns:
        Result of parse_profiles() on the page rendered through ScrapFly.
    """
    from urllib.parse import quote  # local import keeps the snippet self-contained

    result = scrapfly.scrape(
        ScrapeConfig(
            # percent-encode the query so spaces, "#", "&" etc. cannot break the URL
            url=f"https://twitter.com/search?q={quote(query, safe='')}&src=typed_query&f=user",
            country="US",
            render_js=True,
        )
    )
    return parse_profiles(result.selector)
# -------------------------------------
# TWITTER graphql API scraping example:
# -------------------------------------
AUTH_TOKEN = (
"AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw"
)
def get_guest_token():
    """
    Register guest token for auth key.

    Sends a POST request through the ScrapFly client to the
    /1.1/guest/activate.json endpoint with AUTH_TOKEN in the authorization
    header and returns the "guest_token" value of the JSON response.
    """
    activation_request = ScrapeConfig(
        url="https://api.twitter.com/1.1/guest/activate.json",
        headers={
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "accept-encoding": "gzip",
            "accept-language": "en-US,en;q=0.5",
            "connection": "keep-alive",
            "authorization": f"Bearer {AUTH_TOKEN}",
        },
        method="POST",
    )
    response = scrapfly.scrape(activation_request)
    # response body is JSON, e.g. {"guest_token": "1622833653452804096"}
    return json.loads(response.content)["guest_token"]
GUEST_TOKEN = get_guest_token()
def scrape_user(handle: str):
    """
    Scrape a Twitter profile through the UserByScreenName graphql endpoint.

    The handle is placed into the pre-encoded "variables" query parameter.
    The request is sent through the ScrapFly client with the US country
    option, carrying AUTH_TOKEN and GUEST_TOKEN in its headers. Returns the
    ["data"]["user"]["result"] part of the JSON response.
    """
    profile_url = f"https://api.twitter.com/graphql/lhB3zXD3M7e-VfBkR-5A8g/UserByScreenName?variables=%7B%22screen_name%22%3A%22{handle}%22%2C%22withSafetyModeUserFields%22%3Atrue%2C%22withSuperFollowsUserFields%22%3Atrue%7D&features=%7B%22responsive_web_twitter_blue_verified_badge_is_enabled%22%3Atrue%2C%22responsive_web_graphql_exclude_directive_enabled%22%3Afalse%2C%22verified_phone_label_enabled%22%3Afalse%2C%22responsive_web_graphql_skip_user_profile_image_extensions_enabled%22%3Afalse%2C%22responsive_web_graphql_timeline_navigation_enabled%22%3Atrue%7D"
    request_headers = {
        "authority": "api.twitter.com",
        "authorization": f"Bearer {AUTH_TOKEN}",
        "content-type": "application/json",
        "origin": "https://twitter.com",
        "referer": "https://twitter.com/",
        "x-guest-token": GUEST_TOKEN,
        "x-twitter-active-user": "yes",
        "x-twitter-client-language": "en",
    }
    config = ScrapeConfig(url=profile_url, country="US", headers=request_headers)
    api_response = scrapfly.scrape(config)
    payload = json.loads(api_response.content)
    return payload["data"]["user"]["result"]
if __name__ == "__main__":
print(scrape_user("Scrapfly_dev"))
print(scrape_top_search("python"))
print(scrape_people_search("python"))
print(scrape_tweet("https://twitter.com/Scrapfly_dev/status/1587431468141318146"))