How to Scrape Zoominfo Company Data (2023 Update)
Guide on how to scrape Zoominfo - a public directory of company and employee data - using Python and popular libraries like httpx and parsel, plus techniques to avoid scraper blocking.
In this tutorial, we'll take a look at how to scrape Zoominfo for public company data.
We'll start with an overview of how Zoominfo.com works so we can find all public company pages. Then we'll scrape company data using Python with a few community packages.
If you're new to web scraping with Python, we recommend checking out our full introduction tutorial to web scraping with Python and common best practices.
Zoominfo.com hosts millions of public company profiles that contain company credentials, financial data and contacts. Company overview data can be used in business intelligence and market analysis. Company contact and employee details can be used in lead generation and the employment market.
For more on scraping use cases see our extensive web scraping use case article
In this tutorial we'll be using Python and a couple of popular community packages: httpx as our HTTP client and parsel for HTML parsing.
These packages can be easily installed via the pip command:
$ pip install httpx parsel
Alternatively, feel free to swap httpx out with any other HTTP client package, such as requests, as we'll only need basic HTTP functions which are almost interchangeable in every library. As for parsel, another great alternative is the beautifulsoup package.
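For example, here's a minimal sketch of the same kind of company page request using requests instead of httpx (the URL is the Tesla page we scrape below):

import requests

# fetch the company page; any non-200 response likely means we're being blocked
response = requests.get("https://www.zoominfo.com/c/tesla-inc/104333869")
assert response.status_code == 200
html = response.text  # this HTML can be parsed the same way as in the httpx examples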
To scrape a company profile listed on Zoominfo, let's first take a look at the company page itself. For example, let's see this page for Tesla Inc.: zoominfo.com/c/tesla-inc/104333869
The visible HTML is packed with data; however, instead of parsing it directly, we can take a look at the page source, where we can see that the data is embedded as quoted JSON in a script element:
So, instead of parsing the HTML let's pick up this JSON file directly:
import asyncio
import json

import httpx
from parsel import Selector


def _unescape_angular(text):
    """Helper function to unescape Angular quoted text"""
    ANGULAR_ESCAPE = {
        "&a;": "&",
        "&q;": '"',
        "&s;": "'",
        "&l;": "<",
        "&g;": ">",
    }
    for from_, to in ANGULAR_ESCAPE.items():
        text = text.replace(from_, to)
    return text


def parse_company(selector: Selector):
    """parse Zoominfo company page for company data"""
    data = selector.css("script#app-root-state::text").get()
    data = _unescape_angular(data)
    data = json.loads(data)["cd-pageData"]
    return data


async def scrape_company(url: str, session: httpx.AsyncClient) -> dict:
    """scrape zoominfo company page"""
    response = await session.get(url)
    assert response.status_code == 200, "request was blocked, see the avoid blocking section for more info"
    return parse_company(Selector(text=response.text, base_url=str(response.url)))


async def run():
    BASE_HEADERS = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "accept-language": "en-US,en;q=0.9",
        "accept-encoding": "gzip, deflate, br",
    }
    async with httpx.AsyncClient(
        limits=httpx.Limits(max_connections=5), timeout=httpx.Timeout(15.0), headers=BASE_HEADERS, http2=True
    ) as session:
        data = await scrape_company("https://www.zoominfo.com/c/tesla-inc/104333869", session=session)
        print(json.dumps(data, indent=2, ensure_ascii=False))


if __name__ == "__main__":
    asyncio.run(run())
{
"companyId": "104333869",
"url": "www.tesla.com",
"foundingYear": "2003",
"totalFundingAmount": "13763790",
"isPublic": "Public",
"name": "Tesla",
"names": [
"Tesla Inc",
"..."
],
"logo": "https://res.cloudinary.com/zoominfo-com/image/upload/w_70,h_70,c_fit/tesla.com",
"ticker": "NASDAQ: TSLA",
"website": "//www.tesla.com",
"displayLink": "www.tesla.com",
"revenue": "53823001",
"numberOfEmployees": "99290",
"fullName": "Tesla, Inc.",
"companyIds": [
"104333869",
"..."
],
"industries": [
{
"name": "Manufacturing",
"link": "/companies-search/industry-manufacturing",
"primary": true
},
"..."
],
"socialNetworkUrls": [
{
"socialNetworkType": "LINKED_IN",
"socialNetworkUrl": "https://www.linkedin.com/company/tesla-motors/"
},
"..."
],
"address": {
"street": "1 Tesla Road",
"city": "Austin",
"state": "Texas",
"country": "United States",
"zip": "78725"
},
"phone": "(512) 516-8177",
"techUsed": [
{
"id": 92112,
"name": "Microsoft SQL Server Reporting",
"logo": "https://storage.googleapis.com/datanyze-data//technologies/17480e9fd49bbff12f7c482210d0060cf8f97713.png",
"vendorFullName": "Microsoft Corporation",
"vendorDisplayName": "Microsoft",
"vendorId": 24904409
},
"..."
],
"techOwned": [],
"description": "Founded in 2003, Tesla is an electric vehicle and clean energy company that offers products including electric cars, battery energy storage from home to grid-scale, solar panels, solar roof tiles, and other related products and services.",
"competitors": [
{
"id": "407578600",
"name": "NIO",
"employees": 9834,
"revenue": "720117",
"logo": "https://res.cloudinary.com/zoominfo-com/image/upload/w_70,h_70,c_fit/nio.com",
"index": 0
},
"..."
],
"fundings": [
{
"amount": "2000000",
"date": "Feb 13, 2020",
"type": "Stock Issuance/Offering",
"investors": [
"Elon Musk",
"Larry Ellison"
]
},
"..."
],
"acquisitions": [],
"claimed": false,
"sic": [
"37",
"..."
],
"naics": [
"44",
"..."
],
"success": true,
"chartData": {
"chartEmployeeData": [
{
"date": "'21 - Q1",
"value": 45000000
},
"..."
],
"chartRevenueData": [
{
"date": "'21 - Q1",
"value": 24578000000
},
"..."
],
"twitter": [],
"facebook": []
},
"executives": {
"CEO": {
"personId": "3201848920",
"fullName": "Elon Musk",
"title": "Co-Founder & Chief Executive Officer",
"picture": "https://n.com.do/wp-content/uploads/2019/08/elon-musk-neuralink-portrait.jpg",
"personUrl": "/p/Elon-Musk/3201848920",
"orgChartTier": 1
},
"CFO": {
"personId": "3744260195",
"fullName": "Zachary Kirkhorn",
"title": "Master of Coin & Chief Financial Officer",
"picture": "https://img.etimg.com/thumb/msid-80476199,width-1200,height-900,imgsize-287780,overlay-economictimes/photo.jpg",
"personUrl": "/p/Zachary-Kirkhorn/3744260195",
"orgChartTier": 2
}
},
"orgChart": {
"title": "Tesla's Org Chart",
"btnContent": "See Full Org Chart",
"personCardActions": {
"nameAction": "OrgChartContact",
"imageAction": "OrgChartContact",
"emailAction": "OrgChartContactInfo",
"phoneAction": "OrgChartContactInfo"
},
"firstTier": {
"personId": "3201848920",
"fullName": "Elon Musk",
"title": "Co-Founder & Chief Executive Officer",
"picture": "https://n.com.do/wp-content/uploads/2019/08/elon-musk-neuralink-portrait.jpg",
"personUrl": "/p/Elon-Musk/3201848920",
"orgChartTier": 1
},
"secondTier": [
{
"personId": "3744260195",
"fullName": "Zachary Kirkhorn",
"title": "Master of Coin & Chief Financial Officer",
"picture": "https://img.etimg.com/thumb/msid-80476199,width-1200,height-900,imgsize-287780,overlay-economictimes/photo.jpg",
"personUrl": "/p/Zachary-Kirkhorn/3744260195",
"orgChartTier": 2
},
"..."
]
},
"pic": [
{
"personId": "-2033294111",
"fullName": "Emmanuelle Stewart",
"title": "Deputy General Counsel",
"picture": "",
"personUrl": "/p/Emmanuelle-Stewart/-2033294111",
"orgChartTier": 3
},
"..."
],
"ceo": {
"personId": "3201848920",
"fullName": "Elon Musk",
"title": "Co-Founder & Chief Executive Officer",
"picture": "https://n.com.do/wp-content/uploads/2019/08/elon-musk-neuralink-portrait.jpg",
"personUrl": "/p/Elon-Musk/3201848920",
"orgChartTier": 1,
"rating": {
"great": 18,
"good": 1,
"ok": 1,
"bad": 2
},
"company": {
"name": "Tesla",
"id": "104333869",
"country": "US",
"logo": "https://res.cloudinary.com/zoominfo-com/image/upload/w_70,h_70,c_fit/tesla.com",
"fullName": "Tesla, Inc.",
"claimed": false,
"domain": "www.tesla.com",
"numberOfEmployees": "99290",
"industries": [
{
"name": "Manufacturing",
"link": "/companies-search/industry-manufacturing",
"primary": true
},
"..."
],
"address": {
"street": "1 Tesla Road",
"city": "Austin",
"state": "Texas",
"country": "United States",
"zip": "78725"
}
}
},
"newsFeed": [
{
"url": "https://www.inferse.com/152631/tesla-issues-another-over-the-air-recall-on-a-small-number-of-cars-in-the-us-electrek/",
"title": "Tesla issues another over-the-air recall on a small number of cars in the US - Inferse.com",
"content": "March 25 Fred Lambert - Mar. 25th 2022 11:23 am PT",
"date": "2022-07-18T02:09:11Z",
"domain": "www.inferse.com",
"isComparablyNews": false
},
"..."
],
"user": {
"country": "US"
},
"emailPatterns": [
{
"value": "tesla.com",
"rank": 0,
"rawpatternstring": "0:tesla.com:0.61:0.61:0.98:25574",
"sampleemail": "JSmith@tesla.com",
"usagePercentage": 59.8,
"format": "first initials + last"
},
"..."
]
}
We can see how incredibly short, efficient and simple our Zoominfo scraper is using this approach!
Now that we know how to scrape a single company's page let's take a look at how to find company page URLs so we can collect all of the public company data from Zoominfo.
Unfortunately, Zoominfo doesn't provide a publicly accessible sitemap directory as many other websites do. So, we either need to explore directories by location/industry or search companies by name. Let's take a look at two of these discovery techniques.
Zoominfo.com has public company directory pages for many locations or industry types. However, these directories are limited to 100 results (5 pages) per query. For example, to find "software companies in Los Angeles" we could use this directory page:
zoominfo.com/companies-search/location-usa--california--los-angeles-industry-software
Picking up the first 100 results from each directory page can give us a good amount of results, and it's an easy scrape:
from typing import List
from urllib.parse import urljoin

import httpx
from parsel import Selector


def scrape_directory(url: str, scrape_pagination=True) -> List[str]:
    """Scrape Zoominfo directory page"""
    response = httpx.get(url)
    assert response.status_code == 200  # check whether we're blocked
    # parse first page of the results
    selector = Selector(text=response.text, base_url=url)
    companies = selector.css("div.tableRow_companyName_nameAndLink>a::attr(href)").getall()
    # parse other pages of the results
    if scrape_pagination:
        other_pages = selector.css("div.pagination>a::attr(href)").getall()
        for page_url in other_pages:
            # pagination links can be relative, so join them with the base URL
            companies.extend(scrape_directory(urljoin(url, page_url), scrape_pagination=False))
    return companies


print(scrape_directory("https://www.zoominfo.com/companies-search/location-usa--california--los-angeles-industry-software"))
In our short scraper above, we pick up all 5 pages of the directory. To extend this, we can employ a crawling technique: exploring the related companies of each company we scrape. If we take a look at the dataset we scraped before, we can see that each company page contains a list of up to six competing companies:
"competitors": [
{
"id": "407578600",
"name": "NIO",
"employees": 9834,
"revenue": "720117",
"logo": "https://res.cloudinary.com/zoominfo-com/image/upload/w_70,h_70,c_fit/nio.com",
"index": 0
},
"..."
],
So, by scraping all companies available in the directories and then their competitors, we can reach pretty high coverage rates. This approach is generally referred to as crawling: we start with one or a few URLs and, by scraping those, acquire more URLs to follow, as shown in the sketch below.
Our Zoominfo crawler has decent discovery coverage by combining these two techniques, even with the paging restriction of 5 pages per directory.
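To illustrate, here's a minimal crawling sketch that reuses the scrape_company function from our first example. Note that the /c/<company-name>/<company-id> URL format for competitor pages is an assumption based on the company pages we've seen:

import asyncio
import httpx


async def crawl_competitors(start_url: str, session: httpx.AsyncClient, max_pages: int = 20) -> dict:
    """sketch: breadth-first crawl of company pages through their competitor lists"""
    seen, queue, results = set(), [start_url], {}
    while queue and len(results) < max_pages:
        url = queue.pop(0)
        if url in seen:
            continue
        seen.add(url)
        # scrape_company is the httpx-based function from our first example
        company = await scrape_company(url, session=session)
        results[company["companyId"]] = company
        # queue up each competitor page; the URL shape is assumed from observed pages
        for competitor in company.get("competitors", []):
            slug = competitor["name"].lower().replace(" ", "-")
            queue.append(f"https://www.zoominfo.com/c/{slug}/{competitor['id']}")
    return results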
For more on crawling see our full introduction article that covers crawling in great detail and provides real code examples
To complement this, let's take a look at how we can use the search system to find more company pages next.
Zoominfo also offers a quick search system that returns up to 3 results for a given search query:
So, if we have a collection of company names, we can find their Zoominfo pages using this search:
import asyncio
import json

import httpx


async def search(query: str, session: httpx.AsyncClient):
    """search Zoominfo's quick search endpoint for people and companies"""
    url = "https://directory-api.zoominfo.com/api/zoominfo/dual-quick-search"
    resp = await session.post(url, json={"searchString": query})
    data = resp.json()
    return data


async def run():
    BASE_HEADERS = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "accept-language": "en-US,en;q=0.9",
        "accept-encoding": "gzip, deflate, br",
    }
    async with httpx.AsyncClient(
        limits=httpx.Limits(max_connections=5), timeout=httpx.Timeout(15.0), headers=BASE_HEADERS, http2=True
    ) as session:
        return await search("tesla", session=session)


if __name__ == "__main__":
    print(json.dumps(asyncio.run(run()), indent=2, ensure_ascii=False))
{
"people": [
{
"id": "3201848920",
"name": {
"first": "Elon",
"last": "Musk"
},
"picture": "https://n.com.do/wp-content/uploads/2019/08/elon-musk-neuralink-portrait.jpg",
"jobTitle": "Co-Founder & Chief Executive Officer",
"companyName": "Tesla"
},
{
"id": "3744260195",
"name": {
"first": "Zachary",
"last": "Kirkhorn"
},
"picture": "https://img.etimg.com/thumb/msid-80476199,width-1200,height-900,imgsize-287780,overlay-economictimes/photo.jpg",
"jobTitle": "Master of Coin & Chief Financial Officer",
"companyName": "Tesla"
},
{
"id": "7719115377",
"name": {
"first": "Alisher",
"last": "Valikhanov"
},
"picture": "",
"jobTitle": "Chief Executive Officer",
"companyName": "Tesla-TAN"
}
],
"companies": [
{
"id": "104333869",
"name": "Tesla",
"url": "www.tesla.com",
"headquarters": {
"city": "Austin",
"state": "Texas",
"country": "United States"
}
},
{
"id": "430439652",
"name": "Tesla-TAN",
"url": "www.teslatan.kz",
"headquarters": {
"city": "Atyrau",
"state": "Atyrau",
"country": "Kazakhstan"
}
},
{
"id": "112033901",
"name": "TESLA ENGINEERING",
"url": "www.tesla.co.uk",
"headquarters": {
"city": "Storrington",
"state": "West Sussex",
"country": "United Kingdom"
}
}
],
"success": true
}
Using Zoominfo's search endpoint we can find company pages as long as we know their names, and there are plenty of public company databases out there to help us out.
One source of company names could be Zoominfo's competitor Crunchbase.com, which we've covered in a recent tutorial. Crunchbase provides a company sitemap which we can use as a company name dataset.
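For example, here's a minimal, hypothetical helper that turns the search response shown above into scrapeable company page URLs; again, the /c/<company-name>/<company-id> URL format is an assumption based on the pages we've scraped:

from typing import Dict, List


def search_to_company_urls(search_data: Dict) -> List[str]:
    """sketch: convert a dual-quick-search response into company page URLs"""
    urls = []
    for company in search_data.get("companies", []):
        # slugify the company name the way Zoominfo page URLs appear to be formatted
        slug = company["name"].lower().replace(" ", "-")
        urls.append(f"https://www.zoominfo.com/c/{slug}/{company['id']}")
    return urls


# e.g. search_to_company_urls(await search("tesla", session=session))
# -> ['https://www.zoominfo.com/c/tesla/104333869', ...]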
We looked at how to scrape Zoominfo.com, though this target, in particular, is known for using multiple anti web scraping technologies which can get in the way of collecting the public data available on Zoominfo.
To get around this, let's take advantage of the ScrapFly API, which can avoid all of these blocks for us with just a few extra lines of Python code!
ScrapFly offers several powerful features that'll help us get around web scraper blocking, such as its Anti Scraping Protection bypass and residential proxy pools.
For this, we'll be using the scrapfly-sdk python package. To start, let's install scrapfly-sdk using pip:
$ pip install scrapfly-sdk
To take advantage of ScrapFly's API in our Zoominfo web scraper, all we need to do is replace our httpx session code with scrapfly-sdk client requests.
For scraping Zoominfo we'll be using the Anti Scraping Protection bypass feature, which can be enabled via the asp=True argument.
For example, let's take a look at how we can use ScrapFly to scrape a single company page:
from scrapfly import ScrapflyClient, ScrapeConfig

client = ScrapflyClient(key='YOUR_SCRAPFLY_KEY')
result = client.scrape(ScrapeConfig(
    url="https://www.zoominfo.com/c/tesla-inc/104333869",
    # we need to enable Anti Scraping Protection bypass with a keyword argument:
    asp=True,
))
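The scraped result can then be parsed just like before. Here's a minimal sketch that reuses the _unescape_angular helper from our first example through the SDK's built-in parsel selector:

import json

# result.selector is a parsel Selector built from the scraped page HTML
data = result.selector.css("script#app-root-state::text").get()
data = _unescape_angular(data)  # helper function from the first code example
company = json.loads(data)["cd-pageData"]
print(company["name"])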
For more, see the Full Scraper Code section.
To wrap this guide up let's take a look at some frequently asked questions about web scraping Zoominfo.com:
Yes. Data displayed on Zoominfo is publicly available, and we're not extracting anything private. Scraping Zoominfo.com at slow, respectful rates would fall under the ethical scraping definition.
That being said, attention should be paid to GDPR compliance in the EU when scraping personal data. For more, see our Is Web Scraping Legal? article.
In this tutorial, we built a Zoominfo.com company data scraper. We've taken a look at how to scrape company pages by extracting embedded state data rather than parsing the HTML. We also took a look at how to find company pages using either Zoominfo's directory pages or its search system.
For this, we used Python with a few community packages like httpx and parsel, and to prevent being blocked we used ScrapFly's API, which smartly configures every web scraper connection to avoid blocking. For more on ScrapFly see our documentation and try it out for free!
Let's put everything together: finding companies using search and scraping their info with ScrapFly integration:
import asyncio
import json
from pathlib import Path
from typing import Dict, List
from urllib.parse import urljoin

from scrapfly import ScrapeApiResponse, ScrapeConfig, ScrapflyClient

scrapfly = ScrapflyClient(key="YOUR SCRAPFLY KEY")


async def scrape_search(query: str) -> Dict:
    """scrape Zoominfo quick search for people and companies matching a query"""
    url = "https://directory-api.zoominfo.com/api/zoominfo/dual-quick-search"
    result = await scrapfly.async_scrape(
        ScrapeConfig(
            url=url,
            method="POST",
            data={"searchString": query},
            asp=True,
        )
    )
    data = json.loads(result.content)
    return data


async def scrape_directory(url: str) -> List[str]:
    """Scrape Zoominfo directory page"""

    def parse_directory(result):
        companies = result.selector.css("div.tableRow_companyName_nameAndLink>a::attr(href)").getall()
        return companies

    # parse first page of the results:
    first_page = await scrapfly.async_scrape(
        ScrapeConfig(url, asp=True, country="US", proxy_pool="public_residential_pool")
    )
    companies = parse_directory(first_page)
    # then parse remaining 4 pages concurrently:
    other_pages = first_page.selector.css("div.pagination>a::attr(href)").getall()
    other_pages = [ScrapeConfig(urljoin(url, page), asp=True, country="US") for page in other_pages]
    async for result in scrapfly.concurrent_scrape(other_pages):
        companies.extend(parse_directory(result))
    return companies


def _unescape_angular(text: str) -> str:
    """Helper function to unescape Angular quoted text"""
    ANGULAR_ESCAPE = {
        "&a;": "&",
        "&q;": '"',
        "&s;": "'",
        "&l;": "<",
        "&g;": ">",
    }
    for from_, to in ANGULAR_ESCAPE.items():
        text = text.replace(from_, to)
    return text


def parse_company(result: ScrapeApiResponse) -> Dict:
    """parse Zoominfo company page for company data"""
    data = result.selector.css("script#app-root-state::text").get()
    data = _unescape_angular(data)
    data = json.loads(data)["cd-pageData"]
    return data


async def scrape_companies(urls: List[str]) -> List[Dict]:
    """scrape multiple Zoominfo company pages concurrently"""
    to_scrape = [ScrapeConfig(url, country="US", asp=True) for url in urls]
    companies = []
    async for result in scrapfly.concurrent_scrape(to_scrape):
        companies.append(parse_company(result))
    return companies


async def example_run():
    out = Path(__file__).parent / "results"
    out.mkdir(exist_ok=True)
    # scrape search:
    result_search = await scrape_search("tesla")
    out.joinpath("search-tesla.json").write_text(json.dumps(result_search, indent=2))
    # scrape company directory:
    result_search_directory = await scrape_directory(
        "https://www.zoominfo.com/companies-search/location-usa--california--los-angeles-industry-software"
    )
    out.joinpath("search-directory.json").write_text(json.dumps(result_search_directory, indent=2))
    # scrape company data:
    result_company = await scrape_companies(["https://www.zoominfo.com/c/tesla-inc/104333869"])
    out.joinpath("company-tesla.json").write_text(json.dumps(result_company, indent=2))

    # we can also scrape competitors by building their page URLs:
    def urlfy_name(name: str) -> str:
        return name.lower().replace(" ", "-")

    competitor_urls = [
        f'https://www.zoominfo.com/c/{urlfy_name(competitor["name"])}/{competitor["id"]}'
        for competitor in result_company[0]["competitors"]
    ]
    result_competitors = await scrape_companies(competitor_urls)
    out.joinpath("company-tesla-competitors.json").write_text(json.dumps(result_competitors, indent=2))


if __name__ == "__main__":
    asyncio.run(example_run())