When web scraping, the values we want are sometimes located between two known HTML elements.
To extract them, we can use BeautifulSoup's find_all() and find_next_siblings() methods:
import bs4
# Parse the sample document. The parser is named explicitly so the result does
# not depend on which optional parsers (lxml, html5lib) happen to be installed,
# and so bs4 does not emit its "no parser was explicitly specified" warning.
soup = bs4.BeautifulSoup(
    """
<h2>heading 1</h2>
<p>paragraph 1</p>
<p>paragraph 2</p>
<h2>heading 2</h2>
<p>paragraph 3</p>
<p>paragraph 4</p>
""",
    "html.parser",
)

# Maps each separator's text to the texts of the elements that follow it,
# up to (but not including) the next separator.
blocks = {}
for heading in soup.find_all("h2"):  # find separators, in this case h2 nodes
    values = []
    # Walk the following sibling tags until the next separator is encountered.
    for sibling in heading.find_next_siblings():
        if sibling.name == "h2":
            break
        values.append(sibling.text)
    blocks[heading.text] = values
print(blocks)
{
'heading 1': ['paragraph 1', 'paragraph 2'],
'heading 2': ['paragraph 3', 'paragraph 4']
}