例子1:循环遍历所有的<a>标签
from lxml import html

# Sample HTML document held in a string.
html_content = """
<html>
<body>
<a href="link1.html">Link 1</a>
<a href="link2.html">Link 2</a>
<a href="link3.html">Link 3</a>
</body>
</html>
"""

# Parse the HTML string into an element tree.
tree = html.fromstring(html_content)

# Use an XPath expression to select every <a> element in the document.
a_elements = tree.xpath('//a')

# Walk the <a> elements and print each one's href and text.
for a in a_elements:
    href = a.get('href')  # None when the attribute is missing
    # a.text is None for an empty element or one that starts with a child
    # element, so fall back to "" before stripping surrounding whitespace.
    text = (a.text or '').strip()
    print(f"Href: {href}, Text: {text}")
例子2:循环遍历具有特定类的<div>标签
from lxml import html

# Sample HTML document held in a string.
html_content = """
<html>
<body>
<div class="my-class">Div 1</div>
<div class="other-class">Div 2</div>
<div class="my-class">Div 3</div>
</body>
</html>
"""

# Parse the HTML string into an element tree.
tree = html.fromstring(html_content)

# Select every <div> whose class attribute is exactly "my-class".
# NOTE: this is a string comparison on the whole attribute value, so an
# element such as class="my-class extra" is not matched.
div_elements = tree.xpath('//div[@class="my-class"]')

# Walk the matching <div> elements and print their text.
for div in div_elements:
    # div.text is None for an empty element or one that starts with a child
    # element, so fall back to "" before stripping surrounding whitespace.
    text = (div.text or '').strip()
    print(f"Text: {text}")
例子3:循环遍历嵌套在特定<div>中的<a>标签
from lxml import html

# Sample HTML document held in a string.
html_content = """
<html>
<body>
<div id="content">
<a href="link1.html">Link 1</a>
<p>Some text</p>
<a href="link2.html">Link 2</a>
</div>
</body>
</html>
"""

# Parse the HTML string into an element tree.
tree = html.fromstring(html_content)

# First locate the <div> whose id is "content". xpath() returns a list, so
# guard against an empty result instead of indexing [0] unconditionally.
content_matches = tree.xpath('//div[@id="content"]')
content_div = content_matches[0] if content_matches else None

# Then find every <a> element inside that <div>. Compare against None
# explicitly: the truth value of an element is not a reliable "found" test.
a_elements = content_div.xpath('.//a') if content_div is not None else []

# Walk those <a> elements and print each one's href and text.
for a in a_elements:
    href = a.get('href')  # None when the attribute is missing
    # a.text is None for an empty element or one that starts with a child
    # element, so fall back to "" before stripping surrounding whitespace.
    text = (a.text or '').strip()
    print(f"Href: {href}, Text: {text}")
在上面的例子中,.//a 是一个相对XPath表达式,它表示在当前元素(即content_div)的后代中查找所有的<a>标签。