A Detailed Beautiful Soup Tutorial for Python Web Scraping
easy_install beautifulsoup4
or
pip install beautifulsoup4
or, from a downloaded source package:
sudo python setup.py install
To install the optional third-party parsers:
pip install lxml
pip install html5lib
| Parser | Typical usage | Advantages | Disadvantages |
| --- | --- | --- | --- |
| Python standard library | BeautifulSoup(markup, "html.parser") | Built into Python; moderate speed; tolerant of malformed documents | Less tolerant in older Python versions (before 2.7.3 / 3.2.2) |
| lxml HTML parser | BeautifulSoup(markup, "lxml") | Very fast; tolerant of malformed documents | Requires the lxml C library |
| lxml XML parser | BeautifulSoup(markup, ["lxml", "xml"]) or BeautifulSoup(markup, "xml") | Very fast; the only supported XML parser | Requires the lxml C library |
| html5lib | BeautifulSoup(markup, "html5lib") | Most tolerant; parses pages the same way a browser does; produces valid HTML5 | Very slow; depends on an external Python package |
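The code below operates on a small sample page. The original text never shows the document being parsed, so the following is a reconstruction assumed from the outputs quoted later (a variant of the "three sisters" page used in the official Beautiful Soup documentation, with a "Hello lidihuo" title and xxx.com links):

html = """
<html><head><title>Hello lidihuo</title></head>
<body><p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://xxx.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://xxx.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://xxx.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>
"""

Any parser from the table above can be passed as the second argument to BeautifulSoup; the snippets below use "html.parser" unless noted otherwise.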
# Import the bs4 library
from bs4 import BeautifulSoup
# Create a BeautifulSoup object from the sample html string
soup = BeautifulSoup(html, "html.parser")
# Pretty-print the parsed document
print(soup.prettify())
Tag
NavigableString
BeautifulSoup
Comment
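These are the four kinds of objects Beautiful Soup uses to represent a parsed document. A minimal sketch of how to see each type, using the sample document above (whose first <a> tag wraps an HTML comment rather than visible text):

print(type(soup))             # <class 'bs4.BeautifulSoup'>
print(type(soup.p))           # <class 'bs4.element.Tag'>
print(type(soup.p.b.string))  # <class 'bs4.element.NavigableString'>
print(type(soup.a.string))    # <class 'bs4.element.Comment'>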
# For example, a Tag is an HTML element such as <h1>Beautiful Soup for Python Web Scraping</h1>
# Import the bs4 library
from bs4 import BeautifulSoup
# Parse a fragment containing that Tag
h1_soup = BeautifulSoup("<h1>Beautiful Soup for Python Web Scraping</h1>", "html.parser")
# Print the h1 Tag
print(h1_soup.h1)
# <h1>Beautiful Soup for Python Web Scraping</h1>
print(soup.name)
# [document]
print(soup.head.name)
# head
print(soup.p.attrs)
#{'class': ['title'], 'name': 'dromouse'}
print(soup.p['class'])
# ['title']
print(soup.p.get('class'))
# ['title']
soup.p['class'] = "newClass"
print(soup.p)
# <p class="newClass" name="dromouse"><b>The Dormouse's story</b></p>
del soup.p['class']
print(soup.p)
# <p name="dromouse"><b>The Dormouse's story</b></p>
print(soup.p.string)
# The Dormouse's story
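One practical note (not in the original text): indexing a missing attribute with tag['...'] raises a KeyError, while tag.get('...') returns None or a supplied default, which is usually safer on messy pages:

print(soup.p.get('id'))           # None - this <p> has no id attribute
print(soup.p.get('id', 'no-id'))  # no-id - a fallback value can be supplied
# soup.p['id']                    # would raise KeyError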
print(type(soup.name))
# <class 'str'>
print(soup.name)
# [document]
print(soup.attrs)
# {} - an empty dict
# The first <a> tag in the sample document contains an HTML comment
print(soup.a)
print(soup.a.string)
print(type(soup.a.string))
# <class 'bs4.element.Comment'>
print(soup.head.contents)
# [<title>Hello lidihuo</title>]
print(soup.head.contents[0])
# <title>Hello lidihuo</title>
print(soup.head.children)
# <list_iterator object at 0x7f71457f5710>
for child in soup.body.children:
    print(child)
print(soup.head.string)
# Hello lidihuo
print(soup.title.string)
# Hello lidihuo
print(soup.html.string)
# None - <html> has more than one child, so .string is None
for string in soup.strings:
    print(repr(string))
for string in soup.stripped_strings:
    print(repr(string))
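As a quick check (a sketch, assuming the sample document above): .strings yields every string in the document, including whitespace-only ones, while .stripped_strings skips pure whitespace and trims the rest:

all_strings = list(soup.strings)
clean_strings = list(soup.stripped_strings)
print(len(all_strings) > len(clean_strings))  # True - the whitespace-only strings were dropped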
content = soup.head.title.string
for parent in content.parents:
    print(parent.name)
print(soup.p.next_sibling)
# This is actually whitespace: the newline between the two <p> tags
print(soup.p.previous_sibling)
# If a node has no previous sibling, None is returned
print(soup.p.next_sibling.next_sibling)
# The node after the whitespace: the second <p> tag
for sibling in soup.a.next_siblings:
    print(repr(sibling))
# Recall that the document starts with <head><title>Hello lidihuo</title></head>
print(soup.head.next_element)
# <title>Hello lidihuo</title>
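The difference between .next_sibling and .next_element is worth a short illustration (a sketch based on the sample document above): .next_sibling stays on the same level of the tree, while .next_element follows parse order and therefore descends into a tag's children:

print(repr(soup.head.next_sibling))  # '\n' - the whitespace between </head> and <body>
print(soup.head.next_element)        # <title>Hello lidihuo</title> - the first node inside <head>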
# last_a_tag is assumed to be the last <a> tag in the document, for example:
last_a_tag = soup.find_all("a")[-1]
for element in last_a_tag.next_elements:
    print(repr(element))
soup.find_all('b')
soup.find_all(["a", "b"])
for tag in soup.find_all(True):
    print(tag.name)
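With the sample document reconstructed above, this loop prints the tag names in document order (an expectation based on that reconstruction):

# html, head, title, body, p, b, p, a, a, a, p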
def has_class_but_no_id(tag):
    return tag.has_attr('class') and not tag.has_attr('id')

soup.find_all(has_class_but_no_id)
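Any callable that takes a tag and returns True or False can act as a filter, so the same query can be written inline with a lambda (a small sketch, equivalent to the named function above):

soup.find_all(lambda tag: tag.has_attr('class') and not tag.has_attr('id'))
# On the sample document this matches tags like <p class="story"> that carry a class but no id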
import re

soup.find_all(id='link2')
soup.find_all(href=re.compile("elsie"))
soup.find_all("a", class_="sister")
data_soup = BeautifulSoup('<div data-foo="value">foo!</div>', "html.parser")
# data_soup.find_all(data-foo="value") raises a SyntaxError, because a Python
# keyword argument cannot contain a hyphen; pass such attributes via attrs instead
data_soup.find_all(attrs={"data-foo": "value"})
soup.find_all(text="Elsie")
soup.find_all(text=["Tillie", "Elsie", "Lacie"])
soup.find_all(text=re.compile("Dormouse"))
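Note that the text argument matches and returns the strings themselves (NavigableString objects), not the tags containing them. A quick check, assuming the sample document and the re import from above:

results = soup.find_all(text=re.compile("sisters"))
print(type(results[0]))
# <class 'bs4.element.NavigableString'>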
soup.find_all("a", limit=2)
soup.html.find_all("title")
soup.html.find_all("title", recursive=False)
# Find by tag name
print(soup.select('title'))
print(soup.select('a'))
print(soup.select('b'))
# Find by class name
print(soup.select('.sister'))
# Find by id
print(soup.select('#link1'))
# Combined selectors work the same way as in a CSS file: tag names, class names and ids
# can be chained. For example, to find the element with id link1 inside a <p> tag,
# separate the two parts with a space
print(soup.select('p #link1'))
# Direct-child selector
print(soup.select("head > title"))
# Attribute selectors: the attribute goes in square brackets. The attribute and the tag
# name belong to the same node, so there must be no space between them, otherwise
# nothing will match
print(soup.select('a[class="sister"]'))
print(soup.select('a[href="http://xxx.com/elsie"]'))
# Attribute selectors can be combined with the forms above: use a space between parts
# on different nodes, and no space within the same node
print(soup.select('p a[href="http://xxx.com/elsie"]'))
soup = BeautifulSoup(html, 'lxml')
print(type(soup.select('title')))
print(soup.select('title')[0].get_text())
for title in soup.select('title'):
    print(title.get_text())
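To tie things together, a short closing sketch (an illustration, not part of the original text) that pulls every link out of the sample document, which is the usual end goal when scraping a page:

for a in soup.select('a'):
    # Print each link's visible text and its href attribute
    print(a.get_text(strip=True), a.get('href'))
# With the reconstructed document this prints the three sister links hosted at xxx.com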