一、bs4的安装与使用
# Install the parser:           pip3 install lxml
# Install the parsing library:  pip3 install bs4

from bs4 import BeautifulSoup

# Sample document that is handed to the parser below.
html_doc = """The Dormouse's story $37
Once upon a time there were three little sisters; and their names wereElsie,Lacie andTillie;and they lived at the bottom of a well.
...
"""

# Alternative: the parser that ships with Python.
# soup = BeautifulSoup(html_doc, 'html.parser')

# Build the soup object with the lxml parser.
soup = BeautifulSoup(markup=html_doc, features='lxml')

# The parsed document itself.
print(soup)

# The type of the soup object.
print(type(soup))

# Pretty-printed rendering of the parsed document.
html = soup.prettify()
print(html)
二、bs4解析库之遍历文档树
from bs4 import BeautifulSoup

# Sample document for the navigation demo.  The markup is spelled out in full
# because every step below addresses real <html>, <p>, <b> and <a> elements;
# on tag-less text ``soup.a`` / ``soup.b`` are None and the attribute accesses
# that follow (``soup.a.name``, ``soup.b.parent`` ...) raise AttributeError.
html_doc = """<html><head><title>The Dormouse's story</title></head><body><p id="p" class="sister"><b>$37</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>
"""

soup = BeautifulSoup(html_doc, 'lxml')

# Navigating the tree

# 1. Direct access: an attribute named after a tag yields the first such tag.
print(soup.html)
print(type(soup.html))
print(soup.a)
print(soup.p)

# 2. Name of a tag.
print(soup.a.name)

# 3. Attributes of a tag.
print(soup.a.attrs)  # every attribute of the first <a> tag
print(soup.a.attrs['href'])

# 4. Text content of a tag.
print(soup.p.text)

# 5. Nested selection.
print(soup.html.body.p)

# 6. Child nodes.
print(soup.p.children)
print(list(soup.p.children))

# 7. Parent node and ancestors.
print(soup.b.parent)
print(soup.b.parents)
print(list(soup.b.parents))

# 8. Sibling nodes.
print(soup.a)
# The next sibling.
print(soup.a.next_sibling)
# All following siblings; this is a generator, so wrap it in list() to view it.
print(soup.a.next_siblings)
print(list(soup.a.next_siblings))
# The previous sibling.
print(soup.a.previous_sibling)
# All preceding siblings; also a generator.
print(list(soup.a.previous_siblings))
三、bs4解析库之搜索文档树
# Searching the tree
#
#   find:      return the first match
#   find_all:  return every match
#
# Tag lookup and attribute lookup:
#   Tags:
#     - string filter     whole-string match
#           name    match on the tag name
#           attrs   match on attributes
#           string  match on the text (spelled ``text`` in older bs4 releases)
#     - regex filter      match with the re module
#     - list filter       match any entry of the list
#     - bool filter       True matches
#     - function filter   combine attributes that are wanted with ones that
#                         must be absent
#   Attributes:
#     - class_
#     - id

import re

from bs4 import BeautifulSoup

# Sample document for the search demo.  The markup is spelled out in full
# because the searches below look for <p>, <title> and <a> elements and for
# id / class attributes; on tag-less text none of them can match.
html_doc = """<html><head><title>The Dormouse's story</title></head><body><p id="p" class="sister"><b>$37</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a><a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>
"""

soup = BeautifulSoup(html_doc, 'lxml')

# ---------------------------------------------------------------------------
# String filter
# ---------------------------------------------------------------------------
p = soup.find(name='p')
ps = soup.find_all(name='p')
print(p)
print(ps)

# name + attrs
p = soup.find(name='p', attrs={"id": "p"})
print(p)

# name + string (``string`` is the current spelling of the ``text`` argument)
tag = soup.find(name='title', string="The Dormouse's story")
print(tag)

# name + attrs + string
tag = soup.find(name='a', attrs={"class": "sister"}, string="Elsie")
print(tag)

# ---------------------------------------------------------------------------
# Regex filter: match with the re module
# ---------------------------------------------------------------------------
# name: tags whose name contains the letter "a"
a = soup.find(name=re.compile('a'))
print(a)

a_s = soup.find_all(name=re.compile('a'))
print(a_s)

# attrs: tags whose id contains "link"
a = soup.find(attrs={"id": re.compile('link')})
print(a)

# ---------------------------------------------------------------------------
# List filter: match any entry of the list
# ---------------------------------------------------------------------------
print(soup.find(name=['a', 'p', 'html', re.compile('a')]))
print(soup.find_all(name=['a', 'p', 'html', re.compile('a')]))

# ---------------------------------------------------------------------------
# Bool filter: True matches
# ---------------------------------------------------------------------------
print(soup.find(name=True, attrs={"id": True}))


# ---------------------------------------------------------------------------
# Function filter: combine attributes that are wanted with ones that must be
# absent
# ---------------------------------------------------------------------------
def has_class_no_id(tag):
    """Return True for a <p> tag that has a ``class`` attribute but no ``id``.

    Args:
        tag: the element handed to the filter by ``find_all``; only its
            ``name`` attribute and ``has_attr`` method are used.

    Returns:
        bool: True when the tag should be part of the result.
    """
    # Shows the name of each tag the search passes to the filter.
    print(tag.name)
    # The test follows the function's name: class present, id absent.
    return tag.name == 'p' and tag.has_attr("class") and not tag.has_attr("id")


print(soup.find_all(name=has_class_no_id))  # name=<function object>

# ---------------------------------------------------------------------------
# Extra: attribute keyword shortcuts
# ---------------------------------------------------------------------------
# id
a = soup.find(id='link2')
print(a)

# class (spelled class_ because ``class`` is a Python keyword)
p = soup.find(class_='sister')
print(p)