# Tag objects: accessing a tag name as an attribute of the soup returns the
# first matching tag, and that value is a bs4.element.Tag.
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

bs = BeautifulSoup(html_doc, features='lxml')
print(bs.head)  # <head><title>The Dormouse's story</title></head>
print(type(bs.head))  # <class 'bs4.element.Tag'>
print(bs.a)  # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
print(type(bs.a))  # <class 'bs4.element.Tag'>
NavigableString对象
# NavigableString objects: the text inside a tag, reached via `.string`,
# is a bs4.element.NavigableString.
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

bs = BeautifulSoup(html_doc, features='lxml')
print(bs.a.string)  # Elsie
print(type(bs.a.string))  # <class 'bs4.element.NavigableString'>
BeautifulSoup对象
# BeautifulSoup object: the parsed document as a whole; its type is
# bs4.BeautifulSoup.
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

bs = BeautifulSoup(html_doc, features='lxml')
print(bs)  # prints the entire content of html_doc
print(type(bs))  # <class 'bs4.BeautifulSoup'>
Comment对象
# Comment objects: when a tag contains only an HTML comment, `.string`
# yields a bs4.element.Comment holding the comment text.
from bs4 import BeautifulSoup  # imported here so the snippet runs on its own

html = '<a><!--圣诞快乐!!--></a>'
bs = BeautifulSoup(html, features='lxml')
print(bs.a.string)  # prints the comment text: 圣诞快乐!!
print(type(bs.a.string))  # <class 'bs4.element.Comment'>
遍历文档树
遍历子节点
contents 返回的是一个所有子节点的列表
children 返回的是一个子节点的迭代器
descendants 返回的是一个生成器,用于递归遍历所有子孙节点
contents
# `.contents`: the direct children of a tag as a list (text nodes such as
# '\n' are included alongside tags).
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

bs = BeautifulSoup(html_doc, features='lxml')
print(bs.body.contents)
# Recorded output of the print above:
'''
['\n', <p class="title"><b>The Dormouse's story</b></p>, '\n', <p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>, '\n', <p class="story">...</p>, '\n']
'''
children
# `.children`: the direct children of a tag as an iterator; loop over it
# to visit each child in turn.
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

bs = BeautifulSoup(html_doc, features='lxml')
print(bs.body.children)  # <list_iterator object at 0x000001D7FA896130>
for i in bs.body.children:
    print(i)
# Recorded output of the loop above:
'''
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
'''
descendants
# `.descendants`: a generator that walks every node beneath a tag
# (children, grandchildren, and so on), tags and text nodes alike.
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

bs = BeautifulSoup(html_doc, features='lxml')
print(bs.body.descendants)  # <generator object Tag.descendants at 0x0000023952F9DCF0>
# The listing below is per-node output, so the generator has to be iterated;
# the original snippet omitted this loop.
for i in bs.body.descendants:
    print(i)
# Recorded output of the loop above:
'''
<p class="title"><b>The Dormouse's story</b></p>
<b>The Dormouse's story</b>
The Dormouse's story
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
Elsie
,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
Lacie
and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
Tillie
;
and they lived at the bottom of a well.
<p class="story">...</p>
...
'''
遍历内容
string 获取标签里面的内容
strings 返回的是一个生成器对象,用来获取多个标签的内容
stripped_strings 和 strings 基本一致,但是它会把多余的空白字符(空格、换行)去掉
string
# `.string`: the text held by a tag. Here <head> contains only <title>,
# and the title's text is what gets printed.
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

bs = BeautifulSoup(html_doc, 'lxml')
print(bs.head.string)  # The Dormouse's story
strings 和 stripped_strings
# `.strings` vs `.stripped_strings`: both are generators over all the text
# beneath a tag; the stripped variant drops surrounding whitespace.
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

bs = BeautifulSoup(html_doc, 'lxml')
print(bs.html.string)  # None
print(bs.html.strings)  # <generator object Tag._all_strings at 0x000002B55EE52C80>
for i in bs.html.strings:
    print(i)  # the stripped_strings variant follows below
# Recorded output of the loop above:
'''
The Dormouse's story
The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie
,
Lacie
and
Tillie
;
and they lived at the bottom of a well.
...
'''
print(bs.html.stripped_strings)  # <generator object Tag.stripped_strings at 0x00000199B38EFC10>
for i in bs.html.stripped_strings:
    print(i)
# Recorded output of the loop above:
'''
The Dormouse's story
The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie
,
Lacie
and
Tillie
;
and they lived at the bottom of a well.
...
'''
遍历父节点
parent直接获得父节点
parents返回一个所有父节点的生成器对象
parent 和 parents
# `.parent` gives the direct parent of a node; `.parents` is a generator
# over every ancestor, from the nearest one outward.
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

bs = BeautifulSoup(html_doc, 'lxml')
print(bs.title.parent)  # <head><title>The Dormouse's story</title></head>
print(bs.html.parent)  # prints the whole html_doc content (with missing closing tags filled in)
print(bs.title.parents)  # <generator object PageElement.parents at 0x000001E945F72C80>
for i in bs.title.parents:
    print(i)
    print('-'*60)  # separator after each ancestor
# Recorded output of the loop above:
'''
<head><title>The Dormouse's story</title></head>
------------------------------------------------------------
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>
------------------------------------------------------------
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>
------------------------------------------------------------
'''
遍历兄弟节点
next_sibling 下一个兄弟节点
previous_sibling 上一个兄弟节点
next_siblings 返回后面所有兄弟节点的生成器对象
previous_siblings 返回前面所有兄弟节点的生成器对象
next_sibling 和 previous_sibling
# `.next_sibling` / `.previous_sibling`: the node immediately after / before
# the current one under the same parent. repr() is used so that whitespace
# text nodes such as '\n' are visible in the output.
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

bs = BeautifulSoup(html_doc, 'lxml')
print(repr(bs.p.next_sibling))  # '\n'
print(repr(bs.a.next_sibling))  # ',\n'
print(repr(bs.a.previous_sibling))  # 'Once upon a time there were three little sisters; and their names were\n'
print(repr(bs.p.previous_sibling))  # '\n'
next_siblings 和 previous_siblings
# `.next_siblings` / `.previous_siblings`: generators over all following /
# preceding siblings of a node. repr() keeps whitespace text nodes visible.
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

bs = BeautifulSoup(html_doc, 'lxml')
# Walk forward from the first <a> tag.
for i in bs.a.next_siblings:
    print(repr(i))
    print('-'*60)  # separator after each sibling
# Recorded output of the loop above:
'''
',\n'
------------------------------------------------------------
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
------------------------------------------------------------
' and\n'
------------------------------------------------------------
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
------------------------------------------------------------
';\nand they lived at the bottom of a well.'
------------------------------------------------------------
'''
# Walk backward from the tag whose id is "link3".
for i in bs.find(id='link3').previous_siblings:
    print(repr(i))
    print('-'*60)
# Recorded output of the loop above:
'''
' and\n'
------------------------------------------------------------
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
------------------------------------------------------------
',\n'
------------------------------------------------------------
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
------------------------------------------------------------
'Once upon a time there were three little sisters; and their names were\n'
------------------------------------------------------------
'''