BeautifulSoup 崔老師爬蟲系列課程學習筆記
很多時候能代替正則表達式完成網頁信息的提取
安裝
pip install beautifulsoup4 lxml
基本使用方法
# Basic usage: parse an HTML document, pretty-print it, and read the <title> text.
html="""
<html><head><title>The Dormouse's story</title></head>
<body>
<p class=\"title\" name=\"dromouse\"><b>The Dormouse's story</b></p>
<p class=\"story\">Once upon a time there were three little sisters; and their names were
<a href=\"http://example.com/elsie\" class=\"sister\" id=\"link1\"><!-- Elsie --></a>,
<a href=\"http://example.com/lacie\" class=\"sister\" id=\"link2\">Lacie</a> and
<a href=\"http://example.com/tillie\" class=\"sister\" id=\"link3\">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class=\"story\">...</p>"""
from bs4 import BeautifulSoup
# Name the parser explicitly. The bare feature string "html" lets bs4 pick
# whichever HTML parser happens to be installed (and warn that it guessed);
# 'lxml' is the parser every other example in these notes uses.
soup = BeautifulSoup(html, 'lxml')
print(soup.prettify())  # re-indent the parsed document for display
print(soup.title.string)  # select the <title> tag and print its text
常用標籤選擇器
選擇元素
# Tag selection: reach tags by using their name as an attribute of the soup.
from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class=\"title\" name=\"dromouse\"><b>The Dormouse's story</b></p>
<p class=\"story\">Once upon a time there were three little sisters; and their names were
<a href=\"http://example.com/elsie\" class=\"sister\" id=\"link1\"><!-- Elsie --></a>,
<a href=\"http://example.com/lacie\" class=\"sister\" id=\"link2\">Lacie</a> and
<a href=\"http://example.com/tillie\" class=\"sister\" id=\"link3\">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class=\"story\">...</p>"""

soup = BeautifulSoup(html, features='lxml')

# Show the title's text, then the Python type of the tag object itself.
title_tag = soup.title
print(title_tag.string)
print(type(title_tag))

print(soup.head)

# Attribute-style access returns only the first matching tag.
first_paragraph = soup.p
print(first_paragraph)
獲取名稱
# Tag name: print the .name attribute of the <title> tag.
from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class=\"title\" name=\"dromouse\"><b>The Dormouse's story</b></p>
<p class=\"story\">Once upon a time there were three little sisters; and their names were
<a href=\"http://example.com/elsie\" class=\"sister\" id=\"link1\"><!-- Elsie --></a>,
<a href=\"http://example.com/lacie\" class=\"sister\" id=\"link2\">Lacie</a> and
<a href=\"http://example.com/tillie\" class=\"sister\" id=\"link3\">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class=\"story\">...</p>"""

soup = BeautifulSoup(html, features='lxml')
title_tag = soup.title
print(title_tag.name)
獲取屬性
# Attributes: read the `name` attribute of the first <p> in two ways.
from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class=\"title\" name=\"dromouse\"><b>The Dormouse's story</b></p>
<p class=\"story\">Once upon a time there were three little sisters; and their names were
<a href=\"http://example.com/elsie\" class=\"sister\" id=\"link1\"><!-- Elsie --></a>,
<a href=\"http://example.com/lacie\" class=\"sister\" id=\"link2\">Lacie</a> and
<a href=\"http://example.com/tillie\" class=\"sister\" id=\"link3\">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class=\"story\">...</p>"""

soup = BeautifulSoup(html, features='lxml')
first_paragraph = soup.p

# 1) index the tag directly; 2) go through its .attrs mapping.
print(first_paragraph['name'])
print(first_paragraph.attrs['name'])
獲取內容
# Content: print the .string of the first <p> tag.
from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class=\"title\" name=\"dromouse\"><b>The Dormouse's story</b></p>
<p class=\"story\">Once upon a time there were three little sisters; and their names were
<a href=\"http://example.com/elsie\" class=\"sister\" id=\"link1\"><!-- Elsie --></a>,
<a href=\"http://example.com/lacie\" class=\"sister\" id=\"link2\">Lacie</a> and
<a href=\"http://example.com/tillie\" class=\"sister\" id=\"link3\">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class=\"story\">...</p>"""

soup = BeautifulSoup(html, features='lxml')
first_paragraph = soup.p
print(first_paragraph.string)
嵌套選擇
# Nested selection: tag-name access can be chained, here <head> then <title>.
from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class=\"title\" name=\"dromouse\"><b>The Dormouse's story</b></p>
<p class=\"story\">Once upon a time there were three little sisters; and their names were
<a href=\"http://example.com/elsie\" class=\"sister\" id=\"link1\"><!-- Elsie --></a>,
<a href=\"http://example.com/lacie\" class=\"sister\" id=\"link2\">Lacie</a> and
<a href=\"http://example.com/tillie\" class=\"sister\" id=\"link3\">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class=\"story\">...</p>"""

soup = BeautifulSoup(html, features='lxml')
head_tag = soup.head
title_in_head = head_tag.title
print(title_in_head.string)
子節點和子孫節點
# Children as a list: print the .contents of the first <p> tag.
from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link33">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>"""

soup = BeautifulSoup(html, features='lxml')
first_paragraph = soup.p
print(first_paragraph.contents)
# Children as an iterator: print the .children object of the first <p>,
# then walk it and print each direct child with its index.
html="""
<html><head><title>The Dormouse's story</title></head>
<body>
<p class=\"title\" name=\"dromouse\"><b>The Dormouse's story</b></p>
<p class=\"story\">Once upon a time there were three little sisters; and their names were
<a href=\"http://example.com/elsie\" class=\"sister\" id=\"link1\"><!-- Elsie --></a>,
<a href=\"http://example.com/lacie\" class=\"sister\" id=\"link2\">Lacie</a> and
<a href=\"http://example.com/tillie\" class=\"sister\" id=\"link3\">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class=\"story\">...</p>"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml')
# Printing the attribute itself shows an iterator object, not the children.
print(soup.p.children)
for i, child in enumerate(soup.p.children):
    # The loop body must be indented; without it the snippet does not parse.
    print(i, child)
#獲取所有子節點。
# Descendants: print the .descendants object of the first <p>, then walk it
# and print every nested node with its index.
html="""
<html><head><title>The Dormouse's story</title></head>
<body>
<p class=\"title\" name=\"dromouse\"><b>The Dormouse's story</b></p>
<p class=\"story\">Once upon a time there were three little sisters; and their names were
<a href=\"http://example.com/elsie\" class=\"sister\" id=\"link1\"><!-- Elsie --></a>,
<a href=\"http://example.com/lacie\" class=\"sister\" id=\"link2\">Lacie</a> and
<a href=\"http://example.com/tillie\" class=\"sister\" id=\"link3\">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class=\"story\">...</p>"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml')
print(soup.p.descendants)  # descendant nodes (printed as the iterator object)
for i, child in enumerate(soup.p.descendants):
    # The loop body must be indented; without it the snippet does not parse.
    print(i, child)
父節點和祖先節點
# Parent: print the direct parent of the first <a> tag.
# The opening <p> tag was garbled as `<p class="story\>` (unterminated
# attribute plus an invalid `\>` escape); it is restored to `class="story"`.
html = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">
<span>Elsie</span>
</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml')
print(soup.a.parent)
# Ancestors: list every ancestor of the first <a> tag with its index.
# The opening <p> tag was garbled as `<p class="story\>` (unterminated
# attribute plus an invalid `\>` escape); it is restored to `class="story"`.
html = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">
<span>Elsie</span>
</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml')
# .parents is lazy: printing it directly shows only the generator object.
# Materialise it (as the sibling example in these notes does) so the
# ancestors themselves are displayed.
print(list(enumerate(soup.a.parents)))
兄弟節點
# Siblings: list the nodes after and before the first <a> tag at the same level.
# The opening <p> tag was garbled as `<p class="story\>` (unterminated
# attribute plus an invalid `\>` escape); it is restored to `class="story"`.
html = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">
<span>Elsie</span>
</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml')
# Both attributes are iterated into lists so their items are printed.
print(list(enumerate(soup.a.next_siblings)))
print(list(enumerate(soup.a.previous_siblings)))
標準選擇器
find_all(name,attrs,recursive,text,**kwargs)
可以根據標籤名、屬性,內容查找文檔
name(標籤的名字)
# find_all by tag name: collect every <ul>, then show the result and its type.
from bs4 import BeautifulSoup

html = '''
<div class=\"panel\">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''

soup = BeautifulSoup(html, features='lxml')
ul_tags = soup.find_all('ul')
print(ul_tags)
print(type(ul_tags))
# Nested find_all: for every <ul>, search inside it for <li> tags and print each.
html='''
<div class=\"panel\">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml')
# Both loop bodies must be indented; the flattened form does not parse.
for ul in soup.find_all('ul'):
    for li in ul.find_all('li'):
        print(li)
attrs
# find_all with attrs=: match tags by an attribute-name -> value mapping.
from bs4 import BeautifulSoup

html = '''
<div class=\"panel\">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1" name="elements">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2" name="elements">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''

soup = BeautifulSoup(html, features='lxml')

# Run the same query once per filter, in the original order.
for attribute_filter in ({'name': 'elements'}, {'id': 'list-1'}):
    print(soup.find_all(attrs=attribute_filter))
# find_all with keyword filters: pass the attribute as a keyword argument.
from bs4 import BeautifulSoup

html = '''
<div class=\"panel\">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1" name="elements">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2" name="elements">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''

soup = BeautifulSoup(html, features='lxml')

matches_by_id = soup.find_all(id="list-1")
print(matches_by_id)

# `class` is a Python keyword, so the filter is spelled `class_`.
matches_by_class = soup.find_all(class_="element")
print(matches_by_class)
text(bs4 4.4.0 起此參數改名為 string)
# find_all by text content: match the strings in the document equal to "Foo"
# rather than matching tags.
html='''
<div class=\"panel\">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1" name="elements">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2" name="elements">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml')
# `string=` is the current name of the argument these notes call `text`;
# bs4 has deprecated the old `text=` keyword in its favour.
print(soup.find_all(string="Foo"))
find(name,attrs,recursive,text,**kwargs)
# find: like find_all, but yields a single result instead of a collection.
from bs4 import BeautifulSoup

html = '''
<div class=\"panel\">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1" name="elements">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2" name="elements">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''

soup = BeautifulSoup(html, features='lxml')

first_ul = soup.find('ul')
print(first_ul)
print(type(first_ul))

# The markup contains no <page> tag, so this prints whatever find() gives
# back for a miss (None, per the bs4 documentation).
print(soup.find('page'))
find_parents() ; find_parent()
find_next_siblings() ; find_next_sibling()
find_previous_siblings() ; find_previous_sibling()
find_all_previous();find_previous()
CSS選擇器
通過select()直接傳入CSS選擇器即可完成選擇
# CSS selectors: pass a selector string to select() and print the matches.
from bs4 import BeautifulSoup

html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1" name="elements">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2" name="elements">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''

soup = BeautifulSoup(html, features='lxml')

css_queries = (
    '.panel .panel-heading',  # '.' selects by class
    'ul li',                  # tag names, descendant combinator
    '#list-2 .element',       # '#' selects by id
)
for css_query in css_queries:
    print(soup.select(css_query))
# Nested select: select every <ul>, then run a second select inside each one.
html='''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1" name="elements">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2" name="elements">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml')
uls = soup.select('ul')
for ul in uls:
    # The loop body must be indented; the flattened form does not parse.
    print(ul.select('li'))
獲取屬性
# Attributes of selected tags: print the `id` of every <ul> found by select().
html='''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1" name="elements">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2" name="elements">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml')
uls = soup.select('ul')
for ul in uls:
    # The loop body must be indented; the flattened form does not parse.
    print(ul['id'])
# Text of selected tags: print get_text() for every <li> found by select().
html='''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1" name="elements">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2" name="elements">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml')
for li in soup.select('li'):
    # The loop body must be indented; the flattened form does not parse.
    print(li.get_text())
總結
1、推薦使用lxml解析庫,必要時使用html.parser
2、標籤選擇功能弱,但速度快
3、建議使用find()、find_all()查詢匹配單個結果或者多個結果
4、如果對CSS選擇器熟悉建議使用select()
5、記住常用的獲取屬性和文本的方法