BeautifulSoup

BeautifulSoup 崔老師爬蟲系列課程學習筆記

很多時候能代替正則表達式完成網頁信息的提取

安裝

pip install beautifulsoup4 lxml

基本使用方法

# Sample document used by the basic-usage example.
html="""
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class=\"title\" name=\"dromouse\"><b>The Dormouse's story</b></p>
    <p class=\"story\">Once upon a time there were three little sisters; and their names were
    <a href=\"http://example.com/elsie\" class=\"sister\" id=\"link1\"><!-- Elsie --></a>,
    <a href=\"http://example.com/lacie\" class=\"sister\" id=\"link2\">Lacie</a> and
    <a href=\"http://example.com/tillie\" class=\"sister\" id=\"link3\">Tillie</a>;
    and they lived at the bottom of a well.</p>
    <p class=\"story\">...</p>"""
from bs4 import BeautifulSoup

# Name a concrete parser instead of the generic "html" markup type, so the
# parse result does not depend on which parsers happen to be installed.
# 'lxml' matches the parser used by the other examples in these notes.
soup = BeautifulSoup(html, 'lxml')
print(soup.prettify())  # print the parsed markup in formatted form
print(soup.title.string)  # select the <title> tag and print its content

常用標籤選擇器

選擇元素

from bs4 import BeautifulSoup

# Sample document for the tag-selection example.
html = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class=\"title\" name=\"dromouse\"><b>The Dormouse's story</b></p>
    <p class=\"story\">Once upon a time there were three little sisters; and their names were
    <a href=\"http://example.com/elsie\" class=\"sister\" id=\"link1\"><!-- Elsie --></a>,
    <a href=\"http://example.com/lacie\" class=\"sister\" id=\"link2\">Lacie</a> and
    <a href=\"http://example.com/tillie\" class=\"sister\" id=\"link3\">Tillie</a>;
    and they lived at the bottom of a well.</p>
    <p class=\"story\">...</p>"""

soup = BeautifulSoup(html, 'lxml')

# Attribute-style access on the soup selects a tag by its name.
title_tag = soup.title
print(title_tag.string)
print(type(title_tag))
print(soup.head)
# Only the first matching <p> is returned.
print(soup.p)

獲取名稱

from bs4 import BeautifulSoup

# Sample document for the tag-name example.
html = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class=\"title\" name=\"dromouse\"><b>The Dormouse's story</b></p>
    <p class=\"story\">Once upon a time there were three little sisters; and their names were
    <a href=\"http://example.com/elsie\" class=\"sister\" id=\"link1\"><!-- Elsie --></a>,
    <a href=\"http://example.com/lacie\" class=\"sister\" id=\"link2\">Lacie</a> and
    <a href=\"http://example.com/tillie\" class=\"sister\" id=\"link3\">Tillie</a>;
    and they lived at the bottom of a well.</p>
    <p class=\"story\">...</p>"""

soup = BeautifulSoup(html, 'lxml')

# .name gives the name of the selected tag.
title_tag = soup.title
print(title_tag.name)

獲取屬性

from bs4 import BeautifulSoup

# Sample document for the attribute-access example.
html = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class=\"title\" name=\"dromouse\"><b>The Dormouse's story</b></p>
    <p class=\"story\">Once upon a time there were three little sisters; and their names were
    <a href=\"http://example.com/elsie\" class=\"sister\" id=\"link1\"><!-- Elsie --></a>,
    <a href=\"http://example.com/lacie\" class=\"sister\" id=\"link2\">Lacie</a> and
    <a href=\"http://example.com/tillie\" class=\"sister\" id=\"link3\">Tillie</a>;
    and they lived at the bottom of a well.</p>
    <p class=\"story\">...</p>"""

soup = BeautifulSoup(html, 'lxml')

# Two ways to read the same attribute of the first <p> tag:
# subscript the tag directly, or go through its .attrs mapping.
first_p = soup.p
print(first_p['name'])
print(first_p.attrs['name'])

獲取內容

from bs4 import BeautifulSoup

# Sample document for the tag-content example.
html = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class=\"title\" name=\"dromouse\"><b>The Dormouse's story</b></p>
    <p class=\"story\">Once upon a time there were three little sisters; and their names were
    <a href=\"http://example.com/elsie\" class=\"sister\" id=\"link1\"><!-- Elsie --></a>,
    <a href=\"http://example.com/lacie\" class=\"sister\" id=\"link2\">Lacie</a> and
    <a href=\"http://example.com/tillie\" class=\"sister\" id=\"link3\">Tillie</a>;
    and they lived at the bottom of a well.</p>
    <p class=\"story\">...</p>"""

soup = BeautifulSoup(html, 'lxml')

# .string gives the text content of the first <p> tag.
first_p = soup.p
print(first_p.string)

嵌套選擇

from bs4 import BeautifulSoup

# Sample document for the nested-selection example.
html = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class=\"title\" name=\"dromouse\"><b>The Dormouse's story</b></p>
    <p class=\"story\">Once upon a time there were three little sisters; and their names were
    <a href=\"http://example.com/elsie\" class=\"sister\" id=\"link1\"><!-- Elsie --></a>,
    <a href=\"http://example.com/lacie\" class=\"sister\" id=\"link2\">Lacie</a> and
    <a href=\"http://example.com/tillie\" class=\"sister\" id=\"link3\">Tillie</a>;
    and they lived at the bottom of a well.</p>
    <p class=\"story\">...</p>"""

soup = BeautifulSoup(html, 'lxml')

# Tag selection can be chained: pick <head> first, then the <title> inside it.
head_tag = soup.head
print(head_tag.title.string)

子節點和子孫節點

# Sample document for the .contents example.
# The third link's id is "link3", matching the other copies of this document
# in these notes (it was mistyped as "link33").
html="""
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>
    <p class="story">...</p>"""

from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml')
# .contents holds the direct children of the first <p> tag.
print(soup.p.contents)

from bs4 import BeautifulSoup

# Sample document for the .children example.
html = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class=\"title\" name=\"dromouse\"><b>The Dormouse's story</b></p>
    <p class=\"story\">Once upon a time there were three little sisters; and their names were
    <a href=\"http://example.com/elsie\" class=\"sister\" id=\"link1\"><!-- Elsie --></a>,
    <a href=\"http://example.com/lacie\" class=\"sister\" id=\"link2\">Lacie</a> and
    <a href=\"http://example.com/tillie\" class=\"sister\" id=\"link3\">Tillie</a>;
    and they lived at the bottom of a well.</p>
    <p class=\"story\">...</p>"""

soup = BeautifulSoup(html, 'lxml')
first_p = soup.p

# Print the .children object itself, then walk it to get every child node.
print(first_p.children)
for index, node in enumerate(first_p.children):
    print(index, node)

from bs4 import BeautifulSoup

# Sample document for the .descendants example.
html = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class=\"title\" name=\"dromouse\"><b>The Dormouse's story</b></p>
    <p class=\"story\">Once upon a time there were three little sisters; and their names were
    <a href=\"http://example.com/elsie\" class=\"sister\" id=\"link1\"><!-- Elsie --></a>,
    <a href=\"http://example.com/lacie\" class=\"sister\" id=\"link2\">Lacie</a> and
    <a href=\"http://example.com/tillie\" class=\"sister\" id=\"link3\">Tillie</a>;
    and they lived at the bottom of a well.</p>
    <p class=\"story\">...</p>"""

soup = BeautifulSoup(html, 'lxml')
first_p = soup.p

# Descendant nodes: print the .descendants object, then walk it.
print(first_p.descendants)
for index, node in enumerate(first_p.descendants):
    print(index, node)

父節點和祖先節點

# Sample document for the .parent example.
# The first <p> previously had an unterminated attribute quote
# (class="story\>), which corrupted the markup handed to the parser.
html = """
    <html>
        <head>
            <title>The Dormouse's story</title>
        </head>
        <body>
            <p class="story">
                Once upon a time there were three little sisters; and their names were
                <a href="http://example.com/elsie" class="sister" id="link1">
                    <span>Elsie</span>
                </a>
                <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
                and
                <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
                and they lived at the bottom of a well.
            </p>
            <p class="story">...</p>
      """

from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml')
# .parent is the tag that directly encloses the first <a>.
print(soup.a.parent)
# Sample document for the .parents example.
# The first <p> previously had an unterminated attribute quote
# (class="story\>), which corrupted the markup handed to the parser.
html = """
    <html>
        <head>
            <title>The Dormouse's story</title>
        </head>
        <body>
            <p class="story">
                Once upon a time there were three little sisters; and their names were
                <a href="http://example.com/elsie" class="sister" id="link1">
                    <span>Elsie</span>
                </a>
                <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
                and
                <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
                and they lived at the bottom of a well.
            </p>
            <p class="story">...</p>
      """

from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml')
# .parents is a generator, so printing it directly only shows the generator
# object; materialise it to list every ancestor of the first <a>.
print(list(enumerate(soup.a.parents)))

兄弟節點

# Sample document for the sibling-navigation example.
# The first <p> previously had an unterminated attribute quote
# (class="story\>), which corrupted the markup handed to the parser.
html = """
    <html>
        <head>
            <title>The Dormouse's story</title>
        </head>
        <body>
            <p class="story">
                Once upon a time there were three little sisters; and their names were
                <a href="http://example.com/elsie" class="sister" id="link1">
                    <span>Elsie</span>
                </a>
                <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
                and
                <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
                and they lived at the bottom of a well.
            </p>
            <p class="story">...</p>
      """

from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml')
# Siblings that follow / precede the first <a> inside its parent.
print(list(enumerate(soup.a.next_siblings)))
print(list(enumerate(soup.a.previous_siblings)))

標準選擇器

find_all(name,attrs,recursive,text,**kwargs)

可以根據標籤名、屬性,內容查找文檔

name(標籤的名字)

 html='''
    <div class=\"panel\">
        <div class="panel-heading">
            <h4>Hello</h4>
        </div>
        <div class="panel-body">
            <ul class="list" id="list-1">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
                <li class="element">Jay</li>
            </ul>
            <ul class="list list-small" id="list-2">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
            </ul>
        </div>
    </div>
    '''
from bs4 import BeautifulSoup 
soup = BeautifulSoup(html,'lxml')

print(soup.find_all('ul'))
print(type(soup.find_all('ul')))
 html='''
    <div class=\"panel\">
        <div class="panel-heading">
            <h4>Hello</h4>
        </div>
        <div class="panel-body">
            <ul class="list" id="list-1">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
                <li class="element">Jay</li>
            </ul>
            <ul class="list list-small" id="list-2">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
            </ul>
        </div>
    </div>
    '''
from bs4 import BeautifulSoup 
soup = BeautifulSoup(html,'lxml')

for ul in soup.find_all('ul'):
    for li in ul.find_all('li'):
        print(li)

attrs

 html='''
    <div class=\"panel\">
        <div class="panel-heading">
            <h4>Hello</h4>
        </div>
        <div class="panel-body">
            <ul class="list" id="list-1" name="elements">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
                <li class="element">Jay</li>
            </ul>
            <ul class="list list-small" id="list-2" name="elements">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
            </ul>
        </div>
    </div>
    '''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml')
print(soup.find_all(attrs={'name':'elements'}))
print(soup.find_all(attrs={'id':'list-1'}))
 html='''
    <div class=\"panel\">
        <div class="panel-heading">
            <h4>Hello</h4>
        </div>
        <div class="panel-body">
            <ul class="list" id="list-1" name="elements">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
                <li class="element">Jay</li>
            </ul>
            <ul class="list list-small" id="list-2" name="elements">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
            </ul>
        </div>
    </div>
    '''

from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml')
print(soup.find_all(id="list-1"))
print(soup.find_all(class_="element"))

text

 html='''
    <div class=\"panel\">
        <div class="panel-heading">
            <h4>Hello</h4>
        </div>
        <div class="panel-body">
            <ul class="list" id="list-1" name="elements">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
                <li class="element">Jay</li>
            </ul>
            <ul class="list list-small" id="list-2" name="elements">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
            </ul>
        </div>
    </div>
    '''

from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml')
print(soup.find_all(text="Foo"))

find(name,attrs,recursive,text,**kwargs)

 html='''
    <div class=\"panel\">
        <div class="panel-heading">
            <h4>Hello</h4>
        </div>
        <div class="panel-body">
            <ul class="list" id="list-1" name="elements">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
                <li class="element">Jay</li>
            </ul>
            <ul class="list list-small" id="list-2" name="elements">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
            </ul>
        </div>
    </div>
    '''


from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml')

print(soup.find('ul'))
print(type(soup.find('ul')))
print(soup.find('page'))

find_parents() ; find_parent()

find_next_siblings() ; find_next_sibling()

find_previous_siblings() ; find_previous_sibling()

find_all_previous();find_previous()

CSS選擇器

通過select()直接傳入CSS選擇器即可完成選擇

 html='''
    <div class="panel">
        <div class="panel-heading">
            <h4>Hello</h4>
        </div>
        <div class="panel-body">
            <ul class="list" id="list-1" name="elements">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
                <li class="element">Jay</li>
            </ul>
            <ul class="list list-small" id="list-2" name="elements">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
            </ul>
        </div>
    </div>
    '''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml')
print(soup.select('.panel .panel-heading'))#'.'表示class
print(soup.select('ul li'))
print(soup.select('#list-2 .element'))#'#'表示id
 html='''
    <div class="panel">
        <div class="panel-heading">
            <h4>Hello</h4>
        </div>
        <div class="panel-body">
            <ul class="list" id="list-1" name="elements">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
                <li class="element">Jay</li>
            </ul>
            <ul class="list list-small" id="list-2" name="elements">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
            </ul>
        </div>
    </div>
    '''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml')

uls=soup.select('ul')
for ul in uls:
    print(ul.select('li'))

獲取屬性

 html='''
    <div class="panel">
        <div class="panel-heading">
            <h4>Hello</h4>
        </div>
        <div class="panel-body">
            <ul class="list" id="list-1" name="elements">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
                <li class="element">Jay</li>
            </ul>
            <ul class="list list-small" id="list-2" name="elements">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
            </ul>
        </div>
    </div>
    '''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml')
uls =soup.select('ul')
for ul in uls:
    print(ul['id'])
 html='''
    <div class="panel">
        <div class="panel-heading">
            <h4>Hello</h4>
        </div>
        <div class="panel-body">
            <ul class="list" id="list-1" name="elements">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
                <li class="element">Jay</li>
            </ul>
            <ul class="list list-small" id="list-2" name="elements">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
            </ul>
        </div>
    </div>
    '''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml')
for li in soup.select('li'):
    print(li.get_text())

總結

1、推薦使用lxml解析庫,必要時使用html.parser
2、標籤選擇功能弱,但速度快
3、建議使用find()、find_all()查詢匹配單個結果或者多個結果
4、如果對CSS選擇器熟悉建議使用select()
5、記住常用的獲取屬性和文本的方法

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章