文章目錄
現在大部分的主流網站都用JavaScript動態顯示網頁內容,這使得我們之前的提取技術無法正常運行。本篇將介紹兩種提取基於JS的動態網頁數據的方法。
- JavaScript逆向工程
- 渲染JavaScript
1.動態網頁示例
我們先看一個動態網頁的示例。在示例網站中,我們通過 http://127.0.0.1:8000/places/default/search 的搜索表單查詢國家名包含A的國家。
我們根據按F12
開發者工具顯示的標籤,用lxml模塊提取數據,發現提取不到什麼數據。
>>> import lxml.html
>>> from downloader import Downloader
>>> D=Downloader()
>>> html=D('http://127.0.0.1:8000/places/default/search')
Downloading: http://127.0.0.1:8000/places/default/search
>>> tree=lxml.html.fromstring(html)
>>> tree.cssselect('div#result a')
[]
>>>
我們在瀏覽器右擊查看網頁源代碼發現我們要提取的div數據是空的。
...
<div id="results">
</div>
...
這是因爲F12
的開發者工具是顯示的標籤是網頁當前的狀態,也就是使用JavaScript動態加載完搜索結果之後的網頁。
2.對加載內容進行逆向工程
由於這些網頁的數據是JS動態加載的,要想提取該數據,我們需要瞭解網頁是如何加載該數據的,該過程也被稱爲逆向工程。
2.1通過開發者工具的逆向工程
我們在上節F12
的開發者工具的Network
發現AJAX響應一個json文件,即:http://127.0.0.1:8000/places/ajax/search.json?&search_term=A&page_size=10&page=0 。AJAX響應的返回數據是JSON格式的,因此我們可以使用Python的json模塊將其解析爲一個字典。
>>> import json
>>> html=D('http://127.0.0.1:8000/places/ajax/search.json?&search_term=A&page_size=10&page=0')
Downloading: http://127.0.0.1:8000/places/ajax/search.json?&search_term=A&page_size=10&page=0
>>> json.loads(html)
{u'records': [
{u'pretty_link': u'<div><a href="/places/default/view/Afghanistan-1"><img src="/places/static/images/flags/af.png" /> Afghanistan</a></div>', u'country': u'Afghanistan', u'id': 3781},
{u'pretty_link': u'<div><a href="/places/default/view/Aland-Islands-2"><img src="/places/static/images/flags/ax.png" /> Aland Islands</a></div>', u'country': u'Aland Islands', u'id': 3782},...],
u'num_pages': 22,
u'error': u''}
>>>
我們可以通過分頁請求提取json數據存到txt文件中。分頁請求會讓同一個國家在多次搜索返回多次,但通過set()
集合會過濾重複的元素。
# -*- coding: utf-8 -*-
import json
import string
import downloader
def main():
    """Scrape every country name via the paginated AJAX search API.

    Searches one letter at a time; because a country can match several
    letters, results are collected in a set to remove duplicates before
    being written (sorted) to ``2countries2.txt``.
    """
    template_url = 'http://127.0.0.1:8000/places/ajax/search.json?&page={}&page_size=10&search_term={}'
    countries = set()  # de-duplicates countries returned by overlapping searches
    download = downloader.Downloader()
    # string.ascii_lowercase works on both Python 2 and 3
    # (string.lowercase was removed in Python 3).
    for letter in string.ascii_lowercase:
        page = 0
        while True:
            html = download(template_url.format(page, letter))
            try:
                ajax = json.loads(html)
            except ValueError as e:
                # Malformed/failed response: report it and stop paging this letter.
                print(e)
                ajax = None
            else:
                for record in ajax['records']:
                    countries.add(record['country'])
            page += 1
            if ajax is None or page >= ajax['num_pages']:
                break
    # with-statement guarantees the file is closed even on error
    # (the original leaked the handle via open(...).write(...)).
    with open('2countries2.txt', 'w') as fp:
        fp.write('\n'.join(sorted(countries)))

if __name__ == '__main__':
    main()
2.2通過黑盒測試的逆向工程
在不知道源代碼的情況下進行的測試稱爲黑盒測試。我們猜測可能存在一次搜索查詢就能匹配所有結果的方法。接下來,我們將嘗試使用不同字符測試這種想法是否可行。
2.2.1搜索條件爲空時
>>> import json
>>> from downloader import Downloader
>>> D=Downloader()
>>> url='http://127.0.0.1:8000/places/ajax/search.json?&page_size=10&page=0&search_term='
>>> json.loads(D(url))['num_pages']
Downloading: http://127.0.0.1:8000/places/ajax/search.json?&page_size=10&page=0&search_term=
0
>>>
搜索條件爲空時,這種方法並沒有奏效。
2.2.2用*
號匹配時
>>> json.loads(D(url+'*'))['num_pages']
Downloading: http://127.0.0.1:8000/places/ajax/search.json?&page_size=10&page=0&search_term=*
0
用*
號匹配時,這種方法也沒有奏效。
2.2.3用.
號匹配時
>>> json.loads(D(url+'.'))['num_pages']
Downloading: http://127.0.0.1:8000/places/ajax/search.json?&page_size=10&page=0&search_term=.
26
這種方法測試成功了,看來服務器是通過正則表達式進行匹配的。在搜索界面中包含4、10、20這幾種選項,其中默認值是10。我們增加顯示數量進行測試。
>>> url='http://127.0.0.1:8000/places/ajax/search.json?&page_size=20&page=0&search_term='
>>> json.loads(D(url+'.'))['num_pages']
Downloading: http://127.0.0.1:8000/places/ajax/search.json?&page_size=20&page=0&search_term=.
13
>>> url='http://127.0.0.1:8000/places/ajax/search.json?&page_size=1000&page=0&search_term='
>>> json.loads(D(url+'.'))['num_pages']
Downloading: http://127.0.0.1:8000/places/ajax/search.json?&page_size=1000&page=0&search_term=.
1
>>>
我們整合出如下完整代碼。
# -*- coding: utf-8 -*-
import json
import csv
import downloader
def main():
    """Fetch all countries in a single AJAX request and write them to CSV.

    The server matches search terms as a regex, so '.' matches every
    country, and page_size=1000 fits all records on one page, avoiding
    pagination entirely.
    """
    D = downloader.Downloader()
    #html = D('http://example.webscraping.com/ajax/search.json?page=0&page_size=1000&search_term=.')
    html = D('http://127.0.0.1:8000/places/ajax/search.json?&page_size=1000&page=0&search_term=.')
    ajax = json.loads(html)
    # Open the output only after a successful download, and use a
    # with-statement so the handle is closed (the original leaked it).
    with open('2.2countries.csv', 'w') as fp:
        writer = csv.writer(fp)
        for record in ajax['records']:
            writer.writerow([record['country']])

if __name__ == '__main__':
    main()
3.渲染動態網頁
一些網站用Google Web Toolkit(GWT)工具開發的,產生的JS代碼是壓縮的,但可以通過JSbeautifier工具進行還原,但逆向工程效果不是很好。渲染引擎是瀏覽器加載網頁時解析HTML、應用CSS樣式並執行JS語句進行渲染顯示。本節中我們使用WebKit渲染引擎,並通過Qt框架獲得引擎的一個便捷Python接口,也可以用Selenium自定義渲染。
3.1使用WebKit渲染引擎
<html>
<body>
<div id="result"></div>
<script>document.getElementById("result").innerText = 'Hello World';</script>
</body>
</html>
# -*- coding: utf-8 -*-
import lxml.html
import downloader
try:
from PySide.QtGui import *
from PySide.QtCore import *
from PySide.QtWebKit import *
except ImportError:
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
def direct_download(url):
    """Fetch *url* with the plain HTTP downloader (no JavaScript execution)."""
    return downloader.Downloader()(url)
def webkit_download(url):
    """Fetch *url* with the WebKit engine so its JavaScript is executed.

    Returns the rendered HTML of the page once loading has finished.
    """
    app = QApplication([])  # Qt requires a QApplication before any widget
    webview = QWebView()
    loop = QEventLoop()
    # Quit the local event loop once the page (and its JS) has loaded.
    webview.loadFinished.connect(loop.quit)
    webview.load(QUrl(url))
    # Bug fix: the original called app.exec_(), but QEventLoop.quit only
    # stops `loop`, not the application loop, so the call never returned.
    # Spinning the QEventLoop connected to loadFinished blocks exactly
    # until the download finishes.
    loop.exec_()  # delay here until download finished
    return webview.page().mainFrame().toHtml()
def parse(html):
    """Print the text content of the ``#result`` element found in *html*.

    Raises IndexError if no element matches the selector.
    """
    tree = lxml.html.fromstring(html)
    # print() with a single argument behaves the same on Python 2 and 3;
    # the original used the Python-2-only print statement.
    print(tree.cssselect('#result')[0].text_content())
def main():
    """Compare a plain download against a WebKit-rendered download."""
    url = 'http://127.0.0.1:8000/places/default/dynamic'
    #url = 'http://example.webscraping.com/dynamic'
    # The plain download sees the empty #result div ...
    parse(direct_download(url))
    # ... while the rendered page contains the JS-generated text.
    parse(webkit_download(url))
    # Removed unreachable dead code that followed a bare `return` and
    # referenced an undefined variable `r` (`print len(r.html)`).

if __name__ == '__main__':
    main()
# -*- coding: utf-8 -*-
try:
from PySide.QtGui import QApplication
from PySide.QtCore import QUrl, QEventLoop, QTimer
from PySide.QtWebKit import QWebView
except ImportError:
from PyQt4.QtGui import QApplication
from PyQt4.QtCore import QUrl, QEventLoop, QTimer
from PyQt4.QtWebKit import QWebView
def main():
    """Drive a WebKit browser to submit the search form and scrape results.

    Loads the search page, fills in a match-all search term, raises the
    page size to 1000 so everything fits on one page, clicks search, and
    polls the event loop until the result links appear.
    """
    app = QApplication([])
    webview = QWebView()
    loop = QEventLoop()
    # Block until the initial page load completes.
    webview.loadFinished.connect(loop.quit)
    # Bug fix: the search form lives on the /search page; the original
    # loaded /places/default/dynamic, which contains no form elements
    # (see the static HTML earlier in this article).
    webview.load(QUrl('http://127.0.0.1:8000/places/default/search'))
    #webview.load(QUrl('http://example.webscraping.com/search'))
    loop.exec_()
    webview.show()
    frame = webview.page().mainFrame()
    # '.' matches every country because the server searches by regex.
    frame.findFirstElement('#search_term').setAttribute('value', '.')
    frame.findFirstElement('#page_size option:checked').setPlainText('1000')
    # Trigger the search through JavaScript.
    frame.findFirstElement('#search').evaluateJavaScript('this.click()')
    elements = None
    # Pump Qt events until the AJAX results have been rendered.
    # NOTE(review): this loops forever if the request fails; the
    # BrowserRender.wait_load method later in this file adds a timeout.
    while not elements:
        app.processEvents()
        elements = frame.findAllElements('#results a')
    countries = [e.toPlainText().strip() for e in elements]
    print(countries)

if __name__ == '__main__':
    main()
# -*- coding: utf-8 -*-
import re
import csv
import time
try:
from PySide.QtGui import QApplication
from PySide.QtCore import QUrl, QEventLoop, QTimer
from PySide.QtWebKit import QWebView
except ImportError:
from PyQt4.QtGui import QApplication
from PyQt4.QtCore import QUrl, QEventLoop, QTimer
from PyQt4.QtWebKit import QWebView
import lxml.html
class BrowserRender(QWebView):
    """A WebKit browser widget with helpers for scraping rendered pages.

    Wraps QWebView with convenience methods to load a URL with a timeout,
    query elements by CSS selector, fill in form fields, click elements,
    and wait for AJAX-generated content to appear.
    """
    def __init__(self, display=True):
        # A QApplication must exist before any Qt widget is created;
        # it is kept on self so wait_load can pump its event queue.
        self.app = QApplication([])
        QWebView.__init__(self)
        if display:
            self.show()  # show the browser
    def open(self, url, timeout=60):
        """Load *url*, blocking until the load finishes or *timeout* seconds pass.

        Returns the rendered HTML on success, or None on timeout.
        """
        loop = QEventLoop()
        timer = QTimer()
        timer.setSingleShot(True)
        # Either event — timeout firing or the page finishing — quits
        # the local event loop and unblocks exec_() below.
        timer.timeout.connect(loop.quit)
        self.loadFinished.connect(loop.quit)
        self.load(QUrl(url))
        timer.start(timeout * 1000)
        loop.exec_()  # delay here until download finished
        if timer.isActive():
            # Timer still running => loadFinished fired first: success.
            timer.stop()
            return self.html()
        else:
            # timed out (returns None implicitly)
            print 'Request timed out:', url
    def html(self):
        """Shortcut to return the current HTML"""
        return self.page().mainFrame().toHtml()
    def find(self, pattern):
        """Find all elements matching the CSS selector *pattern*."""
        return self.page().mainFrame().findAllElements(pattern)
    def attr(self, pattern, name, value):
        """Set attribute *name* to *value* on every matching element."""
        for e in self.find(pattern):
            e.setAttribute(name, value)
    def text(self, pattern, value):
        """Set the text content of every matching element."""
        for e in self.find(pattern):
            e.setPlainText(value)
    def click(self, pattern):
        """Simulate a click on every matching element via JavaScript."""
        for e in self.find(pattern):
            e.evaluateJavaScript("this.click()")
    def wait_load(self, pattern, timeout=60):
        """Poll until *pattern* matches; return the matches, or None on timeout."""
        deadline = time.time() + timeout
        while time.time() < deadline:
            # Keep pumping Qt events so AJAX callbacks can run.
            self.app.processEvents()
            matches = self.find(pattern)
            if matches:
                return matches
        print 'Wait load timed out'
def main():
    """Scrape all country names with BrowserRender and save them to CSV."""
    br = BrowserRender()
    # Bug fix: the search form lives on the /search page; the original
    # loaded /places/default/dynamic, which contains no form.
    br.open('http://127.0.0.1:8000/places/default/search')
    #br.open('http://example.webscraping.com/search')
    br.attr('#search_term', 'value', '.')         # '.' matches every country
    br.text('#page_size option:checked', '1000')  # everything on one page
    br.click('#search')
    # wait_load returns None on timeout; fall back to an empty list so
    # the loop below does not crash iterating None.
    elements = br.wait_load('#results a') or []
    # with-statement closes the CSV file (the original leaked the handle).
    with open('countries.csv', 'w') as fp:
        writer = csv.writer(fp)
        for country in [e.toPlainText().strip() for e in elements]:
            writer.writerow([country])

if __name__ == '__main__':
    main()
3.2使用Selenium自定義渲染
from selenium import webdriver
def main():
    """Scrape country names with a Selenium-driven Firefox browser."""
    driver = webdriver.Firefox()
    # try/finally guarantees the browser window is closed even if a
    # selector fails (the original leaked the browser on any exception).
    try:
        # Bug fix: the search form lives on the /search page; the
        # original loaded /places/default/dynamic, which has no form.
        driver.get('http://127.0.0.1:8000/places/default/search')
        #driver.get('http://example.webscraping.com/search')
        driver.find_element_by_id('search_term').send_keys('.')
        # 1000 is not among the <select> options, so rewrite one via JS.
        driver.execute_script("document.getElementById('page_size').options[1].text = '1000'")
        driver.find_element_by_id('search').click()
        # Implicit wait: subsequent finds poll up to 30s for the AJAX
        # results to be inserted into the DOM.
        driver.implicitly_wait(30)
        links = driver.find_elements_by_css_selector('#results a')
        countries = [link.text for link in links]
    finally:
        driver.close()
    print(countries)

if __name__ == '__main__':
    main()
Wu_Being 博客聲明:本人博客歡迎轉載,請標明博客原文和原鏈接!謝謝!
【Python爬蟲系列】《【Python爬蟲5】提取JS動態網頁數據》http://blog.csdn.net/u014134180/article/details/55507014
Python爬蟲系列的GitHub代碼文件:https://github.com/1040003585/WebScrapingWithPython