安裝依賴
更新pip(需要以管理員身份運行)
pip install --upgrade pip
如果執行pip install的時候就一直提示 Script file ‘E:\Anaconda\Scripts\pip-script.py’ is not present. 這個錯誤
使用 easy_install pip
這個命令,再重新安裝一下pip就好了。
安裝lxml
參考:https://www.jianshu.com/p/6734a20fa6dd
1. 到這個連接下, 搜索lxml, 下載對應的 lxml.whl 文件(預編譯過的)
如:我本機python的版本是3.9,64位win10系統, 那麼就選擇
注意: 如果沒有安裝wheel, 要先安裝wheel
pip install wheel
2. 下載完成後, 把下載好的.whl文件放在python的根目錄下, 比如我的是: D:\Python\Python39
在該目錄下執行:
python -m pip install lxml-4.6.4-cp39-cp39-win_amd64.whl
安裝beautifulsoup4
pip install beautifulsoup4
編寫代碼 weather.py
參考:https://zhuanlan.zhihu.com/p/85783510
# weather.py -- scrape one city's 7-day forecast from weather.com.cn
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit  # ships with bs4; guesses the document encoding
import urllib.request

url = 'http://www.weather.com.cn/weather/101080101.shtml'
try:
    # Browser-like User-Agent so the server returns the normal page.
    headers = {'User-Agent':'Mozilla/5.0(Windows;U;Windows NT 6.0 x64;en-US;rv:1.9pre)Gecko/20191008 Minefield/3.0.2pre'}
    req = urllib.request.Request(url, headers=headers)
    resp = urllib.request.urlopen(req)
    raw = resp.read()
    # Sniff the encoding, trying utf-8 first and then gbk.
    dammit = UnicodeDammit(raw, ['utf-8', 'gbk'])
    soup = BeautifulSoup(dammit.unicode_markup, 'lxml')
    # Each <li> under <ul class="t clearfix"> holds one forecast day.
    for li in soup.select("ul[class='t clearfix'] li"):
        try:
            data = li.select('h1')[0].text                 # date text, first <h1>
            weather = li.select("p[class='wea']")[0].text  # weather description
            temp = li.findAll('span')[0].text + '/' + li.findAll('i')[0].text
            print(data, weather, temp)
        except Exception as err:
            # A malformed day entry is reported and skipped, not fatal.
            print(err)
except Exception as err:
    print(err)
測試
python weather.py
成功爬取中國天氣網某城市七天的天氣情況。
升級版
爬取的升級版,是爬取幾個城市的天氣情況:
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import sqlite3
class WeatherDB:
    """Wraps all sqlite3 operations for the scraped weather forecasts."""

    def openDB(self):
        """Open (or create) weathers.db and ensure the weathers table exists."""
        self.con = sqlite3.connect('weathers.db')
        self.cursor = self.con.cursor()
        try:
            # (wCity, wDate) is the primary key: one row per city per day.
            self.cursor.execute(
                'create table weathers (wCity varchar(16),wDate varchar(16),'
                'wWeather varchar(64),wTemp varchar(32),'
                'constraint pk_weather primary key(wCity,wDate))')
        except sqlite3.OperationalError:
            # Table already exists from an earlier run: start from a clean slate.
            self.cursor.execute('delete from weathers')

    def closeDB(self):
        """Commit any pending inserts and close the connection."""
        self.con.commit()
        self.con.close()

    def insert(self, city, date, weather, temp):
        """Insert one forecast row; failures (e.g. duplicate key) are printed, not raised."""
        try:
            # Fixed: the original wrote to a non-existent table 'weather' and
            # listed only three columns (wCity,wDate,wTemp) against four
            # placeholders, so every insert failed and the weather text was lost.
            self.cursor.execute(
                'insert into weathers (wCity,wDate,wWeather,wTemp) values (?,?,?,?)',
                (city, date, weather, temp))
        except Exception as err:
            print(err)

    def show(self):
        """Print every stored row as a fixed-width table."""
        self.cursor.execute('select * from weathers')
        rows = self.cursor.fetchall()
        print('%-16s%-16s%-32s%-16s' % ('city', 'date', 'weather', 'temp'))
        for row in rows:
            print('%-16s%-16s%-32s%-16s' % (row[0], row[1], row[2], row[3]))
class WeatherForecast:
    """Fetches 7-day forecasts from weather.com.cn and stores them via WeatherDB."""

    def __init__(self):
        # Browser-like User-Agent so the remote web server serves the normal page.
        self.headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US;rv:1.9pre)Gecko/2019100821 Minefield/3.0.2pre'}
        # City name -> weather.com.cn station code used in the page URL.
        self.cityCode = {'北京':'101010100','上海':'101020100','廣州':'101280101','深圳':'101280601'}

    def forecastCity(self, city):
        """Scrape one city's 7-day forecast and insert each day into the DB.

        Unknown cities are reported and skipped; network/parse errors are
        printed rather than raised.
        """
        if city not in self.cityCode:
            print(city+'code cannot be found')
            return
        url = 'http://www.weather.com.cn/weather/'+self.cityCode[city]+'.shtml'
        try:
            req = urllib.request.Request(url, headers=self.headers)
            data = urllib.request.urlopen(req)
            data = data.read()
            # Fixed: both encoding candidates must be in one list. The original
            # passed 'gbk' as a separate positional argument, where UnicodeDammit
            # consumes it as its smart_quotes_to parameter, not as an encoding.
            dammit = UnicodeDammit(data, ['utf-8', 'gbk'])
            data = dammit.unicode_markup
            soup = BeautifulSoup(data, 'lxml')
            # Each <li> under <ul class="t clearfix"> holds one forecast day.
            lis = soup.select("ul[class='t clearfix'] li")
            for li in lis:
                try:
                    date = li.select('h1')[0].text
                    weather = li.select('p[class="wea"]')[0].text
                    temp = li.select('p[class="tem"] span')[0].text+'/'+li.select('p[class="tem"] i')[0].text
                    print(city, date, weather, temp)
                    self.db.insert(city, date, weather, temp)
                except Exception as err:
                    # A day entry missing an element is skipped, not fatal
                    # for the whole city.
                    print(err)
        except Exception as err:
            print(err)

    def process(self, cities):
        """Open the database, scrape every city in `cities`, then commit and close."""
        self.db = WeatherDB()
        self.db.openDB()
        for city in cities:
            self.forecastCity(city)
        self.db.closeDB()
# Entry point: scrape the four configured cities into weathers.db.
ws = WeatherForecast()
target_cities = ['北京', '上海', '廣州', '深圳']
ws.process(target_cities)
print('completed')