Python 爬蟲新浪2019年五大聯賽所有球員基本數據爬取與分析

紀念自己第一個稍微用心完成的小學期項目：使用Python編程語言編寫一個網絡爬蟲，爬取新浪足球球員數據庫（http://match.sports.sina.com.cn）的數據，獲取2019年五大聯賽所有球員的基本數據並存儲到csv中，再進行統計分析。爬取信息字段要求：['中文名稱','英文名稱','英文全稱','生日','身高','體重','年齡','位置','國籍','俱樂部','球衣號碼']。

首先對網站網頁進行分析，發現各個聯賽都有相對應的鏈接頁面，且鏈接有規律：英超的爲（http://match.sports.sina.com.cn/football/opta_rank.php?year=2019&lid=1），西甲的lid爲2，德甲爲3，意甲爲4，法甲爲5。點進球隊頁面後，右邊有一列當前陣容的表格，點擊頭像便可以鏈接到球員的詳細數據頁面。所以初步計劃先爬取各個聯賽各支球隊的鏈接，然後再通過球隊鏈接來爬取球員鏈接。

編寫getteam.py

#encoding:UTF-8
import requests
from bs4 import BeautifulSoup
def get_teamlink(url):
    """Fetch a league page and collect the team links found on it.

    Args:
        url: Address of the league page to download.

    Returns:
        dict mapping the text of every anchor whose ``href`` starts with
        ``team.php?id`` to that ``href`` prefixed with
        ``http://match.sports.sina.com.cn/football/``. When several anchors
        share the same text, the last one wins.

    Raises:
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'}
    # Without a timeout an unresponsive server would block the crawl forever.
    res = requests.get(url, headers=headers, timeout=10)
    res.encoding = 'utf-8'  # decode the body as UTF-8 to avoid mojibake
    soup = BeautifulSoup(res.text, 'lxml')  # parse the returned page
    base = 'http://match.sports.sina.com.cn/football/'
    return {
        str(anchor.string): base + anchor.attrs['href']
        for anchor in soup.select('a[href^="team.php?id"]')
    }

# Collect the team links of each league; the lid query parameter selects the league.
England = get_teamlink('http://match.sports.sina.com.cn/football/opta_rank.php?year=2019&lid=1')  # Premier League
Spain = get_teamlink('http://match.sports.sina.com.cn/football/opta_rank.php?year=2019&lid=2')  # La Liga
Germany = get_teamlink('http://match.sports.sina.com.cn/football/opta_rank.php?year=2019&lid=3')  # Bundesliga
Italy = get_teamlink('http://match.sports.sina.com.cn/football/opta_rank.php?year=2019&lid=4')  # Serie A
French = get_teamlink('http://match.sports.sina.com.cn/football/opta_rank.php?year=2019&lid=5')  # Ligue 1

# Write one "name,link" line per team into a single CSV file, league by league.
with open('teamlinks.csv', 'w') as f:
    for league in (England, Spain, Germany, Italy, French):
        for key, value in league.items():
            f.write('{0},{1}\n'.format(key, value))

得到球隊鏈接teamlinks.csv文件

編寫player.py

#encoding:UTF-8
import csv
import requests
from bs4 import BeautifulSoup
# Read the team page links: the second column of every row in teamlinks.csv.
with open('teamlinks.csv','r') as f:
    reader = csv.reader(f)
    column1 = [row[1] for row in reader]

# The request headers never change, so they are built once outside the loop.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'}
player = {}  # player name -> player page link
for url in column1:
    # A timeout keeps one stalled team page from hanging the whole crawl.
    res = requests.get(url, headers=headers, timeout=10)
    res.encoding = 'gbk'
    soup = BeautifulSoup(res.text, 'lxml')  # parse the returned page
    roster = soup.find(class_='sub03_c')  # container searched for names and links
    if roster is None:
        # Nothing to extract from this page. Skipping keeps the links already
        # collected instead of aborting before anything is written to disk.
        print('no sub03_c element found, skipped: {0}'.format(url))
        continue
    names = [str(tag.get_text()) for tag in roster.find_all('p')]  # player names
    links = [str(tag.attrs['href']) for tag in roster.find_all('a')]  # href of each anchor
    # Names and links are paired purely by their position in the two lists.
    for k, name in enumerate(names):
        player[name] = links[k]

# Write one "name,link" line per player.
with open('playerlinks.csv', 'w') as f:
    for key, value in player.items():
        f.write('{0},{1}\n'.format(key, value))

得到球員鏈接

編寫state.py

#encoding:UTF-8
import csv
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup
def get_state(player):
    """Download one player's page and return the contents of its ``<dd>`` fields.

    Args:
        player: URL of the player's detail page.

    Returns:
        list of str, one entry per ``<dd>`` element found inside the first
        element with class ``txt``; each entry is that element's inner markup.

    Raises:
        requests.exceptions.Timeout: if the server does not answer in time.
        AttributeError: if the page has no element with class ``txt``.
    """
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'}
    # Without a timeout a single stalled page would block the whole crawl.
    res = requests.get(player, headers=headers, timeout=10)
    res.encoding = 'gbk'
    soup = BeautifulSoup(res.text, 'lxml')  # parse the returned page
    # decode_contents() renders only what is inside the tag, so the result does
    # not depend on the opening tag being exactly "<dd>" (slicing the rendered
    # tag with [4:-5] would cut in the wrong place if a <dd> had attributes).
    return [dd.decode_contents() for dd in soup.find(class_='txt').find_all('dd')]

# Load the player page links: the second column of every row in playerlinks.csv.
with open('playerlinks.csv','r') as links_file:
    column1 = [record[1] for record in csv.reader(links_file)]
# Fetch the detail fields of every player, one inner list per player.
player = [get_state(link) for link in column1]

p_columns=['中文名稱','英文名稱','英文全稱','生日','身高','體重','年齡','位置','國籍','俱樂部','球衣號碼']
# Turn the nested list into a DataFrame and store it as a GBK-encoded CSV file.
raw_list = pd.DataFrame(data=player, columns=p_columns)
raw_list.to_csv('player.csv', encoding='gbk')

得到球員信息

編寫country.py，分析五大聯賽球員的國籍比例

#coding:utf-8
import pandas as pd
from collections import Counter
from matplotlib import pyplot as plt
from matplotlib import font_manager

# Chart style.
plt.style.use('fivethirtyeight')
# Load the scraped player table with pandas.
data = pd.read_csv('player.csv',encoding='gbk')
plt.rcParams['font.family'] = ['sans-serif']
plt.rcParams['font.sans-serif'] = ['SimHei']
# Empty cells are read back as NaN (a float), which has no .split(); drop them
# and make sure every remaining value is a string before tokenising.
countrys = data['國籍'].dropna().astype(str)
# Count how many players there are per nationality.
country_counter = Counter()
for country in countrys:
    country_counter.update(country.split(' '))
# Keep the 15 most common entries, then reverse both lists so the largest
# value is the last one handed to barh.
top_countries = country_counter.most_common(15)
countries = [name for name, _ in top_countries]
popularity = [count for _, count in top_countries]
countries.reverse()
popularity.reverse()
# Font used for the Chinese title and label (SimHei file in the Windows fonts
# folder), so that the Chinese text is not garbled.
zh_font = font_manager.FontProperties(fname='C:\\Windows\\Fonts\\simhei.ttf')
# Horizontal bar chart.
plt.yticks(fontsize = 8)
plt.barh(countries,popularity)
plt.title("五大聯賽球員國籍比例",fontproperties=zh_font)
plt.xlabel("人數",fontproperties=zh_font)
plt.tight_layout()
plt.show()

從數據中可以看出法國球員最多，這不禁讓人覺得法國國家隊奪得2018年世界盃冠軍是有一定道理的：充足的人員儲備給國家隊主帥提供了更多的選擇。同時也可以看出，本國聯賽的發展對本國球員的成長有一定的促進作用，排在前五的恰好就是五大聯賽所在的國家。瑞士、丹麥在2016年歐洲盃的表現，也得益於他們的球員在五大聯賽中所佔的比例越來越大。

編寫old.py分析年齡

#coding:utf-8
import pandas as pd
from collections import Counter
from matplotlib import pyplot as plt
from matplotlib import font_manager

# Chart style.
plt.style.use('fivethirtyeight')
# Load the scraped player table with pandas.
data = pd.read_csv('player.csv',encoding='gbk')
plt.rcParams['font.family'] = ['sans-serif']
plt.rcParams['font.sans-serif'] = ['SimHei']
# Empty cells are read back as NaN, and a column pandas parsed as numbers holds
# no strings at all; either way .split() would fail, so drop the missing
# values and convert the rest to str before tokenising.
olds = data['年齡'].dropna().astype(str)
# Count how many players there are per age value.
old_counter = Counter()
for old in olds:
    old_counter.update(old.split(' '))
# Keep the 15 most common entries, then reverse both lists so the largest
# value is the last one handed to barh.
top_ages = old_counter.most_common(15)
oldes = [age for age, _ in top_ages]
popularity = [count for _, count in top_ages]
oldes.reverse()
popularity.reverse()
# Font used for the Chinese title and label (SimHei file in the Windows fonts
# folder), so that the Chinese text is not garbled.
zh_font = font_manager.FontProperties(fname='C:\\Windows\\Fonts\\simhei.ttf')
# Horizontal bar chart.
plt.barh(oldes,popularity)
plt.title("五大聯賽球員年齡比例",fontproperties=zh_font)
plt.xlabel("人數",fontproperties=zh_font)
plt.tight_layout()
plt.show()

得到五大聯賽球員年齡分析

從中可以看出現在的五大聯賽球員年齡偏年輕化，當打之年的球員佔據較大比例，更多的年輕小將湧現世界足壇，留給穆巴佩，登貝萊等超新星的時間不多了，另外五大聯賽中年齡最高的是來自尤文圖斯的門神布馮以及蒙彼利埃的後衛希爾頓，他們的年齡都爲41歲，不禁讓人讚歎他們的職業素養，唯有超高的職業素養以及對足球的熱愛才能夠讓他們依舊活躍在世界主流聯賽中。

編寫hight.py

#coding:utf-8
import pandas as pd
from collections import Counter
from matplotlib import pyplot as plt
from matplotlib import font_manager

# Chart style.
plt.style.use('fivethirtyeight')
# Load the scraped player table with pandas.
data = pd.read_csv('player.csv',encoding='gbk')
plt.rcParams['font.family'] = ['sans-serif']
plt.rcParams['font.sans-serif'] = ['SimHei']
# Empty cells are read back as NaN (a float), which has no .split(); drop them
# and make sure every remaining value is a string before tokenising.
heights = data['身高'].dropna().astype(str)
# Count how many players there are per height value.
height_counter = Counter()
for height in heights:
    height_counter.update(height.split(' '))
# Keep the 15 most common entries, then reverse both lists so the largest
# value is the last one handed to barh.
top_heights = height_counter.most_common(15)
heightes = [value for value, _ in top_heights]
popularity = [count for _, count in top_heights]
heightes.reverse()
popularity.reverse()
# Font used for the Chinese title and label (SimHei file in the Windows fonts
# folder), so that the Chinese text is not garbled.
zh_font = font_manager.FontProperties(fname='C:\\Windows\\Fonts\\simhei.ttf')
# Horizontal bar chart.
plt.yticks(fontsize = 8)
plt.barh(heightes,popularity)
plt.title("五大聯賽球員身高比例",fontproperties=zh_font)
plt.xlabel("人數",fontproperties=zh_font)
plt.tight_layout()
plt.show()

wight.py

#coding:utf-8
import pandas as pd
from collections import Counter
from matplotlib import pyplot as plt
from matplotlib import font_manager

# Chart style.
plt.style.use('fivethirtyeight')
# Load the scraped player table with pandas.
data = pd.read_csv('player.csv',encoding='gbk')
plt.rcParams['font.family'] = ['sans-serif']
plt.rcParams['font.sans-serif'] = ['SimHei']
# Empty cells are read back as NaN (a float), which has no .split(); drop them
# and make sure every remaining value is a string before tokenising.
weights = data['體重'].dropna().astype(str)
# Count how many players there are per weight value.
weight_counter = Counter()
for weight in weights:
    weight_counter.update(weight.split(' '))
# Keep the 15 most common entries, then reverse both lists so the largest
# value is the last one handed to barh.
top_weights = weight_counter.most_common(15)
weightes = [value for value, _ in top_weights]
popularity = [count for _, count in top_weights]
weightes.reverse()
popularity.reverse()
# Font used for the Chinese title and label (SimHei file in the Windows fonts
# folder), so that the Chinese text is not garbled.
zh_font = font_manager.FontProperties(fname='C:\\Windows\\Fonts\\simhei.ttf')
# Horizontal bar chart.
plt.yticks(fontsize = 8)
plt.barh(weightes,popularity)
plt.title("五大聯賽球員體重比例",fontproperties=zh_font)
plt.xlabel("人數",fontproperties=zh_font)
plt.tight_layout()
plt.show()

可得五大聯賽球員的身高和體重比例

可以看到有0cm和0kg的佔據大比例，這其實不是爬取數據中出錯了，而是新浪數據庫的數據不全導致。普遍身高爲180cm，體重爲70kg。其中身高最高的是來自斯帕爾俱樂部的蒂亞姆，他的身高爲202cm，最矮的是來自佈雷斯特的巴託齊奧，他的身高爲160cm，體重最大的是來自萊斯特城俱樂部的摩根，他的體重爲101kg，體重最小的是來自那不勒斯俱樂部的莫滕斯和來自摩納哥俱樂部的索菲亞內-迪奧普，他們的體重都爲55kg。

源碼和文件已上傳到csdn資源中，可以修改部分代碼獲取別的數據，僅用於學術研究請勿他用

Python 爬蟲新浪2019年五大聯賽所有球員基本數據爬取與分析

Educoder python pandas介紹第7關：總的挑戰

Educoder python NumPy基礎及取值操作第4關：隨機數生成

python程序基礎 bmi計算題

歡迎關注我的個人網站

Educoder Python入門之模塊

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結

Python 爬蟲 新浪2019年五大聯賽所有球員基本數據爬取與分析

Python 爬蟲新浪2019年五大聯賽所有球員基本數據爬取與分析