2019年程序員崗位招聘信息分析

        爬取某招聘網站上 java、python、c/c++、php 四種語言在北京、上海、廣州、深圳四個一線城市公開發佈的職位信息並進行分析。數據樣本來自搜索結果的前30頁,樣本大小約爲6058條。

 

一、數據抓取

       非常簡單,該網站基本上沒有反抓取策略

def downloader(city, keyword, page):
    """Fetch one page of Lagou search results and upsert each posting into MongoDB.

    The search form is POSTed to the ``positionAjax.json`` endpoint and the
    request is repeated until a response with HTTP status 200 has been parsed
    and stored. Any other status, or any exception, triggers another attempt.

    :param city: city name; URL-quoted into the ``city`` query parameter.
    :param keyword: search keyword, sent as the ``kd`` form field.
    :param page: page number, sent as the ``pn`` form field.
    :return: None. Postings are written to the ``lagou`` collection of the
        ``dataanalysis`` database, using ``positionId`` (as a string) as ``_id``.
    """
    url = "https://www.lagou.com/jobs/positionAjax.json?city={}&needAddtionalResult=false".format(
        quote(city)
    )
    # Kept under its own name: the form payload must survive every retry
    # unchanged, so the parsed response below must not be bound to it.
    payload = {
        "first": "false",
        "pn": page,
        "kd": keyword,
    }
    # NOTE(review): a captured browser session cookie is hard-coded below;
    # it presumably expires and should come from configuration - confirm.
    headers = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Connection": "keep-alive",
        "Content-Length": "26",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Host": "www.lagou.com",
        "Cookie": "WEBTJ-ID=20181228093856-167f276e34849d-015bd2bf49274b-6114147a-1327104-167f276e34a334; _ga=GA1.2.651225173.1545961137; _gid=GA1.2.952777220.1545961137; user_trace_token=20181228093740-29e0dba1-0a41-11e9-b14d-525400f775ce; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Fs%3Fwd%3D%25E6%258B%2589%25E5%258B%25BE%25E7%25BD%2591%26rsv_spt%3D1%26rsv_iqid%3D0xdc8f964d00002f4f%26issp%3D1%26f%3D8%26rsv_bp%3D1%26rsv_idx%3D2%26ie%3Dutf-8%26rqlang%3Dcn%26tn%3Dbaiduhome_pg%26rsv_enter%3D1%26oq%3D%2525E4%2525B8%252593%2525E8%2525B5%252584%2525E5%25258A%25259E%26rsv_t%3Df7a1d2gJnPyNK%252FsS4vTWJ9EOKhzAsK05aVgqC43iWtqWmiKpIp0u6YQblMkUzbi3KwO7%26inputT%3D8441%26rsv_pq%3D9f44c2a800002af6%26rsv_sug3%3D57%26rsv_sug1%3D62%26rsv_sug7%3D101%26bs%3D%25E4%25B8%2593%25E8%25B5%2584%25E5%258A%259E; LGUID=20181228093740-29e0e252-0a41-11e9-b14d-525400f775ce; LGSID=20181228093745-2cd1a71c-0a41-11e9-b14d-525400f775ce; PRE_UTM=m_cf_cpc_baidu_pc; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Flp%2Fhtml%2Fcommon.html%3Futm_source%3Dm_cf_cpc_baidu_pc%26m_kw%3Dbaidu_cpc_bj_e110f9_d2162e_%25E6%258B%2589%25E5%258B%25BE%25E7%25BD%2591; JSESSIONID=ABAAABAAAGGABCB3EDF3AFE52B111A35A8BDCCF214C647F; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1545961137,1545961142,1545961149; index_location_city=%E5%8C%97%E4%BA%AC; TG-TRACK-CODE=index_search; SEARCH_ID=832387387eb944a39636c9973cbd41c4; LGRID=20181228093800-3605ba8a-0a41-11e9-ad84-5254005c3644; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1545961158",
        "Origin": "https://www.lagou.com",
        "Referer": "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
        "X-Anit-Forge-Code": "0",
        "X-Anit-Forge-Token": "None",
        "X-Requested-With": "XMLHttpRequest",
    }
    proxies = {
        "http": "****",
        "https": "****",
    }
    while True:
        try:
            response = requests.post(url, data=payload, headers=headers, proxies=proxies)
            response.encoding = "utf-8"
            if response.status_code == 200:
                body = json.loads(response.text)
                # jsonpath() yields a list of matches; the first match is the
                # list of postings. A body without that path raises here and
                # is retried by the handler below.
                result = jsonpath.jsonpath(body, "$.content.positionResult.result")[0]
                with MongodbTools("dataanalysis") as mongo:
                    lagou = mongo.db["lagou"]
                    for row in result:
                        row["_id"] = "{}".format(row["positionId"])
                        lagou.update_one({"_id": row["_id"]}, {"$set": row}, upsert=True)
                        print("update or insert data = {}".format(row["_id"]))
                break
        except Exception as e:
            # Exception rather than BaseException, so Ctrl-C / SystemExit can
            # still stop this otherwise endless retry loop.
            print(e)

直接保存數據到mongodb中。

對人工智能感興趣的朋友可以加羣705673780,一起學習交流

二、數據分析

1、數據清洗,格式化

import pandas as pd

import numpy as np

from matplotlib import pyplot as plt

from datetime import datetime, timedelta

from pymongo import MongoClient

import time

# Collection "lagou" of database "dataanalysis" on the default MongoClient().
mongo = MongoClient()["dataanalysis"]["lagou"]

# Project only the fields the analysis uses and leave out Mongo's "_id".
values = mongo.find({}, {"_id": 0, "positionAdvantage": 1, "salary": 1, "city": 1, "positionName": 1, "workYear": 1, "education": 1, "industryField": 1, "companySize": 1, "financeStage": 1, "firstType": 1, "secondType": 1, "thirdType": 1})

# Drain the cursor so the DataFrame is built from an in-memory list.
values = list(values)

df = pd.DataFrame(values)

# 格式化公司規模

def length(data, type):
    """Extract one bound of a company-size description such as ``"15-50人"``.

    :param data: object exposing ``.values`` (a one-cell row when called via
        ``DataFrame.apply(..., axis=1)``); only the first value is read.
    :param type: ``1`` selects the lower bound, any other value the upper bound.
    :return: ``0`` for an empty row or a missing/non-text cell;
        ``2000`` / ``10000`` when the text contains "以上";
        the integer left / right of ``-`` for a range;
        ``0`` / ``15`` for any other text.
    """
    cells = data.values
    # len() instead of truthiness: truth-testing an array is ambiguous for
    # more than one element.
    if len(cells) == 0:
        return 0
    value = cells[0]
    # Missing cells may arrive as None or NaN; neither has string methods.
    if not isinstance(value, str) or not value:
        return 0
    want_min = type == 1
    if "以上" in value:
        return 2000 if want_min else 10000
    if "-" in value:
        bounds = value.replace("人", "").split("-")
        return int(bounds[0]) if want_min else int(bounds[1])
    return 0 if want_min else 15

def min_staff(data):
    """Return the lower company-size bound for ``data``, i.e. ``length(data, 1)``."""
    return length(data, 1)

def max_staff(data):
    """Return the upper company-size bound for ``data``, i.e. ``length(data, 2)``."""
    return length(data, 2)

# Add the numeric staff bounds (computed row by row from the raw text) and
# then discard the text column they were derived from.
df = df.assign(
    min_staff=df[["companySize"]].apply(min_staff, axis=1),
    max_staff=df[["companySize"]].apply(max_staff, axis=1),
).drop(columns=["companySize"])

# 格式化薪資

def salary(data, type):
    """Convert a salary range such as ``"10k-20k"`` into a number.

    :param data: object exposing ``.values`` (a one-cell row when called via
        ``DataFrame.apply(..., axis=1)``); only the first value is read.
    :param type: ``1`` for the lower bound, ``2`` for the upper bound, any
        other value for the midpoint of the two.
    :return: the selected figure multiplied by 1000 (the midpoint is a float),
        or ``0`` when the cell is empty, missing, not text, or has no ``-``.
    """
    cells = data.values
    # len() instead of truthiness: truth-testing an array is ambiguous for
    # more than one element.
    if len(cells) == 0:
        return 0
    value = cells[0]
    # Missing cells may arrive as None or NaN; neither has string methods.
    if not isinstance(value, str) or "-" not in value:
        return 0
    parts = value.replace("k", "").replace("K", "").split("-")
    low = int(parts[0]) * 1000
    high = int(parts[1]) * 1000
    if type == 1:
        return low
    if type == 2:
        return high
    return (low + high) / 2

def min_salary(data):
    """Return the lower salary bound for ``data``, i.e. ``salary(data, 1)``."""
    return salary(data, 1)

def max_salary(data):
    """Return the upper salary bound for ``data``, i.e. ``salary(data, 2)``."""
    return salary(data, 2)

def avg_salary(data):
    """Return the midpoint of the salary range for ``data``, i.e. ``salary(data, 3)``."""
    return salary(data, 3)

# Add the three numeric salary columns, each computed row by row from the
# raw "salary" text column (which is kept).
df = df.assign(
    min_salary=df[["salary"]].apply(min_salary, axis=1),
    max_salary=df[["salary"]].apply(max_salary, axis=1),
    avg_salary=df[["salary"]].apply(avg_salary, axis=1),
)

# 格式化語言

def language(data):
    """Classify a position title into one of the four analysed languages.

    :param data: object exposing ``.values`` (a one-cell row when called via
        ``DataFrame.apply(..., axis=1)``); only the first value is read.
    :return: ``"python"``, ``"c/c++"``, ``"java"`` or ``"php"``; ``None`` when
        the cell is empty, missing, not text, or names none of them.
    """
    cells = data.values
    if len(cells) == 0:
        return None
    value = cells[0]
    # Missing cells may arrive as None or NaN; neither has string methods.
    if not isinstance(value, str) or not value:
        return None
    value = value.upper()
    # Order matters: the bare letter "C" occurs inside many unrelated words
    # ("ORACLE", "CRM", ...), so it is only tried after every explicit
    # language name has failed to match.
    for needle, label in (
        ("PYTHON", "python"),
        ("C++", "c/c++"),
        ("JAVA", "java"),
        ("PHP", "php"),
        ("C", "c/c++"),
    ):
        if needle in value:
            return label
    return None

# Tag every posting with its language, then drop each row that has a missing
# value in any column (this includes rows whose language came back as None).
df = df.assign(language=df[["positionName"]].apply(language, axis=1)).dropna()

對薪資、語言、公司規模進行格式化,並刪除含有 NaN 的數據。

 

2、每個城市地區的平均工資圖

# Per-city average salary by language. The loop also collects the series in
# total_x / total_y / total_city for the cross-city comparison chart.
total_x = None
total_y = []
total_city = []

# Use one shared language axis so every city's values line up position by
# position, even when a city has no posting for some language (NaN there).
languages = sorted(df["language"].unique())

for city_name, data in df.groupby(by="city"):
    result = data.groupby(by=["language"])["avg_salary"].mean().reindex(languages)
    plt.figure(figsize=(20, 8), dpi=80)
    _x = result.index
    _y = result.values
    plt.bar(_x, _y)
    total_x = _x
    total_y.append(_y)
    total_city.append(city_name)
    plt.xlabel("語言")
    plt.ylabel("平均薪資")
    plt.title("{}地區編程語言平均薪資".format(city_name))
    plt.grid()

# A bare `plt` expression displays nothing when run as a script.
plt.show()

 

3、平均薪資城市之間的對比

# Grouped bar chart: one group per language, one bar per city.
plt.figure(figsize=(20, 8), dpi=80)
interval = 6  # distance between the centres of neighbouring language groups
ind = np.array(range(0, len(total_x) * interval, interval))
width = 1
# Half the number of cities centres each group on its tick; with four cities
# this equals the previously hard-coded offset of 2.
half_group = len(total_city) / 2
for index, city_name in enumerate(total_city):
    plt.bar(ind - (half_group - index) * width + width / 2, total_y[index], label=city_name, width=width)
plt.xticks(ind, total_x)
plt.xlabel("語言")
plt.ylabel("平均薪資")
plt.title("一線城市編程語言平均薪資")
plt.grid()
plt.legend()
# A bare `plt` expression displays nothing when run as a script.
plt.show()

可見大帝都的平均工資最高

 

4、崗位優勢的分析

import re


def position_advantage(data):
    """Split a free-text "position advantage" cell into individual tags.

    :param data: object exposing ``.values`` (a one-cell row when called via
        ``DataFrame.apply(..., axis=1)``); only the first value is read.
    :return: list of tags obtained by removing ``.`` and ``~`` and splitting on
        the separator characters; may contain empty strings when separators
        are adjacent. ``[]`` for an empty, missing or non-text cell.
    """
    cells = data.values
    if len(cells) == 0:
        return []
    value = cells[0]
    # Missing cells may arrive as None or NaN; re.sub() needs a string.
    if not isinstance(value, str) or not value:
        return []
    value = re.sub(r"[.~]", "", value)
    # NOTE(review): the class previously listed "," and ";" twice, which looks
    # like the full-width forms were flattened to ASCII; they are restored
    # here on that assumption - confirm against real data.
    return re.split(r"[,,;; 、+-]", value)

# Every distinct, non-empty advantage tag across all postings (sorted so the
# column order is reproducible between runs).
labels = sorted({
    tag
    for tags in df[["positionAdvantage"]].apply(position_advantage, axis=1).values
    for tag in tags
    if tag
})

# Indicator matrix: one row per posting, one column per tag, 1 when the
# posting's advantage text contains the tag.
position_data = pd.DataFrame(np.zeros((df.shape[0], len(labels)), dtype=int), columns=labels, index=df.index)
for label in labels:
    # regex=False: tags are literal text and may contain regex metacharacters.
    # .loc: chained `frame[col][mask] = 1` may write to a temporary copy.
    mask = df["positionAdvantage"].str.contains(label, regex=False, na=False)
    position_data.loc[mask, label] = 1

# Pie chart of the ten most frequent tags plus an "other" slice.
result = position_data.sum().sort_values(ascending=False)
size = list(result.iloc[:10].values)
labels = list(result.iloc[:10].index)
size.append(result.sum() - sum(size))
labels.append("其它")
explode = [0] * len(size)
explode[0] = 0.1  # pull the largest slice out slightly
plt.figure(figsize=(10, 10), dpi=80)
plt.pie(size, explode=explode, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
plt.title("崗位優勢百分比")
# A bare `plt` expression displays nothing when run as a script.
plt.show()

 

5、城市之間的崗位優勢對比

# For each city: the share of each of the ten most frequent tags among all
# tag hits in that city, scaled by 10000.
total_value = []
total_label = []
labels = list(result.iloc[:10].index)
for city in total_city:
    data = position_data[df["city"] == city]
    total_size = data.sum().sum()
    total_label.append(city)
    total_value.append((data[labels].sum() / total_size * 10000).values.tolist())

# Grouped bar chart: one group per tag, one bar per city.
plt.figure(figsize=(20, 8), dpi=80)
interval = 8  # distance between the centres of neighbouring tag groups
ind = np.array(range(0, len(labels) * interval, interval))
width = 1
# Half the number of cities centres each group on its tick; with four cities
# this equals the previously hard-coded offset of 2.
half_group = len(total_label) / 2
for index, city_label in enumerate(total_label):
    plt.bar(ind - (half_group - index) * width + width / 2, total_value[index], label=city_label, width=width)
plt.xticks(ind, labels)
plt.xlabel("福利")
plt.ylabel("佔比(*100)")
plt.title("崗位優勢佔比圖")
plt.grid()
plt.legend()
# A bare `plt` expression displays nothing when run as a script.
plt.show()

 

6、工作經驗要求佔比分析

# Share of each work-experience requirement, one pie chart per city.
for city_name, data in df.groupby(by="city"):
    result = data.groupby(by=["workYear"])["avg_salary"].count().sort_values()
    plt.figure(figsize=(8, 8), dpi=80)
    _x = result.index
    _y = result.values
    plt.pie(_y, labels=_x, autopct='%1.1f%%', shadow=True, startangle=90)
    # The title previously said "學歷要求" (education) although this chart
    # plots the workYear column.
    plt.title("{}地區編程語言工作經驗要求佔比".format(city_name))
    plt.grid()
# A bare `plt` expression displays nothing when run as a script.
plt.show()

 

7、學歷要求佔比分析

# Share of each education requirement, one pie chart per city.
for city_name, data in df.groupby(by="city"):
    result = data.groupby(by=["education"])["avg_salary"].count().sort_index()
    plt.figure(figsize=(8, 8), dpi=80)
    _x = result.index
    _y = result.values
    plt.pie(_y, labels=_x, autopct='%1.1f%%', shadow=True, startangle=90)
    plt.title("{}地區編程語言學歷要求佔比".format(city_name))
    plt.grid()
# A bare `plt` expression displays nothing when run as a script.
plt.show()

 

8、繪製崗位優勢的詞雲圖

# Build a word cloud of the position-advantage tags.
try:
    # scipy.misc.imread was removed in SciPy 1.2. It is only needed for the
    # commented-out mask below, so a missing import must not stop the script.
    from scipy.misc import imread
except ImportError:
    imread = None

from wordcloud import WordCloud
from wordcloud import ImageColorGenerator
import matplotlib.pyplot as plt
from os import path

cloud = WordCloud(
    # Font file; without a font that covers Chinese the text renders garbled.
    # The file name must not contain Chinese characters.
    font_path="C:/simfang.ttf",
    # font_path=path.join(d,'simsun.ttc'),
    # Background colour (black by default); any colour may be used.
    background_color='black',
    # Shape of the cloud.
    # mask=color_mask,
    # Maximum number of words.
    max_words=400,
    # Largest font size; defaults to the image height when not given.
    max_font_size=100,
    # Canvas width and height; ignored when a mask is set.
    width=1200,
    height=800,
    margin=2,
    # Fraction of words laid out horizontally (default 0.9, i.e. 0.1 vertical).
    prefer_horizontal=0.8,
)

# Tag -> number of postings mentioning it, most frequent first.
result = position_data.sum().sort_values(ascending=False)
_labels = list(result.index)
_frequency = list(result.values)
_data = dict(zip(_labels, _frequency))

wc = cloud.generate_from_frequencies(_data)
wc.to_file("cloud.jpg")  # save the image

# Show the word cloud without axes.
plt.imshow(wc)
plt.axis('off')
# A bare `plt` expression displays nothing when run as a script.
plt.show()

 

對人工智能感興趣的朋友可以加羣705673780,一起學習交流

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章