python爬取美團信息數據，人生第一個爬蟲程序

原創

2018-10-27 00:19

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import requests
import re
import json
import time
import random
from requests.exceptions import RequestException

def get_ono_page(url, timeout=10):
    """Download one page and return its body text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status or any ``requests`` error).
    """
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)"
    }
    try:
        # Without a timeout, requests.get can block forever on a stalled server.
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        # Connection problems, timeouts, invalid URLs, ... are reported as None.
        return None
    if response.status_code == 200:
        return response.text
    return None

# Sample record embedded in the page: {"poiId":1782324,"frontImg":"https://img.meituan.net/600.600/msmerchant/15063dce5c3491d6383b015df333524186600.jpg","title":"原石牛扒（石巖店）","avgScore":5,"allCommentNum":3241,"address":"寶安區寶石東路377號2樓（石巖影劇院東側）","avgPrice":59,

def deal_one_page(html):
    """Extract the restaurant records embedded in a listing page.

    Args:
        html: Page source as a string, or ``None`` when the download failed.

    Returns:
        A list of dicts with the keys ``frontImg``, ``title``, ``avgScore``,
        ``allCommentNum``, ``address`` and ``avgPrice``. Every value is the
        matched text as a string, without the surrounding JSON quotes.
        An empty list is returned when ``html`` is empty or ``None``.
    """
    if not html:
        # Nothing was downloaded, so there is nothing to parse.
        return []
    # One capture group per field. Capturing the values directly avoids
    # splitting the match on "," / ":", which breaks as soon as a title or
    # an address itself contains one of those characters.
    pattern = re.compile(
        r'"frontImg":"([\s\S]*?)","title":"([\s\S]*?)","avgScore":([\s\S]*?),'
        r'"allCommentNum":(\d*?),"address":"([\s\S]*?)","avgPrice":(\d*?),'
    )
    keys = ("frontImg", "title", "avgScore", "allCommentNum", "address", "avgPrice")
    resultsL = [dict(zip(keys, match)) for match in pattern.findall(html)]
    print(resultsL)
    return resultsL

def write2File(item, filename="meituanmeishi.txt"):
    """Append one scraped record to a text file as a single JSON line.

    Args:
        item: JSON-serialisable record (a dict produced by the parser).
        filename: File to append to; defaults to ``meituanmeishi.txt``.
    """
    with open(filename, "a", encoding="utf-8") as f:
        # ensure_ascii=False keeps Chinese text readable in the file.
        # The newline is appended to the dumped string, not to the flag:
        # ``False + '\n'`` raises TypeError.
        f.write(json.dumps(item, ensure_ascii=False) + '\n')

def crawlPage(i):
    """Download listing page number ``i`` and store every record found on it.

    Args:
        i: Page number inserted into the listing URL
            (e.g. http://sz.meituan.com/meishi/b32/pn1/ for ``i == 1``).
    """
    url = "http://sz.meituan.com/meishi/b32/pn" + str(i) + '/'
    html = get_ono_page(url)
    if html is None:
        # The download failed (network error or non-200 status); skip this
        # page instead of handing None to the parser.
        return
    # Parse the page and append each record to the local output file.
    for item in deal_one_page(html):
        write2File(item)

if __name__ == "__main__":
    # Number of pages to crawl; adjust to the amount of data wanted.
    # Listing pages are numbered from 1 (.../pn1/, .../pn2/, ...), so the
    # loop starts at 1 rather than requesting a non-existent page 0.
    for i in range(1, 4):
        crawlPage(i)
        # Rest 1 to 3 seconds at random after each page.
        time.sleep(random.randint(1, 3))
    print('爬取完成')

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

python爬取美團信息數據，人生第一個爬蟲程序

《Python進階》學習筆記

Leetcode 3161. 物塊放置查詢

leetcode 60 排列序列

一個docker容器暴露多個端口

微服務實踐之使用 Visual Studio 2022 調試Dapr 應用程序

wpf附加屬性理解 WPF附加屬性

Google資深工程師推薦Python面試必須要看的15個問題

Python中的urllib.request模塊

平均年薪70萬？剛剛，這類程序員又漲薪了

機器學習之FP-growth頻繁項集算法

機器學習之優雅落地線性迴歸法

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結