import requests
import re
import json
import os
class NeiHanSpider():
    """Scraper for joke posts on budejie.com.

    Fetches listing pages 1-10, extracts author/time/content/image fields
    with regular expressions, appends each item as one JSON line to
    ./data/duanzi.json and downloads any attached image to ./data/imgs/.
    """

    def __init__(self):
        # First listing page; page N is start_url + str(N).
        self.start_url = "http://www.budejie.com/"
        # Browser-like UA so the site serves the normal desktop markup.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
        }

    def get_html(self, url):
        """Return the decoded body of *url*, or None on a non-200 reply."""
        response = requests.get(url, headers=self.headers, timeout=10)
        if response.status_code == 200:
            return response.content.decode()
        return None

    @staticmethod
    def _first(pattern, text, flags=0):
        """Return the first capture of *pattern* in *text*, or None when absent."""
        found = re.findall(pattern, text, flags)
        return found[0] if found else None

    def parse_content(self, html_str):
        """Parse one listing page into a list of item dicts.

        Each dict has keys author_name, pub_time, content, img_src
        (img_src is None for text-only posts).  Returns [] when html_str
        is falsy or the expected markup is absent, instead of raising
        IndexError as blind [0]/[1] indexing would.
        """
        if not html_str:
            return []
        div = re.findall(r'"j-r-list"(.*?)j-r-wrst gud-put index-wrst', html_str, re.S)
        # The page normally carries two list containers; join what we
        # actually found (at most two) rather than indexing blindly.
        div_html = "".join(div[:2])
        li = re.findall(r'<li>(.*?)<!--操作工具條-->', div_html, re.S)
        data = []
        for l in li:
            author_name = self._first(r'class="u-user-name".*>(.*)</a>', l)
            pub_time = self._first(r'class="u-time f-ib f-fr">(.*)</span>', l)
            content = self._first(r'class="j-r-list-c-desc">.*?<a.*?>(.*?)</a>.*</div>', l, re.S)
            # Image is optional: text-only posts have no j-r-list-c-img block.
            img_src = self._first(r'class="j-r-list-c-img">.*data-original="(.*)" title', l, re.S)
            if author_name is None or pub_time is None or content is None:
                # Skip fragments that lack the mandatory fields instead of crashing.
                continue
            data.append({
                "author_name": author_name,
                "pub_time": pub_time,
                "content": content,
                "img_src": img_src,
            })
        return data

    def save_img(self, url, content):
        """Download the image at *url* into ./data/imgs/, named after the post text."""
        # Drop characters that are illegal in Windows/Unix file names
        # (same set as before, written as a plain character class).
        content = re.sub(r'[\\/:*?<>|\n]', "", content)
        end_name = os.path.splitext(url)[1]
        filename = content + end_name
        os.makedirs("./data/imgs", exist_ok=True)
        # Use the same headers and timeout as get_html for consistency.
        response = requests.get(url, headers=self.headers, timeout=10)
        with open("./data/imgs/" + filename, "wb") as fp:
            fp.write(response.content)
        print(content + "寫入成功...")

    def run(self):
        """Crawl pages 1-10, persist items as JSON lines, download images."""
        os.makedirs("./data", exist_ok=True)
        for index, url in enumerate([self.start_url + "{}".format(i + 1) for i in range(10)]):
            print("正在爬取第{}頁...".format(index + 1) + "{}".format(url))
            html_str = self.get_html(url)
            # parse_content tolerates None/empty pages and returns [].
            data = self.parse_content(html_str)
            # Open the output once per page (not once per item) and let the
            # with-block close it even if json.dump raises.
            with open("./data/duanzi.json", "a", encoding="utf-8") as f:
                for d in data:
                    json.dump(d, f, ensure_ascii=False)
                    f.write("\n")
                    if d["img_src"]:
                        self.save_img(d["img_src"], d["content"])
# Script entry point: crawl and persist when executed directly.
if __name__ == '__main__':
    spider = NeiHanSpider()
    spider.run()
# 運行結果如下 (sample run output follows):