前言:
第一次接觸爬蟲,還是大學做挑戰杯的時候了,單身奶狗一枚。那個時候,蒙着頭就是找資料,然後寫。嘗試了很多次,但是還是不夠完美,而且JS寫得比較複雜。但是還是好多學姐學妹很崇拜我(壞笑)。
現在已經結婚,雖然依然每天不斷學習與提升自己,但是畢竟經歷社會多年的風雨磨鍊,不是當年非常單純的小男生。非常感謝親愛的陪伴。作爲老公,會一直陪伴你,守護你。讓技術歲月,銘記我們的愛情。
import requests
import os
import time
import threading
from bs4 import BeautifulSoup
def download_page(url, timeout=10):
    """Fetch *url* and return the response body as text decoded as GB2312.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the calling thread
            indefinitely.

    Returns:
        The response body as a ``str``.

    Raises:
        requests.exceptions.RequestException: If the request fails or times
            out.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"}
    r = requests.get(url, headers=headers, timeout=timeout)
    # Override the encoding requests guessed from the HTTP headers so that
    # ``r.text`` is decoded as GB2312 (presumably what the target site serves).
    r.encoding = 'gb2312'
    return r.text
def get_pic_list(html):
    """Parse a listing page and download every album it links to.

    Each ``<li class="wp-item">`` is expected to contain an
    ``<h3 class="tit">`` wrapping an ``<a>``; the anchor's ``href`` and text
    are handed to ``get_pic``. Entries that do not match this shape are
    skipped instead of aborting the whole page.

    Args:
        html: HTML source of a listing page.
    """
    soup = BeautifulSoup(html, 'html.parser')
    pic_list = soup.find_all('li', class_='wp-item')
    for item in pic_list:
        title = item.find('h3', class_='tit')
        a_tag = title.find('a') if title is not None else None
        if a_tag is None:
            # Malformed entry: no title anchor to follow.
            continue
        link = a_tag.get('href')
        if not link:
            continue
        text = a_tag.get_text()
        get_pic(link, text)
def get_pic(link, text):
    """Download every image of one album into ``pic/<text>/``.

    Args:
        link: URL of the album page.
        text: Album title; used as the name of the sub-directory under
            ``pic`` that receives the images.
    """
    html = download_page(link)  # download the album page
    soup = BeautifulSoup(html, 'html.parser')
    container = soup.find('div', id="picture")
    if container is None:
        # The page has no picture container, so there is nothing to save.
        return
    pic_list = container.find_all('img')
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"}
    create_dir('pic/{}'.format(text))
    for img in pic_list:
        pic_link = img.get('src')  # the concrete URL of the image
        if not pic_link:
            continue
        # Download the image, then save it to a file.
        r = requests.get(pic_link, headers=headers, timeout=10)
        # Name the file after the image URL. The album URL (``link``) is the
        # same for every image, so using it made each image overwrite the
        # previous one.
        with open('pic/{}/{}'.format(text, pic_link.split('/')[-1]), 'wb') as f:
            f.write(r.content)
        # Pause between images, presumably to throttle requests to the site.
        time.sleep(1)
def create_dir(name):
    """Create directory *name*, including missing parents, if it is absent.

    The function is called from several worker threads, so a separate
    "exists?" check followed by ``makedirs`` could race and raise
    ``FileExistsError``. ``exist_ok=True`` makes the call safe to repeat.

    Args:
        name: Path of the directory to create.
    """
    os.makedirs(name, exist_ok=True)
def execute(url):
    """Process one listing page: download it, then fetch all of its albums.

    This is the target function run by each worker thread started in
    ``main``.

    Args:
        url: URL of the listing page.
    """
    page_html = download_page(url)
    get_pic_list(page_html)
def main():
    """Download listing pages 1-69 using at most five worker threads.

    Pages are taken from a queue in order. A new worker is started whenever
    fewer than five are alive, and the function returns only after every
    worker has finished.
    """
    create_dir('pic')
    queue = list(range(1, 70))
    threads = []
    while queue:
        # Rebuild the list rather than calling remove() while iterating it,
        # which skips the element following each removed one.
        threads = [t for t in threads if t.is_alive()]
        while len(threads) < 5 and queue:
            cur_page = queue.pop(0)
            url = 'http://meizitu.com/a/more_{}.html'.format(cur_page)
            thread = threading.Thread(target=execute, args=(url,))
            # Daemon workers do not keep the process alive if the main
            # thread is interrupted.
            thread.daemon = True
            thread.start()
            # Report the worker's name; current_thread() here would always
            # be the main thread.
            print('{}正在下載{}頁'.format(thread.name, cur_page))
            threads.append(thread)
        if queue:
            # All slots are busy: wait briefly instead of spinning the CPU.
            time.sleep(0.1)
    # Daemon threads are killed when the main thread exits, so wait for the
    # remaining downloads to complete before returning.
    for thread in threads:
        thread.join()
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()