# -*- coding:utf-8 -*-
"""Download the scenery picture galleries of ivsky.com into ``images/``.

The crawl walks three levels of pages:

1. the start page, whose ``ul.tpmenu`` menu links to the big categories
   (the first menu entry, "all categories", is skipped);
2. each big-category page, whose ``div.sline`` block links to the small
   categories;
3. each small-category gallery, which is paginated as ``index_<n>.html``
   and lists its pictures inside ``div.il_img`` elements.

Every picture is written to
``images/<big title>/<small title>/<alt><page>-<index>.jpg``.

The crawl only starts when the file is executed as a script.
"""
import os
import re
from urllib.parse import urljoin

# Fetches page source and binary payloads (images, video, audio, ...).
import requests
# Parses downloaded HTML so it can be queried with XPath.
from lxml import etree

START_URL = "http://www.ivsky.com/tupian/ziranfengguang/"
OUTPUT_ROOT = "images"
# Seconds to wait on a request; without a timeout a stalled connection
# would block the crawl forever.
REQUEST_TIMEOUT = 15
# Path separators, control characters and other characters that are not
# safe inside a single file or directory name.
_UNSAFE_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def _safe_name(text):
    """Return *text* rewritten so it is usable as one path component."""
    cleaned = _UNSAFE_CHARS.sub("_", text).strip()
    # "" / "." / ".." would address the parent tree instead of a new entry.
    if cleaned in ("", ".", ".."):
        return "_"
    return cleaned


def _first(node, expr):
    """Return the first XPath result of *expr* on *node*, or ``None``.

    XPath queries always return a list, which may be empty.
    """
    found = node.xpath(expr)
    return found[0] if found else None


def _get(session, url):
    """GET *url* and return the response.

    Raises:
        requests.RequestException: on network errors, timeouts and
            non-2xx status codes.
    """
    response = session.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response


def _query(response, expr):
    """Parse *response* as HTML and return the nodes matching *expr*."""
    # An empty body has no tree to query.
    if not response.content.strip():
        return []
    # ``content`` (bytes) rather than ``text`` lets the parser detect the
    # page encoding itself.
    root = etree.HTML(response.content)
    if root is None:
        return []
    return root.xpath(expr)


def _links(session, page_url, expr, skip=0):
    """Return ``(title, absolute_url)`` for the anchors *expr* selects.

    Args:
        session: ``requests.Session`` used for the download.
        page_url: page to fetch.
        expr: XPath selecting ``<a>`` elements.
        skip: number of leading anchors to ignore.

    Anchors without text or without an ``href`` are dropped.
    """
    response = _get(session, page_url)
    links = []
    for anchor in _query(response, expr)[skip:]:
        title = _first(anchor, "text()")
        href = _first(anchor, "@href")
        if title is None or href is None:
            continue
        # urljoin resolves relative and protocol-relative hrefs against the
        # page that was actually served.
        links.append((title, urljoin(response.url, href)))
    return links


def _save_image(session, image_url, target):
    """Download *image_url* into the file *target*; report failures."""
    try:
        response = _get(session, image_url)
    except requests.RequestException as exc:
        # One broken picture must not abort the whole crawl.
        print("skipped %s: %s" % (image_url, exc))
        return
    with open(target, "wb") as handle:
        handle.write(response.content)


def _download_gallery(session, gallery_url, folder):
    """Save every picture of the paginated gallery at *gallery_url*.

    Pages are requested as ``<gallery>/index_<n>.html`` starting with the
    gallery URL itself as page 1. The walk stops at the first page that
    cannot be fetched, lists no pictures, or lists no picture that was not
    already downloaded.
    """
    # Avoid "//index_2.html" when the gallery URL ends with a slash.
    base_url = gallery_url.rstrip("/")
    page_url = gallery_url
    page = 1
    seen = set()
    while True:
        try:
            response = _get(session, page_url)
        except requests.RequestException as exc:
            print("stopped at %s: %s" % (page_url, exc))
            break
        images = _query(response, "//div[@class='il_img']/a/img")
        if not images:
            break
        found_new = False
        for idx, img in enumerate(images):
            src = _first(img, "@src")
            if src is None:
                continue
            image_url = urljoin(response.url, src)
            if image_url in seen:
                continue
            seen.add(image_url)
            found_new = True
            # File name: alt text + page number + position on the page.
            alt = _first(img, "@alt") or ""
            name = _safe_name(alt + str(page) + "-" + str(idx) + ".jpg")
            _save_image(session, image_url, os.path.join(folder, name))
        # Guard against an endless loop in case the server answers
        # out-of-range page numbers with a page it has already served.
        if not found_new:
            break
        page += 1
        page_url = base_url + "/index_%s.html" % page
        print(page_url)


def main():
    """Crawl all categories below ``START_URL`` and store their pictures."""
    with requests.Session() as session:
        # skip=1 drops the leading "all categories" menu entry.
        categories = _links(
            session, START_URL, "//ul[@class='tpmenu']/li/a", skip=1
        )
        for big_title, big_url in categories:
            try:
                galleries = _links(
                    session, big_url, "//div[@class='sline']/div/a"
                )
            except requests.RequestException as exc:
                print("skipped %s: %s" % (big_url, exc))
                continue
            for small_title, small_url in galleries:
                folder = os.path.join(
                    OUTPUT_ROOT, _safe_name(big_title), _safe_name(small_title)
                )
                # exist_ok avoids the check-then-create race of a separate
                # os.path.exists() test.
                os.makedirs(folder, exist_ok=True)
                _download_gallery(session, small_url, folder)


if __name__ == "__main__":
    main()
基於python的-爬取風景圖片網圖片
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.