由於之前有 Java 基礎和 Web 開發基礎,所以把廖雪峯老師的 Python 3 教程(https://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000 )看到「面向對象」這裏,就開始學習 Python 爬蟲。
記錄下今天的第一個爬蟲例子
爬的是知乎的其中一個話題,但是感覺圖片並沒有爬完,目前還不清楚原因,有待以後深入研究瞭解。
貼下收穫以及代碼:
import os
import re
import urllib.request

# Directory that downloaded images are written to by default.
targetPath = "D:\\python\\download\\images"

# Absolute https URL ending in a supported image extension.  The character
# class stops at whitespace, quotes, angle brackets and backslashes so a match
# cannot run past the end of an HTML attribute; the look-ahead rejects an
# "extension" that is really the start of a longer word (e.g. ".jpgfoo").
IMAGE_URL_RE = re.compile(
    r'https://[^\s"\'<>\\]+?\.(?:png|gif|jpg)(?![A-Za-z0-9])'
)


def saveFile(path, targetDir=None):
    """Return the local file path an image URL should be saved to.

    The destination directory is created first (including any missing
    parents) so the returned path is immediately writable.

    Args:
        path: Image URL (or any '/'-separated path); only the part after
            the last '/' is used as the file name.
        targetDir: Directory to save into. Defaults to the module-level
            ``targetPath``.

    Returns:
        The destination path: ``targetDir`` joined with the file name.
    """
    directory = targetPath if targetDir is None else targetDir
    # Ensure the target directory exists, creating it if needed.  makedirs
    # (unlike mkdir) also creates missing parent directories, and
    # exist_ok=True avoids a check-then-create race.
    os.makedirs(directory, exist_ok=True)
    # rsplit tolerates a path without any '/', where rindex would raise.
    fileName = path.rsplit('/', 1)[-1]
    target = os.path.join(directory, fileName)
    print(target)
    return target


def findImageUrls(html):
    """Return the distinct image URLs found in *html*, in first-seen order.

    Args:
        html: Decoded page text to scan.

    Returns:
        A list of unique ``https://...`` URLs ending in png, gif or jpg.
    """
    # dict.fromkeys de-duplicates while keeping a deterministic order,
    # unlike set(), whose iteration order varies between runs.
    return list(dict.fromkeys(IMAGE_URL_RE.findall(html)))


def main():
    """Fetch the Zhihu question page and download every image it links to."""
    url = "https://www.zhihu.com/question/36006897"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
    }
    req = urllib.request.Request(url=url, headers=headers)
    # The context manager closes the HTTP response even if read() fails.
    with urllib.request.urlopen(req) as res:
        charset = res.headers.get_content_charset() or 'utf-8'
        # Decode the bytes instead of str(bytes): the latter yields the
        # b'...' repr with escaped newlines, which corrupts URL matching.
        html = res.read().decode(charset, errors='replace')

    for myurl in findImageUrls(html):
        print(myurl)
        try:
            urllib.request.urlretrieve(myurl, saveFile(myurl))
        except Exception as exc:
            # Best effort: one failed image must not stop the remaining
            # downloads, but Ctrl-C (KeyboardInterrupt) still propagates.
            print('掛掉了.....', exc)


if __name__ == '__main__':
    main()