# -*- coding: utf-8 -*-
"""Save the pythontab.com "Python advanced tutorial" articles as text files.

Written 2017-3-14 as a crawling exercise with urllib2 and BeautifulSoup 4.
Topics practised:

1. ``for ... in`` loops, the list type and slicing;
2. using BeautifulSoup: locating content with CSS selectors (``select``);
3. file handling: naming files and choosing the write mode.

The script reads the section's index pages, collects the title and link of
every listed article, downloads each article and writes the text of its
``div.content`` element(s) to ``study/<title>.txt`` encoded as UTF-8.  Each
title is printed once its file has been written.
"""
import contextlib
import os
import re

from bs4 import BeautifulSoup

try:  # Python 2, which the script was originally written for.
    from urllib2 import URLError, urlopen
except ImportError:  # Python 3 provides the same names in urllib.
    from urllib.error import URLError
    from urllib.request import urlopen

# First index page of the advanced-tutorial section.
INDEX_URL = 'http://www.pythontab.com/html/pythonhexinbiancheng/index.html'

# NOTE(review): the original loop ignored its counter and appended INDEX_URL
# 18 more times, so one page was fetched 19 times and every article was saved
# repeatedly.  The "<n>.html" scheme for pages 2 and up is an assumption about
# the site's pagination -- confirm against the live site.
PAGE_URL = 'http://www.pythontab.com/html/pythonhexinbiancheng/%s.html'

# The original loop was range(2, 20), i.e. pages 2 through 19.
LAST_PAGE = 19

# Directory that receives one .txt file per article.
OUTPUT_DIR = 'study'

# Characters that are path separators or are rejected in file names on
# common file systems; article titles are free text and may contain them.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')


def fetch(address):
    """Return the raw response body for *address*.

    The response is always closed, even if reading fails.

    Raises:
        URLError: if the request fails (HTTPError is a subclass).
    """
    with contextlib.closing(urlopen(address)) as response:
        return response.read()


def index_urls():
    """Return the list of index-page URLs to scan, first page first."""
    urls = [INDEX_URL]
    urls.extend(PAGE_URL % page for page in range(2, LAST_PAGE + 1))
    return urls


def collect_articles(page_urls):
    """Return ``{'title': ..., 'link': ...}`` dicts for the listed articles.

    Every anchor matching ``#catlist > li > a`` on each index page yields one
    entry: the anchor text is the title and its ``href`` is the link (for
    example ``http://www.pythontab.com/html/2017/pythonhexinbiancheng_0228/1120.html``).
    Index pages that cannot be fetched are reported and skipped; anchors
    without an ``href`` and links already seen are ignored.
    """
    articles = []
    seen_links = set()
    for page_url in page_urls:
        try:
            html = fetch(page_url)
        except URLError as error:
            # A missing page should not abort the whole crawl.
            print('skipping index page %s: %s' % (page_url, error))
            continue
        soup = BeautifulSoup(html, 'html.parser')
        # One selector is enough: title and link come from the same anchor.
        for anchor in soup.select('#catlist > li > a'):
            link = anchor.get('href')
            if not link or link in seen_links:
                continue
            seen_links.add(link)
            articles.append({'title': anchor.get_text(), 'link': link})
    return articles


def safe_filename(title):
    """Return *title* with characters unsafe in file names replaced by ``_``."""
    return _UNSAFE_FILENAME_CHARS.sub('_', title).strip()


def save_article(article):
    """Download one article and write its text to ``OUTPUT_DIR``.

    The text of every ``div.content`` element is encoded as UTF-8 and written
    in binary mode, so the bytes on disk do not depend on the platform's
    default encoding.

    Raises:
        URLError: if the article page cannot be fetched.
    """
    soup = BeautifulSoup(fetch(article['link']), 'html.parser')
    chunks = [
        node.get_text().encode('utf-8')
        for node in soup.select('div.content')
    ]
    file_name = '%s.txt' % safe_filename(article['title'])
    with open(os.path.join(OUTPUT_DIR, file_name), 'wb') as handle:
        handle.write(b''.join(chunks))


def main():
    """Crawl the section and save every article, printing each saved title."""
    # open() does not create missing directories.
    if not os.path.isdir(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)
    for article in collect_articles(index_urls()):
        try:
            save_article(article)
        except URLError as error:
            print('skipping article %s: %s' % (article['link'], error))
            continue
        print(article['title'])


if __name__ == '__main__':
    main()
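# Page tag: Python crawler example, 2017-3-14
# Web-page residue below (comment widget), not part of the script:
#   Post a comment
#   All comments
#   No one has commented yet. Want to be the first? Type in the comment box above and click Publish.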