網站鏈接:http://www.plantarium.ru/page/samples/taxon/41302.html
網站的圖片需要逐級進去,而且打開緩慢容易出錯,所以打算把圖片下下來便於查找,於是便有了這個小爬蟲。
# -*- coding: utf-8 -*-
import re,os,requests,urllib2,chardet,time,sys #requests,chardet模塊需要自己安裝
# Python 2 hack to make 'utf-8' the default codec for implicit str<->unicode
# conversions. site.py deletes sys.setdefaultencoding at startup, so the
# module must be reload()-ed to get it back.
# Save the std streams first: reload(sys) resets stdin/stdout/stderr, which
# breaks interactive shells/IDEs that replaced them — restore them afterwards.
stdi,stdo,stde=sys.stdin,sys.stdout,sys.stderr
reload(sys)
sys.stdin,sys.stdout,sys.stderr=stdi,stdo,stde
sys.setdefaultencoding('utf-8')
#只獲取網頁源代碼
def only_content(url):
    """Fetch *url* once (no retries) and return the response body as text."""
    ua = {'User-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48'}
    resp = requests.get(url, timeout=20, headers=ua)
    return resp.text
#獲取網頁源代碼(提取所需內容)
def get_content(url, reg):
    """Fetch *url* and return reg.findall() over the page source.

    Retries up to 11 times (attempt counter i <= 10) when the request
    raises or when the pattern matches nothing.

    Fixes over the original:
    - `want` is initialised to [], so the function no longer dies with
      UnboundLocalError when every single attempt raises.
    - `except Exception` instead of a bare `except:`, so Ctrl-C
      (KeyboardInterrupt) can still break out of the retry loop.

    Returns the list of matches, or [] if all attempts failed.
    """
    headers = {'User-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48'}
    want = []  # stays [] if every attempt fails
    i = 0
    p = True
    while p and i <= 10:
        try:
            request = requests.get(url, timeout=20, headers=headers)
            content = request.text
            want = reg.findall(content)
            if want == []:
                i += 1
                print('get none,I will try again')
                # time.sleep(1)
            else:
                print('get success!')
                p = False
        except Exception:
            i += 1
            print('get wrong,please wait 2 seconds!')
            time.sleep(2)
    return want
#獲取網頁源代碼(用於轉碼)-爲了解決防止個別網址不是使用的utf-8而亂碼
def for_change(url, reg):
    """Fetch *url* with urllib2, transcode to UTF-8, return reg.findall().

    Detects the page encoding with chardet and re-encodes to UTF-8 before
    matching, to handle the occasional page that is not served as UTF-8.
    Retries FOREVER (no attempt cap, unlike get_content) until a request
    succeeds.

    Fixes over the original:
    - `except Exception` instead of a bare `except:` — a bare except in an
      infinite retry loop also swallowed KeyboardInterrupt, making the
      script nearly impossible to stop with Ctrl-C.
    - The failure message said "wait 10 minutes" but the code sleeps
      time.sleep(10) — i.e. 10 seconds; the message now matches the code.
    """
    p = True
    headers = {'User-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48'}
    while p:
        try:
            request = urllib2.Request(url, headers=headers)
            req = urllib2.urlopen(request, timeout=20)
            res = req.read()
            enc = chardet.detect(res)['encoding']
            print(u'該網頁使用'+enc+u'編碼')
            content = res.decode(enc).encode('utf-8')
            want = reg.findall(content)
            print('get success!')
            p = False
        except Exception:
            print('get wrong,please wait 10 seconds!')
            time.sleep(10)
    return want
#創建文件夾
def create_folder(path):
    """Create directory *path* if it does not already exist.

    Uses os.makedirs so intermediate parent directories are created too
    (os.mkdir raised OSError when the parent was missing). Existing
    directories are left untouched.
    """
    if not os.path.exists(path):
        os.makedirs(path)
#保存圖片
def download_image(imageurl, imagename):
    """Download *imageurl* and write the bytes to file *imagename*.

    Retries up to 11 attempts (counter i <= 10) with a 2-second pause on
    failure; gives up silently after that (deliberate best-effort — the
    main crawl should not abort over one broken image).

    Fix over the original: `except Exception` instead of a bare `except:`
    so KeyboardInterrupt still stops the script.
    """
    i = 0
    p = True
    while p and i <= 10:
        try:
            data = requests.get(imageurl, timeout=20).content
            with open(imagename, 'wb') as f:
                f.write(data)
            p = False
        except Exception:
            i += 1
            print('save picture wrong,please wait 2 seconds')
            time.sleep(2)
#主程序
if __name__ == '__main__':
    # Crawl plantarium.ru taxonomy: class -> order -> family -> genus,
    # then download every image of every genus into a per-genus folder tree.
    path='D:\\Russian_pictures\\'  # root output folder (Windows path)
    create_folder(path)
    n=0  # running count of downloaded images
    order=[]  # order page URLs
    family=[]  # family page URLs
    genus=[]  # genus page URLs
    # Extract the monocot/dicot (top-level) taxon links.
    url="http://www.plantarium.ru"
    url1=url+'/page/samples/taxon/41302.html'
    # () is an unnamed group: findall returns only the bracketed link text.
    a1=re.compile(r'href="(/page/samples/taxon.+?.html)',re.I)
    u1=get_content(url1,a1)
    print u1
    # Extract orders.
    for u11 in u1:
        url2=url+u11
        a2=re.compile(r'href="(/page/samples/taxon.+?.html)',re.I)
        u2=get_content(url2,a2)
        u2.pop(0)  # drop the first link (the parent/listing URL)
        order.extend(u2)
    print 'It has '+str(len(order))+' orders'
    # Extract families.
    for u22 in order:
        url3=url+u22
        a3=re.compile(r'href="(/page/samples/taxon.+?.html)',re.I)
        u3=get_content(url3,a3)
        # First two links are ancestor/navigation entries, not families.
        u3.pop(0)
        u3.pop(0)
        family.extend(u3)
    print 'It has '+str(len(family))+' families'
    # Extract genera.
    for u33 in family:
        url4=url+u33
        a4=re.compile(r'href="(/page/samples/taxon.+?.html)',re.I)
        u4=get_content(url4,a4)
        # First three links are ancestor/navigation entries, not genera.
        u4.pop(0)
        u4.pop(0)
        u4.pop(0)
        genus.extend(u4)
    print 'It has '+str(len(genus))+' genera'
    # Download species images (taken directly from each genus page).
    for u44 in genus:
        url5=url+u44
        print url5
        a5=re.compile(r'href="(/page/view/item/.+?.html)',re.I)  # image listing links
        b5=re.compile(r'this,event.+?">(.+?)</a>',re.I)  # breadcrumb names for the folder path
        u5=get_content(url5,a5)
        n5=get_content(url5,b5)  # taxonomy path components for this genus
        pat=path
        # Build (and create) the nested output folder, one level per component.
        for pa in n5:
            pat=pat+pa+'\\'
            create_folder(pat)
        u5=set(u5)  # de-duplicate the listing links of this genus
        # Work out how many image pages this genus has.
        for u55 in u5:
            pp=True
            num=0  # give up after too many errors (num > 10)
            url6=url+u55
            # NOTE(review): the Russian text below was never matched by this
            # regex — cause unknown; kept commented out for reference.
            '''
            a6=re.compile(r'из (.+?) найденных изображений')
            page=int(get_content(url6,a6)[0])/30+1
            '''
            # split() is used instead to pull the total-image count out of
            # the "Показаны ... из N найденных изображений" banner.
            while pp and num<=10:
                try:
                    number=only_content(url6).split('найденных изображений')[0].split('Показаны')[1].split('из ')[1]
                    print number
                    # 30 images per page; relies on Python 2 integer division.
                    page=int(number)/30+1
                    pp=False
                    for i in range(0,page):
                        # Page i of the listing lives under view/part/<i>/item.
                        url7=url6.replace('view/item','view/part/'+str(i)+'/item')
                        a7=re.compile(r'href="(/page/image/id/.+?.html)',re.I)
                        u7=get_content(url7,a7)
                        # Fetch each individual image page.
                        for u77 in u7:
                            n+=1
                            url_every=url+u77
                            name_a=re.compile(r'<title>.+?([a-zA-Z]+ +[a-zA-Z]*).+?</title>',re.I)  # Latin binomial from <title>
                            image_a=re.compile(r'src="(.+?.jpg)" width=',re.I)  # actual .jpg URL
                            name=get_content(url_every,name_a)[0].strip()+'-'+str(n)+'.jpg'
                            print name
                            image_name=pat+name
                            image_url=url+get_content(url_every,image_a)[0]
                            download_image(image_url,image_name)
                            print str(n)+' now'
                except:
                    num+=1
                    print 'page is not get,please wait 2 seconds'
                    time.sleep(2)
    print 'all '+str(n)+' download over'