Scraping and downloading the images from a Russian plant website with Python

Site link: http://www.plantarium.ru/page/samples/taxon/41302.html

The site's images sit several levels deep, and the pages load slowly and fail often, so I decided to download the images locally to make them easier to browse. Hence this little crawler (written for Python 2).


# -*- coding: utf-8 -*-
import re,os,requests,urllib2,chardet,time,sys   # requests and chardet must be installed separately (this is a Python 2 script)
# Python 2 trick: reload(sys) restores sys.setdefaultencoding; save the standard
# streams first and put them back afterwards so console output keeps working
stdi,stdo,stde=sys.stdin,sys.stdout,sys.stderr
reload(sys)
sys.stdin,sys.stdout,sys.stderr=stdi,stdo,stde
sys.setdefaultencoding('utf-8')
# Fetch the raw page source only (no retry logic)
def only_content(url):
    headers = {'User-agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48'}
    request = requests.get(url,timeout=20,headers = headers)
    content = request.text
    return content

# Fetch the page source and extract the needed parts (with retries)
def get_content(url,reg):
    i=0
    p=True
    want=[]   # make sure the return value is defined even if every attempt fails
    headers = {'User-agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48'}
    while p and i<=10:
        try:
            request = requests.get(url,timeout=20,headers = headers)
            content = request.text
            want=reg.findall(content)
            if not want:
                i+=1
                print 'got nothing, trying again'
            #   time.sleep(1)
            else:
                print 'get success!'
                p=False
        except Exception:
            i+=1
            print 'request failed, waiting 2 seconds'
            time.sleep(2)
    return want

# Fetch the page source with charset detection; handles the odd page that is not UTF-8 and would otherwise come out garbled
def for_change(url,reg):
    p=True
    headers={'User-agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48'}
    while p:
        try:
            request=urllib2.Request(url,headers=headers)
            req=urllib2.urlopen(request,timeout=20)
            res=req.read()
            enc=chardet.detect(res)['encoding']
            print 'this page uses '+enc+' encoding'
            content=res.decode(enc).encode('utf-8')
            want=reg.findall(content)
            print 'get success!'
            p=False
        except Exception:
            print 'request failed, waiting 10 seconds'
            time.sleep(10)
    return want
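
# Example use of for_change (it is defined above but never called in the main
# program below); the regex here is purely illustrative:
#   links=for_change('http://www.plantarium.ru',re.compile(r'href="(.+?)"'))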

# Create the folder if it does not already exist
def create_folder(path):
    if not os.path.exists(path):
        os.mkdir(path)
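    # Note: os.mkdir creates a single directory level; the main program below
    # calls create_folder once per level as it builds the nested path.
    # os.makedirs(path) could create the whole tree in one call instead.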
        
# Save one image to disk (with retries)
def download_image(imageurl,imagename):
    i=0
    p=True
    while p and i<=10:
        try:
            data=requests.get(imageurl,timeout=20).content
            with open(imagename,'wb') as f:
                f.write(data)
            p=False
        except Exception:
            i+=1
            print 'saving the picture failed, waiting 2 seconds'
            time.sleep(2)
        
# Main program
if __name__ == '__main__':
    path='D:\\Russian_pictures\\'
    create_folder(path)
    n=0  # image counter
    order=[]   # order-level URLs
    family=[]  # family-level URLs
    genus=[]   # genus-level URLs
    
    # Extract the two top-level taxa (monocots and dicots)
    url="http://www.plantarium.ru"
    url1=url+'/page/samples/taxon/41302.html'
    a1=re.compile(r'href="(/page/samples/taxon.+?\.html)',re.I)  # the parentheses are a plain group: findall returns only their contents
    u1=get_content(url1,a1)
    print u1
    
    # Extract the orders
    for u11 in u1:
        url2=url+u11
        a2=re.compile(r'href="(/page/samples/taxon.+?\.html)',re.I)
        u2=get_content(url2,a2)
        u2.pop(0)  # drop the first link, which points back up the hierarchy
        order.extend(u2)
    print 'It has '+str(len(order))+' orders'

    # Extract the families
    for u22 in order:
        url3=url+u22
        a3=re.compile(r'href="(/page/samples/taxon.+?\.html)',re.I)
        u3=get_content(url3,a3)
        u3.pop(0)  # drop the two leading links back up the hierarchy
        u3.pop(0)
        family.extend(u3)
    print 'It has '+str(len(family))+' families'

    # Extract the genera
    for u33 in family:
        url4=url+u33
        a4=re.compile(r'href="(/page/samples/taxon.+?\.html)',re.I)
        u4=get_content(url4,a4)
        u4.pop(0)  # drop the three leading links back up the hierarchy
        u4.pop(0)
        u4.pop(0)
        genus.extend(u4)
    print 'It has '+str(len(genus))+' genera'

    # Download the species images, pulled straight from each genus page
    for u44 in genus:
        url5=url+u44
        print url5
        a5=re.compile(r'href="(/page/view/item/.+?\.html)',re.I)
        b5=re.compile(r'this,event.+?">(.+?)</a>',re.I)
        u5=get_content(url5,a5)
        n5=get_content(url5,b5)   # taxon names used to build the nested folder path
        pat=path
        for pa in n5:
            pat=pat+pa+'\\'
            create_folder(pat)
        u5=set(u5)   # deduplicated set of all species pages in this genus
        # Work out how many pages of images this taxon has
        for u55 in u5:
            pp=True
            num=0  # give up on this page once the error count passes the limit
            url6=url+u55
            # The Russian text here was never matched by the regex and I have no idea why;
            # if anyone knows the reason, please let me know!!
            '''
            a6=re.compile(r'из (.+?) найденных изображений')
            page=int(get_content(url6,a6)[0])/30+1
            '''
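            # A likely cause (my untested guess): this is Python 2, so the r'' pattern
            # above is a UTF-8 byte string while request.text is already unicode;
            # compiling the pattern as unicode, e.g.
            # re.compile(ur'из (.+?) найденных изображений'), might make it match.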
            # so use split() as a workaround instead
            while pp and num<=10:
                try:
                    number=only_content(url6).split('найденных изображений')[0].split('Показаны')[1].split('из ')[1]
                    print number
                    page=int(number)/30+1   # 30 thumbnails per results page (Python 2 integer division)
                    pp=False
                    for i in range(0,page):
                        url7=url6.replace('view/item','view/part/'+str(i)+'/item')
                        a7=re.compile(r'href="(/page/image/id/.+?\.html)',re.I)
                        u7=get_content(url7,a7)
                        # extract every image on this results page
                        for u77 in u7:
                            n+=1
                            url_every=url+u77
                            name_a=re.compile(r'<title>.+?([a-zA-Z]+ +[a-zA-Z]*).+?</title>',re.I)  # Latin name from the page title
                            image_a=re.compile(r'src="(.+?\.jpg)" width=',re.I)
                            name=get_content(url_every,name_a)[0].strip()+'-'+str(n)+'.jpg'
                            print name
                            image_name=pat+name
                            image_url=url+get_content(url_every,image_a)[0]
                            download_image(image_url,image_name)
                            print str(n)+' now'
                except Exception:
                    num+=1
                    print 'failed to get the page, waiting 2 seconds'
                    time.sleep(2)
            
    print 'all '+str(n)+' downloads finished'
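
The script above targets Python 2 (print statements, urllib2, reload(sys)). For anyone on Python 3, here is a minimal sketch of the same retry-and-download pattern; the names fetch_matches and save_image are my own, not from the original script.

# -*- coding: utf-8 -*-
# Python 3 sketch of the retry/download helpers above (illustrative names)
import time
import requests

HEADERS = {'User-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48'}

def fetch_matches(url, pattern, retries=10):
    # Fetch a page and return all regex matches, retrying on failure
    for _ in range(retries):
        try:
            text = requests.get(url, timeout=20, headers=HEADERS).text
            found = pattern.findall(text)
            if found:
                return found
        except requests.RequestException:
            time.sleep(2)
    return []

def save_image(image_url, image_path, retries=10):
    # Download one image to disk, retrying on failure
    for _ in range(retries):
        try:
            data = requests.get(image_url, timeout=20, headers=HEADERS).content
            with open(image_path, 'wb') as f:
                f.write(data)
            return True
        except requests.RequestException:
            time.sleep(2)
    return False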

