A Python Web Crawler

1: splider.py (the crawler main program)

#! /usr/bin/env python
# -*- coding: utf-8 -*-
#filename:splider.py
#author:wfu([email protected])

from spdUtility import PriorityQueue,Parser
import urllib2
import sys
import os


def updatePriQueue(priQueue, url):
    "Update the priority queue: bump the priority of a known url, or insert a new one."
    extraPrior = 2 if url.endswith('.html') else 0  # prefer urls that end in .html
    extraMyBlog = 5 if 'www.kgblog.net' in url else 0  # prefer pages from a chosen site (bid-for-crawl ranking??)
    item = priQueue.getitem(url)
    if item:
        # already queued: every extra reference raises its priority
        newitem = (item[0] + 1 + extraPrior + extraMyBlog, item[1])
        priQueue.remove(item)
        priQueue.push(newitem)
    else:
        priQueue.push((1 + extraPrior + extraMyBlog, url))

def getmainurl(url):
    "Return the site root of url, used to resolve relative links."
    # e.g. 'http://www.csdn.net/news/index.html' -> 'http://www.csdn.net'
    ix = url.find('/', len('http://'))
    if ix > 0:
        return url[:ix]
    else:
        return url

def analyseHtml(url, html, priQueue, downlist):
    "Parse the hyperlinks out of html and update the priority queue."
    p = Parser()
    try:
        p.feed(html)
        p.close()
    except Exception:
        return
    mainurl = getmainurl(url)
    for k, v in p.anchors.items():
        for u in v:
            if not u.startswith('http://'):  # resolve relative urls against the site root
                u = mainurl + u
            if u not in downlist:  # skip urls that have already been downloaded
                updatePriQueue(priQueue, u)

def downloadUrl(id, url, priQueue, downlist, downFolder):
    "Download url into a numbered file, then scan the page for new links."
    downFileName = downFolder + '/%d.html' % (id,)
    print 'downloading', url, 'as', downFileName,
    try:
        fp = urllib2.urlopen(url)
    except Exception:
        print '[ failed ]'
        return False
    else:
        print '[ success ]'
        downlist.add(url)  # record the url so it is never fetched twice
        html = fp.read()
        fp.close()
        # re-encode gb18030 pages as utf-8 before saving
        html = unicode(html, 'gb18030', 'ignore').encode('utf8')
        op = open(downFileName, 'wb')
        op.write(html)
        op.close()
        analyseHtml(url, html, priQueue, downlist)
        return True

def spider(beginurl, pages, downFolder):
    "Crawler main loop: pop the highest-priority url from the queue and download it."
    priQueue = PriorityQueue()
    downlist = set()  # set of downloaded urls, to prevent duplicate downloads
    priQueue.push((1, beginurl))
    i = 0
    while not priQueue.empty() and i < pages:
        k, url = priQueue.pop()
        if downloadUrl(i + 1, url, priQueue, downlist, downFolder):
            i += 1
    print '\nDownloaded', i, 'pages in total.'

def main():
    "Entry point: set the start url, the number of pages to fetch, and the save folder."
    beginurl = 'http://www.csdn.net'  # url the crawl starts from
    pages = 10  # number of pages to fetch
    downloadFolder = './down'  # folder the downloaded pages are saved into
    if not os.path.isdir(downloadFolder):
        os.mkdir(downloadFolder)
    spider(beginurl, pages, downloadFolder)

if __name__ == '__main__':
    main()
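
Note: the code above targets Python 2 (urllib2 and htmllib were removed in Python 3). For reference, here is a minimal sketch of the same fetch-and-extract step on Python 3, using urllib.request and html.parser; LinkParser and fetch_links are illustrative names, not part of the original program.

#! /usr/bin/env python3
# A sketch only: rough Python 3 equivalent of downloadUrl + Parser above.
from html.parser import HTMLParser
from urllib.request import urlopen

class LinkParser(HTMLParser):
    "Collect the href of every <a> tag, like Parser.anchors above."
    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.links.append(value)

def fetch_links(url):
    "Download one page and return the hyperlinks found in it."
    html = urlopen(url).read().decode('gb18030', 'ignore')
    parser = LinkParser()
    parser.feed(html)
    parser.close()
    return parser.links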


2: spdUtility.py (priority queue and HTML parser utilities)

#! /usr/bin/env python
# -*- coding: utf-8 -*-
#filename:spdUtility.py
#author:wfu([email protected])
import bisect
import htmllib
import formatter
class PriorityQueue(list):
    "Priority queue of (priority, url) pairs, kept sorted via bisect."
    def __init__(self):
        list.__init__(self)
        self.map = {}  # url -> queued item, for O(1) lookup by url

    def push(self, item):
        # insert in sorted order, rejecting duplicates; bisect.insort keeps
        # the list ascending (bisect.insort_left would work equally well)
        if self.count(item) == 0:
            bisect.insort(self, item)
            self.map[item[1]] = item

    def pop(self):
        r = list.pop(self)  # the last item is the highest-priority one
        del self.map[r[1]]
        return r

    def getitem(self, url):
        return self.map.get(url)

    def empty(self):
        return len(self) == 0

    def remove(self, item):
        list.remove(self, item)
        del self.map[item[1]]

    def count(self, item):
        # binary search in the sorted list; returns 1 if item is present, else 0
        if len(self) == 0:
            return 0
        left = 0
        right = len(self) - 1
        mid = -1
        while left <= right:
            mid = (left + right) / 2
            if self[mid] < item:
                left = mid + 1
            elif self[mid] > item:
                right = mid - 1
            else:
                break
        return 1 if self[mid] == item else 0
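
    # An equivalent membership test using the bisect module instead of the
    # hand-rolled loop above. A sketch: the name has_item is not in the
    # original code.
    def has_item(self, item):
        ix = bisect.bisect_left(self, item)
        return 1 if ix < len(self) and self[ix] == item else 0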


class Parser(htmllib.HTMLParser):
    "HTML parser that records the target urls of all anchors it sees."
    def __init__(self, verbose=0):
        self.anchors = {}  # anchor text -> list of urls
        f = formatter.NullFormatter()
        htmllib.HTMLParser.__init__(self, f, verbose)

    def anchor_bgn(self, href, name, type):
        self.save_bgn()
        self.anchor = href

    def anchor_end(self):
        text = self.save_end().strip()
        if self.anchor and text:
            self.anchors[text] = self.anchors.get(text, []) + [self.anchor]


def main():  # a small self-test
    pq = PriorityQueue()
    # push items out of order
    pq.push((1, 'http://www.baidu.com'))
    pq.push((2, 'http://www.sina.com'))
    pq.push((3, 'http://www.google.com'))
    pq.push((1, 'http://www.163.com'))

    item = pq.getitem('http://www.sina.com')
    print item
    print pq.count(item)
    pq.remove(item)
    print pq.count(item)
    # drain the queue, highest priority first
    while not pq.empty():
        print pq.pop()

if __name__ == '__main__':
    main()
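
Run directly, this self-test should print (under Python 2):

(2, 'http://www.sina.com')
1
0
(3, 'http://www.google.com')
(1, 'http://www.baidu.com')
(1, 'http://www.163.com')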


