Python: fetch all links from a page, save them to a database table, and open them one by one

The script below uses Python to collect every link on a web page, save the links into an sqlite3 table, and open them with a browser. If the table already exists, the links are read straight from the table and opened.

The table name is derived from the start URL: strip the leading "http://", the trailing "/", and the port number, then replace the remaining "." and "/" characters with "_".
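
For illustration, here is a minimal sketch of that naming rule; make_table_name is a hypothetical helper (not part of the script below) and it assumes a start URL of the http://host:port/path form that the script handles.

# A minimal sketch of the table-name rule above (make_table_name is a
# hypothetical helper, not part of the original script).
from urlparse import urlparse   # urllib.parse in Python 3
import re

def make_table_name(start_url):
    parts = urlparse(start_url)          # splits scheme, host:port and path
    name = parts.hostname + parts.path   # drops "http://" and the port number
    name = name.rstrip('/')              # drop the trailing "/"
    return re.sub(r'[./]', '_', name)    # "." and "/" both become "_"

# make_table_name("http://www.baidu.com:80/")  ->  "www_baidu_com"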

 

Python libraries used:

sgmllib, urllib -- fetching and parsing the web page

re -- regular expressions

sqlite3 -- the database table

subprocess -- child processes for launching the browser (rough Python 3 equivalents of these imports are sketched below)
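
Note that the listing below is Python 2 code; sgmllib was removed from the standard library in Python 3. If you want to adapt it for Python 3, the rough equivalents of these imports look like this (a sketch only, not a drop-in port):

# Rough Python 3 equivalents of the Python 2 imports used below (sketch only)
from html.parser import HTMLParser     # closest replacement for sgmllib.SGMLParser
from urllib.request import urlopen     # replaces urllib.urlopen
import re, sqlite3, subprocess         # unchanged in Python 3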

 

 

 

#!/usr/bin/env python
#-*-coding:utf-8 -*-

from sgmllib import SGMLParser
import urllib,re
import sys, os, string, time
import sqlite3
import subprocess, signal

class UrlList(SGMLParser):  # collects the href attribute of every <a> tag
    def reset(self):
        self.urls=[]
        SGMLParser.reset(self)
    def start_a(self,attrs):
        href=[v for k,v in attrs if k=='href']
        if href:
            self.urls.extend(href)

def get_urls(url):
    try:
        usock=urllib.urlopen(url)
    except:
        print "failed to open url "+url
        return []
    result=[]
    parser=UrlList()
    parser.feed(usock.read())
    usock.close()
    parser.close()
    urls=parser.urls
    for url in urls:
        if re.match(r'^http://', url) or re.match(r'^\.\./\.\./', url):  # keep absolute links and ../../ relative links
            result.append(url)
    return result

def find_string(url, sub, rdepth):
    if rdepth == 0:
        return url
    n=url.rfind(sub)
    return find_string(url[:n], sub, rdepth-1)


def update_urls(startURL, url_list):
    if len(url_list)==0:
        return []
    result=[]
    s=find_string(startURL, r'/', 3) # string before the third '/' counting from the right, used to replace ../..
    for u in url_list:
        if u.find(r'../../')==0:
            u=u.replace(r'../..', s)
        result.append(u)
    return result


def write_urls_into_table(urldb, table_name, urls):
    conn=sqlite3.connect(urldb)
    conn.isolation_level = None
    conn.execute('create table if not exists %s(id integer primary key, url varchar(255), comment varchar(128))' % table_name)
    for i, url in enumerate(urls):
        conn.execute("insert into %s values(?, ?, '')" % table_name, (i, url))  # parameterize the values so quotes in a url cannot break the insert
    conn.commit()
    conn.close()
def read_urls_from_table(urldb, table_name):
    conn=sqlite3.connect(urldb)
    conn.isolation_level = None
    conn.text_factory = str
    cur = conn.cursor()
    try:
        cur.execute("select url from %s" % table_name)
    except sqlite3.Error, e:
        print "An error occurred:", e.args[0]
    res = cur.fetchall()
    cur.close()
    conn.close()
    if len(res):
        print "total urls: %d" % len(res)
        return res
    else:
        print "table %s is empty" % table_name
        sys.exit(1)


def open_url(content):
    if len(content)==0:
        return
    for line in content:
        url=line[0]   # each row from fetchall() is a one-element tuple: (url,)
        print "open url "+url
        try:
            # run the browser as a new session leader so the whole process group can be killed below
            p=subprocess.Popen(["chrome", url], close_fds=True, preexec_fn=os.setsid)
            time.sleep(8)
            os.killpg(p.pid, signal.SIGUSR1)
            time.sleep(3)
            if p.poll():
                print '\n'
                continue
            else:
                print "failed to kill all browser child processes"
                sys.exit(1)
        except KeyboardInterrupt:
            print "Ctrl+C pressed, quitting"
            sys.exit(0)
    else:
        print "finished opening urls"



def start_run(startUrl, urldb):
    if startUrl is None:
        print "start url is null"
        sys.exit(1)
    if urldb is None:
        print "db is null"
        sys.exit(1)

    table_name=''
    if startUrl.find(r'http://')==0: # starts with http://, strip the prefix
        url=startUrl[7:]
        start=url.find(':')
        if start!=-1:   # strip the port number
            end=url.find(r'/')
            url=url[:start]+url[end:]
        if startUrl.rfind(r'/')==(len(startUrl)-1): # ends with /, strip the trailing slash
            url=url[:-1]
        if startUrl.find(r'/', 7)!=-1: # contains /, e.g. http://bj.58.com/wenziluru, so turn / into _
            url=url.replace(r'/', '_')
        table_name=url.replace('.', '_') # turn . into _

    print "table name: %s" % table_name
    # sys.exit(0)

    conn=sqlite3.connect(urldb)
    conn.isolation_level = None
    try:
        conn.execute("select * from %s" % table_name) # check whether the table already exists; if so, read it directly
    except sqlite3.OperationalError:
        print "%s does not exist, creating ..." % table_name # table is missing, so create and populate it
        urls=get_urls(startUrl)
        newurls=update_urls(startUrl, urls)   # replace ../../ with absolute paths
        write_urls_into_table(urldb, table_name, newurls) # write the links into the database

    conn.close()
    content=read_urls_from_table(urldb, table_name) # read the table contents
    open_url(content) # open the links


if __name__=="__main__":
    startUrl="http://www.baidu.com:80/"
    urldb='urls.db'
    start_run(startUrl, urldb)
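
open_url relies on preexec_fn=os.setsid: the browser becomes a process-group leader, so os.killpg can later terminate the browser together with any child processes it spawned. Below is a minimal sketch of the same pattern on Python 3, assuming a chrome binary on PATH; start_new_session=True is the modern spelling of preexec_fn=os.setsid, and SIGTERM is used here instead of the script's SIGUSR1.

import os, signal, subprocess, time

# launch the browser as the leader of a new process group
p = subprocess.Popen(["chrome", "http://www.baidu.com"], start_new_session=True)
time.sleep(8)                                  # give the page some time to load
os.killpg(os.getpgid(p.pid), signal.SIGTERM)   # kill the whole group, not just the leader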
