python獲取網頁上所有鏈接,並保存到sqlite3數據表中,並用瀏覽器打開。如果該表已存在,則直接從表中讀取鏈接並打開。
表名中去掉開頭"http://", 結尾"/", 端口號,替換中間字符".", "/"爲"_"
用到的python庫:
sgmllib, urllib:抓取並解析網頁(均爲Python 2用法;sgmllib在Python 3中已被移除)
re正則表達式
sqlite3數據表
subprocess子進程
#!/usr/bin/env python
#-*-coding:utf-8 -*-
from sgmllib import SGMLParser
import urllib,re
import sys, os, string, time
import sqlite3
import subprocess, signal
class UrlList(SGMLParser):
    """SGML parser that accumulates the href of every <a> tag in self.urls."""

    def reset(self):
        # Clear our accumulator, then let the base parser reset its own state.
        self.urls = []
        SGMLParser.reset(self)

    def start_a(self, attrs):
        # attrs is a list of (name, value) pairs; keep every href value seen.
        hrefs = [value for name, value in attrs if name == 'href']
        if hrefs:
            self.urls.extend(hrefs)
def get_urls(url):
    """Fetch *url* and return the list of links found in its <a> tags.

    Only absolute links (starting with 'http://') and relative links
    starting with '../../' are kept.  Returns [] when the page cannot
    be fetched (best-effort, matching the original behavior).
    """
    try:
        usock = urllib.urlopen(url)
    except IOError:  # narrowed from bare except: network/DNS/HTTP failures
        print("get url except " + url)
        return []
    parser = UrlList()
    try:
        parser.feed(usock.read())
    finally:
        usock.close()  # close the socket even if feed() raises
    parser.close()
    # Original used re.findall(r'^../../'), whose unescaped dots match ANY
    # two characters (e.g. 'ab/cd/...'), admitting links that update_urls
    # later fails to rewrite.  A literal prefix test is what was intended.
    return [u for u in parser.urls
            if u.startswith('http://') or u.startswith('../../')]
def find_string(url, sub, rdepth):
    """Cut *url* back to just before the rdepth-th occurrence of *sub*,
    counting from the right.

    rdepth == 0 returns url unchanged.  As in the original, a miss
    (rfind == -1) simply drops the last character.
    """
    result = url
    for _ in range(rdepth):
        result = result[:result.rfind(sub)]
    return result
def update_urls(startURL, url_list):
    """Resolve links beginning with '../../' against *startURL*.

    The replacement prefix is *startURL* cut back to the third '/' from
    the right (e.g. http://host/a/b/c -> http://host); absolute links
    pass through unchanged.
    """
    if not url_list:
        return []
    # Prefix that stands in for the '../..' part of relative links.
    prefix = find_string(startURL, '/', 3)
    resolved = []
    for link in url_list:
        if link.startswith('../../'):
            link = link.replace('../..', prefix)
        resolved.append(link)
    return resolved
def write_urls_into_table(urldb, table_name, urls):
    """Create *table_name* in *urldb* (if missing) and insert *urls*.

    Each row is (id, url, comment='') with id = position in *urls*.
    NOTE: table_name is interpolated into the SQL (identifiers cannot be
    parameterized); it must come from a trusted source — here it is
    derived from the start url in start_run.
    """
    conn = sqlite3.connect(urldb)
    try:
        conn.isolation_level = None  # autocommit mode, as in the original
        conn.execute('create table if not exists %s'
                     '(id integer primary key, url varchar(255), '
                     'comment varchar(128))' % table_name)
        # Parameterized values: a url containing a quote no longer breaks
        # (or injects into) the INSERT statement.
        conn.executemany('insert into %s values(?, ?, ?)' % table_name,
                         [(i, u, '') for i, u in enumerate(urls)])
        conn.commit()
    finally:
        conn.close()  # original wrote 'conn.close' (no call) and leaked the handle
def read_urls_from_table(urldb, table_name):
    """Return every url in *table_name* of *urldb* as a list of 1-tuples.

    Exits the process when the table cannot be read or is empty
    (preserving the original script's behavior).
    """
    conn = sqlite3.connect(urldb)
    conn.isolation_level = None
    conn.text_factory = str  # plain str (not unicode) for url values on py2
    cur = conn.cursor()
    try:
        cur.execute("select url from %s" % table_name)
        res = cur.fetchall()
    except sqlite3.Error as e:  # 'except E, e' is py2-only syntax
        # Original fell through and called fetchall() on the failed cursor;
        # bail out explicitly instead.
        print("An error occurred: %s" % e.args[0])
        sys.exit(1)
    finally:
        cur.close()   # original 'cur.close' never invoked the method
        conn.close()  # likewise 'conn.close'
    if res:
        print("total urls: %d" % len(res))
        return res
    print("read table %s null" % table_name)
    sys.exit(1)
def open_url(content):
    # Open each url from the db rows in chrome for ~8 seconds, then kill
    # the browser's process group and move on to the next url.
    if len(content)==0:
        return
    for line in content:
        strl=str(line)
        # Each row is a 1-tuple like ('http://...',); str() renders it as
        # "('http://...',)", so [2:-3] strips "('" and "',)" to recover
        # the bare url text.
        url=strl[2:-3]
        print "open url "+url
        try:
            # New session/process group (os.setsid) so os.killpg below can
            # reach chrome and any children it spawns.
            p=subprocess.Popen(["chrome", url], close_fds=True, preexec_fn=os.setsid)
            time.sleep(8)
            # NOTE(review): SIGUSR1 is an unusual choice for termination —
            # chrome may ignore it; confirm SIGTERM was not intended.
            os.killpg(p.pid, signal.SIGUSR1)
            time.sleep(3)
            if p.poll():
                # Non-zero exit status: process is gone; go to next url.
                # NOTE(review): '/n' below looks like a typo for '\n'.
                print '/n'
                continue
            else:
                # poll() was None (still running) or 0 — treated here as
                # "kill did not take effect"; abort the whole run.
                print "Not kill all child process"
                sys.exit(1)
        except KeyboardInterrupt:
            print "Pressed ctrl+c quit"
            sys.exit(0)
    else:
        # for-else: runs after the loop finishes without break.
        # NOTE(review): indentation was lost in this source; this could also
        # have been a try/else — for-else matches the message's meaning.
        print "open urls over"
def start_run(startUrl, urldb):
    """Build (or reuse) the url table for *startUrl* in *urldb*, then open
    every stored url in the browser.

    Table name derivation: strip leading 'http://', any ':port', and a
    trailing '/'; replace '.' and '/' with '_'.
    """
    if startUrl is None:
        print("start url is null")
        sys.exit(1)
    if urldb is None:
        print("db is null")
        sys.exit(1)
    # --- derive the table name from the url ---
    url = startUrl
    if url.startswith('http://'):
        url = url[7:]  # original left 'url' unbound for non-http input
    colon = url.find(':')
    if colon != -1:
        # Drop ':NN' up to the next '/' — or to the end when there is no
        # path (original indexed url[end:] with end == -1 in that case,
        # wrongly keeping the last character).
        slash = url.find('/', colon)
        url = url[:colon] + (url[slash:] if slash != -1 else '')
    if url.endswith('/'):
        url = url[:-1]
    # '/' and '.' both become '_' (replace is a no-op when absent, so the
    # original's "contains '/'" pre-check is unnecessary).
    table_name = url.replace('/', '_').replace('.', '_')
    print("table name: %s" % table_name)
    # --- ensure the table exists, creating and filling it if needed ---
    conn = sqlite3.connect(urldb)
    conn.isolation_level = None
    try:
        conn.execute("select * from %s" % table_name)  # probe for existence
    except sqlite3.Error:  # narrowed from bare 'except :'
        print("%s not exists, create ..." % table_name)
        urls = get_urls(startUrl)
        newurls = update_urls(startUrl, urls)  # resolve ../../ to absolute
        write_urls_into_table(urldb, table_name, newurls)
    finally:
        conn.close()  # original 'conn.close' never closed the handle
    content = read_urls_from_table(urldb, table_name)
    open_url(content)
if __name__=="__main__":
startUrl="http://www.baidu.com:80/"
urldb='urls.db'
start_run(startUrl, urldb)