# python網頁爬蟲之列車時刻表的抓取-完整的python腳本

#! /usr/bin/env python

#coding=utf8
# by [email protected] 2010/5/30
from HTMLParser import HTMLParser
from pyquery import PyQuery as pq
import sqlite3,urllib2,logging,sys
from datetime import datetime
from decimal import Decimal

# Logging initialisation: basicConfig attaches a file handler (a fresh,
# timestamp-named log file in the current directory) to the root logger, and
# the 'transchedule' logger additionally writes to stdout.
logFileName = './%s.log' % (datetime.now().strftime("%Y%m%d%H%M%S"))
logging.basicConfig(filename=logFileName,
                    filemode='w',
                    level=logging.INFO,
                    format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
                    datefmt='%m-%d %H:%M')
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr = logging.StreamHandler(sys.stdout)
hdlr.setFormatter(formatter)
logger = logging.getLogger('transchedule')
logger.setLevel(logging.DEBUG)
logger.addHandler(hdlr)

# Sequence number of the train (within the page currently being parsed) that
# the schedule rows belong to; the functions that update it declare it global.
witchTrain = 0

# Database initialisation
conn = sqlite3.connect('trainsInfo.sqlite')
c = conn.cursor()

def getTrainInfo(i,e):
    """Build one record per train from a train-information table element.

    Every cell that is read holds space-separated values, one per train.
    The records are filled column by column, in this order:
    train number, running time in minutes, origin station, terminal
    station, departure time, arrival time, train type, total distance.

    i -- element index supplied by PyQuery.map(); unused.
    e -- the table element to read.

    Returns a list of lists, one inner list per train number found.
    Raises ValueError/IndexError when a cell does not have the expected
    format or holds more values than there are train numbers.
    """
    table = pq(e)
    flatCells = table('tr td')
    rows = table('tr')

    def words(selection):
        # Cell text split into its per-train values.
        return selection.text().split(' ')

    def gridCell(rowIndex, columnIndex):
        return rows.eq(rowIndex)('td').eq(columnIndex)

    def asMinutes(text):
        # "<hours>時<minutes><one trailing character>" -> total minutes
        pieces = text.split(u'時')
        return int(pieces[0]) * 60 + int(pieces[1][0:-1])

    def asClock(text):
        return datetime.strptime(text, '%H:%M')

    def asDistance(text):
        # drops the last two characters (presumably a unit suffix) first
        return int(text[0:-2])

    def asIs(text):
        return text

    # Train numbers create the records; every later column appends to them.
    records = [[number] for number in words(flatCells.eq(2))]

    def appendColumn(values, convert):
        for position, raw in enumerate(values):
            value = convert(raw)
            records[position].append(value)

    appendColumn(words(flatCells.eq(4)), asMinutes)   # running time
    appendColumn(words(gridCell(1, 1)), asIs)         # origin station
    appendColumn(words(gridCell(1, 3)), asIs)         # terminal station
    appendColumn(words(gridCell(2, 1)), asClock)      # departure time
    appendColumn(words(gridCell(2, 3)), asClock)      # arrival time
    appendColumn(words(gridCell(3, 1)), asIs)         # train type
    appendColumn(words(gridCell(3, 3)), asDistance)   # total distance
    return records
   
def insertTrainInfo(trainData,cursor):
    """Insert one train record into the trains_Info table.

    trainData -- sequence of the 8 column values; trainData[0] is used as
                 the train number in log messages.
    cursor    -- DB-API cursor used to execute the insert.

    A failing insert is logged and swallowed so that one bad record does
    not stop the caller; nothing is returned.
    """
    try:
        cursor.execute('insert into trains_Info values(?,?,?,?,?,?,?,?)',trainData)
    except Exception as e:
        # "as" form: accepted by Python 2.6+ and Python 3, unlike "except X, e".
        logger.error("%s %s"%(e,trainData[0]))
    else:
        logger.info("train number %s processed"%trainData[0])

def _parseClockOrMidnight(text):
    """Parse an 'HH:MM' string; return 00:00 when it cannot be parsed."""
    try:
        return datetime.strptime(text,'%H:%M')
    except (TypeError, ValueError):
        # Best effort: a missing or malformed time falls back to 00:00.
        return datetime.strptime("00:00",'%H:%M')


def _priceFromCell(cell, default, skipDash=False):
    """Return the cell text without its last character, or default.

    default is returned when the cell is missing, when its text is at most
    one character long, or (with skipDash) when the remaining text is "-".
    """
    if not cell:
        return default
    text = cell.text()
    if len(text) <= 1:
        return default
    price = text[:-1]  # last character dropped (presumably a currency unit)
    if skipDash and price == "-":
        return default
    return price


def getScheduleInfo(i,e):
    """Turn one row of a timetable table into a schedule record.

    i -- element index supplied by PyQuery.map(); unused.
    e -- the <tr> element to read.

    Header rows (first cell 'No.' or empty) advance the global witchTrain
    counter and yield nothing; two-cell rows are logged as parse errors and
    yield nothing. For every other row a list holding one record is returned:
    [witchTrain, stop number, cell 1 text, cell 4 text, stop time,
     start time, mileage, hard seat price, hard berth price,
     soft seat price, soft berth price].
    The record is wrapped in an outer list -- presumably so that
    PyQuery.map(), which extends its result with list return values, keeps
    each record as a single item.

    Raises ValueError when the stop number or the mileage is not numeric.
    """
    global witchTrain  # sequence number of the train the rows belong to
    td = pq(e)('td')
    if td.eq(0).text() in ('No.',""):
        witchTrain += 1
        return None
    # Rows that could not be parsed into the expected columns
    if len(td) == 2:
        logger.error("%s:%s"%(td.text().encode('gb18030'),len(td)))
        return None
    # Stop (arrival) time and start (departure) time
    stopTime = _parseClockOrMidnight(td.eq(5).text())
    startTime = _parseClockOrMidnight(td.eq(6).text())
    # Mileage: the last two characters (presumably the unit) are dropped
    mileage = int(td.eq(7).text()[:-2])
    # Hard seat
    hardSeatPrice = _priceFromCell(td.eq(8), 0.0)
    # Hard sleeper, middle berth ("-" means no price)
    hardBerthPrice = _priceFromCell(td.eq(9), 0.0, skipDash=True)
    # Soft seat
    softSeatPrice = _priceFromCell(td.eq(10), 0.0)
    # Soft sleeper, lower berth. The original stored the parsed value in an
    # unused variable, so this column was always '0'.
    softBerthPrice = _priceFromCell(td.eq(11), '0')
    return [[witchTrain,
            int(td.eq(0).text()),td.eq(1).text(),
            td.eq(4).text(),stopTime,
            startTime,mileage,
            hardSeatPrice,hardBerthPrice,
            softSeatPrice,softBerthPrice,]]

def insertTrainSchedule(trainInfos,scheduleData,cursor):
    """Insert one timetable record into the trains_schedule table.

    trainInfos   -- list of train records; element [0] of each is the train
                    number.
    scheduleData -- list of the 11 column values. On entry scheduleData[0]
                    is the 1-based position of the owning train in
                    trainInfos; it is replaced IN PLACE by that train's
                    number before the insert.
    cursor       -- DB-API cursor used to execute the insert.

    A position outside 1..len(trainInfos) or a failing insert is logged and
    the record is skipped; nothing is returned.
    """
    position = scheduleData[0]
    # Guard explicitly: position 0 would otherwise wrap around to the last
    # train (index -1) and a too-large position would abort the whole crawl.
    if not 1 <= position <= len(trainInfos):
        logger.error("no train at position %s, schedule record skipped"%position)
        return
    scheduleData[0]=trainInfos[position-1][0]
    try:
        cursor.execute('insert into trains_schedule values(?,?,?,?,?,?,?,?,?,?,?)',scheduleData)
    except Exception as e:
        # "as" form: accepted by Python 2.6+ and Python 3, unlike "except X, e".
        logger.error("%s %s"%(e,scheduleData[0]))

def getNextPageLink(e):
    """Return the href of element e when its text is the "next page" label.

    Any other element yields None.
    """
    link = pq(e)
    isNextPage = (link.text() == u"下一頁")
    return link.attr('href') if isNextPage else None

# Process the trains listed on a page, then follow the "next page" links
def processTrainsInPage(url):
    """Crawl the train list at url and every following list page.

    For each train linked from a list page, the train page is downloaded,
    its train-information table and its timetable table are parsed, and the
    resulting records are inserted through the module-level cursor c and
    committed on conn after every row.

    Paging is a loop rather than one recursive call per page, so a long
    list cannot exhaust the recursion limit. The crawl stops when a page
    has no "next page" link, links to itself, or links to a page that was
    already processed.

    url -- address of the first list page.
    """
    global witchTrain
    # NOTE(review): the original wrote this selector with a backslash line
    # continuation inside the string, so these last two fragments join
    # WITHOUT a space. The text is reproduced exactly; confirm against the
    # page markup whether a descendant selector (with a space) was intended.
    tableSelector = ('body center div.ResultContent div.ResultContentLeft '
                     'div.ResultContentLeftContent'
                     'div.ResultTrainCodeContent table')
    visited = set()
    while True:
        visited.add(url)
        # Parse the list page
        d = pq(url=url)
        # Collect the links to the individual train pages
        lis = d('body center div.ListContent div.ListContentLeft ul li')
        lis.make_links_absolute()
        trains = lis.map(lambda i,e:pq(e)('a').attr('href'))
        # Process the timetable of every train
        for oneTrain in trains:
            response = urllib2.urlopen(oneTrain)
            try:
                pageContent = response.read().replace("gb2312","gb18030")
            finally:
                response.close()
            # Parse the page once; both tables come from the same selection
            tables = pq(pageContent)(tableSelector)
            # Train information
            trainInfos = tables.eq(1).map(getTrainInfo)
            for oneTrainInfo in trainInfos:
                insertTrainInfo(oneTrainInfo,c)
                conn.commit()
            # Stations along the route; the counter advanced by
            # getScheduleInfo must start from zero for every train page
            witchTrain = 0
            trainSchedules = tables.eq(2)('tr').map(getScheduleInfo)
            for oneTrainSchedule in trainSchedules:
                insertTrainSchedule(trainInfos,oneTrainSchedule,c)
                conn.commit()
        # Find the next page
        nextPageTable = d('body center div.ListContent div.ListContentLeft div.ListContentLeftContent').eq(2)('a')
        nextPageTable.make_links_absolute()
        nextLinks = nextPageTable.map(lambda i,e:getNextPageLink(e))
        if not nextLinks:
            logger.info("no next page link found on %s"%url)
            return
        nextUrl = nextLinks[0]
        if nextUrl == url:
            logger.info(nextUrl.encode('gb18030'))
            return
        if nextUrl in visited:
            logger.info("next page %s was already processed, stopping"%nextUrl)
            return
        logger.info("start process %s"%nextUrl)
        url = nextUrl

# Script entry: crawl from the first list page. Any error that escapes the
# crawl is logged instead of terminating with a traceback, and the database
# connection is closed in every case.
logger.info("start get data...")
try:
    processTrainsInPage(r"http://www.tielu.org/TrainList/TrainList-1.html")
except Exception as e:
    # "as" form: accepted by Python 2.6+ and Python 3, unlike "except X, e".
    logger.error(e)
finally:
    conn.close()
logger.info("finish...")

# 發表評論
# 所有評論
# 還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
# 相關文章