#編碼格式要求爲utf-8
#coding=UTF-8
#urllib解析網站內容
import urllib2
#soup是特別好使的html解析器
from BeautifulSoup import *
# Open the page, read the raw HTML and hand it to the parser.
c = urllib2.urlopen('http://xxxxxx.html')
try:
    soup = BeautifulSoup(c.read())
finally:
    # Close the response even when reading or parsing fails, so the
    # underlying connection is not left open.
    c.close()
#讀作者
user = soup.find('a',onclick=re.compile('shareRec'))['onclick']
regex=ur"發現:.*的#"
match = re.search(regex, user)
user = match.group()
user = user[3:]
templeng = len(user)
templeng = templeng-2
user = user[:templeng]
print "author: "+user
#讀日期
date = soup.find('span',attrs={'class':'date m_l_5'}).text
year = date[:4]
month = date[5:7]
day = date[8:10]
hour = date[11:13]
minute = date[14:16]
second = date[17:19]
print "date: "+date
print "year:"+year
print "month:"+month
print "day:"+day
print "hour:"+hour
print "minute:"+minute
print "second:"+second
#讀標題,地區
title = soup.find('div',attrs={'class':'Mztit'}).text
print "title:"+title
areaid = soup.find('a',href = re.compile('mddid')).attrs[0][1]
areaid = areaid[20:]
area = soup.findAll('a',href = re.compile('mddid='+areaid))[1].text
partid = soup.find('a',href = re.compile('travel-scenic-spot')).text
templen = len(partid)
templen = templen - 4
part = partid[:templen]
print "area:"+area
print "part:"+part
# Read the description; the part of interest is the text that comes before the
# first image.
description = soup.find('div',attrs={'id':'pnl_contentinfo'})
# Child nodes of the description container.
des = description.contents
length = len(des)
# Starts as a single space.
descrip = " "
# NOTE(review): this loop has no indented body in the file as it stands, so
# the file does not parse. The statements that presumably built up `descrip`
# appear to be missing; restore them from the original source.
for d in des:
print "description:"+descrip
# The main part: read every image together with its text.
# Select the <div> elements whose 'vaname' attribute equals the author name.
data = soup.findAll('div',attrs={'vaname':user})
# Both lists start with one empty-string element.
txt = [""]
p_w_picpath = [""]
# NOTE(review): this loop has no indented body in the file as it stands, so
# the file does not parse. The statements that presumably filled `txt` and
# `p_w_picpath` appear to be missing; restore them from the original source.
for d in data:
print len(txt)
print len(p_w_picpath)