自制爬蟲例--抓取網站圖像與簡介

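# The source file must be saved as UTF-8. NOTE: Python only honours the coding declaration below when it is on the first or second line of the file (PEP 263), so move it to the very top of the script.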

#coding=UTF-8

#urllib解析網站內容

import urllib2

#soup是特別好使的html解析器

from BeautifulSoup import *

# Open the page and parse its content.
c = urllib2.urlopen('http://xxxxxx.html')
try:
    soup = BeautifulSoup(c.read())
finally:
    # Release the HTTP connection even when reading or parsing fails.
    c.close()

#讀作者

user = soup.find('a',onclick=re.compile('shareRec'))['onclick']

regex=ur"發現：.*的#"

match = re.search(regex, user)

user = match.group()

user = user[3:]

templeng = len(user)

templeng = templeng-2

user = user[:templeng]

print "author: "+user

# Read the publication date and split it into its components.
date = soup.find('span', attrs={'class': 'date m_l_5'}).text

# Fixed-position slices; presumably the text is laid out like
# "YYYY-MM-DD HH:MM:SS" -- TODO confirm against the real page.
year, month, day = date[:4], date[5:7], date[8:10]
hour, minute, second = date[11:13], date[14:16], date[17:19]

print("date: " + date)
for label, value in (
    ("year:", year),
    ("month:", month),
    ("day:", day),
    ("hour:", hour),
    ("minute:", minute),
    ("second:", second),
):
    print(label + value)

# Read the title and the region.
title = soup.find('div', attrs={'class': 'Mztit'}).text
print("title:" + title)

# Look the link target up by attribute name instead of by position, so the
# result does not depend on `href` being the first attribute of the tag.
areaid = soup.find('a', href=re.compile('mddid'))['href']
# Drop the first 20 characters of the href; presumably that is everything up
# to and including "mddid=" -- TODO confirm against the real URLs.
areaid = areaid[20:]
# The second link pointing at this id carries the region name.
area = soup.findAll('a', href=re.compile('mddid=' + areaid))[1].text

partid = soup.find('a', href=re.compile('travel-scenic-spot')).text
# Cut the last four characters off the link text.
part = partid[:len(partid) - 4]

print("area:" + area)
print("part:" + part)

# Read the description: the text that comes before the first image.
# NOTE(review): the published listing lost its indentation, so the nesting
# below is a reconstruction -- confirm it against the author's original,
# in particular where `leng`/`break` sit relative to the `length < 4` branch.
description = soup.find('div', attrs={'id': 'pnl_contentinfo'})
des = description.contents
length = len(des)
descrip = " "


def _contains_img(node):
    """Return True when ``node.find("img")`` reports a hit.

    The result is compared against both ``None`` and ``-1`` because the
    nodes handled here answer "not found" in either of those two ways.
    """
    hit = node.find("img")
    return not (hit == -1 or hit is None)


for d in des:
    try:
        if _contains_img(d):
            if length < 4:
                # Few top-level nodes: walk the children of this node and
                # collect their text up to the first child holding an image.
                for i in d.contents:
                    try:
                        if _contains_img(i):
                            break
                        descrip = descrip + i.text
                    except Exception:
                        # Fallback for children without `.text` (presumably
                        # bare text nodes): use their stripped content.
                        stripped = i.strip()
                        if _contains_img(stripped):
                            break
                        descrip = descrip + stripped
            leng = len(d.contents)
            if leng > 15:
                descrip = descrip + d.text
            # Stop at the first node that contains an image.
            break
        else:
            descrip = descrip + d.text
    except Exception:
        # Best effort: skip nodes that lack `.contents`/`.text`.  Unlike a
        # bare `except:`, this lets KeyboardInterrupt/SystemExit through.
        pass

print("description:" + descrip)

# The main part: collect every image URL together with the text before it.
# NOTE(review): the published listing lost its indentation, so the nesting
# below is a reconstruction -- confirm it against the author's original.
data = soup.findAll('div', attrs={'vaname': user})

# Both lists start with one empty placeholder entry, so the count printed at
# the end is one more than the number of texts appended.
txt = [""]
# NOTE(review): `p_w_picpath` looks like a blog-platform mangling of `image`;
# the name is kept as published -- confirm before renaming.
p_w_picpath = [""]

# Compiled once instead of on every inner iteration.
jpeg_src = re.compile('http.*jpeg')

for d in data:
    have_jpg = d.find('img', attrs={'src': re.compile('jpeg')})
    start = False
    temp_txt = ""
    if have_jpg is not None:
        for x in d.contents:
            try:
                hit = x.find("img")
                if not (hit == -1 or hit is None) and not start:
                    # First child that reports an image: start collecting.
                    start = True
                if start:
                    for tt in x.contents:
                        try:
                            img = tt.find('img', src=jpeg_src)
                            if img is None:
                                temp_txt = temp_txt + tt.text
                            else:
                                # An image closes the current text run.
                                txt.append(temp_txt)
                                p_w_picpath.append(img['src'])
                                temp_txt = ""
                        except Exception:
                            # Fallback for nodes that do not support the
                            # tag-style lookup above (presumably bare text
                            # nodes): append their stripped content.
                            ttt = tt.strip()
                            temp_txt = temp_txt + ttt
            except Exception:
                # Best effort: skip children without `.contents`.  Unlike a
                # bare `except:`, this lets KeyboardInterrupt/SystemExit
                # through.
                pass

print(len(txt))

自制爬蟲例--抓取網站圖像與簡介

Python正則表達式操作指南

使用mechanize和Beautiful Soup輕鬆收集Web數據

python 實例一則

自制爬蟲例--抓取網站圖像與簡介

python時間轉爲時間戳

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結