實例講解asp抓取網上信息

<%@LANGUAGE="VBSCRIPT" CODEPAGE="936"%>
<!-- #include file="conn.asp" -->
<!-- #include file="inc/function.asp" -->
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" 
"http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<title>Untitled Document</title>
<meta http-equiv="Content-Type" content="text/html; charset=gb2312">
<meta http-equiv="refresh" content="300;URL=steal_house.asp">
</head>
<body>
<%
on error resume next
' 
Server.ScriptTimeout = 999999
'========================================================
'
字符編碼函數
'
====================================================
Function BytesToBstr(body,code) 
dim objstream 
set objstream = Server.CreateObject("adodb.stream"
objstream.Type 
= 1 
objstream.Mode 
=3 
objstream.Open 
objstream.Write body 
objstream.Position 
= 0 
objstream.Type 
= 2 
objstream.Charset 
=code
BytesToBstr 
= objstream.ReadText 
objstream.Close 
set objstream = nothing 
End Function 
'取行字符串在另一字符串中的出現位置
Function Newstring(wstr,strng) 
Newstring
=Instr(lcase(wstr),lcase(strng)) 
if Newstring<=0 then Newstring=Len(wstr) 
End Function 
'替換字符串函數
function ReplaceStr(ori,str1,str2)
ReplaceStr
=replace(ori,str1,str2)
end function
'====================================================
function ReadXml(url,code,start,ends)
set oSend=createobject("Microsoft.XMLHTTP")
SourceCode 
= oSend.open ("GET",url,false
oSend.send()
ReadXml
=BytesToBstr(oSend.responseBody,code )
start
=Instr(ReadXml,start)
ReadXml
=mid(ReadXml,start)
ends
=Instr(ReadXml,ends)
ReadXml
=left(ReadXml,ends-1)
end function
function SubStr(body,start,ends)
start
=Instr(body,start)
SubStr
=mid(body,start+len(start)+1)
ends
=Instr(SubStr,ends)
SubStr
=left(SubStr,ends-1)
end function
dim getcont,NewsContent
dim url,title
url
="http://www.***.com"'新聞網址
getcont=ReadXml(url,"gb2312","<table class=k2 border=""0""","</table>")
getcont
=RegexHtml(getcont)
dim KeyId,NewsClass,City,Position,HouseType,Level,Area,Price,Demostra
dim ContactMan,Contact
for i=2 to ubound(getcont)
response.Write(getcont(i)
&"__<br>")

tempLink
=mid(getcont(i),instr(getcont(i),"href=""")+6,instr(getcont(i),""" 
onClick")-10)
tempLink=replace(tempLink,"../","")

response.Write(i
&":"&tempLink&"<br>")
NewsContent
=ReadXml(tempLink,"gb2312","<td valign=""bottom"" 
width=""400"">","<hr width=""760"" 
noshade size
=""1"" color=""#808080""> 
")
NewsContent=RemoveHtml(NewsContent)
NewsContent
=replace(NewsContent,VbCrLf,"")
NewsContent
=replace(NewsContent,vbNewLine,"")
NewsContent
=replace(NewsContent," ","")
NewsContent
=replace(NewsContent," ","")
NewsContent
=replace(NewsContent,"&nbsp;",""
NewsContent
=replace(NewsContent," ",""
NewsContent
=replace(NewsContent,chr(10),"")
NewsContent
=replace(NewsContent,chr(13),"")
'===============get Content=======================
response.Write(NewsContent)
KeyId
=SubStr(NewsContent,"列號:","信息類別:")
NewsClass
=SubStr(NewsContent,"類別:","所在城市:")
City
=SubStr(NewsContent,"城市:","房屋具體位置:")
Position
=SubStr(NewsContent,"位置:","房屋類型:")
HouseType
=SubStr(NewsContent,"類型:","樓層:")
Level
=SubStr(NewsContent,"樓層:","使用面積:")
Area
=SubStr(NewsContent,"面積:","房價:")
Price
=SubStr(NewsContent,"房價:","其他說明:")
Demostra
=SubStr(NewsContent,"說明:","聯繫人:")
ContactMan
=SubStr(NewsContent,"聯繫人:","聯繫方式:")
Contact
=SubStr(NewsContent,"聯繫方式:","信息來源:"
response.Write(
"總序列號:"&KeyId&"<br>")
response.Write(
"信息類別:"&NewsClass&"<br>")
response.Write(
"所在城市:"&City&"<br>")
response.Write(
"房屋具體位置:"&Position&"<br>")
response.Write(
"房屋類型:"&HouseType&"<br>")
response.Write(
"樓層:"&Level&"<br>")
response.Write(
"使用面積:"&Area&"<br>")
response.Write(
"房價:"&Price&"<br>")
response.Write(
"其他說明:"&Demostra&"<br>")
response.Write(
"聯繫人:"&ContactMan&"<br>")
response.Write(
"聯繫方式:"&Contact&"<br>")
'title=RemoveHTML(aa(i))
'
response.Write("title:"&title)
for n=0 to application.Contents.count
if(application.Contents(n)=KeyId) then
ifexit
=true 
end if 
next 
if not ifexit then
application(
time&i)=KeyId
'添加到數據庫
'
====================================================
set rs=server.CreateObject("adodb.recordset"
rs.open 
"select top 1 * from news order by id desc",conn,3,3
rs.addnew
rs(
"NewsClass")=NewsClass
rs(
"City")=City
rs(
"Position")=Position
rs(
"HouseType")=HouseType
rs(
"Level")=Level
rs(
"Area")=Area
rs(
"Price")=Price
rs(
"Demostra")=Demostra
rs(
"ContactMan")=ContactMan
rs(
"Contact")=Contact
rs.update
rs.close
set rs=nothing
end if
'==================================================

next
function RemoveTag(body)
Set regEx = New RegExp
regEx.Pattern 
= "<[a].*?</[a]>"
regEx.IgnoreCase 
= True
regEx.Global 
= True
Set Matches = regEx.Execute(body) 
dim i,arr(15),ifexit
i
=0
j
=0
For Each Match in Matches
TempStr 
= Match.Value 
TempStr
=replace(TempStr,"<td>","")
TempStr
=replace(TempStr,"</td>","")
TempStr
=replace(TempStr,"<tr>","")
TempStr
=replace(TempStr,"</tr>",""
arr(i)
=TempStr 
i
=i+1
if(i>=15then
exit for
end if
Next
Set regEx=nothing
Set Matches =nothing
RemoveTag
=arr

end function
function RegexHtml(body)
dim r_arr(47),r_temp
Set regEx2 = New RegExp
regEx2.Pattern 
="<a.*?</a>"
regEx2.IgnoreCase 
= True
regEx2.Global 
= True
Set Matches2 = regEx2.Execute(body) 
iii
=0 
For Each Match in Matches2

r_arr(iii)
=Match.Value

iii
=iii+1 
Next
RegexHtml
=r_arr
set regEx2=nothing
set Matches2=nothing
end function
'======================================================
conn.close
set conn=nothing
%
>
</body>
</html>


 
function.asp

 

 

<%
'**************************************************
'
函數名:gotTopic
'
作 用:截字符串,漢字一個算兩個字符,英文算一個字符
'
參 數:str ----原字符串
'
 strlen ----截取長度
'
返回值:截取後的字符串
'
**************************************************
function gotTopic(str,strlen)
if str="" then
gotTopic
=""
exit function
end if
dim l,t,c, i
str
=replace(replace(replace(replace(str,"&nbsp;"," "),"&quot;",chr(34)),"&gt;",">"),"&lt;","<")
str
=replace(str,"?","")
l
=len(str)
t
=0
for i=1 to l
c
=Abs(Asc(Mid(str,i,1)))
if c>255 then
t
=t+2
else
t
=t+1
end if
if t>=strlen then
gotTopic
=left(str,i) & ""
exit for
else
gotTopic
=str
end if
next
gotTopic
=replace(replace(replace(replace(gotTopic," ","&nbsp;"),chr(34),"&quot;"),">","&gt;"),"<","&lt;")
end function
'=========================================================
'
函數:RemoveHTML(strHTML)
'
功能:去除HTML標記
'
參數:strHTML --要去除HTML標記的字符串
'
=========================================================
Function RemoveHTML(strHTML) 
Dim objRegExp, Match, Matches 
Set objRegExp = New Regexp 
objRegExp.IgnoreCase 
= True 
objRegExp.Global 
= True 
'取閉合的<> 
objRegExp.Pattern = "<.+?>" 
'進行匹配 
Set Matches = objRegExp.Execute(strHTML) 
' 遍歷匹配集合,並替換掉匹配的項目 
For Each Match in Matches 
strHtml
=Replace(strHTML,Match.Value,""
Next 
RemoveHTML
=strHTML 
Set objRegExp = Nothing 
set Matches=nothing
End Function 
%
>


  conn.asp
<%
'on error resume next
set conn=server.CreateObject("adodb.connection"
con
= "driver={Microsoft Access Driver (*.mdb)};dbq=" & Server.MapPath("stest.mdb"

conn.open con
sub connclose 
conn.close
set conn=nothing 
end sub
%
>


 
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章