1. 程式人生 > >如何實現新聞自動抓取?

如何實現新聞自動抓取?

我想做一個系統能夠從一些別的網站抓新聞,然後動態新增到我的站點上來,請問如何實現?
新浪的新聞抓取
1。首頁呼叫
<style type="text/css">
<!--
body {  font-size: 12px}
-->
</style>
<%
Server.ScriptTimeOut=120

''*********頁面設定部分***********************************************************************

const m=40 ''首頁列出多少條新聞

const NeedTime=False ''是否需要顯示時間,True 表示顯示時間 , False 表示不顯示時間

const NewsLength=20 ''新聞標題擷取長度(不包括時間),注意截取了新聞長度就不能顯示新聞時間

const Points="…" ''擷取長度後的標題要跟的省略號樣子,可不填。

''*********************************************************************************************

dim wstr,str,url,start,over,i,News


on error resume next
url="http://dailynews.sina.com.cn/news1000.shtml"
wstr=getHTTPPage(url)
if err.number=0 then
start=newstring(wstr,"<!--新聞開始-->")
over=newstring(wstr,"<!--新聞結束-->")
wstr=mid(wstr,start+11,over-start-11)
wstr=replace(wstr,"<ul>","")
wstr=trim(replace(wstr,"</ul>",""))
'' Set fs = CreateObject("Scripting.FileSystemObject")
'' Set f = fs.CreateTextFile(server.mappath("mynews.htm"))
'' f.writeLine wstr
'' f.close
'' set f = nothing
'' set fs = nothing
str=split(wstr,"<li>")
If Unbound(str)<m then m=Unbound(str)
for i=1 to m
News=News&"<li>"&LeftNews(str(i),NewsLength,NeedTime)
next
set str=nothing
else
wscript.echo err.description
end if


Sub writeLog(Msg)
On Error Resume Next
Dim f
Set f = fs.OpenTextFile(logfile,8,true)
f.WriteLine now & " - " & Msg
f.close
End Sub
function getHTTPPage(url)
on error resume next
dim http
set http=Server.createobject("Microsoft.XMLHTTP")
Http.open "GET",url,false
Http.send()
if Http.readystate<>4 then
exit function
end if
getHTTPPage=bytes2BSTR(Http.responseBody)
set http=nothing
if err.number<>0 then err.Clear  
end function

Function bytes2BSTR(vIn)
dim strReturn
dim i,ThisCharCode,NextCharCode
strReturn = ""
For i = 1 To LenB(vIn)
ThisCharCode = AscB(MidB(vIn,i,1))
If ThisCharCode < &H80 Then
strReturn = strReturn & Chr(ThisCharCode)
Else
NextCharCode = AscB(MidB(vIn,i+1,1))
strReturn = strReturn & Chr(CLng(ThisCharCode) * &H100 + CInt(NextCharCode))
i = i + 1
End If
Next
bytes2BSTR = strReturn
End Function

Function newstring(wstr,strng)
newstring=Instr(wstr,strng)
End Function

Function LeftNews(strng,NewsLength,NeedTime)
If NeedTime<>True then
Left_0=Instr(strng,"</a>")+3
TheRed=Instr(strng,"<font color=#ff0000>")
If TheRed>0 then
Left_1=Instr(strng,"<font color=#ff0000>")+20
Left_2=Instr(strng,"</font>")
If Left_1+NewsLength>=Left_2 then
LeftNews=Left(strng,Left_0)
Else
LeftNews=Left(strng,Left_1+NewsLength)&Points&"</font></a>"
End if
Else
Left_1=Instr(strng,"_blank>")+7
Left_2=Instr(strng,"</a>")
If Left_1+NewsLength>=Left_2 then
LeftNews=Left(strng,Left_0)
Else
LeftNews=Left(strng,Left_1+NewsLength)&Points&"</a>"
End if
End if
Else
LeftNews=strng
End if
End Function


Response.Write News ''變數News為內容
%>
2。新聞列表
<style type="text/css">
<!--
body {  font-size: 12px}
-->
</style>

<a href="news.asp">首頁</a>
<a href="news.asp?n=娛樂">娛樂</a>
<a href="news.asp?n=體育">體育</a>
<a href="news.asp?n=國內">國內</a>
<a href="news.asp?n=科技">科技</a>
<a href="news.asp?n=財經">財經</a>
<a href="news.asp?n=社會">社會</a>
<a href="news.asp?n=汽車">汽車</a>
<a href="news.asp?n=國際">國際</a>
<a href="news.asp?n=文教">文教</a>
<a href="news.asp?n=影音">影音</a>
<p>
<%
Server.ScriptTimeOut=120

''*********頁面設定部分***********************************************************************

const m=10 ''每個分類的新聞最多幾條

const NeedTime=False ''是否需要顯示時間,True 表示顯示時間 , False 表示不顯示時間

const NewsLength=20 ''新聞標題擷取長度(不包括時間),注意截取了新聞長度就不能顯示新聞時間

const Points="…" ''擷取長度後的標題要跟的省略號樣子,可不填。

''*********************************************************************************************

dim wstr,str,url,start,over,NewsClass,i
dim n0,n1,n2,n3,n4,n5,n6,n7,n8,n9
n0=0
n1=0
n2=0
n3=0
n4=0
n5=0
n6=0
n7=0
n8=0
n9=0

NewsClass=trim(Request("n"))

on error resume next
url="http://dailynews.sina.com.cn/news1000.shtml" ''新聞來源的頁面
wstr=getHTTPPage(url) ''取得頁面內容
if err.number=0 then
start=newstring(wstr,"<!--新聞開始-->")
over=newstring(wstr,"<!--新聞結束-->")
wstr=mid(wstr,start+11,over-start-11)
wstr=replace(wstr,"href=""","href=""show.asp?url=")
wstr=replace(wstr,"<ul>","")
wstr=trim(replace(wstr,"</ul>","")) ''完成對頁面內容的擷取加工
'' Set fs = CreateObject("Scripting.FileSystemObject")
'' Set f = fs.CreateTextFile(server.mappath("mynews.htm"))
'' f.writeLine wstr
'' f.close
'' set f = nothing
'' set fs = nothing
str=split(wstr,"<li>")
If NewsClass<>"" then ''對分類新聞的擷取
for i=1 to Ubound(str)
If Left(str(i),4)="["&NewsClass&"]" then
News=News&"<li>"&LeftNews(str(i),NewsLength,NeedTime)
End if
next
Else ''對所有新聞進行分類
for i=1 to Ubound(str)
If     Left(str(i),4)="[娛樂]" then
If n0<m then YuLe=YuLe&"<li>"&LeftNews(str(i),NewsLength,NeedTime)
n0=n0+1
Elseif Left(str(i),4)="[體育]" then
If n1<m then TiYu=TiYu&"<li>"&LeftNews(str(i),NewsLength,NeedTime)
n1=n1+1
Elseif Left(str(i),4)="[國內]" then
If n2<m then GuoNei=GuoNei&"<li>"&LeftNews(str(i),NewsLength,NeedTime)
n2=n2+1
Elseif Left(str(i),4)="[科技]" then
If n3<m then KeJi=KeJi&"<li>"&LeftNews(str(i),NewsLength,NeedTime)
n3=n3+1
Elseif Left(str(i),4)="[財經]" then
If n4<m then CaiJing=CaiJing&"<li>"&LeftNews(str(i),NewsLength,NeedTime)
n4=n4+1
Elseif Left(str(i),4)="[社會]" then
If n5<m then SheHui=SheHui&"<li>"&LeftNews(str(i),NewsLength,NeedTime)
n5=n5+1
Elseif Left(str(i),4)="[汽車]" then
If n6<m then QiChe=QiChe&"<li>"&LeftNews(str(i),NewsLength,NeedTime)
n6=n6+1
Elseif Left(str(i),4)="[國際]" then
If n7<m then GuoJi=GuoJi&"<li>"&LeftNews(str(i),NewsLength,NeedTime)
n7=n7+1
Elseif Left(str(i),4)="[影音]" then
If n8<m then YingYin=YingYin&"<li>"&LeftNews(str(i),NewsLength,NeedTime)
n8=n8+1
Elseif Left(str(i),4)="[文教]" then
If n9<m then WenJiao=WenJiao&"<li>"&LeftNews(str(i),NewsLength,NeedTime)
n9=n9+1
End if
next
End if
set str=nothing
else
wscript.echo err.description
end if

Sub writeLog(Msg)
On Error Resume Next
Dim f
Set f = fs.OpenTextFile(logfile,8,true)
f.WriteLine now & " - " & Msg
f.close
End Sub
function getHTTPPage(url)
on error resume next
dim http
set http=Server.createobject("Microsoft.XMLHTTP")
Http.open "GET",url,false
Http.send()
if Http.readystate<>4 then
exit function
end if
getHTTPPage=bytes2BSTR(Http.responseBody)
set http=nothing
if err.number<>0 then err.Clear  
end function

Function bytes2BSTR(vIn)
dim strReturn
dim i,ThisCharCode,NextCharCode
strReturn = ""
For i = 1 To LenB(vIn)
ThisCharCode = AscB(MidB(vIn,i,1))
If ThisCharCode < &H80 Then
strReturn = strReturn & Chr(ThisCharCode)
Else
NextCharCode = AscB(MidB(vIn,i+1,1))
strReturn = strReturn & Chr(CLng(ThisCharCode) * &H100 + CInt(NextCharCode))
i = i + 1
End If
Next
bytes2BSTR = strReturn
End Function

Function newstring(wstr,strng)
newstring=Instr(wstr,strng)
End Function

Function LeftNews(strng,NewsLength,NeedTime)
If NeedTime<>True then
Left_0=Instr(strng,"</a>")+3
TheRed=Instr(strng,"<font color=#ff0000>")
If TheRed>0 then
Left_1=Instr(strng,"<font color=#ff0000>")+20
Left_2=Instr(strng,"</font>")
If Left_1+NewsLength>=Left_2 then
LeftNews=Left(strng,Left_0)
Else
LeftNews=Left(strng,Left_1+NewsLength)&Points&"</font></a>"
End if
Else
Left_1=Instr(strng,"_blank>")+7
Left_2=Instr(strng,"</a>")
If Left_1+NewsLength>=Left_2 then
LeftNews=Left(strng,Left_0)
Else
LeftNews=Left(strng,Left_1+NewsLength)&Points&"</a>"
End if
End if
Else
LeftNews=strng
End if
End Function

''每個變數代表一個分類的新聞

Response.Write YuLe&"<p>"
Response.Write TiYu&"<p>"
Response.Write GuoNei&"<p>"
Response.Write KeJi&"<p>"
Response.Write CaiJing&"<p>"
Response.Write SheHui&"<p>"
Response.Write QiChe&"<p>"
Response.Write GuoJi&"<p>"
Response.Write YingYin&"<p>"
Response.Write WenJiao
''變數News是選擇分類新聞後的變數
Response.Write News

%>
3。新聞內容
<%
Server.ScriptTimeOut=60
dim wstr,url,start,over,i


on error resume next
url=Request("url")
wstr=getHTTPPage(url)
if err.number=0 then
wstr=Autolink(wstr) ''完成擷取後的頁面
'' Set fs = CreateObject("Scripting.FileSystemObject") ''把截下來的頁面寫在一個檔案裡
'' Set f = fs.CreateTextFile(server.mappath("mynews.htm"))
'' f.writeLine wstr
'' f.close
'' set f = nothing
'' set fs = nothing
else
wscript.echo err.description
end if

function getHTTPPage(url)
on error resume next
dim http
set http=Server.createobject("Microsoft.XMLHTTP")
Http.open "GET",url,false
Http.send()
if Http.readystate<>4 then
exit function
end if
getHTTPPage=bytes2BSTR(Http.responseBody)
set http=nothing
if err.number<>0 then err.Clear  
end function

Function bytes2BSTR(vIn)
dim strReturn
dim i,ThisCharCode,NextCharCode
strReturn = ""
For i = 1 To LenB(vIn)
ThisCharCode = AscB(MidB(vIn,i,1))
If ThisCharCode < &H80 Then
strReturn = strReturn & Chr(ThisCharCode)
Else
NextCharCode = AscB(MidB(vIn,i+1,1))
strReturn = strReturn & Chr(CLng(ThisCharCode) * &H100 + CInt(NextCharCode))
i = i + 1
End If
Next
bytes2BSTR = strReturn
End Function

Function NewsString(wstr,strng)
NewsString=Instr(wstr,strng)
End Function

Function Autolink(strContent)
dim re
set re = New RegExp
re.IgnoreCase = True
re.Global = True
If Instr(url,"http://ent.")>0 then ''影音和娛樂新聞的介面
start=NewsString(strContent,"<table width=604") ''擷取的起點
over=NewsString(strContent,"<center></center>") ''擷取的終點
strContent=mid(strContent,start,over-start) ''擷取新聞
re.Pattern = "/<table border=0(.[^/[]*)/<//table>"
strContent = re.Replace(strContent,"") ''去掉畫中畫廣告
strContent = Replace(strContent,"/p>","") ''去掉頁面中一個奇怪的錯誤
strContent = Replace(strContent,"<table width=604 border=0 cellpadding=0 cellspacing=0>","")
strContent = Replace(strContent,"</table></table>","")
strContent = Replace(strContent,"<img src=http://image2.sina.com.cn/ent/news_rou.gif width=30 height=53>","")
strContent = Replace(strContent,"<img src=http://image2.sina.com.cn/ent/images/c.gif width=1 height=1>","<hr size=1 bgcolor=#d9d9d9>")
strContent = Replace(strContent,"bgcolor=#fff3ff","") ''去掉背景顏色
strContent = Replace(strContent,"bgcolor=#bd6bff","") ''去掉背景顏色
strContent = Replace(strContent,"width=603","width=100% ") ''把一個定義了大小的表格放到最大
strContent = Replace(strContent,"width=554","width=100% ") ''把一個定義了大小的表格放到最大
strContent = "<table width=100% border=0 cellspacing=0 cellpadding=10 align=center >"&strContent&"</td></tr></table>" ''修補HTML的結構錯誤
Else ''其他分類新聞的介面
start=NewsString(strContent,"<th class=f24>") ''擷取的起點
over=NewsString(strContent,"<br clear=all>") ''擷取的終點
strContent=mid(strContent,start,over-start) ''擷取新聞
re.Pattern = "/<table border=0(.[^/[]*)/<//table>"
strContent = re.Replace(strContent,"") ''去掉畫中畫廣告
strContent = Replace(strContent,"/p>","") ''去掉頁面中一個奇怪的錯誤
strContent = "<table width=100% border=0 cellspacing=0 cellpadding=10 align=center >"&strContent&"</td></tr></table>" ''修補HTML的結構錯誤
End if
Autolink=strContent
End Function

%>
<style type="text/css">
<!--
td {  font-size: 12px}
-->
</style>
<table width="770" border="0" cellspacing="0" cellpadding="10" align="center" class="line_l_r" bgcolor="#EEEEEE">
  <tr>
    <td>
  <% Response.Write wstr %>

</td>
  </tr>
</table>