sina網頁新聞小偷原理及原始碼(java版)
在網上看到網頁小偷,有偷笑話的有偷天氣的。。。最可笑的是GOOGLE上有個人釋出了一篇文章,說是js版本的,我開啟一看,language=vbscript.......還被很多人引用,一搜網頁小偷,都是那一篇文章,真讓人無語。更讓人無語的就是那個所謂的超級經典的笑話小偷。引用了很多網站上的js,根本看不到js程式碼。下載下來原始碼,更令人吐血,說是用js實現的,結果裡面的js都是0KB,就是空檔案。
我今天寫了個偷新聞的,由於是“偷”,所以程式得基於要偷的網站的程式碼。
簡單的說,就是開啟網頁,看他的原始檔。然後使用java的URL類獲取網頁內容,使用正則表示式過濾想要的內容,我寫的是偷新浪新聞的。還有很多地方沒有完善,不過基本功能實現了。開啟頁面,有個按鈕,點開始,就會開始下載和過濾。然後將新聞存入資料庫中。
以下程式碼:
callJsp.jsp:
<%@ page language="java" import="java.util.*" pageEncoding="utf-8"%>
<%-- Entry page: one button whose form submits to the relative URL "CallServlet". --%>
<%-- The declared charset below matches pageEncoding so the browser decodes the page consistently. --%>
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>callJsp</title>
</head>
<body>
<form action="CallServlet">
<button type="submit">start</button>
</form>
</body>
</html>
CallServlet.java:
package servlet;
import java.io.IOException;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import bo.GetContent;
public class CallServlet extends HttpServlet {
private static final long serialVersionUID = 1L;
public void doGet(HttpServletRequest request, HttpServletResponse response)
throws ServletException, IOException {
System.out.println("called");
GetContent gc = new GetContent();
gc.getContent("http://news.sina.com.cn
");
gc.savaToDB();
}
public void doPost(HttpServletRequest request, HttpServletResponse response)
throws ServletException, IOException {
this.doGet(request, response);
}
}
實體類:
package dto;
/**
 * Mutable bean holding one scraped news item: a numeric id, a title and the
 * article content. All three start at their Java defaults (0 / null).
 */
public class New {
    private int id;
    private String title;
    private String content;

    /** @return the id of this item */
    public int getId() {
        return this.id;
    }

    /** @param id the id to assign to this item */
    public void setId(int id) {
        this.id = id;
    }

    /** @return the title, or {@code null} if none has been set */
    public String getTitle() {
        return this.title;
    }

    /** @param title the title to assign; may be {@code null} */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the content, or {@code null} if none has been set */
    public String getContent() {
        return this.content;
    }

    /** @param content the content to assign; may be {@code null} */
    public void setContent(String content) {
        this.content = content;
    }
}
邏輯類(關鍵程式碼):
package bo;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import dao.NewsDAO;
import dto.New;
public class GetContent {
ArrayList<String> newsList;
Pattern newsPat;
Pattern mainPat;
List<New> news;
NewsDAO newsDAO = new NewsDAO();
int i = 1;
public GetContent() {
String newsReg = "<li(.*)<a(.*)(http://news[/.]sina[/.]com[/.]cn(.*)[/.]shtml)+(.*)>.*</a></li
>";
String mainReg = "<p>(.*)</p>";
newsList = new ArrayList<String>();
newsPat = Pattern.compile(newsReg);
mainPat = Pattern.compile(mainReg);
news = new ArrayList<New>();
}
public void getContent(String URLSTR) {
try {
URL url = new URL(URLSTR);
InputStream o = url.openStream();
BufferedReader br = new BufferedReader(new InputStreamReader(o,
"gbk"));
String line = null;
while ((line = br.readLine()) != null) {
Matcher m = newsPat.matcher(line);
if (m.matches()) {
newsList.add(line);
System.out.println(line);
getMain(line);
}
}
} catch (IOException e) {
e.printStackTrace();
}
}
public void getMain(String line) throws MalformedURLException {
String newsContent = null;
int indexOfHrefBegin = line.indexOf("http");
int indexOfHrefEnd = line.indexOf("shtml");
int indexOfTitleBegin = line.indexOf(">", indexOfHrefBegin);
int indexOfTitleEnd = line.indexOf("</a>");
String title = line.substring(indexOfTitleBegin + 1, indexOfTitleEnd);
System.out.println(title);
String hrefStr = line.substring(indexOfHrefBegin, indexOfHrefEnd + 5);
URL u = new URL(hrefStr);
try {
InputStream o = u.openStream();
BufferedReader br = new BufferedReader(new InputStreamReader(o,
"gbk"));
String content = null;
StringBuffer sf = new StringBuffer();
while ((content = br.readLine()) != null) {
Matcher m = mainPat.matcher(content);
if (m.matches()) {
sf.append(content);
newsContent = sf.toString();
}
}
if (sf != null) {
New n = new New();
n.setId(i);
i++;
n.setContent(newsContent);
n.setTitle(title);
news.add(n);
}
} catch (IOException e) {
e.printStackTrace();
}
}
public void savaToDB() {
newsDAO.saveNews(news);
}
}
持久類:
package dao;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.List;
import dto.New;
/**
 * Writes scraped news items into the {@code news} table.
 *
 * <p>A connection is requested from {@link DBConnection} when the DAO is
 * created. Every {@link #saveNews(List)} call closes the statement and the
 * connection when it finishes, and the next call requests a fresh connection.
 */
public class NewsDAO {
    Connection con = DBConnection.getConnection();
    PreparedStatement pst = null;
    ResultSet rs = null;

    /**
     * Inserts each item as one row (id, title, content), in list order.
     *
     * <p>SQL failures are printed and stop the remaining inserts; they are not
     * propagated. A {@code null} or empty list inserts nothing. The statement
     * and connection are always closed afterwards.
     *
     * @param news items to insert; may be {@code null} or empty
     */
    public void saveNews(List<New> news) {
        try {
            // The previous call closed the connection, so reacquire when needed.
            if (con == null || con.isClosed()) {
                con = DBConnection.getConnection();
            }
            if (con == null) {
                throw new SQLException("could not obtain a database connection");
            }
            if (news != null && !news.isEmpty()) {
                String sql = "insert into news values(?,?,?)";
                pst = con.prepareStatement(sql);
                for (New n : news) {
                    pst.setInt(1, n.getId());
                    pst.setString(2, n.getTitle());
                    pst.setString(3, n.getContent());
                    pst.executeUpdate();
                }
            }
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            // Close each resource independently and only if it exists, so a
            // failure above is not masked by a NullPointerException here.
            if (pst != null) {
                try {
                    pst.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
                pst = null;
            }
            if (con != null) {
                try {
                    con.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
                con = null;
            }
        }
    }
}
資料庫連結類:
package dao;
import java.sql.Connection;
import java.sql.DriverManager;
/**
 * Opens JDBC connections to the {@code newsstealing} MySQL database.
 *
 * <p>The driver class is loaded once when this class is initialised; a missing
 * driver is printed rather than thrown.
 */
public class DBConnection {
    private static final String DBDRIVER = "com.mysql.jdbc.Driver";
    private static final String DBURL = "jdbc:mysql://localhost:3306/newsstealing?useUnicode=true&characterEncoding=GBK";
    // NOTE(review): credentials are hard-coded; consider loading them from configuration.
    private static final String DBUSER = "root"; // user name
    private static final String PASSWORD = "pass"; // password

    static {
        try {
            Class.forName(DBDRIVER);
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
    }

    /**
     * Opens a new connection on every call.
     *
     * <p>No connection is cached in a shared field, so a failed attempt can
     * never hand back a connection from an earlier call.
     *
     * @return a new connection, or {@code null} if it could not be opened
     *         (the failure is printed)
     */
    public static Connection getConnection() {
        try {
            return DriverManager.getConnection(DBURL, DBUSER, PASSWORD);
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }
}
所有的類都在這裡了。根據類名和包名,自己構建一個web project。把mysql的驅動程式放到lib下面。在資料庫伺服器中建立一個資料庫和表(資料庫連結類中和dao中可以找到名字)。可以根據自己的條件使用不同的資料庫。因為功能還不完善,所以建立表的時候,除了ID,其它的欄位要允許為空值,否則插入的時候,如果有的新聞內容沒有獲取到,就會出錯。