// Writing a web crawler in Java (java寫爬蟲)
package hu;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.HashSet;
import java.util.Properties;
import java.util.Set;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import java.sql;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.mysql.cj.jdbc.Driver;
import com.mysql.cj.xdevapi.Statement;
/**
 * Small crawler: loads the Baidu News front page, follows the category links located
 * through the {@code channel-shanghai} element, and inserts every headline that has not
 * been seen before during this run into the MySQL table {@code tt}.
 *
 * <p>Headline and category text come from remote pages, so all values are written through a
 * parameterized {@link java.sql.PreparedStatement} rather than concatenated into SQL.
 */
public class SyncTest {
    public static Integer SyncTest = 0;

    /** JDBC connection string of the target database. */
    private static final String JDBC_URL =
            "jdbc:mysql://localhost:3306/test01?useSSL=false&allowPublicKeyRetrieval=true&serverTimezone=Asia/Shanghai";
    /** MySQL user name. */
    private static final String DB_USER = "root";
    /**
     * MySQL login password; set this to your own.
     * NOTE(review): credentials are hard-coded in source; consider moving them to configuration.
     */
    private static final String DB_PASSWORD = "1234567890";

    /** Entry page that lists the news categories. */
    private static final String NEWS_HOME = "http://news.baidu.com/";
    /** Selector used on each category page to find headline anchors. */
    private static final String HEADLINE_SELECTOR = "a[target]a[mon]";
    /** Connect/read timeout for every page fetch, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 5000;
    /** Sibling indexes (inclusive) that are treated as category entries. */
    private static final int FIRST_CATEGORY_INDEX = 1;
    private static final int LAST_CATEGORY_INDEX = 12;

    /**
     * Insert statement for one headline row.
     * NOTE(review): column name "tiem" is kept exactly as in the original statement; it looks
     * like a typo for "time" - confirm against the real schema of table tt before renaming.
     */
    private static final String INSERT_SQL = "INSERT INTO tt(id,tiem,title,class) VALUES(?,?,?,?)";

    /**
     * Runs one crawl pass and prints the total number of headline links visited.
     *
     * @param args unused
     * @throws MalformedURLException declared for compatibility with existing callers
     * @throws IOException if a page cannot be fetched
     * @throws SQLException if the database connection or an insert fails
     * @throws IllegalStateException if the front page does not contain the expected navigation
     */
    public static void main(String[] args) throws MalformedURLException, IOException, SQLException {
        // Counts every headline link visited (duplicates included); also used as the row id.
        int sum = 0;
        // Headline texts already stored during this run, used to skip duplicates.
        Set<String> seenTitles = new HashSet<>();
        SimpleDateFormat formatter = new SimpleDateFormat("dd-MM-yyyy HH:mm:ss");

        DriverManager.registerDriver(new Driver());

        // try-with-resources guarantees the statement and connection are closed on every path.
        try (java.sql.Connection connection = DriverManager.getConnection(JDBC_URL, DB_USER, DB_PASSWORD);
             java.sql.PreparedStatement insert = connection.prepareStatement(INSERT_SQL)) {

            Document home = Jsoup.connect(NEWS_HOME).timeout(TIMEOUT_MILLIS).get();
            Element anchor = home.getElementById("channel-shanghai");
            if (anchor == null) {
                throw new IllegalStateException("Element 'channel-shanghai' not found on " + NEWS_HOME);
            }
            Element nav = anchor.firstElementSibling();
            if (nav == null) {
                throw new IllegalStateException("Element 'channel-shanghai' has no sibling elements");
            }

            for (int i = FIRST_CATEGORY_INDEX; i <= LAST_CATEGORY_INDEX; i++) {
                Elements categoryNodes = nav.getElementsByIndexEquals(i);
                String category = categoryNodes.text();
                System.out.println(category);

                // Category link
                String url = categoryNodes.select("a[href]").attr("abs:href");
                if (url.isEmpty()) {
                    // No link at this index: nothing to crawl, and Jsoup rejects an empty URL.
                    System.err.println("No category link at index " + i + ", skipping");
                    continue;
                }

                Document categoryPage = Jsoup.connect(url).timeout(TIMEOUT_MILLIS).get();
                Elements headlines = categoryPage.select(HEADLINE_SELECTOR).select("a[href]");

                for (Element headline : headlines) {
                    // Crawl time
                    String fetchedAt = formatter.format(Calendar.getInstance().getTime());
                    String title = headline.text();
                    System.out.println("抓取時間"+fetchedAt);
                    System.out.println("提取的標籤:"+title);
                    sum++;

                    // Set.add returns false for a title already stored in this run.
                    if (seenTitles.add(title)) {
                        String key = String.valueOf(sum);
                        insert.setString(1, key);
                        insert.setString(2, fetchedAt);
                        insert.setString(3, title);
                        insert.setString(4, category);
                        System.out.println(INSERT_SQL + " <- [" + key + ", " + fetchedAt + ", " + title + ", " + category + "]");
                        insert.executeUpdate();
                    }
                }
            }
        }
        System.out.println(sum);
    }
}