A Rather Trashy Whole-Site Crawler -- Java Crawler
阿新 • Published: 2019-01-13
Jsoup
Reads seed pages from a file, crawls the entire site's data, and saves it.
If you just want something quick to use, this will do; if you are studying it to learn from, I personally find it a bit messy.
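The seed URLs come from a plain-text file, one URL per line; the path E://crawler4j/房地產行業seeds.txt is hard-coded in main below. As a hypothetical example, the seeds file could contain:

    http://www.minagri.gov.rw/index.php?id=16
    http://example.com/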
package cn;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.FileUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;

public class CrawlerUtils {

    public static int count = 0;

    // seed pages read from the seeds file
    public static List<String> list = new ArrayList<String>();

    // every URL crawled so far, used for deduplication
    public static HashSet<String> hashSet = new HashSet<String>();

    // thread pool (created but never actually used in this version)
    ExecutorService pool = Executors.newFixedThreadPool(5);

    /** Fetch a page and return its HTML, or null on failure. */
    public static String gethtml(String url) {
        String content;
        try {
            Connection con = Jsoup.connect(url);
            con.header("Accept", "text/html, application/xhtml+xml, */*");
            con.header("Content-Type", "application/x-www-form-urlencoded");
            con.header("User-Agent",
                    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)");
            con.header("Cookie", "");
            content = con.get().toString();
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
        return content;
    }

    /**
     * Collect every URL that stays on the main site; returns them as a list.
     */
    public static List<String> geturl(String html, String url) {
        List<String> list = new ArrayList<String>();
        Pattern pattern = Pattern.compile("href=\"(.*?)\"");
        Matcher matcher = pattern.matcher(html);
        // find() iterates forward through the matches
        while (matcher.find()) {
            String urlline = matcher.group().replace("href=\"", "")
                    .replace("\"", "");
            if (urlline.startsWith("http")) {
                // Absolute link: keep it only if it contains the seed's host.
                // (The original code had a separate "https" branch that was
                // unreachable, because contains("http") also matches https.)
                String host = url.replace("http://", "").replace("https://", "");
                if (urlline.contains(host)) {
                    System.out.println(urlline);
                    list.add(urlline);
                }
            } else {
                // relative link: naively resolve it against the seed URL
                String urlString = url.substring(0, url.length() - 1) + urlline;
                System.out.println(urlString);
                list.add(urlString);
            }
        }
        return list;
    }

    /**
     * Save the HTML by appending it to a file.
     */
    public static void saveFile(String pathname, String html, String charset)
            throws IOException {
        FileUtils.write(new File(pathname), html, charset, true);
    }

    /**
     * Write data to a file through a raw byte stream (currently unused).
     */
    public static void WriteByte(String pathname, String date, String charset)
            throws IOException {
        File file = new File(pathname);
        OutputStream outputStream = new FileOutputStream(file);
        byte[] datebyte = date.getBytes(charset);
        outputStream.write(datebyte);
        outputStream.close();
    }

    /** Main worker: fetch a page, extract its links, and save each new page. */
    public static void mainUtil(String url, String maintitle) {
        try {
            String html = gethtml(url);
            System.out.println(html);
            List<String> urlList = geturl(html, url);
            for (String string : urlList) {
                // HashSet.add returns false for URLs we have already seen
                if (hashSet.add(string)) {
                    String htmlline = gethtml(string);
                    try {
                        String title = Jsoup.parse(htmlline)
                                .getElementsByTag("title").get(0).text();
                        saveFile("E://crawler4j/房地產行業/" + maintitle + "/"
                                + title + System.currentTimeMillis() + ".html",
                                htmlline, "utf8");
                        System.out.println("Saved file No. " + count++ + ": " + string);
                    } catch (Exception e) {
                        e.printStackTrace();
                        System.out.println("Write No. " + count + " failed!!!"
                                + " URL: " + string);
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) {
        try {
            FileReader reader = new FileReader("E://crawler4j/房地產行業seeds.txt");
            BufferedReader br = new BufferedReader(reader);
            // readLine() returns null at end of file; the original used
            // br.ready(), which is not a reliable end-of-stream test
            String line;
            while ((line = br.readLine()) != null) {
                list.add(line);
            }
            br.close();
            reader.close();
        } catch (Exception e1) {
            e1.printStackTrace();
            System.out.println("No seed pages!!");
        }
        // sample seed: http://www.minagri.gov.rw/index.php?id=16
        for (String url : list) {
            String maintitle = "untitled" + System.currentTimeMillis();
            try {
                maintitle = Jsoup.connect(url).get()
                        .getElementsByTag("title").get(0).text();
            } catch (Exception e) {
                e.printStackTrace();
                continue;
            }
            mainUtil(url, maintitle);
            String html = gethtml(url);
            // was html.equals(null), which throws a NullPointerException when
            // html is null and can never be true otherwise
            if (html == null) {
                continue;
            }
            System.out.println(html);
            List<String> urlList = geturl(html, url);
            for (int i = 0; i < urlList.size(); i++) {
                mainUtil(urlList.get(i), maintitle);
            }
        }
    }
}