java使用jsoup爬蟲入門
阿新 • • 發佈:2019-02-05
一、maven專案裡pom新增jsoup依賴
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.9.2</version>
</dependency>
二、以 CSDN 首頁為例:抓取頁面,用 DOM 選擇器取出文章標題與連結並寫入本地文字檔,再把頁面上的 PNG 圖片下載到本地
public class JsoupDemo { private static OutputStream os; public static void main(String[] args) { try { Document doc = Jsoup.connect("https://www.csdn.net/").get(); // System.out.println(doc.title()); //CSDN-專業IT技術社群 //把文章標題和連線寫入txt檔案 Element feedlist_id = doc.getElementById("feedlist_id"); Elements h2 = feedlist_id.select("h2.csdn-tracking-statistics"); Elements a = h2.select("a"); //指定檔名及路徑 File file = new File("E:\\jsoup\\word\\test.txt"); if (!file.exists()) { file.createNewFile(); } //寫入本地 PrintWriter pw = new PrintWriter("E:\\jsoup\\word\\test.txt","UTF-8"); for (Element element : a) { pw.println(element.text()); pw.println(element.attr("href")); pw.println("------------------------------------------------------------------------------------------------------------------------------------"); } pw.close(); //關閉輸出流 //獲取頁面上的圖片儲存到本地 Elements imgs = doc.select("img[src$=.png]"); for (Element element : imgs) { String img = element.attr("src"); String url = "http:"+img; System.out.println(url); System.out.println(url.indexOf("csdn")); if (url.indexOf("csdn")==-1) { continue; } URL u = new URL(url); URLConnection uc=u.openConnection(); //獲取資料流 InputStream is=uc.getInputStream(); //獲取字尾名 String imageName = img.substring(img.lastIndexOf("/") + 1,img.length()); //寫入本地 os = new FileOutputStream(new File("E:\\jsoup\\img", imageName)); byte[] b = new byte[1024]; int i=0; while((i=is.read(b))!=-1){ os.write(b, 0, i); } is.close(); os.close(); } } catch (IOException e) { e.printStackTrace(); } } }
三、效果如下