1. 程式人生 > java使用jsoup爬蟲入門

java使用jsoup爬蟲入門

一、maven專案裡pom新增jsoup依賴

<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.9.2</version>
</dependency>

二、以 CSDN 首頁為例:抓取頁面,使用 DOM 選擇器取得文章標題、連結與圖片,並寫入本機

/**
 * Minimal jsoup crawler demo.
 *
 * <p>Fetches the CSDN home page, writes the text and {@code href} of every
 * article link found in the feed list to a UTF-8 text file, and then downloads
 * every {@code .png} image whose URL contains {@code "csdn"} into a local
 * directory.
 */
public class JsoupDemo {
	/** Page that is crawled. */
	private static final String PAGE_URL = "https://www.csdn.net/";
	/** Text file that receives the article titles and links. */
	private static final String LINKS_FILE = "E:\\jsoup\\word\\test.txt";
	/** Directory that receives the downloaded images. */
	private static final String IMAGE_DIR = "E:\\jsoup\\img";
	/** Line written after every title/link pair. */
	private static final String SEPARATOR =
			"------------------------------------------------------------------------------------------------------------------------------------";
	/** Upper bounds so that a stalled image server cannot hang the program forever. */
	private static final int CONNECT_TIMEOUT_MS = 10000;
	private static final int READ_TIMEOUT_MS = 30000;

	/**
	 * Runs the demo. Any I/O failure is reported on standard error and ends the run.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		try {
			Document doc = Jsoup.connect(PAGE_URL).get();
			saveArticleLinks(doc);
			saveImages(doc);
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Writes the title and link of each article in the feed list to {@link #LINKS_FILE}.
	 *
	 * @param doc the parsed page
	 * @throws IOException if the target file cannot be created or written
	 */
	private static void saveArticleLinks(Document doc) throws IOException {
		Element feedList = doc.getElementById("feedlist_id");
		if (feedList == null) {
			// The id is missing when the page layout changes; skip instead of
			// failing with a NullPointerException so the image step still runs.
			System.err.println("Element #feedlist_id not found; no article links written");
			return;
		}
		Elements links = feedList.select("h2.csdn-tracking-statistics").select("a");

		File target = new File(LINKS_FILE);
		ensureDirectory(target.getParentFile());
		// try-with-resources closes the writer even when a write fails.
		try (PrintWriter writer = new PrintWriter(target, "UTF-8")) {
			for (Element link : links) {
				writer.println(link.text());
				writer.println(link.attr("href"));
				writer.println(SEPARATOR);
			}
			// PrintWriter never throws on write errors; it only records them.
			if (writer.checkError()) {
				throw new IOException("Failed to write " + target);
			}
		}
	}

	/**
	 * Downloads every {@code .png} image on the page whose URL contains
	 * {@code "csdn"} into {@link #IMAGE_DIR}.
	 *
	 * @param doc the parsed page
	 * @throws IOException if the directory cannot be created or a download fails
	 */
	private static void saveImages(Document doc) throws IOException {
		File imageDir = new File(IMAGE_DIR);
		ensureDirectory(imageDir);
		for (Element image : doc.select("img[src$=.png]")) {
			// absUrl resolves relative and protocol-relative ("//host/x.png")
			// sources against the page URL and leaves absolute ones untouched;
			// it yields "" when no absolute URL can be built.
			String url = image.absUrl("src");
			System.out.println(url);
			if (!url.contains("csdn")) {
				continue;
			}
			download(url, new File(imageDir, fileNameOf(url)));
		}
	}

	/**
	 * Derives a local file name from the last path segment of {@code url}.
	 * Characters that are path separators or illegal in Windows file names are
	 * replaced, so a remote name cannot point outside the image directory.
	 */
	private static String fileNameOf(String url) {
		String name = url.substring(url.lastIndexOf('/') + 1);
		return name.replaceAll("[\\\\/:*?\"<>|]", "_");
	}

	/**
	 * Copies the content behind {@code url} into {@code target}.
	 *
	 * @throws IOException if the URL is malformed, the connection fails or the
	 *                     file cannot be written
	 */
	private static void download(String url, File target) throws IOException {
		URLConnection connection = new URL(url).openConnection();
		connection.setConnectTimeout(CONNECT_TIMEOUT_MS);
		connection.setReadTimeout(READ_TIMEOUT_MS);
		// Both streams are closed in reverse order, on success and on failure.
		try (InputStream in = connection.getInputStream();
				OutputStream out = new FileOutputStream(target)) {
			byte[] buffer = new byte[1024];
			int read;
			while ((read = in.read(buffer)) != -1) {
				out.write(buffer, 0, read);
			}
		}
	}

	/**
	 * Creates {@code dir} (and missing parents) if it does not exist yet.
	 *
	 * @param dir directory to create; {@code null} means "no parent" and is ignored
	 * @throws IOException if the directory does not exist and cannot be created
	 */
	private static void ensureDirectory(File dir) throws IOException {
		if (dir != null && !dir.mkdirs() && !dir.isDirectory()) {
			throw new IOException("Cannot create directory " + dir);
		}
	}
}

三、效果如下



jsoup爬蟲demo:https://download.csdn.net/download/qq_15260315/10524321