Java獲取網頁下超連結並儲存
阿新 • • 發佈:2018-11-02
把本程式改造為:輸入一個起始URL及其引數之後,即可下載該URL及其引數所指定的WEB頁面,以及該頁面中HTML超連結所指向的所有WEB頁面(只需下載一級)。
主要需要利用Pattern類方法和正則表示式來獲取超連結
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Fetches one web page, extracts the hyperlinks found in its {@code <a href=...>} tags,
 * and can download every linked page (one level deep) into a local directory.
 */
class HtmlParser1 {
    /** Charset used to decode the page when the server declares none (or an unsupported one). */
    private static final String DEFAULT_CHARSET = "gb2312";

    /** Directory, relative to the working directory, that receives the downloaded pages. */
    private static final String OUTPUT_DIR = "wangye";

    /** Matches a tag that begins with {@code <a href=} and captures the rest of the tag. */
    private static final Pattern ANCHOR = Pattern.compile("<a href=(.*?)>");

    /** Captures the first double-quoted value inside an anchor tag. */
    private static final Pattern QUOTED = Pattern.compile("\"(.*?)\"");

    /** Captures the charset name from a Content-Type value such as {@code text/html; charset=UTF-8}. */
    private static final Pattern CHARSET =
            Pattern.compile("charset=\\s*\"?([^\\s;\"]+)", Pattern.CASE_INSENSITIVE);

    /** Address of the start page. */
    String htmlUrl;

    /** Links found by the most recent call to {@link #parser()}. */
    ArrayList<String> hrefList = new ArrayList<>();

    /** Charset declared by the start page's Content-Type, or {@code null} if none was declared. */
    String charSet;

    /**
     * @param htmlUrl address of the start page whose links are to be collected
     */
    public HtmlParser1(String htmlUrl) {
        this.htmlUrl = htmlUrl;
    }

    /**
     * Downloads the start page and collects the links it contains.
     *
     * <p>The page is scanned line by line. For every {@code <a href=...>} tag the first
     * double-quoted value is taken as the link, and it is kept only if it contains
     * {@code "http"}. Tags without a quoted value are skipped. Each call starts from an
     * empty list, so calling this method repeatedly does not accumulate duplicates.
     *
     * @return the links found on the page, in document order (also stored in {@code hrefList})
     * @throws IOException if the URL is malformed or the page cannot be read
     */
    public ArrayList<String> parser() throws IOException {
        // A plain read needs neither doOutput nor an HTTP-specific connection type.
        URLConnection connection = new URL(htmlUrl).openConnection();
        charSet = getCharset(connection.getContentType());

        ArrayList<String> links = new ArrayList<>();
        try (InputStream in = connection.getInputStream();
             BufferedReader reader = new BufferedReader(openReader(in, charSet))) {
            String line;
            while ((line = reader.readLine()) != null) {
                collectLinks(line, links);
            }
        }
        hrefList = links;
        return links;
    }

    /**
     * Downloads every page linked from the start page into the {@code wangye} directory.
     *
     * <p>Files are named {@code 0.html}, {@code 1.html}, ... following the order of the
     * links returned by {@link #parser()}. Each page is copied byte for byte, so its
     * original encoding and line breaks are preserved. The output directory is created
     * when it does not exist yet.
     *
     * @throws IOException if the start page or any linked page cannot be fetched, or the
     *         output directory or a file cannot be created or written
     */
    public void getURL() throws IOException {
        ArrayList<String> urlList = parser();

        File outputDir = new File(OUTPUT_DIR);
        if (!outputDir.isDirectory() && !outputDir.mkdirs()) {
            throw new IOException("Cannot create output directory: " + outputDir.getAbsolutePath());
        }
        for (int i = 0; i < urlList.size(); i++) {
            download(new URL(urlList.get(i)), new File(outputDir, i + ".html"));
        }
    }

    /** Appends to {@code links} every accepted link found in one line of HTML. */
    private static void collectLinks(String line, ArrayList<String> links) {
        Matcher anchor = ANCHOR.matcher(line);
        while (anchor.find()) {
            Matcher quoted = QUOTED.matcher(anchor.group(1));
            if (!quoted.find()) {
                continue; // no quoted value in this tag: nothing to extract
            }
            String href = quoted.group(1);
            // Substring test kept from the original filter: a value containing "http" is treated as a URL.
            if (href.contains("http")) {
                links.add(href);
            }
        }
    }

    /** Wraps {@code in} in a reader for the declared charset, falling back to the default one. */
    private static Reader openReader(InputStream in, String declaredCharset) throws IOException {
        if (declaredCharset != null) {
            try {
                return new InputStreamReader(in, declaredCharset);
            } catch (UnsupportedEncodingException ignored) {
                // Unknown or malformed charset name: decode with the default charset instead.
            }
        }
        return new InputStreamReader(in, DEFAULT_CHARSET);
    }

    /** Copies the content served at {@code url} into {@code dest}, closing both streams. */
    private static void download(URL url, File dest) throws IOException {
        try (InputStream in = url.openStream();
             OutputStream out = new FileOutputStream(dest)) {
            byte[] buffer = new byte[8192];
            int count;
            while ((count = in.read(buffer)) != -1) {
                out.write(buffer, 0, count);
            }
        }
    }

    /**
     * Extracts the charset name from a Content-Type value.
     *
     * @param str the Content-Type value; may be {@code null} when the server sent none
     * @return the declared charset name, or {@code null} if there is none
     */
    private String getCharset(String str) {
        if (str == null) {
            return null;
        }
        Matcher matcher = CHARSET.matcher(str);
        return matcher.find() ? matcher.group(1) : null;
    }
}
/** Command-line entry point: lists the links on the start page, then downloads them. */
public class sp312 {
    /**
     * Prints every link found on the start page, one per line, and then saves the
     * linked pages to disk.
     *
     * @param arg command-line arguments (not used)
     * @throws IOException if a page cannot be fetched or a file cannot be written
     */
    public static void main(String[] arg) throws IOException {
        HtmlParser1 crawler = new HtmlParser1("https://news.163.com/");
        for (String link : crawler.parser()) {
            System.out.println(link);
        }
        crawler.getURL();
    }
}
結果:
擷取的超連結:
獲取的超連結的HTML網頁: