Java 獲得網頁原始碼和模擬瀏覽器請求(個人總結)
阿新 • • 發佈:2019-02-09
這裡總結一下我所知道的幾種用 Java 獲取網頁原始碼的方式。
1.GetSourceCode.java
package kalision;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

/**
 * Fetches a web page over HTTP and prints its source to standard output,
 * one line at a time.
 */
public class GetSourceCode {

    /**
     * Opens an HTTP connection to the hard-coded URL, reads the response body
     * line by line and echoes each line to {@code System.out}.
     *
     * @param args unused
     * @throws IOException if the connection cannot be opened or the body cannot be read
     */
    public static void main(String[] args) throws IOException {
        // The page whose source is fetched.
        URL myurl = new URL("http://www.baidu.com");
        HttpURLConnection huc = (HttpURLConnection) myurl.openConnection();
        // Decode explicitly as UTF-8 instead of the platform default charset, which
        // varies between machines and is a common cause of garbled output.
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(huc.getInputStream(), "UTF-8"))) {
            String line;
            while ((line = in.readLine()) != null) {
                System.out.println(line);
            }
        } finally {
            // Release the underlying connection even when reading fails.
            huc.disconnect();
        }
    }
}
或者
2.test1.java
package kalision;

import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.URL;
import java.net.URLConnection;

/**
 * Downloads a page with a GET request, prints its content and also saves a
 * copy to {@code ../index.html}.
 */
public class test1 {

    /**
     * Fetches the hard-coded URL (query parameters are part of the URL, i.e. a
     * GET request) and prints the collected content to {@code System.out}.
     * Any failure is reported with a stack trace.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            URL url = new URL("http://train.qunar.com/stationToStation.htm?fromStation=%E6%B5%8E%E5%8D%97&toStation=%E7%83%9F%E5%8F%B0&date=2012-01-08");
            // Read from the connection we actually opened. No request body is
            // written, so the connection is not switched into output mode.
            URLConnection conn = url.openConnection();
            String content = pipe(conn.getInputStream(), "utf-8");
            System.out.println(content);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads {@code in} line by line using {@code charset}, writes every
     * non-empty line to {@code ../index.html} (encoded as utf-8) and returns
     * the non-empty lines concatenated without any line separators.
     * The input stream is always closed before this method returns or throws.
     *
     * @param in      stream to read; closed by this method
     * @param charset name of the charset used to decode {@code in};
     *                {@code null} or empty means {@code "utf-8"}
     * @return all non-empty lines of the input joined together without separators
     * @throws IOException if reading the stream or opening the output file fails
     */
    static String pipe(InputStream in, String charset) throws IOException {
        if (charset == null || "".equals(charset)) {
            charset = "utf-8";
        }
        StringBuilder s = new StringBuilder();
        // Every resource is declared separately so each one is closed even when
        // constructing a later one (or the read loop) throws.
        try (InputStream source = in;
             BufferedReader bReader = new BufferedReader(new InputStreamReader(source, charset));
             FileOutputStream fo = new FileOutputStream("../index.html");
             PrintWriter pw = new PrintWriter(new OutputStreamWriter(fo, "utf-8"))) {
            String rLine;
            while ((rLine = bReader.readLine()) != null) {
                // Blank lines are dropped from both the file and the returned text.
                if (rLine.length() > 0) {
                    s.append(rLine);
                    pw.println(rLine);
                }
            }
        }
        return s.toString();
    }
}
如果把得到的原始碼儲存為檔案後開啟時出現亂碼,那是編碼問題所致,
可以嘗試將原始檔頭部宣告的編碼改為 GBK 等。
以上兩種方式都可以得到頁面的原始碼。
對於有請求引數的頁面如:
test1類中的url,它是一個請求連線,帶有引數,以get方式提交的url
返回的原始碼可能沒有我們想要的資料。
據個人瞭解,這種頁面的資料大多數情況下是放在另一個頁面中,
返回的原始檔會用 js 動態地從那個頁面獲取資料,再載入到此頁面中。
可以用 firebug 等工具抓到那個頁面,
再分析、解析其中需要的動態資料。
當然,個人感覺這樣的做法並不推薦,也不易實現。
對於上面講到的 get 方式提交,可以直接在 url 後邊新增引數。下面是以 post 方式提交資料並發出請求的範例:
1.Test.java
import java.util.Properties;
/**
 * Demo driver that issues HTTP POST requests through {@code HtmlPost} and
 * prints the responses to standard error.
 */
public class Test {

    /**
     * Posts a raw byte payload to the station-to-station query URL and prints
     * the response bytes, decoded as utf-8, to {@code System.err}.
     *
     * @throws Exception whatever {@code HtmlPost.requestPost} or the charset
     *                   conversions throw
     */
    public static void testRequestPostStringByteArray() throws Exception {
        Properties requestProperties = new Properties();
        // Browser-like User-Agent header.
        requestProperties.put(
                "User-Agent",
                "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; TencentTraveler ; .NET CLR 1.1.4322)");
        // NOTE(review): requestProperties is never handed to the call below, so this
        // User-Agent is not part of the request as written — confirm whether HtmlPost
        // offers an overload that accepts request headers.
        byte[] b = HtmlPost.requestPost(
                "http://train.qunar.com/stationToStation.htm?fromStation=%E5%8C%97%E4%BA%AC&toStation=%E4%B8%8A%E6%B5%B7&date=2012-01-01",
                // Encode with the same charset used to decode the response, rather
                // than the platform default.
                "XML".getBytes("utf-8"));
        System.err.println(new String(b, "utf-8"));
    }

    /**
     * Posts a set of form fields to the 12306 ticket query page via
     * {@code HtmlPost.requestPostForm(String, Properties)} and prints the
     * response bytes, decoded as utf-8, to {@code System.err}.
     *
     * @throws Exception whatever {@code HtmlPost.requestPostForm} or the
     *                   charset conversion throws
     */
    public static void testRequestPostForm() throws Exception {
        Properties formProperties = new Properties();
        formProperties.put("ictN", "5924");
        formProperties.put("fdl", "");
        formProperties.put("lx", "00");
        formProperties.put("nyear3", "2011");
        formProperties.put("nyear3_new_value", "true");
        formProperties.put("nmonth3", "12");
        formProperties.put("nmonth3_new_value", "true");
        formProperties.put("nday3", "27");
        formProperties.put("nday3_new_value", "false");
        formProperties.put("startStation_ticketLeft", "6d4e53e80482a0b7");
        formProperties.put("startStation_ticketLeft_new_value", "true");
        formProperties.put("arriveStation_ticketLeft", "53174e1300e781a2");
        formProperties.put("arriveStation_ticketLeft_new_value", "true");
        formProperties.put("trainCode", "");
        formProperties.put("trainCode_new_value", "true");
        formProperties.put("rFlag", "1");
        formProperties.put("name_ckball", "value_ckball");
        formProperties.put("tFlagDC", "DC");
        formProperties.put("tFlagZ", "Z");
        formProperties.put("tFlagT", "T");
        formProperties.put("tFlagK", "K");
        formProperties.put("tFlagPK", "PK");
        formProperties.put("tFlagPKE", "PKE");
        formProperties.put("tFlagLK", "LK");
        formProperties.put("randCode", "BYHJ");
        byte[] b = HtmlPost.requestPostForm(
                "http://dynamic.12306.cn/TrainQuery/iframeLeftTicketByStation.jsp",
                formProperties);
        // Alternative endpoint kept for manual experiments:
        // byte[] b = HtmlPost.requestPostForm(
        //         "http://train.qunar.com/stationToStation.htm?fromStation=%E5%8C%97%E4%BA%AC&toStation=%E5%B9%BF%E5%B7%9E&date=2011-12-31",
        //         formProperties);
        System.err.println(new String(b, "utf-8"));
    }

    /**
     * Runs the form-post demo and prints a stack trace if it fails.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            testRequestPostForm();
            // testRequestPostStringByteArray();
        } catch (Exception e) {
            // Demo entry point: report the failure and exit normally.
            e.printStackTrace();
        }
    }
}