java爬蟲學習1
1 需求:比如要從這樣一個網頁上抓取資料:http://map.baidu.com/detail?qt=ninf&from=housezt&detail=house&uid=5ef5edbdc64c1bb49e9d6899
這個請求最後面的uid其實是百度地圖上查到該點的uid(也就是5ef5edbdc64c1bb49e9d6899),我的資料庫裡面已經獲取了武漢的房地產的uid,現在要通過uid獲取詳細資訊。
先從一個uid著手;資料再多,也只是迴圈重複同樣的抓取步驟。
2 傳送請求到網頁,用到HttpURLConnection類
package connection;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
/**
 * Minimal HTTP helper that downloads the body of a page as text.
 */
public class ConnectionUtil {

    /**
     * Opens an HTTP connection to {@code address} and reads the response body.
     *
     * <p>The body is read line by line and the lines are concatenated
     * <em>without</em> line separators, so the result is a single-line string.
     * Connect and read timeouts are both 5000 ms.
     *
     * <p>Failures are reported with {@code printStackTrace()} and never
     * propagated: a bad address, a non-HTTP URL, or an I/O error yields
     * whatever text was read before the failure (the empty string if nothing
     * was read). The method never returns {@code null}.
     *
     * <p>NOTE(review): the response is decoded with the platform default
     * charset because no charset is passed to {@code InputStreamReader};
     * confirm that this matches the encoding of the pages being fetched.
     *
     * @param address absolute URL of the page to fetch
     * @return the response body with line breaks removed; possibly empty or
     *         partial when the request fails
     */
    public static String Connect(String address) {
        HttpURLConnection conn = null;
        InputStream in = null;
        BufferedReader reader = null;
        // Created before any I/O so that every failure path still has a
        // non-null buffer to return.
        StringBuffer stringBuffer = new StringBuffer();
        try {
            URL url = new URL(address);
            conn = (HttpURLConnection) url.openConnection();
            conn.setConnectTimeout(5000);
            conn.setReadTimeout(5000);
            conn.setDoInput(true);
            conn.connect();
            in = conn.getInputStream();
            reader = new BufferedReader(new InputStreamReader(in));
            String line;
            while ((line = reader.readLine()) != null) {
                stringBuffer.append(line);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Each resource may still be null if the failure happened before
            // it was created, so every step is guarded individually.
            try {
                if (reader != null) {
                    // Closing the reader also closes the wrapped stream.
                    reader.close();
                } else if (in != null) {
                    in.close();
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
            if (conn != null) {
                conn.disconnect();
            }
        }
        return stringBuffer.toString();
    }
}
3 接收伺服器返回的頁面html資料
返回了html字串之後,首先要明確需要抓取的資料是哪些,分析網頁的特點。
比如我現在要抓取的資料有:
圖片url
價格
房屋型別
建築型別
建築年代
容積率
物業費
物業公司
開發商
分析好結構之後就好寫正則表示式了,用來匹配獲取。
4 進行解析
package main;
import connection.ConnectionUtil;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Fetches the map.baidu.com real-estate detail page for a place uid and
 * extracts the listing attributes from the returned HTML with regular
 * expressions.
 *
 * <p>NOTE(review): the field labels below are matched literally against the
 * page text; confirm that they use the same characters as the served HTML.
 */
public class Analyze {

    /** Detail-page address; the place uid is appended to form the request URL. */
    private static final String DETAIL_URL =
            "http://map.baidu.com/detail?qt=ninf&from=housezt&detail=house&uid=";

    /** Literal text every {@link #IMAGE_TAG} match ends with. */
    private static final String IMAGE_TAG_END = "\" />";

    /**
     * The large preview image tag. {@code [^>]*} keeps the match inside a
     * single tag even when the whole document is one line.
     */
    private static final Pattern IMAGE_TAG =
            Pattern.compile("<img class=\"img-large\"[^>]*\" />");

    /** HTML fragment that holds the property attributes. */
    private static final Pattern INFO_FRAGMENT =
            Pattern.compile("fcg\">\\w*.+\\s*<dd>\\w*.+</dd>");

    /** Markup that immediately precedes the price digits. */
    private static final String PRICE_PREFIX = "f24 fcr\">";
    private static final Pattern PRICE = Pattern.compile(PRICE_PREFIX + "[0-9]+");

    /** One or more runs of CJK ideographs, optionally separated by spaces. */
    private static final String CJK_TEXT = "[\\u4e00-\\u9fa5]+( *[\\u4e00-\\u9fa5]+)*";
    private static final String INTEGER = "[0-9]+";
    private static final String DECIMAL = "[0-9]+\\.[0-9]+";

    // Each label doubles as the text searched for in the page and as the
    // prefix of the corresponding output line.
    private static final String HOUSE_TYPE_LABEL = "房屋型別:";
    private static final String BLOCK_TYPE_LABEL = "建築型別:";
    private static final String BLOCK_AGE_LABEL = "建築年代:";
    private static final String FAR_LABEL = "容積率:";
    private static final String FEE_LABEL = "物業費:";
    private static final String MANAGER_LABEL = "物業公司:";
    private static final String DEVELOPER_LABEL = "開發商:";

    private static final Pattern HOUSE_TYPE = field(HOUSE_TYPE_LABEL, CJK_TEXT);
    private static final Pattern BLOCK_TYPE = field(BLOCK_TYPE_LABEL, CJK_TEXT);
    private static final Pattern BLOCK_AGE = field(BLOCK_AGE_LABEL, INTEGER);
    private static final Pattern FAR = field(FAR_LABEL, DECIMAL);
    private static final Pattern FEE = field(FEE_LABEL, DECIMAL);
    private static final Pattern MANAGER = field(MANAGER_LABEL, CJK_TEXT);
    private static final Pattern DEVELOPER = field(DEVELOPER_LABEL, CJK_TEXT);

    /**
     * Downloads the detail page for {@code uid} and returns the extracted
     * attributes as printable text.
     *
     * @param uid place uid, e.g. {@code 5ef5edbdc64c1bb49e9d6899}
     * @return one {@code label:value} line per extracted attribute; empty when
     *         nothing on the page matched
     */
    public String regexMain(String uid) {
        String url = DETAIL_URL + uid;
        String result = ConnectionUtil.Connect(url);
        return getHouseInfo(result);
    }

    /**
     * Extracts the image URLs and the property attributes from the page HTML.
     *
     * <p>Image lines come first. Then, for every attribute fragment found, the
     * price line(s) are followed by one line per attribute; an attribute that
     * is missing from the fragment is emitted with an empty value.
     *
     * @param targetStr page HTML; {@code null} is treated as an empty page
     * @return the extracted attributes, one per line
     */
    private String getHouseInfo(String targetStr) {
        StringBuilder lastInfo = new StringBuilder();
        if (targetStr == null) {
            return lastInfo.toString();
        }

        Matcher imageMatcher = IMAGE_TAG.matcher(targetStr);
        while (imageMatcher.find()) {
            lastInfo.append("圖片url:").append(imageUrl(imageMatcher.group())).append("\n");
        }

        // First isolate the fragment holding the property data, then pull the
        // individual attributes out of it.
        Matcher fragmentMatcher = INFO_FRAGMENT.matcher(targetStr);
        while (fragmentMatcher.find()) {
            String info = fragmentMatcher.group();

            Matcher priceMatcher = PRICE.matcher(info);
            while (priceMatcher.find()) {
                String price = priceMatcher.group().substring(PRICE_PREFIX.length());
                lastInfo.append("價格:").append(price).append("元/平米\n");
            }

            appendField(lastInfo, HOUSE_TYPE_LABEL, HOUSE_TYPE, info, "\n");
            appendField(lastInfo, BLOCK_TYPE_LABEL, BLOCK_TYPE, info, "\n");
            appendField(lastInfo, BLOCK_AGE_LABEL, BLOCK_AGE, info, "\n");
            appendField(lastInfo, FAR_LABEL, FAR, info, "\n");
            appendField(lastInfo, FEE_LABEL, FEE, info, "元/平米/月\n");
            appendField(lastInfo, MANAGER_LABEL, MANAGER, info, "\n");
            // The developer is looked up on its own, so a fragment without a
            // developer yields an empty value rather than repeating the
            // property-management company.
            appendField(lastInfo, DEVELOPER_LABEL, DEVELOPER, info, "\n");
        }
        return lastInfo.toString();
    }

    /** Builds the pattern for a {@code <dt>label</dt><dd>value} pair. */
    private static Pattern field(String label, String valueRegex) {
        return Pattern.compile(label + "</dt>\\s*<dd>" + valueRegex);
    }

    /** Appends {@code label + value + suffix}, where value is "" if the field is absent. */
    private static void appendField(StringBuilder out, String label, Pattern pattern,
            String fragment, String suffix) {
        out.append(label).append(lastValue(pattern, fragment)).append(suffix);
    }

    /** Returns the text after the final '>' of the last match, or "" when nothing matches. */
    private static String lastValue(Pattern pattern, String fragment) {
        String value = "";
        Matcher matcher = pattern.matcher(fragment);
        while (matcher.find()) {
            String hit = matcher.group();
            value = hit.substring(hit.lastIndexOf('>') + 1);
        }
        return value;
    }

    /** Returns the value of the last attribute of a matched image tag, without quotes. */
    private static String imageUrl(String imgTag) {
        int start = imgTag.lastIndexOf("=\"") + 2;
        // Drop the whole closing `" />` so the quote is not left on the URL.
        int end = imgTag.length() - IMAGE_TAG_END.length();
        return start <= end ? imgTag.substring(start, end) : "";
    }
}