Java爬蟲爬取網易汽車車型庫
阿新 • • 發佈:2018-12-30
最近由於工作需要,寫了一個小的爬蟲,主要用於爬取網易汽車車型庫(http://product.auto.163.com/)上的不同品牌/車標(共175個車標)下不同車系(共1650個系列)的圖片(各八張)。
程式碼如下:
共CarBrand.java,CarCrawer.java,CarCrawerDemo.java三個檔案。
實體
CarBrand.java
package com.mingo.crawer;
import java.util.ArrayList;
/**
 * Mutable data holder passed between the crawler stages.
 *
 * <p>Field prefixes follow the crawler's own naming: {@code pp} values are
 * filled from the brand links, {@code cx} values from the series links, and
 * {@code cxTp} values from the per-series photo links. Every field starts out
 * {@code null}; each crawler stage only sets the fields it knows about.
 */
public class CarBrand {
    /** Brand name as extracted from the brand index page. */
    private String ppName;
    /** Absolute URL of the brand page. */
    private String ppUrl;
    /** NOTE(review): not referenced by the crawler code shown here; presumably child brands. */
    private ArrayList<CarBrand> ppList;
    /** Series name as extracted from a brand page. */
    private String cxName;
    /** Absolute URL of the series photo page. */
    private String cxUrl;
    /** NOTE(review): not referenced by the crawler code shown here; presumably child series. */
    private ArrayList<CarBrand> cxList;
    /** Label of the photo view (one of the eight fixed view names). */
    private String cxTpName;
    /** URL of the photo-view page for that view. */
    private String cxTpUrl;
    /** NOTE(review): not referenced by the crawler code shown here; presumably child photos. */
    private ArrayList<CarBrand> cxTpList;
    /** NOTE(review): not set by the crawler code shown here; presumably the local image file name. */
    private String tpName;
    /** NOTE(review): not set by the crawler code shown here; presumably the full-size image URL. */
    private String tpNameUrl;

    public String getPpName() {
        return ppName;
    }

    public void setPpName(String ppName) {
        this.ppName = ppName;
    }

    public String getPpUrl() {
        return ppUrl;
    }

    public void setPpUrl(String ppUrl) {
        this.ppUrl = ppUrl;
    }

    public ArrayList<CarBrand> getPpList() {
        return ppList;
    }

    public void setPpList(ArrayList<CarBrand> ppList) {
        this.ppList = ppList;
    }

    public String getCxName() {
        return cxName;
    }

    public void setCxName(String cxName) {
        this.cxName = cxName;
    }

    public String getCxUrl() {
        return cxUrl;
    }

    public void setCxUrl(String cxUrl) {
        this.cxUrl = cxUrl;
    }

    public ArrayList<CarBrand> getCxList() {
        return cxList;
    }

    public void setCxList(ArrayList<CarBrand> cxList) {
        this.cxList = cxList;
    }

    public String getCxTpName() {
        return cxTpName;
    }

    public void setCxTpName(String cxTpName) {
        this.cxTpName = cxTpName;
    }

    public String getCxTpUrl() {
        return cxTpUrl;
    }

    public void setCxTpUrl(String cxTpUrl) {
        this.cxTpUrl = cxTpUrl;
    }

    public ArrayList<CarBrand> getCxTpList() {
        return cxTpList;
    }

    public void setCxTpList(ArrayList<CarBrand> cxTpList) {
        this.cxTpList = cxTpList;
    }

    public String getTpName() {
        return tpName;
    }

    public void setTpName(String tpName) {
        this.tpName = tpName;
    }

    public String getTpNameUrl() {
        return tpNameUrl;
    }

    public void setTpNameUrl(String tpNameUrl) {
        this.tpNameUrl = tpNameUrl;
    }
}
具體實現
CarCrawer.java
package com.mingo.crawer;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.RandomAccessFile;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Static helpers that scrape brand, series and photo links out of the
 * product.auto.163.com pages with regular expressions, plus small utilities
 * for downloading a file and appending to a text log.
 *
 * <p>All page parsing relies on {@link #SendGet(String)} returning the page
 * as a single string with the line breaks removed.
 */
public class CarCrawer {
    /** Site root that is prepended to the relative links found in the pages. */
    public static String carUrl = "http://product.auto.163.com";

    /** Connect timeout shared by page fetches and downloads (5 s). */
    private static final int CONNECT_TIMEOUT_MS = 5 * 1000;
    /** Maximum time a single blocking read may stall before the request is abandoned. */
    private static final int READ_TIMEOUT_MS = 30 * 1000;

    /** Marker that opens one brand group inside a brand page. */
    private static final String GROUP_START = "class=\"group\">";
    /** Marker that follows the end of a brand group. */
    private static final String GROUP_END = "<div class=\"gbox gbox2\" >";

    // Patterns are compiled once; the capture group is the part callers want.
    private static final Pattern BRAND_NAME = Pattern.compile("title=\"進入(.{1,20})品牌頻道");
    private static final Pattern BRAND_URL = Pattern.compile("<a href='(/brand/[a-z]/.{1,20})' title");
    private static final Pattern GROUP_TITLE = Pattern.compile("頻道\">進入(.{1,20})品牌頻道</a>]</span>");
    private static final Pattern SERIES_NAME = Pattern.compile("\"檢視(.{1,20})圖片\">");
    private static final Pattern SERIES_URL = Pattern.compile("(/series/photo/.{10,20})\"");
    private static final Pattern PHOTO_PAGE_URL =
            Pattern.compile("http://product.auto.163.com/picture/photoview.{30,40}.html");
    private static final Pattern BIG_PICTURE =
            Pattern.compile("<img class=\"main_photo hidden\" data-src=\"(.{60,70}.jpg)");

    /** View labels assigned, in order, to the first eight photo links of a series page. */
    private static final String[] VIEW_NAMES = {
        "左前", "正前", "正側", "左後", "正後", "車頂", "前大燈區域性", "後大燈區域性"
    };

    /**
     * Fetches a page and returns its text decoded as GB2312, with all lines
     * concatenated (line terminators are dropped).
     *
     * <p>Failures are reported on stdout and do not propagate: whatever was read
     * before the failure (possibly the empty string) is returned.
     *
     * @param url address of the page to fetch
     * @return the page text, never {@code null}
     */
    public static String SendGet(String url) {
        // StringBuilder instead of String += : the latter copies the whole page on every line.
        StringBuilder result = new StringBuilder();
        try {
            URLConnection connection = new URL(url).openConnection();
            connection.setConnectTimeout(CONNECT_TIMEOUT_MS);
            connection.setReadTimeout(READ_TIMEOUT_MS);
            connection.connect();
            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), "GB2312"))) {
                String line;
                while ((line = in.readLine()) != null) {
                    result.append(line);
                }
            }
        } catch (Exception e) {
            System.out.println("傳送GET請求出現異常!" + e);
            e.printStackTrace();
        }
        return result.toString();
    }

    /**
     * Downloads a file to the local disk.
     *
     * @param urlString address of the file to download
     * @param filename  local file name
     * @param savePath  directory to store the file in; created if missing
     * @throws Exception on any connection or I/O failure
     */
    public static void download(String urlString, String filename, String savePath) throws Exception {
        URLConnection con = new URL(urlString).openConnection();
        con.setConnectTimeout(CONNECT_TIMEOUT_MS);
        con.setReadTimeout(READ_TIMEOUT_MS);
        // try-with-resources closes both streams even when the copy fails half-way.
        try (InputStream is = con.getInputStream()) {
            File dir = new File(savePath);
            if (!dir.exists()) {
                dir.mkdirs();
            }
            // File(parent, child) uses the platform separator instead of a hard-coded backslash.
            try (OutputStream os = new FileOutputStream(new File(dir, filename))) {
                byte[] buffer = new byte[1024];
                int len;
                while ((len = is.read(buffer)) != -1) {
                    os.write(buffer, 0, len);
                }
            }
        }
    }

    /**
     * Appends text to a file, creating the file if it does not exist.
     *
     * @param content     text to append
     * @param txtfilename path of the target file
     * @throws Exception if the file cannot be opened or written
     */
    public static void writeTxtFile(String content, String txtfilename) throws Exception {
        try (FileWriter writer = new FileWriter(txtfilename, true)) {
            writer.write(content);
        }
    }

    /**
     * Returns the entries of {@code list} with distinct series names, keeping the
     * first occurrence of each name and the original order. {@code null} entries
     * and entries without a series name are dropped.
     *
     * @param list entries to filter
     * @return a new list; the argument is not modified
     */
    public static ArrayList<CarBrand> removeDuplicate(ArrayList<CarBrand> list) {
        ArrayList<CarBrand> unique = new ArrayList<>();
        Set<String> seen = new HashSet<>();
        for (CarBrand car : list) {
            if (car == null) {
                continue;
            }
            String name = car.getCxName();
            // Set.add returns false for a name that was already recorded.
            if (name != null && seen.add(name)) {
                unique.add(car);
            }
        }
        return unique;
    }

    /**
     * Extracts brand names and brand page URLs from a brand index page.
     * Names and URLs are paired by order of appearance.
     *
     * @param url page to parse, e.g. http://product.auto.163.com/brand/a/
     * @return one entry per brand with the brand name and URL set
     */
    public static ArrayList<CarBrand> getPpUrl(String url) throws Exception {
        ArrayList<CarBrand> ppList = new ArrayList<>();
        String content = SendGet(url);
        Matcher matcherName = BRAND_NAME.matcher(content);
        Matcher matcherUrl = BRAND_URL.matcher(content);
        while (matcherName.find() && matcherUrl.find()) {
            CarBrand carBrand = new CarBrand();
            carBrand.setPpName(matcherName.group(1));
            carBrand.setPpUrl(carUrl + matcherUrl.group(1));
            ppList.add(carBrand);
        }
        return ppList;
    }

    /**
     * Extracts the series of every brand group found on a brand page.
     *
     * <p>A group runs from the group-start marker to the next group-end marker.
     * If the final group has no end marker, the rest of the page is treated as
     * that group rather than failing.
     *
     * @param url page to parse, e.g. http://product.auto.163.com/brand/a/
     * @return one entry per series with brand name, series name and series photo URL set;
     *         the brand name is {@code null} when the group carries no title
     */
    public static ArrayList<CarBrand> getCxUrl(String url) throws Exception {
        ArrayList<CarBrand> seriesList = new ArrayList<>();
        String content = SendGet(url);
        int from = 0;
        while (true) {
            int start = content.indexOf(GROUP_START, from);
            if (start < 0) {
                break;
            }
            // Search for the end marker after the start so the slice can never be inverted.
            int end = content.indexOf(GROUP_END, start);
            boolean lastGroup = end < 0;
            String group = lastGroup ? content.substring(start) : content.substring(start, end);

            String brandTitle = null;
            Matcher matcherTitle = GROUP_TITLE.matcher(group);
            if (matcherTitle.find()) {
                brandTitle = matcherTitle.group(1);
            }

            Matcher matcherName = SERIES_NAME.matcher(group);
            Matcher matcherUrl = SERIES_URL.matcher(group);
            while (matcherName.find() && matcherUrl.find()) {
                CarBrand carBrand = new CarBrand();
                carBrand.setPpName(brandTitle);
                carBrand.setCxName(matcherName.group(1));
                carBrand.setCxUrl(carUrl + matcherUrl.group(1));
                seriesList.add(carBrand);
            }

            if (lastGroup) {
                break;
            }
            from = end + GROUP_END.length();
        }
        return seriesList;
    }

    /**
     * Extracts up to eight photo-view page links from a series photo page and
     * labels them, in order of appearance, with the fixed view names.
     *
     * @param url page to parse, e.g. http://product.auto.163.com/series/photo/2350.html#CX001
     * @return one entry per photo link with the view label and photo-view URL set
     */
    public static ArrayList<CarBrand> getCxPic(String url) throws Exception {
        ArrayList<CarBrand> cxPicList = new ArrayList<>();
        String content = SendGet(url);
        Matcher matcher = PHOTO_PAGE_URL.matcher(content);
        for (int i = 0; i < VIEW_NAMES.length && matcher.find(); i++) {
            CarBrand carBrand = new CarBrand();
            carBrand.setCxTpName(VIEW_NAMES[i]);
            carBrand.setCxTpUrl(matcher.group(0));
            cxPicList.add(carBrand);
        }
        return cxPicList;
    }

    /**
     * Extracts the full-size image URL from a photo-view page.
     *
     * @param url photo-view page to parse
     * @return the image URL, or {@code null} when the page contains no matching image tag
     */
    public static String getBigPic(String url) throws Exception {
        String content = SendGet(url);
        Matcher matcher = BIG_PICTURE.matcher(content);
        return matcher.find() ? matcher.group(1) : null;
    }
}
呼叫
CarCrawerDemo.java
package com.mingo.crawer;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Entry point that drives {@link CarCrawer}: collects the brand pages, then
 * the series of each brand, then the photo links of each series, and finally
 * downloads every full-size image while appending a line per image to a log file.
 */
public class CarCrawerDemo {
    public static String carUrl = "http://product.auto.163.com";

    /** Length of "http://product.auto.163.com/brand/a/", the letter-page prefix of a brand URL. */
    private static final int LETTER_PAGE_PREFIX_LENGTH = 36;

    /**
     * Runs the full crawl. Images are stored as
     * {@code <brand>_<series>_<view>.jpg} under the save directory.
     *
     * @param args unused
     * @throws Exception if a page cannot be parsed or the log file cannot be written
     */
    public static void main(String[] args) throws Exception {
        // Directory that receives the images and the log file.
        String savePath = "D:\\CarTp\\";
        String txtfilename = savePath + "output.txt";

        // The log is written before the first download, so the directory must exist up front.
        java.io.File saveDir = new java.io.File(savePath);
        if (!saveDir.exists()) {
            saveDir.mkdirs();
        }

        String url = "http://product.auto.163.com/brand/";
        System.out.println(url);
        ArrayList<CarBrand> pplist = CarCrawer.getPpUrl(url);
        System.out.println(pplist.size());

        // Only the first brand URL seen for each letter-page prefix is fetched.
        ArrayList<CarBrand> cxUrllistNew = new ArrayList<CarBrand>();
        Set<String> ppUrlSet = new HashSet<String>();
        for (CarBrand pp : pplist) {
            String ppUrlStr = pp.getPpUrl().substring(0, LETTER_PAGE_PREFIX_LENGTH);
            if (ppUrlSet.add(ppUrlStr)) {
                cxUrllistNew.addAll(CarCrawer.getCxUrl(pp.getPpUrl()));
            }
        }
        System.out.println(cxUrllistNew.size());

        // getTime() yields a readable timestamp; the Calendar object itself prints a field dump.
        CarCrawer.writeTxtFile("\nCalendar: " + Calendar.getInstance().getTime(), txtfilename);

        for (CarBrand cxUrlNew : cxUrllistNew) {
            ArrayList<CarBrand> cxTplist = CarCrawer.getCxPic(cxUrlNew.getCxUrl());
            for (CarBrand cxTp : cxTplist) {
                String tpName = cxUrlNew.getPpName() + "_" + cxUrlNew.getCxName() + "_"
                        + cxTp.getCxTpName() + ".jpg";
                String tpNameUrl = CarCrawer.getBigPic(cxTp.getCxTpUrl());
                CarCrawer.writeTxtFile("\n" + tpName + " " + tpNameUrl, txtfilename);
                if (tpNameUrl == null) {
                    continue;
                }
                try {
                    CarCrawer.download(tpNameUrl, tpName, savePath);
                } catch (Exception e) {
                    // One unreachable image must not abort the remaining downloads.
                    System.out.println("download failed: " + tpNameUrl + " " + e);
                }
            }
        }
        System.out.println("finished!");
    }
}
下載結果:
改進點:
1 沒有爬取每個車系的年款;
2 庫有點小,車輛主要是小型車;
3 程式碼速度要進一步優化。