A simple Java crawler implementation
I started dabbling in crawlers around the middle of last year, but never went beyond the basics. Along the way I wrote a Zhihu crawler, a targeted scraper for tech news sites, and a crawler that walks the open web.
Today I want to share the third one: its design, an early take on the implementation, and the code.
In theory a crawler is simple: it follows the hyperlinks on each page to reach and fetch the next one, which is where the "web" in World Wide Web comes from.
I will lay out the approach and the reasoning step by step, so it is clear what each stage does and how to do it.
A crawler can be split into six parts:
1. Downloader — the foundation of the crawler
2. Link parser — extracts the hyperlinks from a document
3. Link queues — manage the URLs (two queues: one for URLs already crawled, one for URLs waiting to be crawled, which also handles de-duplication)
4. Page parser — strips the useful information out of a page
5. Storage — saves the page information (for easy viewing this example writes an HTML file, but the data could just as well be persisted elsewhere)
6. Task dispatcher — coordinates the modules above
1. For the downloader I chose Apache HttpClient (other libraries work just as well; pick whichever you prefer).
- package com.search.sprider;
- import java.io.IOException;
- import org.apache.http.HttpEntity;
- import org.apache.http.HttpStatus;
- import org.apache.http.ParseException;
- import org.apache.http.client.ClientProtocolException;
- import org.apache.http.client.config.RequestConfig;
- import org.apache.http.client.methods.CloseableHttpResponse;
- import org.apache.http.client.methods.HttpGet;
- import org.apache.http.impl.client.CloseableHttpClient;
- import org.apache.http.impl.client.HttpClients;
- import org.apache.http.util.EntityUtils;
- /**
- * @see Fetches the content of a web page
- * @author zhuGe
- *
- */
- public class Sprider {
- public static String get(String url) {
- CloseableHttpClient httpClient = HttpClients.createDefault();
- // create the HttpGet request
- HttpGet httpGet;
- try {
- httpGet = new HttpGet(url);
- } catch (Exception e1) {
- return null;
- }
- // set the request headers
- httpHeader(httpGet);
- // set the timeouts
- RequestConfig requestConfig = RequestConfig.custom().setSocketTimeout(2000).setConnectTimeout(2000).build();// connect and socket timeouts in milliseconds
- httpGet.setConfig(requestConfig);
- String download = null;
- try {
- // execute the GET request
- CloseableHttpResponse response = httpClient.execute(httpGet);
- // get the response entity
- HttpEntity entity = response.getEntity();
- //System.out.println(httpGet.getURI());
- //// print the response status
- //System.out.println(response.getStatusLine());
- //System.out.println("--------------------------------------");
- /**
- * read the page body when the request succeeded
- */
- if(entity != null){
- if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
- download = EntityUtils.toString(entity);
- }
- }
- // if (entity != null) {
- // // print the length of the response content
- // System.out.println("Response content length: " +
- // entity.getContentLength());
- // print the response content
- // System.out.println(download);
- } catch (ClientProtocolException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- return null;
- } catch (ParseException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- return null;
- } catch (IOException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- return null;
- }finally {
- // close the connection and release resources
- try {
- httpClient.close();
- } catch (IOException e) {
- e.printStackTrace();
- return null;
- }
- }
- return download;
- }
- // set the request headers
- public static void httpHeader(HttpGet httpGet){
- httpGet.setHeader("Accept", "Accept text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
- httpGet.setHeader("Accept-Charset", "GB2312,utf-8;q=0.7,*;q=0.7");
- httpGet.setHeader("Accept-Encoding", "gzip, deflate");
- httpGet.setHeader("Accept-Language", "zh-cn,zh;q=0.5");
- httpGet.setHeader("Connection", "keep-alive");
- //httpGet.setHeader("Cookie", "__utma=226521935.73826752.1323672782.1325068020.1328770420.6;");
- //httpGet.setHeader("Host", "www.cnblogs.com");
- httpGet.setHeader("refer",
- "http://www.baidu.com/s?tn=monline_5_dg&bs=httpclient4+MultiThreadedHttpConnectionManager");
- httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; rv:6.0.2) Gecko/20100101 Firefox/6.0.2");
- //System.out.println("Accept-Charset: " + httpGet.getFirstHeader("Accept-Charset"));
- }
- }
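A quick usage sketch of the downloader on its own; the URL is only an example and any reachable page would do:
- String html = Sprider.get("http://www.ifanr.com");
- if (html != null) {
- System.out.println("downloaded " + html.length() + " characters");
- }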
2. For the link parser I chose jsoup combined with a regular expression (the DOM tree makes the links easier to get at; using only a regex or only jsoup would also work, the mix was an early coding slip that a later version should clean up).
- package com.search.split;
- import java.util.HashSet;
- import java.util.Set;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
- import org.jsoup.nodes.Document;
- import org.jsoup.nodes.Element;
- import org.jsoup.select.Elements;
- /**
- *
- * @author zhuGe
- * @see Link extractor
- */
- public class HrefOfPage {
- /**
- *
- * @see Collects every link that matches the filter
- * @param doc the parsed document
- * @return the href values of all <a> links that start with http://
- *
- */
- public static Set<String> printHref(Document doc){
- Set<String> aHref = new HashSet<String>();
- // get all <a> elements
- Elements aS = doc.getElementsByTag("a");
- for (Element element : aS) {
- // regex match
- // keep only href values that satisfy the pattern
- String href = (element.attr("href"));
- String regex ="(http://.+)";
- Pattern p = Pattern.compile(regex);
- Matcher m = p.matcher(href);
- // collect every matching link
- while(m.find()){
- String a = m.group(0);
- aHref.add(a);
- }
- }
- //System.out.println("number of links on the page: "+aHref.size());
- return aHref;
- }
- }
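Putting the first two parts together, a small sketch of how a downloaded page can be handed to the link extractor (assuming org.jsoup.Jsoup and the classes above are on the classpath; the URL is again just an example):
- String html = Sprider.get("http://www.ifanr.com");
- if (html != null) {
- Document doc = Jsoup.parse(html);
- Set<String> links = HrefOfPage.printHref(doc);
- System.out.println("found " + links.size() + " links");
- }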
3. Link queues. For the to-be-crawled queue I chose a LinkedList (it works naturally as a queue and is easy to manage).
- package com.search.url;
- import java.util.LinkedList;
- public class UrlQueue {
- /** queue of hyperlinks waiting to be crawled */
- public static LinkedList<String> urlQueue = new LinkedList<String>();
- /** maximum number of hyperlinks the queue should hold */
- public static final int MAX_SIZE = 10000;
- public synchronized static void addElem(String url)
- {
- urlQueue.add(url);
- }
- public synchronized static String outElem()
- {
- String outUrl = urlQueue.removeFirst();
- // drop a duplicate of the URL we just dequeued, if one is still queued
- if(urlQueue.contains(outUrl)){
- urlQueue.remove(outUrl);
- System.out.println("duplicate removed from queue: "+outUrl);
- }
- return outUrl;
- }
- public synchronized static boolean isEmpty()
- {
- return urlQueue.isEmpty();
- }
- }
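Note that MAX_SIZE is declared but never used above. As a sketch of one possible improvement (not part of the original code), addElem could drop new URLs once the queue has reached that limit:
- public synchronized static void addElem(String url)
- {
- // only enqueue while the queue is below its configured limit
- if (urlQueue.size() < MAX_SIZE) {
- urlQueue.add(url);
- }
- }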
3. Link queues (continued). For the already-crawled queue I chose a Set, which handles de-duplication for free.
- package com.search.url;
- import java.util.HashSet;
- /**
- * queue of URLs that have already been visited
- * @author zhuGe
- *
- */
- public class VisitedUrlQueue
- {
- public static HashSet<String> visitedUrlQueue = new HashSet<String>();
- public synchronized static void addElem(String url)
- {
- visitedUrlQueue.add(url);
- }
- public synchronized static boolean isContains(String url)
- {
- return visitedUrlQueue.contains(url);
- }
- public synchronized static int size()
- {
- return visitedUrlQueue.size();
- }
- }
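The same behaviour could also be obtained from the standard library, for example with Collections.synchronizedSet(new HashSet<String>()), but the explicit synchronized wrapper keeps the intent easy to see.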
4. The page parser also uses jsoup (parts 2 and 4 are kept separate to make later maintenance easier; here it only extracts the page title, but it can be customised).
- package com.search.split;
- import org.jsoup.nodes.Document;
- import org.jsoup.select.Elements;
- public class PageTitle {
- public static String printTitle(Document doc){
- Elements title = doc.getElementsByTag("title");
- return title.text();
- }
- }
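For reference, jsoup's Document also offers doc.title() as a convenience method that returns the same text; this class mainly exists to keep the page-analysis step in its own module.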
5. The storage step writes the data out through an output stream, producing an HTML page. 6. The task dispatcher uses multiple threads to improve throughput (it also applies the depth filter so the crawl does not go too deep).
- package com.search.tread;
- import java.io.BufferedWriter;
- import java.io.FileWriter;
- import java.io.IOException;
- import java.util.Set;
- import org.jsoup.Jsoup;
- import org.jsoup.nodes.Document;
- import com.search.split.HrefOfPage;
- import com.search.split.PageTitle;
- import com.search.sprider.Sprider;
- import com.search.url.UrlQueue;
- import com.search.url.VisitedUrlQueue;
- import com.search.util.Depth;
- /**
- * @author zhuGe
- * @date 2016-01-17
- */
- public class UrlTread implements Runnable{
- @Override
- public void run() {
- while(!UrlQueue.isEmpty()){
- String url = UrlQueue.outElem();
- System.out.println("移除"+url);
- String context = null;
- if(!VisitedUrlQueue.isContains(url)){
- context = Sprider.get(url);
- }
- if(context!=null){
- // parse the fetched page and harvest its links
- addHref(context,url);
- }
- VisitedUrlQueue.addElem(url);
- }
- }
- /**
- * @see Extracts the links from a page and writes out its title
- * @param context
- * @param url
- */
- public void addHref(String context,String url){
- Document doc = Jsoup.parse(context);
- // collect all the links on the page
- Set<String> hrefSet = HrefOfPage.printHref(doc);
- // get the page title
- String title = PageTitle.printTitle(doc);
- System.out.println(Thread.currentThread().getName());
- String html =("<li><a href='"+url+"'>"+title+"</a></li>\n");
- // append the entry to the output file
- outFile(html);
- System.out.println(html);
- // apply the depth filter
- if(hrefSet!=null){
- hrefSet = Depth.depth(hrefSet, 1);
- }
- // add new links to the to-be-crawled queue
- for (String string : hrefSet) {
- if(!VisitedUrlQueue.isContains(string)){// skip URLs that have already been visited
- System.out.println("queued "+string);
- UrlQueue.addElem(string);
- }else{
- System.out.println("duplicate "+string);
- }
- }
- }
- public void outFile(String html){
- // try-with-resources closes the writer and flushes the buffer
- try (BufferedWriter out = new BufferedWriter(new FileWriter("d://test.html",true))) {
- out.write(html);
- } catch (IOException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- }
- }
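Because every worker thread appends to the same d://test.html, concurrent writes can still interleave. A minimal sketch of one way to serialise them (a suggested adjustment, not part of the original code) is to synchronise on a shared lock:
- private static final Object FILE_LOCK = new Object();
- public void outFile(String html){
- synchronized (FILE_LOCK) {
- try (BufferedWriter out = new BufferedWriter(new FileWriter("d://test.html", true))) {
- out.write(html);
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }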
Other extensions
The depth controller:
- package com.search.util;
- import java.util.HashSet;
- import java.util.Set;
- /**
- * @see Filters links by their depth
- * @author zhuGe
- *
- */
- public class Depth {
- /**
- *
- * @param hrefSet the links whose depth should be checked
- * @param depth the maximum depth a link may have and still be kept
- */
- public static Set<String> depth(Set<String> hrefSet,int depth){
- Set<String> deptahHrefSet = new HashSet<String>();
- String[] str = null;
- for (String href : hrefSet) {
- str = href.split("/");
- // link depth: the bare host counts as 1, each path segment adds 1
- int idepth = str==null?0:str.length-2;
- //
- //System.out.println(href+" [depth: "+idepth+"]");
- if(idepth<=depth){
- // strip a trailing slash
- if(href.lastIndexOf("/")==href.length()-1){
- deptahHrefSet.add(href.substring(0, href.length()-1));
- }else{
- deptahHrefSet.add(href);
- }
- }
- }
- return deptahHrefSet;
- }
- }
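As a concrete example of how the depth is computed: "http://www.ifanr.com".split("/") yields ["http:", "", "www.ifanr.com"], so idepth is 3 - 2 = 1 and the link passes a depth-1 filter, while "http://www.ifanr.com/category/app" splits into five parts, giving a depth of 3, and is discarded.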
The entry point (a sleep is added before starting each thread so that threads launched later do not find the queue empty before enough links have been collected):
- package com.search.control;
- import com.search.tread.UrlTread;
- import com.search.url.UrlQueue;
- public class controlCentre {
- public static void main(String[] args) {
- UrlQueue.addElem("http://www.ifanr.com");
- UrlQueue.addElem("http://www.leiphone.com");
- UrlQueue.addElem("http://www.huxiu.com");
- UrlTread[] t = new UrlTread[8];
- for(int i=0;i<t.length;i++){
- t[i] = new UrlTread();
- try {
- Thread.sleep(2000);
- } catch (InterruptedException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- new Thread(t[i],"spider-"+i).start();
- }
- //
- }
- }