利用HttpClient和HtmlParser構造簡單爬蟲
阿新 • 發佈:2019-01-22
/** * 爬蟲主方法入口類 * @author Qing * */ public class Clawler { /** * 用種子url初始化url佇列 * @param seeds */ private void initCrawlerWithSeeds(String[] seeds){ for(int i = 0; i < seeds.length; i ++){ LinkDB.addUnvisitedUrl(seeds[i]); } } public void crawling(String[] seeds){ LinkFilter filter = new LinkFilter(){ //:http://club.xdnice.com/ public boolean accept(String url){ if(url.startsWith("http://club.xdnice.com/")) return true; else return false; } }; //初始化url佇列 initCrawlerWithSeeds(seeds); //visit的url的最大值,並且未訪問的url集不為空 while(!LinkDB.unVisitedUrlIsEmpty() && LinkDB.getVisitedUrlNum() <= 10){ //隊列出隊一個url String visitUrl = LinkDB.unVisitedUrlDeQueue(); if(visitUrl == null){ continue; } //下載網頁 FileDownLoader fdloader = new FileDownLoader(); fdloader.downloadFile(visitUrl); //加入已訪問列表 LinkDB.addVisitedUrl(visitUrl); //提取url Set<String> links = HtmlParserTool.extracLinks(visitUrl,filter); for(String link: links){ LinkDB.addUnvisitedUrl(link); } } }
FileDownLoader是一個利用HttpClient將網頁的位元組下載到本地,負責網頁下載的物件
HtmlParserTool利用HtmlParser對網頁進行過濾,過濾出符合條件的連結/** * 網頁下載類 * @author Qing * */ public class FileDownLoader { /** * 根據url和網頁型別生成需要儲存的網頁中文名,去除url中非檔名字元 * @param url * @param contentType * @return */ public String getFileNameByUrl(String url,String contentType){ url = url.substring(7);//移除http:// if(contentType.indexOf("html") != -1){//html url = url.replaceAll("[\\?/:*|<>\"]","_");//去掉url中非檔名字元生成檔名 return url; } else{ return url.replaceAll("[\\?/:*|<>\"]","_")+"." + contentType.substring(contentType.lastIndexOf("/")+1); } } /** * 儲存網頁位元組陣列到本地檔案 * @param data * @param filePath */ public void saveToLocal(String data,String filePath){ try{ DataOutputStream out = new DataOutputStream(new FileOutputStream(new File(filePath))); out.writeUTF(data);//write in utf-8 out.flush(); out.close(); }catch(Exception e){ e.printStackTrace(); } } /** * 下載url網頁 * @param url * @return */ public String downloadFile(String url){ UrlEncodedFormEntity uefEntity; String filePath = null; CloseableHttpClient httpclient = HttpClients.createDefault(); try{ HttpGet httpget = new HttpGet(url); List<NameValuePair> params = new ArrayList<NameValuePair>(); String str = EntityUtils.toString(new UrlEncodedFormEntity(params, Consts.UTF_8)); httpget.setURI(new URI(httpget.getURI().toString() +"?" + str)); //執行get請求 CloseableHttpResponse response = httpclient.execute(httpget); //獲取響應實體 HttpEntity entity = response.getEntity(); filePath = "temp/"+ getFileNameByUrl(url,response.getFirstHeader("Content-Type").getValue()); saveToLocal(EntityUtils.toString(entity),filePath); }catch(Exception e){ e.printStackTrace(); } return filePath; }
LinkDB負責管理未訪問的連結的集合,和未訪問的連結的佇列/** *分析網頁獲取連結類 * @author Qing * */ public class HtmlParserTool { /** * 獲取一個網站上的連結,filter用來過濾連結 * @param url * @param filter * @return */ public static Set<String> extracLinks(String url,LinkFilter filter){ Set<String> links = new HashSet<String>(); try{ Parser parser = new Parser(url); parser.setEncoding("gb2312"); //過濾<frame>標籤的filter NodeFilter frameFilter = new NodeFilter(){ @Override public boolean accept(Node node) { if(node.getText().startsWith("frame src=")){ return true; } // TODO Auto-generated method stub return false; } }; //OrFilter設定過濾<a><frame>標籤,or關係 OrFilter linkfilter = new OrFilter(new NodeClassFilter(LinkTag.class),frameFilter); NodeList list = parser.extractAllNodesThatMatch(linkfilter); for(int i = 0; i < list.size(); i++){ Node tag = list.elementAt(i); if(tag instanceof LinkTag){ LinkTag link = (LinkTag) tag; String linkUrl = link.getLink(); if(filter.accept(linkUrl)){ links.add(linkUrl); } } else{ String frame = tag.getText(); int start = frame.indexOf("src="); frame = frame.substring(start); int end = frame.indexOf(" "); if(end == -1){ end = frame.indexOf(">"); } frame = frame.substring(5, end -1); if(filter.accept(frame)){ links.add(frame); } } } }catch(Exception e){ e.printStackTrace(); } return links; }
// Crawl bookkeeping: the set of visited URLs and the queue of URLs still
// to be visited.
// NOTE(review): the class body continues beyond this excerpt — the helper
// methods called by Clawler (addUnvisitedUrl, addVisitedUrl, ...) are not
// visible here.
public class LinkDB {
// URLs that have already been downloaded.
private static Set<String> visitedUrl = new HashSet<String>();
// URLs waiting to be visited, dequeued in FIFO order.
private static Queue<String> unVisitedUrl = new ArrayDeque<String>();
LinkFilter是一個Filter介面,宣告了accept(String url)方法,因為NodeFilter只能實現accept(Node node)
/**
 * Filter over string URLs, used to decide whether an extracted link should
 * be crawled. Exists because HtmlParser's NodeFilter can only accept a
 * {@code Node}, not a plain URL string.
 */
public interface LinkFilter {
    /**
     * @param url the candidate link
     * @return true if the link should be kept
     */
    boolean accept(String url);
}