利用記憶體資料庫和布隆過濾器寫的網路爬蟲
阿新 • 發佈:2019-02-15
記憶體資料庫用來儲存待訪問url,布隆過濾器用來記錄已訪問的url。先前我們待訪問url是存放在記憶體中,已訪問的url是利用HashSet實現的。
布隆過濾器
package hashfilter;

import java.util.BitSet;

import bdb.CrawlUrl;

/**
 * Minimal Bloom filter over URL strings, used to remember visited URLs.
 * May report false positives (claiming a URL was seen when it was not) but
 * never false negatives; entries cannot be removed once added.
 */
public class SimpleBloomFilter {
    // 2 << 24 = 2^25 bits (~4 MB) backing the filter.
    private static final int DEFAULT_SIZE = 2 << 24;
    // One independent hash function per seed; more seeds lower the
    // false-positive rate at the cost of extra hashing per operation.
    private static final int[] seeds = {7, 11, 13, 31, 37, 61};
    private BitSet bits = new BitSet(DEFAULT_SIZE);
    private SimpleHash[] func = new SimpleHash[seeds.length];

    public SimpleBloomFilter() {
        for (int i = 0; i < func.length; i++) {
            func[i] = new SimpleHash(DEFAULT_SIZE, seeds[i]);
        }
    }

    /** Marks the URL carried by crawlUrl as seen. */
    public void add(CrawlUrl crawlUrl) {
        add(crawlUrl.getOriUrl());
    }

    // Sets one bit per hash function; null values are silently ignored.
    private void add(String value) {
        if (value != null) {
            for (SimpleHash f : func) {
                bits.set(f.hash(value), true);
            }
        }
    }

    /** True if the URL was (probably) added before; a null URL is never contained. */
    public boolean contains(CrawlUrl crawlUrl) {
        return contains(crawlUrl.getOriUrl());
    }

    private boolean contains(String value) {
        if (value == null) {
            return false;
        }
        // Fix: return as soon as one bit is clear instead of continuing to
        // hash after the answer is already known to be false.
        for (SimpleHash f : func) {
            if (!bits.get(f.hash(value))) {
                return false;
            }
        }
        return true;
    }
}
package hashfilter; public class SimpleHash { private int cap; private int seed; public SimpleHash(int cap,int seed) { this.cap=cap; this.seed=seed; } public int hash(String value) { int result=0; int n=value.length(); for(int i=0;i<n;i++) { result=result*seed+value.charAt(i); } return (cap-1)&result; } }
記憶體資料庫
package bdb;

import java.io.Serializable;
import java.util.Date;

import com.sleepycat.je.utilint.Timestamp;

/**
 * Serializable record describing one crawled (or to-be-crawled) URL; stored
 * as the value type in the Berkeley DB frontier.
 * NOTE(review): com.sleepycat.je.utilint.Timestamp is a JE-internal class;
 * java.sql.Timestamp may be the safer choice — confirm against the project.
 */
public class CrawlUrl implements Serializable{
    private static final long serialVersionUID=7931672194843948629L;

    public CrawlUrl(){
    }

    private String oriUrl; // original URL; host part is a domain name
    private String url; // URL whose host part is an IP, to avoid duplicate hosts
    private int urlNo; // URL number
    private int statusCode; // HTTP status code returned when fetching the URL
    private int hitNum; // number of times other articles reference this URL
    private String charSet; // character encoding of the article at this URL
    private String abstractText; // article abstract
    private String author; // author
    private int weight; // article weight (includes guide-word information)
    private String description; // article description
    private int fileSize; // article size
    private Timestamp lastUpdateTime; // last modification time
    private Date timeToLive; // expiry time
    private String title; // article title
    private String type; // article type
    private String[] urlRefrences; // referenced links
    private int layer; // crawl depth: seeds are layer 0, their links layer 1, ...

    public int getLayer() {
        return layer;
    }
    public void setLayer(int layer) {
        this.layer=layer;
    }
    public String getUrl() {
        return url;
    }
    public void setUrl(String url) {
        this.url=url;
    }
    public int getUrlNo() {
        return urlNo;
    }
    public void setUrlNo(int urlNo) {
        this.urlNo = urlNo;
    }
    public int getStatusCode() {
        return statusCode;
    }
    public void setStatusCode(int statusCode) {
        this.statusCode = statusCode;
    }
    public int getHitNum() {
        return hitNum;
    }
    public void setHitNum(int hitNum) {
        this.hitNum = hitNum;
    }
    public String getCharSet() {
        return charSet;
    }
    public void setCharSet(String charSet) {
        this.charSet = charSet;
    }
    public String getAbstractText() {
        return abstractText;
    }
    public void setAbstractText(String abstractText) {
        this.abstractText = abstractText;
    }
    public String getAuthor() {
        return author;
    }
    public void setAuthor(String author) {
        this.author = author;
    }
    public int getWeight() {
        return weight;
    }
    public void setWeight(int weight) {
        this.weight = weight;
    }
    public String getDescription() {
        return description;
    }
    public void setDescription(String description) {
        this.description = description;
    }
    public int getFileSize() {
        return fileSize;
    }
    public void setFileSize(int fileSize) {
        this.fileSize = fileSize;
    }
    public Timestamp getLastUpdateTime() {
        return lastUpdateTime;
    }
    public void setLastUpdateTime(Timestamp lastUpdateTime) {
        this.lastUpdateTime = lastUpdateTime;
    }
    public Date getTimeToLive() {
        return timeToLive;
    }
    public void setTimeToLive(Date timeToLive) {
        this.timeToLive = timeToLive;
    }
    public String getTitle() {
        return title;
    }
    public void setTitle(String title) {
        this.title = title;
    }
    public String getType() {
        return type;
    }
    public void setType(String type) {
        this.type = type;
    }
    public String[] getUrlRefrences() {
        return urlRefrences;
    }
    public void setUrlRefrences(String[] urlRefrences) {
        this.urlRefrences = urlRefrences;
    }
    // final: this is the key field the frontier relies on; not overridable
    public final String getOriUrl() {
        return oriUrl;
    }
    public void setOriUrl(String oriUrl) {
        this.oriUrl = oriUrl;
    }
}
package bdb;

/**
 * Queue of URLs waiting to be crawled (the crawler "frontier").
 */
public interface Frontier {
    /** Removes and returns the next URL to crawl, or null when the frontier is empty. */
    CrawlUrl getNext() throws Exception;

    /** Adds a URL to the frontier; returns true if it was accepted. */
    boolean putUrl(CrawlUrl url) throws Exception;
}
package bdb;
import java.io.File;
import com.sleepycat.bind.serial.StoredClassCatalog;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseConfig;
import com.sleepycat.je.Environment;
import com.sleepycat.je.EnvironmentConfig;
/**
 * Owns the Berkeley DB environment plus the two databases the frontier uses:
 * the class catalog (for serialized-object bindings) and the "URL" database
 * that stores pending URLs. Subclasses define how entries are accessed.
 */
public abstract class AbstractFrontier {
    private Environment env;
    // Name of the database that holds the StoredClassCatalog.
    private static final String CLASS_CATALOG="java_class_catalog";
    // Catalog of class descriptions shared by all SerialBindings.
    protected StoredClassCatalog javaCatalog;
    protected Database catalogdatabase;
    protected Database database;

    // Opens (creating if necessary) a transactional environment rooted at
    // homeDirectory, then the class-catalog database and the "URL" database.
    public AbstractFrontier(String homeDirectory)
    {
        System.out.println("Opening environment in: "+homeDirectory);
        EnvironmentConfig envConfig=new EnvironmentConfig();
        envConfig.setTransactional(true);
        envConfig.setAllowCreate(true);
        env=new Environment(new File(homeDirectory),envConfig);
        DatabaseConfig dbConfig=new DatabaseConfig();
        dbConfig.setAllowCreate(true);
        dbConfig.setTransactional(true);
        catalogdatabase=env.openDatabase(null, CLASS_CATALOG, dbConfig);
        // A single StoredClassCatalog object is normally used along with a set
        // of databases that store serialized objects.
        javaCatalog=new StoredClassCatalog(catalogdatabase);
        DatabaseConfig dbConfig0=new DatabaseConfig();
        dbConfig0.setAllowCreate(true);
        dbConfig0.setTransactional(true);
        // Database keyed by URL; subclasses read and write it via put/get/delete.
        database=env.openDatabase(null,"URL", dbConfig0);
    }

    // Closes resources in reverse order of creation.
    // NOTE(review): catalogdatabase is not closed directly; presumably
    // StoredClassCatalog.close() closes its underlying database — confirm
    // against the JE version in use.
    public void close()
    {
        database.close();
        javaCatalog.close();
        env.close();
    }

    // Primitive storage operations the concrete frontier must supply.
    protected abstract void put(Object key,Object value);
    protected abstract Object get(Object key);
    protected abstract Object delete(Object key);
}
package bdb;
import java.util.Map.Entry;
import java.util.Set;
import com.sleepycat.bind.EntryBinding;
import com.sleepycat.bind.serial.SerialBinding;
import com.sleepycat.collections.StoredMap;
/**
 * Berkeley DB backed frontier: persists pending (to-visit) URLs in the "URL"
 * database opened by AbstractFrontier, viewed through a StoredMap.
 */
public class BDBFrontier extends AbstractFrontier implements Frontier{
    // Map view over the "URL" database: key = String URL, value = CrawlUrl.
    // Raw type kept: the bundled Sleepycat collections API predates generics.
    private StoredMap pendingUrisDB=null;

    public BDBFrontier(String homeDirectory) {
        super(homeDirectory);
        // A DatabaseEntry can be built either from raw bytes via its
        // constructor, or via EntryBinding.objectToEntry(); the serial
        // bindings below do that conversion using the shared class catalog.
        EntryBinding keyBinding=new SerialBinding(javaCatalog, String.class);
        EntryBinding valueBinding=new SerialBinding(javaCatalog,CrawlUrl.class);
        // Creates a map entity view of the Database (last arg: writes allowed).
        pendingUrisDB=new StoredMap(database,keyBinding,valueBinding,true);
    }

    /** Removes and returns an arbitrary pending URL, or null if the frontier is empty. */
    @Override
    public CrawlUrl getNext() throws Exception {
        CrawlUrl result=null;
        if(!pendingUrisDB.isEmpty())
        {
            Entry<String,CrawlUrl> entry=
                    (Entry<String,CrawlUrl>)pendingUrisDB.entrySet().iterator().next();
            result=entry.getValue();
            delete(entry.getKey());
        }
        return result;
    }

    /** Queues a URL, keyed by calculateUrl(oriUrl). Always returns true. */
    @Override
    public boolean putUrl(CrawlUrl url) throws Exception {
        // Fix: route the key through calculateUrl() as the design comment
        // intended (currently the identity mapping, so stored keys are
        // unchanged and behavior is identical).
        put(calculateUrl(url.getOriUrl()),url);
        return true;
    }

    @Override
    protected void put(Object key, Object value) {
        pendingUrisDB.put(key, value);
    }

    @Override
    protected Object get(Object key) {
        return pendingUrisDB.get(key);
    }

    @Override
    protected Object delete(Object key) {
        return pendingUrisDB.remove(key);
    }

    // Derives the database key for a URL; any digest (e.g. MD5) could be
    // substituted here. Currently the identity mapping.
    private String calculateUrl(String url)
    {
        return url;
    }

    /** True if the URL is already queued. */
    public boolean contains(CrawlUrl url)
    {
        return pendingUrisDB.containsKey(calculateUrl(url.getOriUrl()));
    }

    /** True if no URLs are pending. */
    public boolean isEmpty()
    {
        return pendingUrisDB.isEmpty();
    }
}
封裝待訪問url和已訪問url
import bdb.BDBFrontier;
import bdb.CrawlUrl;
import hashfilter.SimpleBloomFilter;
/**
 * Static facade pairing the two URL stores: a Bloom filter for URLs already
 * visited and a Berkeley DB frontier (under D:\bdb) for URLs still pending.
 */
public class NewLinkQueue {
    // Visited URLs: membership-only structure, may yield false positives.
    private static SimpleBloomFilter visitedUrl=new SimpleBloomFilter();
    // Unvisited URLs: persisted in the Berkeley DB frontier.
    private static BDBFrontier unvistedUrl=new BDBFrontier("D:\\bdb");

    /**
     * Removes and returns the next unvisited URL string, or null when the
     * queue is empty.
     */
    public static Object unvisitedUrlDeQueue() throws Exception
    {
        // Fix: getNext() returns null on an empty frontier; the original
        // dereferenced it unconditionally and threw NullPointerException.
        CrawlUrl next=unvistedUrl.getNext();
        return next==null ? null : next.getOriUrl();
    }

    /** Queues url unless it is blank, already queued, or already visited. */
    public static void addUnvisitedUrl(String url)
    {
        if(url==null||url.trim().equals(""))
        {
            return; // reject blank input before building a CrawlUrl
        }
        CrawlUrl crawlUrl=new CrawlUrl();
        crawlUrl.setOriUrl(url);
        if(!unvistedUrl.contains(crawlUrl)&&!visitedUrl.contains(crawlUrl))
        {
            try {
                unvistedUrl.putUrl(crawlUrl);
            } catch (Exception e) {
                // Best-effort enqueue: log and continue crawling other URLs.
                e.printStackTrace();
            }
        }
    }

    /** True when no unvisited URLs remain. */
    public static boolean unvisitedUrlIsEmpty()
    {
        return unvistedUrl.isEmpty();
    }

    /** Records url as visited in the Bloom filter. */
    public static void addVisitedUrl(String url)
    {
        CrawlUrl crawlUrl=new CrawlUrl();
        crawlUrl.setOriUrl(url);
        visitedUrl.add(crawlUrl);
    }
}
//下載網頁
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.Header;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.HttpClientConnectionManager;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.BasicHttpClientConnectionManager;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.HttpConnectionParams;
import org.apache.http.params.HttpParams;
/**
 * Downloads a web page over HTTP and saves it under E:\temp\, deriving the
 * local file name from the URL and the response Content-Type.
 */
public class DownLoadFile {
    // Absolute path of the most recently saved file; returned by downloadFile().
    private String filePath;

    DownLoadFile()
    {
        filePath=null;
    }

    /**
     * Builds a safe local file name for a URL, replacing characters that are
     * illegal in file names with underscores.
     *
     * @param url         the page URL; its scheme prefix is stripped
     * @param contentType the Content-Type header value, e.g. "text/html"
     * @return a sanitised name ending in ".html" for HTML content, or in the
     *         Content-Type subtype (e.g. ".png") otherwise
     */
    public String getFileNameByUrl(String url,String contentType)
    {
        // Fix: strip any scheme ("http://", "https://", ...). The original
        // hard-coded substring(7), which only handled exactly "http://".
        int schemeEnd=url.indexOf("://");
        if(schemeEnd!=-1)
        {
            url=url.substring(schemeEnd+3);
        }
        // text/html content
        if(contentType.indexOf("html")!=-1)
        {
            return url.replaceAll("[\\?/:|<>\"]","_")+".html";
        }
        // Other types: use the Content-Type subtype as the extension.
        return url.replaceAll("[\\?/:|<>\"]","_")+"."
                +contentType.substring(contentType.lastIndexOf("/")+1);
    }

    /**
     * Downloads the page at url and stores it locally.
     *
     * @param url the page URL
     * @return the path of the saved file; on failure, the previous value of
     *         filePath (null if nothing was ever saved)
     */
    public String downloadFile(String url)
    {
        System.out.println("link:"+url);
        // 10-second connect and socket timeouts so a dead host cannot hang
        // the crawl loop indefinitely.
        HttpParams params = new BasicHttpParams();
        HttpConnectionParams.setConnectionTimeout(params, 10000);
        HttpConnectionParams.setSoTimeout(params, 10000);
        HttpClient httpClient = new DefaultHttpClient(params);
        try {
            HttpGet httpGet=new HttpGet(url);
            HttpResponse response=httpClient.execute(httpGet);
            System.out.println("得到http響應");
            if(response.getStatusLine().getStatusCode()==HttpStatus.SC_OK)
            {
                String a=response.getFirstHeader("Content-Type").getValue();
                System.out.println("Content-Type內容: "+a);
                InputStream responseBody=response.getEntity().getContent();
                filePath="E:\\temp\\"+getFileNameByUrl(url,a);
                System.out.println("檔案路徑: "+filePath);
                saveToLocal(responseBody,filePath);
            }
            else
            {
                System.err.print("Method Failed:"+response.getStatusLine().getStatusCode());
            }
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } catch(Exception e){
            e.printStackTrace();
        }
        return filePath;
    }

    /**
     * Copies responseBody to the file at filePath, closing both streams.
     * Fix: the original wrote the whole 1024-byte buffer on every iteration
     * regardless of how many bytes were actually read, corrupting the tail
     * of the saved file; it also left both streams open on any exception.
     */
    private void saveToLocal(InputStream responseBody,String filePath) throws IOException
    {
        FileOutputStream outputStream=new FileOutputStream(new File(filePath));
        try {
            byte[] b=new byte[1024];
            int length;
            while((length=responseBody.read(b))!=-1)
            {
                outputStream.write(b,0,length); // write only the bytes read
            }
        } finally {
            responseBody.close();
            outputStream.close();
        }
    }
}
提取連結
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.util.HashSet;
import java.util.Set;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
/**
 * Extracts out-links from a previously downloaded HTML file: href targets of
 * &lt;a&gt; tags plus src targets of &lt;frame&gt; tags.
 */
public class HtmlParserTool {
    /**
     * Reads the HTML file at filePath and returns the set of links found.
     * Parse or I/O failures are logged and an empty (or partial) set is
     * returned rather than propagating the exception.
     */
    public static Set<String> extractLinks(String filePath)
    {
        Set<String> links=new HashSet<String>();
        StringBuffer sb=new StringBuffer();
        // Accept <a> (LinkTag) nodes; OrFilter kept so more predicates can
        // be added later, matching the original structure.
        NodeFilter linkFilter=new NodeClassFilter(LinkTag.class);
        OrFilter lastFilter=new OrFilter();
        lastFilter.setPredicates(new NodeFilter[]{linkFilter});
        BufferedReader br=null;
        try {
            br=new BufferedReader(new FileReader(filePath));
            String line;
            while((line=br.readLine())!=null)
            {
                sb.append(line);
            }
            Parser parser=Parser.createParser(sb.toString(), "utf-8");
            NodeList nodeList=parser.parse(lastFilter);
            Node[] nodes=nodeList.toNodeArray();
            for(int i=0;i<nodes.length;i++)
            {
                if(nodes[i] instanceof LinkTag)// <a> tag
                {
                    links.add(((LinkTag)nodes[i]).getLink());
                }
                else// <frame> tag: pull the src attribute, e.g. <frame src="test.html"/>
                {
                    String frame=nodes[i].getText();
                    int start=frame.indexOf("src");
                    if(start==-1)
                    {
                        continue; // no src attribute — nothing to extract
                    }
                    // Fix: search for the delimiter AFTER "src"; the original
                    // indexOf(" ") could land before it, producing a negative
                    // substring range and an exception.
                    int end=frame.indexOf(" ",start);
                    if(end==-1)
                    {
                        end=frame.indexOf(">",start);
                    }
                    if(end>start+5)
                    {
                        // start+5 skips `src="`; end-1 drops the closing quote.
                        links.add(frame.substring(start+5, end-1));
                    }
                }
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if(br!=null)
            {
                try {
                    br.close(); // fix: the reader was never closed
                } catch (Exception ignored) {
                    // best-effort close; nothing useful to do on failure
                }
            }
        }
        return links;
    }
}
主程式
import java.util.Set;
/**
 * Crawler entry point: seeds the unvisited queue, then repeatedly downloads
 * the next URL, marks it visited, and enqueues every link found in the
 * downloaded page until the queue is empty.
 */
public class MyClawler {
    // Seeds the to-visit queue with the start URLs.
    private void initCrawlerWithSeeds(String[] seeds)
    {
        for(int i=0;i<seeds.length;i++)
        {
            NewLinkQueue.addUnvisitedUrl(seeds[i]);
        }
    }

    /** Runs the crawl loop starting from the given seed URLs. */
    public void crawling(String[] seeds)
    {
        initCrawlerWithSeeds(seeds);
        DownLoadFile downLoader=new DownLoadFile();
        while(!NewLinkQueue.unvisitedUrlIsEmpty())
        {
            try {
                // Dequeue the next unvisited URL.
                String visitUrl=(String)NewLinkQueue.unvisitedUrlDeQueue();
                System.out.println("提取未訪問的Url"+visitUrl);
                if(visitUrl==null)
                {
                    continue;
                }
                String filePath=downLoader.downloadFile(visitUrl);// download the page
                NewLinkQueue.addVisitedUrl(visitUrl);// mark it visited either way
                if(filePath==null)
                {
                    // Fix: downloadFile() returns null when nothing was ever
                    // saved — skip parsing instead of handing null to the
                    // link extractor.
                    continue;
                }
                Set<String> links=HtmlParserTool.extractLinks(filePath);// extract links
                System.out.println("網頁中的連結數:"+links.size());
                for(String link:links)
                {
                    NewLinkQueue.addUnvisitedUrl(link);// enqueue each link
                    System.out.println(link);
                }
                System.out.println("網頁中的連結數:"+links.size());
            } catch (Exception e) {
                // Keep crawling the remaining queue after a per-URL failure.
                e.printStackTrace();
            }
        }
    }

    public static void main(String[] args)
    {
        MyClawler clawler=new MyClawler();
        clawler.crawling(new String[]{"http://www.baidu.com"});
        System.out.println("done");
    }
}
參考文獻:《自己動手寫網路爬蟲》、Berkeley DB參考手冊等