爬蟲中的基於LRU演算法的URL過濾器

阿新 • • 發佈：2019-01-12

/**
 * "Already visited" URL filter for a crawler, backed by the {@code visited}
 * database table and fronted by a bounded in-memory cache.
 *
 * <p>URLs are grouped by the IP they belong to. Both the IP string and the URL
 * string are reduced to their {@link String#hashCode()} values, so two distinct
 * strings with the same hash code are indistinguishable to this filter.
 *
 * <p>The cache holds at most {@code capable} URL entries in total. When it
 * overflows, the cached IP with the fewest hits is written back to the database
 * and dropped from memory. Hit counters of all cached IPs are reset to zero
 * every time another IP is loaded into the cache.
 *
 * <p>Database errors are printed with {@code printStackTrace()} and otherwise
 * ignored (best effort). The class performs no synchronization.
 */
public class LRUFilter {
    /** {@link DBElement#flag}: the record is unchanged since it was loaded. */
    private static final int FLAG_UNCHANGED = 0;
    /** {@link DBElement#flag}: the record exists in the database and its hit count changed. */
    private static final int FLAG_CHANGED = 1;
    /** {@link DBElement#flag}: the record is new and has not been inserted yet. */
    private static final int FLAG_NEW = 2;

    /** Number of URL entries currently held in {@link #rootMap}, summed over all IPs. */
    private int currentCacheSize;
    /** Upper bound for {@link #currentCacheSize}. */
    private final int maxCacheSize;
    /** Per-IP threshold used both for the stored record count and the in-memory map size. */
    private final int maxPerIp;
    /** Cached URL maps: IP key -> (URL key -> record). */
    private final HashMap<Integer,HashMap<Integer,DBElement>> rootMap;
    /** Number of records per IP key that are believed to be stored in the database. */
    private final HashMap<Integer,Int> dbCounts;
    private final Connection con;
    /** One element per cached IP; sorted by hit count whenever a victim has to be chosen. */
    private final LinkedList<Element> lQueue;
    /** Index into {@link #lQueue} by IP key. */
    private final HashMap<Integer,Element> slQueue;

    /**
     * Creates a filter with a total capacity of 20000 cached URLs and a
     * per-IP threshold of 10000.
     *
     * @param c open connection to the database that contains the {@code visited} table
     */
    public LRUFilter(Connection c) {
        this(20000, 10000, c);
    }

    /**
     * Creates a filter and preloads the per-IP record counts from the database.
     *
     * @param capable maximum number of URL entries kept in memory
     * @param oneip   per-IP threshold (see {@link #maxPerIp})
     * @param c       open connection to the database that contains the {@code visited} table
     */
    public LRUFilter(int capable, int oneip, Connection c) {
        maxCacheSize = capable;
        currentCacheSize = 0;
        maxPerIp = oneip;
        rootMap = new HashMap<Integer,HashMap<Integer,DBElement>>();
        dbCounts = new HashMap<Integer,Int>();
        lQueue = new LinkedList<Element>();
        slQueue = new HashMap<Integer,Element>();
        con = c;
        initialCache();
    }

    /**
     * Looks up {@code url} in the cached map of {@code ip} and records the visit.
     *
     * @param urlMap cached URL map of {@code ip}
     * @param ip     IP key
     * @param url    URL key
     * @return {@code true} if the URL was already known, {@code false} if it was new
     */
    private boolean hit(HashMap<Integer,DBElement> urlMap, int ip, int url) {
        Element queued = slQueue.get(ip);
        if (queued != null) {
            queued.hits++;
        }

        DBElement cached = urlMap.get(url);
        if (cached != null) {
            cached.hits++;
            // A new record stays FLAG_NEW so that it is still inserted, not updated.
            if (cached.flag == FLAG_UNCHANGED) {
                cached.flag = FLAG_CHANGED;
            }
            return true;
        }

        if (dbCounts.get(ip).value < maxPerIp) {
            // The whole IP fits in memory, so a cache miss means the URL is new.
            urlMap.put(url, new DBElement(1, ip, FLAG_NEW));
            currentCacheSize++;
            shrinkCache(0);
            if (urlMap.size() > maxPerIp) {
                toDB(ip);
                fromDB(ip);
            }
            return false;
        }

        // Only part of this IP's records is cached: ask the database directly.
        boolean contain = toDBDirect(url, new DBElement(1, ip, FLAG_UNCHANGED));
        if (!contain) {
            Int stored = dbCounts.get(ip);
            if (stored != null) {
                stored.value += 1;
            }
        }
        return contain;
    }

    /** Handles a visit for an IP that is not cached: loads the IP, then proceeds as {@link #hit}. */
    private boolean notHit(int ip, int url) {
        return hit(fromDB(ip), ip, url);
    }

    /**
     * Loads every {@code keyip} and its record count from the {@code visited}
     * table into {@link #dbCounts}.
     */
    private void initialCache() {
        Statement stm = null;
        ResultSet rs = null;
        try {
            String sql = "use crawler;select keyip,count(keyip) from visited group by keyip";
            stm = con.createStatement();
            rs = stm.executeQuery(sql);
            while (rs.next()) {
                dbCounts.put(rs.getInt(1), new Int(rs.getInt(2)));
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeQuietly(rs);
            closeQuietly(stm);
        }
    }

    /**
     * Filters a URL and records the visit.
     *
     * @param ip  the IP the URL belongs to
     * @param url the URL
     * @return {@code true} if the URL has been seen before, {@code false} otherwise
     */
    public boolean contain(String ip, String url) {
        int keyip = ip.hashCode();
        int keyurl = url.hashCode();
        HashMap<Integer,DBElement> urlMap = rootMap.get(keyip);
        if (urlMap != null) {
            return hit(urlMap, keyip, keyurl);
        }
        return notHit(keyip, keyurl);
    }

    /**
     * Evicts least-hit IPs until {@code incoming} additional entries fit into
     * the cache, or until there is nothing left to evict.
     *
     * @param incoming number of entries about to be added
     */
    private void shrinkCache(int incoming) {
        while (incoming + currentCacheSize > maxCacheSize && evictLeastHit()) {
            // evictLeastHit() does the work; stop as soon as the queue is empty
        }
    }

    /**
     * Writes the cached IP with the fewest hits to the database and removes it
     * from the cache.
     *
     * @return {@code false} if no IP is cached, {@code true} otherwise
     */
    private boolean evictLeastHit() {
        Collections.sort(lQueue, new MyComparator());
        Element victim = lQueue.poll();
        if (victim == null) {
            return false;
        }
        slQueue.remove(victim.ip);
        flush(victim.ip, rootMap.remove(victim.ip));
        return true;
    }

    /**
     * Writes the URLs cached for {@code ip} to the database and removes the IP
     * from the cache and from both queue structures. Any key value, including
     * 0, is treated as an ordinary IP key.
     */
    private void toDB(int ip) {
        flush(ip, rootMap.remove(ip));
        Element queued = slQueue.remove(ip);
        if (queued != null) {
            lQueue.remove(queued);
        }
    }

    /** Persists an already detached URL map and updates the bookkeeping counters. */
    private void flush(int ip, HashMap<Integer,DBElement> urlMap) {
        if (urlMap == null) {
            return;
        }
        int inserted = writeToDB(urlMap);
        Int stored = dbCounts.get(ip);
        if (stored != null) {
            stored.value += inserted;
        }
        currentCacheSize -= urlMap.size();
    }

    /**
     * Loads the records of {@code ip} into the cache, evicting other IPs first
     * if necessary, and registers the IP in the queue with a hit count of 0.
     * The hit counters of all other cached IPs are reset to 0 as well.
     *
     * @return the now cached URL map of {@code ip}
     */
    private HashMap<Integer,DBElement> fromDB(int ip) {
        HashMap<Integer,DBElement> urlMap;
        Int stored = dbCounts.get(ip);
        if (stored != null) {
            urlMap = readFromDB(ip, stored.value > maxPerIp);
        } else {
            urlMap = new HashMap<Integer,DBElement>();
            dbCounts.put(ip, new Int(0));
        }

        // Make room before the new map is registered. If the map alone is larger
        // than the capacity it is admitted anyway once the cache is empty.
        shrinkCache(urlMap.size());

        for (Element queued : lQueue) {
            queued.hits = 0;
        }
        Element added = new Element(ip, 0);
        lQueue.add(added);
        slQueue.put(ip, added);
        rootMap.put(ip, urlMap);
        currentCacheSize += urlMap.size();
        return urlMap;
    }

    /**
     * Writes a URL map to the database: new records are inserted, changed
     * records get their hit count updated, unchanged records are skipped.
     *
     * @return the number of records that were queued for insertion
     */
    private int writeToDB(HashMap<Integer,DBElement> urlDBMap) {
        int num = 0;
        PreparedStatement insertStm = null;
        PreparedStatement updateStm = null;
        try {
            insertStm = con.prepareStatement("use crawler;insert into visited values(?,?,?);");
            for (Entry<Integer,DBElement> entry : urlDBMap.entrySet()) {
                DBElement e = entry.getValue();
                if (e.flag == FLAG_NEW) {
                    insertStm.setInt(1, entry.getKey());
                    insertStm.setInt(2, e.hits);
                    insertStm.setInt(3, e.keyip);
                    insertStm.addBatch();
                    num++;
                }
            }
            if (num > 0) {
                insertStm.executeBatch();
            }
            insertStm.close();

            updateStm = con.prepareStatement("use crawler;update visited set hits=? where keyurl=?;");
            boolean updateAble = false;
            for (Entry<Integer,DBElement> entry : urlDBMap.entrySet()) {
                DBElement e = entry.getValue();
                if (e.flag == FLAG_CHANGED) {
                    updateStm.setInt(1, e.hits);
                    updateStm.setInt(2, entry.getKey());
                    updateStm.addBatch();
                    updateAble = true;
                }
            }
            if (updateAble) {
                updateStm.executeBatch();
            }
            updateStm.close();
            con.commit();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeQuietly(insertStm);
            closeQuietly(updateStm);
        }
        return num;
    }

    /**
     * Reads the records of one IP from the database.
     *
     * @param ip     IP key
     * @param exceed if {@code true}, only {@code MAXIP / 2} records are requested
     *               ({@code select top}); otherwise all records of the IP are read
     * @return URL key -> record; empty if nothing was found or the query failed
     */
    public HashMap<Integer,DBElement> readFromDB(int ip, boolean exceed) {
        HashMap<Integer,DBElement> urlMap = new HashMap<Integer,DBElement>();
        int count = maxPerIp / 2;
        String sql;
        if (exceed) {
            sql = "select top " + count + " keyurl,hits from visited where keyip=?;";
        } else {
            sql = "select keyurl,hits from visited where keyip=?;";
        }
        PreparedStatement stm = null;
        ResultSet rs = null;
        try {
            stm = con.prepareStatement(sql);
            stm.setInt(1, ip);
            rs = stm.executeQuery();
            while (rs.next()) {
                urlMap.put(rs.getInt(1), new DBElement(rs.getInt(2)));
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeQuietly(rs);
            closeQuietly(stm);
        }
        return urlMap;
    }

    /**
     * Records a visit directly in the database, bypassing the cache: an
     * existing record gets {@code dbe.hits} added to its hit count, otherwise
     * a new record is inserted.
     *
     * @return {@code true} if the URL was already stored in the database
     */
    private boolean toDBDirect(int keyurl, DBElement dbe) {
        boolean contain = false;
        Statement stm = null;
        ResultSet rs = null;
        try {
            stm = con.createStatement();
            rs = stm.executeQuery("use crawler;select hits from visited where keyurl=" + keyurl + ";");
            String sql;
            if (rs.next()) {
                contain = true;
                int hits = rs.getInt(1) + dbe.hits;
                sql = "use crawler;update visited set hits=" + hits + " where keyurl=" + keyurl + ";";
            } else {
                sql = "use crawler;insert into visited values(" + keyurl + "," + dbe.hits + "," + dbe.keyip + ");";
            }
            stm.executeUpdate(sql);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeQuietly(rs);
            closeQuietly(stm);
        }
        return contain;
    }

    /** Writes every cached IP to the database, leaving the cache empty. */
    public void store() {
        while (evictLeastHit()) {
            // one IP is flushed per iteration
        }
    }

    /** Closes a statement, reporting but not propagating a failure. Accepts {@code null}. */
    private static void closeQuietly(Statement stm) {
        if (stm == null) {
            return;
        }
        try {
            stm.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Closes a result set, reporting but not propagating a failure. Accepts {@code null}. */
    private static void closeQuietly(ResultSet rs) {
        if (rs == null) {
            return;
        }
        try {
            rs.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /*
     * tool classes
     */

    /** Queue entry for one cached IP together with its current hit count. */
    private static class Element {
        public Element(int i, int h) { ip = i; hits = h; }
        public int ip;
        public int hits;
    }

    /** Mutable int holder, so that counters stored in a map can be updated in place. */
    private static class Int {
        public Int(int v) { value = v; }
        public int value = 0;
    }

    /** In-memory image of one record of the {@code visited} table. */
    private static class DBElement {
        public DBElement(int h, int k, int f) { hits = h; keyip = k; flag = f; }
        public DBElement(int h) { hits = h; }
        public int hits = 0;
        public int keyip = 0;
        /*
         * 0 stands for not changed
         * 1 stands for changed
         * 2 stands for a new record
         */
        public int flag = FLAG_UNCHANGED;
    }

    /** Orders queue elements by ascending hit count, so the least-hit IP comes first. */
    private static class MyComparator implements Comparator<Element> {
        public int compare(Element e1, Element e2) {
            if (e1.hits < e2.hits) {
                return -1;
            } else if (e1.hits > e2.hits) {
                return 1;
            } else {
                return 0;
            }
        }
    }
}

爬蟲中的基於LRU演算法的URL過濾器

爬蟲中的基於LRU演算法的URL過濾器

淺談網路爬蟲中深度優先演算法和簡單程式碼實現

淺談網路爬蟲中廣度優先演算法和程式碼實現

基於LRU演算法的快取池——阿里筆試題

opencv3.3中基於ssd演算法的目標檢測示例教程

項目一:第十二天 1、常見權限控制方式 2、基於shiro提供url攔截方式驗證權限 3、在realm中授權 5、總結驗證權限方式（四種） 6、用戶註銷7、基於treegrid實現菜單展示

基於FPGA的快速中值濾波演算法--轉載我之前的blog的內容

網遊伺服器中的GUID(唯一標識碼)實現-基於snowflake演算法

Crawler/ML：爬蟲技術(基於urllib.request庫從網頁獲取圖片)+HierarchicalClustering層次聚類演算法，實現自動從網頁獲取圖片然後根據圖片色調自動分類

基於快閃記憶體資料庫的CCF-LRU演算法優化

Python 網路爬蟲 009 (程式設計) 通過正則表示式來獲取一個網頁中的所有的URL連結，並下載這些URL連結的原始碼

python爬蟲中對含中文的url處理

動手實現 LRU 演算法，以及 Caffeine 和 Redis 中的快取淘汰策略

Android中基於Socket的網絡通信

MySQL中基於mysqldump和二進制日誌log-bin二進制日誌進行邏輯備份以及基於時間點的還原

Java中基於HTTP協議網絡編程

爬蟲——爬蟲中使用正則表達式

centos7中基於hadoop安裝hive(CentOS7+hadoop2.8.0+hive2.1.1)

爬蟲中代理的設置問題介紹

Spark HA 配置中spark.deploy.zookeeper.url 的意思

爬蟲中的基於LRU演算法的URL過濾器

相關推薦