1. 程式人生 > 其它 >Java 寫的簡單網路爬蟲

Java 寫的簡單網路爬蟲

覺得好玩,昨天就研究了一下java爬蟲。

在網上搜索了一些樣例研究了一下。仿照寫了一個簡單的爬蟲,可以自動爬取某本小說的章節(需要自定義正則表示式),利用多執行緒+鎖可以爬得更快,也可以同時爬多本書。

目前針對的是起點小說網的正則,利用set和list存需要爬的連結和已經爬過的連結,再用map存某本書的名字,已經爬取的章節數等等,然後寫到檔案裡面。

兩個類實現

AllUrl.java

import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.text.SimpleDateFormat;
import java.util.*;

/**
 * Shared crawl state plus the download/parse logic for a simple web-novel
 * crawler targeting https://www.qidian.com/ chapter pages. All state is
 * static and the mutating methods are synchronized so that several worker
 * threads can share the queue safely.
 */
public class AllUrl {
    public static int maxDepth = 100;    // max number of chapters to crawl per book
    public static int maxThread = 3;     // number of worker threads
    public static List<String> waitUrl = new ArrayList<>();         // URLs still to crawl (FIFO)
    public static Set<String> overUrl = new HashSet<>();            // URLs already crawled
    public static Map<String, Integer> UrlDepth = new HashMap<>();  // URL -> chapter number
    public static Map<String, String> bookName = new HashMap<>();   // URL -> book title
    public static String savePath = "E:\\起点book\\";               // output directory

    /**
     * Downloads one chapter page, writes its paragraphs to a text file and
     * queues the "next chapter" link found on the page.
     *
     * @param url   chapter page URL to crawl
     * @param depth chapter number of this URL (1-based)
     */
    public static synchronized void workUrl(String url, int depth) {
        if (depth > AllUrl.maxDepth) {
            System.out.println("《" + bookName.get(url) + "》爬取達到設定的章節數,停止爬取。");
            SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd 'at' HH:mm:ss z");
            Date date = new Date(System.currentTimeMillis());
            System.out.println(formatter.format(date));
            // Interrupt the calling worker so its run() loop can terminate.
            Thread.currentThread().interrupt();
        } else if (AllUrl.overUrl.contains(url)) {
            System.out.println(url + "已經爬取過");
        } else {
            try {
                URL url1 = new URL(url);
                URLConnection urlConnection = url1.openConnection();
                // Fix: try-with-resources — the original leaked the reader and
                // writer whenever parsing threw before the manual close() calls.
                try (BufferedReader br = new BufferedReader(
                        new InputStreamReader(urlConnection.getInputStream(), "UTF-8"))) {
                    // Read the whole page into a single string.
                    StringBuilder stringBuilder = new StringBuilder();
                    String tempString;
                    while ((tempString = br.readLine()) != null) {
                        stringBuilder.append(tempString);
                    }

                    // The markers below are specific to qidian.com chapter pages.
                    String sStart = "class=\"read-content j_readContent\"", tem = "<p>";
                    StringBuilder nextUrl = new StringBuilder("<a id=\"j_chapterNext\" href=\"");
                    int start = stringBuilder.indexOf(sStart);
                    int end = stringBuilder.indexOf(tem, start + 1);
                    int AllEnd = stringBuilder.indexOf("<div class=\"admire-wrap\">");

                    // Extract the "next chapter" link and queue it.
                    // NOTE(review): if a marker is missing, indexOf returns -1 and
                    // the substring below throws; the catch swallows that, as before.
                    int nextUrlStart = stringBuilder.indexOf(nextUrl.toString());
                    nextUrlStart += nextUrl.length();
                    int nextUrlEnd = stringBuilder.indexOf("\"", nextUrlStart + 1);
                    nextUrl.setLength(0);     // reuse the builder for the resolved URL
                    nextUrl.append("https:"); // page links are protocol-relative
                    nextUrl.append(stringBuilder.substring(nextUrlStart, nextUrlEnd));
                    addUrl(nextUrl.toString(), depth + 1, bookName.get(url));

                    // Skip the opening markup / trailing widgets around the body text.
                    start += sStart.length() + 20;
                    AllEnd -= 10;

                    try (PrintWriter pw = new PrintWriter(
                            new File(savePath + bookName.get(url) + "第" + depth + "章" + ".txt"))) {
                        // Emit one paragraph per <p> segment until the body ends.
                        // (The original loop's `!= null` test was always true —
                        // substring never returns null; the break terminates it.)
                        while (true) {
                            pw.println(stringBuilder.substring(start, end));
                            start = end + tem.length();
                            end = stringBuilder.indexOf(tem, start + 1);
                            if (end == -1 || end >= AllEnd) {
                                break;
                            }
                        }
                    }
                }
                overUrl.add(url);
                System.out.println("《" + bookName.get(url) + "》已爬取,共爬取所有小說章節數量" + overUrl.size()
                        + "剩餘爬取章節數量:" + waitUrl.size());
            } catch (Exception e) {
                // Best-effort crawler: log the failure and move on, as before.
                e.printStackTrace();
            }
        }
    }

    /**
     * Atomically removes and returns the head of the wait queue.
     *
     * @return the next URL to crawl, or {@code null} if the queue is empty
     */
    public static synchronized String getUrl() {
        if (waitUrl.isEmpty()) {
            return null;
        }
        return waitUrl.remove(0);
    }

    /**
     * Queues a URL for crawling unless it has already been crawled, recording
     * its chapter number and owning book title.
     *
     * @param Url   chapter URL to queue
     * @param Depth chapter number for this URL
     * @param bName title of the book the chapter belongs to
     */
    public static synchronized void addUrl(String Url, int Depth, String bName) {
        if (!overUrl.contains(Url)) {
            waitUrl.add(Url);
            UrlDepth.put(Url, Depth);
            bookName.put(Url, bName);
            System.out.println("《" + bookName.get(Url) + "》的章節" + Depth + "已經新增到待爬取佇列,目前待爬取佇列有" + waitUrl.size() + "個任務。");
        } else {
            System.out.println("《" + bookName.get(Url) + "》的章節" + Depth + "已經爬取過了,不再爬取。");
        }
    }
}
CrawlTheWeb.java
import java.sql.Time;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.concurrent.Executor;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

public class CrawlTheWeb extends Thread{
    // Monitor used to park worker threads while the wait queue is empty.
    // NOTE(review): no code ever calls lock.notify()/notifyAll(), so a parked
    // worker is only ever woken by interruption — confirm this is intended.
    public static Object lock = new Object();

    /**
     * Worker loop: repeatedly takes the next URL from the shared queue and
     * crawls it, parking on {@link #lock} when the queue is empty and
     * exiting once this thread has been interrupted.
     */
    public void run(){
        while(true){
            if(Thread.currentThread().isInterrupted()){
                System.out.println(Thread.currentThread().getName()+"完成了任務。");
                break;
            }
            // Fix: the original tested waitUrl.isEmpty() and then called
            // get(0)/remove(0) without synchronization, racing with the other
            // workers; AllUrl.getUrl() removes the head atomically.
            String nextUrl = AllUrl.getUrl();
            if(nextUrl != null){
                // Fix: log before crawling — the original printed "開始爬取"
                // (starting to crawl) only after workUrl() had finished.
                System.out.println(this.getName()+"開始爬取《"+AllUrl.bookName.get(nextUrl)+"》,章節數: "+AllUrl.UrlDepth.get(nextUrl));
                AllUrl.workUrl(nextUrl,AllUrl.UrlDepth.get(nextUrl));
            }else{
                synchronized (lock){
                    try {
                        System.out.println("待爬取列表為空,"+this.getName()+"進入等待狀態。");
                        lock.wait();
                    }catch (InterruptedException e){
                        // Fix: wait() clears the interrupt flag; restore it so
                        // the check at the top of the loop can stop the worker.
                        // (The original caught Exception and only printed it,
                        // leaving an interrupted worker waiting forever.)
                        Thread.currentThread().interrupt();
                    }
                }
            }
        }
    }

    /**
     * Seeds the queue with the first chapter of three books, then starts
     * {@code AllUrl.maxThread} worker threads.
     */
    public static void main(String[] args) {
        SimpleDateFormat formatter= new SimpleDateFormat("yyyy-MM-dd 'at' HH:mm:ss z");
        Date date = new Date(System.currentTimeMillis());
        System.out.println(formatter.format(date));
        String strUrl = "https://read.qidian.com/chapter/D-1F0Iq1JGPOVUeyz9PqUQ2/DIfEaAmW-9X6ItTi_ILQ7A2/";
        AllUrl.addUrl(strUrl,1,"模擬器:開局天牢死囚");
        strUrl = "https://read.qidian.com/chapter/W08HMrSPUHj7X4qr8VpWrA2/8W_pmmniqFvM5j8_3RRvhw2/";
        AllUrl.addUrl(strUrl,1,"我的屬性修行人生");
        strUrl = "https://read.qidian.com/chapter/q2B9dFLoeqU3v1oFI-DX8Q2/dsXQ94IHlUZp4rPq4Fd4KQ2/";
        AllUrl.addUrl(strUrl,1,"這個武聖超有素質");
        for(int i=0;i<AllUrl.maxThread;i++){
            new CrawlTheWeb().start();
        }
    }
}

大多數人都推薦實現 Runnable,但是目前我還用不著,暫時先繼承稍微熟悉一點的 Thread。