
4.11 Hands-On Project 3: A Concurrent Web Crawler

Import the required packages

import (
    "fmt"
    "regexp"
    "net/http"
    "io/ioutil"
    "strings"
    "sync"
    "time"
    "strconv"
)
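
The snippets below also call GetPageStr and HandleError and use the regular expression reImg, all of which come from earlier sections of this chapter. Here is a minimal sketch of what they might look like; the exact regex pattern and error handling are assumptions, not the original definitions.

//sketch of the helpers assumed to exist from earlier sections

//reImg captures the src attribute of <img> tags (the exact pattern is an assumption)
const reImg = `<img[^>]+src="(https?://[^"]+)"`

//HandleError logs the error together with a hint about where it happened
func HandleError(err error, why string) {
    if err != nil {
        fmt.Println(why, err)
    }
}

//GetPageStr fetches a page and returns its body as a string
func GetPageStr(url string) (pageStr string) {
    resp, err := http.Get(url)
    HandleError(err, "http.Get(url)")
    if err != nil {
        return
    }
    defer resp.Body.Close()
    pageBytes, err := ioutil.ReadAll(resp.Body)
    HandleError(err, "ioutil.ReadAll(resp.Body)")
    pageStr = string(pageBytes)
    return
}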

Define the global data

var (
    //channel holding the image links waiting to be downloaded
    chanImgUrls chan string
    //channel used to report whether each of the 147 page tasks has finished
    chanTask  chan string
    waitGroup sync.WaitGroup
)

Crawl all image links on a single page and return them as a slice

func SpiderPrettyImg(url string) (urls []string) {
    pageStr := GetPageStr(url)
    //fmt.Println(pageStr)
    re := regexp.MustCompile(reImg)
    results := re.FindAllStringSubmatch(pageStr, -1)
    fmt.Printf("Found %d results:\n", len(results))
    for _, result := range results {
        url := result[1]
        fmt.Println(url)
        urls = append(urls, url)
    }
    return
}

Extract the file name from the URL

func GetFilenameFromUrl(url string, dirPath string) (filename string) {
    //take the part after the last "/" as the file name
    lastIndex := strings.LastIndex(url, "/")
    filename = url[lastIndex+1:]
    //prefix a nanosecond timestamp to avoid name collisions
    timePrefix := strconv.Itoa(int(time.Now().UnixNano()))
    filename = timePrefix + "_" + filename
    filename = dirPath + filename
    return
}
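
For example, with a hypothetical image URL and target directory (both made up for illustration), the generated name looks like this:

//hypothetical example: the URL and directory below are placeholders
filename := GetFilenameFromUrl("http://www.umei.cc/uploads/001.jpg", "D:/img/")
//filename is something like "D:/img/1559122912345678900_001.jpg"
fmt.Println(filename)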

Download the file at the URL to the given path

func DownloadFile(url string, filename string) (ok bool) {
    resp, err := http.Get(url)
    if err != nil {
        HandleError(err, "http.Get(url)")
        return
    }
    defer resp.Body.Close()

    //this occasionally fails with: read tcp 192.168.20.50:57178->175.6.244.4:80: wsarecv:
    // An existing connection was forcibly closed by the remote host.
    fBytes, e := ioutil.ReadAll(resp.Body)
    HandleError(e, "ioutil.ReadAll(resp.Body)")
    err = ioutil.WriteFile(filename, fBytes, 0644)
    HandleError(err, "ioutil.WriteFile(filename, fBytes, 0644)")
    return err == nil
}
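
As a design note, ioutil.ReadAll buffers the whole image in memory before writing it out. A sketch of an alternative (not the version used in this project) streams the response body straight to disk with io.Copy; it needs the additional "io" and "os" imports.

//alternative sketch: stream the response body to disk instead of buffering it
func DownloadFileStream(url string, filename string) (ok bool) {
    resp, err := http.Get(url)
    if err != nil {
        HandleError(err, "http.Get(url)")
        return
    }
    defer resp.Body.Close()

    f, err := os.Create(filename)
    if err != nil {
        HandleError(err, "os.Create(filename)")
        return
    }
    defer f.Close()

    if _, err = io.Copy(f, resp.Body); err != nil {
        HandleError(err, "io.Copy(f, resp.Body)")
        return
    }
    return true
}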

Crawl all image links on one page and push them into the global download channel

func SpiderImgUrls(url string) {
    //collect all image links on this page
    urls := SpiderPrettyImg(url)
    //push every image link into the data channel
    for _, url := range urls {
        chanImgUrls <- url
    }

    //report that this goroutine's task is done
    chanTask <- url
    waitGroup.Done()
}

Download all images from the image-link channel (each worker downloads sequentially)

func DownloadImg() {
    for url := range chanImgUrls {
        filename := GetFilenameFromUrl(url, "D:/BJBlockChain1801/demos/W4/day4/img/")
        ok := DownloadFile(url, filename)
        if ok {
            fmt.Printf("%s downloaded successfully!\n", filename)
        } else {
            fmt.Printf("%s failed to download!!!!!!!!!!!!\n", filename)
        }
    }
    waitGroup.Done()
}

Check whether all 147 tasks have finished; if so, close the data channel

func CheckIfAllSpidersOk() {
    var count int
    for {
        url := <-chanTask
        fmt.Printf("%s has been crawled\n", url)
        count++
        if count == 147 {
            close(chanImgUrls)
            break
        }
    }
    waitGroup.Done()
}
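
Closing chanImgUrls is what lets the range loops in DownloadImg terminate. A sketch of an alternative design that drops the hard-coded 147 counter in favor of a dedicated WaitGroup; in this variant each SpiderImgUrls would call spiderWG.Done() instead of sending on chanTask, and main would call spiderWG.Add(1) per spider goroutine.

//alternative sketch: a separate WaitGroup for the spider goroutines
var spiderWG sync.WaitGroup

func CheckIfAllSpidersOkWG() {
    //blocks until every spider goroutine has called spiderWG.Done()
    spiderWG.Wait()
    //closing the channel ends the range loops in DownloadImg
    close(chanImgUrls)
    waitGroup.Done()
}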

The main program

func main() {
    //initialize the data channels
    chanImgUrls = make(chan string, 1000000)
    chanTask = make(chan string, 147)

    //spider goroutines: keep feeding image links into the channel
    for i := 1; i < 148; i++ {
        waitGroup.Add(1)
        go SpiderImgUrls("http://www.umei.cc/tags/meinv_" + strconv.Itoa(i) + ".htm")
    }

    //bookkeeping goroutine: once all 147 tasks are done, close the data channel
    waitGroup.Add(1)
    go CheckIfAllSpidersOk()

    //download goroutines: keep reading addresses from the channel and downloading them
    for i := 0; i < 10; i++ {
        waitGroup.Add(1)
        go DownloadImg()
    }
    waitGroup.Wait()
}
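
One practical note: ioutil.WriteFile does not create missing directories, so the target folder has to exist before the downloads start. A hypothetical guard at the top of main (it needs the "os" import) could look like this:

//hypothetical addition: create the download directory up front
if err := os.MkdirAll("D:/BJBlockChain1801/demos/W4/day4/img/", 0755); err != nil {
    HandleError(err, "os.MkdirAll")
    return
}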