程式人生 > 豆瓣小組爬蟲

豆瓣小組爬蟲

.html 不存在 lin rom 是否 inpu clas main safari

豆瓣上有很多小組，裏面的圖片非常好

所以我們把這些圖片爬下來（以下程式碼為 Python 2，需要先匯入 os、urllib2，以及 lxml 的 etree）

if __name__ == "__main__":
    # Script entry point: ask for the group id and the page range, then start
    # the crawl. raw_input makes this a Python 2 script.
    # NOTE(review): under Python 2 the non-ASCII prompt literals below need a
    # source-encoding declaration at the top of the file -- confirm it exists.

    # Short group identifier as it appears in the group URL, e.g. "haixiuzu".
    # Surrounding whitespace is stripped so it cannot corrupt the URL.
    group = raw_input("請輸入小組字符串代碼如:haixiuzu").strip()
    beginPage = int(raw_input("請輸入起始頁碼"))
    endPage = int(raw_input("請輸入結束頁碼"))
    # Build the listing URL prefix; getPageLink appends the numeric offset
    # for each page after "start=".
    url = "https://www.douban.com/group/" + group + "/discussion?start="
    # Collect every topic link in the page range (which then triggers the
    # image download chain).
    getPageLink(url, beginPage, endPage)

先構建主頁url

接著用xpath獲取需要爬的帖子鏈接

def getPageLink(url, begin, end):
    """Collect the topic links from a range of group listing pages.

    Every listing page from ``begin`` to ``end`` (inclusive) is downloaded,
    the topic links are extracted with XPath, and the combined list is
    handed to ``getImgLink``.

    :param url: listing URL prefix ending in ``start=``; the page offset is
        appended to it.
    :param begin: first page number to fetch.
    :param end: last page number to fetch (inclusive).
    :return: None
    """
    # Page N is addressed by the offset (N - 1) * 25 appended to the prefix.
    urlList = [str(url) + str((page - 1) * 25)
               for page in range(begin, end + 1)]
    # Send a browser-like User-Agent with every request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
    }
    # Accumulates the topic links found on all listing pages.
    linkList = []
    for pageUrl in urlList:
        request = urllib2.Request(pageUrl, headers=headers)
        response = urllib2.urlopen(request)
        try:
            html = response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()
        content = etree.HTML(html)
        # The href of each anchor inside a <td class="title"> cell is a
        # topic link.
        linkList.extend(content.xpath('//td[@class="title"]/a/@href'))
    getImgLink(linkList)

接著獲取所有圖片的鏈接

def getImgLink(url):
    """Collect the image links found in a list of topic pages.

    Each topic page is downloaded, the ``src`` of every image inside the
    topic content is extracted with XPath, and the combined list is handed
    to ``savaImg``.

    :param url: iterable of topic page URLs (despite the singular name).
    :return: None
    """
    headers = {
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Referer": "https://www.douban.com/group/haixiuzu/discussion?start=0",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
    }
    # Accumulates the image links found on all topic pages.
    imgUrl = []
    for topicUrl in url:
        request = urllib2.Request(topicUrl, headers=headers)
        response = urllib2.urlopen(request)
        try:
            html = response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()
        content = etree.HTML(html)
        # Images sit in <div class="topic-figure cc"> blocks directly under
        # the <div class="topic-content"> container.
        imgUrl.extend(content.xpath(
            '//div[@class="topic-content"]/div[@class="topic-figure cc"]/img/@src'))
    savaImg(imgUrl)

然後當然是保存所有圖片了

def savaImg(imgList):
    """Download every image in ``imgList`` into the local ``img`` directory.

    The directory is created if it does not exist. Each file is named after
    the last path segment of its URL.

    :param imgList: iterable of image URLs.
    :return: None
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
    }
    # Target directory, relative to the current working directory. It has its
    # own name so it cannot clash with the downloaded image bytes.
    saveDir = "img"
    # Create the directory on first use.
    if not os.path.exists(saveDir):
        os.makedirs(saveDir)
    for imgLink in imgList:
        request = urllib2.Request(imgLink, headers=headers)
        response = urllib2.urlopen(request)
        try:
            data = response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()
        # Use the last URL path segment as the file name. A fixed-width tail
        # slice can truncate the name, so different images could collide.
        # Fall back to the tail slice only when the URL ends with "/".
        fileName = imgLink.rsplit("/", 1)[-1] or imgLink[-10:]
        # os.path.join keeps the path valid on every platform instead of
        # hard-coding a backslash separator.
        with open(os.path.join(saveDir, fileName), "wb") as out:
            out.write(data)

以上僅供學習交流使用

豆瓣小組爬蟲