
How to crawl Zhihu topics with a Python crawler?


I'm building Guandn (觀點), whose "rooms" are much like Zhihu topics, so I needed a way to crawl them. After quite a bit of fiddling I finally got it working reliably. The code is written in Python; if you don't know Python, please teach yourself the basics first. If you do, just read the code below — it definitely works.
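One note before the code: it is Python 2, and besides the standard library it needs two third-party packages, MySQL-python (imported as MySQLdb) and BeautifulSoup 4. A quick sanity-check sketch, assuming those are the packages you installed:

# quick dependency check (assumption: Python 2 with MySQL-python and beautifulsoup4)
import MySQLdb
import bs4
print MySQLdb.__version__, bs4.__version__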

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# crawler for http://www.guandn.com/
__author__ = 'haoning'

import urllib
import urllib2
import time
import re
import json
import uuid
import platform
import os
import sys
import cookielib
import MySQLdb as mdb
from bs4 import BeautifulSoup

reload(sys)
sys.setdefaultencoding( "utf-8" )

headers = {
    'User-Agent' : 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0',
    'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
    'X-Requested-With':'XMLHttpRequest',
    'Referer':'https://www.zhihu.com/topics',
    'Cookie':'__utma=51854390.517069884.1416212035.1416212035.1416212035.1; q_c1=c02bf44d00d240798bfabcfc95baeb56|1455778173000|1416205243000; _za=b1c8ae35-f986-46a2-b24a-cb9359dc6b2a; aliyungf_tc=AQAAAJ1m71jL1woArKqF22VFnL/wRy6C; _xsrf=9d494558f9271340ab24598d85b2a3c8; cap_id="MDNiMjcwM2U0MTRhNDVmYjgxZWVhOWI0NTA2OGU5OTg=|1455864276|2a4ce8247ebd3c0df5393bb5661713ad9eec01dd"; n_c=1; _alicdn_sec=56c6ba4d556557d27a0f8c876f563d12a285f33a'
}

DB_HOST = '127.0.0.1'
DB_USER = 'root'
DB_PASS = 'root'

conn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'zhihu', charset='utf8')
conn.autocommit(False)
curr = conn.cursor()

def get_html(url):
    try:
        req = urllib2.Request(url)
        response = urllib2.urlopen(req,None,20) # a proxy should be added here
        html = response.read()
        return html
    except:
        print "timeout"
    return None

def getTopics():
    url = 'https://www.zhihu.com/topics'
    print url
    try:
        req = urllib2.Request(url)
        response = urllib2.urlopen(req) # a proxy should be added here
        html = response.read().decode('utf-8')
        #print html
        soup = BeautifulSoup(html)
        lis = soup.find_all('li', {'class' : 'zm-topic-cat-item'})

        for li in lis:
            data_id=li.get('data-id')
            name=li.text
            curr.execute('select id from classify_new where name=%s',(name,))
            y= curr.fetchone()
            if not y:
                curr.execute('INSERT INTO classify_new(data_id,name)VALUES(%s,%s)',(data_id,name))
        conn.commit()
    except Exception as e:
        print "get topic error",e
        

def get_extension(name):  
    where=name.rfind('.')
    if where!=-1:
        return name[where:len(name)]
    return None


def which_platform():
    sys_str = platform.system()
    return sys_str

def GetDateString():
    when=time.strftime('%Y-%m-%d',time.localtime(time.time()))
    foldername = str(when)
    return foldername 

def makeDateFolder(par,classify):
    try:
        if os.path.isdir(par):
            newFolderName=par + '//' + GetDateString() + '//' +classify
            if which_platform()=="Linux":
                newFolderName=par + '/' + GetDateString() + "/" +classify
            if not os.path.isdir( newFolderName ):
                os.makedirs( newFolderName )
            return newFolderName
        else:
            return None 
    except Exception,e:
        print "makeDateFolder error",e
    return None 

def download_img(url,classify):
    try:
        extention=get_extension(url)
        if(extention is None):
            return None
        req = urllib2.Request(url)
        resp = urllib2.urlopen(req,None,15)
        dataimg=resp.read()
        name=str(uuid.uuid1()).replace("-","")+"_www.guandn.com"+extention
        top="E://topic_pic"
        folder=makeDateFolder(top, classify)
        if folder is None:
            return None
        filename = folder+"//"+name
        #print "filename",filename
        try:
            if "e82bab09c_xs" not in str(url):
                if not os.path.exists(filename):
                    file_object = open(filename,'w+b')
                    file_object.write(dataimg)
                    file_object.close()
                    return GetDateString()+'/'+classify+"/"+name
                else:
                    print "file exist"
                    return None
        except IOError,e1:
            print "write image failed",e1
            pass
    except Exception as e:
        print "download_img error",e
        pass
    return None # if the image was not downloaded, fall back to the original site's link
    
                
def get_topis(top_id,topic_name):
    url = 'https://www.zhihu.com/node/TopicsPlazzaListV2'
    isGet = True
    offset = -20
    top_id=str(top_id)
    while isGet:
        offset = offset + 20
        values = {'method': 'next', 'params': '{"topic_id":'+top_id+',"offset":'+str(offset)+',"hash_id":""}'}
        try:
            data = urllib.urlencode(values)
            request = urllib2.Request(url,data,headers)
            response = urllib2.urlopen(request)
            html=response.read().decode('utf-8')
            if html is None:
                return
            json_str = json.loads(html)
            ms=json_str['msg']
            if len(ms) <5:
                break
            msg=ms[0]
            #print msg
            soup = BeautifulSoup(str(msg))
            blks = soup.find_all('div', {'class' : 'blk'})
            for blk in blks:
                page=blk.find('a').get('href')
                if page is not None:
                    node=page.replace("/topic/","")
                    print node,page
        except urllib2.URLError, e:
            print "error is",e
            pass
                

def work():
    #getTopics() # fetch the topic categories first (run this once to fill classify_new)
    curr.execute('select data_id,name from classify_new')
    results = curr.fetchall()
    for r in results:
        data_id=r[0]
        name=r[1]
        get_topis(data_id,name)
        
if __name__ == '__main__':
    for i in range(40):
        work()
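
Note that work() only reads from classify_new, while the getTopics() call that fills that table is commented out. My reading of the intended flow (an assumption, the post doesn't spell it out) is: seed the table once with getTopics(), then loop over work(). A minimal sketch:

def main():
    # assumed flow: seed classify_new once, then crawl every stored category
    getTopics()              # run once to fill classify_new with topic categories
    for _ in range(40):      # same repeat count as the loop above
        work()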

  

A word about the database: I'm not attaching a dump here. Just look at the fields used in the code and create the table yourself; it really is that simple. I use MySQL, so build it however your own needs dictate (a minimal schema sketch follows below).
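
For reference, here is a minimal schema sketch for the classify_new table, reusing the curr cursor from the script; only the column names (id, data_id, name) come from the code, the types are my own guesses:

# minimal schema sketch; column types are assumptions, names come from the code above
curr.execute("""
    CREATE TABLE IF NOT EXISTS classify_new (
        id      INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
        data_id INT NOT NULL,            -- the data-id attribute of the topic <li>
        name    VARCHAR(255) NOT NULL    -- the topic category name
    ) DEFAULT CHARSET=utf8
""")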

If anything is unclear, come find me on 去轉盤網 (quzhuanpan), which I also built; the QQ group number is kept up to date there. I'm not leaving a QQ number here, to avoid getting the post banned by the system.
