How to crawl Zhihu topics with a Python crawler?
阿新 • Published: 2018-01-05
I'm building an "opinions" feature, and the opinion rooms work much like Zhihu topics, so I had to find a way to crawl them. After a fair amount of fiddling I finally got it working. The code is written in Python (Python 2); if you don't know Python you'll have to pick up the basics yourself, and if you do, just read the code below. It works as-is.
#!/usr/bin/env python
# coding: utf-8
# Crawler for http://www.guandn.com/ -- scrapes Zhihu topic categories and topic pages.
__author__ = 'haoning'

import urllib
import urllib2
import time
import re
import json
import uuid
import platform
import os
import sys
import cookielib
import MySQLdb as mdb
from bs4 import BeautifulSoup

reload(sys)
sys.setdefaultencoding("utf-8")

headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'X-Requested-With': 'XMLHttpRequest',
    'Referer': 'https://www.zhihu.com/topics',
    'Cookie': '__utma=51854390.517069884.1416212035.1416212035.1416212035.1; q_c1=c02bf44d00d240798bfabcfc95baeb56|1455778173000|1416205243000; _za=b1c8ae35-f986-46a2-b24a-cb9359dc6b2a; aliyungf_tc=AQAAAJ1m71jL1woArKqF22VFnL/wRy6C; _xsrf=9d494558f9271340ab24598d85b2a3c8; cap_id="MDNiMjcwM2U0MTRhNDVmYjgxZWVhOWI0NTA2OGU5OTg=|1455864276|2a4ce8247ebd3c0df5393bb5661713ad9eec01dd"; n_c=1; _alicdn_sec=56c6ba4d556557d27a0f8c876f563d12a285f33a'
}

DB_HOST = '127.0.0.1'
DB_USER = 'root'
DB_PASS = 'root'

conn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'zhihu', charset='utf8')
conn.autocommit(False)
curr = conn.cursor()


def get_html(url):
    """Fetch a page; a proxy should probably be added here."""
    try:
        req = urllib2.Request(url)
        response = urllib2.urlopen(req, None, 20)
        html = response.read()
        return html
    except:
        print "timeout"
        return None


def getTopics():
    """Scrape the topic categories from https://www.zhihu.com/topics into classify_new."""
    url = 'https://www.zhihu.com/topics'
    print url
    try:
        req = urllib2.Request(url)
        response = urllib2.urlopen(req)  # a proxy should probably be added here
        html = response.read().decode('utf-8')
        print html
        soup = BeautifulSoup(html, 'html.parser')
        lis = soup.find_all('li', {'class': 'zm-topic-cat-item'})
        for li in lis:
            data_id = li.get('data-id')
            name = li.text
            curr.execute('select id from classify_new where name=%s', (name,))
            y = curr.fetchone()
            if not y:
                curr.execute('INSERT INTO classify_new(data_id,name) VALUES(%s,%s)', (data_id, name))
        conn.commit()
    except Exception as e:
        print "get topic error", e


def get_extension(name):
    """Return the file extension (including the dot), or None."""
    where = name.rfind('.')
    if where != -1:
        return name[where:]
    return None


def which_platform():
    return platform.system()


def GetDateString():
    return time.strftime('%Y-%m-%d', time.localtime(time.time()))


def makeDateFolder(par, classify):
    """Create <par>/<date>/<classify> and return the path, or None on failure."""
    try:
        if os.path.isdir(par):
            newFolderName = par + '//' + GetDateString() + '//' + classify
            if which_platform() == "Linux":
                newFolderName = par + '/' + GetDateString() + "/" + classify
            if not os.path.isdir(newFolderName):
                os.makedirs(newFolderName)
            return newFolderName
        else:
            return None
    except Exception as e:
        print "kk", e
        return None


def download_img(url, classify):
    """Save an image under E://topic_pic/<date>/<classify>; return the relative path
    on success, or None (in which case the original site's link is used instead).
    Not called from work() below, but kept because the topic pages contain images."""
    try:
        extention = get_extension(url)
        if extention is None:
            return None
        req = urllib2.Request(url)
        resp = urllib2.urlopen(req, None, 15)
        dataimg = resp.read()
        name = str(uuid.uuid1()).replace("-", "") + "_www.guandn.com" + extention
        top = "E://topic_pic"
        folder = makeDateFolder(top, classify)
        if folder is not None:
            filename = folder + "//" + name
            try:
                if "e82bab09c_xs" not in str(url):  # skip the default placeholder thumbnail
                    if not os.path.exists(filename):
                        file_object = open(filename, 'w+b')
                        file_object.write(dataimg)
                        file_object.close()
                        return GetDateString() + '/' + classify + "/" + name
                    else:
                        print "file exist"
                        return None
            except IOError as e1:
                print "e1=", e1
    except Exception as e:
        print "eee", e
    return None


def get_topis(top_id, topic_name):
    """Page through one topic category via the TopicsPlazzaListV2 endpoint, 20 items at a time."""
    url = 'https://www.zhihu.com/node/TopicsPlazzaListV2'
    offset = -20
    top_id = str(top_id)
    while True:
        offset = offset + 20
        values = {'method': 'next',
                  'params': '{"topic_id":' + top_id + ',"offset":' + str(offset) + ',"hash_id":""}'}
        try:
            data = urllib.urlencode(values)
            request = urllib2.Request(url, data, headers)
            response = urllib2.urlopen(request)
            html = response.read().decode('utf-8')
            if html is None:
                return
            json_str = json.loads(html)
            ms = json_str['msg']
            if len(ms) < 5:  # fewer than 5 entries means we reached the last page
                break
            msg = ms[0]
            soup = BeautifulSoup(str(msg), 'html.parser')
            blks = soup.find_all('div', {'class': 'blk'})
            for blk in blks:
                page = blk.find('a').get('href')
                if page is not None:
                    node = page.replace("/topic/", "")
                    print node, page
        except urllib2.URLError as e:
            print "error is", e


def work():
    # getTopics()  # run this once first to fill classify_new with the topic categories
    curr.execute('select data_id,name from classify_new')
    results = curr.fetchall()
    for r in results:
        data_id = r[0]
        name = r[1]
        get_topis(data_id, name)


if __name__ == '__main__':
    i = 0
    while i < 40:
        work()
        i = i + 1
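The script above is Python 2 throughout (urllib2, MySQLdb, print statements). If you are on Python 3, here is a minimal sketch of the same paginated request using the requests library. It copies the endpoint, the method/params form fields, and the "fewer than 5 entries means last page" stop condition from the code above; whether Zhihu still serves this interface, and which cookies it now requires, is not guaranteed, so treat it as an illustration of the paging logic rather than a drop-in replacement.

# Python 3 sketch of the paginated topic request (requests + BeautifulSoup).
# Assumes the TopicsPlazzaListV2 endpoint still accepts the same form fields
# as in the Python 2 script above.
import json
import requests
from bs4 import BeautifulSoup

def fetch_topic_pages(topic_id, headers, step=20):
    url = 'https://www.zhihu.com/node/TopicsPlazzaListV2'
    offset = 0
    while True:
        payload = {
            'method': 'next',
            'params': json.dumps({'topic_id': int(topic_id), 'offset': offset, 'hash_id': ''}),
        }
        resp = requests.post(url, data=payload, headers=headers, timeout=20)
        msg = resp.json().get('msg', [])
        if len(msg) < 5:  # same stop condition as get_topis()
            break
        soup = BeautifulSoup(msg[0], 'html.parser')
        for blk in soup.find_all('div', {'class': 'blk'}):
            link = blk.find('a')
            if link is not None and link.get('href'):
                href = link['href']
                yield href.replace('/topic/', ''), href
        offset += step

# Example use:
# for topic_node, topic_url in fetch_topic_pages(data_id, headers):
#     print(topic_node, topic_url)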
A quick note on the database: I'm not attaching a dump here. Just look at the columns used in the code and create the table yourself; it really is that simple. I used MySQL, so build it to fit your own needs. One possible table definition is sketched below.
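The queries in the script touch only three columns of classify_new: an auto-increment id, the data_id read from the topic list page, and the topic name. The column types below are my assumption, not something from the original post; adjust them to your needs.

# A possible classify_new table, created through the same MySQLdb connection
# the crawler uses. Column types are an assumption, not from the original post.
import MySQLdb as mdb

conn = mdb.connect('127.0.0.1', 'root', 'root', 'zhihu', charset='utf8')
cur = conn.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS classify_new (
        id      INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
        data_id INT UNSIGNED NOT NULL,
        name    VARCHAR(255) NOT NULL
    ) DEFAULT CHARSET=utf8
""")
conn.commit()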
If anything is unclear, come find me on 去轉盤網 (a site I also built); the QQ group number is kept up to date there. I'm not leaving a QQ number here so the post doesn't get taken down by the system.