
How to crawl Zhihu topics with a Python crawler?


I'm building Guandn (觀點), whose "rooms" are much like Zhihu topics, so I needed a way to crawl them. After quite a bit of fiddling I finally got it working reliably. The code is written in Python; if you don't know Python, please teach yourself the basics first. If you do, just read the code below — it definitely works.
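One note before the code: it is Python 2, and besides the standard library it needs two third-party packages, MySQL-python (imported as MySQLdb) and BeautifulSoup 4. A quick sanity-check sketch, assuming those are the packages you installed:

# quick dependency check (assumption: Python 2 with MySQL-python and beautifulsoup4)
import MySQLdb
import bs4
print MySQLdb.__version__, bs4.__version__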

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# crawler for http://www.guandn.com/
__author__ = 'haoning'

import urllib
import urllib2
import time
import re
import json
import uuid
import platform
import os
import sys
import cookielib
import MySQLdb as mdb
from bs4 import BeautifulSoup

reload(sys)
sys.setdefaultencoding( "utf-8" )

headers = {
    'User-Agent' : 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0',
    'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
    'X-Requested-With':'XMLHttpRequest',
    'Referer':'https://www.zhihu.com/topics',
    'Cookie':'__utma=51854390.517069884.1416212035.1416212035.1416212035.1; q_c1=c02bf44d00d240798bfabcfc95baeb56|1455778173000|1416205243000; _za=b1c8ae35-f986-46a2-b24a-cb9359dc6b2a; aliyungf_tc=AQAAAJ1m71jL1woArKqF22VFnL/wRy6C; _xsrf=9d494558f9271340ab24598d85b2a3c8; cap_id="MDNiMjcwM2U0MTRhNDVmYjgxZWVhOWI0NTA2OGU5OTg=|1455864276|2a4ce8247ebd3c0df5393bb5661713ad9eec01dd"; n_c=1; _alicdn_sec=56c6ba4d556557d27a0f8c876f563d12a285f33a'
}

DB_HOST = '127.0.0.1'
DB_USER = 'root'
DB_PASS = 'root'

conn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'zhihu', charset='utf8')
conn.autocommit(False)
curr = conn.cursor()

def get_html(url):
    try:
        req = urllib2.Request(url)
        response = urllib2.urlopen(req,None,20) # a proxy should be added here
        html = response.read()
        return html
    except:
        print "timeout"
    return None

def getTopics():
    url = 'https://www.zhihu.com/topics'
    print url
    try:
        req = urllib2.Request(url)
        response = urllib2.urlopen(req) # a proxy should be added here
        html = response.read().decode('utf-8')
        #print html
        soup = BeautifulSoup(html)
        lis = soup.find_all('li', {'class' : 'zm-topic-cat-item'})

        for li in lis:
            data_id=li.get('data-id')
            name=li.text
            curr.execute('select id from classify_new where name=%s',(name,))
            y= curr.fetchone()
            if not y:
                curr.execute('INSERT INTO classify_new(data_id,name)VALUES(%s,%s)',(data_id,name))
        conn.commit()
    except Exception as e:
        print "get topic error",e
        

def get_extension(name):  
    where=name.rfind('.')
    if where!=-1:
        return name[where:len(name)]
    return None


def which_platform():
    sys_str = platform.system()
    return sys_str

def GetDateString():
    when=time.strftime('%Y-%m-%d',time.localtime(time.time()))
    foldername = str(when)
    return foldername 

def makeDateFolder(par,classify):
    try:
        if os.path.isdir(par):
            newFolderName=par + '//' + GetDateString() + '//' +classify
            if which_platform()=="Linux":
                newFolderName=par + '/' + GetDateString() + "/" +classify
            if not os.path.isdir( newFolderName ):
                os.makedirs( newFolderName )
            return newFolderName
        else:
            return None 
    except Exception,e:
        print "makeDateFolder error",e
    return None 

def download_img(url,classify):
    try:
        extention=get_extension(url)
        if(extention is None):
            return None
        req = urllib2.Request(url)
        resp = urllib2.urlopen(req,None,15)
        dataimg=resp.read()
        name=str(uuid.uuid1()).replace("-","")+"_www.guandn.com"+extention
        top="E://topic_pic"
        folder=makeDateFolder(top, classify)
        if folder is None:
            return None
        filename = folder+"//"+name
        #print "filename",filename
        try:
            if "e82bab09c_xs" not in str(url):
                if not os.path.exists(filename):
                    file_object = open(filename,'w+b')
                    file_object.write(dataimg)
                    file_object.close()
                    return GetDateString()+'/'+classify+"/"+name
                else:
                    print "file exist"
                    return None
        except IOError,e1:
            print "write image failed",e1
            pass
    except Exception as e:
        print "download_img error",e
        pass
    return None # if the image was not downloaded, fall back to the original site's link
    
                
def get_topis(top_id,topic_name):
    url = 'https://www.zhihu.com/node/TopicsPlazzaListV2'
    isGet = True
    offset = -20
    top_id=str(top_id)
    while isGet:
        offset = offset + 20
        values = {'method': 'next', 'params': '{"topic_id":'+top_id+',"offset":'+str(offset)+',"hash_id":""}'}
        try:
            data = urllib.urlencode(values)
            request = urllib2.Request(url,data,headers)
            response = urllib2.urlopen(request)
            html=response.read().decode('utf-8')
            if html is None:
                return
            json_str = json.loads(html)
            ms=json_str['msg']
            if len(ms) <5:
                break
            msg=ms[0]
            #print msg
            soup = BeautifulSoup(str(msg))
            blks = soup.find_all('div', {'class' : 'blk'})
            for blk in blks:
                page=blk.find('a').get('href')
                if page is not None:
                    node=page.replace("/topic/","")
                    print node,page
        except urllib2.URLError, e:
            print "error is",e
            pass
                

def work():
    #getTopics() # fetch the topic categories first (run this once to fill classify_new)
    curr.execute('select data_id,name from classify_new')
    results = curr.fetchall()
    for r in results:
        data_id=r[0]
        name=r[1]
        get_topis(data_id,name)
        
if __name__ == '__main__':
    for i in range(40):
        work()
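
Note that work() only reads from classify_new, while the getTopics() call that fills that table is commented out. My reading of the intended flow (an assumption, the post doesn't spell it out) is: seed the table once with getTopics(), then loop over work(). A minimal sketch:

def main():
    # assumed flow: seed classify_new once, then crawl every stored category
    getTopics()              # run once to fill classify_new with topic categories
    for _ in range(40):      # same repeat count as the loop above
        work()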

  

A word about the database: I'm not attaching a dump here. Just look at the fields used in the code and create the table yourself; it really is that simple. I use MySQL, so build it however your own needs dictate (a minimal schema sketch follows below).
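
For reference, here is a minimal schema sketch for the classify_new table, reusing the curr cursor from the script; only the column names (id, data_id, name) come from the code, the types are my own guesses:

# minimal schema sketch; column types are assumptions, names come from the code above
curr.execute("""
    CREATE TABLE IF NOT EXISTS classify_new (
        id      INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
        data_id INT NOT NULL,            -- the data-id attribute of the topic <li>
        name    VARCHAR(255) NOT NULL    -- the topic category name
    ) DEFAULT CHARSET=utf8
""")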

If anything is unclear, come find me on 去轉盤網 (quzhuanpan), which I also built; the QQ group number is kept up to date there. I'm not leaving a QQ number here, to avoid getting the post banned by the system.
