python獲取自己發的說說內容
阿新 • • 發佈:2019-02-14
一、模擬登陸
import re
from time import sleep


def getGTK(cookie):
    """Derive the GTK value from the login cookies.

    Starts from 5381 and, for every character of ``cookie['p_skey']``, adds
    ``(hashes << 5) + ord(character)``; the total is masked to 31 bits.

    Args:
        cookie: Mapping of cookie names to values; must contain ``'p_skey'``.

    Returns:
        The GTK as a non-negative int no larger than ``0x7fffffff``.

    Raises:
        KeyError: If ``cookie`` has no ``'p_skey'`` entry.
    """
    hashes = 5381
    for letter in cookie['p_skey']:
        hashes += (hashes << 5) + ord(letter)
    return hashes & 0x7fffffff


# Define the QQ Zone login function.
def QR_login(phantomjs_path=r"D:\python\phantomjs.exe", scan_wait=10):
    """Log in to QQ Zone by QR code and collect the session credentials.

    Opens https://qzone.qq.com/ in PhantomJS, saves a screenshot to
    ``QZone.png`` and displays it so the QR code can be scanned with a phone,
    then reads the cookies and the page source of the resulting page.

    Args:
        phantomjs_path: Path to the PhantomJS executable. Change the default
            to wherever PhantomJS lives on your machine.
        scan_wait: Seconds to wait for the QR code to be scanned; tune it to
            your network speed and machine.

    Returns:
        A tuple ``(cookie, gtk, qzonetoken)``: the cookie dict, the value of
        ``getGTK(cookie)``, and the text captured from the page's
        ``window.g_qzonetoken`` script.

    Raises:
        RuntimeError: If the ``p_skey`` cookie or the ``g_qzonetoken`` script
            is missing after the wait (most likely the QR code was not
            scanned in time).
    """
    # Imported here rather than at module level so that getGTK stays
    # importable on machines without selenium / Pillow installed.
    from selenium import webdriver
    from PIL import Image

    # NOTE(review): recent selenium releases no longer ship the PhantomJS
    # driver; this needs a selenium version that still provides it.
    browser = webdriver.PhantomJS(executable_path=phantomjs_path)
    try:
        url = "https://qzone.qq.com/"  # QQ login page
        browser.get(url)
        browser.maximize_window()
        sleep(3)  # give the page time to render the QR code
        browser.get_screenshot_as_file('QZone.png')  # save a screenshot
        im = Image.open('QZone.png')
        im.show()  # scan the displayed QR code with the phone
        sleep(scan_wait)
        print(browser.title)  # print the page title
        cookie = {elem['name']: elem['value'] for elem in browser.get_cookies()}
        print('Get the cookie of QQlogin successfully!(共%d個鍵值對)' % (len(cookie)))
        html = browser.page_source  # keep the page source
        print(html)
    finally:
        # Always terminate the headless browser, even when a step above fails.
        browser.quit()

    # Extract g_qzonetoken from the page source.
    g_qzonetoken = re.search(
        r'window\.g_qzonetoken = \(function\(\)\{ try\{return (.*?);\} catch\(e\)',
        html)
    if 'p_skey' not in cookie:
        raise RuntimeError(
            'login did not complete: no p_skey cookie after %s seconds' % scan_wait)
    if g_qzonetoken is None:
        raise RuntimeError('g_qzonetoken was not found in the page source')
    gtk = getGTK(cookie)  # compute gtk from the cookies
    return (cookie, gtk, g_qzonetoken.group(1))


if __name__ == "__main__":
    QR_login()
二、評論獲取
import re
import datetime
from time import sleep
from urllib import parse

COMMENT_URL = ('https://h5.qzone.qq.com/proxy/domain/taotao.qzone.qq.com'
               '/cgi-bin/emotion_cgi_addcomment_ugc')
MSGLIST_URL = ('https://h5.qzone.qq.com/proxy/domain/taotao.qq.com'
               '/cgi-bin/emotion_cgi_msglist_v6')


def parse_comment_id(response_text):
    """Return the comment id captured from an add-comment response.

    Args:
        response_text: Body of the add-comment response.

    Returns:
        The text between ``"id":`` and ``,"postTime"``, or ``None`` when the
        response contains no such pair.
    """
    found = re.findall('"id":(.*?),"postTime"', response_text)
    return found[0] if found else None


def extract_topic_ids(text, target_qq):
    """Build the topic id of every post found in a message-list response.

    The response is split on ``{"certified"``; the part before the first
    marker is ignored. In each remaining chunk the span from
    ``"commentlist":`` to ``"conlist":`` is removed before the tid is read.
    Chunks without a tid are skipped.

    Args:
        text: Body of the message-list response.
        target_qq: QQ number the posts were requested for.

    Returns:
        A list of ``'<target_qq>_<tid>'`` strings in response order.
    """
    topic_ids = []
    for chunk in re.split(r'\{"certified"', text)[1:]:
        chunk = re.sub('"commentlist":.*?"conlist":', '', chunk)
        tids = re.findall('"t1_termtype":.*?"tid":"(.*?)"', chunk)
        if not tids:
            continue
        topic_ids.append(target_qq + '_' + str(tids[0]))
    return topic_ids


def comment(my_qq, target_qq, topicid, content, gtk, qzonetoken, cookie,
            session=None, log_path='target_qq.txt'):
    """Post one comment and record the resulting comment id.

    Args:
        my_qq: QQ number posting the comment.
        target_qq: QQ number that owns the post.
        topicid: Topic id of the post, as built by ``extract_topic_ids``.
        content: Comment text.
        gtk: ``g_tk`` query parameter.
        qzonetoken: ``qzonetoken`` query parameter.
        cookie: Cookie dict sent with the request.
        session: Object with a requests-style ``request`` method. A new
            ``requests`` session is created when omitted.
        log_path: File that ``"<topicid> <commentid>"`` lines are appended to.

    Returns:
        ``True`` if the response contained a comment id, otherwise ``False``.
    """
    if session is None:
        # Imported lazily so the parsing helpers above can be imported
        # without the requests package installed.
        import requests
        session = requests.session()

    data = {
        'qzreferrer': 'https://qzs.qq.com/qzone/app/mood_v6/html/index.html#mood&uin=790178228&pfid=2&qz_ver=8&appcanvas=0&qz_style=35&params=&entertime=1498019616488&canvastype=&cdn_use_https=1',
        'uin': my_qq,
        'hostUin': target_qq,
        'topicId': topicid,
        'commentUin': my_qq,
        'content': content,
        'richval': '',
        'richtype': '',
        'inCharset': '',
        'outCharset': '',
        'ref': '',
        'private': '0',
        'with_fwd': '0',
        'to_tweet': '0',
        'hostuin': my_qq,
        'code_version': '1',
        'format': 'fs',
    }
    comment_data = parse.urlencode(data)
    comment_params = {
        'g_tk': gtk,
        'qzonetoken': qzonetoken,
    }
    # No Content-Length here: the original sent str(data) (the repr of the
    # dict) as its value. requests derives the header from the body itself.
    comment_headers = {
        'Host': 'h5.qzone.qq.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0',
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Upgrade-Insecure-Requests': '1',
    }
    res = session.request('POST', COMMENT_URL, params=comment_params,
                          data=comment_data, headers=comment_headers,
                          cookies=cookie)
    print(res.status_code)
    res = res.text
    print(res)
    commentid = parse_comment_id(res)
    if commentid is None:
        print('評論失敗')
        return False
    with open(log_path, 'a') as log:
        log.write(str(topicid) + ' ' + str(commentid) + '\n')
    print('評論成功')
    return True


headers = {
    'Host': 'h5.qzone.qq.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0',
    'Accept': '*/*',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Referer': 'https://user.qzone.qq.com/790178228?_t_=0.22746974226377736',
    'Connection': 'keep-alive',
}
# NOTE(review): session cookies are hard-coded in source. They are
# credentials; prefer obtaining them at run time from the login step.
cookie = {'QZ_FE_WEBP_SUPPORT': '0', 'ptcz': '7cac1c7521b1ad8be9b1489f9b0aaba8efe9500f0f5dcb7693a9f693d37a8dff', 'fnc': '2', 'skey': '@F6CRfhQVd', 'pgv_si': 's493469696', 'ptui_loginuin': '790178228', 'RK': 'gYFn6+IOYo', 'pt2gguin': 'o0790178228', 'p_uin': 'o0790178228', 'rv2': '808A93A64B1A6FC5AE6D906AB5E744B38AF1EAA4163EC57A76', 'ptisp': 'ctc', 'p_skey': '5Iv6LkqOjJH*JPtrq0xqZmVlBNkbKLCRcDasiGGq71w_', '_qpsvr_localtk': '0.6656868932768703', 'pgv_pvi': '7208859648', '790178228_todaycount': '4', '__Q_w_s_hat_seed': '1', '790178228_totalcount': '24703', 'pgv_pvid': '1698820840', 'qz_screen': '1366x768', 'pt4_token': 'WeiGzJbrn*TO4HO4FFXRdiD3SpXE2UqW2Litsm-TZPw_', 'pgv_info': 'ssid=s6237051136', 'uin': 'o0790178228', 'Loading': 'Yes', 'property20': '9D827FD9F839B247CF95AA1787B450E4D22D6C9F2A76DC8C4D27798667EBB92CA7122514560889AF'}
# Fill these in before running.
gtk = ""
qzonetoken = ""
my_qq = ""
target_qq = ""
content = '加油!'


def main():
    """Comment on every post of ``target_qq``, 20 posts per page.

    Requests up to 170 pages and stops early at the first page whose body
    does not contain ``lbs``. Sleeps 180 seconds after every comment attempt.
    """
    import requests

    s = requests.session()
    cnt = 0
    for page in range(0, 170):
        pos = page * 20
        params = {
            'uin': target_qq,
            'ftype': '0',
            'sort': '0',
            'pos': pos,
            'num': '20',
            'replynum': '100',
            'g_tk': gtk,
            'callback': '_preloadCallback',
            'code_version': '1',
            'format': 'jsonp',
            'need_private_comment': '1',
            'qzonetoken': qzonetoken,
        }
        response = s.request('GET', MSGLIST_URL, params=params,
                             headers=headers, cookies=cookie)
        print(response.status_code)
        text = response.text
        if not re.search('lbs', text):
            print('全部說說評論完成,共新增評論%s條' % cnt)
            return
        for topicid in extract_topic_ids(text, target_qq):
            print(topicid)
            posted = comment(my_qq=my_qq, target_qq=target_qq, content=content,
                             topicid=topicid, gtk=gtk, qzonetoken=qzonetoken,
                             cookie=cookie, session=s)
            sleep(180)
            if posted:
                cnt = cnt + 1


if __name__ == "__main__":
    main()
三、資料抓取存入資料庫
import re
import datetime
import csv

# NOTE(review): column names (including pox_x / pox_y) are kept exactly as in
# the original statement; presumably they match the table definition.
INSERT_SQL = '''
    insert into mood(id,content,time,sitename,pox_x,pox_y,tool,comments_num,date,isTransfered,name)
    VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
'''

MSGLIST_URL = ('https://h5.qzone.qq.com/proxy/domain/taotao.qq.com'
               '/cgi-bin/emotion_cgi_msglist_v6')

# Browser-like request headers.
headers = {
    'Host': 'h5.qzone.qq.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0',
    'Accept': '*/*',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Referer': 'https://user.qzone.qq.com/790178228?_t_=0.22746974226377736',
    'Connection': 'keep-alive',
}


def _first(pattern, text, default=None):
    """Return the first capture of *pattern* in *text*, or *default* if none."""
    found = re.findall(pattern, text)
    return found[0] if found else default


def parse_mood(i, qq=None):
    """Extract the fields of one post from a chunk of the response text.

    Args:
        i: One chunk of the message-list response (the text following a
            ``{"certified"`` marker).
        qq: QQ number the chunk was requested for; used as the prefix of the
            post id. When omitted, a module-level ``qq`` variable is used, as
            the original implementation did.

    Returns:
        A dict with keys ``id``, ``Mood_cont``, ``isTransfered``, ``name``,
        ``time``, ``date``, ``tool``, ``pos_x``, ``pos_y``, ``idneme`` and
        ``cmtnum``. Optional fields that are absent are ``None``
        (``pos_x`` / ``pos_y`` default to ``0``). Returns ``None`` when the
        chunk is empty or contains no tid.

    Raises:
        NameError: If ``qq`` is omitted and no module-level ``qq`` exists.
    """
    # Drop the comment list so its fields are not mistaken for the post's.
    text = re.sub('"commentlist":.*?"conlist":', '', i)
    if not text:
        return None
    tid = _first(r'"t1_termtype":.*?"tid":"(.*?)"', text)  # post id
    if tid is None:
        return None
    if qq is None:
        qq = globals().get('qq')
        if qq is None:
            raise NameError("name 'qq' is not defined")

    # Every key the insert statement reads is present from the start, so a
    # post lacking an optional field is stored rather than dropped.
    myMood = {
        'isTransfered': False,
        'id': str(qq) + '_' + tid,
        'pos_y': 0,
        'pos_x': 0,
        'name': _first('},"name":"(.*?)",', text),
        'time': None,
        'date': None,
        # Tool the post was published with (e.g. a phone model).
        'tool': _first('"source_name":"(.*?)"', text),
        # NOTE: the key is spelled 'idneme' in the original; kept because
        # consumers of the dict read it under that name.
        'idneme': _first('"idname":"(.*?)"', text),
        'cmtnum': _first('"cmtnum":(.*?),', text),
    }

    mood_cont = re.findall(r'\],"content":"(.*?)"', text)
    if len(mood_cont) == 2:
        # Two content fields are treated as a repost: remark + original text.
        myMood["Mood_cont"] = "評語:" + mood_cont[0] + "--------->轉載內容:" + mood_cont[1]
        myMood["isTransfered"] = True
    elif len(mood_cont) == 1:
        myMood["Mood_cont"] = mood_cont[0]
    else:
        myMood["Mood_cont"] = ""

    created_time = _first(r'"created_time":(\d+)', text)
    if created_time is not None:
        stamp = datetime.datetime.fromtimestamp(int(created_time))
        myMood['date'], myMood['time'] = stamp.strftime("%Y-%m-%d %H:%M:%S").split(' ')

    pos_x = _first('"pos_x":"(.*?)"', text)
    if pos_x is not None:
        pos_y = _first('"pos_y":"(.*?)"', text)
        if pos_x:
            myMood['pos_x'] = pos_x
        if pos_y:
            myMood['pos_y'] = pos_y
    return myMood


def friend_qqs_from_rows(rows):
    """Collect QQ numbers from CSV rows.

    The first row is skipped as a header. For every other row the fourth
    column is taken and its last seven characters are removed (presumably an
    ``@qq.com`` suffix; verify against the CSV export). Rows with fewer than
    four columns and values that end up empty are skipped.

    Args:
        rows: Iterable of rows, each a sequence of strings.

    Returns:
        A list of QQ number strings.
    """
    rows = iter(rows)
    next(rows, None)  # header row
    numbers = []
    for row in rows:
        if len(row) <= 3:
            continue
        number = row[3][:-7]
        if number:
            numbers.append(number)
    return numbers


def load_friend_qqs(path='qq.csv'):
    """Read the QQ numbers listed in the CSV file at *path*."""
    with open(path, newline='') as handle:
        return friend_qqs_from_rows(csv.reader(handle))


def main():
    """Download the posts of every friend in ``qq.csv`` into the ``mood`` table.

    For each QQ number up to 10 pages of 20 posts are requested; paging stops
    at the first page whose body does not contain ``lbs``.
    """
    # Imported lazily so the parsing helpers above can be imported without
    # the scraping stack (requests, pymysql, the login module) installed.
    import pymysql
    import requests
    from qq_mood.qq import QRlogin

    friends = load_friend_qqs('qq.csv')
    # Keyword arguments: newer PyMySQL releases reject positional ones.
    conn = pymysql.connect(host='localhost', user='root', password='root',
                           database='test', charset="utf8", use_unicode=True)
    try:
        cursor = conn.cursor()
        # Obtain cookies, gtk and qzonetoken from the login function.
        cookie, gtk, qzonetoken = QRlogin.QR_login()
        s = requests.session()
        for qq in friends:
            for p in range(0, 10):
                pos = p * 20
                params = {
                    'uin': qq,
                    'ftype': '0',
                    'sort': '0',
                    'pos': pos,
                    'num': '20',
                    'replynum': '100',
                    'g_tk': gtk,
                    'callback': '_preloadCallback',
                    'code_version': '1',
                    'format': 'jsonp',
                    'need_private_comment': '1',
                    'qzonetoken': qzonetoken,
                }
                response = s.request('GET', MSGLIST_URL, params=params,
                                     headers=headers, cookies=cookie)
                print(response.status_code)  # shows whether the request succeeded
                text = response.text
                print(text)
                if not re.search('lbs', text):
                    # No 'lbs' in the body: this QQ's posts are exhausted.
                    print('%s說說下載完成' % qq)
                    break
                for i in re.split(r'\{"certified"', text)[1:]:
                    myMood = parse_mood(i, qq)
                    if myMood is None:
                        continue
                    # A failing row is reported and skipped so that a single
                    # bad record does not stop the crawl.
                    try:
                        cursor.execute(INSERT_SQL, (
                            myMood['id'], myMood["Mood_cont"], myMood['time'],
                            myMood['idneme'], myMood['pos_x'], myMood['pos_y'],
                            myMood['tool'], myMood['cmtnum'], myMood['date'],
                            myMood["isTransfered"], myMood['name']))
                        conn.commit()
                    except pymysql.Error as err:
                        conn.rollback()
                        print('insert failed for %s: %s' % (myMood['id'], err))
        print('說說全部下載完成!')
    finally:
        conn.close()


if __name__ == "__main__":
    main()
四、結果展示
引用於: https://zhuanlan.zhihu.com/p/27604277