爬取qq群資訊
阿新 • • 發佈:2021-01-26
技術標籤:python
獲取cookie和bkn引數
from selenium import webdriver
from selenium.webdriver import ChromeOptions
import time
import pyautogui
import redis
member_url = 'https://qun.qq.com/member.html#gid=453987149'
# NOTE(review): this script must capture the Cookie, bkn, and every group
# number; the crawler script later iterates the stored group numbers while
# reusing the same Cookie/bkn values.
chrome_opts = ChromeOptions()
# Hide the "Chrome is being controlled by automated software" banner so the
# site is less likely to detect automation.
chrome_opts.add_experimental_option('excludeSwitches', ['enable-automation'])
# Force a Chinese locale.
chrome_opts.add_argument('lang=zh_CN.UTF-8')
# Spoof a regular desktop user agent.
chrome_opts.add_argument('user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36"' )
browser = webdriver.Chrome(options=chrome_opts)
# Open the member page; an unauthenticated visit shows the QQ login overlay.
browser.get(member_url)
browser.maximize_window()
time.sleep(5)  # give the login iframe time to render
# Click the avatar/login button by screen coordinates. The DOM alternative
# would be browser.switch_to.frame('ptlogin_iframe') followed by an XPath
# click, but the coordinate click avoids dealing with the iframe.
pyautogui.click(1016, 595)
time.sleep(2)  # wait for the login redirect to settle
# Pull the QQ login cookies out of the Selenium session and join the ones the
# qun.qq.com API needs into a single Cookie-header string (module-level
# `cookies`, imported by the crawler script).
#
# The original if/elif chain assigned one variable per cookie and crashed
# with NameError when any expected cookie (e.g. skey) was absent; it also
# collected pgv_pvid without ever using it. A dict lookup skips absent
# cookies instead of crashing.
_cookie_jar = {c['name']: c['value'] for c in browser.get_cookies()}
# Cookies required by qun.qq.com, in the order the header is assembled.
_wanted_cookies = ('RK', 'ptcz', '_qpsvr_localtk', 'uin', 'skey',
                   'p_uin', 'pt4_token', 'p_skey', 'traceid')
# Same "name=value; " format (including the trailing separator) as before.
cookies = ''.join(name + '=' + _cookie_jar[name] + '; '
                  for name in _wanted_cookies if name in _cookie_jar)
print(cookies)
# Run the page's own JS helper to obtain the bkn (CSRF token) parameter.
bkn = browser.execute_script('return $.getCSRFToken()')
# Print the captured value instead of executing the script a second time
# (the original issued a redundant round-trip to the browser).
print(bkn)
寫提取群號和爬蟲邏輯(以下為第二個 py 檔案;它透過 from qq.getBkn import bkn, cookies 從上一個檔案取得登入引數)
import requests
import json
import time
from qq.getBkn import bkn,cookies
import xlwt
import random
def getqun():
    """Fetch the signed-in account's QQ group list from qun.qq.com.

    Populates the module-level globals that the __main__ section relies on:
      user_agent -- pool of User-Agent strings reused for later requests
      qunid      -- list of group numbers ('gc' field of each joined group)
      qunname    -- list of group names  ('gn' field of each joined group)

    Relies on ``bkn`` and ``cookies`` imported from qq.getBkn; ``bkn`` acts
    as the CSRF token for the endpoint.
    """
    global user_agent
    user_agent = [
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36"]
    headers = {
        # random.choice is the idiomatic way to pick one UA from the pool.
        'User-Agent': random.choice(user_agent),
        'Cookie': str(cookies)
    }
    global qunid, qunname
    url = 'https://qun.qq.com/cgi-bin/qun_mgr/get_group_list'
    res = requests.post(url, headers=headers, data={'bkn': str(bkn)})
    print(res.text)
    a = res.json()  # equivalent to json.loads(res.text)
    print(a)
    # "join" lists the groups the account has joined.
    qunid = [group['gc'] for group in a["join"]]
    qunname = [group['gn'] for group in a["join"]]
    print(qunid)
    print(qunname)
if __name__ == '__main__':
    # Fetch the group list first; getqun() fills the module-level globals
    # qunid / qunname / user_agent used below.
    getqun()
    length = len(qunid)
    # Crawl the member list of each group, one Excel workbook per group.
    for q in range(0,length):
        time.sleep(5)  # throttle between groups to avoid rate limiting
        print(qunname[q])
        book = xlwt.Workbook(encoding='utf-8', style_compression=0)
        sheet = book.add_sheet('群成員資訊', cell_overwrite_ok=True)
        # Header row: account, join time, last speak time, group nickname,
        # QQ nickname, QQ age.
        col = ('賬號', '加入時間', '最後發言時間', '群暱稱', 'qq暱稱','q齡')
        for inser in range(0, 6):
            sheet.write(0, inser, col[inser])
        # [i, j] is the member index window requested per page (21 per page).
        i=0
        j=20
        headers = {
            'User-Agent':user_agent[random.randint(0, len(user_agent)-1)],
            'Cookie':str(cookies)
        }
        url = 'https://qun.qq.com/cgi-bin/qun_mgr/search_group_members'
        # First request: fetch page one and the totals needed for paging.
        json_data = {
            'gc':str(qunid[q]),
            'st':str(i),
            'end':str(j),
            'sort':'0',
            'bkn':str(bkn)
        }
        i = i+21
        j= j+21
        res = requests.post(url, headers=headers,data = json_data)
        html_str = res.text
        a = json.loads(html_str)  # parse the JSON response into a dict
        # Total number of members in the group.
        person = a["count"]
        # Derive the page count and the size of the last (partial) page.
        c=int(person/21)  # number of full pages (21 members per request)
        if person>21 and person<42:
            c=int(person/21)+1
        d=person%21  # member count on the final, partial page
        print(c,d)
        down = 1  # next spreadsheet row to write
        for lala in a["mems"]:
            joinTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(lala['join_time']))
            last_speak_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(lala['last_speak_time']))
            data = [lala['uin'],joinTime,last_speak_time,lala['card'],lala['nick'],lala['qage']]
            for inser in range(0,6):
                sheet.write(down, inser, data[inser])
            down += 1
            print("賬號:",str(lala['uin']),"加入時間:",joinTime,"最後發言時間:",last_speak_time,"群暱稱",lala['card'],"qq暱稱:",lala['nick'],lala['qage'])
        # With the paging parameters known, loop over the middle pages.
        if c>=2:
            code = 1
            for flag in range(1,c):
                code+=1
                json_data = {
                    'gc':str(qunid[q]),
                    'st': str(i),
                    'end': str(j),
                    'sort': '0',
                    'bkn': str(bkn)
                }
                i = i + 21
                j = j + 21
                res = requests.post(url, headers=headers, data=json_data)
                html_str = res.text
                a = json.loads(html_str)  # parse the JSON response into a dict
                for lala in a["mems"]:
                    joinTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(lala['join_time']))
                    last_speak_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(lala['last_speak_time']))
                    data = [lala['uin'], joinTime, last_speak_time, lala['card'], lala['nick'], lala['qage']]
                    for inser in range(0, 6):
                        sheet.write(down, inser, data[inser])
                    down += 1
                    print("賬號:", str(lala['uin']), "加入時間:", joinTime, "最後發言時間:", last_speak_time, "群暱稱", lala['card'],
                          "qq暱稱:", lala['nick'], lala['qage'])
        # Fetch the final, partial page.
        # NOTE(review): the `c !=2` guard looks suspicious — presumably the
        # intent is "there is a remainder page to fetch"; confirm against the
        # paging arithmetic above before relying on it.
        if c !=2:
            if d!=0:  # only when the last partial page actually exists
                j=j-20+d
                json_data = {
                    'gc':str(qunid[q]),
                    'st': str(i),
                    'end': str(j),
                    'sort': '0',
                    'bkn': str(bkn)
                }
                res = requests.post(url, headers=headers, data=json_data)
                html_str = res.text
                a = json.loads(html_str)  # parse the JSON response into a dict
                for lala in a["mems"]:
                    joinTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(lala['join_time']))
                    last_speak_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(lala['last_speak_time']))
                    data = [lala['uin'], joinTime, last_speak_time, lala['card'], lala['nick'], lala['qage']]
                    for inser in range(0, 6):
                        sheet.write(down, inser, data[inser])
                    down += 1
                    print("賬號:", str(lala['uin']), "加入時間:", joinTime, "最後發言時間:", last_speak_time, "群暱稱", lala['card'],
                          "qq暱稱:", lala['nick'], lala['qage'])
        # Save the workbook, named after the group.
        savepath = 'C:/Users/Administrator/Desktop/shuju/'+str(qunname[q])+'.xls'
        book.save(savepath)
        print(a["mems"])
        print(html_str)
        print(c,d)
最後參考下面的部落格把py檔案合併成exe
pyinstaller打包py檔案壓縮成exe
最後要記得將瀏覽器驅動放在exe所在的目錄下