QQ群資訊爬取
阿新 • • 發佈:2019-02-19
需要安裝谷歌瀏覽器,並下載與瀏覽器版本相符的 chromedriver.exe,放到 Python 的安裝路徑下(或任何已加入 PATH 的目錄)
#coding=utf-8
from lxml import etree
import time
from selenium import webdriver
class qqGroupSpider():
    '''
    Scraper for the member table of a QQ group management page.

    The constructor logs in through the supplied Selenium driver; the other
    methods scroll the page, collect the member ``<tbody>`` elements and
    write one comma-separated line per member to ``writefile``.
    '''

    def __init__(self, driver, qq, passwd, qqgroup, writefile):
        '''
        Open the member page of ``qqgroup`` and submit the password login form.

        :param driver: Selenium WebDriver used for all page interaction
        :param qq: account number typed into the ``u`` input
        :param passwd: password typed into the ``p`` input
        :param qqgroup: group number appended to the member-page URL
        :param writefile: open file object; ``write()`` is called with
            UTF-8 encoded byte strings by :meth:`parseAndWrite`
        '''
        url = "https://qun.qq.com/member.html#gid=" + str(qqgroup)
        self.writefile = writefile
        self.driver = driver
        driver.delete_all_cookies()
        driver.get(url)
        # Fixed pauses give the page and the login iframe time to render;
        # nothing here verifies that they actually finished loading.
        time.sleep(1)
        driver.switch_to.frame("login_frame")  # enter the login iframe
        time.sleep(1)
        # Switch from the default login panel to the account/password form.
        driver.find_element_by_id("switcher_plogin").click()
        driver.find_element_by_id('u').clear()  # account input
        driver.find_element_by_id('u').send_keys(qq)
        driver.find_element_by_id('p').clear()  # password input
        driver.find_element_by_id('p').send_keys(passwd)
        driver.find_element_by_class_name("login_button").click()
        time.sleep(1)

    def scroll_foot(self, driver):
        '''
        Scroll the document down by setting ``scrollTop`` to 100000.

        :param driver: Selenium WebDriver showing the member page
        :return: whatever ``driver.execute_script`` returns for the statement
        '''
        js = "var q=document.documentElement.scrollTop=100000"
        return driver.execute_script(js)

    def getTbodyList(self, driver):
        '''
        Return the member ``<tbody>`` elements currently present in the page.

        :param driver: Selenium WebDriver showing the member page
        :return: list of WebElements matched by the XPath below
        '''
        # NOTE(review): "group-memeber" is presumably the page's own (misspelled)
        # class name; verify against the live markup before "correcting" it.
        return driver.find_elements_by_xpath(
            '//div[@class="group-memeber"]//tbody[contains(@class,"list")]')

    def parseMember(self, mb):
        '''
        Build the comma-separated record for one member row.

        Field order: master flag, QQ number, nickname, group card, sex,
        QQ age, join time, last-spoke time.  The master flag is ``'1'`` when
        the first child of the third cell carries a ``class`` attribute and
        ``'0'`` otherwise.

        :param mb: the ``<tr>`` element of one member (needs at least nine cells)
        :return: the record as a UTF-8 encoded byte string, without newline
        '''
        def cell_text(node):
            # ``.text`` is None for an element without text (for example an
            # empty group card), so fall back to '' instead of crashing.
            return (node.text or '').strip()

        # list(element) replaces the deprecated getchildren() call.
        cells = list(mb)
        member_cell = list(cells[2])
        master = '0' if member_cell[0].get('class') is None else '1'
        nickName = cell_text(member_cell[2])
        card = cell_text(list(cells[3])[0])
        qq = cell_text(cells[4])
        sex = cell_text(cells[5])
        qqAge = cell_text(cells[6])
        joinTime = cell_text(cells[7])
        lastTime = cell_text(cells[8])
        # NOTE: values are not escaped, so a comma inside a nickname or card
        # yields an extra column in the output line.
        record = ",".join(
            [master, qq, nickName, card, sex, qqAge, joinTime, lastTime])
        return record.encode('utf-8')

    def parseTbody(self, html):
        '''
        Parse the inner HTML of one ``<tbody>`` into member records.

        :param html: HTML fragment holding zero or more member ``<tr>`` rows
        :return: list with one :meth:`parseMember` result per matching row
        '''
        selector = etree.HTML(html)
        rows = selector.xpath('//tr[contains(@class,"mb mb")]')
        # A list comprehension returns a real list regardless of whether
        # ``map`` is eager or lazy in the running interpreter.
        return [self.parseMember(row) for row in rows]

    def parseAndWrite(self, tbody):
        '''
        Parse one ``<tbody>`` WebElement and append its members to the file.

        :param tbody: Selenium WebElement whose ``innerHTML`` holds member rows
        :return: None
        '''
        html = tbody.get_attribute('innerHTML')
        # Explicit loop: the writes are side effects and must not depend on
        # ``map`` consuming its input.
        for line in self.parseTbody(html):
            self.writefile.write(line + b'\n')
def main():
    '''
    Prompt for credentials, log in, and dump a QQ group's members to a file.

    Reads the account, password, group number and output name from standard
    input, writes ``qq/<name>.txt``, and keeps scrolling the member page
    until the number of loaded rows reaches the total shown by the page (or
    the page stops delivering new rows).  The browser is closed and the file
    flushed even when an error interrupts the run.

    Targets Python 2 as written (``raw_input`` / ``unicode``).
    '''
    import os  # local import: only main() needs to create the output folder

    # Give up after this many consecutive scrolls that load no new member,
    # otherwise a count mismatch would keep the loop running forever.
    max_stalled_rounds = 30

    qq = str(raw_input("請輸入你的QQ:"))
    passwd = str(raw_input("請輸入你的QQ密碼:"))
    qqgroup = raw_input("請輸入QQ群號:")
    filename = str(raw_input("請輸入儲存的檔名:"))

    # Results are stored under ./qq; create the folder when it is missing so
    # open() does not fail after the user has already typed everything in.
    if not os.path.isdir('qq'):
        os.makedirs('qq')
    out_path = unicode('qq/' + filename + '.txt', 'utf-8')

    with open(out_path, 'w') as outfile:
        driver = webdriver.Chrome()
        try:
            spider = qqGroupSpider(driver, qq, passwd, qqgroup, outfile)
            # Total member count as displayed by the page.
            qqNum = int(driver.find_element_by_xpath(
                '//*[@id="groupMemberNum"]').text.strip())
            curren_qq_num = 0
            count = 0
            prelen = 0
            stalled = 0
            while curren_qq_num < qqNum:
                # Keep scrolling to the bottom and extract rows as they load.
                count += 1
                print(count)
                spider.scroll_foot(driver)
                time.sleep(1)
                loaded = len(driver.find_elements_by_xpath(
                    '//*[@id="groupMember"]//td[contains(@class,"td-no")]'))
                tlist = spider.getTbodyList(driver)
                # Only the tbody elements added since the last round are new.
                for tbody in tlist[prelen:]:
                    spider.parseAndWrite(tbody)
                prelen = len(tlist)
                if loaded == curren_qq_num:
                    stalled += 1
                    if stalled >= max_stalled_rounds:
                        print("No new members after %d scrolls; stopping at %d of %d."
                              % (stalled, loaded, qqNum))
                        break
                else:
                    stalled = 0
                curren_qq_num = loaded
        finally:
            # Always shut the browser down, even if scraping failed midway.
            driver.quit()
# Run the scraper only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
執行如下:
爬取出的資料儲存成文字檔案,每行一位成員,以逗號分隔,格式:是否群管理,QQ號,暱稱,群名片,性別,Q齡,入群時間,最後發言(程式實際輸出這八個欄位,不含等級)
我的郵箱:[email protected]
我的GitHub賬號:https://github.com/LoyalWilliams
我建了一個大資料的學習交流群
QQ:2541692705
Q群:882855741
微信公眾號:程式國度