1. 程式人生 > >QQ群資訊爬取

QQ群資訊爬取

需要安裝谷歌瀏覽器,並下載與瀏覽器版本相符的 chromedriver.exe(Selenium 驅動 Chrome 所需的驅動程式),放到 Python 的安裝路徑下(或其他已加入 PATH 的目錄)

#coding=utf-8
from lxml import etree
import time
from selenium import webdriver

class qqGroupSpider():
    """Scrape the member table of a QQ group from the qun.qq.com member page.

    Constructing an instance opens the group's member page in the given
    Selenium driver and submits the account/password login form.  The other
    methods scroll the page, locate the member ``<tbody>`` elements, parse
    their rows and write one comma-separated record per member to
    ``writefile``.
    """

    def __init__(self, driver, qq, passwd, qqgroup, writefile):
        """Open the member page of ``qqgroup`` and log in.

        :param driver: Selenium WebDriver used for all page interaction.
        :param qq: QQ account typed into the login form.
        :param passwd: password typed into the login form.
        :param qqgroup: group number appended to the member-page URL.
        :param writefile: writable file object that receives the records.
        """
        url = "https://qun.qq.com/member.html#gid=" + str(qqgroup)
        self.writefile = writefile
        self.driver = driver
        driver.delete_all_cookies()
        driver.get(url)
        time.sleep(1)
        # The login form lives inside its own iframe.
        driver.switch_to.frame("login_frame")
        time.sleep(1)
        # Switch the login widget to the account/password form.
        driver.find_element_by_id("switcher_plogin").click()
        # Account field.
        driver.find_element_by_id('u').clear()
        driver.find_element_by_id('u').send_keys(qq)
        # Password field.
        driver.find_element_by_id('p').clear()
        driver.find_element_by_id('p').send_keys(passwd)
        driver.find_element_by_class_name("login_button").click()
        time.sleep(1)

    def scroll_foot(self, driver):
        """Scroll the page down once by setting a very large scrollTop.

        :param driver: Selenium WebDriver showing the member page.
        :return: whatever ``driver.execute_script`` returns for the script.
        """
        js = "var q=document.documentElement.scrollTop=100000"
        return driver.execute_script(js)

    def getTbodyList(self, driver):
        """Return the member ``<tbody>`` elements currently on the page.

        :param driver: Selenium WebDriver showing the member page.
        :return: list of WebElements matched by the XPath below.
        """
        # NOTE(review): the "group-memeber" spelling is kept exactly as in
        # the original; presumably it matches the page's markup -- verify
        # before "correcting" it.
        return driver.find_elements_by_xpath(
            '//div[@class="group-memeber"]//tbody[contains(@class,"list")]')

    def parseMember(self, mb):
        """Turn one member ``<tr>`` element into a comma-separated record.

        The record fields are, in order: admin flag ('1' when the first
        child of the name cell carries a ``class`` attribute, else '0'),
        QQ number, nickname, group card, sex, Q-age, join time and last
        spoke time.  Cells without text contribute an empty field.

        :param mb: row element; indexed like a list of its ``<td>`` children.
        :return: the record encoded as UTF-8.
        """
        # Indexing an element yields its children; this replaces the
        # deprecated getchildren() calls.
        cells = list(mb)
        name_cell = cells[2]
        master = '0' if name_cell[0].get('class') is None else '1'
        fields = [
            master,
            _stripped_text(cells[4]),      # QQ number
            _stripped_text(name_cell[2]),  # nickname
            _stripped_text(cells[3][0]),   # group card
            _stripped_text(cells[5]),      # sex
            _stripped_text(cells[6]),      # Q-age
            _stripped_text(cells[7]),      # join time
            _stripped_text(cells[8]),      # last spoke time
        ]
        return ",".join(fields).encode('utf-8')

    def parseTbody(self, html):
        """Parse the inner HTML of one ``<tbody>`` into member records.

        :param html: HTML fragment holding the member ``<tr>`` rows.
        :return: list with one ``parseMember`` record per matched row.
        """
        selector = etree.HTML(html)
        rows = selector.xpath('//tr[contains(@class,"mb mb")]')
        return [self.parseMember(row) for row in rows]

    def parseAndWrite(self, tbody):
        """Parse one ``<tbody>`` WebElement and write its records to the file.

        :param tbody: WebElement whose ``innerHTML`` holds the member rows.
        :return: None
        """
        html = tbody.get_attribute('innerHTML')
        # A plain loop: the writes are side effects, not a collection.
        for member in self.parseTbody(html):
            self.writefile.write(member + '\n')


def _stripped_text(node):
    """Return ``node.text`` stripped, or '' when the element has no text."""
    return (node.text or '').strip()


# Stop scrolling after this many consecutive scrolls that load no new rows,
# so a member count that is never reached cannot loop forever.
MAX_IDLE_SCROLLS = 10


def main():
    """Prompt for credentials, then dump a QQ group's members to a file.

    Written for Python 2 (uses ``raw_input`` and ``unicode``).  Records are
    written to ``qq/<name>.txt``; the ``qq`` directory is created when it
    does not exist.  The browser and the output file are released even if
    scraping fails part-way.
    """
    import os

    qq = str(raw_input("請輸入你的QQ:"))
    passwd = str(raw_input("請輸入你的QQ密碼:"))
    qqgroup = raw_input("請輸入QQ群號:")
    filename = str(raw_input("請輸入儲存的檔名:"))

    # Output goes under the "qq" directory; create it on first use.
    if not os.path.isdir('qq'):
        os.makedirs('qq')

    with open(unicode('qq/' + filename + '.txt', 'utf-8'), 'w') as outfile:
        driver = webdriver.Chrome()
        try:
            spider = qqGroupSpider(driver, qq, passwd, qqgroup, outfile)
            # Member count shown on the page.
            qqNum = int(driver.find_element_by_xpath(
                '//*[@id="groupMemberNum"]').text.strip())
            curren_qq_num = 0
            count = 0
            prelen = 0
            idle_scrolls = 0
            # Keep scrolling to the bottom, extracting newly loaded tbodies
            # as they appear, until every member row is loaded or the page
            # stops producing new rows.
            while curren_qq_num < qqNum and idle_scrolls < MAX_IDLE_SCROLLS:
                count = count + 1
                print(count)
                spider.scroll_foot(driver)
                time.sleep(1)
                loaded = len(driver.find_elements_by_xpath(
                    '//*[@id="groupMember"]//td[contains(@class,"td-no")]'))
                if loaded == curren_qq_num:
                    idle_scrolls += 1
                else:
                    idle_scrolls = 0
                curren_qq_num = loaded
                tlist = spider.getTbodyList(driver)
                for tbody in tlist[prelen:]:
                    spider.parseAndWrite(tbody)
                # Remember how many tbodies have already been written.
                prelen = len(tlist)
        finally:
            driver.quit()


if __name__ == '__main__':
    main()

執行如下:
在這裡插入圖片描述
在這裡插入圖片描述
爬取出的資料儲存成文字檔案,每行一位成員,格式(依上面程式碼的輸出順序):是否群管理,QQ號,暱稱,群名片,性別,Q齡,入群時間,最後發言(上面的程式碼並未輸出「等級」欄位)

我的郵箱:[email protected]
我的GitHub賬號:https://github.com/LoyalWilliams
我建了一個大資料的學習交流群
QQ:2541692705
Q群:882855741
微信公眾號:程式國度
在這裡插入圖片描述