利用appium和Android模擬器爬取微信朋友圈(解決每次重啟登入)
阿新 • • 發佈:2018-11-04
特別注意: 微信具有一定的反爬能力,在測試時發現,每次爬取任務時, 對應特定節點的ID和XPath都會發生變化,保險起見,每次重新連線手機,都要對節點ID和Xpath作更新。
同時設定引數 'noReset': True,使應用在啟動和結束時都不清空資料;若不設定該引數,用例執行完後預設會重置APP,也就是刪除APP的所有資料。
避免多次輸入使用者名稱和密碼登入,防止被封
首次登入需要呼叫login函式;之後再次執行時登入狀態已被保留,請將main中的self.login()註釋掉。
import os
from time import sleep

from appium import webdriver
from appium.webdriver.common.touch_action import TouchAction
from pymongo import MongoClient
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from config import *
from processor import Processor


class Moments():
    """Crawl WeChat Moments through an Appium session and store them in MongoDB."""

    def __init__(self):
        """
        Open the Appium session and the MongoDB collection.
        """
        # Driver configuration. 'noReset': True keeps the app data between
        # sessions so the account stays logged in.
        self.desired_caps = {
            'platformName': PLATFORM,
            'deviceName': DEVICE_NAME,
            'appPackage': APP_PACKAGE,
            'appActivity': APP_ACTIVITY,
            'noReset': True
        }
        self.driver = webdriver.Remote(DRIVER_SERVER, self.desired_caps)
        self.wait = WebDriverWait(self.driver, TIMEOUT)
        self.client = MongoClient(MONGO_URL)
        self.db = self.client[MONGO_DB]
        self.collection = self.db[MONGO_COLLECTION]
        # Converts the relative timestamps shown in the feed into dates
        self.processor = Processor()

    def login(self):
        """
        Log in to WeChat with USERNAME / PASSWORD from the config module.
        :return:
        """
        # Login button
        login_button = self.wait.until(
            EC.presence_of_element_located((By.ID, 'com.tencent.mm:id/cjk')))
        login_button.click()
        # Phone number input
        phone = self.wait.until(
            EC.presence_of_element_located((By.ID, 'com.tencent.mm:id/h2')))
        phone.set_text(USERNAME)
        # "Next" button (named next_button so the builtin next() is not shadowed)
        next_button = self.wait.until(
            EC.element_to_be_clickable((By.ID, 'com.tencent.mm:id/adj')))
        next_button.click()
        # Password input
        password = self.wait.until(
            EC.presence_of_element_located(
                (By.XPATH, '//*[@resource-id="com.tencent.mm:id/h2"][1]')))
        password.set_text(PASSWORD)
        # Submit
        submit = self.wait.until(
            EC.element_to_be_clickable((By.ID, 'com.tencent.mm:id/adj')))
        submit.click()

    def enter(self):
        """
        Navigate from the main screen into Moments.
        :return:
        """
        # Bottom tab
        tab = self.wait.until(
            EC.presence_of_element_located(
                (By.XPATH, '//*[@resource-id="com.tencent.mm:id/bw3"][3]')))
        tab.click()
        # Moments entry
        moments = self.wait.until(
            EC.presence_of_element_located((By.ID, 'com.tencent.mm:id/atz')))
        moments.click()

    def crawl(self):
        """
        Scroll the feed forever, upserting every visible post into MongoDB.

        The loop only ends when an exception propagates (for example when
        the wait for feed items times out).
        :return:
        """
        while True:
            # All posts currently shown on screen
            items = self.wait.until(
                EC.presence_of_all_elements_located(
                    (By.XPATH,
                     '//*[@resource-id="com.tencent.mm:id/cve"]//android.widget.FrameLayout')))
            # Swipe up
            self.driver.swipe(FLICK_START_X, FLICK_START_Y + FLICK_DISTANCE,
                              FLICK_START_X, FLICK_START_Y)
            # Walk through every post
            for item in items:
                try:
                    # Nickname
                    nickname = item.find_element(
                        By.ID, 'com.tencent.mm:id/aig').get_attribute('text')
                    # Body text
                    content = item.find_element(
                        By.ID, 'com.tencent.mm:id/cwm').get_attribute('text')
                    # Raw timestamp, then normalised by the processor
                    date = item.find_element(
                        By.ID, 'com.tencent.mm:id/crh').get_attribute('text')
                    date = self.processor.date(date)
                    print(nickname, content, date)
                    data = {
                        'nickname': nickname,
                        'content': content,
                        'date': date,
                    }
                    # Upsert keyed on (nickname, content) so a post that is
                    # seen again after scrolling is not stored twice.
                    self.collection.update_one(
                        {'nickname': nickname, 'content': content},
                        {'$set': data},
                        upsert=True)
                    sleep(SCROLL_SLEEP_TIME)
                except NoSuchElementException:
                    # The frame lacks one of the three fields; skip it.
                    pass

    def close(self):
        """
        Release the Appium session and the MongoDB connection.
        :return:
        """
        try:
            self.driver.quit()
        finally:
            self.client.close()

    def main(self):
        """
        Entry point: log in, open Moments, then crawl.
        :return:
        """
        # Log in. Only needed on the first run; comment this call out on
        # later runs, since noReset keeps the session.
        self.login()
        # Enter Moments
        self.enter()
        # Crawl
        self.crawl()


if __name__ == '__main__':
    moments = Moments()
    try:
        moments.main()
    finally:
        # crawl() never returns normally, so clean up on any exit path
        # (timeout, Ctrl+C, ...).
        moments.close()
配置程式碼config.py
import os

# Platform
PLATFORM = 'Android'

# Device name, as reported by `adb devices -l`
DEVICE_NAME = 'MI_NOTE_Pro'

# Path to the APK, resolved against the current working directory
APP = os.path.join(os.path.abspath('.'), 'weixin.apk')

# App package name
APP_PACKAGE = 'com.tencent.mm'

# Launcher activity
APP_ACTIVITY = '.ui.LauncherUI'

# Appium server address
DRIVER_SERVER = 'http://localhost:4723/wd/hub'

# Seconds to wait for an element to load
TIMEOUT = 300

# WeChat phone number and password. They may be supplied through the
# environment so they do not have to be written into this file; both
# default to the empty string.
USERNAME = os.environ.get('WECHAT_USERNAME', '')
PASSWORD = os.environ.get('WECHAT_PASSWORD', '')

# Swipe start point and distance
FLICK_START_X = 300
FLICK_START_Y = 300
FLICK_DISTANCE = 700

# MongoDB settings
MONGO_URL = 'localhost'
MONGO_DB = 'moments'
MONGO_COLLECTION = 'moments'

# Seconds to sleep between scroll steps
SCROLL_SLEEP_TIME = 1
時間處理程式碼processor.py
import time
import re


class Processor():
    """Normalise the relative timestamps shown in the Moments feed."""

    # (pattern, seconds per unit) for the "<N> <unit> ago" forms
    _RELATIVE_UNITS = (
        (re.compile(r'(\d+)分鐘前'), 60),
        (re.compile(r'(\d+)小時前'), 60 * 60),
        (re.compile(r'(\d+)天前'), 24 * 60 * 60),
    )
    # "Yesterday"
    _YESTERDAY = re.compile(r'昨天')
    _DAY_SECONDS = 24 * 60 * 60

    def date(self, datetime):
        """
        Convert a relative timestamp into a ``YYYY-MM-DD`` date string.

        Recognised forms are "<N>分鐘前", "<N>小時前", "昨天" and "<N>天前",
        matched at the start of the text. Any other text is returned
        unchanged.

        :param datetime: raw timestamp text
        :return: the local date as ``YYYY-MM-DD``, or the input unchanged
        """
        offset = None
        if self._YESTERDAY.match(datetime):
            offset = self._DAY_SECONDS
        else:
            for pattern, unit_seconds in self._RELATIVE_UNITS:
                found = pattern.match(datetime)
                if found:
                    offset = float(found.group(1)) * unit_seconds
                    break
        if offset is None:
            return datetime
        # Subtract from the epoch timestamp BEFORE converting to a
        # struct_time; a struct_time does not support arithmetic.
        return time.strftime('%Y-%m-%d', time.localtime(time.time() - offset))