Python爬蟲:爬取無賬號無限制獲取企查查資訊
阿新 • • 發佈:2020-12-16
本文的文字及圖片來源於網路,僅供學習、交流使用,不具有任何商業用途,版權歸原作者所有,如有問題請及時聯絡我們以作處理
以下文章來源於騰訊雲 作者:昱良
通過網上爬蟲獲取了全國所有企業,然後就需要補充企業資訊,首先想到的就是企查查,啟信寶等專業網站,最終選擇了企查查,嘗試了多種方法:
1、selenium爬蟲,繞過企查查的登入驗證,但賬號和IP限制太大,最終放棄
2、通過requests直接請求+cookies,遇到了cookie有效期和限制問題
不斷的嘗試和修改引數,最終發現一種有效方式selenium + wap
只需要IP代理,不需要賬號,沒有限制,因為是沒有登入,拿到的資訊有限,能展示的都能獲取。
image
一、初始化selenium
sysstr = platform.system()
if sysstr == "Windows":
    chromedriver_path = os.getcwd() + "\\utools\\chromedriver.exe"
else:
    # mac: chromedriver binary shipped next to the script
    chromedriver_path = os.getcwd() + "/mac_chromedriver"
logger.info("chromedriver_path: %s" %(chromedriver_path,))

default_agent = '--user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"'

class wap_QiChaCha(object):
    """Scraper for the qichacha mobile (WAP) site.

    Configures a headless, mobile-emulated Chrome; no account login is
    performed. Call init() after construction to actually start the browser.
    """

    def __init__(self, user_agent_header=default_agent, chromedriver_path=chromedriver_path):
        """Prepare the Chrome options (headless, proxied, iPhone emulation)."""
        opts = webdriver.ChromeOptions()
        opts.add_argument('--no-sandbox')
        opts.add_argument('--disable-dev-shm-usage')
        # NOTE(review): hard-coded proxy endpoint — assumed still reachable; verify.
        opts.add_argument('--proxy-server=http://47.98.154.206:3008')
        # headless + no GPU avoids odd rendering issues on servers
        opts.add_argument("--headless")
        opts.add_argument("--disable-gpu")
        opts.add_argument(user_agent_header)
        # emulate an iPhone so the mobile (login-free) pages are served
        opts.add_experimental_option('mobileEmulation', {'deviceName': 'iPhone X'})
        self.options = opts

    def init(self):
        """Launch Chrome and open the mobile landing page."""
        self.driver = webdriver.Chrome(executable_path=chromedriver_path, chrome_options=self.options)
        # open the landing page
        self.driver.get('https://m.qichacha.com/')
        # counts consecutive scrape failures; reset on success elsewhere
        self.error_encounter = 0
二、判斷公司存在
def search_company(self, company_name):
    """Search m.qcc.com for company_name and return its detail dict.

    Returns company_detail(...) when the first search hit matches the
    normalized company name (current or former name); returns None when
    the site reports zero results, on any scraping error, or — implicitly —
    when the first hit does not match. After 3 consecutive errors the
    browser session is restarted via init().
    """
    #time.sleep(0.3)
    try:
        result = {}
        result[COMPANY.NAME] = utils.normalizeCompanyName(company_name)
        logger.info("search for: %s" %(company_name,))
        '''
        self.driver.get('https://m.qichacha.com/')
        self.driver.find_element_by_id('searchkey').send_keys(company_name)
        # 單擊搜尋按鈕
        srh_btn = self.driver.find_element_by_xpath('//*[@id="V3_Index_S"]//span')
        srh_btn.click()
        '''
        self.driver.get('https://m.qcc.com/search?key=%s' %(company_name))
        # wait until either the result counter ("text-danger") or the
        # "nodata" placeholder appears on the results page
        utils.alertWait(WebDriverWait(self.driver, 3).until,
                        expected_conditions.presence_of_element_located(
                            (By.XPATH, '//*[contains(@class,"text-danger") or contains(@class,"nodata")]')),
                        5, 0, "not found text-danger or nodata")
        # check whether the company exists (result count shown in text-danger)
        inc_full = self.driver.find_element_by_xpath('//*[@class="text-danger"]').text
        self.error_encounter = 0
        if inc_full == "0":
            logger.error("company %s not found" %(company_name,))
            return None
        # grab the first result's display name and its detail-page link
        cname = self.driver.find_element_by_xpath('//div[@class="list-item-name"]').text
        href = self.driver.find_element_by_xpath('//a[@class="a-decoration"]').get_attribute("href")
        # former company name, if the result's bottom line shows one
        cym = None
        try:
            stock_or_others = self.driver.find_element_by_xpath('//div[@class="list-item-bottom"]').text
            # print(stock_or_others)
            # the label varies: "former name", "historical shareholders", etc.
            if utils.normalizeCompanyName(company_name) in stock_or_others:
                company_bottom = stock_or_others.replace("：", ":")
                cym = company_bottom.split(":")[1]
        except:
            # bottom line missing for this result — ignore
            pass
        if utils.normalizeCompanyName(cname) == utils.normalizeCompanyName(company_name) \
                or utils.normalizeCompanyName(cym) == utils.normalizeCompanyName(company_name):
            result[COMPANY.URL] = href
            # time.sleep(0.2)
            return self.company_detail(href, result)
    except Exception as err:
        # self.driver.delete_all_cookies()
        logger.error(err)
        self.error_encounter = self.error_encounter + 1
        if self.error_encounter >= 3:
            # too many consecutive failures: restart the browser session
            self.driver.quit()
            self.init()
        return None
    finally:
        pass
image
三、獲取公司資訊
def company_detail(self, href, result):
    """Open the company detail page at href and fill result in place.

    Best-effort extraction: phone, email, address, tax-credit rating and
    the business-registration table. Each field is silently skipped when
    absent from the page; returns the (possibly partially filled) result
    dict in every case.
    """
    self.driver.get(href)
    # wait for the detail page's company-name header to render
    utils.alertWait(WebDriverWait(self.driver, 3).until,
                    expected_conditions.presence_of_element_located((By.XPATH, '//*[@class="company-name"]')),
                    5, 0, "not found text-danger")
    try:
        phone = self.driver.find_element_by_xpath('/html/body/div[1]/div[2]/div[1]/div[3]/a[1]').text
        if phone and len(phone) > 0:
            result[COMPANY.TEL] = phone.strip()
    except Exception as e:
        pass
        # logger.info("no phone number on page")
    try:
        email = self.driver.find_element_by_xpath('/html/body/div[1]/div[2]/div[1]/div[3]/a[2]').text
        if email and len(email) > 0:
            result[COMPANY.EMAIL] = email.strip()
    except Exception as e:
        pass
        # logger.info("no email on page")
    try:
        address = self.driver.find_element_by_xpath('/html/body/div[1]/div[2]/div[1]/div[4]').text
        if address and len(address) > 0:
            result[COMPANY.ADDRESS] = address.strip()
    except Exception as e:
        pass
        # logger.info("no address on page")
    # business-registration info table; bail out early if the page has none
    try:
        infos = self.driver.find_element_by_xpath('//div[@class="basic-wrap"]/table')
        # infos = self.driver.find_element_by_xpath('//*[@id="Cominfo"]/table')
    except:
        return result
    result[COMPANY.TAX_LEVEL] = "稅務等級&&"
    try:
        taxcreditlist = self.driver.find_element_by_xpath('//div[@id="taxcreditlist"]').text
        # flatten multi-line rating text into one "&"-separated string
        info = str(taxcreditlist).replace("\n", "&").strip()
        result[COMPANY.TAX_LEVEL] = result[COMPANY.TAX_LEVEL] + info
    except:
        return result
    # parse the registration table with lxml rather than per-cell selenium lookups
    data = etree.HTML(infos.get_property("innerHTML"))
    data_info = data.xpath('.//tr')
    result[COMPANY.BUSINESS] = "工商資訊"
    for info in data_info:
        info_list = info.xpath(".//td//text()")
        new_info_list = []
        for info in list(info_list):
            new_info = str(info).replace("\n", "").strip()
            new_info_list.append(new_info)
        new_info_list = [i for i in new_info_list if i != '']
        self.retrieveInfo(new_info_list, result)
        # append this row's cells, joined with " && " separators
        result[COMPANY.BUSINESS] = result[COMPANY.BUSINESS] + " && " + " && ".join(
            map(str, new_info_list))
    return result