python手機號前7位歸屬地爬蟲程式碼例項
阿新 • • 發佈:2020-04-01
需求分析
專案上需要用手機號前7位來判斷號碼是否合法,並查詢歸屬地。舊的資料是幾年前的,已經過時,所以打算用Python爬蟲重新爬一份。
單執行緒版本
# coding:utf-8
"""Single-threaded crawler for phone-number prefix lookups.

For every 3-digit head in the configured list, a 4-digit middle part is
appended (plus "0000" as filler for the last four digits) and the resulting
number is looked up through the ``mobile_tel_segment`` endpoint on
tcc.taobao.com.  Every successfully parsed answer is printed and appended to
``./result.txt`` as one comma-separated line.
"""
from datetime import datetime

RESULT_FILE = './result.txt'
QUERY_URL = 'https://tcc.taobao.com/cc/json/mobile_tel_segment.htm?tel='
# Seconds to wait for the server; without a timeout a single stalled
# connection would block the whole (sequential) crawl forever.
REQUEST_TIMEOUT = 10


class PhoneInfoSpider:
    """Look up phone-number prefixes for a list of 3-digit heads."""

    def __init__(self, phoneSections):
        """Store the list of 3-digit heads (strings) to crawl."""
        self.phoneSections = phoneSections

    @staticmethod
    def _parsePhoneInfo(textData):
        """Extract (number, province, mobile_area, postcode) from a response.

        The value of each field is the text between the first pair of single
        quotes on lines 1, 2, 3 and 5 (0-based) of ``textData``.

        Returns None when the text has fewer than 9 lines.  Raises IndexError
        when one of the inspected lines contains no quoted value.
        """
        rows = textData.splitlines(True)
        if len(rows) < 9:
            return None
        return tuple(rows[idx].split('\'')[1] for idx in (1, 2, 3, 5))

    def phoneInfoHandler(self, textData):
        """Parse one response body, print the record and append it to the result file.

        Responses with fewer than 9 lines are ignored silently.
        """
        fields = self._parsePhoneInfo(textData)
        if fields is None:
            return
        line_text = ",".join(fields)
        print(line_text)
        try:
            # The context manager closes the handle after every record, and an
            # explicit encoding keeps non-ASCII values writable on any locale.
            with open(RESULT_FILE, 'a', encoding='utf-8') as f:
                f.write(line_text + '\n')
        except OSError as e:
            print(type(e).__name__, ":", e)

    def requestPhoneInfo(self, phoneNum):
        """Query the endpoint for ``phoneNum`` and hand the body to the handler.

        Any failure (network error, unparsable body) is printed and swallowed
        so that one bad number does not abort a long crawl.
        """
        # Imported here so that the parsing / file-writing code stays
        # importable on machines without the third-party package.
        import requests

        try:
            response = requests.get(QUERY_URL + phoneNum, timeout=REQUEST_TIMEOUT)
            self.phoneInfoHandler(response.text)
        except Exception as e:  # deliberate best-effort boundary
            print(type(e).__name__, e)

    def requestAllSections(self, last=0, limit=10):
        """Crawl middles ``last`` .. ``limit - 1`` for every configured head.

        ``last`` lets a crawl resume at the middle reached before an abnormal
        exit; it only applies to the first head and is reset to 0 afterwards.
        ``limit`` defaults to the small value used in the posted script; pass
        10000 to cover every 4-digit middle of a head.
        """
        for head in self.phoneSections:
            head_begin = datetime.now()
            print(head + " begin time:" + str(head_begin))
            for i in range(last, limit):
                # Generate the phone number; the last four digits are padded with 0.
                phoneNum = head + str(i).zfill(4) + "0000"
                self.requestPhoneInfo(phoneNum)
            last = 0
            head_end = datetime.now()
            print(head + " end time:" + str(head_end))


def main():
    """Crawl all known heads and print start/end timestamps."""
    task_begin = datetime.now()
    print("phone check begin time:" + str(task_begin))
    # Telecom, Unicom, Mobile, virtual operators
    dx = ['133', '149', '153', '173', '177', '180', '181', '189', '199']
    lt = ['130', '131', '132', '145', '146', '155', '156', '166', '171',
          '175', '176', '185', '186']
    yd = ['134', '135', '136', '137', '138', '139', '147', '148', '150',
          '151', '152', '157', '158', '159', '172', '178', '182', '183',
          '184', '187', '188', '198']
    add = ['170']
    # dict.fromkeys drops repeated heads while keeping their order, so no
    # head is crawled (and written to the result file) twice.
    all_num = list(dict.fromkeys(dx + lt + yd + add))
    print(len(all_num))
    # Heads to crawl
    spider = PhoneInfoSpider(all_num)
    spider.requestAllSections()
    task_end = datetime.now()
    print("phone check end time:" + str(task_end))


if __name__ == '__main__':
    main()
發現爬取一個號段共需10000次查詢,單執行緒版大概要1個半小時,太慢了。
多執行緒版本
# coding:utf-8
"""Multi-threaded crawler for phone-number prefix lookups.

For each 3-digit head, every number ``head + middle + "0000"`` (middle
0000-9999) is put on a queue and a pool of worker threads drains it, querying
the ``mobile_tel_segment`` endpoint on tcc.taobao.com.  Each parsed answer is
printed and appended to ``./result.txt`` as ``number,postcode``.
"""
from datetime import datetime
import queue
import threading

threadNum = 32
# Number of 4-digit middles generated per head.
sectionSize = 10000
RESULT_FILE = './result.txt'
QUERY_URL = 'https://tcc.taobao.com/cc/json/mobile_tel_segment.htm?tel='
# Seconds to wait for the server so a stalled connection cannot pin a worker.
REQUEST_TIMEOUT = 10


class MyThread(threading.Thread):
    """Thread that simply runs the zero-argument callable it was given."""

    def __init__(self, func):
        super().__init__()
        self.func = func

    def run(self):
        self.func()


def buildTaskQueue(head, size=sectionSize):
    """Return a queue holding the ``size`` phone numbers to query for ``head``.

    Each task is the complete number (head + zero-padded middle + "0000"), so
    a worker never has to derive the middle from the queue's current size.
    """
    tasks = queue.Queue()
    for i in range(size):
        tasks.put(head + str(i).zfill(4) + "0000")
    return tasks


def requestPhoneInfo():
    """Worker loop: take numbers from the global queue ``q`` until it is empty.

    Failures for a single number are printed and swallowed so that the worker
    keeps going.
    """
    # Imported here so that the parsing / queue-building code stays
    # importable on machines without the third-party package.
    import requests

    while True:
        try:
            # queue.Queue does its own locking; get_nowait() either hands this
            # worker a task that no other worker sees or reports exhaustion.
            phoneNum = q.get_nowait()  # fetch a task
        except queue.Empty:
            break
        print("queue size:" + str(q.qsize()))
        print("phoneNum:" + phoneNum)
        try:
            response = requests.get(QUERY_URL + phoneNum, timeout=REQUEST_TIMEOUT)
            phoneInfoHandler(response.text)
        except Exception as e:  # deliberate best-effort boundary
            print(type(e).__name__, e)


def phoneInfoHandler(textData):
    """Parse one response body, print ``number,postcode`` and append it to the result file.

    ``number`` and ``postcode`` are the text between the first pair of single
    quotes on lines 1 and 5 (0-based) of ``textData``.  Responses with fewer
    than 9 lines are ignored; a line without a quoted value raises IndexError.
    """
    text = textData.splitlines(True)
    if len(text) < 9:
        return
    number = text[1].split('\'')[1]
    postcode = text[5].split('\'')[1]
    line_text = number + "," + postcode
    print(line_text)
    try:
        # The context manager closes the handle after every record.
        with open(RESULT_FILE, 'a', encoding='utf-8') as f:
            f.write(line_text + '\n')
    except OSError as e:
        print(type(e).__name__, e)


def main():
    """Crawl every head with ``threadNum`` workers and print timing information."""
    global q

    task_begin = datetime.now()
    print("phone check begin time:" + str(task_begin))
    # NOTE(review): dx lists only two heads, and '198' is also part of yd —
    # confirm the intended list of heads.
    dx = ['133', '198']
    lt = ['130', '131', '132', '145', '146', '155', '156', '166', '171',
          '175', '176', '185', '186']
    yd = ['134', '135', '136', '137', '138', '139', '147', '148', '150',
          '151', '152', '157', '158', '159', '172', '178', '182', '183',
          '184', '187', '188', '198']
    # dict.fromkeys drops repeated heads while keeping their order.
    all_num = list(dict.fromkeys(dx + lt + yd))
    print(len(all_num))
    for head in all_num:
        head_begin = datetime.now()
        print(head + " begin time:" + str(head_begin))
        q = buildTaskQueue(head)
        print(q.qsize())
        threads = [MyThread(requestPhoneInfo) for _ in range(threadNum)]
        for thread in threads:
            thread.start()
        # All workers finish before the next head replaces the queue.
        for thread in threads:
            thread.join()
        head_end = datetime.now()
        print(head + " end time:" + str(head_end))
    task_end = datetime.now()
    print("phone check end time:" + str(task_end))


if __name__ == '__main__':
    main()
多執行緒版爬取1個號碼段(10000條資料)大概2~3分鐘就好,CPU使用率飆升,大概維持在70%左右。
總共40多個號段,爬完大概1~2個小時,總資料約41萬條。
以上就是本文的全部內容,希望對大家的學習有所幫助,也希望大家多多支援我們。