1. 程式人生 > 程式設計 >python手機號前7位歸屬地爬蟲程式碼例項

python手機號前7位歸屬地爬蟲程式碼例項

需求分析

專案上需要用到手機號前7位來判斷號碼是否合法,並查詢歸屬地。舊的資料是幾年前的,已經過時,所以打算用 Python 爬蟲重新爬一份。

單執行緒版本

# coding:utf-8
import requests
from datetime import datetime


class PhoneInfoSpider:
  """Query phone-number segments one by one and append the parsed rows to ./result.txt.

  Every number is looked up through the tcc.taobao.com ``mobile_tel_segment``
  endpoint; the reply text is parsed line by line by ``phoneInfoHandler``.
  """

  # Seconds to wait for the HTTP reply before giving up on a single number,
  # so one stalled connection cannot block the whole crawl.
  REQUEST_TIMEOUT = 10

  def __init__(self,phoneSections):
    """Remember the prefix strings to crawl.

    phoneSections: iterable of prefix strings; each is concatenated with a
    4-digit middle part and "0000" to build the number that is queried.
    """
    self.phoneSections = phoneSections

  def phoneInfoHandler(self,textData):
    """Parse one reply body and append a CSV row to ./result.txt.

    The reply is split into lines; replies with fewer than 9 lines are
    ignored. The value between the first pair of single quotes on lines
    1, 2, 3 and 5 (0-based) becomes number, province, mobile_area and
    postcode respectively. The row is also printed.

    Raises IndexError when one of those lines contains no quoted value.
    """
    text = textData.splitlines(True)
    if len(text) < 9:
      return

    number = text[1].split('\'')[1]
    province = text[2].split('\'')[1]
    mobile_area = text[3].split('\'')[1]
    postcode = text[5].split('\'')[1]
    line_text = number + "," + province + "," + mobile_area + "," + postcode
    print(line_text)

    try:
      # The context manager closes the handle after every row; the explicit
      # encoding keeps non-ASCII province names writable on any locale.
      with open('./result.txt','a',encoding='utf-8') as f:
        f.write(line_text + '\n')
    except OSError as e:
      # Best effort: report the failed write and keep crawling.
      print(type(e),":",e)

  def requestPhoneInfo(self,phoneNum):
    """Fetch the info for one phone number (string) and hand the reply to phoneInfoHandler.

    Any error (network, timeout, parsing) is printed and swallowed so that a
    single bad number does not stop the crawl.
    """
    try:
      url = 'https://tcc.taobao.com/cc/json/mobile_tel_segment.htm?tel=' + phoneNum
      response = requests.get(url,timeout=self.REQUEST_TIMEOUT)
      self.phoneInfoHandler(response.text)
    except Exception as e:
      print(type(e),e)

  def requestAllSections(self,last=0,stop=10):
    """Crawl every prefix in self.phoneSections.

    For each prefix the numbers ``prefix + NNNN + "0000"`` are queried for
    NNNN in ``range(last, stop)``.

    last: first middle value for the FIRST prefix only; use it to resume
          after an abnormal exit. Later prefixes always start at 0.
    stop: exclusive upper bound of the middle value. The default of 10 is a
          short trial run; pass 10000 to cover a whole segment.
    """
    for head in self.phoneSections:
      head_begin = datetime.now()
      print(head + " begin time:" + str(head_begin))

      for i in range(last,stop):
        # Middle part is zero-padded to 4 digits, last four digits are fixed to 0000.
        phoneNum = head + str(i).zfill(4) + "0000"
        self.requestPhoneInfo(phoneNum)
      # The resume offset only applies to the first prefix.
      last = 0

      head_end = datetime.now()
      print(head + " end time:" + str(head_end))


if __name__ == '__main__':
  # Script entry point: crawl every known prefix and print start/end timestamps.
  task_begin = datetime.now()
  print("phone check begin time:" + str(task_begin))

  # Prefix lists: China Telecom, China Unicom, China Mobile, virtual operators.
  dx = ['133','149','153','173','177','180','181','189','199']
  # '166' was listed twice here, which crawled and stored that segment twice.
  lt = ['130','131','132','145','146','155','156','166','171','175','176','185','186']
  yd = ['134','135','136','137','138','139','147','148','150','151','152','157','158','159','172','178','182','183','184','187','188','198']
  add = ['170']
  all_num = dx + lt + yd + add

  print(len(all_num))

  # Crawl all the prefixes collected above.
  spider = PhoneInfoSpider(all_num)
  spider.requestAllSections()

  task_end = datetime.now()
  print("phone check end time:" + str(task_end))

發現爬取一個號段共需10000次查詢,單執行緒版大概要1個半小時,太慢了。

多執行緒版本

# coding:utf-8
import requests
from datetime import datetime
import queue
import threading

threadNum = 32


class MyThread(threading.Thread):
  """Thread that executes a zero-argument callable supplied at construction."""

  def __init__(self,func):
    """Keep ``func`` so that ``run`` can invoke it once the thread starts."""
    super().__init__()
    self.func = func

  def run(self):
    """Thread body: call the stored callable with no arguments."""
    worker = self.func
    worker()


def requestPhoneInfo():
  """Worker loop: take tasks from the global queue until it is empty.

  Reads the module-level globals ``q`` (task queue), ``lock`` (guards the
  size check plus dequeue) and ``phone_head`` (prefix currently being
  crawled). For every task the number ``phone_head + NNNN + "0000"`` is
  queried and the reply is passed to ``phoneInfoHandler``. Request and
  parsing errors are printed and the loop continues with the next task.
  """
  while True:
    # The emptiness check and the dequeue must happen atomically, otherwise
    # two workers could both see one remaining item and one would block forever.
    with lock:
      if q.qsize() == 0:
        break
      print("queue size:" + str(q.qsize()))
      p = q.get() # fetch a task

    # The main block enqueues tasks 1..N, so task p maps to middle part p - 1.
    # Deriving it from the task itself (instead of re-reading q.qsize() after
    # the lock was released) guarantees each number is queried exactly once.
    middle = str(p - 1).zfill(4)
    phoneNum = phone_head + middle + "0000"
    print("phoneNum:" + phoneNum)

    try:
      url = 'https://tcc.taobao.com/cc/json/mobile_tel_segment.htm?tel=' + phoneNum
      # The timeout keeps a stalled connection from pinning this worker forever.
      response = requests.get(url,timeout=10)
      phoneInfoHandler(response.text)
    except Exception as e:
      print(type(e),e)


def phoneInfoHandler(textData):
  """Parse one reply body and append ``number,postcode`` to ./result.txt.

  The reply is split into lines; replies with fewer than 9 lines are ignored.
  The value between the first pair of single quotes on line 1 (0-based) is
  the number and the one on line 5 is the postcode. The row is also printed.

  Raises IndexError when one of those two lines contains no quoted value.
  """
  text = textData.splitlines(True)
  if len(text) < 9:
    return

  number = text[1].split('\'')[1]
  postcode = text[5].split('\'')[1]
  line_text = number + "," + postcode
  print(line_text)

  try:
    # Called from many worker threads: the context manager makes sure every
    # handle is closed right after its row is written instead of leaking.
    with open('./result.txt','a',encoding='utf-8') as f:
      f.write(line_text + '\n')
  except OSError as e:
    # Best effort: report the failed write and keep going.
    print(type(e),e)


if __name__ == '__main__':
  # Script entry point: crawl each prefix with a pool of threadNum worker threads.
  task_begin = datetime.now()
  print("phone check begin time:" + str(task_begin))

  # Prefixes to crawl. This script previously added the undefined names
  # `lt` and `yd` here, which raised NameError before any work started;
  # append further prefixes to this list to crawl more segments.
  dx = ['133','198']
  all_num = list(dx)
  print(len(all_num))

  for head in all_num:
    head_begin = datetime.now()
    print(head + " begin time:" + str(head_begin))

    # q, lock and phone_head are module-level globals read by requestPhoneInfo;
    # they are rebuilt for every prefix before its workers are started.
    q = queue.Queue()
    threads = []
    lock = threading.Lock()
    phone_head = head

    # One task per middle value; tasks are numbered 1..10000.
    for p in range(10000):
      q.put(p + 1)

    print(q.qsize())

    for _ in range(threadNum):
      thread = MyThread(requestPhoneInfo)
      thread.start()
      threads.append(thread)
    # Wait until the whole segment is done before moving to the next prefix.
    for thread in threads:
      thread.join()

    head_end = datetime.now()
    print(head + " end time:" + str(head_end))

  task_end = datetime.now()
  print("phone check end time:" + str(task_end))

多執行緒版爬1個號碼段的10000條資料,大概2~3分鐘就好,CPU使用率飆升,大概維持在70%左右。

總共40多個號段,爬完大概1~2個小時,總資料約41萬條。

以上就是本文的全部內容,希望對大家的學習有所幫助,也希望大家多多支援我們。