
Crawling Baidu Baike data with a Python crawler

Tags: Python, web crawler
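The script below reads a list of city names from a local city_code_map file, requests each city's Baidu Baike entry (following the link to the city page when the entry turns out to be a disambiguation page), parses the fields of the basic-info box, and appends them to ./city_json as one JSON object per line. Sample input and output formats are sketched after the listing.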

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import collections
import datetime
import json
import re
import time

import requests
from bs4 import BeautifulSoup

# Today's date formatted as YYYYMMDD (e.g. 20200420), intended for naming output files
today = datetime.date.today().strftime('%Y%m%d')


def crawl_wiki_data(city):
    """
    Crawl the basic-info box of a region's Baidu Baike page and write it,
    as one JSON object per line, to the global output file handle `f`
    opened in the __main__ block.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    url = "https://baike.baidu.com/item/{city}".format(city=city)
    # Template for following a link from a disambiguation (polysemy) page
    polysemyURL = "https://baike.baidu.com{href}"
    try:
        time.sleep(1)
        response = requests.get(url, headers=headers, timeout=(3, 7))
        # Retry until the request succeeds, giving up after 10 attempts
        count = 0
        while response.status_code != 200:
            if count > 10:
                break
            time.sleep(0.5)
            response = requests.get(url, headers=headers, timeout=(3, 7))
            count = count + 1
        # Parse the returned HTML into a BeautifulSoup document object
        soup = BeautifulSoup(response.text, 'lxml')
        # If this is a disambiguation (polysemy) page, it contains a
        # lemmaWgt-subLemmaListTitle <div> listing the possible meanings
        Polysemy = soup.find_all('div', {'class': 'lemmaWgt-subLemmaListTitle'})
        if len(Polysemy) != 0:
            Polysemy = soup.find_all('li', {'class': 'list-dot list-dot-paddingleft'})
            for Alabel in Polysemy:
                # Pick the entry that refers to a prefecture-level or county-level city.
                # Note: Baidu Baike pages are in Simplified Chinese, so these
                # literals must be Simplified as well.
                if str(Alabel).find('地级市') != -1 or str(Alabel).find('县级市') != -1 or str(Alabel).find('地级') != -1 or str(Alabel).find('市级') != -1:
                    href = Alabel.select('a')[0].get('href')
                    time.sleep(1)
                    response = requests.get(polysemyURL.format(href=href), headers=headers, timeout=(3, 7))
                    # Retry until the request succeeds, giving up after 10 attempts
                    count = 0
                    while response.status_code != 200:
                        if count > 10:
                            break
                        time.sleep(0.5)
                        response = requests.get(polysemyURL.format(href=href), headers=headers, timeout=(3, 7))
                        count = count + 1
                    soup = BeautifulSoup(response.text, 'lxml')
        # The basic-info box: <dt> holds field names, <dd> holds field values
        tables = soup.find_all('div', {'class': 'basic-info'})

        dt = tables[0].select('dt')
        dd = tables[0].select('dd')
        dateDict = collections.OrderedDict()

        for i in range(len(dt)):
            # Field name: text between the opening and closing <dt> tags,
            # with spaces and non-breaking spaces removed
            key = str(dt[i]).split('>')[1].split('<')[0].strip().replace(' ', '').replace(u'\xa0', "")

            # Normalize a few field names to a canonical set (Simplified Chinese,
            # matching the text on Baidu Baike pages)
            if key == "占地面积":
                key = "面积"
            if key == "人口":
                key = "人口数量"
            if key == "常住人口":
                key = "人口数量"
            if key.find('中文名') != -1:
                key = '中文名'

            # Strip HTML tags from the field value
            re_h = re.compile(r'</?\w+[^>]*>')
            data = re_h.sub('', str(dd[i])).strip().replace("\n", "").replace("\t", "").replace(u'\xa0', "")
            # Strip footnote references such as [1] (non-greedy so that text
            # between two references is preserved)
            re_1 = re.compile(r'\[.*?\]')
            data = re_1.sub('', data)
            dateDict[str(key)] = str(data)
        # Write one JSON object per city to the global output file handle `f`
        f.write(json.dumps(dateDict, ensure_ascii=False))
        f.write('\n')
        
    except Exception as e:
        print("error: " + city + "\t" + str(e))
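
# Note: the retry-until-200 pattern in crawl_wiki_data appears twice. A small
# helper along the lines below could replace both loops. This is an
# illustrative sketch, not part of the original script; the name
# get_with_retry is made up.
def get_with_retry(url, headers, retries=10, delay=0.5, timeout=(3, 7)):
    """Issue a GET request and retry until HTTP 200 or the retry limit is hit."""
    response = requests.get(url, headers=headers, timeout=timeout)
    count = 0
    while response.status_code != 200 and count <= retries:
        time.sleep(delay)
        response = requests.get(url, headers=headers, timeout=timeout)
        count = count + 1
    return response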


if __name__ == '__main__':
    code_map = {}
    cityList = []
    # Hard-coded entry (code -> name); the entries read from the file below
    # are stored the other way round (name -> code)
    code_map['371200'] = "莱芜市"
    # city_code_map: tab-separated lines; the first column is used as the city name
    with open('city_code_map', 'r', encoding='utf-8') as f:
        data = f.readlines()
        for line in data:
            line_list = line.strip().split('\t')
            city = line_list[0]
            cityList.append(city)
            code_map[line_list[0]] = line_list[1]
    # cityList = ['上海市']  # uncomment to test a single city
    fileName = './city_json'
    with open(fileName, 'w', encoding='utf-8') as f:
        for city in cityList:
            crawl_wiki_data(city)
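
For reference, here is a minimal sketch of the two files involved, assuming the formats implied by the code above (the sample names and codes are placeholders, not real crawled data):

city_code_map, one tab-separated line per city, with the city name in the first column (used to build the Baike URL) and its code in the second:

    上海市	310000
    杭州市	330100

city_json, one JSON object per successfully crawled city, containing the normalized basic-info fields (values elided here; the exact keys depend on each page's info box):

    {"中文名": "上海市", "面积": "...", "人口数量": "..."}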