Scraping Baidu Baike data with a Python crawler
阿新 · Published: 2021-02-03
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import collections
import datetime
import json
import re
import time

import requests
from bs4 import BeautifulSoup

# Today's date, formatted for use in file names, e.g. 20200420.
today = datetime.date.today().strftime('%Y%m%d')

# Strip HTML tags and bracketed reference markers such as [1] from cell text.
TAG_RE = re.compile(r'</?\w+[^>]*>')
REF_RE = re.compile(r'\[.*?\]')

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}


def fetch(url):
    """GET a URL politely, retrying up to 10 times until HTTP 200."""
    time.sleep(1)
    response = requests.get(url, headers=HEADERS, timeout=(3, 7))
    count = 0
    while response.status_code != 200 and count <= 10:
        time.sleep(0.5)
        response = requests.get(url, headers=HEADERS, timeout=(3, 7))
        count += 1
    # Baike pages are UTF-8; set it explicitly so requests does not mis-guess.
    response.encoding = 'utf-8'
    return response


def crawl_wiki_data(city, out_file):
    """Scrape the basic-info box of a region from Baidu Baike and append it
    to out_file as one JSON object per line."""
    url = 'https://baike.baidu.com/item/{city}'.format(city=city)
    polysemy_url = 'https://baike.baidu.com{href}'
    try:
        soup = BeautifulSoup(fetch(url).text, 'lxml')

        # If the lemma is ambiguous we land on a disambiguation page; follow
        # the sub-lemma whose title looks like an administrative region.
        if soup.find_all('div', {'class': 'lemmaWgt-subLemmaListTitle'}):
            for item in soup.find_all('li', {'class': 'list-dot list-dot-paddingleft'}):
                if any(k in str(item) for k in ('地級市', '縣級市', '地級', '市級')):
                    href = item.select('a')[0].get('href')
                    soup = BeautifulSoup(fetch(polysemy_url.format(href=href)).text, 'lxml')
                    break  # take the first matching sub-lemma

        # The infobox is a <div class="basic-info"> holding <dt>/<dd> pairs.
        table = soup.find_all('div', {'class': 'basic-info'})[0]
        dt = table.select('dt')
        dd = table.select('dd')
        record = collections.OrderedDict()
        for i in range(len(dt)):
            key = str(dt[i]).split('>')[1].split('<')[0].strip() \
                            .replace(' ', '').replace('\xa0', '')
            # Normalize equivalent field names.
            if key == '佔地面積':
                key = '面積'
            if key in ('人口', '常住人口'):
                key = '人口數量'
            if '中文名' in key:
                key = '中文名'
            data = TAG_RE.sub('', str(dd[i])).strip() \
                         .replace('\n', '').replace('\t', '').replace('\xa0', '')
            data = REF_RE.sub('', data)  # drop reference markers like [1]
            record[key] = data
        out_file.write(json.dumps(record, ensure_ascii=False))
        out_file.write('\n')
    except Exception as e:
        print('error: ' + city + '\t' + str(e))


if __name__ == '__main__':
    code_map = {}
    cityList = []
    # Manual supplement for Laiwu; code_map itself is not used further below.
    code_map['371200'] = '萊蕪市'
    # city_code_map holds one tab-separated "city<TAB>code" pair per line.
    with open('city_code_map', 'r', encoding='utf-8') as f:
        for line in f:
            line_list = line.strip().split('\t')
            cityList.append(line_list[0])
            code_map[line_list[0]] = line_list[1]
    # cityList = ['上海市']  # handy for testing a single city
    with open('./city_json', 'w', encoding='utf-8') as out_file:  # 'w' truncates
        for city in cityList:
            crawl_wiki_data(city, out_file)
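The infobox parsing can be tried offline. The sketch below runs a hand-written HTML fragment shaped like Baike's basic-info box through the same tag-stripping and reference-removal steps as the crawler; the fragment and its values are made-up stand-ins, not a real page.

from bs4 import BeautifulSoup
import re

TAG_RE = re.compile(r'</?\w+[^>]*>')
REF_RE = re.compile(r'\[.*?\]')

# A made-up fragment mimicking the <dt>/<dd> layout of the basic-info box.
html = '''<div class="basic-info">
  <dt>中文名</dt><dd>上海市<sup>[1]</sup></dd>
  <dt>人口數量</dt><dd>2487.09 萬<a href="#ref2">[2]</a></dd>
</div>'''

soup = BeautifulSoup(html, 'lxml')
box = soup.find('div', {'class': 'basic-info'})
for dt, dd in zip(box.select('dt'), box.select('dd')):
    key = dt.get_text(strip=True)
    value = REF_RE.sub('', TAG_RE.sub('', str(dd))).strip()
    print(key, '=>', value)
# 中文名 => 上海市
# 人口數量 => 2487.09 萬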
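For a quick end-to-end check, a minimal run might look like the sketch below. The two-column layout of city_code_map is inferred from the split('\t') parsing in the script (the code column is read into code_map but never used by the crawler), and the keys queried from the output are just the normalized field names the scraper emits; actual values depend on the live Baike page.

import json

# Write a one-line input file, then run the script above.
with open('city_code_map', 'w', encoding='utf-8') as f:
    f.write('上海市\t310000\n')

# Afterwards, each line of ./city_json is one JSON object per city.
with open('city_json', 'r', encoding='utf-8') as f:
    for line in f:
        record = json.loads(line)
        print(record.get('中文名'), record.get('人口數量'))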