1. 程式人生 > >爬蟲案例之藥品通用名和商品名資料庫下載

爬蟲案例之藥品通用名和商品名資料庫下載

如圖:我想把圖中的表格給下載下來。

分析頁面請求,發現是ajax請求,不需要cookie,只是post請求需要帶一些引數,整體實現流程很簡單。關鍵在於從頁面提取表格,這裡主要用到了pandas的read_html,使用這個函式可以很方便地提取頁面中的表格資訊。

程式碼

# -*- coding: utf-8 -*-

"""
@Datetime: 2018/11/11
@Author: Zhang Yafei
"""
from multiprocessing import Pool

import pandas
import requests
import os

# Directory containing this script; the HTML cache directory lives next to it.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
HTML_DIR = os.path.join(BASE_DIR,'藥品商品名通用名稱資料庫')

# exist_ok avoids the check-then-create race and is a no-op when the
# directory is already there.
os.makedirs(HTML_DIR, exist_ok=True)

# Accumulator filled by new_data(): one joined trade-name string per call.
name_list = []
# NOTE: `data` is only bound when drug_name.csv already exists in the current
# working directory at import time; it is not refreshed automatically when the
# CSV is written later in the same run.
if os.path.exists('drug_name.csv'):
    data = pandas.read_csv('drug_name.csv',encoding='utf-8')
    
# HTTP request headers sent with every POST issued by spider().
# The values are fixed strings captured from a browser session.
header = {
    # Content negotiation
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    # Connection / caching
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    # Request body description
    'Content-Length': '248',
    'Content-Type': 'application/x-www-form-urlencoded',
    # Session cookie string (hard-coded)
    'Cookie': 'JSESSIONID=0000ixyj6Mwe6Be4heuHcvtSW4C:-1; Hm_lvt_3849dadba32c9735c8c87ef59de6783c=1541937281; Hm_lpvt_3849dadba32c9735c8c87ef59de6783c=1541940406',
    'Upgrade-Insecure-Requests': '1',
    # Where the request claims to come from
    'Origin': 'http://pharm.ncmi.cn',
    'Referer': 'http://pharm.ncmi.cn/dataContent/dataSearch.do?did=27',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
}


def spider(page):
    """Download one result page of the drug-name table and save it as HTML.

    POSTs the search form to the dataSearch endpoint with the module-level
    ``header`` dict and writes the response text to
    ``<HTML_DIR>/<page>.html``.

    Args:
        page: Page number, sent as the ``ec_p`` and ``ec_pd`` form fields.
    """
    adverse_url = 'http://pharm.ncmi.cn/dataContent/dataSearch.do?did=27'
    form_data = {
        'method': 'list',
        'did': 27,
        'ec_i': 'ec',
        'ec_crd': 15,
        'ec_p': page,
        'ec_rd': 15,
        'ec_pd': page,
    }
    response = requests.post(url=adverse_url,headers=header,data=form_data)
    filename = '{}.html'.format(page)
    # Save into HTML_DIR: that is where get_response() reads the page from.
    # Writing to the current working directory left the parse step unable to
    # find the downloaded files.
    with open(os.path.join(HTML_DIR, filename),'w',encoding='utf-8') as f:
        f.write(response.text)
    print(filename,'下載完成')


def get_response(page):
    """Return the text of the saved HTML file for ``page``.

    The file is looked up as ``<HTML_DIR>/<page>.html`` and decoded as UTF-8.
    """
    html_path = os.path.join(HTML_DIR, '{}.html').format(page)
    with open(html_path, mode='r', encoding='utf-8') as handle:
        return handle.read()


def parse(page):
    """Extract the result table from a saved page and append it to the CSV.

    Reads the saved HTML for ``page`` via get_response(), takes the table with
    ``id="ec_table"``, keeps its first five columns, renames them, and writes
    them to ``drug_name.csv``. Page 1 (re)creates the file with a header row;
    every other page is appended without a header.

    Args:
        page: Page number of the saved HTML file to parse.
    """
    from io import StringIO

    response = get_response(page)
    # Wrap the markup in StringIO: passing literal HTML strings to read_html
    # is deprecated in recent pandas releases.
    result = pandas.read_html(StringIO(response),attrs={'id':'ec_table'})[0]
    # Copy the slice so renaming columns works on an independent frame, and
    # use a local name that does not shadow the module-level `data`.
    table = result.iloc[:,:5].copy()
    table.columns = ['序號','批准文號','藥品中文名稱','藥品商品名稱','生產單位']
    if page==1:
        table.to_csv('drug_name.csv',mode='w',encoding='utf_8_sig',index=False)
    else:
        table.to_csv('drug_name.csv',mode='a',encoding='utf_8_sig',header=False,index=False)
    print('第{}頁資料存取完畢'.format(page))

def get_unparse_data():
    """Return the page numbers that still need to be parsed.

    Without ``drug_name.csv`` in the working directory every page 1..491 is
    returned. Otherwise the values of the module-level ``data['序號']`` column
    are removed from that range.

    NOTE(review): this subtracts the 序號 column values from *page* numbers;
    if 序號 is a per-row serial number rather than a page number the result
    is not the set of unparsed pages - confirm against the real CSV.
    """
    every_page = range(1, 492)
    if not os.path.exists('drug_name.csv'):
        return list(every_page)
    already_seen = set(data['序號'].values)
    return list(set(every_page) - already_seen)
    
def download():
    """Download pages 1..491 in parallel using a process pool.

    Each page number is handed to spider() in a worker process.
    """
    # The context manager releases the pool even when a worker raises.
    # map() blocks until every page has been processed, so nothing is still
    # running when the pool is torn down on exit.
    with Pool() as pool:
        pool.map(spider, list(range(1, 492)))
    
    
def write_to_csv():
    """Parse every page reported by get_unparse_data() into the CSV.

    The list of pending pages is printed before parsing starts.
    """
    pending = get_unparse_data()
    print(pending)
    for page_no in pending:
        parse(page_no)
    
def new_data(chinese_name):
    """Append the joined trade names of one generic name to ``name_list``.

    Selects the rows of the module-level ``data`` whose 藥品中文名稱 equals
    ``chinese_name``, collects their distinct 藥品商品名稱 values and appends
    them, joined with ``'/'``, to the module-level ``name_list``.

    Args:
        chinese_name: Generic (Chinese) drug name to look up.
    """
    matches = data[data['藥品中文名稱'] == chinese_name]['藥品商品名稱']
    # Empty CSV cells are read back as NaN (a float); joining them would raise
    # TypeError, so drop them and coerce whatever remains to str.
    distinct = set(matches.dropna().astype(str))
    # Sort so the joined string does not depend on set iteration order.
    trade_name = '/'.join(sorted(distinct))
    name_list.append(trade_name)
      
def read_from_csv():
    """Build and save the generic-name -> trade-names table.

    Reloads ``drug_name.csv``, collects the distinct 藥品中文名稱 values, asks
    new_data() for the joined trade names of each, and writes the two-column
    result to ``unique_chinese_name.csv``.

    Returns:
        pandas.DataFrame with columns 藥品中文名稱 and 藥品商品名稱.
    """
    global data
    # Reload instead of trusting the import-time snapshot: on a first run the
    # CSV did not exist at import (so `data` was never bound), and on later
    # runs it may have been extended by write_to_csv() in the meantime.
    # utf_8_sig matches the encoding parse() writes with.
    data = pandas.read_csv('drug_name.csv',encoding='utf_8_sig')
    name = data['藥品中文名稱'].values
    print(len(name))
    # unique() keeps first-appearance order, so the output is reproducible.
    chinese_name = list(data['藥品中文名稱'].unique())
    # Start from an empty accumulator so a repeated call cannot leave
    # name_list longer than chinese_name.
    name_list.clear()
    for generic_name in chinese_name:
        new_data(generic_name)
    df_data = {'藥品中文名稱':chinese_name,'藥品商品名稱':name_list}
    new_dataframe = pandas.DataFrame(df_data)
    new_dataframe.to_csv('unique_chinese_name.csv',mode='w',encoding='utf_8_sig',index=False)
    return new_dataframe
    
def main():
    """Run the full pipeline: download, parse to CSV, then aggregate.

    Returns:
        The DataFrame produced by read_from_csv().
    """
    download()
    write_to_csv()
    summary = read_from_csv()
    return summary


# Only run the pipeline when executed as a script, not when imported.
if __name__ == '__main__':
    drugname_dataframe = main()

  知識點總結:1.ajax的post請求,不需要登入

        2.多程序下載

        3.解析資料用read_html快速提取表格