1. 程式人生 > 模塊法之內蒙古自治區環境保護廳

模塊法之內蒙古自治區環境保護廳

imp etl OS print link 5.0 enc gen web

import re,requests,xlwt
from lxml import etree
# HTTP headers sent with every request made by this script; the User-Agent
# string identifies the client as a desktop Chrome browser.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
}
# Accumulates one [name, address, danwei, pingjiajigou, date] row per scraped
# content node; filled by getinfos() and written to the spreadsheet at the end.
end_list = []
# Directory that holds the listing pages and (relative to it) the detail pages.
_LISTING_BASE = "http://www.nmgepb.gov.cn/ywgl/hjpj/xmslqk/"

# Container element of a detail page that holds the project information.
_CONTENT_XPATH = '//div[@class="xl_nr_16"]'

# For table layouts the date is spread over several text nodes; the first
# six are concatenated (a layout with fewer than six is treated as a miss).
_DATE_PARTS = 6

# Known detail-page layouts, tried in order.  Each entry is
# (name, address, danwei, pingjiajigou, date, date_is_split), where the first
# five items are XPath expressions relative to the content container.
_LAYOUTS = (
    # Page 1, items 3/5/9/10.
    ("p/text()[5]", "p/text()[6]", "p/text()[7]", "p/text()[8]",
     "p/text()[9]", False),
    # Page 1, items 1/2.
    ("div/text()[10]", "div/text()[12]", "div/text()[14]", "div/text()[16]",
     "div/text()[18]", False),
    # Page 1, items 6/13/14.
    ("div/p[6]/text()", "div/p[7]/text()", "div/p[8]/text()",
     "div/p[9]/text()", "div/p[10]/text()", False),
    # Page 1, items 4/7/8/11/12/15.
    ("div/p/text()[10]", "div/p/text()[12]", "div/p/text()[14]",
     "div/p/text()[16]", "div/p/text()[18]", False),
    # Page 2, items 7/8.
    ("div/table/tbody/tr[2]/td[2]/p/a/text()",
     "div/table/tbody/tr[2]/td[3]/p/span/text()",
     "div/table/tbody/tr[2]/td[4]/p/span/text()",
     "div/table/tbody/tr[2]/td[5]/p/span/text()",
     "div/table/tbody/tr[2]/td[6]/p/span/text()", True),
    # Page 2, items 9/10.
    ("div/table/tbody/tr[2]/td[2]/text()",
     "div/table/tbody/tr[2]/td[3]/p/span/text()",
     "div/table/tbody/tr[2]/td[4]/p/span/text()",
     "div/table/tbody/tr[2]/td[5]/p/span/text()",
     "div/table/tbody/tr[2]/td[6]/p/span/text()", True),
    # Page 2, item 11 (name inside a link).
    ("div/span/table/tbody/tr[2]/td[2]/p/a/text()",
     "div/span/table/tbody/tr[2]/td[3]/p/span/text()",
     "div/span/table/tbody/tr[2]/td[4]/p/span/text()",
     "div/span/table/tbody/tr[2]/td[5]/p/span/text()",
     "div/span/table/tbody/tr[2]/td[6]/p/span/text()", True),
    # Page 2, item 11 variant (name inside a span).
    ("div/span/table/tbody/tr[2]/td[2]/p/span/text()",
     "div/span/table/tbody/tr[2]/td[3]/p/span/text()",
     "div/span/table/tbody/tr[2]/td[4]/p/span/text()",
     "div/span/table/tbody/tr[2]/td[5]/p/span/text()",
     "div/span/table/tbody/tr[2]/td[6]/p/span/text()", True),
    # Page 2, items 13/14.
    ("div/div/span/table/tbody/tr[2]/td[2]/p/span/a/text()",
     "div/div/span/table/tbody/tr[2]/td[3]/p/span[1]/text()",
     "div/div/span/table/tbody/tr[2]/td[4]/p/span/text()",
     "div/div/span/table/tbody/tr[2]/td[5]/p/span/text()",
     "div/div/span/table/tbody/tr[2]/td[6]/p/span/text()", True),
    # Page 2, item 15.
    ("div/div/span/table/tbody/tr[2]/td[2]/p/a/text()",
     "div/div/span/table/tbody/tr[2]/td[3]/p/span[1]/text()",
     "div/div/span/table/tbody/tr[2]/td[4]/p/span[1]/text()",
     "div/div/span/table/tbody/tr[2]/td[5]/p/span[1]/text()",
     "div/div/span/table/tbody/tr[2]/td[6]/p/span/text()", True),
    # Plain table with the data in the third row.
    ("div/div/table/tbody/tr[3]/td[2]/text()",
     "div/div/table/tbody/tr[3]/td[3]/text()",
     "div/div/table/tbody/tr[3]/td[4]/text()",
     "div/div/table/tbody/tr[3]/td[5]/text()",
     "div/div/table/tbody/tr[3]/td[6]/text()", False),
)


def _fetch(url):
    """Download *url* with the shared headers and return the decoded body."""
    wb_data = requests.get(url, headers=headers)
    # Let requests guess the charset from the content instead of trusting
    # the HTTP header.
    wb_data.encoding = wb_data.apparent_encoding
    return wb_data.text


def _extract_record(node):
    """Return [name, address, danwei, pingjiajigou, date] or None.

    Tries every entry of _LAYOUTS against *node* and returns the values of
    the first layout for which all five fields are present.  *node* itself
    is never rebound, so a layout that matches only partially cannot break
    the layouts tried after it.
    """
    for layout in _LAYOUTS:
        *text_paths, date_path, date_is_split = layout
        values = []
        for path in text_paths:
            hits = node.xpath(path)
            if not hits:
                break
            values.append(hits[0])
        else:
            date_hits = node.xpath(date_path)
            needed = _DATE_PARTS if date_is_split else 1
            if len(date_hits) >= needed:
                if date_is_split:
                    values.append("".join(date_hits[:needed]))
                else:
                    values.append(date_hits[0])
                return values
    return None


def getlinks(url):
    """Visit every detail page linked from the listing page at *url*.

    Collects the href of each link that follows a ``class="font_hei15_1"``
    attribute in the page source, resolves it against the listing directory
    and passes it to getinfos().
    """
    # Local import: keeps the block self-contained without touching the
    # file-level import lines.
    from urllib.parse import urljoin

    page = _fetch(url)
    links = re.findall(r'class="font_hei15_1".*?href="(.*?)"', page, re.S)
    for link in links:
        # urljoin resolves "./..." hrefs the same way the previous string
        # concatenation did, and also copes with "../" or absolute hrefs.
        getinfos(urljoin(_LISTING_BASE, link))


def getinfos(url):
    """Scrape one detail page and append its record(s) to ``end_list``.

    For each content container found on the page, the five fields are
    extracted with the first matching layout.  If no layout matches, the
    page URL is stored as the name and the other fields are the string
    'null', so the page can be inspected by hand later.
    """
    soup = etree.HTML(_fetch(url))
    for node in soup.xpath(_CONTENT_XPATH):
        record = _extract_record(node)
        if record is None:
            record = [url, 'null', 'null', 'null', 'null']
        print(*record)
        end_list.append(record)


if __name__ == '__main__':
    # Create the workbook up front so the error path below always has
    # something to save.
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('getmessage')
    try:
        # First listing page, then the numbered follow-up pages 1..5.
        getlinks('http://www.nmgepb.gov.cn/ywgl/hjpj/xmslqk/index.html')
        for page_no in range(1, 6):
            getlinks('http://www.nmgepb.gov.cn/ywgl/hjpj/xmslqk/index_{}.html'.format(str(page_no)))

        header = ['項目名稱', '建設地點', '建設單位', '環境影響機構', '受理日期']
        for col, title in enumerate(header):
            sheet.write(0, col, title)
        # The last three collected records are deliberately left out of the
        # sheet (see the message printed below).
        for row, record in enumerate(end_list[:-3], start=1):
            for col, message in enumerate(record):
                sheet.write(row, col, message)
        print('寫入完畢,最後三項已剔除')
        book.save('endresult.xls')
    except Exception as exc:
        print('wrong')
        # Report the cause instead of swallowing it silently.
        print(repr(exc))
        book.save('error.xls')

模塊法之內蒙古自治區環境保護廳