Python 3 crawler: links + tables + images
阿新 • Published: 2018-12-12
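The first script handles the detail pages. It reads the collected page URLs from F:/python/二級目錄網址.csv, opens each page through a cookie-carrying urllib opener, uses regular expressions to extract the basic-information table, the acceptance-criteria, setting-basis and fee-standard text, and the handling-procedure, required-materials and special-steps tables, appends the cleaned fields to 詳細資料.csv, and finally downloads every image on the page to e:/test/, named after the page title, with a progress callback.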
# -*- coding: utf-8 -*-
import urllib.request
import http.cookiejar
from bs4 import BeautifulSoup
import requests
import csv
import time
import re
import urllib
from urllib.parse import quote
import string


def get_url_2():
    # Read the list of detail-page URLs collected by the list crawler.
    with open('F:/python/二級目錄網址.csv') as f:
        f_csv = csv.reader(f)
        link_list = []
        for link1 in f_csv:
            link_list.append(link1)
        return link_list


def get_url_weizhuang(head={
        'Connection': 'Keep-Alive',
        'Accept': 'text/html, application/xhtml+xml, */*',
        'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'}):
    # Build a urllib opener that carries cookies and browser-like headers.
    cj = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    header = []
    for key, value in head.items():
        elem = (key, value)
        header.append(elem)
    opener.addheaders = header
    return opener


def get_html(link):
    # Fetch a page with requests, using a fixed cookie and browser headers.
    # Note: the Host and Referer below point at sse.com.cn, not at the site being crawled.
    Cookie = "PHPStat_First_Time_10000011=1480428327337; PHPStat_Cookie_Global_User_Id=_ck16112922052713449617789740328; PHPStat_Return_Time_10000011=1480428327337; PHPStat_Main_Website_10000011=_ck16112922052713449617789740328%7C10000011%7C%7C%7C; VISITED_COMPANY_CODE=%5B%22600064%22%5D; VISITED_STOCK_CODE=%5B%22600064%22%5D; seecookie=%5B600064%5D%3A%u5357%u4EAC%u9AD8%u79D1; _trs_uv=ke6m_532_iw3ksw7h; VISITED_MENU=%5B%228451%22%2C%229055%22%2C%229062%22%2C%229729%22%2C%228528%22%5D"
    headers = {
        'User-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36',
        'Cookie': Cookie,
        'Connection': 'keep-alive',
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Host': 'query.sse.com.cn',
        'Referer': 'http://www.sse.com.cn/assortment/stock/list/share/'
    }
    r = requests.get(link, headers=headers, timeout=10)
    # print("response status code:", r.status_code)
    if 200 != r.status_code:
        pass
    # plain BeautifulSoup find
    html = r.text
    return html


def get_data4():
    # Extract the table, text and image sections of a detail page with regexes.
    uop = get_url_weizhuang().open(link1, timeout=1000)
    content = uop.read().decode("utf-8")
    # basic information - table
    pattern1 = re.compile(r'style="padding-left:20px;">([\s\S]*?)</td>')
    # acceptance criteria - text
    pattern2 = re.compile(r'style="font-size: 15px; line-height: 45px;text-indent: 2em; padding: 0 10px;">([\s\S]*?)</p>')
    # setting basis - text
    pattern3 = re.compile(r'<p style="font-size: 15px; line-height: 45px;">([\s\S]*?)</p>')
    # fee standards and basis - text
    pattern4 = re.compile(r'<p style="font-size: 15px; line-height: 45px;(.*?);float:left;">([\s\S]*?)</p>')
    # handling procedure - table
    pattern5 = re.compile(r'<div class="main_tab_item" id="con-one-5" style="display: none;">([\s\S]*?)</div>')
    # handling flow chart - image
    pattern6 = re.compile(r'<div class="main_tab_item" id="con-one-6" style="display: none;">([\s\S]*?)</div>')
    # required materials - table
    pattern7 = re.compile(r'<div class="main_tab_item" id="con-one-7" style="display: none;">([\s\S]*?)</div>')
    # special steps - table
    pattern8 = re.compile(r'<div class="main_tab_item" id="con-one-8" style="display: none;">([\s\S]*?)</div>')
    # sample result - image
    # pattern9 = re.compile(r'<img src="([\s\S]*?)" <alt>')
    items1 = re.findall(pattern1, content)
    items2 = re.findall(pattern2, content)
    items3 = re.findall(pattern3, content)
    items4 = re.findall(pattern4, content)
    items5 = re.findall(pattern5, content)
    items7 = re.findall(pattern7, content)
    items8 = re.findall(pattern8, content)
    item_sum1 = [[items1, items8], [items2, items3, items4], [items5, items7]]
    for p1 in item_sum1:
        jiben_xinxi = []
        for p in p1:
            for item11 in p:
                item1111 = qingxi_data(item11)
                jiben_xinxi.append(item1111)
        ui_string2 = str(jiben_xinxi).replace('\n', '').replace('\r', '').replace('\\n', '').replace('\\r', '').\
            replace(' ', '').replace('\'', '').replace('>', '').replace('[', '').replace(']', '').replace('\\u3000', '').\
            replace('(text-indent:2em;padding:010px', '').replace('(padding:010px;margin-top:0px;', '').replace('--', '')\
            .replace('"', '').replace(')"', '').split(',')
        ui_string2 = [x for x in ui_string2 if x != '']
        for n in range(len(ui_string2)):
            pattern = 'vard=(.*);if'
            ui_string3 = re.findall(pattern, ui_string2[n])
            if ui_string3 != []:
                ui_string2[n] = ui_string3
        print(ui_string2)
        save_contents(ui_string2)


def qingxi_data(item11):
    # Strip HTML tags and whitespace from one extracted fragment.
    dr = re.compile(r'<[^>]+>', re.S)
    item111 = dr.sub(',', str(item11))
    item1111 = item111.replace('\\r', '').replace('\\n', '').replace(' ', '').replace('\n', '').replace('\r', '')
    return item1111


def dict_data5(jiben_xinxi):
    # Pair up extracted fields into a key/value dict (not used in __main__).
    dict1 = {}
    len_1 = len(jiben_xinxi)
    if len_1 % 2 == 0:
        for index, item in enumerate(jiben_xinxi):
            if index % 2 == 0:
                dict1[item] = jiben_xinxi[index + 1]
    print(dict1)


def save_contents(shuju):
    # Append the extracted rows to the detail CSV.
    urlist = shuju
    try:
        with open("詳細資料.csv", 'a+', newline='') as f:
            writer = csv.writer(f)
            for i in range(len(urlist)):
                writer.writerow([urlist[i]])
    except:
        pass


def check_link(url):
    # Fetch a page and return its text, or report a connection failure.
    try:
        r = requests.get(url)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        print('無法連結伺服器!!!')


def Schedule(a, b, c):
    '''
    Progress hook for urlretrieve.
    a: number of data blocks downloaded so far
    b: size of one data block
    c: size of the remote file
    '''
    per = 100.0 * a * b / c
    if per > 100:
        per = 100
        print('完成!')
    print('%.2f%%' % per)


def get_contents(rurl):
    # Download every image on the page, named by the page title plus the
    # Chinese characters found in the image URL.
    soup = BeautifulSoup(rurl, 'lxml')
    trs = soup.find_all('img')
    title_name = soup.find(attrs={'class': 'content_banner_list_up'}).string
    title_name2 = title_name.replace(' ', '').replace("\n", "").replace("\r", "")
    if trs != []:
        for src in trs:
            ui = []
            ui.append(src)
            ui_string = str(ui).replace('<img alt="" src="', '').replace('"/>', '').replace('[', '').replace(']', '')
            url = quote(ui_string, safe=string.printable)
            pattern2 = r"[\u4e00-\u9fa5]+"
            regex2 = re.compile(pattern2)
            results2 = regex2.findall(ui_string)
            filename = str(results2[0]) + '.jpg'
            try:
                urllib.request.urlretrieve(url, 'e:/test/%s_%s' % (title_name2, filename), Schedule)
            except:
                pass
            time.sleep(1)
            print('下載完成!')
    else:
        pass


if __name__ == '__main__':
    for row in get_url_2():
        link1 = row[0]
        print(link1)
        get_data4()
        rs = check_link(link1)
        get_contents(rs)
        time.sleep(3)
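The second script builds that URL list. It requests the item index page on hbzwfw.gov.cn, pulls the nine-digit department ids out of the changebm links, pages through each department's item list (up to eight pages), and appends every detail-page URL it finds to 二級目錄網址.csv.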
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import re
import csv
import time

link1 = 'http://www.hbzwfw.gov.cn/hbzw/sxcx/itemList/xz_index.do?webId=31&deptid='


def get_html(link):
    # Fetch a page with requests, using a fixed cookie and browser headers.
    # Note: the Host and Referer below point at sse.com.cn, not at the site being crawled.
    Cookie = "PHPStat_First_Time_10000011=1480428327337; PHPStat_Cookie_Global_User_Id=_ck16112922052713449617789740328; PHPStat_Return_Time_10000011=1480428327337; PHPStat_Main_Website_10000011=_ck16112922052713449617789740328%7C10000011%7C%7C%7C; VISITED_COMPANY_CODE=%5B%22600064%22%5D; VISITED_STOCK_CODE=%5B%22600064%22%5D; seecookie=%5B600064%5D%3A%u5357%u4EAC%u9AD8%u79D1; _trs_uv=ke6m_532_iw3ksw7h; VISITED_MENU=%5B%228451%22%2C%229055%22%2C%229062%22%2C%229729%22%2C%228528%22%5D"
    headers = {
        'User-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36',
        'Cookie': Cookie,
        'Connection': 'keep-alive',
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Host': 'query.sse.com.cn',
        'Referer': 'http://www.sse.com.cn/assortment/stock/list/share/'
    }
    r = requests.get(link, headers=headers, timeout=10)
    if 200 != r.status_code:
        pass
    html = r.text
    return html


def get_id(link11):
    # Collect the nine-digit department ids from the index page.
    movie_list = []
    soup = BeautifulSoup(get_html(link=link11), "lxml")
    div_list2 = re.findall(r'href="javascript:changebm(.*)" title=(.*)', soup.decode("utf8", "ignore"))
    for i in range(len(div_list2)):
        list1 = str(div_list2[i])
        list2 = re.findall(r"[\u4e00-\u9fa5]+", list1)
        list3 = re.findall(r"\d+", list1)
        if len((list3[0])) == 9:
            movie_list.append(list3[0])
        else:
            pass
    return movie_list


def get_shuju_1():
    # Walk up to eight list pages per department and save every detail-page URL.
    movie_list2 = get_id(link1)
    print(movie_list2)
    for n in range(len(movie_list2)):
        url_id = movie_list2[n]
        for p in range(1, 9):
            url3 = "http://www.hbzwfw.gov.cn/hbzw/sxcx/itemList/xz_list.do?webId=31&deptid=%s&isone=&isonline=&type=&word=&page_num=%s" % (url_id, p)
            soup3 = BeautifulSoup(get_html(link=url3), "lxml")
            div_list2 = soup3.select('a')
            if len(div_list2) != 0:
                print("存在此頁" + '' + url3)
                div_list = soup3.select('div > div.r3_tit > a')
                for m in range(len(div_list)):
                    div_list_2 = str(div_list[m]).replace('<a href="', '').replace('" target="_blank" title="', '').replace('\r\n\t\t\t\t\t\t\t\t\t</a>', '')
                    div_list_3 = re.sub(r'">[\u4e00-\u9fa5]+', '', str(div_list_2))
                    pattern = re.compile(r'^http(.*)html')
                    div_list_4 = re.findall(pattern, div_list_3)
                    movie_list1 = []
                    time.sleep(1)
                    movie_list3 = ('http' + str(div_list_4[0]) + 'html')
                    movie_list1.append(movie_list3)
                    save_contents(movie_list1)
            else:
                pass


def save_contents(shuju):
    # Append each collected URL to the second-level URL CSV.
    urlist = shuju
    try:
        with open("二級目錄網址.csv", 'a+', newline='') as f:
            writer = csv.writer(f)
            for i in range(len(urlist)):
                writer.writerow([urlist[i]])
    except:
        pass


if __name__ == '__main__':
    get_html(link=link1)
    get_id(link1)
    get_shuju_1()
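A note on running order: the list crawler (second script) has to run first so that 二級目錄網址.csv exists. The detail crawler reads it from F:/python/, while the list crawler writes it to the current working directory, so move the file or adjust the path accordingly. Both scripts open their CSV output in append mode ('a+'), so rerunning them will add duplicate rows.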