Python 3 crawler: links + tables + images (local + CSV + MongoDB storage)
阿新 • Published 2018-11-11
The script below walks the department index on hbzwfw.gov.cn, pages through each department's item list, and saves every item's URL and title both to a local CSV file and to MongoDB:

# -*- coding: utf-8 -*-
import csv
import re
import time

import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient

# Connect to the local MongoDB instance (root/root against the admin
# database) and write into the zwfw_html collection of mydb.
client = MongoClient('mongodb://root:root@localhost:27017/')
my_db = client.mydb
col = my_db.zwfw_html

link1 = 'http://www.hbzwfw.gov.cn/hbzw/sxcx/itemList/xz_index.do?webId=31&deptid='


def get_html(link):
    # Fetch a page and return its HTML, or None on a non-200 response.
    headers = {
        'User-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36',
        'Connection': 'keep-alive',
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.8',
    }
    r = requests.get(link, headers=headers, timeout=10)
    if r.status_code != 200:
        return None
    return r.text


def get_id(link11):
    # Collect the 9-digit department ids from the index page; they sit in
    # links of the form href="javascript:changebm(...)" title="...".
    dept_ids = []
    html = get_html(link=link11)
    if html is None:
        return dept_ids
    soup = BeautifulSoup(html, 'lxml')
    for match in re.findall(r'href="javascript:changebm(.*)" title=(.*)', str(soup)):
        digits = re.findall(r'\d+', str(match))
        if digits and len(digits[0]) == 9:
            dept_ids.append(digits[0])
    return dept_ids


def get_shuju_1():
    dept_ids = get_id(link1)
    print(dept_ids)
    for url_id in dept_ids:
        # Walk up to eight result pages for each department.
        for p in range(1, 9):
            url3 = ('http://www.hbzwfw.gov.cn/hbzw/sxcx/itemList/xz_list.do'
                    '?webId=31&deptid=%s&isone=&isonline=&type=&word=&page_num=%s'
                    % (url_id, p))
            html = get_html(link=url3)
            if html is None:
                continue
            soup3 = BeautifulSoup(html, 'lxml')
            if not soup3.select('a'):
                continue
            print('存在此頁 ' + url3)
            for a_tag in soup3.select('div > div.r3_tit > a'):
                # Peel the markup off the anchor, then split the URL from
                # the Chinese title with regexes.
                raw = str(a_tag).replace('<a href="', '')
                raw = raw.replace('" target="_blank" title="', '')
                raw = re.sub(r'">[\u4e00-\u9fa5]+', '', raw)
                raw = raw.replace('</a>', '').strip()
                urls = re.findall(r'^http(.*)html', raw)
                titles = re.findall(r'[\u4e00-\u9fa5,(,),、,]+', raw)
                if not urls or not titles:
                    continue
                time.sleep(0.1)
                item_url = 'http' + urls[0] + 'html'
                print(item_url, titles[0])
                save_contents(item_url, titles[0])
                # One MongoDB document per item.
                col.insert_one({'title': titles[0], 'url': item_url})


def save_contents(shuju, title):
    # Append one url/title row to the local CSV file.
    try:
        with open('二級目錄網址.csv', 'a+', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow([shuju, title])
    except OSError:
        pass


if __name__ == '__main__':
    get_shuju_1()
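The title also mentions tables, which the listing above does not parse. A common pattern is to flatten each HTML table on a detail page into CSV rows; here is a minimal sketch under the assumption that the item pages carry plain <table> elements. The name table_to_csv and the output filename tables.csv are made up for illustration, and the helper reuses get_html, BeautifulSoup, and csv from the script above:

def table_to_csv(page_url, out_path='tables.csv'):
    # Hypothetical helper: write every row of the first <table> on a
    # page to CSV (page structure is an assumption, not confirmed).
    html = get_html(link=page_url)
    if html is None:
        return
    soup = BeautifulSoup(html, 'lxml')
    table = soup.find('table')
    if table is None:
        return
    with open(out_path, 'a+', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        for tr in table.find_all('tr'):
            cells = [cell.get_text(strip=True) for cell in tr.find_all(['th', 'td'])]
            if cells:
                writer.writerow(cells)

Called as table_to_csv(item_url) inside the inner loop of get_shuju_1, it would append each item's table rows next to the link CSV.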
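Likewise for the images saved to local disk promised in the title: nothing in the shown code downloads files. A minimal sketch, assuming each item page embeds its pictures as ordinary <img> tags; download_images, the images/ directory, and the per-index filenames are assumptions for illustration, not code from the original post:

import os
from urllib.parse import urljoin

def download_images(page_url, img_dir='images'):
    # Hypothetical helper: save every <img> on one item page into img_dir.
    os.makedirs(img_dir, exist_ok=True)
    html = get_html(link=page_url)  # reuses get_html from the script above
    if html is None:
        return
    soup = BeautifulSoup(html, 'lxml')
    for i, img in enumerate(soup.select('img[src]')):
        img_url = urljoin(page_url, img['src'])  # resolve relative src values
        resp = requests.get(img_url, timeout=10)
        if resp.status_code != 200:
            continue
        ext = os.path.splitext(img_url)[1] or '.jpg'  # fall back to .jpg
        with open(os.path.join(img_dir, '%d%s' % (i, ext)), 'wb') as f:
            f.write(resp.content)

A call such as download_images(item_url) in the inner loop of get_shuju_1 would drop the files alongside the CSV output.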