【python爬蟲】遊俠網部分新聞爬取
阿新 • • 發佈:2021-12-22
最後效果展示
【用網頁表格的形式展示爬取的資料】
已用 PyInstaller 打包成 exe 可執行檔
【使用 Python 3.6 打包,可執行的環境會多一些】
啊,出錯了!為什麼?把程式放到 Win7 虛擬機器上執行時,出現了中文解碼錯誤。
程式碼
# Ali213 news scraper.
#
# Collects the first link (<li>[1]/<a>[1]) of every <ul class="news-link-ul">
# list on https://www.ali213.net/ and writes the result to a dated HTML table.
import html
import os
from re import I  # unused in this script; kept because other code may rely on it
import time
import traceback
from urllib.parse import urljoin

import requests
from lxml import etree
import pandas as pd
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# get_url() requests pages with verify=False; silence the warning that
# urllib3 would otherwise print for every such request.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

BASE_URL = "https://www.ali213.net/"
REQUEST_TIMEOUT = 15  # seconds; without a timeout requests.get can block forever
REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"
}
NEWS_ANCHOR_XPATH = '//ul[@class="news-link-ul"]/li[1]/a[1]'

# Raw hrefs and their link texts, filled by collect_news() in lockstep.
url_list = []
text_list = []


def get_url(url):
    """Download *url* and return the raw response body as bytes.

    The body is returned undecoded: the previous version decoded it with
    ``response.encoding`` and threw the result away, which could only fail
    (``TypeError`` when the server declares no encoding, ``UnicodeDecodeError``
    when the declared one is wrong).  ``etree.HTML`` accepts bytes directly.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body (``bytes``).

    Raises:
        requests.RequestException: on connection errors or timeout.
    """
    # NOTE(review): verify=False disables TLS certificate checking.  It is
    # kept to match the original behaviour; confirm it is really needed.
    response = requests.get(
        url,
        headers=REQUEST_HEADERS,  # previously built but never sent
        timeout=REQUEST_TIMEOUT,
        verify=False,
    )
    return response.content


def collect_news(page_url=BASE_URL):
    """Scrape *page_url* and append its news links to the module lists.

    For every anchor matched by ``NEWS_ANCHOR_XPATH`` the raw ``href`` is
    appended to ``url_list`` and the anchor's text to ``text_list``.  Href and
    text are read from the same element, so the two lists always stay aligned.
    Anchors without an ``href`` are skipped.

    Args:
        page_url: Page to scrape; defaults to the site front page.
    """
    root = etree.HTML(get_url(page_url))
    if root is None:  # nothing parseable came back
        return
    for anchor in root.xpath(NEWS_ANCHOR_XPATH):
        href = anchor.get("href")
        if not href:
            continue
        url_list.append(href)
        # string(.) also picks up text nested in child tags
        text_list.append(anchor.xpath("string(.)").strip())


def _to_http_url(href):
    """Turn a scraped href into an absolute URL.

    ``//host/path`` and ``scheme://host/path`` become ``http://host/path``,
    exactly as before.  Any other form (for example a root-relative
    ``/news/1.html``) is resolved against ``BASE_URL`` instead of raising
    ``IndexError`` or producing a bogus host.
    """
    parts = href.split("/", 2)
    if len(parts) == 3 and parts[1] == "":
        return "http://" + parts[2]
    return urljoin(BASE_URL, href)


def main1():
    """Write the collected news links to ``<dd-mm-YYYY>遊俠網新聞.html``.

    Scrapes the front page first if nothing has been collected yet, so that
    network and parse errors are raised from here and reach the caller's
    error handling.  The table keeps the original layout: column ``text``
    shows the plain URL and column ``url`` holds the clickable link.
    """
    if not url_list:
        collect_news()

    links = [_to_http_url(href) for href in url_list]
    # Scraped values are untrusted: escape them and quote the attribute,
    # because to_html(escape=False) writes cell contents verbatim.
    anchors = [
        '<a href="{}"><div>{}</div></a>'.format(
            html.escape(link, quote=True), html.escape(title)
        )
        for link, title in zip(links, text_list)
    ]
    table = pd.DataFrame(
        {
            "text": [html.escape(link) for link in links],
            "url": anchors,
        }
    )

    nowtime = time.strftime("%d-%m-%Y")
    # escape=False keeps the <a> markup in the "url" column as real links
    table.to_html(nowtime + '遊俠網新聞.html', escape=False)


if __name__ == '__main__':
    try:
        main1()
    except Exception:
        # Top-level boundary of a packaged exe: persist the traceback so the
        # failure can be inspected afterwards.  UTF-8 is forced so that
        # non-ASCII text in the traceback cannot break the write on systems
        # whose default code page cannot encode it.
        t = traceback.format_exc()
        with open(os.path.join(os.getcwd(), "error-pa6.txt"), 'w', encoding="utf-8") as f:
            f.write(t)