1. 程式人生 > 其它 >【python爬蟲】遊俠網部分新聞爬取

【python爬蟲】遊俠網部分新聞爬取

最後效果展示

【用網頁表格的形式展示爬取的資料】

用pyinstaller打包成exe了

【python3.6 可執行的環境多一些】

啊,出錯了!為什麼?放到 win7 虛擬機器上執行時,出現了中文解碼錯誤~~~

程式碼

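# Youxia (ali213.net) news scraper: collects the <a> links found inside the
# <li> items of every <ul class="news-link-ul"> on https://www.ali213.net/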
import os
from re import I
import time
import requests
from lxml import etree
import pandas as pd
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
import traceback

# Parallel accumulators: the scraping code appends one href to url_list and
# the matching link title to text_list; main1() reads both.
url_list, text_list = [], []

def get_url(url, timeout=10):
    """Fetch *url* with a browser-like User-Agent and return the raw body.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before ``requests`` gives
            up; without it a stalled connection would block forever.

    Returns:
        The undecoded response body as ``bytes``.  The bytes are returned
        as-is (rather than decoded with ``response.encoding``) so that the
        HTML parser can pick the charset declared by the page itself.

    Raises:
        requests.exceptions.RequestException: on connection problems or
            when the timeout expires.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"}
    # verify=False skips TLS certificate verification; the matching
    # InsecureRequestWarning is silenced at module import time.
    response = requests.get(url, headers=headers, verify=False, timeout=timeout)
    return response.content

# Download the home page once and collect, for every <ul class="news-link-ul">,
# the first <a> of its first <li>.  Each anchor contributes its href to
# url_list and its own text to text_list, so the two lists stay aligned even
# when an anchor has no text node or several of them.
s = get_url("https://www.ali213.net/")
selector = etree.HTML(s)
news_anchors = (
    selector.xpath('//ul[@class="news-link-ul"]/li[1]/a[1]')
    if selector is not None
    else []
)
for anchor in news_anchors:
    hrefs = anchor.xpath('@href')
    if not hrefs:
        # An anchor without an href cannot become a table row; skip it.
        continue
    url_list.append(hrefs[0])
    text_list.append(''.join(anchor.xpath('text()')))
def main1():
    """Write the scraped headlines to a dated HTML table file.

    Reads the module-level ``url_list`` (hrefs) and ``text_list`` (link
    titles) and writes ``<dd-mm-YYYY>遊俠網新聞.html`` into the current
    working directory.  The table keeps its two existing column names:
    ``text`` holds the absolute link address and ``url`` holds the
    clickable headline.

    Returns:
        None.
    """
    # Imported here so the function carries its own dependencies.
    from html import escape
    from urllib.parse import urljoin

    # Resolve every href against the site root: protocol-relative links
    # ("//host/path") become "http://host/path" as before, and site-relative
    # links ("/path") no longer raise IndexError.
    base = "http://www.ali213.net/"
    links = [escape(urljoin(base, href), quote=True) for href in url_list]
    # Titles come from a remote page, so escape them before embedding in HTML.
    titles = [escape(title) for title in text_list]

    # Build the table data from a dict of equally long columns.
    df1 = pd.DataFrame({
        "text": links,
        "url": [
            '<a href="{}"><div>{}</div></a>'.format(link, title)
            for link, title in zip(links, titles)
        ],
    })

    nowtime = time.strftime("%d-%m-%Y")
    # escape=False keeps the <a> markup built above as real links; the cell
    # values were already escaped individually.
    html_table = df1.to_html(escape=False)
    # Write the file ourselves with an explicit encoding (and declare it in
    # the document) so the Chinese text does not depend on the platform's
    # default code page.
    with open(nowtime + '遊俠網新聞.html', 'w', encoding='utf-8') as f:
        f.write('<meta charset="utf-8">\n')
        f.write(html_table)


if __name__ == '__main__':
    try:
        main1()
    except Exception:
        # Top-level boundary: keep the packaged program from dying silently
        # by saving the full traceback next to where it was started.
        t = traceback.format_exc()
        # Explicit UTF-8 so a traceback containing Chinese text can always be
        # written, whatever the system's default code page is.
        with open(os.path.join(os.getcwd(), "error-pa6.txt"), 'w', encoding='utf-8') as f:
            f.write(t)