1. 程式人生 > >如何解決Windows系統儲存檔案的亂碼問題

如何解決Windows系統儲存檔案的亂碼問題

如何解決爬取某時段微博資料、但儲存在 csv 檔案中出現亂碼的問題。 1、獲取微博資料並儲存在 csv 檔案中,原始碼如下:

# start_chrome -> input_date -> scroll_down-> find_cards_info -> save -> find_next (goto)
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import csv
import os

# 執行前先下載 chrome driver,下載地址是:https://sites.google.com/a/chromium.org/chromedriver/downloads,點選【Latest Release: ChromeDriver x.xx】進入下載
def start_chrome():
    """Launch a Chrome browser via chromedriver and return the driver.

    On Windows the driver binary is './chromedriver.exe'.
    """
    driver = webdriver.Chrome(executable_path='./chromedriver')
    driver.start_client()
    return driver


def q(st, et):
    """Build the Weibo search query string for the date range [st, et].

    st / et are 'YYYY-MM-DD' strings; the result is appended to the
    profile URL (weibo.com/<user> + q(st, et)).
    """
    return f'?is_ori=1&key_word=&start_time={st}&end_time={et}&is_search=1&is_searchadv=1#_0'


def scroll_down():
    """Scroll to the bottom of the page repeatedly so lazy-loaded cards appear."""
    html_page = driver.find_element_by_tag_name('html')
    for i in range(15):
        print(i)
        html_page.send_keys(Keys.END)
        time.sleep(0.6)  # give the page time to load the next batch


def find_cards_info():
    """Extract [content, post_time, link] from every feed card on the page.

    Returns a list of 3-element lists, e.g. [[text, time, url], ...].
    """
    cards_sel = 'div.WB_feed_detail'
    cards = driver.find_elements_by_css_selector(cards_sel)
    info_list = []
    for card in cards:
        content_sel = 'div.WB_text.W_f14'
        time_sel = 'div.WB_from.S_txt2'
        link_sel = 'div.WB_from.S_txt2 > a:nth-child(1)'
        content = card.find_element_by_css_selector(content_sel).text
        # NOTE: renamed from `time` — the original shadowed the imported
        # `time` module inside this function.
        post_time = card.find_element_by_css_selector(time_sel).text
        link = card.find_element_by_css_selector(link_sel).get_attribute('href')
        info_list.append([content, post_time, link])
    return info_list


def find_next():
    """Return the href of the 'next page' link, or None when on the last page."""
    next_sel = 'a.page.next'
    next_page = driver.find_elements_by_css_selector(next_sel)
    if next_page:
        return next_page[0].get_attribute('href')


def save(info_list, name):
    """Append the scraped rows to './<name>.csv' (e.g. 2018-01-02~2018-03-05.csv).

    Fix for the mojibake described in this article: the page is UTF-8 but
    Windows tools default to a legacy codepage, so write UTF-8 with a BOM
    ('utf-8-sig') — Excel then detects the encoding correctly.  newline=''
    is required by the csv module to avoid blank rows on Windows.  Mode
    'a' creates the file when it does not exist, so no exists() check is
    needed.
    """
    full_path = './' + name + '.csv'
    with open(full_path, 'a', encoding='utf-8-sig', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(info_list)
        print('Done')


def run_crawler(base, duration):
    """Crawl every result page for `duration` ('start~end') starting at `base`.

    Recurses on the 'next page' link until find_next() returns None.
    """
    if not base.endswith('feedtop'):
        st, et = duration.split('~')
        driver.get(base + q(st, et))
    else:
        driver.get(base)
    time.sleep(5)      # wait for the page to render
    scroll_down()
    time.sleep(5)      # wait for lazy-loaded cards
    info_list = find_cards_info()
    save(info_list, duration)
    next_page = find_next()
    if next_page:
        run_crawler(next_page, duration)


base = 'https://weibo.com/bgsxy'  # replace with the profile you want to crawl
driver = start_chrome()
input()  # pause so you can log in manually before crawling starts
run_crawler(base, '2017-06-20~2018-02-03')  # replace with the desired date range

之後產生相關亂碼問題。 2、如何解決與 Windows 系統不相容的問題?爬取的資料在 csv 裡產生亂碼,可採用「三明治」轉碼方法(以 UTF-8 讀入,再以 gb18030 寫出)解決,程式碼如下:

import csv
import os

# The site is UTF-8, so the crawled CSV is UTF-8-encoded; Windows tools
# (Excel / Notepad) default to a GBK-family codepage when opening CSV, which
# produces mojibake (macOS defaults to UTF-8 and is unaffected).  Fix:
# decode the file as UTF-8 and re-encode it as gb18030 for Windows.
path = r'D:\2017-06-20~2018-09-03.csv'  # crawled (UTF-8) CSV
full_path = r'D:\hsf.csv'               # destination (gb18030) CSV

# Read every non-empty row, decoding as UTF-8 (most sites are UTF-8).
# `with` closes the file (the original leaked the handle); newline='' is
# what the csv module requires to handle line endings itself.
info_list = []
with open(path, 'r', encoding='utf-8', newline='') as src:
    for line in csv.reader(src):
        if line:
            info_list.append(line)
            print(line)
print('D')

# Re-encode as gb18030 so Windows tools display the text correctly.
# newline='' prevents csv from writing a blank row after every record on
# Windows.  Mode 'a' appends to an existing file; 'w+' creates/truncates.
mode = 'a' if os.path.exists(full_path) else 'w+'
with open(full_path, mode, encoding='gb18030', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(info_list)
    print('Done')

IDE中記得重新設定一下,不然依舊會報錯

(此處原文附有 IDE 編碼設定的截圖,轉載時圖片未能顯示)

說明:原始碼選自麻瓜教程實用主義課程中的,同時也感謝幫我解惑的蔡坤林同學和夏老師指導!如有不當之處,敬請指正!