1. 程式人生 > 其它 >python 合併csv和excel檔案

python 合併csv和excel檔案

技術標籤:python

這篇文章是〈R語言 如何合併csv檔案(批量讀取csv檔案)〉的姊妹篇,提供更為強大的功能。

要解決的問題是:當csv檔案和xlsx檔案混雜在同一個資料夾時,如何把它們合併。具體來說,下面的程式碼會從每個檔案中提取指定列的文字,簡單清洗後存入一個新的csv檔案。

提取的列名為“博文內容”(在兩類檔案中列名相同)
process(data)為文字清洗函式
資料庫介面未使用
程式碼如下:

# This is a sample Python script.
import csv
import re
import os
import xlrd
import store_to_sql
# Press Shift+F10 to execute it or replace it with your code.
# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.

# Substitutions applied by process(), in this exact order.  They are kept as
# separate passes (rather than one merged alternation) because the order in
# which overlapping brackets are removed affects the result.
_CLEANUP_PATTERNS = (
    re.compile(r"\(.*?\)|\{.*?\}|\[.*?\]|\<.*?\>"),   # (...), {...}, [...], <...>
    re.compile(r"\【.*?\】|\{.*?\}|\[.*?\]|\<.*?\>"),   # 【...】
    re.compile(r"\#.*?\#|\{.*?\}|\[.*?\]|\<.*?\>"),   # #...# spans
    re.compile('[a-zA-Z]'),                           # ASCII letters
    re.compile('博文內容'),                            # stray header text
)


def find_path_list(path):
    """Return every entry of directory ``path`` as ``path + os.sep + name``.

    Entries come from ``os.listdir`` and are not filtered, so sub-directories
    are included as well.
    """
    return [path + os.sep + name for name in os.listdir(path)]


def _read_csv_column(file_dir, encoding, column_name):
    """Read one column of a CSV file; return None if the header lacks it."""
    with open(file_dir, 'r', encoding=encoding, newline='') as f:
        reader = csv.reader(f)
        header = next(reader, None)
        if header is None or column_name not in header:
            return None
        col = header.index(column_name)
        # csv.reader yields [] for blank lines and short lists for ragged
        # rows; skip those instead of failing on the whole file.
        return [row[col] for row in reader if len(row) > col]


def _read_excel_column(file_dir, column_name):
    """Read one column of the first sheet of a workbook; None if not found."""
    sheet = xlrd.open_workbook(file_dir).sheet_by_index(0)
    rows = sheet.get_rows()
    for row in rows:
        for col, cell in enumerate(row):
            if cell.value == column_name:
                # ``rows`` is a generator, so this continues with the rows
                # that follow the header row.  ``.value`` yields the plain
                # cell content rather than the "text:'...'" cell repr.
                return [r[col].value for r in rows if len(r) > col]
    return None


def read_csv_colum(file_dir, encoding="utf-8-sig", column_name="博文內容"):
    """Extract the values of one named column from a CSV or Excel file.

    The file is first parsed as CSV.  If that fails for any reason (for
    example the bytes cannot be decoded) or the header does not contain
    ``column_name``, the file is opened with ``xlrd`` and the first sheet is
    searched for a cell equal to ``column_name``.

    Args:
        file_dir: Path of the file to read.
        encoding: Text encoding used for the CSV attempt.
        column_name: Header text of the column to extract.  If several
            columns carry this header, the first one is used.

    Returns:
        The column values below the header, in file order.  The header cell
        itself is not included.  An empty list is returned when the file
        cannot be read in either format or the column is absent.
    """
    try:
        data = _read_csv_column(file_dir, encoding, column_name)
    except Exception:
        # Best effort: anything that is not readable as CSV gets a second
        # chance as an Excel workbook below.
        data = None
    if data is not None:
        return data

    try:
        data = _read_excel_column(file_dir, column_name)
    except Exception:
        # Neither CSV nor a workbook xlrd can open: contribute nothing.
        return []
    if data is None:
        return []
    print("這是xlsx檔案")
    return data


def write_csv(file_dir, result_data):
    """Append each item of ``result_data`` to ``file_dir`` as a one-cell CSV row.

    The file is opened in append mode with UTF-8-with-BOM encoding, so
    repeated calls keep adding rows to the same file.
    """
    with open(file_dir, 'a', encoding='utf-8-sig', newline='') as f:
        csv.writer(f).writerows([item] for item in result_data)


def write_sql(file_dir, result_data, connect):
    """Pass each item of ``result_data`` to ``store_to_sql.store_to_sql``.

    ``file_dir`` is accepted for signature parity with ``write_csv`` and is
    not used.  ``connect`` is forwarded unchanged as the second argument.
    """
    for data in result_data:
        store_to_sql.store_to_sql(data, connect)


def process(data):
    """Clean every text in ``data`` and drop the ones that end up too short.

    Each item is converted with ``str`` and then stripped of: spans enclosed
    in ``()``, ``{}``, ``[]``, ``<>``, ``【】`` or a pair of ``#``; all ASCII
    letters; and the literal header text ``博文內容``.  Results shorter than
    five characters are discarded.

    Returns:
        A new list with the cleaned texts that were kept, in input order.
    """
    deal_data = []
    for text in data:
        cleaned = str(text)
        for pattern in _CLEANUP_PATTERNS:
            cleaned = pattern.sub("", cleaned)
        if len(cleaned) >= 5:
            deal_data.append(cleaned)
    return deal_data


# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    # Absolute path of the folder holding the csv/xlsx files to merge.
    # Written as a raw string: the value is unchanged, but the backslashes
    # are no longer parsed as (invalid) escape sequences.
    fold_path = r"E:\課程\自然語言處理\資料彙總的副本"
    # Path of the merged output CSV; it lives inside the input folder.
    new_csv_dir = fold_path + os.sep + "合併後資料.csv"

    # Paths of all input files.  The output file is excluded so that a
    # re-run does not read the previous merged result back in and append it
    # to itself.
    file_dir_list = [p for p in find_path_list(fold_path) if p != new_csv_dir]
    total = len(file_dir_list)

    # To store into a database instead, replace write_csv in the loop below
    # with write_sql and create the connection first, e.g.:
    #   store_to_sql.create_tab(store_to_sql.conn_sql())
    #   conn = store_to_sql.conn_sql()
    for i, file_dir in enumerate(file_dir_list):
        try:
            result_data = read_csv_colum(file_dir)
            deal_data = process(result_data)
            write_csv(new_csv_dir, deal_data)
            # Progress as a percentage of files handled so far.
            print("已經完成{}%".format(round((i + 1) * 100 / total, 2)))
        except UnicodeDecodeError as e:
            print(e)

新年第一篇部落格,新年快樂!進一步有進一步的喜悅~