python 合併csv和excel檔案
阿新 • • 發佈:2021-01-01
技術標籤:python
這篇文章是R語言 如何合併csv檔案(批量讀取csv檔案)的姊妹篇。提供更為強大的功能。
要解決的問題是:當同一個資料夾中混雜著csv檔案和xlsx檔案時,如何把它們合併起來。具體來說,下面的程式碼會從每個檔案中提取指定列的文字,經過簡單清洗後存入一個新的csv檔案。
提取的列名為“博文內容”(在兩類檔案中列名相同)
process(data)為文字清洗模組
資料庫介面未使用
程式碼如下:
# This is a sample Python script.
import csv
import re
import os
import xlrd
import store_to_sql
# Press Shift+F10 to execute it or replace it with your code.
# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.
def find_path_list(path):
    """Return the entries of directory *path* as paths.

    Each name reported by ``os.listdir`` is prefixed with *path* and the
    platform path separator. The order is whatever ``os.listdir`` yields.
    """
    return [path + os.sep + entry for entry in os.listdir(path)]
def _csv_column(file_dir, encoding, column):
    """Return every value of *column* from a CSV file, header cell included."""
    with open(file_dir, 'r', encoding=encoding, newline='') as f:
        reader = csv.reader(f)
        header = next(reader, None)
        if header is None:
            # Empty file: nothing to extract.
            return []
        matches = [i for i, name in enumerate(header) if name == column]
        if not matches:
            # Without this guard a negative index would silently pick the
            # last column and pollute the merged output with unrelated data.
            return []
        col = matches[-1]  # if the header repeats the name, the last one wins
        # Blank or truncated rows are skipped; one bad row must not discard
        # the whole file.
        return [header[col]] + [row[col] for row in reader if len(row) > col]


def _excel_column(file_dir, column):
    """Return the values of *column* from the first sheet of an Excel workbook."""
    rbook = xlrd.open_workbook(file_dir)
    rsheet = rbook.sheet_by_index(0)  # only the first worksheet is read
    col = None
    for row in rsheet.get_rows():
        for i, cell in enumerate(row):
            if cell.value == column:
                col = i
                break
        if col is not None:
            # Stop at the first row that contains the header cell.
            break
    if col is None:
        return []
    # Use the cell values rather than the Cell objects, whose str() form is
    # "text:'...'" and would leak into the cleaned text.
    return [row[col].value for row in rsheet.get_rows()]


def read_csv_colum(file_dir, encoding="utf-8-sig", column="博文內容"):
    """Extract one named column from a CSV file or, failing that, an Excel file.

    The file is first read as CSV. If it cannot be opened, decoded or parsed
    as CSV, it is handed to ``xlrd`` and the first worksheet is searched.

    Args:
        file_dir: Path of the file to read.
        encoding: Text encoding used for the CSV attempt.
        column: Header text of the column to extract.

    Returns:
        A list of the column's values from every row, the header row
        included. The list is empty when the column is not present or the
        file is readable as neither CSV nor Excel.
    """
    try:
        return _csv_column(file_dir, encoding, column)
    except (OSError, UnicodeError, csv.Error):
        # Not a readable CSV (e.g. a binary workbook): try Excel next.
        pass
    try:
        data = _excel_column(file_dir, column)
    except Exception:
        # Best effort: xlrd raises several unrelated exception types for
        # unsupported or corrupt files; such files contribute no data.
        return []
    print("這是xlsx檔案")
    return data
def write_csv(file_dir, result_data):
    """Append every item of *result_data* to a CSV file, one single-cell row each.

    The file at *file_dir* is opened in append mode with the ``utf-8-sig``
    encoding, so repeated calls keep adding rows to the same file.
    """
    with open(file_dir, mode='a', encoding='utf-8-sig', newline='') as out_file:
        csv.writer(out_file).writerows([item] for item in result_data)
def write_sql(file_dir, result_data, connect):
    """Pass every item of *result_data* to ``store_to_sql.store_to_sql``.

    Each item is forwarded together with *connect*. *file_dir* is accepted
    but not used by this function.
    """
    for record in result_data:
        store_to_sql.store_to_sql(record, connect)
def process(data):
    """Clean each item of *data* and keep only the results worth storing.

    Every item is converted to ``str`` and the substitutions below are
    applied in order, each deleting what it matches. Items shorter than five
    characters after cleaning are dropped.

    Returns:
        A list of the cleaned strings, in input order.
    """
    removal_patterns = (
        # text enclosed in (), {}, [] or <>
        r"\(.*?\)|\{.*?\}|\[.*?\]|\<.*?\>",
        # text enclosed in 【】 (plus the same bracket pairs again)
        r"\【.*?\】|\{.*?\}|\[.*?\]|\<.*?\>",
        # text between two # marks (plus the same bracket pairs again)
        r"\#.*?\#|\{.*?\}|\[.*?\]|\<.*?\>",
        # ASCII letters
        '[a-zA-Z]',
        # the column header text itself
        '博文內容',
    )
    cleaned = []
    for item in data:
        text = str(item)
        for pattern in removal_patterns:
            text = re.sub(pattern, "", text)
        if len(text) >= 5:
            cleaned.append(text)
    return cleaned
# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    # Absolute path of the folder holding the csv/xlsx files to merge.
    # A raw string keeps the Windows backslashes literal without relying on
    # invalid escape sequences.
    fold_path = r"E:\課程\自然語言處理\資料彙總的副本"
    # The merged, cleaned output is written into the same folder.
    new_csv_dir = fold_path + os.sep + "合併後資料.csv"
    # Leave the output file out of the inputs: on a re-run it would otherwise
    # be read back and appended to itself.
    file_dir_list = [
        file_path
        for file_path in find_path_list(fold_path)
        if file_path != new_csv_dir
    ]
    # To store into a database instead, create the table and a connection
    # via store_to_sql (create_tab / conn_sql) and replace write_csv below
    # with write_sql.
    for i, file_path in enumerate(file_dir_list):
        try:
            result_data = read_csv_colum(file_path)
            deal_data = process(result_data)
            write_csv(new_csv_dir, deal_data)
            print("已經完成{}%".format(round((i + 1) * 100 / len(file_dir_list), 2)))
        except UnicodeDecodeError as e:
            print(e)
新年第一篇部落格,新年快樂!進一步有進一步的喜悅~