1. 程式人生 > 實用技巧 >Python-從郵件中提取內容並插入資料庫

Python-從郵件中提取內容並插入資料庫

引子

有100多封eml格式的本地郵件,需要從每封郵件中提取出特定的內容,並插入資料庫中。難處在於如何使用正則提取內容。想試驗正則表示式的結果,可以下載RegexBuddy,方便除錯。

程式碼

import os
from email import message_from_file
import re
import hashlib
import json
import pymssql

def parseEmail(file):
    with open(file) as f:
        name = f.name
        msg = message_from_file(f)
        target_text 
= msg.get_payload()[0].get_payload() # <span>Timestamp: 2020-07-30 00:00:00 (UTC) </span> # <strong>Koality_AutoDQ4_OnePipeline_OfficeForms</strong> timestamp = re.findall("Timestamp: (\d{4}-\d{1,2}-\d{1,2})",target_text)[0] dataset = re.findall("Dataset: .+_.+_.+_(.+_*\w+) <http
",target_text)[0] unique_dataset_name = re.findall("Dataset: (.+_.+_.+_.+) <http",target_text)[0] metrics = re.findall(" Metrics[:=]\n*(.+\n*.+)\n*<=*\n*h", target_text)[0] metrics = metrics.replace('\n', '').replace('=','').replace(': ', '') # Koality_AutoDQ4_OA-OXO_Teams_DoD, whose dataset is Teams_DoD instaed of DoD
if 'Teams_' in unique_dataset_name: dataset = 'Teams_' + dataset incident_source_text = target_text[target_text.find('Incident List'): target_text.find('RootCause')].replace('=', '').replace('\n', '') incidents_str = incident_source_text[:incident_source_text.find('<https')] incdient_groups = re.findall("(\w+:\w+),", incidents_str) incident_dict = {} for incident in incdient_groups: k, v = incident.split(':') incident_dict[k] = v incident_json = json.dumps(incident_dict) hashkey = toHash(incident_json) return timestamp, dataset, metrics, incident_json, hashkey, unique_dataset_name def toHash(text): hs = hashlib.md5() hs.update(text.encode()) return hs.hexdigest().upper() def insertIntoDB(conn,table, results): cur = conn.cursor() count = 0 for row in results: rowDict = {'table': table, 'timestamp':row[0], 'dataset':row[1], 'metric':row[2], 'json':row[3], 'hashkey':row[4], 'unique_dataset_name': row[5]} sql = """ insert into [{table}] ([WorkloadName], [MetricName], [DimensionCombination_JsonForDb_Hash], [Timestamp], [DimensionCombination_JsonForDb], [UniqueDatasetName], [MetricValue], [ExpectedMetricValue]) VALUES ('{dataset}', '{metric}','{hashkey}','{timestamp}', '{json}', '{unique_dataset_name}', 0, 0); """.format(**rowDict) try: cur.execute(sql) print('Inserted a row......') except: print(row[1], row[5]) count += 1 print("Successfully inserted {:d} rows....".format(count)) cur.close() conn.close() return True def main(): print("Starting script......") emailFolder = r"C:\Work\IncidentEmails" results = [] for filePath in os.listdir(emailFolder): fullPath = emailFolder + "\\" + filePath timestamp, dataset, metrics, incident_json, hashkey, unique_dataset_name = parseEmail(fullPath) results.append([timestamp, dataset, metrics, incident_json, hashkey, unique_dataset_name]) conn = pymssql.connect('server', 'usr@server', 'pwd', 'db', autocommit=True) insertIntoDB(conn, 'KenshoAutoDQAlert', results) if __name__ == "__main__": main()
View Code