1. 程式人生 > 其它 > 解析doc文件中XML段落的個數

解析doc文件中XML段落的個數

# -*- coding:utf-8 -*-
'''
anslysis_doc.py

Purpose: count the XML tag paragraphs found in Word documents.

Steps:
1. Open each document with python-docx and walk its paragraphs in order.
2. The first paragraph that consists solely of one upper-case tag
   (e.g. <ACL>) selects the tag to count, and is itself counted.
3. Every later paragraph whose text starts with that same tag is counted.
4. When run as a script, print the count per document and the grand total.
'''
import os
import re

from docx import Document

# A paragraph made up of exactly one upper-case XML tag, e.g. <ACL>.
_TAG_PARAGRAPH = re.compile(r'^<[A-Z]+>$')


def _count_tag_paragraphs(texts):
    '''
    Count the tag paragraphs in a sequence of paragraph texts.

    :param texts: iterable of paragraph text strings, in document order
    :return: number of paragraphs that carry the first tag found
    '''
    tag = None
    count = 0
    for text in texts:
        if tag is None:
            # Still looking for the first stand-alone tag paragraph.
            found = _TAG_PARAGRAPH.match(text)
            if found:
                tag = found.group(0)
                count = 1
        elif text.startswith(tag):
            # The tag only contains '<', 'A'-'Z' and '>', none of which are
            # regex metacharacters, so a literal prefix test is what matching
            # the tag as a pattern amounted to.
            count += 1
    return count


def get_xml_count(path):
    '''
    :param path: absolute path of the Word document
    :return: number of XML tag paragraphs in the document
    '''
    doc = Document(path)
    return _count_tag_paragraphs(
        paragraph.text for paragraph in doc.paragraphs)


def main(path_dir='D:\\workspace_py\\FILES'):
    '''
    Print the tag-paragraph count of every .docx file directly inside
    path_dir, followed by the total over all of them.

    :param path_dir: directory that holds the documents to scan
    '''
    total = 0
    for name in os.listdir(path_dir):
        # Only hand .docx files to Document(); sub-directories and other
        # file types would make it raise. Names starting with "~$" are
        # presumably the lock files Word keeps next to an open document.
        if name.startswith('~$') or not name.lower().endswith('.docx'):
            continue
        path = os.path.join(path_dir, name)
        count = get_xml_count(path)
        print('%s檔案中,符合條件XML個數: %s' % (path, count))
        total += count
    print('總的符合條件XML個數:%s' % total)

    # To check a single document instead, call get_xml_count directly, e.g.
    # get_xml_count('D:\\workspace_py\\FILES\\Comware ACL NETCONF XML API Action Reference.docx')


if __name__ == '__main__':
    main()