Python 讀取 PPT 內容
阿新 • 發佈:2021-01-12
import json

from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE

# The characters bytes.strip() removes. The original code stripped the UTF-8
# encoded bytes, so only these ASCII characters were ever trimmed; keeping the
# same set keeps the returned strings identical (e.g. U+3000 is left alone).
_ASCII_WHITESPACE = " \t\n\r\x0b\x0c"


def _clean(text):
    """Return *text* with leading/trailing ASCII whitespace removed."""
    return text.strip(_ASCII_WHITESPACE)


def _iter_group_texts(group_shape):
    """Yield the cleaned text of each text-frame shape inside a group shape.

    Children are visited in the order the group lists them. A child that is
    itself a group is descended into, so text at any nesting depth is found.
    """
    for shape in group_shape.shapes:
        if shape.has_text_frame:
            yield _clean(shape.text)
        elif shape.shape_type == MSO_SHAPE_TYPE.GROUP:
            yield from _iter_group_texts(shape)


def ppt_catch_format_text(filename):
    """Extract the text of a PowerPoint file, slide by slide.

    Args:
        filename: Path of the .pptx file; passed straight to
            ``pptx.Presentation``.

    Returns:
        A dict mapping the zero-based slide index to a list of strings, one
        per text-bearing shape. For each slide the texts of the top-level
        shapes come first, followed by the texts found inside group shapes
        (including groups nested within groups).

    Any error raised while opening or reading the file is not caught here and
    propagates to the caller.
    """
    prs = Presentation(filename)
    txt_oa = {}
    for index, slide in enumerate(prs.slides):
        shapes = list(slide.shapes)

        # Shapes sitting directly on the slide that expose a `text` attribute.
        texts = [_clean(shape.text) for shape in shapes if hasattr(shape, "text")]

        # Group shapes expose no text of their own; collect it from their
        # members, recursing so nested groups are not silently skipped.
        for shape in shapes:
            if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
                texts.extend(_iter_group_texts(shape))

        txt_oa[index] = texts
    return txt_oa


if __name__ == "__main__":
    text_list = ppt_catch_format_text('report.pptx')
    # Remove line breaks from the strings themselves *before* serialising.
    # Deleting the two characters backslash + "n" from the JSON output would
    # also damage an escaped literal backslash that happens to precede an "n"
    # (e.g. the text C:\new), producing invalid JSON.
    text_list = {
        slide_index: [text.replace("\n", "") for text in texts]
        for slide_index, texts in text_list.items()
    }
    text_list = json.dumps(text_list, ensure_ascii=False, indent=4)
    print(text_list)

# Glossary of the python-pptx terms used above (with pronunciation):
#   Presentation  /priːzenˈteɪʃn/  - the presentation (slide deck)
#   slides        /slaɪdz/         - the slides of the presentation
#   shape         /ʃeɪp/           - a shape placed on a slide