Python:提取word中關鍵內容並導出到其它word和excel表格中

一、前言

新部門接到一個新需求,要求根據以前的會議紀要,提取相關信息(如下圖所示)。
在這裏插入圖片描述
包括了會議名稱、時間、地點、主持人、出席人員、列席人員、缺席人員、會議內容、彙報人、列席人等等,然後要生成兩樣東西:

1、會議通知

在這裏插入圖片描述
右下角是會議通知時間,根據會議時間往前倒推兩天自動生成。

2、會議總表

在這裏插入圖片描述

二、主要難點

1、原來的文件都是doc格式的,而python的docx庫(python-docx)只能讀取docx格式、無法讀取doc,所以必須先藉助win32com調用Word把doc轉換成docx;
2、對docx的庫使用不多,所以提取和寫入表格的代碼都是百度了好久獲得的;
3、寫入excel不難,只是參會人要把“出席人員”、“列席人員”和“缺席人員”組合起來,會議時間也只要日期不要時間;
4、最後還需要把word轉成pdf,又使用了win32的庫。

三、具體代碼

# coding=gbk
import docx
from win32com.client import Dispatch
from docx.shared import Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
import os
import datetime
import traceback

def excel_pre():
    """Launch Excel through COM and expose it as the module-global ``xl``.

    The application window is kept hidden and ``DisplayAlerts`` is set to 0.
    """
    global xl
    xl = Dispatch("Excel.Application")
    settings = (
        ("Visible", False),  # True would show the Excel window, False hides it
        ("DisplayAlerts", 0),
    )
    for attribute, value in settings:
        setattr(xl, attribute, value)

def doc2Docx(fileName):
    """Convert a .doc file to .docx with Word, then delete the original.

    The converted file is written next to the original as ``fileName + "x"``.

    Args:
        fileName: Path of the .doc file to convert.

    Raises:
        Whatever the Word COM calls or ``os.remove`` raise; in that case the
        original file is left in place.
    """
    word = Dispatch("Word.Application")
    try:
        doc = word.Documents.Open(fileName)
        try:
            # 12 is Word's wdFormatXMLDocument (.docx) save format.
            doc.SaveAs(fileName + "x", 12, False, "", True, "", False, False, False, False)
        finally:
            doc.Close()
    finally:
        # Always shut Word down so a failed conversion does not leave a
        # background Word process running.
        word.Quit()
    # Remove the source only after the conversion succeeded and Word has
    # released the document.
    os.remove(fileName)

def dict_judge(text):
    """Return the number of the first Chinese ordinal marker found in *text*.

    The markers '一、' through '九、' are tried in ascending order and map to
    1 through 9. Returns 0 when *text* contains none of them.
    """
    markers = ('一、', '二、', '三、', '四、', '五、', '六、', '七、', '八、', '九、')
    hits = (
        number
        for number, marker in enumerate(markers, start=1)
        if marker in text
    )
    return next(hits, 0)

def notice_time(meeting_time, timedelta=-2):
    """Derive the notice date from the meeting date.

    Args:
        meeting_time: Meeting date as a ``YYYY-MM-DD`` string.
        timedelta: Day offset added to the meeting date (default -2, i.e.
            two days earlier).

    Returns:
        The shifted date as a ``YYYY-MM-DD`` string.
    """
    date_format = "%Y-%m-%d"
    meeting_day = datetime.datetime.strptime(meeting_time, date_format)
    shifted = meeting_day + datetime.timedelta(days=timedelta)
    return shifted.strftime(date_format)

def report_time(meeting_time, num):
    """Simulate the start time of the *num*-th report.

    Any '午' character in *meeting_time* is replaced by '0' before parsing
    it as ``HH:MM``. The first report gets that (substituted) string back
    unchanged; every later report is scheduled 10 minutes after the one
    before it.
    """
    start_text = meeting_time.replace('午', '0')
    # Parse unconditionally so a malformed time is rejected for num == 1 too.
    start = datetime.datetime.strptime(start_text, '%H:%M')
    if num == 1:
        return start_text
    offset = datetime.timedelta(minutes=10 * (num - 1))
    return (start + offset).strftime('%H:%M')

def doc2pdf(input_file):
    """Export a Word document to PDF using Word itself.

    The PDF is written next to the input, with the file extension replaced
    by ``.pdf``.

    Args:
        input_file: Path of the Word document to convert.
    """
    # Swap only the extension; str.replace would also rewrite any ".docx"
    # that happens to appear in a directory name.
    pdf_path = os.path.splitext(input_file)[0] + ".pdf"
    word = Dispatch('Word.Application')
    try:
        doc = word.Documents.Open(input_file)
        try:
            doc.SaveAs(pdf_path, FileFormat=17)  # 17 is Word's wdFormatPDF
        finally:
            doc.Close()
    finally:
        # Always shut Word down so a failed export does not leave a
        # background Word process running.
        word.Quit()

def set_font(paragraph):
    """Remove the paragraph indents and set its text to 14 pt.

    The size is applied to every run of the paragraph; a paragraph without
    runs only has its indents reset.

    Args:
        paragraph: A python-docx paragraph object.
    """
    layout = paragraph.paragraph_format
    layout.left_indent = Pt(0)   # drop the left indent
    layout.right_indent = Pt(0)  # drop the right indent
    for run in paragraph.runs:
        run.font.size = Pt(14)  # 14 pt corresponds to the Chinese "size four" font

def _after_colon(text):
    """Return the part of *text* that follows its first ':'.

    Raises:
        IndexError: If *text* contains no ':'.
    """
    return text.split(':', 1)[1]


def get_meeting_info(docname):
    """Extract the key fields from one meeting-minutes document.

    The document is read from the module-level ``meeting_file_path``
    directory. Header paragraphs are recognised by their label (會議時間,
    會議地點, 會議主持, 出席人員, 列席人員, 缺席人員) and their value is taken
    after the first ':'. A paragraph containing an ordinal marker
    ('一、' .. '九、') is treated as an agenda item whose reporter is read
    from the paragraph that follows it.

    Args:
        docname: File name of the .docx document inside ``meeting_file_path``.

    Returns:
        A ``(meeting_dict, notice_dict)`` tuple.

        ``meeting_dict`` has the keys 會議名稱, 會議時間 (date part only),
        主持人 and 參會人 (attendees, observers and absentees joined by '、').

        ``notice_dict`` has the keys 會議名稱, 會議時間, 會議地點, 會議主持 and
        出席人員, plus 通知時間 once a 會議時間 paragraph has been seen, plus
        one integer key per agenda item mapping to
        ``[title, reporter, item observers, report time]``.

    Raises:
        IndexError: If a header paragraph has no ':' after its label.
        ValueError: If the meeting date or time cannot be parsed.
    """
    doc = docx.Document(meeting_file_path + docname)
    meeting_name = '上海NOC ' + docname.replace('紀要.docx', '')
    meeting_dict = {'會議名稱':meeting_name, '會議時間': '', '主持人': '', '參會人': ''}
    notice_dict = {'會議名稱':meeting_name, '會議時間':'', '會議地點':'', '會議主持':'', '出席人員': ''}
    attendee_groups = []
    # Stays empty until a 會議時間 paragraph is seen, so an agenda item that
    # comes first does not hit an unbound local.
    meeting_time = ''
    paragraphs = doc.paragraphs  # fetch once; reused for the look-ahead below
    for i, paragraph in enumerate(paragraphs):
        this_text = paragraph.text
        num = dict_judge(this_text)
        if '會議時間' in this_text:
            # Split on the first ':' only: the value itself contains one
            # (e.g. "14:30") that must survive.
            this_text = _after_colon(this_text)
            notice_dict['會議時間'] = this_text
            meeting_date = this_text[:10]   # date part
            meeting_time = this_text[-5:]   # time-of-day part
            meeting_dict['會議時間'] = meeting_date
            notice_dict['通知時間'] = notice_time(meeting_date)
        elif '會議地點' in this_text:
            notice_dict['會議地點'] = _after_colon(this_text)
        elif '會議主持' in this_text:
            meeting_dict['主持人'] = notice_dict['會議主持'] = _after_colon(this_text)
        elif '出席人員' in this_text or '列席人員' in this_text:
            attendee_groups.append(_after_colon(this_text))
        elif '缺席人員' in this_text:
            absentees = _after_colon(this_text)
            if absentees != '無':
                attendee_groups.append(absentees)
        elif num:
            this_title = this_text.split('、', 1)[1].replace('。', '')
            # The reporter line is the paragraph right after the item title;
            # it may be missing or carry no ':' at all, which yields ''.
            next_text = paragraphs[i + 1].text if i + 1 < len(paragraphs) else ''
            this_reporter = next_text.partition(':')[2]
            if '迴避' in this_reporter:
                this_reporter = this_reporter.split('(', 1)[0]
            if '列席人' in this_reporter:
                # Expected shape: "<reporter>(列席人:<names>)"; partition keeps
                # this from raising when '(' or ':' is absent.
                this_reporter, _, observer_part = this_reporter.partition('(')
                liexiren = observer_part.partition(':')[2].replace(')', '')
            else:
                liexiren = ''
            this_report_time = report_time(meeting_time, num) if meeting_time else ''
            notice_dict[num] = [this_title, this_reporter, liexiren, this_report_time]
    # Joining a list avoids a dangling '、' when the 缺席人員 line is missing.
    joiner = '、'.join(attendee_groups)
    meeting_dict['參會人'] = joiner
    notice_dict['出席人員'] = joiner.replace('(開會)', '')
    return meeting_dict, notice_dict

def make_report(report_dict):
    """Fill the meeting-notice template and save it as .docx and PDF.

    The template '會議通知模板.docx' is opened from the current working
    directory. Header fields go into rows 0-4 of its first table, agenda
    item *k* goes into row ``k + 5``, and the notice date is appended as a
    right-aligned paragraph. The result is saved into the module-level
    ``report_file_path`` directory and then converted with ``doc2pdf``.

    Args:
        report_dict: Mapping with the string keys 會議名稱, 會議時間, 會議地點,
            會議主持, 出席人員 and 通知時間, plus one integer key per agenda
            item whose value is
            ``[title, reporter, item observers, report time]``.

    Raises:
        KeyError: If one of the required string keys is missing.
        IndexError: If an agenda row does not exist in the template table.
    """
    doc = docx.Document('會議通知模板.docx')
    table = doc.tables[0]

    # Header fields: key -> (row, column) in the first table.
    doc_cell_dict = {'會議名稱': (0, 1), '會議時間': (1, 1), '會議地點': (2, 1), '會議主持': (3, 1), '出席人員': (4, 1)}
    for key, (row_index, col_index) in doc_cell_dict.items():
        cell = table.rows[row_index].cells[col_index]
        cell.text = report_dict[key]
        set_font(cell.paragraphs[0])

    # Agenda items are stored under integer keys. Select them by type
    # rather than deriving a count from len(report_dict), which breaks as
    # soon as a string key is added or missing.
    item_numbers = sorted(key for key in report_dict if isinstance(key, int))
    for number in item_numbers:
        cells = table.rows[number + 5].cells
        # Columns 1-4: content, reporter, item observers, report time.
        for col_index, value in zip(range(1, 5), report_dict[number]):
            cells[col_index].text = value
            set_font(cells[col_index].paragraphs[0])

    # Reuse the custom character style if the template already has it,
    # otherwise create it (2 is the character style type).
    try:
        style1 = doc.styles['style head1']
    except KeyError:
        style1 = doc.styles.add_style('style head1', 2)
    style1.font.bold = True
    style1.font.name = u'宋體 (中文正文)'
    style1.font.size = Pt(14)

    # Append the notice date, right-aligned, at the end of the document.
    date_paragraph = doc.add_paragraph()
    date_paragraph.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.RIGHT
    date_paragraph.add_run(report_dict['通知時間'], style=style1)

    # A full-width colon is used in the file name because the ASCII ':' is
    # a reserved character in Windows file names.
    output_path = report_file_path + '會議通知:' + report_dict['會議名稱'] + '.docx'
    doc.save(output_path)
    # Also export the notice as PDF.
    doc2pdf(output_path)

def make_newxls(meeting_dict, newrow):
    """Write one meeting record into row *newrow* of the global sheet ``ws``.

    Columns 1, 5, 6 and 7 receive values from *meeting_dict*; columns 3 and
    4 receive fixed text.
    """
    constant_cells = {3: '黨委(黨組)會', 4: '現場會議'}
    keyed_cells = {1: '會議名稱', 5: '會議時間', 6: '主持人', 7: '參會人'}
    for column in (1, 3, 4, 5, 6, 7):
        if column in constant_cells:
            cell_value = constant_cells[column]
        else:
            cell_value = meeting_dict[keyed_cells[column]]
        ws.Cells(newrow, column).Value = cell_value

if __name__ == "__main__":
    # Batch driver: for every .docx in the minutes folder, generate a meeting
    # notice (docx + pdf) and append one row to the Excel summary sheet.
    #
    # If the source documents are still in .doc format, convert them first by
    # calling doc2Docx(meeting_file_path + name) for each '.doc' file.
    base_dir = os.path.abspath('.')
    # The trailing '' keeps a trailing path separator, which the other
    # functions rely on when they append a file name to these globals.
    meeting_file_path = os.path.join(base_dir, '會議紀要庫', '')
    report_file_path = os.path.join(base_dir, '會議通知庫', '')
    this_path = os.path.join(base_dir, '')
    excel_pre()
    try:
        wb = xl.Workbooks.Open(this_path + '決策會議採集模板.xls')
        try:
            ws = wb.Sheets('決策會議')
            n = 3  # first worksheet row that receives a record
            try:
                for f in os.listdir(meeting_file_path):
                    if f.endswith('.docx'):
                        print(f)
                        meeting_dict, notice_dict = get_meeting_info(f)
                        make_report(notice_dict)  # notice goes into the notice folder
                        make_newxls(meeting_dict, n)  # record goes into the summary sheet
                        n += 1
            except Exception:
                # Report the failure but still save what has been written so far.
                traceback.print_exc()
        finally:
            wb.Save()
            wb.Close()
    finally:
        # Quit the hidden Excel instance; otherwise it stays alive in the
        # background after the script ends.
        xl.Quit()

成果:

在這裏插入圖片描述

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章