# 此功能需要安裝 Python 的 win32com 模塊(僅適用於已安裝 Microsoft Word 的 Windows 環境):
#     python -m pip install pypiwin32
# coding=utf-8
'''
word 文檔信息提取
'''
import os,fnmatch
from win32com import client as wc
from win32com.client import Dispatch
def word2txt(filepath, savepath=''):
    """Export the text of a Word document to a ``.txt`` file via Word COM.

    The output file keeps the source document's base name and gets a
    ``.txt`` extension.

    Args:
        filepath: Path to a ``.doc`` or ``.docx`` file. It is converted to
            an absolute path before being handed to Word.
        savepath: Directory to write the ``.txt`` file into. When empty
            (the default), the file is written next to the source document.

    Returns:
        None. When the extension is neither ``.doc`` nor ``.docx`` a
        message is printed and nothing is converted.
    """
    # Values passed to Word's SaveAs / Close calls (see the WdSaveFormat and
    # WdSaveOptions enumerations in the Word object model reference).
    wd_format_dos_text = 4      # save as plain text
    wd_do_not_save_changes = 0  # close without prompting to save

    # Word runs as a separate process, so give it an absolute path rather
    # than one that is relative to this script's working directory.
    filepath = os.path.abspath(filepath)

    # 1. Split the path into its directory and file name.
    dirs, filename = os.path.split(filepath)

    # 2. Swap the extension for ".txt". Only the bare file name is used so
    #    that `savepath` is honoured when the result is joined below
    #    (joining onto an absolute path would discard `savepath`).
    stem, ext = os.path.splitext(filename)
    if ext.lower() not in ('.doc', '.docx'):
        print('僅支持 doc和docx格式')
        return None
    new_name = stem + '.txt'

    # 3. Decide where the new file is saved: default to the source folder.
    if not savepath:
        savepath = dirs
    new_path = os.path.abspath(os.path.join(savepath, new_name))

    # 4. Start (or attach to) Word and open the document.
    wordapp = wc.Dispatch('Word.Application')
    try:
        document = wordapp.Documents.Open(filepath)
        try:
            # 5. Save the document's text to the new path.
            document.SaveAs(new_path, wd_format_dos_text)
        finally:
            # Always release the document, even if SaveAs failed.
            document.Close(wd_do_not_save_changes)
    finally:
        # Shut Word down so no WINWORD.EXE process is left behind, but only
        # when no other documents are open in this instance, in case the
        # Dispatch call attached to a Word session the user is working in.
        if wordapp.Documents.Count == 0:
            wordapp.Quit()
    return None
if __name__ == '__main__':
    # Sample inputs resolved to absolute paths: a .doc file, a .docx file
    # and a .pdf file (an unsupported extension).
    sample_paths = [
        os.path.abspath(sample_name)
        for sample_name in (r'文檔1.doc', r'文檔2.docx', r'pdf文檔.pdf')
    ]
    # Only the first sample (the .doc file) is actually converted.
    word2txt(sample_paths[0])
# 親測有效