python3安裝 使用 pdfminer3k python在線、本地讀取PDF文件

pdfminer3k 在線、本地讀取PDF文件

上資源

官網pdfminer3k
下載pdfminer3k

上代碼

就着註釋看代碼,是一件美差。

#! python3
# -*- coding: utf-8 -*-

"""
@Time    : 2017/8/17 18:07
@Author  : typhoon
@Site    :
@File    : test_has_package_python3.py
@Software: PyCharm
@desc    : parse pdf
"""

import importlib
import sys
import random
from urllib.request import urlopen
from urllib.request import Request

from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
from pdfminer.pdfparser import PDFParser, PDFDocument

'''
 解析pdf 文本,保存到txt文件中
'''
# NOTE(review): reloading sys looks like a leftover of the Python 2
# `reload(sys)` / `sys.setdefaultencoding(...)` idiom; it is presumably
# unnecessary on Python 3 -- confirm before removing.
importlib.reload(sys)

# Pool of browser User-Agent strings; parse() picks one at random for the
# request it sends.
user_agent = ['Mozilla/5.0 (Windows NT 10.0; WOW64)', 'Mozilla/5.0 (Windows NT 6.3; WOW64)',
              'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0',
              'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
              'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
              'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
              'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
              'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
              'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
              'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
              'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
              'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
              'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
              'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
              'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
              'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
              'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
              'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
              'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
              'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
              'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
              'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
              'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11']


def parse(_path):
    """Fetch a PDF over the network and print the text of its text boxes.

    The document at ``_path`` is requested with a randomly chosen
    User-Agent header, parsed page by page, and the text of every
    ``LTTextBoxHorizontal`` layout object is printed to stdout prefixed
    with ``"results: "``.

    To read a local file instead, open it with ``open(_path, 'rb')`` in
    place of the ``urlopen`` call.

    Args:
        _path: URL of the PDF document to download.

    Raises:
        PDFTextExtractionNotAllowed: if the document reports that it is
            not extractable.
    """
    # Pick one User-Agent at random from the module-level pool.
    request = Request(url=_path, headers={'User-Agent': random.choice(user_agent)})

    # Use the response as a context manager so the connection is closed
    # even if parsing raises; the original never closed it. All parsing
    # stays inside the block because the parser reads from this stream.
    with urlopen(request) as fp:
        # Build a parser on the stream and link it with an empty document.
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        # No password is supplied; for a protected document pass it here,
        # e.g. doc.initialize("123456").
        doc.initialize()

        # Refuse documents that do not allow text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager for shared resources, default layout-analysis
        # parameters, an aggregator device that collects each page's
        # layout, and the interpreter that drives them.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Process one page at a time.
        for page in doc.get_pages():
            interpreter.process_page(page)

            # Iterating the page layout yields its layout objects; only
            # horizontal text boxes are printed, everything else (figures,
            # images, ...) is skipped.
            layout = device.get_result()
            for out in layout:
                if isinstance(out, LTTextBoxHorizontal):
                    results = out.get_text()
                    print("results: " + results)
if __name__ == '__main__':
    # Demo run: parse the sample online PDF and print its text.
    parse("http://www.caac.gov.cn/XXGK/XXGK/TJSJ/201708/P020170821330916187824.pdf")

安裝pdfminer3k

此爲python3的包,so python2.7的同學請見諒
若只安裝了python3,使用以下語句安裝:

pip install pdfminer3k

若python同時安裝了幾個版本(python2.7 and python3.6)

pip3 install pdfminer3k
或者
py -3 -m pip install pdfminer3k

運行python3程序

py -3 test.py

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章