我的方法:
利用爬蟲下載 html
from urllib import request
from lxml import etree
from bll.rate_manager_bll import rate_manager_bll
from datetime import datetime
from bll.order_bll import order_bll
from common import ali_oss
import imgkit
from common_tools.constant.common_constant import *
from dal.attachment_dal import attachment_dal
def capture_BCHK_real_rate(url='https://www.baidu.com', base_url='https://www.xxx.com'):
    """Download a web page and render its HTML to an image with imgkit.

    The page is fetched with a browser-like User-Agent header, every
    ``src="..."`` / ``href="..."`` link is resolved against ``base_url`` so
    that CSS, JS and images can still be loaded when the HTML is rendered
    from a string, and the result is handed to ``imgkit.from_string``.

    Args:
        url: Address of the page to download.
        base_url: Site root that relative ``src``/``href`` links are
            resolved against.

    Returns:
        Whatever ``imgkit.from_string(..., output_path=False)`` returns;
        per the imgkit README this is the rendered image data rather than
        a file written to disk.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # Local imports keep this block self-contained without touching the
    # file-level import list.
    import re
    from urllib.parse import urljoin

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
    }
    page_request = request.Request(url=url, headers=headers)
    # The context manager closes the HTTP connection even if read/decode fails.
    with request.urlopen(page_request) as response:
        html_info = response.read().decode('utf-8')

    # What if the page references CSS and JS? Rewrite the addresses.
    # A blind str.replace would also prefix links that are already absolute
    # (producing "https://www.xxx.comhttps://..."), so each link is resolved
    # with urljoin instead: absolute, protocol-relative, data: and
    # javascript: links are left intact, relative ones are anchored to base_url.
    def _absolutize(match):
        attribute, link = match.group(1), match.group(2)
        try:
            resolved = urljoin(base_url, link)
        except ValueError:
            # Malformed link: leave the attribute exactly as it was.
            return match.group(0)
        return f'{attribute}="{resolved}"'

    html_info_new = re.sub(r'(src|href)="([^"]*)"', _absolutize, html_info)

    # output_path is set to False so that no file is written; look up the
    # usage of imgkit.from_string for details.
    return imgkit.from_string(html_info_new, output_path=False)
其中遇到的坑
安裝 imgkit 的步驟:
Ubuntu 系統,先更新套件列表:sudo apt-get update
安裝:sudo apt-get install xvfb
sudo apt-get install wkhtmltopdf
sudo pip install pdfkit
sudo pip install imgkit
還遇到字體的問題