python獲取html時,如果url中存在中文,會拋出UnicodeEncodeError異常
參考 https://www.cnblogs.com/jessicaxu/p/7977277.html
UnicodeEncodeError: 'ascii' codec can't encode characters in position 19-24: ordinal not in range(128)
# Fetch the HTML content of a URL and return it as a str.
# Uses urllib.request.urlopen with a browser User-Agent.
def getHtml(url):
    print("開始訪問:" + url)
    # Percent-encode non-ASCII characters (e.g. Chinese) before sending:
    # http.client builds the request line with request.encode('ascii'), so a
    # raw non-ASCII URL raises UnicodeEncodeError. The reserved characters
    # listed in `safe` keep the URL structure intact.
    url = parse.quote(url, safe=":/?&=#%")
    # Without a browser-like User-Agent some sites reject the request with
    # urllib HTTPError: HTTP Error 403: Forbidden (anti-crawler measure).
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
    req = urllib.request.Request(url=url, headers=headers)
    # Context manager guarantees the HTTP response is closed even on error
    # (the original leaked the connection).
    with urllib.request.urlopen(req, timeout=1) as page:
        # read() returns bytes in Python 3; decode explicitly to get a str.
        html = page.read().decode("utf-8")
    print("訪問成功:" + url)
    return html
cUrl https://www.aa223.com/yousheng/list-誘惑短篇小說.html
開始訪問:https://www.aa223.com/yousheng/list-誘惑短篇小說.html
Traceback (most recent call last):
File "E:\GitMulu\PythonCrawler\src\kxf\test\mp3\__init__.py", line 107, in <module>
htmlChild = getHtml(str(cUrl))
File "E:\GitMulu\PythonCrawler\src\kxf\test\utils\__init__.py", line 46, in getHtml
page = urllib.request.urlopen(req, timeout=1)
File "D:\Programs\Python\Python37\lib\urllib\request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "D:\Programs\Python\Python37\lib\urllib\request.py", line 525, in open
response = self._open(req, data)
File "D:\Programs\Python\Python37\lib\urllib\request.py", line 543, in _open
'_open', req)
File "D:\Programs\Python\Python37\lib\urllib\request.py", line 503, in _call_chain
result = func(*args)
File "D:\Programs\Python\Python37\lib\urllib\request.py", line 1360, in https_open
context=self._context, check_hostname=self._check_hostname)
File "D:\Programs\Python\Python37\lib\urllib\request.py", line 1317, in do_open
encode_chunked=req.has_header('Transfer-encoding'))
File "D:\Programs\Python\Python37\lib\http\client.py", line 1244, in request
self._send_request(method, url, body, headers, encode_chunked)
File "D:\Programs\Python\Python37\lib\http\client.py", line 1255, in _send_request
self.putrequest(method, url, **skips)
File "D:\Programs\Python\Python37\lib\http\client.py", line 1122, in putrequest
self._output(request.encode('ascii'))
UnicodeEncodeError: 'ascii' codec can't encode characters in position 19-24: ordinal not in range(128)
把 URL 中的中文字符先轉成 URL 編碼(百分號編碼)再發請求,即可解決,如下
# Fetch the HTML content of a URL and return it as a str.
# Non-ASCII (Chinese) characters in the URL are percent-encoded first via
# getUrlCode(); otherwise http.client raises UnicodeEncodeError when it
# builds the ASCII request line.
def getHtml(url):
    url = getUrlCode(url)
    print("開始訪問:" + url)
    # Without a browser-like User-Agent some sites reject the request with
    # urllib HTTPError: HTTP Error 403: Forbidden (anti-crawler measure).
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
    req = urllib.request.Request(url=url, headers=headers)
    # Context manager guarantees the HTTP response is closed even on error
    # (the original leaked the connection).
    with urllib.request.urlopen(req, timeout=10) as page:
        # read() returns bytes in Python 3; decode explicitly to get a str.
        html = page.read().decode("utf-8")
    print("訪問成功:" + url)
    return html
# Percent-encode the non-ASCII (e.g. Chinese) characters of a URL so the URL
# can be sent in an ASCII HTTP request line.
#
# Generalized from the original, which matched only CJK ideographs
# (\u4e00-\u9fa5): this version encodes every non-ASCII run, so full-width
# punctuation, kana, etc. are handled too. ASCII-only URLs are returned
# unchanged; output for CJK input is identical to the original.
def getUrlCode(url):
    print("getUrlCode " + url)
    # Maximal runs of non-ASCII characters; set() avoids re-encoding a run
    # that occurs more than once (str.replace already substitutes all
    # occurrences on the first pass).
    for segment in set(re.findall(r"[^\x00-\x7f]+", url)):
        encoded = parse.quote(segment)  # e.g. 哈哈 -> %E5%93%88%E5%93%88
        print(encoded)
        url = url.replace(segment, encoded)
    print("getUrlCode =====" + url)
    return url