Fetching HTML in Python throws a UnicodeEncodeError when the URL contains Chinese characters


Reference: https://www.cnblogs.com/jessicaxu/p/7977277.html

UnicodeEncodeError: 'ascii' codec can't encode characters in position 19-24: ordinal not in range(128)

import urllib.request

# Fetches the HTML content of a page, using urllib.request.urlopen.
def getHtml(url):
    print("Start fetching: " + url)
    # Without the User-Agent header below, sites that block crawlers respond with
    # urllib.error.HTTPError: HTTP Error 403: Forbidden. Sending a browser-like
    # User-Agent works around this; the exact value can be copied from a real
    # browser (e.g. via Firefox's Firebug plugin).
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
    # Other values also work, e.g.:
    # Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)
    # Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0
    req = urllib.request.Request(url=url, headers=headers)
    page = urllib.request.urlopen(req, timeout=1)
    # In Python 3, read() returns bytes, so decode explicitly; otherwise the
    # caller runs into an encoding error when treating the result as text.
    html = page.read().decode("utf-8")
    print("Fetched successfully: " + url)
    return html

 

With cUrl = https://www.aa223.com/yousheng/list-誘惑短篇小說.html, calling getHtml(str(cUrl)) prints:
Start fetching: https://www.aa223.com/yousheng/list-誘惑短篇小說.html
Traceback (most recent call last):
  File "E:\GitMulu\PythonCrawler\src\kxf\test\mp3\__init__.py", line 107, in <module>
    htmlChild = getHtml(str(cUrl))
  File "E:\GitMulu\PythonCrawler\src\kxf\test\utils\__init__.py", line 46, in getHtml
    page = urllib.request.urlopen(req, timeout=1)
  File "D:\Programs\Python\Python37\lib\urllib\request.py", line 222, in urlopen
    return opener.open(url, data, timeout)
  File "D:\Programs\Python\Python37\lib\urllib\request.py", line 525, in open
    response = self._open(req, data)
  File "D:\Programs\Python\Python37\lib\urllib\request.py", line 543, in _open
    '_open', req)
  File "D:\Programs\Python\Python37\lib\urllib\request.py", line 503, in _call_chain
    result = func(*args)
  File "D:\Programs\Python\Python37\lib\urllib\request.py", line 1360, in https_open
    context=self._context, check_hostname=self._check_hostname)
  File "D:\Programs\Python\Python37\lib\urllib\request.py", line 1317, in do_open
    encode_chunked=req.has_header('Transfer-encoding'))
  File "D:\Programs\Python\Python37\lib\http\client.py", line 1244, in request
    self._send_request(method, url, body, headers, encode_chunked)
  File "D:\Programs\Python\Python37\lib\http\client.py", line 1255, in _send_request
    self.putrequest(method, url, **skips)
  File "D:\Programs\Python\Python37\lib\http\client.py", line 1122, in putrequest
    self._output(request.encode('ascii'))
UnicodeEncodeError: 'ascii' codec can't encode characters in position 19-24: ordinal not in range(128)
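
The last frame of the traceback shows the root cause: http.client builds the HTTP request line and encodes it with request.encode('ascii'), so a URL path containing Chinese characters cannot be represented and the request fails before anything is sent. Note that positions 19-24 in the error message are exactly the six Chinese characters of the path. A minimal sketch of that failing step, assuming Python 3 and the path from the example above:

# The request line http.client tries to send; the Chinese path segment has no
# ASCII representation, so encoding it raises the same UnicodeEncodeError.
request_line = "GET /yousheng/list-誘惑短篇小說.html HTTP/1.1"
request_line.encode("ascii")  # UnicodeEncodeError: 'ascii' codec can't encode characters in position 19-24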

Percent-encoding the Chinese characters in the URL fixes this, as follows:

import re
import urllib.request
from urllib import parse

# Fetches the HTML content of a page, using urllib.request.urlopen.
def getHtml(url):
    url = getUrlCode(url)
    # Without the User-Agent header below, sites that block crawlers respond with
    # urllib.error.HTTPError: HTTP Error 403: Forbidden, so pretend to be a browser.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
    # Other values also work, e.g.:
    # Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0
    req = urllib.request.Request(url=url, headers=headers)
    page = urllib.request.urlopen(req, timeout=10)
    # In Python 3, read() returns bytes, so decode explicitly.
    html = page.read().decode("utf-8")
    print("Fetched successfully: " + url)
    return html

# Percent-encodes the Chinese characters in a URL so the request line is pure ASCII.
def getUrlCode(url):
    print("getUrlCode " + url)
    # Find every run of CJK characters (U+4E00 to U+9FA5) in the URL.
    names = re.findall(u"[\u4e00-\u9fa5]+", url)
    for name1 in names:
        str2 = parse.quote(name1)  # percent-encode, e.g. 哈哈 -> %E5%93%88%E5%93%88
        print(str2)
        url = url.replace(name1, str2)
    print("getUrlCode =====" + url)
    return url
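
As an alternative to picking out the Chinese characters with a regular expression, the whole URL can be percent-encoded in a single parse.quote call. This is only a minimal sketch, not the code above: the helper name quoteUrl and the chosen safe set are assumptions, and it presumes that only the non-ASCII characters need escaping while the usual URL delimiters stay untouched.

from urllib import parse

# Hypothetical helper: quote() escapes everything outside the "safe" set, so the
# Chinese characters become %XX sequences while ':' '/' '?' '=' '&' etc. survive.
def quoteUrl(url):
    return parse.quote(url, safe=":/?#[]@!$&'()*+,;=%")

# Example:
# quoteUrl("https://www.aa223.com/yousheng/list-誘惑短篇小說.html")
# -> "https://www.aa223.com/yousheng/list-%E8%AA%98%E6%83%91%E7%9F%AD%E7%AF%87%E5%B0%8F%E8%AA%AA.html"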

 
