在 HTML 中,<a href='xx'> 表示超鏈接,所以要提取頁面中的 URL,就是提取其中的 'xx' 部分。
方法一:find_all
# Method 1: collect links with BeautifulSoup.find_all('a').
#
# Fetches one page of Baidu search results for a keyword, extracts every
# <a href=...> value, de-duplicates them, resolves relative links against
# the search domain, follows one level of redirect (via the Location
# header, with redirects disabled) and writes the resulting URLs to a
# report file.
import urllib
import requests
from urllib.parse import urlparse
from urllib import request, parse
from bs4 import BeautifulSoup

word = '周杰倫'
# `word` is the search keyword; `pn` is Baidu's pagination offset.
url = 'http://www.baidu.com.cn/s?wd=' + urllib.parse.quote(word) + '&pn=0'
print(url)

# Derive the domain from the URL, used later to absolutize relative links.
res = urlparse(url)
domain = res.netloc
print(domain)
print('- - ' * 30)

# BUG FIX: `headers` was used in the requests.get() calls below but was
# never defined, so the original script raised NameError at runtime.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

response = request.urlopen(url)
page = response.read()
soup = BeautifulSoup(page, 'lxml')
# tagh3 = soup.find_all('h3')  # would return only the result headings
tags = soup.find_all('a')  # every <a> tag on the page

# Tag.get('href') never raises; it returns None when the attribute is
# absent, so the original try/except/else was dead weight. Filtering
# None here also replaces the original remove-while-iterating loop,
# which could skip adjacent None entries.
hrefs = [tag.get('href') for tag in tags if tag.get('href') is not None]
# De-duplicate while preserving first-seen order.
hrefs = list(dict.fromkeys(hrefs))


def _final_url(link):
    """Return the redirect target of *link* (Location header), or *link*
    itself when the response carries no Location header."""
    # allow_redirects=False: we want the Location header, not the final body.
    resp = requests.get(url=link, headers=headers, allow_redirects=False)
    return resp.headers.get('Location', link)


# `with` guarantees the report file is closed even on error; the handle
# was previously named `all`, shadowing the builtin.
with open(r'F:\security\web\output\report\test.txt', 'w+') as report:
    for href in hrefs:
        href = href.strip()
        if href.startswith(('http://', 'https://')):
            real_url = _final_url(href)
            if real_url is href or real_url.startswith('http'):
                # No redirect -> write the link itself; otherwise write
                # the absolute redirect target (non-http targets are
                # skipped, as in the original).
                report.write((href if real_url is href else real_url) + '\n')
        else:
            # Relative link: absolutize against the search domain.
            parts = href.split(domain)
            if len(parts) == 1:
                if href.startswith('/'):
                    href = 'http://' + domain + href
                else:
                    href = 'http://' + domain + '/' + href
            else:
                href = 'http://' + domain + parts[1]
            real_url = _final_url(href)
            if real_url is href:
                report.write(href + '\n')
            elif real_url.startswith('http'):
                report.write(real_url + '\n')
方法二:select
# Method 2: collect links with a CSS selector, soup.select('a[href]').
#
# Same pipeline as method 1: fetch a Baidu result page, extract hrefs,
# de-duplicate, absolutize relative links, resolve one redirect level
# via the Location header, and write the URLs to a file.
import urllib
import requests
from urllib.parse import urlparse
from urllib import request
from bs4 import BeautifulSoup

word = '周杰倫'
# `word` is the search keyword; `pn` is Baidu's pagination offset.
url = 'http://www.baidu.com/s?wd=' + urllib.parse.quote(word) + '&pn=0'
print(url)

res = urlparse(url)
domain = res.netloc
print(domain)

# BUG FIX: the requests.get() calls below used `param.headers`, but no
# `param` was ever defined (NameError). Define the headers locally.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

page = request.urlopen(url).read()
soup = BeautifulSoup(page, 'lxml')
# soup.select('h3 > a[href]') would restrict to result-heading links;
# sibling selectors need no space, descendant selectors do.
tags = soup.select('a[href]')  # only <a> tags that have an href

hrefs = [tag.get('href') for tag in tags]
# BUG FIX: the original assigned the de-duplicated list to the typo
# `herfs`, so duplicates were never actually removed. dict.fromkeys
# de-duplicates while preserving first-seen order.
hrefs = list(dict.fromkeys(hrefs))

# `with` guarantees the output file is closed even on error.
with open(r'F:\..\..\demo2.txt', 'w+') as fw2:
    for href in hrefs:
        href = href.strip()
        if not href.startswith(('http://', 'https://')):
            # Relative link: absolutize against the search domain.
            parts = href.split(domain)
            if len(parts) == 1:
                if href.startswith('/'):
                    href = 'http://' + domain + href
                else:
                    href = 'http://' + domain + '/' + href
            else:
                href = 'http://' + domain + parts[1]
        # allow_redirects=False: we want the Location header, not the
        # final body of the redirect chain.
        response_url = requests.get(url=href, headers=headers,
                                    allow_redirects=False)
        real_url = response_url.headers.get('Location')
        if real_url is None:
            # No redirect: the link itself is the real address.
            fw2.write(href + '\n')
        elif real_url.startswith('http'):
            fw2.write(real_url + '\n')