在 HTML 中,<a href='xx'> 表示超鏈接,所以要提取頁面中的 URL,就是提取其中的 'xx' 部分。
方法一:find_all
# Method 1: collect links with BeautifulSoup.find_all('a').
#
# Fetches one page of Baidu search results for a keyword, extracts every
# <a href=...> value, de-duplicates them, resolves relative links against
# the search domain, follows one level of redirect (via the Location
# header, with redirects disabled) and writes the resulting URLs to a
# report file.
import urllib
import requests
from urllib.parse import urlparse
from urllib import request, parse
from bs4 import BeautifulSoup

word = '周杰倫'
# `word` is the search keyword; `pn` is Baidu's pagination offset.
url = 'http://www.baidu.com.cn/s?wd=' + urllib.parse.quote(word) + '&pn=0'
print(url)

# Derive the domain from the URL, used later to absolutize relative links.
res = urlparse(url)
domain = res.netloc
print(domain)
print('- - ' * 30)

# BUG FIX: `headers` was used in the requests.get() calls below but was
# never defined, so the original script raised NameError at runtime.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

response = request.urlopen(url)
page = response.read()
soup = BeautifulSoup(page, 'lxml')
# tagh3 = soup.find_all('h3')  # would return only the result headings
tags = soup.find_all('a')  # every <a> tag on the page

# Tag.get('href') never raises; it returns None when the attribute is
# absent, so the original try/except/else was dead weight. Filtering
# None here also replaces the original remove-while-iterating loop,
# which could skip adjacent None entries.
hrefs = [tag.get('href') for tag in tags if tag.get('href') is not None]
# De-duplicate while preserving first-seen order.
hrefs = list(dict.fromkeys(hrefs))


def _final_url(link):
    """Return the redirect target of *link* (Location header), or *link*
    itself when the response carries no Location header."""
    # allow_redirects=False: we want the Location header, not the final body.
    resp = requests.get(url=link, headers=headers, allow_redirects=False)
    return resp.headers.get('Location', link)


# `with` guarantees the report file is closed even on error; the handle
# was previously named `all`, shadowing the builtin.
with open(r'F:\security\web\output\report\test.txt', 'w+') as report:
    for href in hrefs:
        href = href.strip()
        if href.startswith(('http://', 'https://')):
            real_url = _final_url(href)
            if real_url is href or real_url.startswith('http'):
                # No redirect -> write the link itself; otherwise write
                # the absolute redirect target (non-http targets are
                # skipped, as in the original).
                report.write((href if real_url is href else real_url) + '\n')
        else:
            # Relative link: absolutize against the search domain.
            parts = href.split(domain)
            if len(parts) == 1:
                if href.startswith('/'):
                    href = 'http://' + domain + href
                else:
                    href = 'http://' + domain + '/' + href
            else:
                href = 'http://' + domain + parts[1]
            real_url = _final_url(href)
            if real_url is href:
                report.write(href + '\n')
            elif real_url.startswith('http'):
                report.write(real_url + '\n')
方法二:select
# Method 2: collect links with a CSS selector, soup.select('a[href]').
#
# Same pipeline as method 1: fetch a Baidu result page, extract hrefs,
# de-duplicate, absolutize relative links, resolve one redirect level
# via the Location header, and write the URLs to a file.
import urllib
import requests
from urllib.parse import urlparse
from urllib import request
from bs4 import BeautifulSoup

word = '周杰倫'
# `word` is the search keyword; `pn` is Baidu's pagination offset.
url = 'http://www.baidu.com/s?wd=' + urllib.parse.quote(word) + '&pn=0'
print(url)

res = urlparse(url)
domain = res.netloc
print(domain)

# BUG FIX: the requests.get() calls below used `param.headers`, but no
# `param` was ever defined (NameError). Define the headers locally.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

page = request.urlopen(url).read()
soup = BeautifulSoup(page, 'lxml')
# soup.select('h3 > a[href]') would restrict to result-heading links;
# sibling selectors need no space, descendant selectors do.
tags = soup.select('a[href]')  # only <a> tags that have an href

hrefs = [tag.get('href') for tag in tags]
# BUG FIX: the original assigned the de-duplicated list to the typo
# `herfs`, so duplicates were never actually removed. dict.fromkeys
# de-duplicates while preserving first-seen order.
hrefs = list(dict.fromkeys(hrefs))

# `with` guarantees the output file is closed even on error.
with open(r'F:\..\..\demo2.txt', 'w+') as fw2:
    for href in hrefs:
        href = href.strip()
        if not href.startswith(('http://', 'https://')):
            # Relative link: absolutize against the search domain.
            parts = href.split(domain)
            if len(parts) == 1:
                if href.startswith('/'):
                    href = 'http://' + domain + href
                else:
                    href = 'http://' + domain + '/' + href
            else:
                href = 'http://' + domain + parts[1]
        # allow_redirects=False: we want the Location header, not the
        # final body of the redirect chain.
        response_url = requests.get(url=href, headers=headers,
                                    allow_redirects=False)
        real_url = response_url.headers.get('Location')
        if real_url is None:
            # No redirect: the link itself is the real address.
            fw2.write(href + '\n')
        elif real_url.startswith('http'):
            fw2.write(real_url + '\n')