#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Time : 2019/10/23
# @Author : zhang
import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup
import os
def parameters(pathsave, last, path):
    """Download acquisition-parameter files (.mat and .txt) linked from one page.

    Args:
        pathsave: local directory prefix under which files are saved.
        last: relative sub-path (page name) appended to both the URL and the save path.
        path: base URL of the listing page.
    """
    html = urlopen(path + last)              # fetch the listing page
    bs = BeautifulSoup(html, 'html.parser')  # parse the HTML
    savedir = pathsave + last
    # exist_ok avoids the check-then-create race of the original exists()/makedirs pair.
    os.makedirs(savedir, exist_ok=True)
    for anchor in bs.find_all('a'):          # every hyperlink on the page
        href = anchor.get('href')
        # <a> tags without an href yield None; 'x' in None raises TypeError.
        if not href:
            continue
        # The original duplicated identical .mat and .txt branches; fold them.
        if '.mat' in href or '.txt' in href:
            resp = requests.get(path + last + href)
            with open(savedir + href, "wb") as f:
                f.write(resp.content)
def RawData(pathsave, last, path):
    """Download raw .nii files from every 'NIF' subdirectory of a listing page.

    Args:
        pathsave: local directory prefix under which files are saved.
        last: relative sub-path appended to both the URL and the save path.
        path: base URL of the listing page.
    """
    html = urlopen(path + last)              # fetch the listing page
    bs = BeautifulSoup(html, 'html.parser')  # parse the HTML
    nif_dirs = []
    for anchor in bs.find_all('a'):
        href = anchor.get('href')
        # anchors without an href yield None; guard before the substring test.
        if href and 'NIF' in href:
            nif_dirs.append(href)
    for subdir in nif_dirs:
        sub_html = urlopen(path + last + subdir)
        sub_bs = BeautifulSoup(sub_html, 'html.parser')
        # BUG FIX: the .nii list must be rebuilt for each subdirectory. The
        # original initialized it once before this loop, so files found in
        # earlier subdirectories were re-requested under every later
        # subdirectory's (wrong) URL.
        nii_files = []
        for anchor in sub_bs.find_all('a'):
            href = anchor.get('href')
            if href and '.nii' in href:
                nii_files.append(href)
        savedir = pathsave + last + subdir
        os.makedirs(savedir, exist_ok=True)  # idempotent, race-free mkdir
        for name in nii_files:
            resp = requests.get(path + last + subdir + name)
            with open(savedir + name, "wb") as f:
                f.write(resp.content)
def Mask(pathsave, last, path):
    """Download every .zip archive (mask data) linked from one listing page.

    Args:
        pathsave: local directory prefix under which files are saved.
        last: relative sub-path appended to both the URL and the save path.
        path: base URL of the listing page.
    """
    html = urlopen(path + last)              # fetch the listing page
    bs = BeautifulSoup(html, 'html.parser')  # parse the HTML
    for anchor in bs.find_all('a'):
        href = anchor.get('href')
        # anchors without an href yield None; '.zip' in None raises TypeError.
        if not href or '.zip' not in href:
            continue
        savedir = pathsave + last
        os.makedirs(savedir, exist_ok=True)  # idempotent, race-free mkdir
        resp = requests.get(path + last + href)
        with open(savedir + href, "wb") as f:
            f.write(resp.content)
def tensors(pathsave, last, path):
    """Download every .zip archive (tensor data) linked from one listing page.

    NOTE: byte-for-byte duplicate of Mask() in the original; kept as a
    separate function so existing callers of either name keep working.

    Args:
        pathsave: local directory prefix under which files are saved.
        last: relative sub-path appended to both the URL and the save path.
        path: base URL of the listing page.
    """
    html = urlopen(path + last)              # fetch the listing page
    bs = BeautifulSoup(html, 'html.parser')  # parse the HTML
    for anchor in bs.find_all('a'):
        href = anchor.get('href')
        # anchors without an href yield None; '.zip' in None raises TypeError.
        if not href or '.zip' not in href:
            continue
        savedir = pathsave + last
        os.makedirs(savedir, exist_ok=True)  # idempotent, race-free mkdir
        resp = requests.get(path + last + href)
        with open(savedir + href, "wb") as f:
            f.write(resp.content)
# --- Driver: crawl the CVRG canine-DTI index and download each dataset. ---
# Explicit submodule import; bare `import urllib` only worked because the
# earlier `from urllib.request import urlopen` had already loaded the submodule.
import urllib.request

# Install a browser-like User-Agent so urlopen() requests are not rejected.
headers = ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)

BASE_URL = 'http://cvrgrid.org/datasets/exVivo/canine/normal/'

# Collect the relative links of all DTI dataset directories on the index page.
dti_dirs = []
index_page = BeautifulSoup(urlopen(BASE_URL), 'html.parser')
for anchor in index_page.find_all('a'):
    href = anchor.get('href')
    # anchors without an href yield None; guard before the substring test.
    if href and 'DTI' in href:
        dti_dirs.append(href)

for dataset in dti_dirs:
    page_url = BASE_URL + dataset
    save_dir = 'data1/' + dataset
    os.makedirs(save_dir, exist_ok=True)  # idempotent, race-free mkdir
    page = BeautifulSoup(urlopen(page_url), 'html.parser')
    for anchor in page.find_all('a'):
        href = anchor.get('href')
        if not href:
            continue
        if 'Raw' in href:
            RawData(save_dir, href, page_url)
        if 'matlab' in href:
            Mask(save_dir, href, page_url)
        if 'DTIacq' in href:
            parameters(save_dir, href, page_url)
        # TODO(review): the original left a 'Tensor' branch commented out;
        # tensors() exists above but is never called.
print("download successful")
# Web crawler for downloading the CVRG cardiovascular (canine DTI) dataset.
# (Trailing blog comment-board text from the original paste removed — it was
# not Python and made the file unparseable.)