Web Crawler: Scraping Cardiovascular Data

The script below crawls the dataset index at http://cvrgrid.org/datasets/exVivo/canine/normal/, follows each DTI dataset link, and mirrors the raw NIfTI volumes, the matlab mask archives, and the acquisition parameter files into a local data1/ directory.

#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Time    : 2019/10/23
# @Author  : zhang

import os
import urllib.request
from urllib.request import urlopen

import requests
from bs4 import BeautifulSoup
def parameters(pathsave, last, path):    # fetch the acquisition parameter files
    html = urlopen(path + last)  # fetch the directory page
    bs = BeautifulSoup(html, 'html.parser')  # parse the page
    hyperlink = bs.find_all('a')  # collect all hyperlinks
    pathsavenew = pathsave + last
    if not os.path.exists(pathsavenew):
        os.makedirs(pathsavenew)
    for h1 in hyperlink:
        hh = h1.get('href')
        if hh and ('.mat' in hh or '.txt' in hh):  # parameter files come as .mat or .txt
            resp = requests.get(path + last + hh)
            with open(pathsavenew + hh, "wb") as f:
                f.write(resp.content)

def RawData(pathsave, last, path):      # fetch the raw NIfTI data
    html = urlopen(path + last)  # fetch the directory page
    bs = BeautifulSoup(html, 'html.parser')  # parse the page
    hyperlink = bs.find_all('a')  # collect all hyperlinks
    nif_dirs = []
    for h3 in hyperlink:
        hh = h3.get('href')
        if hh and 'NIF' in hh:  # NIFTI subdirectories
            nif_dirs.append(hh)
    for z in nif_dirs:
        html = urlopen(path + last + z)  # fetch each NIFTI subdirectory page
        bs = BeautifulSoup(html, 'html.parser')  # parse the page
        pathsavenew = pathsave + last + z  # mirror this subdirectory locally
        if not os.path.exists(pathsavenew):
            os.makedirs(pathsavenew)
        for h4 in bs.find_all('a'):
            hh = h4.get('href')
            if hh and '.nii' in hh:
                resp = requests.get(path + last + z + hh)
                with open(pathsavenew + hh, "wb") as f:
                    f.write(resp.content)
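These functions build URLs and save paths by plain string concatenation (path + last + hh), which only holds together while every href is relative and every directory link ends with '/'. A minimal sketch of a more defensive join, under the same relative-href assumption (the resolve helper is hypothetical, not part of the original script):

import os
from urllib.parse import urljoin

def resolve(base_url, href, local_root):
    # urljoin resolves the href against the page URL the way a browser would,
    # which is what the concatenation above is trying to approximate;
    # os.path.join does the corresponding work for the on-disk path.
    full_url = urljoin(base_url, href)
    local_path = os.path.join(local_root, href.lstrip('/'))
    return full_url, local_path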

def Mask(pathsave, last, path):          # fetch the matlab mask archives
    html = urlopen(path + last)  # fetch the directory page
    bs = BeautifulSoup(html, 'html.parser')  # parse the page
    hyperlink = bs.find_all('a')  # collect all hyperlinks
    for h3 in hyperlink:
        hh = h3.get('href')
        if hh and '.zip' in hh:
            pathsavenew = pathsave + last
            if not os.path.exists(pathsavenew):
                os.makedirs(pathsavenew)
            resp = requests.get(path + last + hh)
            with open(pathsavenew + hh, "wb") as f:
                f.write(resp.content)

def tensors(pathsave, last, path):       # fetch the tensor archives (same pattern as Mask)
    html = urlopen(path + last)  # fetch the directory page
    bs = BeautifulSoup(html, 'html.parser')  # parse the page
    hyperlink = bs.find_all('a')  # collect all hyperlinks
    for h3 in hyperlink:
        hh = h3.get('href')
        if hh and '.zip' in hh:
            pathsavenew = pathsave + last
            if not os.path.exists(pathsavenew):
                os.makedirs(pathsavenew)
            resp = requests.get(path + last + hh)
            with open(pathsavenew + hh, "wb") as f:
                f.write(resp.content)
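parameters, Mask, and tensors all follow the same pattern: list the hyperlinks on one directory page and save every file whose name contains a given extension. A possible consolidation, sketched under the same assumptions as the functions above (download_by_ext is our name, not from the original):

def download_by_ext(pathsave, last, path, extensions):
    # Generic form of parameters/Mask/tensors: fetch one directory page,
    # save every linked file matching one of the given extensions.
    bs = BeautifulSoup(urlopen(path + last), 'html.parser')
    pathsavenew = pathsave + last
    if not os.path.exists(pathsavenew):
        os.makedirs(pathsavenew)
    for a in bs.find_all('a'):
        hh = a.get('href')
        if hh and any(ext in hh for ext in extensions):
            resp = requests.get(path + last + hh)
            with open(pathsavenew + hh, "wb") as f:
                f.write(resp.content)

# e.g. download_by_ext(pathsave, hh, path1, ('.mat', '.txt'))  # same effect as parameters()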

# Install a browser User-Agent for urlopen so the site does not reject the requests.
headers = ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)
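Note that install_opener only affects urlopen; the requests.get calls in the functions above still send requests' default User-Agent. If the site filters on it, one way to align the two (the Session object is an assumption, not in the original script):

session = requests.Session()
session.headers.update({'User-Agent': headers[1]})  # reuse the UA string defined above
# then call session.get(...) wherever the functions call requests.get(...)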


dic = []
html = urlopen('http://cvrgrid.org/datasets/exVivo/canine/normal/')  # fetch the index page
bs = BeautifulSoup(html, 'html.parser')  # parse the page
hyperlink = bs.find_all('a')  # collect all hyperlinks
for h in hyperlink:
    hh = h.get('href')
    if hh and 'DTI' in hh:  # each DTI link is one dataset
        dic.append(hh)

for i in dic:
    path1 = 'http://cvrgrid.org/datasets/exVivo/canine/normal/' + i
    pathsave = 'data1/' + i
    if not os.path.exists(pathsave):
        os.makedirs(pathsave)
    html = urlopen(path1)  # fetch the dataset page
    bs = BeautifulSoup(html, 'html.parser')  # parse the page
    hyperlink = bs.find_all('a')  # collect all hyperlinks
    for h in hyperlink:
        hh = h.get('href')
        if hh is None:
            continue
        if 'Raw' in hh:
            RawData(pathsave, hh, path1)
        if 'matlab' in hh:
            Mask(pathsave, hh, path1)
        if 'DTIacq' in hh:
            parameters(pathsave, hh, path1)
        #if 'Tensor' in hh:
            #tensors(pathsave, hh, path1)
    print(i, "download successful")




