第一次寫爬蟲,紀念一下。

對下面租房網址進行爬蟲

http://bj.xiaozhu.com/%E5%9C%B0%E9%93%81%E4%B9%9D%E5%8F%B7%E7%BA%BF_lysys-duanzufang-p2-20/?putkey=%E5%9C%B0%E9%93%81%E4%B9%9D%E5%8F%B7%E7%BA%BF


首先將兩頁的所有房子鏈接取出來:

page = []


def get_page_link(page_num):
    """Scrape result pages 1 .. page_num-1 and append each listing's detail URL to `page`.

    Note: range(1, page_num) excludes page_num itself, so get_page_link(3)
    fetches pages 1 and 2 (the two pages mentioned in the article).
    """
    for i in range(1, page_num):
        url = 'http://bj.xiaozhu.com/%E5%9C%B0%E9%93%81%E4%B9%9D%E5%8F%B7%E7%BA%BF_lysys-duanzufang-p{}-20/?putkey=%E5%9C%B0%E9%93%81%E4%B9%9D%E5%8F%B7%E7%BA%BF'.format(str(i))
        wb_data = requests.get(url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        # Each result item exposes its detail-page URL via a 'detailurl' attribute.
        all_link = soup.select('ul > li > div.result_btm_con.lodgeunitname')
        for link in all_link:
            # print(link.get('detailurl'))
            # print(type(link.get('detailurl')))
            page.append(str(link.get('detailurl')))
        # Visual separator between pages in console output.
        print('------------------')


get_page_link(3)


然後對於每一個鏈接進行獲取裏面房子信息
def get_information(url):
    """Fetch one listing detail page and print title, address, daily price, host name and gender.

    Args:
        url: detail-page URL of a single listing (collected by get_page_link).
    """
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    title = soup.select('div > div.con_l > div.pho_info > h4')[0].get_text()
    address = soup.select('div > div.con_l > div.pho_info > p > span')[0].get_text()
    day_price = soup.select('div.day_l > span')[0].get_text()
    # NOTE(review): 'imag' is extracted but never used below.
    imag = soup.select('#curBigImage')[0].get('src')
    host_name = soup.select('div > div.w_240 > h6 > a')[0].get_text()
    # The avatar div's first CSS class encodes the host's gender (see print_gender).
    host_gender = soup.select('div.member_pic > div')[0].get('class')[0]

    print(title, ' ', address, ' ', day_price, ' ', host_name, ' ', print_gender(host_gender))


全部代碼


from bs4 import BeautifulSoup
import  requests
import time


page = []


def print_gender(gender_class):
    """Map the avatar div's CSS class to a human-readable gender string.

    Args:
        gender_class: the first CSS class of the member avatar div
            (e.g. 'member_ico1').

    Returns:
        '女' (female) when the class is 'member_ico1', otherwise '男' (male).
    """
    # Renamed the parameter: the original name shadowed the builtin `str`.
    # All call sites in this file pass the argument positionally.
    if gender_class == 'member_ico1':
        return '女'
    else:
        return '男'


def get_information(url):
    """Fetch one listing detail page and print title, address, daily price, host name and gender.

    Args:
        url: detail-page URL of a single listing (collected by get_page_link).
    """
    # Timeout prevents the crawler from hanging forever on a stalled connection.
    wb_data = requests.get(url, timeout=10)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    title = soup.select('div > div.con_l > div.pho_info > h4')[0].get_text()
    address = soup.select('div > div.con_l > div.pho_info > p > span')[0].get_text()
    day_price = soup.select('div.day_l > span')[0].get_text()
    host_name = soup.select('div > div.w_240 > h6 > a')[0].get_text()
    # The avatar div's first CSS class encodes the host's gender (see print_gender).
    host_gender = soup.select('div.member_pic > div')[0].get('class')[0]

    # Removed the unused 'imag' extraction (#curBigImage src) present in an
    # earlier draft — it was assigned but never read.
    print(title, ' ', address, ' ', day_price, ' ', host_name, ' ', print_gender(host_gender))


def get_page_link(page_num):
    """Scrape result pages 1 .. page_num-1 and append each listing's detail URL to `page`.

    Note: range(1, page_num) excludes page_num itself, so get_page_link(3)
    fetches pages 1 and 2 (the two pages mentioned in the article).

    Args:
        page_num: one past the last result-page number to fetch.
    """
    for i in range(1, page_num):
        # str.format converts the int itself; the redundant str(i) was removed.
        url = 'http://bj.xiaozhu.com/%E5%9C%B0%E9%93%81%E4%B9%9D%E5%8F%B7%E7%BA%BF_lysys-duanzufang-p{}-20/?putkey=%E5%9C%B0%E9%93%81%E4%B9%9D%E5%8F%B7%E7%BA%BF'.format(i)
        # Timeout prevents the crawler from hanging forever on a stalled connection.
        wb_data = requests.get(url, timeout=10)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        # Each result item exposes its detail-page URL via a 'detailurl' attribute.
        all_link = soup.select('ul > li > div.result_btm_con.lodgeunitname')
        for link in all_link:
            page.append(str(link.get('detailurl')))
        # Visual separator between pages in console output.
        print('------------------')


def main():
    """Entry point: collect all listing links, then scrape each detail page."""
    get_page_link(3)
    for detail_url in page:
        get_information(detail_url)
        # Be polite to the server: pause between detail-page requests.
        time.sleep(2)


if __name__ == '__main__':
    main()


運行可得到如下信息




發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章