Python 爬蟲之六 —— Selenium 和 BOSS 直聘

主要邏輯

  1. 打開首頁
  2. 搜索關鍵字,進入第一頁
    2.1 獲取詳情頁url
    2.2 進入詳情頁抓取數據
  3. 翻頁,重複第二步

詳細代碼

import time

from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys


class ZhiPin:
    """Selenium-driven scraper for job listings on BOSS Zhipin.

    Workflow (see ``run``): open the home page, submit a keyword search,
    then parse every result page, following the "next" link until no
    further page is available.
    """

    def __init__(self):
        """Store the start URL and launch a Chrome WebDriver session."""
        self.start_url = 'https://www.zhipin.com/'
        self.driver = webdriver.Chrome()

    def close(self):
        """Quit the browser and end the WebDriver session."""
        self.driver.quit()

    def search(self, kw, city='全國'):
        """Type ``kw`` into the search box, submit it, and click a city link.

        Args:
            kw: Keyword to search for.
            city: Exact link text of the city filter to click after the
                search results load.

        Raises:
            selenium.common.exceptions.NoSuchElementException: If the search
                box or the city link cannot be found on the page.
        """
        box = self.driver.find_element(By.CLASS_NAME, 'ipt-search')
        box.send_keys(kw)
        box.send_keys(Keys.RETURN)
        # NOTE(review): the link text must match what the site renders
        # character-for-character; the site may render this label in
        # Simplified Chinese -- confirm the default against the live page.
        self.driver.find_element(By.LINK_TEXT, city).click()

    def next_page(self):
        """Return the clickable "next page" element, or ``None`` if absent.

        Returns:
            The first element with class ``next``, or ``None`` when no such
            element exists or when it also carries a ``disabled`` class.
        """
        candidates = self.driver.find_elements(By.CLASS_NAME, 'next')
        if not candidates:
            return None
        link = candidates[0]
        # Presumably the site keeps the link on the last page but marks it
        # as disabled (TODO confirm); without this check ``run`` would keep
        # clicking it forever.
        if self._is_disabled(link):
            return None
        return link

    def parse_page(self):
        """Yield one dict per job entry found in the current page source.

        Each dict has the keys ``title``, ``salary``, ``location``,
        ``company``, ``company_condition``, ``hr`` and ``date``. A field
        whose node is missing from the entry is reported as ``None`` so
        that one irregular list item does not abort the whole scrape.
        """
        soup = bs(self.driver.page_source, 'lxml')
        for item in soup.select('#main > div > div.job-list > ul > li'):
            # Query the <p> tags once; three fields are read by position.
            paragraphs = item.select('p')
            yield {
                'title': self._text(item.select_one('.job-title')),
                'salary': self._text(item.select_one('.red')),
                'location': self._nth_text(paragraphs, 0),
                'company': self._text(
                    item.select_one('div.info-company > div > h3 > a')),
                'company_condition': self._nth_text(paragraphs, 1),
                'hr': self._text(item.select_one('div.info-publis > h3')),
                'date': self._nth_text(paragraphs, 2),
            }

    # Optional: visit every detail page linked from the current result list.
    # Kept disabled, as in the original; uncomment together with the
    # ``self.detail_page()`` call in ``run`` to enable it.
    # def detail_page(self):
    #     handle = self.driver.current_window_handle
    #     details = self.driver.find_elements(
    #         By.CSS_SELECTOR,
    #         'div.job-list > ul > li > div > div.info-primary > h3.name > a')
    #     for detail in details:
    #         detail.click()
    #         for newhandle in self.driver.window_handles:
    #             if newhandle != handle:
    #                 self.driver.switch_to.window(newhandle)
    #                 time.sleep(2)
    #                 self.driver.close()
    #                 self.driver.switch_to.window(handle)
    #         time.sleep(2)

    def run(self, kw):
        """Search for ``kw`` and print every job entry on every result page.

        Args:
            kw: Keyword to search for.
        """
        # 1. Open the home page.
        self.driver.get(self.start_url)
        # 2. Search for the keyword, which lands on the first result page.
        self.search(kw)
        while True:
            # Fixed pause to give the page time to render before parsing.
            time.sleep(2)
            for result in self.parse_page():
                print(result)
            # self.detail_page()
            # 3. Move to the next page and repeat step 2.
            next_page = self.next_page()
            if next_page is None:
                break
            next_page.click()

    @staticmethod
    def _text(node):
        """Return ``node.get_text()``, or ``None`` when ``node`` is ``None``."""
        return None if node is None else node.get_text()

    @staticmethod
    def _nth_text(nodes, index):
        """Return the text of ``nodes[index]``, or ``None`` if out of range."""
        if index >= len(nodes):
            return None
        return nodes[index].get_text()

    @staticmethod
    def _is_disabled(element):
        """Return True if the element's ``class`` attribute lists ``disabled``."""
        classes = (element.get_attribute('class') or '').split()
        return 'disabled' in classes


if __name__ == '__main__':
    # Script entry point: scrape listings for one hard-coded keyword.
    boss = ZhiPin()
    try:
        boss.run('數據分析')
    finally:
        # Always shut the browser down, even if scraping fails part-way,
        # so no Chrome/driver process is left running.
        boss.driver.quit()
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章