主要邏輯
1. 打開首頁
2. 搜索關鍵字,進入第一頁
   2.1 獲取詳情頁url
   2.2 進入詳情頁抓取數據
3. 翻頁,重複第二步
詳細代碼
import time

from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
class ZhiPin:
    """Selenium-driven scraper for job listings on zhipin.com.

    Opens the site's home page, submits a keyword search, then walks the
    result pages and extracts one dict per job listing from each page.
    """

    def __init__(self):
        """Store the start URL and launch a Chrome WebDriver session."""
        self.start_url = 'https://www.zhipin.com/'
        self.driver = webdriver.Chrome()

    def search(self, kw):
        """Type *kw* into the search box, submit it, then click the '全國' link.

        Any exception raised by the driver's element lookups propagates to
        the caller.
        """
        # find_element(By.…) replaces the find_element_by_* helpers, which
        # were removed in Selenium 4.3; this form also works on Selenium 3.
        element = self.driver.find_element(By.CLASS_NAME, 'ipt-search')
        element.send_keys(kw)
        element.send_keys(Keys.RETURN)
        self.driver.find_element(By.LINK_TEXT, '全國').click()

    def next_page(self):
        """Return the first element with class ``next``, or ``None`` if absent."""
        candidates = self.driver.find_elements(By.CLASS_NAME, 'next')
        # NOTE(review): if the last result page still renders a (disabled)
        # element with class 'next', this never returns None and run() will
        # not terminate -- confirm against the live page.
        return candidates[0] if candidates else None

    def parse_page(self):
        """Yield one dict per job listing found in the current page source."""
        soup = bs(self.driver.page_source, 'lxml')
        for content in soup.select('#main > div > div.job-list > ul > li'):
            yield self._parse_item(content)

    @staticmethod
    def _parse_item(content):
        """Extract the listing fields from one ``<li>`` node of the job list.

        Returns a dict with the keys ``title``, ``salary``, ``location``,
        ``company``, ``company_condition``, ``hr`` and ``date``. A field whose
        node is missing is ``None`` instead of raising, so a single malformed
        list item does not abort the whole crawl.
        """
        # Query the <p> children once; three fields are read from them.
        paragraphs = content.select('p')

        def text_of(node):
            return node.get_text() if node is not None else None

        def paragraph(index):
            if index < len(paragraphs):
                return paragraphs[index].get_text()
            return None

        return {
            'title': text_of(content.select_one('.job-title')),
            'salary': text_of(content.select_one('.red')),
            'location': paragraph(0),
            'company': text_of(
                content.select_one('div.info-company > div > h3 > a')
            ),
            'company_condition': paragraph(1),
            'hr': text_of(content.select_one('div.info-publis > h3')),
            'date': paragraph(2),
        }

    def run(self, kw):
        """Search for *kw* and print every listing on every result page.

        Loads the start URL, performs the search, then repeatedly parses the
        current page and clicks the ``next`` element until none is found.
        """
        self.driver.get(self.start_url)
        self.search(kw)
        while True:
            # Fixed pause, presumably to let the result list finish loading
            # before the page source is read.
            time.sleep(2)
            for result in self.parse_page():
                print(result)
            next_page = self.next_page()
            if next_page is None:
                break
            next_page.click()
if __name__ == '__main__':
    # Script entry point: crawl listings for the keyword '數據分析'.
    boss = ZhiPin()
    try:
        boss.run('數據分析')
    finally:
        # Always shut the browser session down, even if the crawl raises,
        # so no Chrome/chromedriver process is left behind.
        boss.driver.quit()