一、我們將51job作爲爬取目標:
1、網站界面:
2、分析網站結構:
二、設計思路:
通過檢查網頁我們發現每一個職位信息都包裹在一個class屬性爲el的一個div下,每一個具體的信息又在div的span標籤下,所以這裏我們可以選擇通過re 、BeautifulSoup或者lxml來進行對網頁的解析,在這裏我選擇的是lxml。請求網頁信息用的requests庫,因爲這個職位信息有上萬條,爲了節省時間我們採用多線程來爬取寫入數據。最後處理數據放入Excel中方便查看。
三、Demo:
import threading
from queue import Empty, Queue

import requests
from lxml import etree
HEADRES = {
'User-Agent':
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6788.400 QQBrowser/10.3.2864.400',
}
COUNT = 0
class Producters(threading.Thread):
    """Producer thread: takes page URLs from page_queue, scrapes each
    51job result page, and pushes one '$'-separated row per job posting
    onto content_queue for the consumer threads to write out."""

    def __init__(self, page_queue, content_queue, *args, **kwargs):
        super(Producters, self).__init__(*args, **kwargs)
        self.page_queue = page_queue        # URLs still to be scraped
        self.content_queue = content_queue  # parsed rows awaiting write

    def run(self):
        while True:
            try:
                # get_nowait() is atomic; the original empty()-then-get()
                # pair could block forever when another of the 10 producer
                # threads grabbed the last URL between the two calls.
                url = self.page_queue.get_nowait()
            except Empty:
                break
            self.parse_html(url)

    def getHtml(self, url):
        """Fetch a page and return its text, decoded with the apparent encoding."""
        # timeout so a stalled connection cannot hang this thread forever
        r = requests.get(url, headers=HEADRES, timeout=10)
        r.encoding = r.apparent_encoding
        return r.text

    def parse_html(self, url):
        """Extract job rows from one search-result page and enqueue them."""
        text = self.getHtml(url)
        html = etree.HTML(text)
        divs = html.xpath('//div[@class="el"]')
        # the first 4 class="el" divs are the table header, not job rows
        for div in divs[4:]:
            spans = div.xpath('.//span')
            try:
                jobname = spans[0].xpath('./a/text()')[0]
                companyname = spans[1].xpath('.//text()')[0]
                address = spans[2].xpath('.//text()')[0]
                salary = spans[3].xpath('.//text()')[0]
                time = spans[4].xpath('.//text()')[0]
            except IndexError:
                # row lacks one of the expected fields (e.g. ad/placeholder
                # row) — skip it; the original bare `except:` also hid real
                # bugs such as NameErrors
                continue
            print('companyname' + companyname)
            self.content_queue.put([jobname.strip() + '$', companyname + '$', address + '$', salary + '$', time + '\n'])
class Customer(threading.Thread):
    """Consumer thread: drains parsed rows from content_queue and appends
    them to java1job.txt, stopping once both queues are empty."""

    def __init__(self, page_queue, content_queue, *args, **kwargs):
        super(Customer, self).__init__(*args, **kwargs)
        # page_queue is only inspected to decide when the producers are done
        self.page_queue = page_queue
        self.content_queue = content_queue

    def run(self):
        global COUNT
        # Open the output file once; the original reopened it for every
        # single row, paying open/close cost per line.
        with open('java1job.txt', 'a+', encoding='utf-8') as f:
            while True:
                if self.content_queue.empty() and self.page_queue.empty():
                    break
                try:
                    # Bounded get: another consumer may win the race after
                    # the empty() check above, so never block indefinitely
                    # (the original unbounded get() could deadlock here).
                    row = self.content_queue.get(timeout=5)
                except Empty:
                    continue
                print(COUNT)
                # NOTE(review): COUNT is incremented by 10 threads without a
                # lock, so it is only an approximate progress counter.
                COUNT += 1
                f.writelines(row)
if __name__ == '__main__':
    page_queue = Queue(2000)       # search-result page URLs to scrape
    content_queue = Queue(100000)  # parsed rows waiting to be written

    # One URL per result page (pages 1-999).
    # NOTE: restored '&degreefrom=99' — the original text had the entity
    # mangled into '°reefrom=99' ('&deg' rendered as '°'), which would have
    # produced a malformed query string.
    base = ('https://search.51job.com/list/000000,000000,0000,00,9,99,java,2,{}.html'
            '?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99'
            '&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1'
            '&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line='
            '&specialarea=00&from=&welfare=')
    for page in range(1, 1000):
        page_queue.put(base.format(page))

    # 10 producers (scrapers) + 10 consumers (writers); keep references so
    # the main thread can join them instead of dropping the handles.
    workers = [Producters(page_queue, content_queue) for _ in range(10)]
    workers += [Customer(page_queue, content_queue) for _ in range(10)]
    for t in workers:
        t.start()
    for t in workers:
        t.join()