第一次嘗試 Spider，是個突破
#encoding=utf-8
import requests
import re
class MaoYantop:
    """Scraper for the Maoyan board pages at https://maoyan.com/board/4.

    Builds the paginated board URLs, downloads each page, and prints every
    entry matched by ``MOVIE_PATTERN`` prefixed with a running rank.
    """

    # Number of pages requested and the offset step between consecutive pages.
    PAGE_COUNT = 10
    PAGE_SIZE = 10

    # Seconds to wait on the server; without a timeout a stalled connection
    # would block the scraper indefinitely.
    REQUEST_TIMEOUT = 10

    # Capture groups, in order: the title attribute, the cast line (up to the
    # end of its line), the release-time text (up to the closing </p>), the
    # digit (plus optional dot) inside <i class="integer">, and the digit
    # directly before the closing </i></p>.
    # Compiled once here instead of being rebuilt for every page.
    # NOTE(review): the literal 上映時間 is spelled with traditional
    # characters; confirm the site serves that exact spelling, otherwise no
    # entry will ever match.
    MOVIE_PATTERN = re.compile(
        r'title="(.*?)".*?(主演.*?)\n.*?(上映時間.*?)</p>.*?<i class="integer">(\d\.?).*?(\d)</i></p>',
        re.S,
    )

    def __init__(self):
        """Set the URL template, request headers, and the rank counter."""
        self.url = "https://maoyan.com/board/4?offset={}"
        # Sent with every request; presumably so the site treats the request
        # like one coming from a regular browser.
        self.headers = {"user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36"}
        self.rank = 0

    def get_url_list(self):
        """Return the board page URLs, one per offset 0, 10, ..., 90."""
        return [
            self.url.format(page * self.PAGE_SIZE)
            for page in range(self.PAGE_COUNT)
        ]

    def parse_url(self, url):
        """Download ``url`` and return the response body decoded as UTF-8.

        Exceptions raised by ``requests`` (connection errors, or a timeout
        after ``REQUEST_TIMEOUT`` seconds) propagate to the caller.
        """
        response = requests.get(
            url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        return response.content.decode("utf-8")

    def _extract_movies(self, html_str):
        """Return one tuple of captured groups per ``MOVIE_PATTERN`` match."""
        return self.MOVIE_PATTERN.findall(html_str)

    def run(self):
        """Fetch every board page and print each matched entry with its rank."""
        # Restart the counter so a repeated run() numbers entries from 1 again
        # instead of continuing where the previous run stopped.
        self.rank = 0
        # 1. Generate the URL list.
        url_list = self.get_url_list()
        # 2. Send the requests.
        for url in url_list:
            html_str = self.parse_url(url)
            # 3. Pull the wanted fields out of the page with the regex.
            for movie in self._extract_movies(html_str):
                self.rank += 1  # rank
                print(self.rank, movie)
if __name__ == "__main__":
    # Script entry point: build the scraper and run it once.
    MaoYantop().run()
部分內容