Intro
一直想自己動手用框架搭起來一個搜索引擎,但是也一直不知道從哪裏開始下手比較好。
最近一直在網易雲音樂上聽歌,決定從網易雲上把評論全部爬下來,用評論做一個垂直搜索
Path
說幹就開始吧 首先第一步得先把網易雲上的評論爬下來吧,沒有評論資源怎麼開始幹活啊!
就拿我最喜歡的歌手 Eason 陳奕迅 做例子吧
首先打開網頁版的網易雲音樂 搜索 十年 這首歌~
十年-Eason
接下來打開我們瀏覽器的開發者工具
仔細找找以後果然發現了這個API
https://music.163.com/weapi/v1/resource/comments/R_SO_4_66842?csrf_token=
還是一個挺奇怪的API哈,我們應該能看出來 R_SO_4_66842 是這首歌的ID,翻看了一下其他歌曲,果然 都是通過這個API去獲取評論的
後面的csrf_token應該是防止跨域訪問攻擊的,不在我們討論的範圍
仔細看了一下這個API,需要哪些參數
需要兩個什麼鬼的參數
一個是params 另一個是encSecKey
顯然網易雲爲了防止爬蟲已經做了很多加密工作了,這就很難受
正當我走投無路的時候,發現了這篇文章!
https://www.zhihu.com/question/36081767
真大神啊。。。人家已經把參數和什麼全部解析出來了
於是簡單地“改裝閱讀”一下代碼
EncryptUtil.py:
# -*- coding:utf-8 -*-
import os
import base64
import time
from Crypto.Cipher import AES
def createSecretKey(size):
    """Return a random lowercase-hex string of exactly `size` characters.

    Used as the per-session AES key for the NetEase "weapi" scheme.
    Each random byte contributes 1-2 hex digits (leading zeros are dropped,
    matching the original behavior), then the result is truncated to `size`.
    """
    # bytearray yields ints on both Python 2 and 3; iterating os.urandom()
    # directly yields str on Py2 (needing ord()) but ints on Py3, where the
    # old ord() call raised TypeError.
    return ''.join(hex(byte)[2:] for byte in bytearray(os.urandom(size)))[0:size]
def aesEncrypt(text, secKey):
    """AES-128-CBC encrypt `text` with the 16-char key `secKey` and return
    the base64-encoded ciphertext.

    NOTE(review): on Python 3, pycryptodome requires bytes for the key, IV,
    and plaintext — this code passes str throughout, so it assumes Python 2;
    confirm before porting.
    """
    # PKCS#7 padding up to the 16-byte AES block size.
    pad = 16 - len(text) % 16
    text = text + pad * chr(pad)
    # AES.MODE_CBC is the named constant for the bare magic number 2 the
    # original passed; same value, but it documents the cipher mode. The IV
    # is the fixed one the NetEase web client uses.
    encryptor = AES.new(secKey, AES.MODE_CBC, '0102030405060708')
    ciphertext = encryptor.encrypt(text)
    ciphertext = base64.b64encode(ciphertext)
    return ciphertext
def rsaEncrypt(text, pubKey, modulus):
    """Textbook (unpadded) RSA encryption of `text`.

    Reverses `text`, interprets its bytes as a big integer, raises it to
    `pubKey` modulo `modulus` (both hex strings), and returns the result as
    a 256-character zero-padded lowercase hex string.
    """
    import binascii  # local import so the module's top imports stay untouched
    text = text[::-1]
    # binascii.hexlify replaces the Python-2-only str.encode('hex'); int()
    # also accepts the bytes it returns on Python 3.
    plainInt = int(binascii.hexlify(text.encode('utf-8')), 16)
    # Three-argument pow() performs modular exponentiation directly; the old
    # `base ** exp % mod` materialized the full 65537th power (megabytes of
    # integer) before reducing.
    rs = pow(plainInt, int(pubKey, 16), int(modulus, 16))
    return format(rs, 'x').zfill(256)
def timeStamp(timeNum):
timeStamp = float(timeNum/1000)
timeArray = time.localtime(timeStamp)
reTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
return reTime
CrawlerComments.py
# -*- coding:utf-8 -*-
import EncryptUtil
import json
import requests
import time
import DataBase
import Logger
logger = Logger.Log()
class Crawler(object):
    """Crawls the NetEase Cloud Music comment API for one song and stores
    comments and their authors into MySQL via DataBase.Mysql."""

    def __init__(self, id):
        # RSA public-key modulus/exponent used by the web client's "weapi"
        # encryption scheme (values lifted from the site's JavaScript).
        modulus = '00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7'
        self.nonce = '0CoJUm6Qyw8W8jud'
        pubKey = '010001'
        # Per-session random AES key; its RSA encryption is sent as encSecKey.
        self.secKey = EncryptUtil.createSecretKey(16)
        self.encSecKey = EncryptUtil.rsaEncrypt(self.secKey, pubKey, modulus)
        self.mysql = DataBase.Mysql()
        self.musicId = id
        self.requestUrl = "http://music.163.com/weapi/v1/resource/comments/R_SO_4_%d/"%int(id)
        # Progress tracker hook; nothing in this file ever assigns it, so it
        # is kept as None and guarded in process() (the unguarded call used
        # to raise AttributeError after every completed crawl).
        self.taskSchedule = None
        # Headers captured from a real browser session. NOTE(review): the
        # hard-coded Content-Length and session Cookie look stale; confirm
        # the API still accepts them.
        self.headers = {
            'Host': 'music.163.com',
            'Connection': 'keep-alive',
            'Content-Length': '484',
            'Cache-Control': 'max-age=0',
            'Origin': 'http://music.163.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.84 Safari/537.36',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Accept': '*/*',
            'DNT': '1',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4',
            'Cookie': 'JSESSIONID-WYYY=b66d89ed74ae9e94ead89b16e475556e763dd34f95e6ca357d06830a210abc7b685e82318b9d1d5b52ac4f4b9a55024c7a34024fddaee852404ed410933db994dcc0e398f61e670bfeea81105cbe098294e39ac566e1d5aa7232df741870ba1fe96e5cede8372ca587275d35c1a5d1b23a11e274a4c249afba03e20fa2dafb7a16eebdf6%3A1476373826753; _iuqxldmzr_=25; _ntes_nnid=7fa73e96706f26f3ada99abba6c4a6b2,1476372027128; _ntes_nuid=7fa73e96706f26f3ada99abba6c4a6b2; __utma=94650624.748605760.1476372027.1476372027.1476372027.1; __utmb=94650624.4.10.1476372027; __utmc=94650624; __utmz=94650624.1476372027.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)',
        }

    def getComment(self, offset):
        """POST one encrypted request for the page of comments starting at
        `offset`, persist the page, and return the reported total count."""
        text = {
            'username': "",
            'password': "",
            'rememberLogin': 'true',
            'offset': offset
        }
        text = json.dumps(text)
        # Double AES-CBC: first with the fixed nonce, then with our key.
        encText = EncryptUtil.aesEncrypt(EncryptUtil.aesEncrypt(text, self.nonce), self.secKey)
        data = {
            'params': encText,
            'encSecKey': self.encSecKey
        }
        res = requests.post(self.requestUrl, headers=self.headers, data=data)
        jsonData = res.json()
        self.databaseSave(jsonData)
        return int(jsonData["total"])

    def databaseSave(self, jsonData):
        """Insert every comment (and its author) from one API response."""
        for comment in jsonData["comments"]:
            commentData = {
                'id': str(comment["commentId"]),
                'user': str(comment["user"]["userId"]),
                'content': comment["content"].encode('utf-8'),
                'likeCount': str(comment["likedCount"]),
                'commentTime': str(EncryptUtil.timeStamp(comment["time"])),
                'musicId': str(self.musicId)
            }
            # A reply references only the first replied-to user.
            if not comment["beReplied"] == []:
                commentData["reComment"] = str(comment["beReplied"][0]["user"]["userId"])
            if self.mysql.insertData("comment", commentData) >= 0:
                logger.info("Comment %s Saved."%commentData["id"])
            userData = {
                'id': str(comment["user"]["userId"]),
                'username': comment["user"]["nickname"].encode('utf-8'),
                'avatarUrl': comment["user"]["avatarUrl"].encode('utf-8')
            }
            if self.mysql.insertData("user", userData) >= 0:
                logger.info("User %s Saved."%userData["id"])

    def process(self, offset):
        """Crawl all comment pages (page size 10) starting at `offset`.
        An offset of -1 is a no-op sentinel."""
        if offset == -1:
            return
        off = offset
        total = self.getComment(off)
        while off < total:
            off += 10
            self.getComment(off)
        # Guarded: taskSchedule is never initialised in this file, so the
        # original unconditional call crashed with AttributeError here.
        if self.taskSchedule is not None:
            self.taskSchedule.trigger(self.musicId, "-1")
def main():
    """Entry point: crawl all comments of song 66842 (十年) from offset 1."""
    Crawler(66842).process(1)


if __name__ == '__main__':
    main()
DataBase.py
# -*- coding: utf-8 -*-
import MySQLdb
import setting
import time
import Logger
logger = Logger.Log()
class Mysql:
    """Thin MySQLdb wrapper: opens one connection from the `setting` module
    and offers parameterized row inserts."""

    # Current local time formatted as "[YYYY-mm-dd HH:MM:SS]" (log helper).
    def getCurrentTime(self):
        return time.strftime('[%Y-%m-%d %H:%M:%S]', time.localtime(time.time()))

    # Open the connection described by `setting`; logs (but does not raise)
    # on failure, matching the original best-effort behavior.
    def __init__(self):
        try:
            self.db = MySQLdb.connect(
                host=setting.MYSQL_HOST,
                port=int(setting.MYSQL_PORT),
                user=setting.MYSQL_USER,
                passwd=setting.MYSQL_PASSWD,
                db=setting.MYSQL_DBNAME,
                charset="utf8mb4"  # required for 4-byte emoji in comments
            )
            self.cur = self.db.cursor()
        except MySQLdb.Error as e:  # `as` syntax works on Python 2.6+ and 3
            logger.error("連接數據庫錯誤")

    def insertData(self, table, my_dict):
        """Insert one row (dict of column name -> value) into `table`.

        Returns the new row's insert id (> 0) on success, 0 if no row was
        inserted, or -1 on error (including duplicate primary key).
        """
        try:
            self.db.set_character_set('utf8mb4')
            cols = ', '.join(my_dict.keys())
            # Parameterized placeholders let the driver escape each value.
            # The old hand-built quoting was SQL-injectable and, because it
            # joined with '"," ', prepended a stray space to every value
            # after the first.
            placeholders = ', '.join(['%s'] * len(my_dict))
            sql = "INSERT INTO %s (%s) VALUES (%s)" % (table, cols, placeholders)
            try:
                result = self.cur.execute(sql, list(my_dict.values()))
                insert_id = self.db.insert_id()
                self.db.commit()
                # execute() returns the affected-row count.
                if result:
                    return insert_id
                else:
                    return 0
            except MySQLdb.Error as e:
                # Roll back the failed statement.
                self.db.rollback()
                # Duplicate primary key: the row already exists.
                if "key 'PRIMARY'" in e.args[1]:
                    logger.warning("數據已存在,未插入數據")
                else:
                    # The original copy-pasted the "already exists" message
                    # here; log a distinct one for genuine insert failures.
                    logger.error("插入數據錯誤"+str(my_dict))
                return -1
        except MySQLdb.Error as e:
            logger.error("數據庫錯誤")
            return -1
if __name__ == '__main__':
    # Smoke test: insert one sample comment row whose content is a 4-byte
    # UTF-8 emoji, exercising the utf8mb4 connection charset.
    db = Mysql()
    sample = {
        'id': '1',
        'user': '1',
        'content': '\xF0\x9F\x8E\xA4',
        'likeCount': '1'
    }
    db.insertData('comment', sample)
好了,我的初步爬取應該是沒什麼問題了,下一步就是
- 利用代理池進行不同代理的爬蟲防止IP被封鎖問題
- 利用線程池進行多線程的爬蟲提高效率
- 利用TaskSchedule記錄爬蟲的爬取進度
那麼這些就留給下一次再寫了
最後附一下github地址: https://github.com/WJerry0227/MusicComment163
應該會持續更新..