思路:
1.從需要被爬的網站裏獲取數據
2.依據獲取到的數據結構(字段)創建數據庫表結構
3.創建數據表
4.插入數據前整理數據
5.將整理好的數據插入到數據庫中
6.插入操作結束後關閉數據庫連接或者收集錯誤信息
待優化部分:
1.特殊字符(如表情符號)存入數據庫中時會報錯,需要優化編碼方式
2.錯誤信息可以通過郵件發給管理員
3.優化同步異步操作,把多層嵌套的 callback 改寫成 Promise
4.包含發郵件和 Promise 的優化版本見: https://blog.csdn.net/tengxi_5290/article/details/103368115
公共資源:
import {connection} from './../../common/dbConnect.js' // 連接數據庫(代碼在文章最底端)
const https = require('https') // 網絡請求
// URLs of the site being scraped
const url = 'https://douban.fm/j/v2/songlist/explore?type=hot&genre=0&limit=20&sample_cnt=5' // API endpoint the data is fetched from
const urlDir = 'https://douban.fm/explore/songlists' // URL of the same content as a page visited in a browser
let errorInfoGroup = [] // records whose INSERT reported an error
let successInfoGroup = [] // records whose INSERT completed without an error
let testInterval // handle of the polling timer that waits for all inserts to settle
實例代碼:
/**
 * Fetch the payload served at the module-level `url` over HTTPS.
 *
 * Each received chunk is buffered untouched while a running total of the
 * chunk lengths is kept; when the response ends, both are passed on.
 *
 * @param {function(Array, number): void} callback - invoked once as
 *   `callback(chunks, size)` with the buffered chunks and their summed length.
 */
let getWebData = function (callback) {
  const collectBody = (response) => {
    const parts = []
    let totalLength = 0

    const onChunk = (piece) => {
      parts.push(piece)
      totalLength += piece.length
    }
    const onFinished = () => {
      callback(parts, totalLength)
    }

    response.on('data', onChunk)
    response.on('end', onFinished)
  }

  https.get(url, collectBody)
}
/**
 * Derive the CREATE TABLE statement for `doubanAlbum` from the downloaded JSON.
 *
 * The chunks are joined and parsed as a JSON array, the array is reversed, and
 * the keys of the first record (after reversal) become the column list:
 * `id` is declared INT (10) NOT NULL and used as primary key, every other key
 * becomes a nullable VARCHAR (255) column.
 *
 * Column names originate from remote data, so each one is emitted as a
 * backtick-quoted identifier (embedded backticks doubled). Without quoting,
 * keys that are reserved words or contain special characters would break or
 * alter the statement.
 *
 * @param {Buffer[]} chunks - raw response chunks.
 * @param {number} size - total byte length of all chunks.
 * @param {function(string, Array): void} callback - invoked as
 *   `callback(createString, html)` with the DDL text and the reversed records.
 * @throws {SyntaxError} when the payload is not valid JSON.
 * @throws {Error} when the payload holds no record to derive columns from.
 */
let getTableSkelon = function (chunks, size, callback) {
  // Quote a name as a MySQL identifier; a literal backtick is escaped by doubling it.
  const quoteIdentifier = (name) => '`' + String(name).replace(/`/g, '``') + '`'

  const data = Buffer.concat(chunks, size)
  const html = JSON.parse(data.toString()).reverse()

  const firstRecord = html[0]
  if (html.length === 0 || firstRecord === null || typeof firstRecord !== 'object') {
    // Without a sample record there are no columns, and the resulting DDL would be invalid.
    throw new Error('getTableSkelon: payload contains no record to derive table columns from')
  }

  const createParams = Object.keys(firstRecord)
    .map((key) => (
      key !== 'id'
        ? quoteIdentifier(key) + ' VARCHAR (255) DEFAULT NULL, '
        : quoteIdentifier('id') + ' INT (10) NOT NULL, '
    ))
    .join('')

  const createString = 'CREATE TABLE IF NOT EXISTS doubanAlbum (' +
    createParams + 'PRIMARY KEY (' + quoteIdentifier('id') + '))'
  callback(createString, html)
}
/**
 * Execute the CREATE TABLE statement on the shared `connection`.
 *
 * @param {string} createString - DDL text to run.
 * @param {Array} html - records to forward once the table exists.
 * @param {function(Array): void} callback - invoked as `callback(html)` when
 *   the query reports no error.
 * @throws rethrows the query error from inside the query callback.
 */
let createTable = function (createString, html, callback) {
  const onQueryDone = (queryError) => {
    if (queryError) {
      throw queryError
    }
    callback(html)
  }

  connection.query(createString, onQueryDone)
}
/**
 * Flatten one record in place so that every field holds a single column
 * value, then pass the same object on to `fillbd`.
 *
 * Per-field rules:
 *   - null                   -> left untouched
 *   - `sample_songs` array   -> comma-joined list of each element's `sid`
 *   - any other array        -> its JSON text
 *   - non-array object       -> its own `id` property, or null when it has none
 *   - anything else          -> unchanged
 *
 * @param {Object} newInfo - record to normalise; it is modified directly.
 */
let fillTable = function (newInfo) {
  const flatten = (field, value) => {
    if (value instanceof Array) {
      return field === 'sample_songs'
        ? value.map((song) => song.sid).join(',')
        : JSON.stringify(value)
    }
    if (typeof value === 'object') {
      return value.hasOwnProperty('id') ? value.id : null
    }
    return value
  }

  for (const field in newInfo) {
    const value = newInfo[field]
    if (value === null) {
      continue
    }
    newInfo[field] = flatten(field, value)
  }

  fillbd(newInfo)
}
/**
 * Insert one prepared record into `doubanAlbum` and file it under
 * `errorInfoGroup` or `successInfoGroup` depending on the query outcome.
 *
 * A failed insert is recorded, not thrown, so the remaining rows still run.
 *
 * @param {Object} newInfo - column/value map passed to the `SET ?` placeholder.
 */
let fillbd = function (newInfo) {
  const recordOutcome = (insertError) => {
    const bucket = insertError ? errorInfoGroup : successInfoGroup
    bucket.push(newInfo)
  }

  connection.query('INSERT INTO doubanAlbum SET ?', newInfo, recordOutcome)
}
/**
 * Polling tick that finishes the run once every record has been accounted for.
 *
 * A record counts as settled when it appears in `successInfoGroup` or
 * `errorInfoGroup`. While some are still pending the tick does nothing and the
 * interval fires again. When all are settled it closes the connection, stops
 * the interval and prints the failed records, if any.
 *
 * The failure report used to sit in the "still pending" branch, so it was
 * printed on every idle tick and never after completion; it now runs exactly
 * once, at the end.
 *
 * @param {Array} html - the full list of records that were submitted.
 */
let endConnection = function (html) {
  const settledCount = successInfoGroup.length + errorInfoGroup.length
  if (settledCount !== html.length) {
    // Inserts are still in flight; wait for the next tick.
    return
  }

  connection.end()
  clearInterval(testInterval)

  if (errorInfoGroup.length > 0) {
    console.log('錯誤數據')
    console.log(errorInfoGroup)
  }
  // TODO: email the failed records to the administrator and write them to a log
}
// Run the pipeline: download -> derive schema -> create table -> insert rows,
// then poll once per second until every insert has settled.
getWebData(function onDownloaded(chunks, size) {
  getTableSkelon(chunks, size, function onSchemaReady(createString, html) {
    createTable(createString, html, function onTableReady() {
      for (const record of html) {
        fillTable(record)
      }
      testInterval = setInterval(endConnection, 1000, html)
    })
  })
})
// Contents of dbConnect.js: creates the MySQL connection object that the
// crawler code imports as `connection`.
const mysql = require('mysql')
// Placeholder: replace with your database name (as a quoted string).
const dbName = YOUR DATABASE NAME
const connection = mysql.createConnection({
host: 'localhost',
user: 'root',
// Placeholder: replace with the password of the database user (as a quoted string).
password: YOUR PASSWORD FOR YOUR DATABASE,
port: 3306,
// NOTE(review): `autoReconnect` does not appear among the documented
// connection options of the `mysql` package; confirm it has any effect.
autoReconnect: true
})
// Select the database for this connection. No callback is passed, so a
// failure of this statement is not handled here.
connection.query('USE ' + dbName)
// Export the connection object under the name `connection`.
module.exports = {
connection
}