由於沒有真實的網絡日誌,我們這裏用之前寫的 Python 腳本爬取新浪微博熱搜,來模擬產生日誌文件,再通過 Flume 和 Kafka 的整合,將日誌抽取到 Spark 上進行處理。微博熱搜每十分鐘更新一次,所以我們這裏也設置每十分鐘執行一次的定時任務,具體步驟如下:
第一步
編寫python腳本獲取微博熱搜 實時排名,主題和url,然後運行測試,代碼如下
#!python2
# -*- coding:utf-8 -*-
"""Scrape the Sina Weibo hot-search board and print it as tab-separated lines.

Each output line has four fields: ``rank<TAB>title<TAB>url<TAB>date``, where
``date`` is the crawl day in ``YYYY-MM-DD`` form. The code runs on Python 2
(its original target) and also on Python 3.
"""
import datetime
import re
import sys

try:
    from urllib import unquote  # Python 2
except ImportError:
    from urllib.parse import unquote  # Python 3

PY2 = sys.version_info[0] == 2

HOT_SEARCH_URL = 'http://s.weibo.com/top/summary'
SEARCH_URL_PREFIX = 'http://s.weibo.com/weibo/'

# The pattern expects backslash-escaped quotes and slashes in the response
# text (presumably because the markup is embedded in a script string).
HOT_KEY_PATTERN = re.compile(
    r'td class=\\"td_05\\"><a href=\\"\\/weibo\\/(.*?)&Refer=top\\"')


def fetch_hot_search_html(url=HOT_SEARCH_URL, timeout=30):
    """Download the hot-search page and return its text."""
    # Imported here so the parsing helpers work without the HTTP dependency.
    import requests
    # A timeout keeps a hung request from blocking a scheduled run forever.
    return requests.get(url, timeout=timeout).text


def extract_encoded_titles(html):
    """Return the percent-encoded titles found in *html*, in page order.

    The page double-encodes the titles ('%' itself appears as '%25'), so
    '%25' is collapsed back to '%'. Removing every bare '25' instead would
    also corrupt titles that really contain the digits 25.
    """
    return [raw.replace('%25', '%') for raw in HOT_KEY_PATTERN.findall(html)]


def decode_title(encoded_title):
    """Percent-decode *encoded_title* (UTF-8) into unicode text."""
    if PY2:
        return unquote(encoded_title.encode('utf-8')).decode('utf-8')
    return unquote(encoded_title)


def format_hot_search_lines(html, day=None):
    """Build the output lines (without trailing newline) for *html*.

    *day* defaults to today's date in ISO format; ranks start at 1.
    """
    if day is None:
        day = datetime.date.today().isoformat()
    return [
        u'\t'.join((str(rank), decode_title(encoded),
                    SEARCH_URL_PREFIX + encoded, day))
        for rank, encoded in enumerate(extract_encoded_titles(html), 1)
    ]


def main():
    """Fetch the board and print one line per hot-search entry."""
    for line in format_hot_search_lines(fetch_hot_search_html()):
        if PY2:
            # Python 2 cannot print non-ASCII unicode to a redirected stdout.
            line = line.encode('utf-8')
        # No extra '\n': blank lines would reach the consumer as empty records.
        print(line)


if __name__ == '__main__':
    main()
運行python腳本
第二步
在mysql創建表,包括當天最高排名,主題,url,和日期
# Main table that the Spark Streaming job writes into. The UNIQUE constraint
# on title also creates the index used to find a topic when its rank changes.
CREATE TABLE weiboHotSearch(
    highest_rank INT(4),
    title VARCHAR(100) UNIQUE,
    # A percent-encoded Chinese character takes 9 characters in the URL, so
    # 100 characters overflow after about 8 title characters.
    url VARCHAR(512),
    day_date DATE);
# Staging table: every night the current day's rows of the main table are
# copied here. Column widths must stay identical to the main table so that
# "insert ... select *" cannot truncate or fail.
CREATE TABLE weiboHotSearch_temp(
    highest_rank INT(4),
    title VARCHAR(100),
    url VARCHAR(512),
    day_date DATE);
第三步
編寫代碼,實現從kafka實時獲取熱搜榜,並存入數據庫,然後打成jar包
package com.stanley.sparktest.weibo
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.kafka.KafkaUtils
import kafka.serializer.StringDecoder
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.Row
import java.util.Date
import com.stanley.sparktest.sqlUtil.ConnectionPool
/**
 * Spark Streaming job that reads Weibo hot-search lines from Kafka topic
 * `weiboTopic` and keeps, per title, the best (numerically lowest) rank seen
 * in the MySQL table `weiboHotSearch`.
 *
 * Every Kafka message is expected to be one tab-separated line with four
 * fields: rank, title, url, date. Lines that do not match are skipped.
 */
object WeiBoHot {

  // Parameterized statements: titles may contain quotes, so values must never
  // be concatenated into the SQL text.
  private val SelectRankSql = "select highest_rank from weiboHotSearch where title=?"
  private val UpdateRankSql = "update weiboHotSearch set highest_rank=? where title=?"
  private val InsertSql = "insert into weiboHotSearch values(?,?,?,?)"

  /** Parses one input line; returns None for short lines or a non-numeric rank. */
  private def parseLine(line: String): Option[(Int, String, String, String)] = {
    val fields = line.split("\t")
    if (fields.length < 4) {
      None
    } else {
      scala.util.Try(fields(0).trim.toInt).toOption
        .map(rank => (rank, fields(1), fields(2), fields(3)))
    }
  }

  /** Returns the rank currently stored for `title`, if the title exists. */
  private def readHighestRank(connection: java.sql.Connection, title: String): Option[Int] = {
    val stmt = connection.prepareStatement(SelectRankSql)
    try {
      stmt.setString(1, title)
      val rs = stmt.executeQuery()
      if (rs.next()) Some(rs.getInt("highest_rank")) else None
    } finally {
      stmt.close() // closing the statement also closes its result set
    }
  }

  /** Inserts a new title, or lowers the stored rank when the new one is better. */
  private def saveRecord(connection: java.sql.Connection,
                         record: (Int, String, String, String)): Unit = {
    val (rank, title, url, date) = record
    readHighestRank(connection, title) match {
      case Some(stored) if rank < stored =>
        val stmt = connection.prepareStatement(UpdateRankSql)
        try {
          stmt.setInt(1, rank)
          stmt.setString(2, title)
          stmt.executeUpdate()
        } finally {
          stmt.close()
        }
      case Some(_) => // stored rank is already at least as good; nothing to do
      case None =>
        val stmt = connection.prepareStatement(InsertSql)
        try {
          stmt.setInt(1, rank)
          stmt.setString(2, title)
          stmt.setString(3, url)
          stmt.setString(4, date)
          stmt.executeUpdate()
        } finally {
          stmt.close()
        }
    }
  }

  def main(args: Array[String]): Unit = {
    // NOTE(review): the hard-coded master takes precedence over spark-submit --master.
    val sparkConf = new SparkConf()
      .setAppName("Weibo HotSerach Application")
      .setMaster("local[2]")
    // Create SparkContext
    val sc = new SparkContext(sparkConf)
    // One micro-batch every 5 seconds
    val ssc = new StreamingContext(sc, Seconds(5))
    // Checkpoint data goes to the current working directory
    ssc.checkpoint(".")
    // Kafka Cluster
    val kafkaParam = Map("metadata.broker.list" -> "master:9092")
    // Kafka Topics
    val topics = Set("weiboTopic")

    // Step 1: Create DStream (keep only the message value)
    val lineDStream = KafkaUtils.createDirectStream[
      String, String, StringDecoder, StringDecoder](
      ssc,        // StreamingContext
      kafkaParam, // kafkaParams: Map[String, String]
      topics      // topics: Set[String]
    ).map(tuple => tuple._2)

    // Step 2: DStream Transformation; malformed lines are dropped here
    val tupleDStream = lineDStream.flatMap(line => parseLine(line).toList)

    // Step 3: write every partition through one pooled JDBC connection
    tupleDStream.foreachRDD(rdd => rdd.foreachPartition(partitionOfRecords => {
      // Do not borrow a connection for an empty partition
      if (partitionOfRecords.hasNext) {
        val connection = ConnectionPool.getConnection()
        if (connection == null) {
          throw new IllegalStateException("ConnectionPool returned no connection")
        }
        try {
          partitionOfRecords.foreach(record => {
            println("input data is " + record._1 + "\t" + record._2 + "\t" + record._3 + "\t" + record._4)
            saveRecord(connection, record)
          })
        } finally {
          // Always hand the connection back, even when a statement failed
          ConnectionPool.returnConnection(connection)
        }
      }
    }))

    tupleDStream.print()
    ssc.start()
    ssc.awaitTermination()
    // Stop Context
    ssc.stop()
    sc.stop()
  }
}
package com.stanley.sparktest.sqlUtil;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.util.LinkedList;
public class ConnectionPool {
private static LinkedList<Connection> connectionQueue;
static {
try {
Class.forName("com.mysql.jdbc.Driver");
} catch (ClassNotFoundException e) {
e.printStackTrace();
}
}
public synchronized static Connection getConnection() {
try {
if (connectionQueue == null) {
connectionQueue = new LinkedList<Connection>();
for (int i = 0; i < 5; i++) {
Connection conn = DriverManager.getConnection(
"jdbc:mysql://master:3306/test",
"root",
"123456");
connectionQueue.push(conn);
}
}
} catch (Exception e) {
e.printStackTrace();
}
return connectionQueue.poll();
}
public static void returnConnection(Connection conn){connectionQueue.push(conn);}
}
第四步
整合 Flume 和 Kafka,通過執行 tail -F 命令抽取微博熱搜榜數據,Flume 配置文件如下
# The configuration file needs to define the sources,
# the channels and the sinks.
# Sources, channels and sinks are defined per agent,
# in this case called 'a2'
a2.sources = r2
a2.channels = c2
a2.sinks = k2
# define sources
a2.sources.r2.type = exec
## The user that runs the Flume agent must have read permission on the
## data file that is tailed below.
## NOTE(review): the backquoted date is expanded by the shell when the command
## is launched, so the file name looks fixed for the lifetime of the command -
## confirm how the agent picks up the next day's file.
a2.sources.r2.command = tail -F /opt/project/weibo/data_`date +"%Y-%m-%d"`
a2.sources.r2.shell = /bin/bash -c
# define channels
a2.channels.c2.type = memory
a2.channels.c2.capacity = 1000
a2.channels.c2.transactionCapacity = 100
# define sinks
# Kafka sink: events are published to topic weiboTopic on broker master:9092
a2.sinks.k2.type = org.apache.flume.sink.kafka.KafkaSink
a2.sinks.k2.brokerList = master:9092
a2.sinks.k2.topic = weiboTopic
# bind the sources and sinks to the channels
a2.sources.r2.channels = c2
a2.sinks.k2.channel = c2
第五步
啓動kafka 創建topic ,啓動flume
# Start the Kafka broker with config/server.properties (paths are relative to the current directory)
bin/kafka-server-start.sh config/server.properties
# Create topic weiboTopic with 1 partition and replication factor 1, using ZooKeeper at master:2181
bin/kafka-topics.sh --create --zookeeper master:2181 --replication-factor 1 --partitions 1 --topic weiboTopic
第六步
創建hive分區表與mysql關聯
用來存放mysql數據
-- Partitioned Hive table that stores the rows coming from MySQL.
-- Data files are tab-delimited text; partitions are year / month / day.
CREATE EXTERNAL TABLE weiboHotSearch (
    highest_rank INT,
    title        STRING,
    url          STRING,
    day_date     STRING
)
PARTITIONED BY (year STRING, month STRING, day STRING)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t';
臨時表,用來導入mysql數據
-- Unpartitioned Hive staging table with the same four columns,
-- stored as tab-delimited text.
CREATE TABLE weiboHotSearch_temp (
    highest_rank INT,
    title        STRING,
    url          STRING,
    day_date     STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t';
第七步
編寫兩個 shell 腳本:python_shell.sh(執行 python 爬蟲腳本)和 operation.sh(操作腳本),內容依次如下
#!/bin/sh
# Run the Weibo hot-search crawler once and append its output to today's data
# file, /opt/project/weibo/data_YYYY-MM-DD.
# /usr/local/bin stays first in PATH; the system directories are kept so that
# `date` can be resolved as well.
PATH=/usr/local/bin:/usr/bin:/bin
export PATH
python_dir=/opt/project/weibo/weiboHot.py
data_file=/opt/project/weibo/data_$(date +%Y-%m-%d)
# The crawler prints to stdout; without this redirect nothing reaches the data file.
python "${python_dir}" >> "${data_file}"
#!/bin/sh
# Nightly hand-over script: copies today's rows from the MySQL main table to
# the MySQL staging table, empties the main table, imports the staging table
# into Hive with Sqoop, loads it into the partitioned Hive table and finally
# removes yesterday's crawler data file.

# Load the environment variables
. /etc/profile
# Stop at the first failing step, so the main table is never truncated after
# a failed copy. Enabled only after /etc/profile has been sourced.
set -e

# HDFS data source directory (not referenced below)
DATA_LOG=/user/hive/warehouse/work0403.db
# Today's date
TODAY=$(date +%Y-%m-%d)
# Yesterday's date
YESTERDAY=$(date -d "yesterday" +%Y-%m-%d)

# Database settings
# NOTE(review): the password is passed on the command line and is visible in
# the process list; consider a MySQL option file instead.
HOSTNAME=master
PORT=3306
USERNAME=root
PASSWORD=123456
DATABASE=weibo

# SQL statements
sql_truncate_temp="truncate table ${DATABASE}.weiboHotSearch_temp"
sql_insert="insert into ${DATABASE}.weiboHotSearch_temp (select * from ${DATABASE}.weiboHotSearch where day_date='${TODAY}')"
sql_truncate_main="truncate table ${DATABASE}.weiboHotSearch"

# Empty the staging table left over from the previous day
mysql -h"${HOSTNAME}" -P"${PORT}" -u"${USERNAME}" -p"${PASSWORD}" -e "${sql_truncate_temp}" --default-character-set=UTF8
# Copy today's rows from the main table into the staging table
mysql -h"${HOSTNAME}" -P"${PORT}" -u"${USERNAME}" -p"${PASSWORD}" -e "${sql_insert}" --default-character-set=UTF8
# Empty the main table
mysql -h"${HOSTNAME}" -P"${PORT}" -u"${USERNAME}" -p"${PASSWORD}" -e "${sql_truncate_main}" --default-character-set=UTF8

# Empty the Hive staging table left over from the previous day
hive -e "
use ${DATABASE};
truncate table weiboHotSearch_temp;"

# Import the MySQL staging table into the Hive staging table with Sqoop
sqoop import \
--connect "jdbc:mysql://${HOSTNAME}:${PORT}/${DATABASE}" \
--username "${USERNAME}" \
--password "${PASSWORD}" \
--table weiboHotSearch_temp \
--num-mappers 1 \
--fields-terminated-by "\t" \
--delete-target-dir \
--hive-database "${DATABASE}" \
--hive-import \
--hive-table weiboHotSearch_temp

# Load the Hive staging table into the partitioned Hive main table.
# The dynamic-partition mode is "nonstrict" (the original "nostrick" was a typo).
hive -e "
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
insert into table ${DATABASE}.weiboHotSearch partition(year,month,day)
select highest_rank,title,url,day_date,substr(day_date,1,4) year,substr(day_date,6,2) month, substr(day_date,9,2) day
from ${DATABASE}.weiboHotSearch_temp;"

# Remove yesterday's crawler data file
rm -rf "/opt/project/weibo/data_${YESTERDAY}"
第八步
測試運行
啓動hdfs
${HADOOP_HOME}/sbin/start-dfs.sh
${HADOOP_HOME}/sbin/start-yarn.sh
${HADOOP_HOME}/sbin/mr-jobhistory-daemon.sh start historyserver
啓動zookeeper
# zkServer.sh ships in ZooKeeper's bin/ directory (there is no sbin/)
${ZOOKEEPER_HOME}/bin/zkServer.sh start
啓動hive metastore
${HIVE_HOME}/bin/hive --service metastore &
啓動flume
# Absolute conf directory, so the command works from any working directory
${FLUME_HOME}/bin/flume-ng agent --conf ${FLUME_HOME}/conf/ --name a2 --conf-file /opt/project/weibo/flume-kafka_weibo.conf
啓動kafka
# Absolute config path, so the command works from any working directory
${KAFKA_HOME}/bin/kafka-server-start.sh ${KAFKA_HOME}/config/server.properties
運行 Spark jar 包
${SPARK_HOME}/bin/spark-submit \
--class com.stanley.sparktest.weibo.WeiBoHot \
/opt/project/weibo/weiboHot.jar
執行python 腳本
bash /opt/project/weibo/python_shell.sh
python腳本執行完後執行操作腳本
bash /opt/project/weibo/operation.sh
爬取的內容抽取到kafka集羣中
查看數據庫,數據已存入數據庫
執行操作腳本後
數據庫中接收數據的主表被清空,數據轉入用於導入 Hive 的臨時表
數據已經存入到 Hive 中
數據也已存入到 HDFS 對應的分區目錄當中
數據文件已經自動產生
第九步
設置定時任務
可以用oozie來調度,
Crontab調度如下
# Crawl the Weibo hot-search data every ten minutes (at minutes 2, 12, 22, 32, 42 and 52)
2,12,22,32,42,52 * * * * bash /opt/project/weibo/python_shell.sh
# Run the operation script every day at 23:55 to move the data into HDFS
55 23 * * * bash /opt/project/weibo/operation.sh
使用hue 頁面操作oozie
創建兩個workflow,分別對應python腳本,和操作腳本
兩個腳本分別創建coordinator,頻率和上面的crontab一致
創建bundle,將兩個coordinator綁在一起
最後提交bundle任務