Spark Streaming simulated hands-on project example

Since we do not have real web access logs, we use a previously written Python script that crawls the Sina Weibo hot search list to simulate log generation. Through the Flume and Kafka integration, the logs are pulled into Spark on a schedule and processed there. The Weibo hot search list refreshes every ten minutes, so we also set the scheduled task to run every ten minutes. The detailed steps are as follows.

Step 1

Write a Python script that fetches the real-time rank, title, and URL of each Weibo hot search entry, then run it as a test. The code is as follows:

#!/usr/bin/env python2
# -*- coding:utf-8 -*-
import urllib,requests,re,sys
import datetime

# fetch the hot search page source
weiboHotFile=requests.get('http://s.weibo.com/top/summary')
weiboHotHtml=weiboHotFile.text
# regular expression: match the link URL and capture the title
hotKey=re.compile(r'td class=\"td_05\"><a href=\"\/weibo\/(.*?)&Refer=top\"')
hotKeyListBe=hotKey.findall(weiboHotHtml)
rank=1
today=datetime.date.today().strftime('%Y-%m-%d')
# iterate over the captured title list
for title in hotKeyListBe:
    # remove the interfering '25' introduced by double URL encoding
    title=title.replace('25','')
    url='http://s.weibo.com/weibo/'+title
    # one record per line: rank \t title \t url \t date, matching the fields the Spark job parses
    print(str(rank)+'\t'+(str(urllib.unquote(title.encode('utf-8'))).decode('utf-8'))+'\t'+url+'\t'+today)
    rank+=1

Run the Python script
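A minimal sketch of a test run, assuming the script lives at /opt/project/weibo/weiboHot.py as in the later steps; output is appended to the dated data file that Flume will tail:

# run the crawler once and append the records to today's data file
python /opt/project/weibo/weiboHot.py >> /opt/project/weibo/data_$(date +"%Y-%m-%d")
# inspect the last few records (rank, title, url, date separated by tabs)
tail -3 /opt/project/weibo/data_$(date +"%Y-%m-%d")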



Step 2

Create the tables in MySQL, with columns for the day's highest rank, the title, the URL, and the date.

# Spark Streaming writes its data into this table; the unique index on title speeds up the rank-replacement lookups
Create table weiboHotSearch(
highest_rank int(4),
title varchar(100) unique,
url varchar(100),
day_date date);
# Temporary table: every night the current day's data from the main table is loaded into it
Create table weiboHotSearch_temp(
highest_rank int(4),
title varchar(100),
url varchar(100),
day_date date);
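To confirm that the DDL above took effect, in particular that the unique index on title exists, the tables can be inspected from the mysql client. A minimal sketch, assuming the tables live in the test database used by the Spark job's connection pool (operation.sh later refers to a weibo database, so adjust the name to wherever you actually created them):

# show the tables and the unique index created by the DDL above
mysql -hmaster -P3306 -uroot -p123456 --default-character-set=UTF8 -e "
use test;
show tables;
show index from weiboHotSearch;
desc weiboHotSearch_temp;"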


Step 3

Write the code that consumes the hot search records from Kafka in real time and stores them in the database, then package it into a jar.

package com.stanley.sparktest.weibo

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.kafka.KafkaUtils
import kafka.serializer.StringDecoder
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.Row
import java.util.Date
import com.stanley.sparktest.sqlUtil.ConnectionPool

object WeiBoHot {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf()
         .setAppName("Weibo HotSearch Application")
         .setMaster("local[2]")
     // Create SparkContext
     val sc = new SparkContext(sparkConf)
    // one batch every 5 seconds
     val ssc = new StreamingContext(sc, Seconds(5))
    // set the checkpoint directory
    ssc.checkpoint(".")
    // Kafka Cluster
     val kafkaParam = Map("metadata.broker.list" -> "master:9092")

     // Kafka Topics
     val topics = Set("weiboTopic")
 
     // Step 1: Create DStream
     val lineDStream = KafkaUtils.createDirectStream[
       String, String, StringDecoder,StringDecoder](
         ssc, // StreamingContext
         kafkaParam, // kafkaParams: Map[String, String]
         topics // topics: Set[String]
     ).map(tuple => tuple._2)
     // Step 2: DStream Transformation
     val tupleDStream = lineDStream
           .map(line => line.split("\t"))
           .map(arr=>{
             val rank=arr(0)
             val title=arr(1)
             val url=arr(2)
             val date=arr(3)
             (rank,title,url,date)
           })
       tupleDStream.foreachRDD(rdd => rdd.foreachPartition(partitionOfRecords =>{
       val connection = ConnectionPool.getConnection()
       partitionOfRecords.foreach(record => {
         println("input data is " + record._1 + "\t" +record._2+"\t"+record._3+"\t"+record._4)    
        
        val sql1="select * from weiboHotSearch where title='"+record._2+"'"
         println("sql:" + sql1)
         val stmt = connection.createStatement        
         val rs=stmt.executeQuery(sql1)
         if(rs.next()){
           // compare with the previously stored rank and keep the better (lower) one
           var highest_rank=rs.getInt("highest_rank")
           if(record._1.toInt<highest_rank){
             highest_rank=record._1.toInt
           }
           val sql="update  weiboHotSearch set highest_rank="+highest_rank+" where title='"+record._2+"'"
           println("sql:" + sql)
           stmt.executeUpdate(sql)
         }else{
           val sql="insert into weiboHotSearch values("+record._1+",'"+record._2+"','"+record._3+"','"+record._4+"')"
          println("sql:" + sql)
          stmt.executeUpdate(sql)          
         }
         rs.close()
         stmt.close()
       })
       ConnectionPool.returnConnection(connection)
     }
     ))
     // print the transformed records for debugging
     tupleDStream.print()
     ssc.start()
     ssc.awaitTermination()
     // Stop Context
     ssc.stop()
     sc.stop()
  }
} 

package com.stanley.sparktest.sqlUtil;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.util.LinkedList;

public class ConnectionPool {
    private static LinkedList<Connection> connectionQueue;

    static {
        try {
            Class.forName("com.mysql.jdbc.Driver");
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
    }

    public synchronized static Connection getConnection() {
        try {
            if (connectionQueue == null) {
                connectionQueue = new LinkedList<Connection>();
                for (int i = 0; i < 5; i++) {
                    Connection conn = DriverManager.getConnection(
                            "jdbc:mysql://master:3306/test",
                            "root",
                            "123456");
                    connectionQueue.push(conn);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return connectionQueue.poll();
    }
    public static void returnConnection(Connection conn){connectionQueue.push(conn);}
    
}
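The later steps submit the classes above as /opt/project/weibo/weiboHot.jar, but the original does not show the build step. A minimal sketch, assuming a Maven project (the artifact name below is hypothetical; an sbt build works equally well):

# package the Scala and Java sources, then copy the jar to the project directory
mvn clean package
cp target/sparktest-1.0-SNAPSHOT.jar /opt/project/weibo/weiboHot.jar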


Step 4

Integrate Flume with Kafka: an exec source runs tail -F to pull the Weibo hot search data file.

# The configuration file needs to define the sources, 
# the channels and the sinks.
# Sources, channels and sinks are defined per agent, 
# in this case called 'agent'

a2.sources = r2
a2.channels = c2
a2.sinks = k2

# define sources
a2.sources.r2.type = exec
## Note: the user running the flume command must have read
## permission on the data file being tailed
a2.sources.r2.command = tail -F /opt/project/weibo/data_`date +"%Y-%m-%d"`
a2.sources.r2.shell = /bin/bash -c

# define channels
a2.channels.c2.type = memory
a2.channels.c2.capacity = 1000
a2.channels.c2.transactionCapacity = 100

# define sinks
# Kafka sink: publish each line read by the exec source to the weiboTopic topic
a2.sinks.k2.type = org.apache.flume.sink.kafka.KafkaSink
a2.sinks.k2.brokerList = master:9092
a2.sinks.k2.topic = weiboTopic

# bind the sources and sinks to the channels
a2.sources.r2.channels = c2
a2.sinks.k2.channel = c2
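Since the exec source tails a dated file and the comment above warns about read permission, it helps to make sure today's data file exists and is readable before starting the agent; a minimal sketch:

# create today's data file (if the crawler has not run yet) and make it readable
touch /opt/project/weibo/data_$(date +"%Y-%m-%d")
chmod 644 /opt/project/weibo/data_$(date +"%Y-%m-%d")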


Step 5

Start Kafka, create the topic, and start Flume.

bin/kafka-server-start.sh config/server.properties
bin/kafka-topics.sh --create --zookeeper master:2181 --replication-factor 1 --partitions 1 --topic weiboTopic
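Once Flume is running, it is worth checking that events actually arrive in the topic before starting the Spark job; a minimal sketch with the console consumer (on newer Kafka versions, --bootstrap-server master:9092 replaces --zookeeper):

bin/kafka-console-consumer.sh --zookeeper master:2181 --topic weiboTopic --from-beginning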


Step 6

Create Hive tables (a partitioned main table and a temporary table) to link up with the MySQL data.


Main partitioned table, used to store the MySQL data:
create external table weiboHotSearch(
highest_rank int,
title string,
url string,
day_date string
)partitioned by (year string,month string,day string)
row format delimited fields terminated by '\t';

Temporary table, used to stage the imported MySQL data:
create table weiboHotSearch_temp(
highest_rank int,
title string,
url string,
day_date string
)row format delimited fields terminated by '\t';
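After creating the tables, they can be checked from the hive CLI; a minimal sketch, assuming they were created in the weibo database that operation.sh later imports into:

hive -e "
use weibo;
show tables;
show partitions weiboHotSearch;"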


Step 7

Write the shell scripts: python_shell.sh, which runs the crawler, and operation.sh, which moves the day's data into Hive every night.

#!/bin/sh
# python_shell.sh: run the crawler and append its output to today's data file,
# which is the file tailed by the Flume exec source
PATH=/usr/local/bin:$PATH
export PATH
python_dir=/opt/project/weibo/weiboHot.py
python ${python_dir} >> /opt/project/weibo/data_$(date +"%Y-%m-%d")

#!/bin/sh
# operation script
# load environment variables
. /etc/profile

# HDFS data source directory
DATA_LOG=/user/hive/warehouse/work0403.db

# today's date
TODAY=`date +%Y-%m-%d`

# yesterday's date
YESTERDAY=$(date -d "yesterday" +%Y-%m-%d)

# database connection settings
HOSTNAME=master
PORT=3306
USERNAME=root
PASSWORD=123456
DATABASE=weibo
# SQL statements
sql_truncate_temp="truncate table ${DATABASE}.weiboHotSearch_temp"
sql_insert="insert into ${DATABASE}.weiboHotSearch_temp (select * from ${DATABASE}.weiboHotSearch where day_date=\""${TODAY}"\")"
sql_truncate_main="truncate table ${DATABASE}.weiboHotSearch"
# empty the previous day's temporary table
mysql -h${HOSTNAME}  -P${PORT}  -u${USERNAME} -p${PASSWORD} -e "${sql_truncate_temp}" --default-character-set=UTF8
# insert the main table's data into the temporary table
mysql -h${HOSTNAME}  -P${PORT}  -u${USERNAME} -p${PASSWORD} -e "${sql_insert}" --default-character-set=UTF8
# empty the main table
mysql -h${HOSTNAME}  -P${PORT}  -u${USERNAME} -p${PASSWORD} -e "${sql_truncate_main}" --default-character-set=UTF8
# empty the previous day's Hive temporary table
hive -e "
use ${DATABASE};
truncate table weiboHotSearch_temp;"
# use Sqoop to import the MySQL temporary table into the Hive temporary table
sqoop import \
--connect jdbc:mysql://${HOSTNAME}:${PORT}/${DATABASE} \
--username ${USERNAME} \
--password ${PASSWORD} \
--table weiboHotSearch_temp \
--num-mappers 1 \
--fields-terminated-by "\t" \
--delete-target-dir \
--hive-database ${DATABASE} \
--hive-import \
--hive-table weiboHotSearch_temp

# load the Hive temporary table into the partitioned Hive main table
hive -e "
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
insert into table ${DATABASE}.weiboHotSearch partition(year,month,day)
select highest_rank,title,url,day_date,substr(day_date,0,4) year,substr(day_date,6,2) month, substr(day_date,9,2) day
from ${DATABASE}.weiboHotSearch_temp;"

# remove the previous day's data file
rm -rf /opt/project/weibo/data_${YESTERDAY}
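After the nightly run, the dynamic-partition insert should have produced year/month/day directories under the table's warehouse location; a minimal sketch for checking this, assuming the default Hive warehouse layout (adjust the database directory to match your setup, for example the work0403.db directory referenced by DATA_LOG):

# list the partition directories written by the dynamic-partition insert
hdfs dfs -ls -R /user/hive/warehouse/weibo.db/weibohotsearch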

Step 8

Test run

Start HDFS and YARN
${HADOOP_HOME}/sbin/start-dfs.sh
${HADOOP_HOME}/sbin/start-yarn.sh
${HADOOP_HOME}/sbin/mr-jobhistory-daemon.sh start historyserver
Start ZooKeeper
${ZOOKEEPER_HOME}/bin/zkServer.sh start
Start the Hive metastore
${HIVE_HOME}/bin/hive --service metastore &
Start Flume
${FLUME_HOME}/bin/flume-ng agent --conf ${FLUME_HOME}/conf --name a2 --conf-file /opt/project/weibo/flume-kafka_weibo.conf
Start Kafka
${KAFKA_HOME}/bin/kafka-server-start.sh ${KAFKA_HOME}/config/server.properties
Submit the Spark application jar (see the dependency note after this list)
${SPARK_HOME}/bin/spark-submit \
--class com.stanley.sparktest.weibo.WeiBoHot \
/opt/project/weibo/weiboHot.jar
Run the Python crawler script
bash /opt/project/weibo/python_shell.sh
After the Python script has run, execute the operation script
bash /opt/project/weibo/operation.sh
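A note on the spark-submit step above: KafkaUtils.createDirectStream comes from the spark-streaming-kafka module, so if weiboHot.jar is not built as an assembly jar the dependency must be supplied at submit time. A minimal sketch using --packages (the version shown is an assumption; match it to your Spark 1.x build):

# the MySQL JDBC driver jar also needs to be on the classpath (e.g. via --jars) if it is not bundled
${SPARK_HOME}/bin/spark-submit \
--class com.stanley.sparktest.weibo.WeiBoHot \
--packages org.apache.spark:spark-streaming-kafka_2.10:1.6.3 \
/opt/project/weibo/weiboHot.jar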

The crawled content is pulled into the Kafka cluster.

Check the database: the data has been written into MySQL.

After the operation script runs:

the receiving table in MySQL is emptied, and its data is moved into the staging table that will be loaded into Hive.



The data has been loaded into Hive.



The data has been stored in the HDFS partitions.


The data file has been generated automatically.


Step 9

Set up the scheduled tasks

Oozie can be used for scheduling.

The crontab schedule is as follows:

# crawl the Weibo hot search data every ten minutes
2,12,22,32,42,52 * * * * bash /opt/project/weibo/python_shell.sh
# run the operation script at 23:55 every day to move the data into HDFS
55 23 * * * bash /opt/project/weibo/operation.sh
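A minimal sketch of one way to install the two entries non-interactively (editing with crontab -e works just as well):

# append the two schedules to the current user's crontab
( crontab -l 2>/dev/null
  echo '2,12,22,32,42,52 * * * * bash /opt/project/weibo/python_shell.sh'
  echo '55 23 * * * bash /opt/project/weibo/operation.sh' ) | crontab -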

Using the Hue web UI to drive Oozie:

Create two workflows, one for the Python script and one for the operation script.

Create a coordinator for each workflow, with the same frequency as the crontab entries above.



Create a bundle that ties the two coordinators together.

Finally, submit the bundle job.


