spark: Spark Streaming (Kafka 0.8 integration) collects data from Kafka and calls an HTTP interface with the values as parameters

pom:

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.tzb.bigdata</groupId>
    <artifactId>spark-test</artifactId>
    <!--<packaging>pom</packaging>-->
    <version>1.0</version>
    <!--<modules>-->
    <!--<module>hbase</module>-->
    <!--</modules>-->

    <properties>
        <scala.version>2.11.8</scala.version>
        <hadoop.version>2.6.0</hadoop.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.1.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>2.1.1</version>
        </dependency>
        <!--<dependency>-->
        <!--<groupId>org.apache.spark</groupId>-->
        <!--<artifactId>spark-sql_2.10</artifactId>-->
        <!--<version>1.6.0</version>-->
        <!--</dependency>-->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_2.11</artifactId>
            <version>2.1.1</version>
        </dependency>
        <dependency>
            <groupId>com.typesafe.play</groupId>
            <artifactId>play-mailer_2.11</artifactId>
            <version>7.0.0</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.41</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>2.1.1</version>
        </dependency>
        <!--=========================spark-streaming-kafka===========================-->
        <!--Kafka 0.8 integration-->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka-0-8_2.11</artifactId>
            <version>2.1.1</version>
        </dependency>

        <!--Kafka 0.10 integration (newer)-->
        <!--<dependency>-->
            <!--<groupId>org.apache.spark</groupId>-->
            <!--<artifactId>spark-streaming-kafka-0-10_2.11</artifactId>-->
            <!--<version>2.3.0</version>-->
            <!--<exclusions>-->
                <!--<exclusion>-->
                    <!--<artifactId>scala-library</artifactId>-->
                    <!--<groupId>org.scala-lang</groupId>-->
                <!--</exclusion>-->
            <!--</exclusions>-->
        <!--</dependency>-->
        <!--======================================================================-->
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-clients</artifactId>
            <version>0.11.0.2</version>
        </dependency>


        <!--<dependency>-->
        <!--<groupId>org.scala-lang</groupId>-->
        <!--<artifactId>scala-library</artifactId>-->
        <!--<version>2.10.6</version>-->
        <!--</dependency>-->
        <!--<dependency>-->
        <!--<groupId>org.apache.hadoop</groupId>-->
        <!--<artifactId>hadoop-common</artifactId>-->
        <!--</dependency>-->

        <!--Uncomment only when testing HBase; otherwise connecting from local IDEA to the test environment will throw errors-->
        <!--<dependency>-->
            <!--<groupId>org.apache.hbase</groupId>-->
            <!--<artifactId>hbase-client</artifactId>-->
            <!--<version>2.0.1</version>-->
            <!--<exclusions>-->
                <!--<exclusion>-->
                    <!--<groupId>com.fasterxml.jackson.core</groupId>-->
                    <!--<artifactId>jackson-databind</artifactId>-->
                <!--</exclusion>-->
            <!--</exclusions>-->
        <!--</dependency>-->

        <dependency>
            <groupId>net.sf.json-lib</groupId>
            <artifactId>json-lib</artifactId>
            <version>2.4</version>
            <classifier>jdk15</classifier>
        </dependency>
        <dependency>
            <groupId>org.neo4j.driver</groupId>
            <artifactId>neo4j-java-driver</artifactId>
            <version>4.0.0</version>
        </dependency>
        <dependency>
            <groupId>com.google.code.gson</groupId>
            <artifactId>gson</artifactId>
            <version>2.8.5</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
            <!-- scope removed so the default compile scope applies: effective for compile, test and runtime -->
            <!--<scope>test</scope>-->
        </dependency>
        <dependency>
            <groupId>net.minidev</groupId>
            <artifactId>json-smart</artifactId>
            <version>2.3</version>
        </dependency>
        <!-- Mail sending -->
        <!--<dependency>-->
        <!--<groupId>com.typesafe.play</groupId>-->
        <!--<artifactId>play-mailer_2.11</artifactId>-->
        <!--<version>7.0.0</version>-->
        <!--</dependency>-->
        <!--<dependency>-->
        <!--<groupId>org.apache.poi</groupId>-->
        <!--<artifactId>poi</artifactId>-->
        <!--<version>3.12</version>-->
        <!--</dependency>-->
        <dependency>
            <groupId>joda-time</groupId>
            <artifactId>joda-time</artifactId>
            <version>2.10.1</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-catalyst -->
        <!--<dependency>-->
        <!--<groupId>org.apache.spark</groupId>-->
        <!--<artifactId>spark-catalyst_2.11</artifactId>-->
        <!--<version>2.3.0</version>-->
        <!--<scope>test</scope>-->
        <!--</dependency>-->
        <!--Chinese word segmenter-->
        <dependency>
            <groupId>com.huaban</groupId>
            <artifactId>jieba-analysis</artifactId>
            <version>1.0.2</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.68</version>
        </dependency>
        <!--es-->
        <dependency>
            <groupId>org.elasticsearch</groupId>
            <artifactId>elasticsearch-spark-20_2.11</artifactId>
            <version>6.2.4</version>
        </dependency>
        <!--poi excel-->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi</artifactId>
            <version>3.12</version>
        </dependency>
    </dependencies>

    <build>
        <finalName>spark-test</finalName>
        <plugins>
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.2.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <!--<version>3.0.0</version>-->
                <configuration>
                    <archive>
                        <manifest>
                            <mainClass>WordCount</mainClass>
                        </manifest>
                    </archive>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>8</source>
                    <target>8</target>
                </configuration>
            </plugin>
        </plugins>
    </build>



</project>
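Note on the POM: the Scala job below imports org.apache.commons.httpclient (Commons HttpClient 3.x) for the GET call, but the POM does not declare it explicitly; it normally arrives transitively through the Spark/Hadoop client jars. If your build cannot resolve those imports, a dependency along these lines should work (the 3.1 version here is an assumption, match it to your cluster):

<dependency>
    <groupId>commons-httpclient</groupId>
    <artifactId>commons-httpclient</artifactId>
    <version>3.1</version>
</dependency>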
SparkStreaming04_KafkaSource:
package com.tzb.sparkstreaming
import java.net.URLEncoder
import net.minidev.json.JSONObject
import net.minidev.json.parser.JSONParser
import org.apache.commons.httpclient.HttpClient
import org.apache.commons.httpclient.methods.GetMethod
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
  * Spark Streaming with the Kafka 0.8 integration
  * Note: this program combines Spark Streaming with Kafka: it collects data from Kafka and passes the values to an HTTP interface
  * Spark Streaming collects data from Kafka; the word counts are stateless (counts of the same word from different batches are not merged)
  * 1) Declare a receiver (not needed here because one is already defined in SparkStreaming03_MyReceiver)
  * 2) Override the onStart and onStop methods
  *
  * Tested successfully both locally in IDEA and on the 210 test server:
  * Open Kafka Tool and push data to a topic
  * Run the main method to start consuming the data
  * The values are extracted and passed as parameters to the HTTP interface
  * Packaged and tested (successfully):
  * spark-submit --master yarn-client --conf spark.driver.memory=2g --class com.tzb.sparkstreaming.SparkStreaming04_KafkaSource --executor-memory 8G --num-executors 5 --executor-cores 2 /var/lib/hadoop-hdfs/spride_sqoop_beijing/bi_table/test/spark-test-jar-with-dependencies.jar >> /var/lib/hadoop-hdfs/spride_sqoop_beijing/bi_table/test/sparkstreaming_datachange.log
  * To run in production, change the IPs or domain names of Kafka, ZooKeeper and the other components in the code to the production ones, change spark-submit to spark-submit2 when submitting, and append & to the command so the job runs in the background and the current terminal can be closed.
  *
  * How to stop the job:
  * To stop this job, run ps -ef | grep SparkStreaming04_KafkaSource and kill the corresponding process ID.
  *
  */
object SparkStreaming04_KafkaSource {

  def main(args: Array[String]): Unit = {

    //Use Spark Streaming to implement WordCount

    //Spark configuration object
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("SparkStreaming01_WordCount")
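    // Related to the "How to stop the job" note above (a sketch, not part of the original program):
    // with this setting, a plain kill (SIGTERM) lets the in-flight batch finish before the streaming context stops.
//    sparkConf.set("spark.streaming.stopGracefullyOnShutdown", "true")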

    //Real-time data analysis context
    //Batch interval: collect real-time data at the specified time interval
    val streamingContext = new StreamingContext(sparkConf,Seconds(5))

    // Collect data from Kafka (note: the Kafka 0.8 integration dependency is required here)
    //Home machine
//    val kafkaDStream: ReceiverInputDStream[(String,String)] = KafkaUtils.createStream(
//      streamingContext,
//      "sparkproject1:2181",
//      "testgroup", //group
//      Map("testsparkstreaming" -> 3) //topics
//    )
    // 210 test server
    val kafkaDStream: ReceiverInputDStream[(String,String)] = KafkaUtils.createStream(
      streamingContext,
      "**.**.**.10:2181",
      "testgroup", //group
      Map("test" -> 3) //topics
    )
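    // Alternative (a sketch, not used in this program): the same 0.8 integration also provides a
    // receiver-less "direct" API that reads from the Kafka brokers instead of ZooKeeper.
    // The broker address below is a placeholder.
//    import kafka.serializer.StringDecoder
//    val directDStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
//      streamingContext,
//      Map("metadata.broker.list" -> "**.**.**.10:9092"),
//      Set("test")
//    )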

    //Home machine
    //bin/kafka-topics.sh --zookeeper sparkproject1:2181 --list    //list Kafka topics
    //Create the topic: bin/kafka-topics.sh --zookeeper sparkproject1:2181 --create --topic testsparkstreaming --partitions 3 --replication-factor 2   //3 partitions x 2 replicas = 6 partition replicas in total; partitions are defined first, then the replication factor
    //Produce test data: bin/kafka-console-producer.sh --broker-list sparkproject1:9092 --topic testsparkstreaming
    // 210 test server
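    //(Sketch mirroring the commands above for the 210 test server; the 9092 broker port is an assumption)
    //bin/kafka-topics.sh --zookeeper **.**.**.10:2181 --list
    //bin/kafka-console-producer.sh --broker-list **.**.**.10:9092 --topic test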

    //Flatten the collected data; note that a Kafka message is really a (key, value) pair
    val wordDStream : DStream[String] =  kafkaDStream.flatMap(t => t._2.split(" "))

    //Transform the data structure to make aggregation easier
    val mapDStream : DStream[(String,Int)] = wordDStream.map((_,1))

    //Aggregate the restructured data
    val wordToSumDStream : DStream[(String,Int)] = mapDStream.reduceByKey(_+_)
    //Print the results
    wordToSumDStream.print()
    wordToSumDStream.repartition(1) //Has no effect as written: repartition returns a new DStream and the return value is not used here

    //Save the DStream to files
//    wordToSumDStream.saveAsTextFiles("file:///D:\\workspace\\spark-test\\output\\sparkstreamingResult1") //Note: if the sparkstreamingResult1 folder was not created manually, the results are written into the output directory
//    wordToSumDStream.saveAsTextFiles("file:///D:/workspace/spark-test/output/sparkstreamingResult/sparkstreaming.txt")
//    wordToSumDStream.saveAsTextFiles("file:///output/sparkstreamingResult/sparkstreaming.txt") // points to the root of the D: drive

    wordToSumDStream.foreachRDD(
      rdd => {
        val arr : Array[(String, Int)] = rdd.collect()
        if(arr!=null && arr.length>0){
          println("key:"+ arr(0)._1+" value:" +arr(0)._2)
          //Call the HTTP interface
          val result = requestHTTP(arr(0)._1)
          println("=======> HTTP interface call result: " + result)

        }

      }
    )
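    // Note: collect() pulls the whole batch to the driver, and only the first element is used above.
    // A sketch (an assumption, not part of the original program) that calls the HTTP interface for every
    // record without collecting to the driver, using foreachPartition so each executor task issues its own requests:
//    wordToSumDStream.foreachRDD(
//      rdd => {
//        rdd.foreachPartition(iter =>
//          iter.foreach { case (word, count) =>
//            println("=======> HTTP result for " + word + ": " + requestHTTP(word))
//          }
//        )
//      }
//    )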


    //The streaming program must not be stopped here
    //streamingContext.stop

    //Start the receiver
    streamingContext.start()
    //The driver waits for the receiver to run
    streamingContext.awaitTermination()
  }

  /**
    * Request the HTTP interface
    * @param jobName
    * @return
    */
  def requestHTTP(jobName: String) = {
    var data =""
    var jobName1="bbb"
    // This is roughly like holding a browser; HTTP methods: (Get | Post | Put | Delete)
    val httpClient = new HttpClient()
    // Assemble the request parameters
    val params = Map[String, String](
      "jobName" -> URLEncoder.encode(jobName, "UTF-8"),
      "jobName1" -> URLEncoder.encode(jobName1, "UTF-8")
    ).map(kv => kv._1 + "=" + kv._2).mkString("&")

    val getMethod = new GetMethod("http://10.21.4.197:7772/src/main/test/sparkHTTP?" + params) //For an example implementation of this interface, see the "HTTP interface" controller at the end of this post
    getMethod.addRequestHeader("Content-Type", "application/json;charset=UTF-8")
    // Send the GET request
    val status = httpClient.executeMethod(getMethod)
    if (status == 200) {
      val responseBodyAsString = getMethod.getResponseBodyAsString
      val jsonParser = new JSONParser()
      val jsonObj: JSONObject = jsonParser.parse(responseBodyAsString).asInstanceOf[JSONObject]
      data = jsonObj.get("data").toString
      // Release the connection
      getMethod.releaseConnection()
    } else None
    data
  }

}

//Not needed here because it is already defined in SparkStreaming03_MyReceiver
//Declare the receiver
//1) Extend Receiver
//class MyReceiver(host:String,port:Int) extends Receiver[String](StorageLevel.MEMORY_ONLY){
//  //  val socket = _
//  var socket: java.net.Socket = null
//
//  def receive(): Unit = {
//    socket = new java.net.Socket(host,port)
//    val reader = new BufferedReader(new InputStreamReader(socket.getInputStream,"UTF-8"))
//    var line : String =null
//    while((line = reader.readLine()) != null){
//      //Store the collected data inside the receiver for conversion
//      if("END".equals(line)){  //designated end-of-stream marker
//        return
//      }else{ //the data was sent normally
//        this.store(line)
//      }
//    }
//  }
//
//  override  def onStart(): Unit ={
//    //Start a new thread
//    new Thread(
//      new Runnable {
//        override def run(): Unit = {
//          receive()
//        }
//      }
//
//    ).start()
//  }
//
//  override def onStop(): Unit = {
//    if(socket != null){
//      socket.close()
//      socket = null
//    }
//  }
//}

HTTP interface:

package com.huayong.bi.web.controller;

import com.alibaba.fastjson.JSONObject;
import org.springframework.web.bind.annotation.*;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

/**
 *  Test
 */
@RestController
@RequestMapping("/src/main/test")
public class TestController {
    
    /**
 * Spark calls this HTTP endpoint
     * @param request
     * @param response
     */
    @CrossOrigin
    @RequestMapping(value = "/sparkHTTP", method={RequestMethod.GET})
    public String sparkHTTP(HttpServletRequest request, HttpServletResponse response) {
        JSONObject jo = null;
        try {
            String jobName = request.getParameter("jobName");
            String jobName1  = request.getParameter("jobName1");
            System.out.println(jobName + "===" + jobName1);

            jo = new JSONObject();
            jo.put("code", 200);
            jo.put("msg", "");
            jo.put("data", "成功");
        } catch (Exception e) {
            e.printStackTrace();
        }
        return jo.toString();
    }

}

 

 

 

 
