First, create a Scala Maven project with ScalaIDE or IDEA. The project needs the spark-core, spark-sql, and spark-streaming jars; the relevant part of the pom file is as follows:
<properties>
  <spark.version>2.1.0</spark.version>
  <scala.version>2.11</scala.version>
</properties>
<dependencies>
  <dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_${scala.version}</artifactId>
    <version>${spark.version}</version>
  </dependency>
  <dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_${scala.version}</artifactId>
    <version>${spark.version}</version>
  </dependency>
  <dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_${scala.version}</artifactId>
    <version>${spark.version}</version>
  </dependency>
</dependencies>
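Once the project builds, a quick way to confirm that the expected versions landed on the classpath is to print them at runtime. This small check is an addition for illustration, not part of the original examples; the object name VersionCheck is hypothetical:

package core

// Hypothetical helper: prints the Spark and Scala versions pulled in
// by the pom above
object VersionCheck {
  def main(args: Array[String]): Unit = {
    println(s"Spark version: ${org.apache.spark.SPARK_VERSION}")
    println(s"Scala version: ${scala.util.Properties.versionString}")
  }
}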
I. Creating a SparkContext Object
package core

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

object Demo {
  def main(args: Array[String]): Unit = {
    // Spark logs through log4j; raise the log levels with these two lines
    // so the run is not flooded with INFO output
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)
    // Create a SparkConf first, then build the SparkContext from it
    val conf = new SparkConf().setAppName("demo").setMaster("local")
    val sc = new SparkContext(conf)
    // Word count: saveAsTextFile returns Unit, so there is nothing to assign
    sc.textFile("hdfs://qujianlei:9000/data/data.txt")
      .flatMap(_.split(" "))
      .map(x => (x, 1))
      .reduceByKey(_ + _)
      .saveAsTextFile("hdfs://qujianlei:9000/output/spark/0214")
    sc.stop()
  }
}
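If no HDFS cluster is at hand, the same word count runs just as well against a local file, with the result printed to the console instead of written back to HDFS. A minimal sketch of that variation; the object name and the input path d:/data.txt are placeholders:

package core

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

object LocalWordCount {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    val conf = new SparkConf().setAppName("localWordCount").setMaster("local")
    val sc = new SparkContext(conf)
    // collect() pulls the result to the driver, fine for a small demo file
    sc.textFile("d:/data.txt")
      .flatMap(_.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _)
      .collect()
      .foreach(println)
    sc.stop()
  }
}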
II. Creating a SQLContext Object
1. Creating the SQLContext directly with new SQLContext
package sql

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

case class People(id: Int, name: String, age: Int)

object Demo {
  def main(args: Array[String]): Unit = {
    // Spark logs through log4j; raise the log levels with these two lines
    // so the run is not flooded with INFO output
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)
    // Create the SQLContext directly with new SQLContext
    val conf = new SparkConf().setAppName("demo").setMaster("local")
    val sc = new SparkContext(conf)
    val sqlc = new SQLContext(sc)
    // Import the SQLContext implicit conversions, including toDF
    import sqlc.implicits._
    val peopleRDD = sc.textFile("d:/students.txt")
      .map(_.split(" "))
      .map(x => People(x(0).toInt, x(1), x(2).toInt))
    // Convert the RDD to a DataFrame
    val peopleDF = peopleRDD.toDF
    // Register the DataFrame as a temporary view
    peopleDF.createOrReplaceTempView("people")
    // Run the query through the SQLContext
    sqlc.sql("select * from people").show()
    sc.stop()
  }
}
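Besides SQL strings, the same query can be written with the DataFrame API. A short sketch that assumes the peopleDF and the import sqlc.implicits._ from the example above are in scope (the age threshold is an arbitrary illustration); these lines would go before sc.stop():

// DataFrame DSL equivalent of the SQL query above; the $"..." column
// syntax comes from the implicits import already in scope
peopleDF.select($"id", $"name", $"age").show()
// A filtered variant; the threshold 20 is just for illustration
peopleDF.filter($"age" > 20).show()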
2. Accessing the SQLContext and SparkContext indirectly through the SparkSession introduced in Spark 2.0
package sql

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession

// Note: if this file lives in the same package as the previous example,
// reuse the People case class defined there and keep the object names
// distinct to avoid duplicate definitions
case class People(id: Int, name: String, age: Int)

object SparkSessionDemo {
  def main(args: Array[String]): Unit = {
    // Spark logs through log4j; raise the log levels with these two lines
    // so the run is not flooded with INFO output
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)
    // Access the SQLContext indirectly through a SparkSession
    val spark = SparkSession.builder().appName("demo").master("local").getOrCreate()
    // Import the implicit conversions, including toDF
    import spark.sqlContext.implicits._
    // This import works as well:
    // import spark.implicits._
    val peopleRDD = spark.sparkContext.textFile("d:/students.txt")
      .map(_.split(" "))
      .map(x => People(x(0).toInt, x(1), x(2).toInt))
    // Convert the RDD to a DataFrame
    val peopleDF = peopleRDD.toDF
    // Register the DataFrame as a temporary view
    peopleDF.createOrReplaceTempView("people")
    // Run the query through the SQLContext
    spark.sqlContext.sql("select * from people").show()
    // This works as well:
    // spark.sql("select * from people").show()
    spark.stop()
  }
}
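With the implicits in scope, the text file can also be loaded as a strongly typed Dataset[People] instead of an untyped DataFrame. A brief sketch reusing the names from the example above; it would likewise run before spark.stop():

// toDS comes from the same implicits import as toDF
val peopleDS = spark.sparkContext.textFile("d:/students.txt")
  .map(_.split(" "))
  .map(x => People(x(0).toInt, x(1), x(2).toInt))
  .toDS()
// Typed filter: field access is checked at compile time
peopleDS.filter(_.age > 20).show()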
III. Creating a StreamingContext Object
package streaming

import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}

object SocketStream {
  def main(args: Array[String]): Unit = {
    // Point Spark at a local Hadoop installation when running on Windows
    System.setProperty("hadoop.home.dir", "F:\\第七期\\hadoop-2.7.3\\hadoop-2.7.3")
    // Raise the log levels so the run is not flooded with INFO output
    Logger.getLogger("org.apache.spark").setLevel(Level.OFF)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)
    // In local[x], x must be at least 2: one thread receives the data and
    // the other processes it. Likewise, when the job is submitted to a
    // Spark cluster, make sure at least 2 CPU cores are available.
    val conf = new SparkConf().setAppName("demo").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(3))
    val socketStream = ssc.socketTextStream("192.168.0.1", 1234, StorageLevel.MEMORY_AND_DISK_SER)
    socketStream.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
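To feed the example, a socket server must be listening before the program starts, for instance netcat on the target host: nc -lk 1234. And because a DStream supports the same transformations as an RDD, the word count from section I carries over directly; the following sketch would replace socketStream.print() above, before ssc.start():

// Word count over the socket stream, using the same operators as the
// RDD version; results are printed once per 3-second batch
val counts = socketStream
  .flatMap(_.split(" "))
  .map((_, 1))
  .reduceByKey(_ + _)
counts.print()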