【Spark】RDD

I use PySpark and write and submit Spark jobs from Jupyter. The sections below go straight to the code.
Before starting, an HDFS + Spark + Jupyter environment must already be set up and running.

Start the SparkSession

from pyspark.sql import SparkSession

# Connect to the standalone cluster master and create (or reuse) a SparkSession
spark = SparkSession.builder.master("spark://192.168.48.100:7077") \
        .appName("rdd_demos").getOrCreate()
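If a standalone cluster is not available, the same examples can also be run with a local master instead of the block above (the 192.168.48.100 master URL is specific to the author's cluster); a minimal alternative sketch:

# Alternative: local mode, using all cores of the current machine
spark = SparkSession.builder.master("local[*]") \
        .appName("rdd_demos").getOrCreate()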

Creating RDDs

# Create an RDD by parallelizing an in-memory collection
arr1 = [1,2,3,4,5,6,7,8,9,10]
rdd1 = spark.sparkContext.parallelize(arr1)
# Create an RDD by loading an external file (a path on HDFS)
file = "/spark_demo/wordcount/input/study.txt"
rdd3 = spark.sparkContext.textFile(file)

Transformations on RDDs

# Check how many partitions the current RDD has
rdd1.getNumPartitions()
# Think of the RDD as a stream of elements flowing through the operators below
data = spark.sparkContext.parallelize([1, 1, 2, 3, 3])   # assumed sample input RDD for the examples below
# map(func): one-to-one transformation with a custom rule
data_rdd1 = data.map(lambda x: x + 1)
# flatMap(func): apply the rule first, then flatten the results; one-to-many
data_rdd2 = data.flatMap(lambda x: range(x, 4))
# filter(func): keep only the elements that satisfy the predicate
data_rdd3 = data.filter(lambda x: x != 1)
# distinct(): remove duplicate elements
data_rdd4 = data.distinct()
# sample(): random sampling (here without replacement, fraction 0.5)
data_rdd5 = data.sample(False, 0.5)
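For reference, with the assumed sample input [1, 1, 2, 3, 3] above, collecting these gives roughly the following (sample() is random, so its output varies from run to run):

data_rdd1.collect()   # [2, 2, 3, 4, 4]
data_rdd2.collect()   # [1, 2, 3, 1, 2, 3, 2, 3, 3]
data_rdd3.collect()   # [2, 3, 3]
data_rdd4.collect()   # [1, 2, 3] (order may differ)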

Set operations on RDDs

# Assume two RDDs with elements {1, 2, 3, 3} and {3, 4, 5}
# First build the two RDDs
data1 = spark.sparkContext.parallelize([1,2,3,3])
data2 = spark.sparkContext.parallelize([3,4,5])
# union: concatenation (duplicates are kept)
data1.union(data2).collect()
# intersection: common elements
data1.intersection(data2).collect()
# subtract: set difference
data1.subtract(data2).collect()
# cartesian: Cartesian product
data1.cartesian(data2).collect()
# groupBy: group elements by a user-defined key function
a = spark.sparkContext.parallelize(["black", "blue", "white", "green", "grey"])
b = a.groupBy(lambda x: len(x)).collect()
print(b)
sorted([(x,sorted(y)) for (x,y) in b])
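With the two small RDDs above, the results are easy to verify; a quick sanity check (sorting first, since element order after a shuffle is not guaranteed):

assert sorted(data1.union(data2).collect()) == [1, 2, 3, 3, 3, 4, 5]          # duplicates are kept
assert data1.intersection(data2).collect() == [3]
assert sorted(data1.subtract(data2).collect()) == [1, 2]
assert len(data1.cartesian(data2).collect()) == 4 * 3                         # 12 pairs
assert sorted([(x, sorted(y)) for (x, y) in b]) == [(4, ['blue', 'grey']), (5, ['black', 'green', 'white'])]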

Actions on RDDs

rdd = rdd1                            # the examples below assume rdd holds the numbers 1..10
rdd.count()                           # number of elements
rdd.collect()                         # bring all elements back to the driver
rdd.first()                           # first element
rdd.countByValue()                    # frequency of each distinct value
rdd.take(2)                           # first 2 elements
rdd.takeOrdered(2)                    # 2 smallest elements
rdd.takeOrdered(2, key=lambda x: -x)  # 2 largest elements
rdd.takeSample(False, 2)              # random sample of 2 elements, without replacement
rdd.reduce(lambda x,y: x+y)           # fold all elements into one (here: the sum)
rdd.getNumPartitions()                # number of partitions
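Worth keeping in mind: transformations are lazy and only record the lineage, while actions are what actually trigger computation; a quick way to see the difference (lazy is just an illustrative name):

# A transformation returns immediately without touching the data
lazy = rdd.map(lambda x: x * 100)    # nothing is computed yet
# Only the action forces the whole chain to be evaluated
lazy.take(3)                         # [100, 200, 300] for rdd holding 1..10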

Complex operator: aggregate

seqOp = (lambda x,y: x * y)    # function applied within each partition
combOp = (lambda x,y: x + y)   # function used to merge the per-partition results
# The first argument of aggregate (2 here) is the zero value; it is used as the initial
# accumulator inside each partition and again when combining the partition results
result = rdd.aggregate(2, seqOp, combOp)
result
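Note that the result of aggregate depends on how many partitions the RDD has, because the zero value is applied once per partition and once more in the final combine. A deterministic sketch with an explicit 2-partition copy (demo is only for illustration):

demo = spark.sparkContext.parallelize(range(1, 11), 2)
demo.aggregate(2, seqOp, combOp)
# partition [1..5]:  2*1*2*3*4*5  = 240
# partition [6..10]: 2*6*7*8*9*10 = 60480
# final combine:     2 + 240 + 60480 = 60722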

Creating Pair RDDs

# There are several ways to create a Pair RDD.

# Option 1: load a text file, then transform it into a Pair RDD of (word, 1) tuples.
file = "/spark_demo/wordcount/input/study.txt"
lines = spark.sparkContext.textFile(file)
pairRDD = lines.flatMap(lambda line: line.split(" ")).map(lambda word: (word,1))
pairRDD.collect()
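To finish the classic word count on this file-based pairRDD, a single reduceByKey is enough (wordCounts below is just an illustrative name); a minimal sketch:

# Sum the 1s per word to get (word, count) pairs
wordCounts = pairRDD.reduceByKey(lambda x, y: x + y)
wordCounts.take(5)   # first few word counts from study.txt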

# Option 2: create a Pair RDD from a parallelized collection
rdd = spark.sparkContext.parallelize(["Hadoop","Spark","Hive","Spark"])
pairRDD = rdd.map(lambda word: (word,1))
pairRDD.collect()
# keyBy(): derive each element's key with a custom function
a = spark.sparkContext.parallelize(["black", "blue", "white", "green", "grey"])

# keyBy applies the given function to each element to generate its key and
# returns a pair RDD of (key, element) tuples
b = a.keyBy(lambda x: len(x))
b.collect()

# Option 3: build a pair RDD directly from a list of tuples
pets = spark.sparkContext.parallelize([("cat",1),("dog",1),("cat",2)])
pets.collect()

Pair RDD transformations

# reduceByKey(func): merge the values of each key using func
pairRDD.reduceByKey(lambda x,y: x + y).collect()
# groupByKey(): group the values by key
pairRDD.groupByKey().collect()
# keys(): return all keys
pairRDD.keys().collect()
# values(): return all values
pairRDD.values().collect()

# sortByKey(): sort by key, ascending by default
pairRDD.sortByKey(ascending=False).collect()

# mapValues(func): apply func to every value, leaving the keys unchanged
pairRDD.mapValues(lambda x: x*x).collect()
# flatMapValues(func): apply func to every value and flatten the result, repeating the key
pairRDD.flatMapValues(lambda x: range(x,6)).collect()
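For reference, with the pairRDD built above from the four words Hadoop, Spark, Hive, Spark, the main results look like this (key order after a shuffle may differ):

pairRDD.reduceByKey(lambda x, y: x + y).collect()   # e.g. [('Hadoop', 1), ('Spark', 2), ('Hive', 1)]
pairRDD.keys().collect()                            # ['Hadoop', 'Spark', 'Hive', 'Spark']
pairRDD.sortByKey(ascending=False).collect()        # [('Spark', 1), ('Spark', 1), ('Hive', 1), ('Hadoop', 1)]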

Complex operator: combineByKey()

# combineByKey(createCombiner, mergeValue, mergeCombiners):
data = spark.sparkContext.parallelize([("company-1",92),("company-1",85),("company-1",82),
                   ("company-1",93),("company-1",86),("company-1",83),
                   ("company-2",78),("company-2",96),("company-2",85),
                   ("company-3",88),("company-3",94),("company-3",80)],3)
cbk = data.combineByKey(
            lambda income: (income,1),                    # createCombiner: first value of a key -> (sum, count)
            lambda t,income: (t[0]+income, t[1]+1),       # mergeValue: fold another value of the same key into the accumulator
            lambda t1,t2: (t1[0]+t2[0], t1[1]+t2[1])      # mergeCombiners: merge accumulators of the same key from different partitions
        )
cbk.collect()     # (total income, record count) per company

# Total and average income per company
cbk.map(lambda t: (t[0],t[1][0],t[1][0]/float(t[1][1]))).collect()
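The same (sum, count) accumulators can be cross-checked with a plain mapValues + reduceByKey pipeline (sums below is just an illustrative name); a minimal sketch:

# Cross-check: build (sum, count) per company, then divide
sums = data.mapValues(lambda income: (income, 1)) \
           .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
sums.mapValues(lambda t: t[0] / float(t[1])).collect()   # average income per company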
# groupByKey
x = spark.sparkContext.parallelize([
    ("USA", 1), ("USA", 2), ("India", 1),
    ("UK", 1), ("India", 4), ("India", 9),
    ("USA", 8), ("USA", 3), ("India", 4),
    ("UK", 6), ("UK", 9), ("UK", 5)], 4)
 
# groupByKey with the default partitioning
y = x.groupByKey()
 
# Check the number of partitions
print('number of partitions: ', y.getNumPartitions())
 
# groupByKey with an explicit number of partitions
y = x.groupByKey(2)
print('number of partitions: ', y.getNumPartitions())
 
# Print the grouped result
for t in y.collect():
    print(t[0], [v for v in t[1]])
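If the goal is only a per-key aggregate such as a sum, reduceByKey is usually preferable to groupByKey, because it combines values inside each partition before the shuffle; an equivalent per-country sum:

# Per-country totals without shuffling every individual value
x.reduceByKey(lambda a, b: a + b).collect()
# e.g. [('USA', 14), ('India', 18), ('UK', 21)] (key order may differ)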

Complex operator: aggregateByKey

# Build the RDD student_rdd of (name, subject, score) records
student_rdd = spark.sparkContext.parallelize([
          ("Joseph", "Maths", 83), ("Joseph", "Physics", 74), ("Joseph", "Chemistry", 91), 
          ("Joseph", "Biology", 82),   ("Jimmy", "Maths", 69), ("Jimmy", "Physics", 62), 
          ("Jimmy", "Chemistry", 97), ("Jimmy", "Biology", 80), ("Tina", "Maths", 78), 
          ("Tina", "Physics", 73), ("Tina", "Chemistry", 68), ("Tina", "Biology", 87),
          ("Thomas", "Maths", 87), ("Thomas", "Physics", 93), ("Thomas", "Chemistry", 91), 
          ("Thomas", "Biology", 74), ("Cory", "Maths", 56), ("Cory", "Physics", 65), 
          ("Cory", "Chemistry", 71), ("Cory", "Biology", 68), ("Jackeline", "Maths", 86), 
          ("Jackeline", "Physics", 62), ("Jackeline", "Chemistry", 75), ("Jackeline", "Biology", 83),
          ("Juan", "Maths", 63), ("Juan", "Physics", 69), ("Juan", "Chemistry", 64), 
          ("Juan", "Biology", 60)], 2)
    
# Define the sequence operation and the combiner operation
# Sequence operation: find the maximum score within a single partition
def seq_op(accumulator, element):
    if(accumulator > element[1]):
        return accumulator 
    else: 
        return element[1]
 
 
# Combiner operation: find the maximum among the per-partition accumulators
def comb_op(accumulator1, accumulator2):
    if(accumulator1 > accumulator2):
        return accumulator1 
    else:
        return accumulator2
 

# The zero value is 0 here, since we are looking for the maximum score
zero_val = 0
aggr_rdd = student_rdd.map(lambda t: (t[0], (t[1], t[2]))).aggregateByKey(zero_val, seq_op, comb_op) 
 
# Print the result
for tpl in aggr_rdd.collect():
    print(tpl)
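aggregateByKey also shines when the accumulator type differs from the value type; for example, a per-student average can be computed by carrying a (sum, count) pair (avg_rdd below is just an illustrative name); a minimal sketch on the same student_rdd:

# Average score per student: the accumulator is a (sum, count) tuple
avg_rdd = student_rdd.map(lambda t: (t[0], t[2])) \
    .aggregateByKey((0, 0),
                    lambda acc, score: (acc[0] + score, acc[1] + 1),   # within a partition
                    lambda a, b: (a[0] + b[0], a[1] + b[1]))           # across partitions
avg_rdd.mapValues(lambda t: t[0] / float(t[1])).collect()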

Stop the SparkSession and the Spark job

spark.stop()

Simple statistics on RDDs

rdd1.sum()

rdd1.max()

rdd1.min()

# Mean
rdd1.mean()

rdd1.count()

# Variance
rdd1.variance()

# Sample variance
rdd1.sampleVariance()

# Standard deviation
rdd1.stdev()

# Sample standard deviation
rdd1.sampleStdev()

# Histogram
# Method 1: explicit bucket boundaries
# rdd1.histogram([1.0, 10.0, 20.9])
rdd1.histogram([1.0, 8.0, 20.9])

# Method 2: let Spark compute the bucket boundaries
rdd1.histogram(3)
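With rdd1 holding 1 through 10, the explicit buckets [1.0, 8.0) and [8.0, 20.9] contain 7 and 3 values respectively, while histogram(3) computes the bucket boundaries itself; a quick check:

print(rdd1.histogram([1.0, 8.0, 20.9]))   # 7 values fall in [1.0, 8.0), 3 in [8.0, 20.9]
print(rdd1.histogram(3))                  # three equal-width buckets over [1, 10], counts [3, 3, 4]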
# stats() returns a StatsCounter object holding all of the basic statistics
status = rdd1.stats()

print(status.count())
print(status.mean())
print(status.stdev())
print(status.max())
print(status.min())
print(status.sum())
print(status.variance())
print(status)
spark.stop()

Persistence (caching) and shared variables
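Briefly: an RDD can be kept in memory across actions with persist()/cache(), read-only lookup data can be shipped to the executors once via broadcast variables, and accumulators collect values in the other direction. A minimal sketch, assuming the SparkSession is still active (cached, lookup and acc are illustrative names):

from pyspark import StorageLevel

cached = rdd1.persist(StorageLevel.MEMORY_ONLY)   # or simply rdd1.cache()
cached.count()    # the first action computes and caches the partitions
cached.sum()      # later actions reuse the cached data

lookup = spark.sparkContext.broadcast({1: "one", 2: "two"})   # broadcast variable (read-only on executors)
rdd1.map(lambda x: lookup.value.get(x, "other")).take(3)

acc = spark.sparkContext.accumulator(0)           # accumulator (executors can only add to it)
rdd1.foreach(lambda x: acc.add(x))
print(acc.value)                                  # 55 for rdd1 holding 1..10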
