Spark RDDs vs DataFrames vs SparkSQL

簡介

Spark的 RDD、DataFrame 和 SparkSQL的性能比較。

2方面的比較

單條記錄的隨機查找
分組聚合（aggregation）並排序（sorting）後輸出

使用以下Spark的三種方式來解決上面的2個問題，對比性能。

Using RDDs
Using DataFrames
Using SparkSQL

數據源

存儲在 HDFS 上 3 個文件中的 900 萬條不重複記錄
每條記錄11個字段
總大小 1.4 GB

實驗環境

HDP 2.4
Hadoop version 2.7
Spark 1.6
HDP Sandbox

測試結果

原始的RDD 比 DataFrames 和 SparkSQL性能要好
DataFrames 和 SparkSQL 性能差不多
使用DataFrames 和 SparkSQL 比 RDD 操作更直觀
Jobs都是獨立運行，沒有其他job的干擾

2個操作

Random lookup of 1 order ID among 9 million unique order IDs
GROUP all the different products with their total COUNTS and SORT DESCENDING by count (the code below orders by the count, not by the product name)

代碼

RDD Random Lookup

#!/usr/bin/env python
"""Time a single-record lookup by order_id using the plain RDD API."""
from time import time

from pyspark import SparkConf, SparkContext

conf = (SparkConf()
        .setAppName("rdd_random_lookup")
        .set("spark.executor.instances", "10")
        .set("spark.executor.cores", 2)
        .set("spark.dynamicAllocation.enabled", "false")
        .set("spark.shuffle.service.enabled", "false")
        .set("spark.executor.memory", "500MB"))
sc = SparkContext(conf=conf)

# The clock starts after the SparkContext has been created, so context
# start-up is not part of the measured time.
t0 = time()

path = "/data/customer_orders*"
lines = sc.textFile(path)

# Filter where the order_id, the second pipe-delimited field, is equal to
# 96922894, and bring the matching rows back to the driver.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(lines.map(lambda line: line.split('|'))
      .filter(lambda line: int(line[1]) == 96922894)
      .collect())

tt = str(time() - t0)
print("RDD lookup performed in " + tt + " seconds")

DataFrame Random Lookup

#!/usr/bin/env python
"""Time a single-record lookup by order_id using the DataFrame API."""
from time import time

from pyspark import SparkConf, SparkContext
from pyspark.sql import Row, SQLContext

conf = (SparkConf()
        .setAppName("data_frame_random_lookup")
        .set("spark.executor.instances", "10")
        .set("spark.executor.cores", 2)
        .set("spark.dynamicAllocation.enabled", "false")
        .set("spark.shuffle.service.enabled", "false")
        .set("spark.executor.memory", "500MB"))
sc = SparkContext(conf=conf)

sqlContext = SQLContext(sc)

# The clock starts after both contexts exist; DataFrame construction below
# is included in the measured time.
t0 = time()

path = "/data/customer_orders*"
lines = sc.textFile(path)

# Create the data frame: split each pipe-delimited line into its 11 fields
# and wrap them in a Row with named columns.
orders_df = sqlContext.createDataFrame(
    lines.map(lambda l: l.split("|"))
         .map(lambda p: Row(cust_id=int(p[0]),
                            order_id=int(p[1]),
                            email_hash=p[2],
                            ssn_hash=p[3],
                            product_id=int(p[4]),
                            product_desc=p[5],
                            country=p[6],
                            state=p[7],
                            shipping_carrier=p[8],
                            shipping_type=p[9],
                            shipping_class=p[10])))

# Filter where the order_id, the second field, is equal to 96922894.
orders_df.where(orders_df['order_id'] == 96922894).show()

tt = str(time() - t0)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("DataFrame performed in " + tt + " seconds")

SparkSQL Random Lookup

#!/usr/bin/env python
"""Time a single-record lookup by order_id using a SparkSQL query."""
from time import time

from pyspark import SparkConf, SparkContext
from pyspark.sql import Row, SQLContext

conf = (SparkConf()
        .setAppName("spark_sql_random_lookup")
        .set("spark.executor.instances", "10")
        .set("spark.executor.cores", 2)
        .set("spark.dynamicAllocation.enabled", "false")
        .set("spark.shuffle.service.enabled", "false")
        .set("spark.executor.memory", "500MB"))
sc = SparkContext(conf=conf)

sqlContext = SQLContext(sc)

# The clock starts after both contexts exist; DataFrame construction and
# table registration below are included in the measured time.
t0 = time()

path = "/data/customer_orders*"
lines = sc.textFile(path)

# Create the data frame: split each pipe-delimited line into its 11 fields
# and wrap them in a Row with named columns.
orders_df = sqlContext.createDataFrame(
    lines.map(lambda l: l.split("|"))
         .map(lambda p: Row(cust_id=int(p[0]),
                            order_id=int(p[1]),
                            email_hash=p[2],
                            ssn_hash=p[3],
                            product_id=int(p[4]),
                            product_desc=p[5],
                            country=p[6],
                            state=p[7],
                            shipping_carrier=p[8],
                            shipping_type=p[9],
                            shipping_class=p[10])))

# Register the data frame as a temporary table so it can be queried by name.
orders_df.registerTempTable("orders")

# Filter where the order_id, the second field, is equal to 96922894.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(sqlContext.sql("SELECT * FROM orders where order_id = 96922894").collect())

tt = str(time() - t0)
print("SparkSQL performed in " + tt + " seconds")

RDD with GroupBy, Count, and Sort Descending

#!/usr/bin/env python
"""Time a group-by/count/sort-descending over products using the RDD API."""
from time import time

from pyspark import SparkConf, SparkContext

conf = (SparkConf()
        .setAppName("rdd_aggregation_and_sort")
        .set("spark.executor.instances", "10")
        .set("spark.executor.cores", 2)
        .set("spark.dynamicAllocation.enabled", "false")
        .set("spark.shuffle.service.enabled", "false")
        .set("spark.executor.memory", "500MB"))
sc = SparkContext(conf=conf)

# The clock starts after the SparkContext has been created, so context
# start-up is not part of the measured time.
t0 = time()

path = "/data/customer_orders*"
lines = sc.textFile(path)

# Field 5 of each pipe-delimited line is used as the grouping key.
# Count occurrences per key, then swap to (count, key) so that sortByKey
# orders the result by count, highest first.
counts = (lines.map(lambda line: line.split('|'))
          .map(lambda x: (x[5], 1))
          .reduceByKey(lambda a, b: a + b)
          .map(lambda x: (x[1], x[0]))
          .sortByKey(ascending=False))

# Each collected element is (count, key); print it as "key<TAB>count".
# Single-argument print(...) behaves the same on Python 2 and Python 3.
for x in counts.collect():
    print(x[1] + '\t' + str(x[0]))

tt = str(time() - t0)
print("RDD GroupBy performed in " + tt + " seconds")

DataFrame with GroupBy, Count, and Sort Descending

#!/usr/bin/env python
"""Time a group-by/count/sort-descending over products using DataFrames."""
from time import time

from pyspark import SparkConf, SparkContext
from pyspark.sql import Row, SQLContext

conf = (SparkConf()
        .setAppName("data_frame_aggregation_and_sort")
        .set("spark.executor.instances", "10")
        .set("spark.executor.cores", 2)
        .set("spark.dynamicAllocation.enabled", "false")
        .set("spark.shuffle.service.enabled", "false")
        .set("spark.executor.memory", "500MB"))
sc = SparkContext(conf=conf)

sqlContext = SQLContext(sc)

# The clock starts after both contexts exist; DataFrame construction below
# is included in the measured time.
t0 = time()

path = "/data/customer_orders*"
lines = sc.textFile(path)

# Create the data frame: split each pipe-delimited line into its 11 fields
# and wrap them in a Row with named columns.
orders_df = sqlContext.createDataFrame(
    lines.map(lambda l: l.split("|"))
         .map(lambda p: Row(cust_id=int(p[0]),
                            order_id=int(p[1]),
                            email_hash=p[2],
                            ssn_hash=p[3],
                            product_id=int(p[4]),
                            product_desc=p[5],
                            country=p[6],
                            state=p[7],
                            shipping_carrier=p[8],
                            shipping_type=p[9],
                            shipping_class=p[10])))

# Count rows per product_desc and order by that count, highest first.
results = (orders_df.groupBy(orders_df['product_desc'])
           .count()
           .sort("count", ascending=False))

# Single-argument print(...) behaves the same on Python 2 and Python 3.
for x in results.collect():
    print(x)

tt = str(time() - t0)
print("DataFrame performed in " + tt + " seconds")

SparkSQL with GroupBy, Count, and Sort Descending

#!/usr/bin/env python
"""Time a group-by/count/sort-descending over products using SparkSQL."""
from time import time

from pyspark import SparkConf, SparkContext
from pyspark.sql import Row, SQLContext

conf = (SparkConf()
        .setAppName("spark_sql_aggregation_and_sort")
        .set("spark.executor.instances", "10")
        .set("spark.executor.cores", 2)
        .set("spark.dynamicAllocation.enabled", "false")
        .set("spark.shuffle.service.enabled", "false")
        .set("spark.executor.memory", "500MB"))
sc = SparkContext(conf=conf)

sqlContext = SQLContext(sc)

# The clock starts after both contexts exist; DataFrame construction and
# table registration below are included in the measured time.
t0 = time()

path = "/data/customer_orders*"
lines = sc.textFile(path)

# Create the data frame. Only field 5 of each pipe-delimited line is kept,
# exposed as the single column "product".
orders_df = sqlContext.createDataFrame(
    lines.map(lambda l: l.split("|"))
         .map(lambda r: Row(product=r[5])))

# Register the data frame as a temporary table so it can be queried by name.
orders_df.registerTempTable("orders")

results = sqlContext.sql("SELECT product, count(*) AS total_count FROM orders GROUP BY product ORDER BY total_count DESC")

# Single-argument print(...) behaves the same on Python 2 and Python 3.
for x in results.collect():
    print(x)

tt = str(time() - t0)
print("SparkSQL performed in " + tt + " seconds")

Spark RDDs vs DataFrames vs SparkSQL

如何在低代碼平臺中引用 JavaScript ？

探究職業發展的關鍵：能力模型解讀

高效率使用windows

如何使用 JavaScript 獲取當前頁面幀率 FPS

工程款拖欠，農民工怎麼了？就得一直忍着委屈求全嗎？

HarmonyOS 實現下拉刷新，上拉加載更多

語音信號處理中的“窗函數”

智能決策新時代：可視化大屏是否能夠超越傳統白板？

解密Prompt系列28. LLM Agent之金融領域摸索：FinMem & FinAgent

分享幾個.NET開源的AI和LLM相關項目框架

Spark RDDs vs DataFrames vs SparkSQL

javascript深拷貝和淺拷貝

MySQL緩存之Qcache與buffer pool對比

PostgreSQL 使用 PreparedStatement 導致查詢慢的分析

Wireshark網絡端點和會話

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結