一.使用Parquet存儲數據
數據在改用列存儲之前採用的是普通的行存儲,下面是行存儲時的文件大小(HDFS上的數據)。
使用parquet列存儲,可以減小文件的大小。下面具體給出使用parquet存儲數據的建表代碼以及加載數據的語句。
數據庫代碼:
-- Parquet-backed copy of the track log, partitioned by day (date_str) and
-- hour (hour_str). Every payload column is declared STRING.
-- No ROW FORMAT DELIMITED clause: a field delimiter only describes text rows;
-- Parquet is a binary columnar format, and some engines reject the combination.
CREATE TABLE IF NOT EXISTS db_yhd.track_log_parquet (
    id              STRING,
    url             STRING,
    referer         STRING,
    keyword         STRING,
    type            STRING,
    guid            STRING,
    pageId          STRING,
    moduleId        STRING,
    linkId          STRING,
    attachedInfo    STRING,
    sessionId       STRING,
    trackerU        STRING,
    trackerType     STRING,
    ip              STRING,
    trackerSrc      STRING,
    cookie          STRING,
    orderCode       STRING,
    trackTime       STRING,
    endUserId       STRING,
    firstLink       STRING,
    sessionViewNo   STRING,
    productId       STRING,
    curMerchantId   STRING,
    provinceId      STRING,
    cityId          STRING,
    fee             STRING,
    edmActivity     STRING,
    edmEmail        STRING,
    edmJobId        STRING,
    ieVersion       STRING,
    platform        STRING,
    internalKeyword STRING,
    resultSum       STRING,
    currentPage     STRING,
    linkPosition    STRING,
    buttonPosition  STRING
)
PARTITIONED BY (date_str STRING, hour_str STRING)
STORED AS PARQUET;
加載數據
-- Copy one hour of the row-format track log (2015-08-28, hour 18) into the
-- matching partition of the Parquet table.
-- Per the Hive DML manual, IF NOT EXISTS leaves an already existing partition
-- untouched instead of overwriting it.
INSERT OVERWRITE TABLE db_yhd.track_log_parquet
PARTITION (date_str = "20150828", hour_str = "18") IF NOT EXISTS
SELECT
    id,
    url,
    referer,
    keyword,
    type,
    guid,
    pageId,
    moduleId,
    linkId,
    attachedInfo,
    sessionId,
    trackerU,
    trackerType,
    ip,
    trackerSrc,
    cookie,
    orderCode,
    trackTime,
    endUserId,
    firstLink,
    sessionViewNo,
    productId,
    curMerchantId,
    provinceId,
    cityId,
    fee,
    edmActivity,
    edmEmail,
    edmJobId,
    ieVersion,
    platform,
    internalKeyword,
    resultSum,
    currentPage,
    linkPosition,
    buttonPosition
FROM db_yhd.track_log
WHERE date_str = "20150828"
  AND hour_str = "18";
測試結果截圖:
二.使用Parquet格式+snappy壓縮
也可以使用parquet格式+snappy壓縮格式實現文件的壓縮,減小數據的大小,提高數據的加載速度。
壓縮代碼:
-- Session-level switch: Parquet files written by this session are compressed
-- with Snappy. It must be in effect when the table is loaded.
SET parquet.compression=snappy;

-- Parquet-backed copy of the track log with Snappy compression, partitioned
-- by day (date_str) and hour (hour_str). Every payload column is STRING.
-- No ROW FORMAT DELIMITED clause: a field delimiter only describes text rows;
-- Parquet is a binary columnar format, and some engines reject the combination.
CREATE TABLE IF NOT EXISTS db_yhd.track_log_parquet_snappy (
    id              STRING,
    url             STRING,
    referer         STRING,
    keyword         STRING,
    type            STRING,
    guid            STRING,
    pageId          STRING,
    moduleId        STRING,
    linkId          STRING,
    attachedInfo    STRING,
    sessionId       STRING,
    trackerU        STRING,
    trackerType     STRING,
    ip              STRING,
    trackerSrc      STRING,
    cookie          STRING,
    orderCode       STRING,
    trackTime       STRING,
    endUserId       STRING,
    firstLink       STRING,
    sessionViewNo   STRING,
    productId       STRING,
    curMerchantId   STRING,
    provinceId      STRING,
    cityId          STRING,
    fee             STRING,
    edmActivity     STRING,
    edmEmail        STRING,
    edmJobId        STRING,
    ieVersion       STRING,
    platform        STRING,
    internalKeyword STRING,
    resultSum       STRING,
    currentPage     STRING,
    linkPosition    STRING,
    buttonPosition  STRING
)
PARTITIONED BY (date_str STRING, hour_str STRING)
STORED AS PARQUET
-- Also record the codec on the table itself, so writers that honor this
-- property keep using Snappy when the session setting is absent.
-- NOTE(review): older Hive releases ignore it and rely on the SET above;
-- confirm against the deployed Hive version.
TBLPROPERTIES ("parquet.compression" = "SNAPPY");
加載數據
-- Copy one hour of the row-format track log (2015-08-28, hour 18) into the
-- matching partition of the Snappy-compressed Parquet table.
-- Per the Hive DML manual, IF NOT EXISTS leaves an already existing partition
-- untouched instead of overwriting it.
INSERT OVERWRITE TABLE db_yhd.track_log_parquet_snappy
PARTITION (date_str = "20150828", hour_str = "18") IF NOT EXISTS
SELECT
    id,
    url,
    referer,
    keyword,
    type,
    guid,
    pageId,
    moduleId,
    linkId,
    attachedInfo,
    sessionId,
    trackerU,
    trackerType,
    ip,
    trackerSrc,
    cookie,
    orderCode,
    trackTime,
    endUserId,
    firstLink,
    sessionViewNo,
    productId,
    curMerchantId,
    provinceId,
    cityId,
    fee,
    edmActivity,
    edmEmail,
    edmJobId,
    ieVersion,
    platform,
    internalKeyword,
    resultSum,
    currentPage,
    linkPosition,
    buttonPosition
FROM db_yhd.track_log
WHERE date_str = "20150828"
  AND hour_str = "18";
測試結果截圖:
三.使用ORC格式存儲 + snappy壓縮
也可以使用orc列式存儲格式+snappy壓縮格式,下面是它的具體實現方法:
SQL語句:
-- ORC-backed copy of the track log with Snappy compression, partitioned by
-- day (date_str) and hour (hour_str). Every payload column is STRING.
-- No ROW FORMAT DELIMITED clause: a field delimiter only describes text rows;
-- ORC is a binary columnar format, and some engines reject the combination.
CREATE TABLE IF NOT EXISTS db_yhd.track_log_orc_snappy (
    id              STRING,
    url             STRING,
    referer         STRING,
    keyword         STRING,
    type            STRING,
    guid            STRING,
    pageId          STRING,
    moduleId        STRING,
    linkId          STRING,
    attachedInfo    STRING,
    sessionId       STRING,
    trackerU        STRING,
    trackerType     STRING,
    ip              STRING,
    trackerSrc      STRING,
    cookie          STRING,
    orderCode       STRING,
    trackTime       STRING,
    endUserId       STRING,
    firstLink       STRING,
    sessionViewNo   STRING,
    productId       STRING,
    curMerchantId   STRING,
    provinceId      STRING,
    cityId          STRING,
    fee             STRING,
    edmActivity     STRING,
    edmEmail        STRING,
    edmJobId        STRING,
    ieVersion       STRING,
    platform        STRING,
    internalKeyword STRING,
    resultSum       STRING,
    currentPage     STRING,
    linkPosition    STRING,
    buttonPosition  STRING
)
PARTITIONED BY (date_str STRING, hour_str STRING)
STORED AS ORC
-- The ORC writer reads the codec from this table property.
TBLPROPERTIES ("orc.compress" = "SNAPPY");
加載數據:
-- Copy one hour of the row-format track log (2015-08-28, hour 18) into the
-- matching partition of the Snappy-compressed ORC table.
-- Per the Hive DML manual, IF NOT EXISTS leaves an already existing partition
-- untouched instead of overwriting it.
INSERT OVERWRITE TABLE db_yhd.track_log_orc_snappy
PARTITION (date_str = "20150828", hour_str = "18") IF NOT EXISTS
SELECT
    id,
    url,
    referer,
    keyword,
    type,
    guid,
    pageId,
    moduleId,
    linkId,
    attachedInfo,
    sessionId,
    trackerU,
    trackerType,
    ip,
    trackerSrc,
    cookie,
    orderCode,
    trackTime,
    endUserId,
    firstLink,
    sessionViewNo,
    productId,
    curMerchantId,
    provinceId,
    cityId,
    fee,
    edmActivity,
    edmEmail,
    edmJobId,
    ieVersion,
    platform,
    internalKeyword,
    resultSum,
    currentPage,
    linkPosition,
    buttonPosition
FROM db_yhd.track_log
WHERE date_str = "20150828"
  AND hour_str = "18";
測試結果截圖:
四.總結
以上這三種方式雖然對數據進行了大幅度的壓縮,但並沒有改變原數據的可分割性,數據的值也沒有改變。文件固然是壓縮得越小越好,但是壓縮不能破壞數據的可分割性,原文件的數據內容也必須保持不變。以上是比較常見的存儲格式與壓縮方式的組合。