Hive系列（四）聽說Hive分區能提高查找效率？？快來試試-----各種姿勢導表

原創

NICEDAYSS

2020-06-22 23:33

文章目錄

一：Hive分區簡介：

二：分區的具體過程：

一：Hive分區簡介：

分區的方式：

動態分區
靜態分區

分區的作用：

分區主要用於提高性能

如果沒有分區，那麼每次查詢Hive都會進行全表掃描

分區列的值將表劃分爲segments（文件夾）
查詢時使用分區列和常規列類似
查詢時Hive會自動過濾掉用不到的分區，從而提高性能

主要是通過縮小數據查詢範圍來提高查詢速度和性能

分區的配置：

-- Dynamic-partition settings. The values shown may differ from the defaults
-- shipped with a given Hive release -- TODO confirm against your version.
-- Turn dynamic partitioning on (true) or off (false).
SET hive.exec.dynamic.partition=true;

-- nonstrict lets every partition column be resolved dynamically; otherwise
-- at least one partition value has to be given statically.
SET hive.exec.dynamic.partition.mode=nonstrict;

-- Maximum number of dynamic partitions a single mapper or reducer may create;
-- exceeding it raises an error.
SET hive.exec.max.dynamic.partitions.pernode=100;

-- Maximum total number of dynamic partitions one statement may create;
-- exceeding it raises an error.
SET hive.exec.max.dynamic.partitions=1000;

-- Global cap on the number of files that may be created, tracked through a
-- Hadoop counter; exceeding it raises an error.
SET hive.exec.max.created.files=100000;

二：分區的具體過程：

創建分區：

一般來說外部表是ods層的數據原始層，內部表是dwd層的細節數據層

分區表一般是內部表。

創建分區表–動態和靜態的創建方式一樣

-- External table for the raw data layer.
-- File layout: fields are separated by '|', collection items by ',',
-- and map keys are separated from their values by ':'.
CREATE EXTERNAL TABLE employee_partitioned (
    name         STRING,
    work_place   ARRAY<STRING>,
    sex_age      STRUCT<sex:STRING, age:INT>,
    skills_score MAP<STRING, INT>,
    depart_title MAP<STRING, ARRAY<STRING>>
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '|'
    COLLECTION ITEMS TERMINATED BY ','
    MAP KEYS TERMINATED BY ':';


-- Managed table for the detail data layer.
-- The partition columns (year, month) are declared when the table is created.
-- File layout: fields are separated by '|', collection items by ',',
-- and map keys are separated from their values by ':'.
CREATE TABLE employee_partitioned_copy (
    name         STRING,
    work_place   ARRAY<STRING>,
    sex_age      STRUCT<sex:STRING, age:INT>,
    skills_score MAP<STRING, INT>,
    depart_title MAP<STRING, ARRAY<STRING>>
)
PARTITIONED BY (year INT, month INT)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '|'
    COLLECTION ITEMS TERMINATED BY ','
    MAP KEYS TERMINATED BY ':';

設置分區：

靜態分區–相當於手動指定並創建分區

-- Static partitions are added by hand, several at once if needed.
ALTER TABLE employee_partitioned_copy ADD
    PARTITION (year=2019, month=3)
    PARTITION (year=2019, month=4);

-- Remove one of the partitions again.
ALTER TABLE employee_partitioned_copy DROP PARTITION (year=2019, month=4);

添加靜態分區的數據

-- Insert literal values into one static partition.
INSERT INTO mypart PARTITION (gender='male') VALUES (1, 'zs');

-- Insert from another table into one static partition: the partition column
-- is not part of the SELECT list.
INSERT OVERWRITE TABLE mypart PARTITION (gender='female')
SELECT userid, username
FROM userinfos;

-- With a static partition, every inserted row takes the partition value named
-- in the PARTITION clause, whatever the source rows held for that field.

動態分區–

使用動態分區需設定屬性–開啓動態分區

-- Switch dynamic partitioning on and use nonstrict mode for this session.
SET hive.exec.dynamic.partition=true;
SET hive.exec.dynamic.partition.mode=nonstrict;

動態分區設置方法

 -- Dynamic partition insert: the partition values are taken from the last
 -- columns of the SELECT list (year, month), in the order of the PARTITION clause.
 -- The target is the table declared with PARTITIONED BY (year, month).
INSERT INTO TABLE employee_partitioned_copy PARTITION (year, month)
SELECT
    name,
    array('Toronto') AS work_place,
    named_struct("sex", "male", "age", 30) AS sex_age,
    map("python", 90) AS skills_score,
    map("r&d", array('developer')) AS depart_title,
    year(start_date) AS year,
    month(start_date) AS month
FROM employee_hr eh;

設置可創建文件總數的上限

-- Set the cap on the number of files that may be created to 600000.
SET hive.exec.max.created.files=600000;

加載本地數據文件到hive數據庫表

 -- Load a file from the local file system into the table, overwriting its data.
 LOAD DATA LOCAL INPATH '/opt/wyw.xlsx'
 OVERWRITE INTO TABLE mydemo.customs2;

將一張表導入另一張表

 -- Dynamic partition insert: the partition column (gender) must be selected,
 -- and it comes last in the SELECT list.
 INSERT INTO TABLE mypart PARTITION (gender)
 SELECT userid, username, gender
 FROM userinfos;

-- Overwrite the year/month partitions of userinfos with rows from customs3.
-- The partition values come from splitting the birthday string on '/'.
INSERT OVERWRITE TABLE userinfos PARTITION (year, month)
SELECT
    userid,
    username,
    age,
    regexp_replace(birthday, '/', '-'),
    gender,
    split(birthday, '/')[0] AS year,
    split(birthday, '/')[1] AS month
FROM customs3;

向分區插入數據：

動態分區插入時，默認以 SELECT 的最後幾個字段作爲分區值，所以分區字段要放在最後，並且順序要與 PARTITION 子句中的分區列一一對應。

建表時：插入數據

創建表時本地插入

-- Comma-delimited table; no LOCATION clause is given.
CREATE TABLE customs (
    cust_id   STRING,
    cust_name STRING,
    age       INT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';

創建表時HDFS插入

-- Comma-delimited table whose data directory is the HDFS path /data.
-- NOTE(review): the table is not partitioned (no PARTITIONED BY clause).
CREATE TABLE customs (
    cust_id   STRING,
    cust_name STRING,
    age       INT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION '/data';

建表後：HDFS插入數據

-- The path given here is an HDFS path (no LOCAL keyword is used).
 LOAD DATA INPATH '/mydemo/wyw.xlsx'
 OVERWRITE INTO TABLE mydemo.customs2;

建表後：本地插入數據

插入文件

 -- Full load from the local file system, overwriting the table's data.
 LOAD DATA LOCAL INPATH '/opt/wyw.xlsx'
 OVERWRITE INTO TABLE mydemo.customs2;

建表後：表數據導入另一張表

 -- Dynamic partition insert: the partition column (gender) is selected last.
 -- With a static partition the partition column would be left out of the SELECT.
INSERT INTO TABLE mypart PARTITION (gender)
SELECT userid, username, gender
FROM userinfos;

建表後：直接插入語句插入數據

 -- Insert a literal row directly. mypart is partitioned by gender, so the
 -- target partition is named and only the regular columns are supplied.
 INSERT INTO TABLE mypart PARTITION (gender='male')
 VALUES (1, 'zs');

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

Hive系列（四）聽說Hive分區能提高查找效率？？快來試試-----各種姿勢導表

文章目錄

一：Hive分區簡介：

分區的方式：

分區的作用：

分區的配置：

二：分區的具體過程：

創建分區：

設置分區：

向分區插入數據：

工作中用到的腳本合集

微服務實踐Aspire項目發佈到遠程k8s集羣

通過f-string編寫簡潔高效的Python格式化輸出代碼

[轉帖]20個常用的Linux工具命令

[轉帖]PostgreSQL從小白到高手教程 - 第46講：poc-tpch測試

24-5-18 X

ZooKeeper系列（一）ZooKeeper基本簡介與命令和集羣環境搭建

Hadoop系列 (一) 補--Hadoop完全分佈式環境搭建

Hadoop系列 ( 三 ) MapReduce存在的意義----MapReduce究竟做了些什麼？？

Scala系列（二）Scala數組----超詳細常用方法及其用法

Sqoop系列（一）通過sqoop將關係型數據遷移到HBase和Hive上

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結