1.hive分組去重函數使用。
-- De-duplicate lyjtest: keep one row per id, the one with the latest modifytime.
-- row_number() is the Hive window function (row_num() does not exist), and the
-- rn alias cannot be referenced in the WHERE clause of the query level that
-- computes it, so rows are numbered in a subquery and filtered outside.
-- The result has every column of lyjtest plus rn.
-- NOTE(review): if two rows of one id share the same modifytime, which one gets
-- rn = 1 is not determined by this ordering; add a tie-breaker column if needed.
select *
from (
    select
        t.*,
        row_number() over (partition by id order by modifytime desc) as rn
    from lyjtest t
) ranked
where rn = 1;
row_number() over函數按id分區、按修改時間降序編號,然後在外層查詢中篩選出每個id時間最新的一條(rn=1)數據,達到去重的效果。(窗口函數的別名rn不能在同一層查詢的where中直接使用,需要套一層子查詢。)
2.hive 寫入數據
-- Append: the rows selected from table1 are added to what table2 already holds.
insert into table table2
select *
from table1;

-- Overwrite: the existing contents of table2 are replaced by the rows from table1.
insert overwrite table table2
select *
from table1;
3.where和having的區別
//where是先限定性條件再分組(對原始數據過濾,where不能過濾聚合函數)
-- where filters the raw rows (id > 18) first; the remaining rows are then grouped by age.
hive> select
    >     count(*),
    >     age
    > from table1
    > where id > 18
    > group by age;
//having是先分組再限定條件(對每個組進行過濾,having後可以跟聚合函數或分組列,該聚合函數不要求出現在select中)
-- Rows are grouped by age first; having then keeps only the groups whose count exceeds 2.
hive> select
    >     age,
    >     count(*) as c
    > from table1
    > group by age
    > having c > 2;
//where和having一起使用
-- where and having combined: rows with id > 18 are kept, grouped by id,
-- and only groups with more than two rows are returned.
select
    id,
    count(*)
from table1
where id > 18
group by id
having count(*) > 2;
4.union all與union(Hive 1.2.0之前只支持union all;1.2.0及之後也支持union,默認去重)
union all 不去重
-- union all concatenates the two result sets and keeps duplicate rows.
select
    name,
    age
from table1
where id < 80
union all
select
    name,
    age
from table2
where age > 18;
5.查詢前五條數據
-- The five rows with the largest age values.
select *
from table1
order by age desc
limit 5;

-- Any five rows: without an order by, which rows come back is not defined
-- (this is an arbitrary pick, not a random sample).
select *
from student
limit 5;
6.五種子句的嚴格順序
where → group by → having → order by → limit
7.distinct
//distinct關鍵字返回唯一不同的值(對age和id的組合去重,返回(age,id)組合不重複的記錄)
-- distinct applies to the (age, id) pair: one row is returned per unique combination.
hive> select distinct
    >     age,
    >     id
    > from test;
8.複製表
-- Copy only the definition of test1; the new table starts with no rows.
create table test1_temp like test1;

-- Create test1 from a query: it receives the columns and rows selected from test2.
create table test1 as
select *
from test2;
9.創建表
-- Text-format table lyjtest1: tab-delimited fields, newline-delimited rows,
-- read and written through LazySimpleSerDe, stored at an explicit HDFS location.
-- NOTE(review): id is declared double — confirm that is intended for an identifier.
CREATE TABLE `lyjtest1` (
    `id`   double,
    `name` string,
    `sex`  string
)
COMMENT 'create table from sql'
ROW FORMAT SERDE
    'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
    'field.delim' = '\t',
    'line.delim' = '\n',
    'serialization.format' = '\t'
)
STORED AS
    INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
    'hdfs://ambari1:8020/warehouse/tablespace/managed/hive/ods_lyjtest.db/lyjtest1';