常用hive開窗函數示例

個人博客原文鏈接

簡介

普通的聚合函數每組(group by)只返回一個值,而開窗函數則可以爲每行都返回一個值。簡而言之,相當於對查詢的結果添加新的一列值,這一列可以是聚合值,也可以是排序號。開窗函數的關鍵字是over()。

準備測試數據

  1. 建表
    -- Test fixture for the window-function examples below: one row per student.
    -- The three score columns are declared int rather than string so that
    -- "order by math", max() and min() compare numerically; with string columns
    -- the comparison is lexicographic and a score such as '100' sorts below '99'.
    create table if not exists student_scores(
        id string,
        studentId string,
        language int,
        math int,
        english int,
        classId string,
        departmentId string
    );

  2. 插入數據
    -- Load the 20 sample rows. Value order follows the table definition:
    -- id, studentId, language, math, english, classId, departmentId.
    -- String literals use plain ASCII single quotes; the typographic quotes
    -- in the earlier version of this snippet are not valid SQL delimiters.
    insert into table student_scores values
    ('1','111','68','69','90','class1','department1'),
    ('2','112','73','80','96','class1','department1'),
    ('3','113','90','74','75','class1','department1'),
    ('4','114','89','94','93','class1','department2'),
    ('5','115','99','93','89','class1','department1'),
    ('6','121','96','74','79','class2','department1'),
    ('7','122','89','86','85','class2','department1'),
    ('8','123','70','78','61','class2','department1'),
    ('9','124','76','70','76','class2','department1'),
    ('10','211','89','93','60','class1','department2'),
    ('11','212','76','83','75','class1','department2'),
    ('12','213','71','94','90','class2','department2'),
    ('13','214','94','94','66','class1','department2'),
    ('14','215','84','82','73','class1','department2'),
    ('15','216','85','74','93','class1','department2'),
    ('16','221','77','99','61','class2','department2'),
    ('17','222','80','78','96','class2','department2'),
    ('18','223','79','74','96','class2','department2'),
    ('19','224','75','80','78','class2','department2'),
    ('20','225','82','85','63','class2','department2');

count()

  1. 以符合條件的所有行作爲窗口
    -- count1: empty over() means no partitioning and no ordering, so every row
    -- sees the whole result set as its window.
    select
        studentId,
        language,
        math,
        english,
        classId,
        departmentId,
        count(math) over () as count1
    from student_scores;
    結果:
    111 68 69 90 class1 department1 20
    112 73 80 96 class1 department1 20
    113 90 74 75 class1 department1 20
    114 89 94 93 class1 department2 20
    115 99 93 89 class1 department1 20
    121 96 74 79 class2 department1 20
    122 89 86 85 class2 department1 20
    123 70 78 61 class2 department1 20
    124 76 70 76 class2 department1 20
    211 89 93 60 class1 department2 20
    212 76 83 75 class1 department2 20
    213 71 94 90 class2 department2 20
    214 94 94 66 class1 department2 20
    215 84 82 73 class1 department2 20
    216 85 74 93 class1 department2 20
    221 77 99 61 class2 department2 20
    222 80 78 96 class2 department2 20
    223 79 74 96 class2 department2 20
    224 75 80 78 class2 department2 20
    225 82 85 63 class2 department2 20

  2. 以根據classId分組後、同一分組內的所有行作爲窗口
    -- count2: the window is the full set of rows sharing the current row's classId.
    select
        studentId,
        language,
        math,
        english,
        classId,
        departmentId,
        count(math) over (
            partition by classId
        ) as count2
    from student_scores;
    結果:
    121 96 74 79 class2 department1 10
    122 89 86 85 class2 department1 10
    123 70 78 61 class2 department1 10
    124 76 70 76 class2 department1 10
    213 71 94 90 class2 department2 10
    221 77 99 61 class2 department2 10
    222 80 78 96 class2 department2 10
    223 79 74 96 class2 department2 10
    224 75 80 78 class2 department2 10
    225 82 85 63 class2 department2 10
    111 68 69 90 class1 department1 10
    112 73 80 96 class1 department1 10
    113 90 74 75 class1 department1 10
    114 89 94 93 class1 department2 10
    115 99 93 89 class1 department1 10
    211 89 93 60 class1 department2 10
    212 76 83 75 class1 department2 10
    214 94 94 66 class1 department2 10
    215 84 82 73 class1 department2 10
    216 85 74 93 class1 department2 10

  3. 以根據classId分組、根據math倒序排序後,從分區第一行到當前行(包含與當前行math相同的行)的所有行作爲窗口,即統計分區內math大於等於當前行math的行數
    -- count3: partitioned by classId and ordered by math (highest first) with no
    -- explicit frame clause, so the engine's default frame for an ordered window applies.
    select
        studentId,
        language,
        math,
        english,
        classId,
        departmentId,
        count(math) over (
            partition by classId
            order by math desc
        ) as count3
    from student_scores;
    結果:
    221 77 99 61 class2 department2 1
    213 71 94 90 class2 department2 2
    122 89 86 85 class2 department1 3
    225 82 85 63 class2 department2 4
    224 75 80 78 class2 department2 5
    123 70 78 61 class2 department1 7
    222 80 78 96 class2 department2 7
    121 96 74 79 class2 department1 9
    223 79 74 96 class2 department2 9
    124 76 70 76 class2 department1 10
    114 89 94 93 class1 department2 2
    214 94 94 66 class1 department2 2
    115 99 93 89 class1 department1 4
    211 89 93 60 class1 department2 4
    212 76 83 75 class1 department2 5
    215 84 82 73 class1 department2 6
    112 73 80 96 class1 department1 7
    113 90 74 75 class1 department1 9
    216 85 74 93 class1 department2 9
    111 68 69 90 class1 department1 10

  4. 以根據classId分組、根據math倒序排序、從當前行前1行到當前行後2行的所有行作爲窗口
    -- count4: a sliding physical frame of at most four rows per classId partition:
    -- the previous row, the current row and the next two rows (fewer at the edges).
    select
        studentId,
        language,
        math,
        english,
        classId,
        departmentId,
        count(math) over (
            partition by classId
            order by math desc
            rows between 1 preceding and 2 following
        ) as count4
    from student_scores;
    結果:
    221 77 99 61 class2 department2 3
    213 71 94 90 class2 department2 4
    122 89 86 85 class2 department1 4
    225 82 85 63 class2 department2 4
    224 75 80 78 class2 department2 4
    123 70 78 61 class2 department1 4
    222 80 78 96 class2 department2 4
    121 96 74 79 class2 department1 4
    223 79 74 96 class2 department2 3
    124 76 70 76 class2 department1 2
    114 89 94 93 class1 department2 3
    214 94 94 66 class1 department2 4
    115 99 93 89 class1 department1 4
    211 89 93 60 class1 department2 4
    212 76 83 75 class1 department2 4
    215 84 82 73 class1 department2 4
    112 73 80 96 class1 department1 4
    113 90 74 75 class1 department1 4
    216 85 74 93 class1 department2 3
    111 68 69 90 class1 department1 2

  5. 以根據classId分組、根據math倒序排序、從分區第一行到最後一行的所有行作爲窗口。注意:指定order by後,默認窗口是從第一行到當前行(即count3);這裏顯式指定爲整個分區,因此結果相當於count2(每行都是10)
    -- count5: the frame is explicitly widened to the entire classId partition,
    -- from its first row to its last row, regardless of the current row's position.
    select
        studentId,
        language,
        math,
        english,
        classId,
        departmentId,
        count(math) over (
            partition by classId
            order by math desc
            rows between unbounded preceding and unbounded following
        ) as count5
    from student_scores;

  6. 以根據classId分組、根據math倒序排序、從當前行前1行到當前行的所有行作爲窗口
    -- count6: a trailing frame made of the previous row plus the current row,
    -- so the first row of each classId partition counts only itself.
    select
        studentId,
        language,
        math,
        english,
        classId,
        departmentId,
        count(math) over (
            partition by classId
            order by math desc
            rows between 1 preceding and current row
        ) as count6
    from student_scores;
    結果:
    221 77 99 61 class2 department2 1
    213 71 94 90 class2 department2 2
    122 89 86 85 class2 department1 2
    225 82 85 63 class2 department2 2
    224 75 80 78 class2 department2 2
    123 70 78 61 class2 department1 2
    222 80 78 96 class2 department2 2
    121 96 74 79 class2 department1 2
    223 79 74 96 class2 department2 2
    124 76 70 76 class2 department1 2
    114 89 94 93 class1 department2 1
    214 94 94 66 class1 department2 2
    115 99 93 89 class1 department1 2
    211 89 93 60 class1 department2 2
    212 76 83 75 class1 department2 2
    215 84 82 73 class1 department2 2
    112 73 80 96 class1 department1 2
    113 90 74 75 class1 department1 2
    216 85 74 93 class1 department2 2
    111 68 69 90 class1 department1 2

row_number()

以根據classId分組、根據math倒序排序的所有行作爲窗口,從1開始對分區內的行依次編號;即使值相同也不會得到重複的序號,即序號連續且唯一。
-- row_number() numbers the rows 1..n inside each classId partition, highest math first.
-- Several students share the same math score; ordering by math alone would leave
-- the numbering among tied rows unspecified, so studentId is used as a tie-breaker
-- to make the output deterministic.
select
    studentId,
    language,
    math,
    english,
    classId,
    departmentId,
    row_number() over (
        partition by classId
        order by math desc, studentId
    ) as count1
from student_scores;
結果:
221 77 99 61 class2 department2 1
213 71 94 90 class2 department2 2
122 89 86 85 class2 department1 3
225 82 85 63 class2 department2 4
224 75 80 78 class2 department2 5
123 70 78 61 class2 department1 6
222 80 78 96 class2 department2 7
121 96 74 79 class2 department1 8
223 79 74 96 class2 department2 9
124 76 70 76 class2 department1 10
114 89 94 93 class1 department2 1
214 94 94 66 class1 department2 2
115 99 93 89 class1 department1 3
211 89 93 60 class1 department2 4
212 76 83 75 class1 department2 5
215 84 82 73 class1 department2 6
112 73 80 96 class1 department1 7
113 90 74 75 class1 department1 8
216 85 74 93 class1 department2 9
111 68 69 90 class1 department1 10

rank()

  1. 以根據math倒序排序的所有行作爲窗口,基於over子句中的order by來確定一組值中每個值的排名;相同的值排名相同,其後的排名會跳號,即排名可能不是連續的。
    -- Rank every student by math across the whole table (no partitioning), highest first.
    select
        studentId,
        language,
        math,
        english,
        classId,
        departmentId,
        rank() over (
            order by math desc
        ) as count1
    from student_scores;
    結果:
    221 77 99 61 class2 department2 1
    114 89 94 93 class1 department2 2
    213 71 94 90 class2 department2 2
    214 94 94 66 class1 department2 2
    115 99 93 89 class1 department1 5
    211 89 93 60 class1 department2 5
    122 89 86 85 class2 department1 7
    225 82 85 63 class2 department2 8
    212 76 83 75 class1 department2 9
    215 84 82 73 class1 department2 10
    112 73 80 96 class1 department1 11
    224 75 80 78 class2 department2 11
    123 70 78 61 class2 department1 13
    222 80 78 96 class2 department2 13
    113 90 74 75 class1 department1 15
    121 96 74 79 class2 department1 15
    216 85 74 93 class1 department2 15
    223 79 74 96 class2 department2 15
    124 76 70 76 class2 department1 19
    111 68 69 90 class1 department1 20

  2. 以根據classId分組、根據math倒序排序的所有行作爲窗口,基於over子句中的order by來確定分區內每個值的排名;相同的值排名相同,其後的排名會跳號,即排名可能不是連續的。
    -- Rank students by math inside their own classId partition, highest first.
    select
        studentId,
        language,
        math,
        english,
        classId,
        departmentId,
        rank() over (
            partition by classId
            order by math desc
        ) as count2
    from student_scores;
    結果:
    221 77 99 61 class2 department2 1
    213 71 94 90 class2 department2 2
    122 89 86 85 class2 department1 3
    225 82 85 63 class2 department2 4
    224 75 80 78 class2 department2 5
    123 70 78 61 class2 department1 6
    222 80 78 96 class2 department2 6
    121 96 74 79 class2 department1 8
    223 79 74 96 class2 department2 8
    124 76 70 76 class2 department1 10
    114 89 94 93 class1 department2 1
    214 94 94 66 class1 department2 1
    115 99 93 89 class1 department1 3
    211 89 93 60 class1 department2 3
    212 76 83 75 class1 department2 5
    215 84 82 73 class1 department2 6
    112 73 80 96 class1 department1 7
    113 90 74 75 class1 department1 8
    216 85 74 93 class1 department2 8
    111 68 69 90 class1 department1 10

dense_rank()

dense_rank與rank一樣會給相同的值重複排名,區別在於dense_rank的排名是連續的、不會跳號。如有兩行的排名爲1,則下一個排名爲2(rank則爲3)。
-- Dense ranking of all students by math across the whole table, highest first.
select
    studentId,
    language,
    math,
    english,
    classId,
    departmentId,
    dense_rank() over (
        order by math desc
    ) as count1
from student_scores;
結果:
221 77 99 61 class2 department2 1
114 89 94 93 class1 department2 2
213 71 94 90 class2 department2 2
214 94 94 66 class1 department2 2
115 99 93 89 class1 department1 3
211 89 93 60 class1 department2 3
122 89 86 85 class2 department1 4
225 82 85 63 class2 department2 5
212 76 83 75 class1 department2 6
215 84 82 73 class1 department2 7
112 73 80 96 class1 department1 8
224 75 80 78 class2 department2 8
123 70 78 61 class2 department1 9
222 80 78 96 class2 department2 9
113 90 74 75 class1 department1 10
121 96 74 79 class2 department1 10
216 85 74 93 class1 department2 10
223 79 74 96 class2 department2 10
124 76 70 76 class2 department1 11
111 68 69 90 class1 department1 12

其他

簡要介紹一下其它不太常用的

  1. max(取窗口內最大值)
    -- Attach the largest math value of the row's classId partition to every row.
    select
        studentId,
        language,
        math,
        english,
        classId,
        departmentId,
        max(math) over (
            partition by classId
        ) as count1
    from student_scores;

  2. min(取窗口內最小值)
    -- Attach the smallest math value of the row's classId partition to every row.
    select
        studentId,
        language,
        math,
        english,
        classId,
        departmentId,
        min(math) over (
            partition by classId
        ) as count1
    from student_scores;

  3. sum(取窗口內的和)
    -- Attach the total of math over the row's classId partition to every row.
    select
        studentId,
        language,
        math,
        english,
        classId,
        departmentId,
        sum(math) over (
            partition by classId
        ) as count1
    from student_scores;

  4. avg(取窗口內的平均值)
    -- Attach the average of math over the row's classId partition to every row.
    select
        studentId,
        language,
        math,
        english,
        classId,
        departmentId,
        avg(math) over (
            partition by classId
        ) as count1
    from student_scores;

  5. first_value(取窗口內第一個值)
    -- first_value() returns the value from the first row of the window frame.
    -- Without an order by, "first" depends on whatever order the engine happens
    -- to read the rows in, so the window is ordered by math (highest first):
    -- each row then gets the top math score of its classId partition.
    select
        studentId,
        language,
        math,
        english,
        classId,
        departmentId,
        first_value(math) over (
            partition by classId
            order by math desc
        ) as count1
    from student_scores;

  6. last_value(取窗口內最後一個值)
    -- last_value() returns the value from the last row of the window frame.
    -- Two things are needed for a meaningful result:
    --   1. an order by, otherwise "last" depends on arbitrary row order;
    --   2. an explicit frame reaching the end of the partition, because the
    --      default frame of an ordered window stops at the current row and
    --      last_value() would simply echo the current row's own math.
    -- With both in place each row gets the lowest math score of its classId partition.
    select
        studentId,
        language,
        math,
        english,
        classId,
        departmentId,
        last_value(math) over (
            partition by classId
            order by math desc
            rows between unbounded preceding and unbounded following
        ) as count1
    from student_scores;

  7. ntile(n)(將分區內已排序的行盡量平均地分成n組,返回當前行所在組的編號,編號從1開始)
    -- Split each classId partition, ordered by math (highest first), into 2 buckets
    -- and report which bucket the current row falls into.
    select
        studentId,
        language,
        math,
        english,
        classId,
        departmentId,
        ntile(2) over (
            partition by classId
            order by math desc
        ) as count1
    from student_scores;

  8. percent_rank(返回當前行的百分比排名,即 (rank - 1) / (分區行數 - 1),取值範圍爲0到1)
    -- Relative rank of each row by math (highest first) inside its classId partition.
    select
        studentId,
        language,
        math,
        english,
        classId,
        departmentId,
        percent_rank() over (
            partition by classId
            order by math desc
        ) as count1
    from student_scores;

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章