【Hive】hive影評案例

數據描述

  1. users.dat 用戶表
    數據格式爲: 2::M::56::16::70072
    對應字段爲:UserID BigInt, Gender String, Age Int, Occupation String, Zipcode String
    對應字段中文解釋:用戶id,性別,年齡,職業,郵政編碼

  2. movies.dat 電影表
    數據格式爲: 2::Jumanji (1995)::Adventure|Children’s|Fantasy
    對應字段爲:MovieID BigInt, Title String, Genres String
    對應字段中文解釋:電影ID,電影名字,電影類型

  3. ratings.dat 評分表
    數據格式爲: 1::1193::5::978300760
    對應字段爲:UserID BigInt, MovieID BigInt, Rating Double, Timestamped String
    對應字段中文解釋:用戶ID,電影ID,評分,評分時間戳

數據要求

(1)寫shell腳本清洗數據。(hive不支持解析多字節的分隔符,也就是說hive只能解析’:’, 不支持解析’::’,所以用普通方式建表來使用是行不通的,要求對數據做一次簡單清洗)

(2)使用Hive能解析的方式進行
要求:外部表
hdfs: /user/data/yingping/movies ratings users

題目

-- 設置本地模式 及 顯示錶頭
hive> set hive.exec.mode.local.auto=true;
hive> set hive.cli.print.header=true;
-- hdfs 數據目錄創建
hive> dfs -mkdir -p /user/data/yingping/movies;
hive> dfs -mkdir -p /user/data/yingping/ratings;
hive> dfs -mkdir -p /user/data/yingping/users;
-- hdfs 數據目錄 上傳數據
hive> dfs -put users.dat /user/data/yingping/users;
hive> dfs -put movies.dat /user/data/yingping/movies;
hive> dfs -put ratings.dat /user/data/yingping/ratings;
-- 創建yingping數據庫
hive> create database yingping;
hive> use yingping;

1. 正確建表,導入數據(三張表,三份數據),並驗證是否正確

-- 創建表 movies
create external table if not exists movies(
MovieID BigInt, Title String, Genres String) 
row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe' 
with serdeproperties('input.regex'='(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s') stored as textfile 
location '/user/data/yingping/movies';

hive> select * from movies limit 2;
OK
movies.movieid	movies.title	movies.genres
1	Toy Story (1995)	Animation|Children's|Comedy
2	Jumanji (1995)	Adventure|Children's|Fantasy
Time taken: 1.142 seconds, Fetched: 2 row(s)

-- 創建表 ratings
create external table if not exists ratings(UserID BigInt, MovieID BigInt, Rating Double, Timestamped String) 
row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe' 
with serdeproperties('input.regex'='(.*)::(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s %4$s') stored as textfile 
location '/user/data/yingping/ratings';

hive> select * from ratings limit 2;
OK
ratings.userid	ratings.movieid	ratings.rating	ratings.timestamped
1	1193	5.0	978300760
1	661	3.0	978302109
Time taken: 0.145 seconds, Fetched: 2 row(s)

-- 創建表 users
create external table if not exists users(userid bigint,gender string,age int,occupation string,zipcode string) row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe' 
with serdeproperties('input.regex'='(.*)::(.*)::(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s %4$s %5$s') stored as textfile location '/user/data/yingping/users';

hive> select * from users limit 2;
OK
users.userid	users.gender	users.age	users.occupation	users.zipcode
1	F	1	10	48067
2	M	56	16	70072
Time taken: 0.137 seconds, Fetched: 2 row(s)

2. 求被評分次數最多的10部電影,並給出評分次數(電影名,評分次數)

----分析
--也就是要先求出,每一部電影總評分次數 排序
分組:電影id title ->movies表
求:count() ->ratings表
排序:count() desc limit 10

select 
a.Title,count(b.Rating) totalRate
from movies a join ratings b
on a.MovieID=b.MovieID
group by a.MovieID,a.Title
order by totalRate desc limit 10;

--結果
a.title	totalrate
American Beauty (1999)	3428
Star Wars: Episode IV - A New Hope (1977)	2991
Star Wars: Episode V - The Empire Strikes Back (1980)	2990
Star Wars: Episode VI - Return of the Jedi (1983)	2883
Jurassic Park (1993)	2672
Saving Private Ryan (1998)	2653
Terminator 2: Judgment Day (1991)	2649
Matrix, The (1999)	2590
Back to the Future (1985)	2583
Silence of the Lambs, The (1991)	2578

3. 分別求男性,女性當中評分最高的10部電影(性別,電影名,影評分)

方法一:分開求

----分析
--1.求男性
過濾條件:男性 ->users表
每一部電影的平均評分
分組:電影Id 電影名 ->movies表
求:avg(Rating) -> ratings表
排序:avg(Rating) desc limit 10

select
c.Gender Gender,
a.Title Title,
avg(b.Rating) avgrate 
from movies a join ratings b on a.MovieID=b.MovieId
join users c on b.UserID=c.UserID
where c.Gender='M'
group by a.MovieID,a.Title,c.Gender
order by avgrate desc limit 10;

--結果
gender	title	avgrate
M	Small Wonders (1996)	5.0
M	Smashing Time (1967)	5.0
M	Baby, The (1973)	5.0
M	Gate of Heavenly Peace, The (1995)	5.0
M	Schlafes Bruder (Brother of Sleep) (1995)	5.0
M	Dangerous Game (1993)	5.0
M	Follow the Bitch (1998)	5.0
M	Bells, The (1926)	5.0
M	Lured (1947)	5.0
M	Angela (1995)	5.0

--2.求女性
select 
c.Gender Gender,a.Title Title,avg(b.Rating) avgrate 
from 
movies a join ratings b 
on a.MovieID=b.MovieID join users c on b.UserID=c.UserID 
where c.Gender='F' 
group by a.MovieID,a.Title,c.Gender 
order by avgrate desc limit 10;

--結果
gender	title	avgrate
F	For the Moment (1994)	5.0
F	Gate of Heavenly Peace, The (1995)	5.0
F	Skipped Parts (2000)	5.0
F	Raw Deal (1948)	5.0
F	Clean Slate (Coup de Torchon) (1981)	5.0
F	Coldblooded (1995)	5.0
F	Country Life (1994)	5.0
F	Message to Love: The Isle of Wight Festival (1996)	5.0
F	Bittersweet Motel (2000)	5.0
F	Lamerica (1994)	5.0

方法二:分析函數

----分析
--每個性別中評分最高的10部電影

--每個性別中每部電影的評分
分組:Gender MovieID Title
求:avg
select 
c.Gender Gender,a.Title Title,avg(b.Rating) avgrate 
from movies a join ratings b on a.MovieID=b.MovieID 
join users c on b.UserID=c.UserID 
group by c.Gender,a.MovieID,a.Title;

--分組求topN
局部排序
分組:Gender
排序: avgrate
select * from 
(select 
Gender,Title,avgrate,row_number() over(distribute by Gender sort by avgrate desc) index 
from 
(select 
c.Gender Gender,a.Title Title,avg(b.Rating) avgrate 
from movies a join ratings b on a.MovieID=b.MovieID 
join users c on b.UserID=c.UserID 
group by c.Gender,a.MovieID,a.Title
) d) e where index<=10;

--結果
e.gender	e.title	e.avgrate	e.index
F	Brother, Can You Spare a Dime? (1975)	5.0	1
F	Woman of Paris, A (1923)	5.0	2
F	Ayn Rand: A Sense of Life (1997)	5.0	3
F	One Little Indian (1973)	5.0	4
F	Song of Freedom (1936)	5.0	5
F	Belly (1998)	5.0	6
F	Gambler, The (A J�t�kos) (1997)	5.0	7
F	Clean Slate (Coup de Torchon) (1981)	5.0	8
F	Country Life (1994)	5.0	9
F	Other Side of Sunday, The (S�ndagsengler) (1996)	5.0	10
M	Bells, The (1926)	5.0	1
M	Dangerous Game (1993)	5.0	2
M	Follow the Bitch (1998)	5.0	3
M	Small Wonders (1996)	5.0	4
M	Schlafes Bruder (Brother of Sleep) (1995)	5.0	5
M	Angela (1995)	5.0	6
M	Baby, The (1973)	5.0	7
M	Ulysses (Ulisse) (1954)	5.0	8
M	Gate of Heavenly Peace, The (1995)	5.0	9
M	Smashing Time (1967)	5.0	10

4. 求movieid = 2116這部電影各年齡段(因爲年齡就只有7個,就按這個7個分就好了)的平均影評(年齡段,影評分)

--分析
過濾:movieID=2116 -> ratings表
分組:年齡 -> users表
求:avg ->ratings表

select
a.Age,avg(b.Rating) avgrate
from users a join ratings b
on a.UserID=b.UserID
where b.MovieID=2116
group by a.Age;

--結果
a.age	avgrate
1	3.2941176470588234
18	3.3580246913580245
25	3.436548223350254
35	3.2278481012658227
45	2.8275862068965516
50	3.32
56	3.5

5. 求最喜歡看電影(影評次數最多)的那位女性評最高分的10部電影的平均影評分(觀影者,電影名,影評分)

首先這個題目可能有點歧義:
1.求,這個女性評分最高的10部電影的總平均分
2. 求,這個女生評分最高的10部電影的,每一電影的平均分

--如果是2.這個女生評分最高的10部電影的,每一電影的平均分
----分析
--1) 先找到這位女性
過慮:Gender="F" ->users表
分組:UserId ->ratings表
求:count() ->ratings表
排序:count() desc limit 1

create table answer05_step01 as 
select
a.UserID UserID,count(a.Rating) totalRate
from ratings a join users b 
on a.UserID=b.UserID
where b.Gender="F"
group by a.UserID
order by totalRate desc limit 1;

--結果
userid	totalrate
1150	1302

--2) 然後這個女性最喜歡的10部電影
--這個女生評分最高的10部電影
過濾:女性 1150 ->answer05_step01表
排序:評分 desc limit10 ->ratings表
create table answer05_step02 as 
select 
a.UserID UserID,a.MovieID MovieID,a.Rating Rating 
from ratings a join answer05_step01 b on 
a.UserID=b.UserID 
order by Rating desc limit 10;
----****注意  where子查詢運用一個查詢結果問題***

--結果(這位女性評分最高的10部電影)
hive> select * from answer05_step02;
OK
answer05_step02.userid	answer05_step02.movieid	answer05_step02.rating
1150	745		5.0
1150	1279	5.0
1150	1236	5.0
1150	904		5.0
1150	750		5.0
1150	2997	5.0
1150	2064	5.0
1150	905		5.0
1150	1094	5.0
1150	1256	5.0
Time taken: 0.128 seconds, Fetched: 10 row(s)

--3) 她喜歡的10部電影的平均影評 (每一部電影的平均影評)
分組:MovieID ->answer05_step02表
求 avg -> ratings表
select 
a.MovieID MovieID,avg(b.Rating) avgrate 
from answer05_step02 a left join ratings b 
on a.MovieID=b.MovieID 
group by a.MovieID;
--結果
movieid	avgrate
745		4.52054794520548
750		4.4498902706656915
904		4.476190476190476
905		4.280748663101604
1094	3.7314890154597236
1236	4.188888888888889
1256	4.21043771043771
1279	3.747422680412371
2064	4.0739348370927315
2997	4.125390450691656

----上面3步聯合查詢就是
select 
a.MovieID MovieID,avg(b.Rating) avgrate 
from 
(
select 
a.UserID UserID,a.MovieID MovieID,a.Rating Rating 
from ratings a join 
(
select
a.UserID UserID,count(a.Rating) totalRate
from ratings a join users b 
on a.UserID=b.UserID
where b.Gender="F"
group by a.UserID
order by totalRate desc limit 1
) b on 
a.UserID=b.UserID 
order by Rating desc limit 10
) 
a left join ratings b 
on a.MovieID=b.MovieID 
group by a.MovieID;

--結果
movieid	avgrate
162		4.063136456211812
904		4.476190476190476
951		4.249370277078086
1230	4.14167916041979
1966	3.6464646464646466
2330	4.163043478260869
3163	3.7039473684210527
3307	4.387453874538745
3671	4.047363717605005
3675	3.8265682656826567


---聯合查詢時候,會替換上邊兒分步表,那我們實驗一下,表別名重複影響不影響?
--實驗發現他們局部表名,不影響
select 
a.MovieID MovieID,avg(b.Rating) avgrate 
from 
(
select 
c.UserID UserID,c.MovieID MovieID,c.Rating Rating 
from ratings c join 
(
select 
e.UserID UserID,count(e.Rating) totalRate 
from ratings e join users f on e.UserID=f.UserID 
where f.Gender="F" 
group by e.UserID 
order by totalRate desc limit 1
) d 
on 
c.UserID=d.UserID 
order by Rating desc limit 10
) a left join ratings b 
on a.MovieID=b.MovieID 
group by a.MovieID;

--結果
movieid	avgrate
162	4.063136456211812
904	4.476190476190476
951	4.249370277078086
1230	4.14167916041979
1966	3.6464646464646466
2330	4.163043478260869
3163	3.7039473684210527
3307	4.387453874538745
3671	4.047363717605005
3675	3.8265682656826567


---但是,我發現聯合查詢與分步查詢的結果不一致???

6. 求好片(評分>=4.0)最多的那個年份(上映)的最好看的10部電影

----分析
--好片最多的那個年份?
--每一部電影的平均評分
--每一年評分>=4.0的電影部數   按這個部數排序 desc limit1

--1.每一年每一部電影的平均評分
分組:年份	電影 -> movies表
求: avg(Ratings) ->ratings表
--電影表數據是這樣的,電影名稱截取年份
movies.movieid	movies.title	movies.genres
1	Toy Story (1995)	Animation|Children's|Comedy

create table year_movie_avgrate as 
select
substr(a.Title,-5,4) year,a.MovieID MovieID,a.Title Title,avg(b.Rating) avgrate 
from movies a join ratings b
on a.MovieID=b.MovieID
group by a.MovieID,a.Title,substr(a.Title,-5,4);
--year_movie_avgrate 存入數據樣例
year	movieid	title	avgrate
1995	1	Toy Story (1995)	4.146846413095811
1995	2	Jumanji (1995)	3.20114122681883
1995	3	Grumpier Old Men (1995)	3.01673640167364


--2. 好片最多的那個年份
分組:年份
過濾 >=4.0
求:count()
排序:count() desc limit 1

select 
year,count(*) totalMovie 
from year_movie_avgrate 
where avgrate>=4.0 
group by year 
order by totalMovie desc limit 1;
--結果
year	totalmovie
1998	27

--3. 求這一年最好看的10部電影
過濾:1998 ->year_movie_avgrate表
排序:評分 倒序 limit 10
select 
b.MovieID MovieID,b.Title Title,b.avgrate avgrate 
from (select 
year,count(*) totalMovie 
from year_movie_avgrate 
where avgrate>=4.0 
group by year 
order by totalMovie desc limit 1) a 
left join year_movie_avgrate b 
on a.year=b.year 
order by avgrate desc limit 10;
--結果
movieid	title	avgrate
1830	Follow the Bitch (1998)	5.0
2503	Apple, The (Sib) (1998)	4.666666666666667
2309	Inheritors, The (Die Siebtelbauern) (1998)	4.5
2930	Return with Honor (1998)	4.4
2028	Saving Private Ryan (1998)	4.337353938937053
2360	Celebration, The (Festen) (1998)	4.3076923076923075
2839	West Beirut (West Beyrouth) (1998)	4.3
2357	Central Station (Central do Brasil) (1998)	4.283720930232558
3077	42 Up (1998)	4.2272727272727275
2329	American History X (1998)	4.2265625


7. 求1997年上映的電影中,評分最高的10部Comedy類電影

----分析
year_movie_avgrate表:  year movieid  title avgrate
過濾:1997& Comedy  -> movies表
排序:評分	desc limit 10

select 
a.year year,a.MovieID MovieID,a.Title Title,a.avgrate avgrate,b.Genres Genres 
from year_movie_avgrate a join movies b 
on a.MovieID=b.MovieID 
where a.year="1997" and instr(lower(b.Genres),"comedy")>0 
order by avgrate desc limit 10;

--結果
year	movieid	title	avgrate	genres
1997	2324	Life Is Beautiful (La Vita � bella) (1997)	4.329861111111111	Comedy|Drama
1997	2444	24 7: Twenty Four Seven (1997)	4.0	Comedy|Drama
1997	1827	Big One, The (1997)	4.0	Comedy|Documentary
1997	1871	Friend of the Deceased, A (1997)	4.0	Comedy|Drama
1997	1784	As Good As It Gets (1997)	3.9501404494382024	Comedy|Drama
1997	2618	Castle, The (1997)	3.891304347826087	Comedy
1997	1641	Full Monty, The (1997)	3.872393661384487	Comedy
1997	1564	Roseanna's Grave (For Roseanna) (1997)	3.8333333333333335	Comedy|Romance
1997	1734	My Life in Pink (Ma vie en rose) (1997)	3.825870646766169	Comedy|Drama
1997	1500	Grosse Pointe Blank (1997)	3.813380281690141	Comedy|Crime

8. 該影評庫中各種類型電影中評價最高的5部電影(類型,電影名,平均影評分)

----分析
分組:類型 MovieID ->movies表
求:avg ->ratings表
--1. 炸裂,電影類型
select 
MovieID,Title,tps.tp type 
from movies 
lateral view explode(split(Genres,"\\|")) tps as tp; 
-- 數據
movieid	title	type
1	Toy Story (1995)	Animation
1	Toy Story (1995)	Children's
1	Toy Story (1995)	Comedy
2	Jumanji (1995)	Adventure
2	Jumanji (1995)	Children's

--2. 關聯求 每種類型 每一部電影的平均分
select 
lower(a.type) type,a.MovieID MovieID,a.Title Title,avg(b.Rating) avgrate 
from (
select 
MovieID,Title,tps.tp type 
from movies 
lateral view explode(split(Genres,"\\|")) tps as tp
) a join ratings b
on a.MovieID=b.MovieID
group by lower(a.type),a.MovieID,a.Title;

--結果
type	movieid	title	avgrate
action	6	Heat (1995)	3.8787234042553194
action	9	Sudden Death (1995)	2.656862745098039
action	10	GoldenEye (1995)	3.5405405405405403
action	15	Cutthroat Island (1995)	2.458904109589041
action	20	Money Train (1995)	2.5375

--3. 分組求topN
分組:type
排序:avgrate desc  

select * from(
select 
c.type type,c.MovieID MovieID,c.Title Title,c.avgrate avgrate,row_number() over(partition by c.type order by c.avgrate desc) index 
from(
select 
lower(a.type) type,a.MovieID MovieID,a.Title Title,avg(b.Rating) avgrate 
from (
select 
MovieID,Title,tps.tp type 
from movies 
lateral view explode(split(Genres,"\\|")) tps as tp
) a join ratings b
on a.MovieID=b.MovieID
group by lower(a.type),a.MovieID,a.Title
) c
)d where index<=5;

--結果:
d.type	d.movieid	d.title	d.avgrate	d.index
action	2905	Sanjuro (1962)	4.608695652173913	1
action	2019	Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)	4.560509554140127	2
action	858	Godfather, The (1972)	4.524966261808367	3
action	1198	Raiders of the Lost Ark (1981)	4.477724741447892	4
action	260	Star Wars: Episode IV - A New Hope (1977)	4.453694416583082	5
adventure	3172	Ulysses (Ulisse) (1954)	5.0	1
adventure	2905	Sanjuro (1962)	4.608695652173913	2
adventure	1198	Raiders of the Lost Ark (1981)	4.477724741447892	3
adventure	260	Star Wars: Episode IV - A New Hope (1977)	4.453694416583082	4
adventure	1204	Lawrence of Arabia (1962)	4.401925391095066	5

9. 各年評分最高的電影類型(年份,類型,影評分)

----分析
--每一年 每一類電影的平均分
分組:年 電影類型 ->movies表
求:avg ->ratings表
--1. 炸裂 類型
select 
MovieID,substr(Title,-5,4) year,tps.tp type 
from movies lateral view explode(split(Genres,"\\|")) tps as tp;
--
movieid	year	type
1	1995	Animation
1	1995	Children's
1	1995	Comedy
2	1995	Adventure
2	1995	Children's

--2. 關聯 求每一年 每一類電影的平均分
select 
a.year year,lower(a.type) type,avg(b.Rating) avgrate 
from 
(select 
MovieID,substr(Title,-5,4) year,tps.tp type 
from movies lateral view explode(split(Genres,"\\|")) tps as tp) a 
join ratings b on a.MovieID=b.MovieID 
group by a.year,lower(a.type);

--
year	type	avgrate
1919	action	2.5
1919	adventure	2.6666666666666665
1919	comedy	3.6315789473684212
1919	drama	2.5714285714285716
1920	comedy	3.6666666666666665
1921	action	3.7903225806451615
1922	horror	3.991596638655462

--3. 分組topN
局部:分組   年
排序:  avgrate  desc 
select * from 
(select 
year,type,avgrate,row_number() over(distribute by year sort by avgrate desc) index 
from (
select 
a.year year,lower(a.type) type,avg(b.Rating) avgrate 
from 
(select 
MovieID,substr(Title,-5,4) year,tps.tp type 
from movies lateral view explode(split(Genres,"\\|")) tps as tp) a 
join ratings b on a.MovieID=b.MovieID 
group by a.year,lower(a.type)
) c) d where index=1;
-----
d.year	d.type	d.avgrate	d.index
1919	comedy	3.6315789473684212	1
1920	comedy	3.6666666666666665	1
1921	action	3.7903225806451615	1
1922	horror	3.991596638655462	1
1923	comedy	3.4444444444444446	1
1925	comedy	4.124223602484472	1
1926	crime	4.5	1
1927	comedy	4.368932038834951	1
1928	comedy	3.888888888888889	1

10. 每個地區最高評分的電影名,把結果存入HDFS(地區,電影名,影評分)

---分析
--每個地區 每一部電影的平均評分
分組:地區(郵編)->users表 電影id->ratings表 電影名->movies表
求:平均分 ->ratings表

select 
a.Zipcode zipcode,b.MovieID MovieID,avg(b.Rating) avgrate 
from users a join ratings b  on a.UserID=b.UserID join 
movies c on b.MovieID=c.MovieID 
group by a.zipcode,b.MovieID,c.Title;

--
zipcode	movieid	avgrate
00231	3	4.0
00231	7	5.0
00231	11	4.0
00231	17	3.0
00231	39	4.0
00231	46	3.0

--每個地區評分最高的 分組取TopN
分組:地區
排序: 評分 desc
select * from 
(select 
zipcode,movieid,avgrate,row_number() over(partition by zipcode order by avgrate desc) index 
from (
select 
a.Zipcode zipcode,b.MovieID MovieID,avg(b.Rating) avgrate 
from users a join ratings b  on a.UserID=b.UserID join 
movies c on b.MovieID=c.MovieID 
group by a.zipcode,b.MovieID,c.Title
) d ) e where index=1;

----
e.zipcode	e.movieid	e.avgrate	e.index
00231	2006	5.0	1
00606	3937	5.0	1
00681	3793	5.0	1
00693	2916	5.0	1
00918	2706	5.0	1

發佈了84 篇原創文章 · 獲贊 12 · 訪問量 10萬+
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章