Sqoop Hands-On Examples: Summary

Installation Notes

Set the following environment variables in conf/sqoop-env.sh:

#Set path to where bin/hadoop is available
export HADOOP_COMMON_HOME=/app/bigdata/hadoop

#Set path to where hadoop-*-core.jar is available
export HADOOP_MAPRED_HOME=/app/bigdata/hadoop

#Set the path to where bin/hbase is available
export HBASE_HOME=/app/bigdata/hbase

#Set the path to where bin/hive is available
export HIVE_HOME=/app/bigdata/hive

#Set the path for where zookeeper config dir is
export ZOOCFGDIR=/app/bigdata/zookeeper
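
After setting these, a quick sanity check from the Sqoop bin directory (the same place the list commands below are run from) is:

./sqoop version

It should print the installed Sqoop version without complaining about missing Hadoop paths.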

To connect to MySQL, the server must allow remote logins; if it does not, grant access first:

GRANT ALL PRIVILEGES ON *.* TO 'root'@'%' IDENTIFIED BY 'root' WITH GRANT OPTION;

# List all databases available through the given JDBC connection

./sqoop-list-databases --connect jdbc:mysql://192.168.10.107:3306 --username root --password root

# List all tables in the specified database xlhdw

./sqoop-list-tables --connect jdbc:mysql://192.168.10.107:3306/xlhdw --username root --password root

Notes:

Sqoop jobs must run on YARN (Sqoop 1 submits its work as MapReduce jobs).
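
A quick way to check this on the machine that launches Sqoop (a minimal sketch, assuming the standard Hadoop 2.x configuration layout under the HADOOP_COMMON_HOME set above) is to verify that mapreduce.framework.name is set to yarn in mapred-site.xml:

# sketch: the config path below assumes the default etc/hadoop layout
grep -A 1 "mapreduce.framework.name" /app/bigdata/hadoop/etc/hadoop/mapred-site.xml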

Detailed Examples

================== IMPORT: into HDFS / Hive =================
/**  Import a MySQL table into the HDFS file system **/
sqoop import \
--connect jdbc:mysql://192.168.10.107:3306/userdb \
--username root \
--password root \
--target-dir /sqooptest/emp \
--fields-terminated-by ',' \
--table emp \
--split-by id \
--m 2

Resulting data:
[root@COLBY-NN-101 bigdata]# hdfs dfs -cat /sqooptest/emp/*
1201,gopal,manager,50000,TP
1202,manisha,Proof reader,50000,TP
1203,khalil,php dev,30000,AC
1204,prasanth,php dev,30000,AC
1205,kranthi,admin,20000,TP


/**  Import a MySQL table into a Hive table; Sqoop creates the Hive table schema automatically **/
sqoop import \
--connect jdbc:mysql://192.168.10.107:3306/userdb \
--username root \
--password root \
--hive-import \
--fields-terminated-by ',' \
--table emp \
--split-by id \
--m 2
Resulting data:
hive> select * from emp;
OK
1201    gopal   manager 50000   TP
1202    manisha Proof reader    50000   TP
1203    khalil  php dev 30000   AC
1204    prasanth        php dev 30000   AC
1205    kranthi admin   20000   TP
Time taken: 1.586 seconds, Fetched: 5 row(s)

/**  Same Hive import pattern, for table cdm_ent_dto_business_change_d in database xlhdw **/
sqoop import \
--connect jdbc:mysql://192.168.10.107:3306/xlhdw \
--username root \
--password root \
--hive-import \
--fields-terminated-by ',' \
--table cdm_ent_dto_business_change_d \
--split-by reg_credit_no \
--m 2


/**  Incrementally import newly added rows of a MySQL table into HDFS **/
sqoop import \
--connect jdbc:mysql://192.168.10.107:3306/userdb \
--target-dir /sqooptest  \
--username root \
--password root \
--table emp \
--m 1 \
--incremental append \
--check-column id \
--last-value 1205

======================== Conditional Import =================
/**  Import only the rows matching a WHERE condition from MySQL into Hive **/
sqoop import \
--connect jdbc:mysql://192.168.10.107:3306/userdb \
--username root \
--password root \
--where "id='1201'" \
--hive-import \
--fields-terminated-by ',' \
--table emp \
--split-by id \
--m 2

/**  Import the result of a free-form query from MySQL into HDFS; the query must contain the literal $CONDITIONS token, which Sqoop replaces with each mapper's split predicate **/
sqoop import \
--connect jdbc:mysql://192.168.10.107:3306/userdb \
--username root \
--password root \
--target-dir /wherequery2 \
--query 'select id,name,deg from emp WHERE id<1207  and $CONDITIONS' \
--split-by id \
--fields-terminated-by '\t' \
--m 2

=============== Incremental Import ======================
Incremental import is a technique that imports only the rows newly added to a table.
Sqoop supports two incremental modes for importing from MySQL into Hive:
	one is append, which relies on a monotonically increasing column, e.g.:
--incremental append  --check-column num_id --last-value 0
	the other uses a timestamp column (lastmodified), e.g.:
--incremental lastmodified --check-column created --last-value '2012-02-01 11:00:00'
which imports only the rows whose created value is greater than '2012-02-01 11:00:00' (a full command sketch follows the append example below).

1. append mode
It requires the 'incremental', 'check-column', and 'last-value' options to perform the incremental import.
The following syntax is used for the incremental options of the Sqoop import command:
--incremental <mode>
--check-column <column name>
--last-value <the value the previous import stopped at for the check column>


/* Append incremental data for the emp table; just point --target-dir at emp's existing directory */
sqoop import \
--connect jdbc:mysql://192.168.10.107:3306/userdb \
--username root \
--password root \
--target-dir /user/hive/warehouse/emp \
--table emp --m 1 \
--incremental append \
--check-column id \
--last-value 1205
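
2. lastmodified mode
A complete command for the lastmodified fragment shown above might look like the following sketch. The table name orders and its created timestamp column are illustrative assumptions (the emp table used in the other examples has no timestamp column), and recent Sqoop 1.4.x releases also require --append or --merge-key in this mode when the target directory already exists:

sqoop import \
--connect jdbc:mysql://192.168.10.107:3306/userdb \
--username root \
--password root \
--table orders \
--target-dir /sqooptest/orders \
--m 1 \
--incremental lastmodified \
--check-column created \
--last-value '2012-02-01 11:00:00' \
--append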


===================== EXPORT: to MySQL ==============
/**  Export file data from HDFS into a MySQL table **/
bin/sqoop export \
--connect jdbc:mysql://192.168.10.107:3306/userdb \
--username root \
--password root \
--input-fields-terminated-by ',' \
--table emp \
--export-dir /sqooptest/


/**  Export a Hive table's data (its files on HDFS) to MySQL **/
Note: specify the field delimiter of the input files, and do not pass the Hive table name directly; point --export-dir at the HDFS path where the table's data is stored.
sqoop export \
--connect jdbc:mysql://192.168.10.107:3306/xlhdw \
--username root \
--password root \
--input-fields-terminated-by ',' \
--table cdm_ent_dto_business_change_d \
--export-dir /user/hive/warehouse/cdm_ent_dto_business_change_d/


-- TODO -----
Export the daily new-user dimension report dim_user_new_day from the app data warehouse into a MySQL table
----------------------------------------
create table dim_user_new_day(os_name string,city string,release_channel string,app_ver_name string,cnts int)
partitioned by (day string, dim string);

-- Build the daily new-user dimension report in Hive (using multi-insert syntax)
from etl_user_new_day

insert into table dim_user_new_day partition(day='2017-09-21',dim='0000')
select 'all','all','all','all',count(1)
where day='2017-09-21'

insert into table dim_user_new_day partition(day='2017-09-21',dim='0001')
select 'all','all','all',app_ver_name,count(1)
where day='2017-09-21'
group by app_ver_name

insert into table dim_user_new_day partition(day='2017-09-21',dim='0010')
select 'all','all',release_channel,'all',count(1)
where day='2017-09-21'
group by release_channel

insert into table dim_user_new_day partition(day='2017-09-21',dim='0011')
select 'all','all',release_channel,app_ver_name,count(1)
where day='2017-09-21'
group by release_channel,app_ver_name

insert into table dim_user_new_day partition(day='2017-09-21',dim='0100')
select 'all',city,'all','all',count(1)
where day='2017-09-21'
group by city;
----------------------------------------
-- 1 Create the database and table in MySQL
create database app;
create table dim_user_new_day(
os_name varchar(20),city varchar(20),release_channel varchar(20),app_ver_name varchar(20),cnts int,dt varchar(20)
);

-- Note: change the character set of the database and table to utf8, with the following commands:
Change the database character set:
mysql> alter database db_name character set utf8;
Change the table character set:
mysql> ALTER TABLE table_name CONVERT TO CHARACTER SET utf8 COLLATE utf8_general_ci;


-- 2 Use sqoop to export the specified day partition of the Hive table dim_user_new_day into the MySQL table dim_user_new_day
Exporting a Hive table with multi-level partitions can fail with an exception such as:
"Error: java.io.IOException: Can't export data, please check failed map task logs"
The workaround is to store the data in a single-level partition first and then export it to MySQL or another RDBMS; a Hive sketch of this step follows the command below.
Command mode (note the field delimiter):
sqoop export \
--connect "jdbc:mysql://192.168.10.107:3306/app?useUnicode=true&characterEncoding=utf-8" \
--username root \
--password root \
--input-fields-terminated-by '\001' \
--table dim_user_new_day \
--export-dir /user/hive/warehouse/dim_user_new_day/day=2017-09-21/dim*/*
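
The script mode below exports from a single-partition table dim_user_new_day_1p in the app Hive database. The original notes do not show how that table is built; the Hive statements here are only a sketch of one way to do it, with the column layout inferred from the MySQL table created above (dt carries the day value, and the partition column day matches the day=${day} path used in the script). Hive's default text format delimits fields with '\001', which is why the export specifies --input-fields-terminated-by '\001'.

-- sketch only: table name and layout inferred from the export script below
create table if not exists app.dim_user_new_day_1p(
os_name string,
city string,
release_channel string,
app_ver_name string,
cnts int,
dt string)
partitioned by (day string);

insert overwrite table app.dim_user_new_day_1p partition(day='2017-09-21')
select os_name,city,release_channel,app_ver_name,cnts,day
from dim_user_new_day
where day='2017-09-21';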



----- Script mode
#!/bin/bash
# yesterday's date (the partition to export)
day=`date -d '-1 day' +'%Y-%m-%d'`

/root/apps/sqoop/bin/sqoop export \
--connect "jdbc:mysql://192.168.10.107:3306/app?useUnicode=true&characterEncoding=utf-8" \
--username root \
--password root \
--input-fields-terminated-by '\001' \
--table dim_user_new_day \
--export-dir /user/hive/warehouse/app.db/dim_user_new_day_1p/day=${day}

 
