Kettle作爲一款優秀的數據抽取(ETL)工具,因其高效穩定的性能,一直被廣大使用者所喜愛,在國內也廣受好評。由於其本身使用純JAVA編寫,其JAVA API使用起來自然也非常簡便。雖然其自帶的組件已經非常好用,能夠滿足豐富的場景,但在某些場景下,我們仍需要通過代碼的方式來實現數據抽取。本篇我們將介紹Kettle的JAVA API的使用。
一、環境搭建
Pentaho官方倉庫:https://nexus.pentaho.org/content/groups/omni
核心jar包的pom.xml配置如下:
- <dependency>
- <groupId>pentaho-kettle</groupId>
- <artifactId>kettle-engine</artifactId>
- <version>4.4.0-stable</version>
- </dependency>
- <dependency>
- <groupId>pentaho-kettle</groupId>
- <artifactId>kettle-core</artifactId>
- <version>4.4.0-stable</version>
- </dependency>
- <dependency>
- <groupId>pentaho-kettle</groupId>
- <artifactId>kettle-db</artifactId>
- <version>4.4.0-stable</version>
- </dependency>
二、代碼部分
1、初始化環境
- public void initKettleEnvironment(HttpServletRequest request) throws KettleException {
- if (KettleEnvironment.isInitialized()) {
- return;
- }
- /**
- * 爲避免在部分網絡環境中無法完成初始化,需要自行處理
- */
- if (request == null) {
- // 運行環境初始化
- KettleEnvironment.init();
- } else {
- String userDir = System.getProperty("user.dir");
- String kettleHome = request.getSession().getServletContext().getRealPath(File.separator "WEB-INF");
- // 設置用戶路徑和系統環境,包括用戶路徑和主目錄
- System.setProperty("user.dir", kettleHome);
- System.setProperty("KETTLE_HOME", kettleHome);
- // 運行環境初始化
- KettleEnvironment.init();
- // 避免造成影響其他程序的運行,還原用戶路徑
- System.setProperty("user.dir", userDir);
- }
- }
2、創建轉化元
傳入一個或多個數據庫連接的XML配置,創建轉換元(TransMeta)並將這些數據庫連接添加到其中
- public TransMeta buildTransMeta(String metaName, String... transXML) throws KettleXMLException {
- TransMeta transMeta = new TransMeta();
- // 設置轉化元的名稱
- transMeta.setName(metaName);
- // 添加轉換的數據庫連接
- for (int i = 0; i < transXML.length; i ) {
- transMeta.addDatabase(new DatabaseMeta(transXML[i]));
- }
- return transMeta;
- }
3、添加日誌(可選操作)
- public void setStepLogTable(TransMeta transMeta, String connDbName, String tableName) {
- VariableSpace space = new Variables();
- // 將step日誌數據庫配置名加入到變量集中
- space.setVariable(Const.KETTLE_TRANS_LOG_DB, connDbName);
- space.initializeVariablesFrom(null);
- StepLogTable stepLogTable = StepLogTable.getDefault(space, transMeta);
- // 配置StepLogTable使用的數據庫配置名稱
- stepLogTable.setConnectionName(connDbName);
- // 設置Step日誌的表名
- stepLogTable.setTableName(tableName);
- // 設置TransMeta的StepLogTable
- transMeta.setStepLogTable(stepLogTable);
- }
4、創建插件註冊器
- public PluginRegistry getRegistry() {
- // 插件註冊,用於註冊轉換中需要用到的插件
- return PluginRegistry.getInstance();
- }
5、設置表輸入步驟元
該步驟用於獲取源數據
- /**
- * 設置表輸入步驟
- * @param transMeta
- * @param registry
- * @param sourceDbName
- * @param sql
- * @param stepName
- * @return
- */
- public StepMeta setTableInputStep(TransMeta transMeta, PluginRegistry registry, String sourceDbName, String sql,
- String stepName) {
- // 創建表輸入
- TableInputMeta tableInputMeta = new TableInputMeta();
- String pluginId = registry.getPluginId(StepPluginType.class, tableInputMeta);
- // 指定數據源數據庫配置名
- DatabaseMeta source = transMeta.findDatabase(sourceDbName);
- tableInputMeta.setDatabaseMeta(source);
- tableInputMeta.setSQL(sql);
- // 將表輸入添加到轉換中
- StepMeta stepMeta = new StepMeta(pluginId, stepName, tableInputMeta);
- // 給步驟添加在spoon工具中的顯示位置
- stepMeta.setDraw(true);
- stepMeta.setLocation(100, 100);
- // 將表輸入添加到步驟中
- transMeta.addStep(stepMeta);
- return stepMeta;
- }
6、更新步驟元
該步驟用於將獲取到的數據更新到目標數據庫中
- /**
- * 設置表輸出步驟,用於整表抽取
- * @param transMeta
- * @param registry
- * @param targetDbName
- * @param targetTableName
- * @param stepName
- * @return
- */
- public StepMeta setTableOutput(TransMeta transMeta, PluginRegistry registry, String targetDbName,
- String targetTableName, String stepName) {
- // 創建表輸出
- TableOutputMeta tableOutputMeta = new TableOutputMeta();
- String pluginId = registry.getPluginId(StepPluginType.class, tableOutputMeta);
- // 配置表輸出的目標數據庫配置名
- DatabaseMeta targetDb = transMeta.findDatabase(targetDbName);
- tableOutputMeta.setDatabaseMeta(targetDb);
- tableOutputMeta.setTableName(targetTableName);
- // 將表輸出添加到轉換中
- StepMeta stepMeta = new StepMeta(pluginId, stepName, tableOutputMeta);
- transMeta.addStep(stepMeta);
- return stepMeta;
- }
-
- /**
- * 設置表插入與更新步驟,用於表中部分字段更新
- * @param transMeta
- * @param registry
- * @param targetDbName
- * @param targetTableName
- * @param updatelookup lookup檢索字段
- * @param updateStream lookup更新字段
- * @param updateStream2 lookup更新字段2
- * @param conditions lookup條件
- * @param updateOrNot lookup更新標記
- * @param stepName
- * @return
- */
- public StepMeta setInsertUpdateMeta(TransMeta transMeta, PluginRegistry registry, String targetDbName,
- String targetTableName, String[] updatelookup, String[] updateStream, String[] updateStream2,
- String[] conditions, Boolean[] updateOrNot, String stepName) {
- // 創建插入與更新
- InsertUpdateMeta insertUpdateMeta = new InsertUpdateMeta();
- String pluginId = registry.getPluginId(StepPluginType.class, insertUpdateMeta);
- // 配置目標數據庫配置名
- DatabaseMeta database_target = transMeta.findDatabase(targetDbName);
- insertUpdateMeta.setDatabaseMeta(database_target);
- // 設置目標表名
- insertUpdateMeta.setTableName(targetTableName);
- // 設置用來查詢的關鍵字
- insertUpdateMeta.setKeyLookup(updatelookup);
- insertUpdateMeta.setKeyStream(updateStream);
- insertUpdateMeta.setKeyStream2(updateStream2);// 這一步不能省略
- insertUpdateMeta.setKeyCondition(conditions);
- // 設置要更新的字段
- insertUpdateMeta.setUpdateLookup(updatelookup);
- insertUpdateMeta.setUpdateStream(updateStream);
- insertUpdateMeta.setUpdate(updateOrNot);
- // 添加步驟到轉換中
- StepMeta stepMeta = new StepMeta(pluginId, stepName, insertUpdateMeta);
- stepMeta.setDraw(true);
- stepMeta.setLocation(250, 100);
- transMeta.addStep(stepMeta);
- return stepMeta;
- }
7、綁定關聯步驟
該步驟用於將數據獲取和導入更新的步驟關聯綁定
- /**
- * 用於將表輸入步驟與第二步驟綁定
- * @param transMeta
- * @param from
- * @param to
- */
- public void addTransHop(TransMeta transMeta, StepMeta from, StepMeta to) {
- transMeta.addTransHop(new TransHopMeta(from, to));
- }
8、執行抽取
執行數據抽取
- /**
- * 執行抽取
- * @param transMeta
- * @param targetDbName
- */
- public void executeTrans(TransMeta transMeta, String targetDbName) {
- try {
- Database database = new Database(null, transMeta.findDatabase(targetDbName));
- database.connect();
- Trans trans = new Trans(transMeta);
- trans.execute(new String[] { "start..." });
- trans.waitUntilFinished();
- // 關閉數據庫連接
- database.disconnect();
- if (trans.getErrors() > 0) {
- throw new RuntimeException("There were errors during transformation execution.");
- }
- } catch (KettleDatabaseException e) {
- e.printStackTrace();
- } catch (KettleException e) {
- e.printStackTrace();
- }
- }
9、抽取示例
數據庫配置xml:
- <?xml version="1.0" encoding="UTF-8"?>
- <connection>
- <name>smy</name>
- <server>127.0.0.1</server>
- <type>Mysql</type>
- <access>Native</access>
- <database>test_db</database>
- <port>3306</port>
- <username>root</username>
- <password>123456</password>
- <attributes>
- <attribute>
- <code>USE_POOLING</code>
- <attribute>Y</attribute>
- </attribute>
- <attribute>
- <code>EXTRA_OPTION_MYSQL.characterEncoding</code>
- <attribute>utf8</attribute>
- </attribute>
- <attribute>
- <code>EXTRA_OPTION_MYSQL.defaultFetchSize</code>
- <attribute>500</attribute>
- </attribute>
- </attributes>
- </connection>
運行函數:
- public static void main(String[] args) {
- try {
- KettleClient client = new KettleClient();
- client.initEnvironment(null);
- String transXML = ""; // 此處爲上例的數據庫配置
- TransMeta meta = client.buildTransMeta("kettle", transXML);
- PluginRegistry registry = client.getRegistry();
- StepMeta step1 = client.setTableInputStep(meta, registry, "kettle", "select * from test1", "table input");
- StepMeta step2 = client.setTableOutput(meta, registry, "kettle", "test2", "table insert");
- client.addTransHop(meta, step1, step2);
- client.executeTrans(meta, "kettle");
- } catch (KettleException e) {
- e.printStackTrace();
- }
- }