Oozie是Hadoop平臺中的任務調度系統,可以將不同類型的作業串聯起來。Oozie中的核心概念稱爲workflow,即工作流,每種類型的作業都是一個工作流。Oozie中已經集成的workflow包括hive、spark、hdfs、distcp等,但有時我們需要擴展workflow,添加業務需要的邏輯。這裏介紹擴展workflow的一般步驟。
這裏以擴展livy的workflow爲例,主要功能是通過sparksql執行用戶的sql語句。
首先需要繼承ActionExecutor類,並重寫其方法,在對應的方法中完成與livy server交互的邏輯:
/**
 * Oozie action executor for the custom "livy" action type.
 *
 * The executor reads the action XML, submits a batch through [[LivyService]],
 * and then polls the batch state, mirroring it into the Oozie action.
 * Per-action bookkeeping (session tuple, log offset, "external id updated"
 * flag) is kept in maps on the `LivyActionExecutor` companion object.
 *
 * NOTE(review): in the code shown here entries are only ever added to the
 * companion maps, never removed — confirm whether cleanup happens elsewhere.
 */
class LivyActionExecutor extends ActionExecutor("livy") {
  private val logger = LoggerFactory.getLogger(classOf[LivyActionExecutor])
  private val LOG = XLog.getLog(getClass)

  // Element names read from the action XML (see livy-action-0.1.xsd).
  private val SCRIPT = "script"
  private val PROXY_USER = "proxyUser"
  private val PARAM = "param"
  private val QUEUE = "queue"
  private val CONFIGURATION = "configuration"
  private val DRIVER_MEMORY = "driverMemory"
  private val EXECUTOR_MEMORY = "executorMemory"
  private val EXECUTOR_NUMBERS = "numExecutors"

  /**
   * Builds the request from the user-configured action XML and submits it to
   * the Livy server, then records the session and performs a first check.
   *
   * @param context Oozie execution context that receives the start data
   * @param action  the workflow action whose configuration XML is parsed
   * @throws RuntimeException if `LivyService.submitTask` returns null
   */
  override def start(context: ActionExecutor.Context, action: WorkflowAction): Unit = {
    LogUtils.setLogInfo(action)
    LOG.warn(XLog.STD, "oozie workflow start")
    val actionXml = XmlUtils.parseXml(action.getConf)
    val ns = actionXml.getNamespace
    val scriptFile = actionXml.getChildText(SCRIPT, ns)
    val proxyUser = actionXml.getChildText(PROXY_USER, ns)

    val params = actionXml.getChildren(PARAM, ns).asInstanceOf[java.util.List[Element]]
    var paramsList = List[String]()
    for (param <- params) {
      paramsList = paramsList :+ param.getTextTrim
    }

    // Values are Any because the nested "conf" entry below is itself a Map.
    // NOTE(review): assumes LivyService.submitTask accepts Map[String, Any] — confirm.
    var map: Map[String, Any] = Map()

    // Optional scalar settings: copied through only when present and non-empty.
    for (key <- Seq(DRIVER_MEMORY, EXECUTOR_MEMORY, EXECUTOR_NUMBERS, QUEUE)) {
      val value = actionXml.getChildText(key, ns)
      if (StringUtils.isNotEmpty(value)) {
        map += (key -> value)
      }
    }

    // <configuration> is optional in the schema, so getChild may return null.
    val configurations = actionXml.getChild(CONFIGURATION, ns)
    var configMap: Map[String, String] = Map()
    if (configurations != null) {
      val properties = configurations.getChildren("property", ns).asInstanceOf[java.util.List[Element]]
      for (property <- properties) {
        val name = property.getChildText("name", ns)
        val value = property.getChildText("value", ns)
        configMap += (name -> value)
      }
    }
    if (configMap.nonEmpty) {
      map += ("conf" -> configMap)
    }

    val livyService = Services.get.get(classOf[LivyService])
    val sessionTuple = livyService.submitTask(scriptFile, proxyUser, map, paramsList)
    if (sessionTuple == null) {
      throw new RuntimeException("submit livy task failed")
    }
    LOG.warn(XLog.STD, s"batch session created : ${sessionTuple._2}, proxyUser: $proxyUser, scriptFile: $scriptFile, wfId: ${action.getId}")

    // Register the new session: log offset starts at 0, external id not yet stored.
    LivyActionExecutor.oozieIdToSessionTuple += (action.getId -> sessionTuple)
    LivyActionExecutor.sessionTupleToLogIndex += (sessionTuple -> 0)
    LivyActionExecutor.sessionTupleIdUpdate += (sessionTuple -> false)

    // The tracker URI encodes node and batch id; check() parses it back during recovery.
    val trackerUri = s"http://${sessionTuple._1}/batches/${sessionTuple._2}"
    context.setStartData(s"livy-batch-${sessionTuple._2}", trackerUri, "-")

    // Pause before the first poll (presumably to let Livy register the batch).
    Thread.sleep(5000)
    check(context, action)
  }

  /**
   * Translates the action's external status into the final Oozie status:
   * "OK" maps to OK, anything else (including a missing status) to ERROR.
   */
  override def end(context: ActionExecutor.Context, action: WorkflowAction): Unit = {
    // Scala's == is null-safe, so a null external status yields ERROR instead of an NPE.
    val status =
      if (action.getExternalStatus == "OK") WorkflowAction.Status.OK
      else WorkflowAction.Status.ERROR
    context.setEndData(status, getActionSignal(status))
  }

  /**
   * Polls the Livy batch behind `wfAction` and updates the Oozie action.
   *
   * If the action id is unknown to the in-memory maps, the session is rebuilt
   * from the stored tracker URI. Any exception is logged and swallowed.
   */
  override def check(context: ActionExecutor.Context, wfAction: WorkflowAction): Unit = {
    try {
      LogUtils.setLogInfo(wfAction)
      val wfId = wfAction.getId
      val action = WorkflowActionQueryExecutor.getInstance.get(WorkflowActionQuery.GET_ACTION, wfId)

      // Recovery path: rebuild the session tuple from "http://<node>/batches/<id>".
      if (!LivyActionExecutor.oozieIdToSessionTuple.contains(wfId)) {
        val trackUri = action.getTrackerUri
        if (StringUtils.isNotEmpty(trackUri)) {
          val livyNode = StringUtils.substringBetween(trackUri, "http://", "/batches/")
          val sessionId = StringUtils.substringAfter(trackUri, "/batches/")
          val sessionTuple = (livyNode, sessionId.toInt)
          LivyActionExecutor.oozieIdToSessionTuple += (wfId -> sessionTuple)
          LivyActionExecutor.sessionTupleToLogIndex += (sessionTuple -> 200)
          LivyActionExecutor.sessionTupleIdUpdate += (sessionTuple -> false)
          logger.info(s"recovery wfId: $wfId, livyNode: $livyNode, sessionId: $sessionId")
        }
      }

      LivyActionExecutor.oozieIdToSessionTuple.get(wfId).foreach(sessionTuple => {
        val livyService = Services.get.get(classOf[LivyService])
        val state = livyService.getTaskState(sessionTuple._1, sessionTuple._2)
        logger.info(s"check livy batch session ${sessionTuple._2} state: $state")

        // Store the external id / console URL once, as soon as both are available.
        val update = LivyActionExecutor.sessionTupleIdUpdate.getOrElse(sessionTuple, false)
        if (!update) {
          val tuple3 = livyService.getSessionState(sessionTuple._1, sessionTuple._2)
          if (tuple3 != null) {
            val appId = tuple3._2
            val sparkUiUrl = tuple3._3
            if (StringUtils.isNotEmpty(appId) && StringUtils.isNotEmpty(sparkUiUrl)) {
              // Set the workflow's child job id and URL.
              action.setExternalId(appId)
              action.setConsoleUrl(sparkUiUrl)
              WorkflowActionQueryExecutor.getInstance.executeUpdate(WorkflowActionQuery.UPDATE_ACTION, action)
              logger.info(s"WorkflowAction externalId: $appId, consoleUrl: $sparkUiUrl")
              LivyActionExecutor.sessionTupleIdUpdate += (sessionTuple -> true)
            }
          }
        }

        state match {
          case null => context.setExecutionData("FAILED", null)
          case "success" => context.setExecutionData("OK", null)
          case "killed" => context.setExecutionData("KILLED", null)
          case "running" => logger.info(s"batch session ${sessionTuple._2} is in state $state")
          case "starting" | "error" | "dead" =>
            val logIndexStart = LivyActionExecutor.sessionTupleToLogIndex.getOrElse(sessionTuple, 0)
            val logs = livyService.getSessionLog(sessionTuple._1, sessionTuple._2, logIndexStart, 200)
            if (logs != null) {
              for (log <- logs) {
                LOG.warn(XLog.STD, log)
              }
              // Advance the offset by what was just read so lines are not fetched twice.
              LivyActionExecutor.sessionTupleToLogIndex += (sessionTuple -> (logIndexStart + logs.size))
            }
            if (StringUtils.equals(state, "error") || StringUtils.equals(state, "dead")) {
              context.setExecutionData("FAILED", null)
            }
          case _ =>
        }
      })
    } catch {
      case e: Exception => logger.error(s"check action error: ", e)
    }
  }

  /**
   * Kills the Livy batch registered for this action, if one is known, and
   * marks the action's external status as "KILLED". Does nothing when the
   * action id has no entry in the in-memory session map.
   */
  override def kill(context: ActionExecutor.Context, action: WorkflowAction): Unit = {
    val wfId = action.getId
    LivyActionExecutor.oozieIdToSessionTuple.get(wfId).foreach(sessionTuple => {
      val livyService = Services.get.get(classOf[LivyService])
      livyService.killTask(sessionTuple._1, sessionTuple._2)
      logger.info(s"kill livy batch session ${sessionTuple._2}")
      context.setExternalStatus("KILLED")
    })
  }

  /** Always reports the given external status as completed. */
  override def isCompleted(s: String): Boolean = {
    true
  }
}
在resource目錄中新建livy-action-0.1.xsd文件:
<!-- Schema for the custom Oozie "livy" workflow action, version 0.1. -->
<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema"
           xmlns:livy="uri:oozie:livy-action:0.1"
           targetNamespace="uri:oozie:livy-action:0.1"
           elementFormDefault="qualified">

    <!-- Root element of the action body. -->
    <xs:element name="livy" type="livy:ACTION"/>

    <!-- Child elements must appear in exactly this order. -->
    <xs:complexType name="ACTION">
        <xs:sequence>
            <!-- Optional settings. -->
            <xs:element name="queue" type="xs:string" minOccurs="0" maxOccurs="1"/>
            <xs:element name="driverMemory" type="xs:string" minOccurs="0" maxOccurs="1"/>
            <xs:element name="executorMemory" type="xs:string" minOccurs="0" maxOccurs="1"/>
            <xs:element name="numExecutors" type="xs:string" minOccurs="0" maxOccurs="1"/>
            <!-- Required: the user to run as and the script to execute. -->
            <xs:element name="proxyUser" type="xs:string" minOccurs="1" maxOccurs="1"/>
            <xs:element name="script" type="xs:string" minOccurs="1" maxOccurs="1"/>
            <!-- Up to ten script parameters. -->
            <xs:element name="param" type="xs:string" minOccurs="0" maxOccurs="10"/>
            <!-- Optional block of name/value properties. -->
            <xs:element name="configuration" type="livy:CONFIGURATION" minOccurs="0" maxOccurs="1"/>
        </xs:sequence>
    </xs:complexType>

    <!-- One or more name/value properties, each with an optional description. -->
    <xs:complexType name="CONFIGURATION">
        <xs:sequence>
            <xs:element name="property" minOccurs="1" maxOccurs="unbounded">
                <xs:complexType>
                    <xs:sequence>
                        <xs:element name="name" type="xs:string" minOccurs="1" maxOccurs="1"/>
                        <xs:element name="value" type="xs:string" minOccurs="1" maxOccurs="1"/>
                        <xs:element name="description" type="xs:string" minOccurs="0" maxOccurs="1"/>
                    </xs:sequence>
                </xs:complexType>
            </xs:element>
        </xs:sequence>
    </xs:complexType>
</xs:schema>
打包生成jar文件,將jar拷貝至oozie安裝路徑的lib目錄下面
修改conf目錄下的oozie-site.xml文件:
在oozie.service.SchemaService.wf.ext.schemas中添加livy-action-0.1.xsd
在oozie.service.ActionService.executor.ext.classes中添加自定義的action類
在oozie.services.ext中添加擴展的service類,這裏的service類是LivyService,可以把訪問livy server接口的邏輯放到該類裏。
重啓oozie,自定義的action即加載進去,可以執行相關的邏輯。