In the previous lesson we walked through how the Receiver is started. The Receiver is launched by the ReceiverSupervisor's start method:
/** Start the supervisor */
def start() {
  onStart()
  startReceiver()
}

override protected def onStart() {
  registeredBlockGenerators.foreach { _.start() }
}
registeredBlockGenerators is populated when the ReceiverSupervisor is instantiated: initializing defaultBlockGenerator goes through createBlockGenerator, which registers every generator it creates:
private val defaultBlockGenerator = createBlockGenerator(defaultBlockGeneratorListener)

override def createBlockGenerator(
    blockGeneratorListener: BlockGeneratorListener): BlockGenerator = {
  // Cleanup BlockGenerators that have already been stopped
  registeredBlockGenerators --= registeredBlockGenerators.filter{ _.isStopped() }
  val newBlockGenerator = new BlockGenerator(blockGeneratorListener, streamId, env.conf)
  registeredBlockGenerators += newBlockGenerator
  newBlockGenerator
}
onStart therefore calls each BlockGenerator's start method:
/** Start block generating and pushing threads. */
def start(): Unit = synchronized {
  if (state == Initialized) {
    state = Active
    blockIntervalTimer.start()
    blockPushingThread.start()
    logInfo("Started BlockGenerator")
  } else {
    throw new SparkException(
      s"Cannot start BlockGenerator as its not in the Initialized state [state = $state]")
  }
}
blockIntervalTimer is a recurring timer that invokes the updateCurrentBuffer function each time its interval elapses:
private val blockIntervalTimer =
  new RecurringTimer(clock, blockIntervalMs, updateCurrentBuffer, "BlockGenerator")

private val blockIntervalMs = conf.getTimeAsMs("spark.streaming.blockInterval", "200ms")
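To make the timer's behavior concrete, here is a minimal sketch of what a recurring timer does. This is a simplified stand-in, not Spark's actual RecurringTimer (which also supports a stop protocol with a final callback):

// Simplified sketch of a recurring timer: a daemon thread that invokes the
// callback once per period, passing the scheduled trigger time (the same
// shape as updateCurrentBuffer's `time: Long` parameter).
class SimpleRecurringTimer(periodMs: Long, callback: Long => Unit, name: String) {
  @volatile private var running = false
  private val thread = new Thread(name) {
    setDaemon(true)
    override def run() {
      var nextTime = System.currentTimeMillis() + periodMs
      while (running) {
        val sleepMs = nextTime - System.currentTimeMillis()
        if (sleepMs > 0) Thread.sleep(sleepMs)
        callback(nextTime)   // e.g. updateCurrentBuffer(nextTime)
        nextTime += periodMs
      }
    }
  }
  def start(): Unit = { running = true; thread.start() }
  def stop(): Unit = { running = false }
}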
blockPushingThread is a thread that keeps moving generated blocks out of the queue and into the BlockManager:
private val blockPushingThread = new Thread() { override def run() { keepPushingBlocks() } }

/** Keep pushing blocks to the BlockManager. */
private def keepPushingBlocks() {
  logInfo("Started block pushing thread")

  def areBlocksBeingGenerated: Boolean = synchronized {
    state != StoppedGeneratingBlocks
  }

  try {
    // While blocks are being generated, keep polling for to-be-pushed blocks and push them.
    while (areBlocksBeingGenerated) {
      Option(blocksForPushing.poll(10, TimeUnit.MILLISECONDS)) match {
        case Some(block) => pushBlock(block)
        case None =>
      }
    }

    // At this point, state is StoppedGeneratingBlock. So drain the queue of to-be-pushed blocks.
    logInfo("Pushing out the last " + blocksForPushing.size() + " blocks")
    while (!blocksForPushing.isEmpty) {
      val block = blocksForPushing.take()
      logDebug(s"Pushing block $block")
      pushBlock(block)
      logInfo("Blocks left to push " + blocksForPushing.size())
    }
    logInfo("Stopped block pushing thread")
  } catch {
    case ie: InterruptedException =>
      logInfo("Block pushing thread was interrupted")
    case e: Exception =>
      reportError("Error in block pushing thread", e)
  }
}
As the code shows, the thread polls the blocksForPushing queue with a 10 ms timeout and calls pushBlock on every block it dequeues:
private def pushBlock(block: Block) {
  listener.onPushBlock(block.id, block.buffer)
  logInfo("Pushed block " + block.id)
}
The listener here is the defaultBlockGeneratorListener supplied by the supervisor:

private val defaultBlockGeneratorListener = new BlockGeneratorListener {
  def onAddData(data: Any, metadata: Any): Unit = { }

  def onGenerateBlock(blockId: StreamBlockId): Unit = { }

  def onError(message: String, throwable: Throwable) {
    reportError(message, throwable)
  }

  def onPushBlock(blockId: StreamBlockId, arrayBuffer: ArrayBuffer[_]) {
    pushArrayBuffer(arrayBuffer, None, Some(blockId))
  }
}
So onPushBlock invokes pushArrayBuffer, which ultimately calls the following method:
/** Store block and report it to driver */
def pushAndReportBlock(
    receivedBlock: ReceivedBlock,
    metadataOption: Option[Any],
    blockIdOption: Option[StreamBlockId]
  ) {
  val blockId = blockIdOption.getOrElse(nextBlockId)
  val time = System.currentTimeMillis
  val blockStoreResult = receivedBlockHandler.storeBlock(blockId, receivedBlock)
  logDebug(s"Pushed block $blockId in ${(System.currentTimeMillis - time)} ms")
  val numRecords = blockStoreResult.numRecords
  val blockInfo = ReceivedBlockInfo(streamId, numRecords, metadataOption, blockStoreResult)
  trackerEndpoint.askWithRetry[Boolean](AddBlock(blockInfo))
  logDebug(s"Reported block $blockId")
}
The receivedBlockHandler used above is chosen when the supervisor is constructed, depending on whether the receiver write-ahead log is enabled:

private val receivedBlockHandler: ReceivedBlockHandler = {
  if (WriteAheadLogUtils.enableReceiverLog(env.conf)) {
    if (checkpointDirOption.isEmpty) {
      throw new SparkException(
        "Cannot enable receiver write-ahead log without checkpoint directory set. " +
          "Please use streamingContext.checkpoint() to set the checkpoint directory. " +
          "See documentation for more details.")
    }
    new WriteAheadLogBasedBlockHandler(env.blockManager, receiver.streamId,
      receiver.storageLevel, env.conf, hadoopConf, checkpointDirOption.get)
  } else {
    new BlockManagerBasedBlockHandler(env.blockManager, receiver.storageLevel)
  }
}
Either way, the data is ultimately handed to the BlockManager.
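As a usage sketch (the app name and checkpoint path are placeholders), taking the write-ahead-log branch requires both the config flag and a checkpoint directory:

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

// Sketch: enabling the receiver write-ahead log. Without a checkpoint
// directory, constructing receivedBlockHandler throws the SparkException above.
val conf = new SparkConf()
  .setAppName("WALDemo")  // placeholder app name
  .set("spark.streaming.receiver.writeAheadLog.enable", "true")
val ssc = new StreamingContext(conf, Seconds(1))
ssc.checkpoint("hdfs:///tmp/checkpoints")  // placeholder path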
blocksForPushing itself is defined as follows:
private val blockQueueSize = conf.getInt("spark.streaming.blockQueueSize", 10)
private val blocksForPushing = new ArrayBlockingQueue[Block](blockQueueSize)
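This is a classic bounded producer/consumer setup. A self-contained sketch of the same pattern (all names here are illustrative, not Spark's):

import java.util.concurrent.{ArrayBlockingQueue, TimeUnit}

// Sketch: the producer blocks on put() when the queue is full (natural
// back-pressure), while the consumer polls with a timeout so it can
// periodically re-check whether it should stop.
object QueueDemo extends App {
  val queue = new ArrayBlockingQueue[String](10)
  @volatile var stopped = false

  val pusher = new Thread("pusher") {
    override def run() {
      while (!stopped) {
        Option(queue.poll(10, TimeUnit.MILLISECONDS)) match {
          case Some(block) => println(s"pushed $block")
          case None        => // timed out; loop and re-check the flag
        }
      }
    }
  }
  pusher.start()

  (1 to 5).foreach { i => queue.put(s"block-$i") }  // blocks if full
  Thread.sleep(100)
  stopped = true
  pusher.join()
}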
And updateCurrentBuffer, which the timer invokes every block interval, swaps the buffer out and enqueues it as a block:

/** Change the buffer to which single records are added to. */
private def updateCurrentBuffer(time: Long): Unit = {
  try {
    var newBlock: Block = null
    synchronized {
      if (currentBuffer.nonEmpty) {
        val newBlockBuffer = currentBuffer
        currentBuffer = new ArrayBuffer[Any]
        val blockId = StreamBlockId(receiverId, time - blockIntervalMs)
        listener.onGenerateBlock(blockId)
        newBlock = new Block(blockId, newBlockBuffer)
      }
    }

    if (newBlock != null) {
      blocksForPushing.put(newBlock)  // put is blocking when queue is full
    }
  } catch {
    case ie: InterruptedException =>
      logInfo("Block updating timer thread was interrupted")
    case e: Exception =>
      reportError("Error in block updating thread", e)
  }
}
Now let's go back to the supervisor's startReceiver() method:
/** Start receiver */
def startReceiver(): Unit = synchronized {
  try {
    if (onReceiverStart()) {
      logInfo("Starting receiver")
      receiverState = Started
      receiver.onStart()
      logInfo("Called receiver onStart")
    } else {
      // The driver refused us
      stop("Registered unsuccessfully because Driver refused to start receiver " + streamId, None)
    }
  } catch {
    case NonFatal(t) =>
      stop("Error starting receiver " + streamId, Some(t))
  }
}
It calls the receiver's onStart method. Take SocketReceiver as an example:
def onStart() {
  // Start the thread that receives data over a connection
  new Thread("Socket Receiver") {
    setDaemon(true)
    override def run() { receive() }
  }.start()
}
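For context, this is the receiver behind StreamingContext.socketTextStream. A brief usage sketch (host, port, and intervals are placeholders):

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}

// socketTextStream builds a SocketInputDStream whose getReceiver() returns
// the SocketReceiver above, with bytesToObjects decoding UTF-8 text lines.
val conf = new SparkConf().setAppName("SocketDemo").setMaster("local[2]")  // one core for the receiver
val ssc = new StreamingContext(conf, Seconds(1))
val lines = ssc.socketTextStream("localhost", 9999, StorageLevel.MEMORY_AND_DISK_SER_2)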
Back in onStart: it spawns a new daemon thread named "Socket Receiver", which on startup invokes SocketReceiver's receive() method:
def receive() {
  var socket: Socket = null
  try {
    logInfo("Connecting to " + host + ":" + port)
    socket = new Socket(host, port)
    logInfo("Connected to " + host + ":" + port)
    val iterator = bytesToObjects(socket.getInputStream())
    while (!isStopped && iterator.hasNext) {
      store(iterator.next)
    }
    if (!isStopped()) {
      restart("Socket data stream had no more data")
    } else {
      logInfo("Stopped receiving")
    }
  } catch {
    case e: java.net.ConnectException =>
      restart("Error connecting to " + host + ":" + port, e)
    case NonFatal(e) =>
      logWarning("Error receiving data", e)
      restart("Error receiving data", e)
  } finally {
    if (socket != null) {
      socket.close()
      logInfo("Closed socket to " + host + ":" + port)
    }
  }
}
receive() opens a Socket and keeps reading from its InputStream, calling store once for each record received:
def store(dataItem: T) {
  supervisor.pushSingle(dataItem)
}
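Any custom receiver follows the same contract. A minimal, self-contained sketch (the class name and data are made up for illustration):

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.receiver.Receiver

// Sketch of a custom receiver: a daemon thread produces records and hands
// each one to store(), which forwards to supervisor.pushSingle just like
// SocketReceiver does.
class ConstantReceiver extends Receiver[String](StorageLevel.MEMORY_ONLY) {
  def onStart() {
    new Thread("Constant Receiver") {
      setDaemon(true)
      override def run() {
        while (!isStopped) {
          store("hello")     // ends up in BlockGenerator's currentBuffer
          Thread.sleep(100)
        }
      }
    }.start()
  }

  def onStop() { }           // the loop above checks isStopped and exits
}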
The data is managed by the ReceiverSupervisor: store calls supervisor.pushSingle to hand each record over.
def pushSingle(data: Any) {
  defaultBlockGenerator.addData(data)
}
defaultBlockGenerator is the generator created through createBlockGenerator, as we saw earlier:

private val defaultBlockGenerator = createBlockGenerator(defaultBlockGeneratorListener)
It is just a BlockGenerator, and its addData method appends each record to the BlockGenerator's currentBuffer:
/**
 * Push a single data item into the buffer.
 */
def addData(data: Any): Unit = {
  if (state == Active) {
    waitToPush()
    synchronized {
      if (state == Active) {
        currentBuffer += data
      } else {
        throw new SparkException(
          "Cannot add data as BlockGenerator has not been started or has been stopped")
      }
    }
  } else {
    throw new SparkException(
      "Cannot add data as BlockGenerator has not been started or has been stopped")
  }
}
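Note the waitToPush() call: it comes from Spark's RateLimiter helper, which BlockGenerator extends, and throttles how fast a receiver may insert records (capped by spark.streaming.receiver.maxRate). A simplified sketch of that idea using Guava's RateLimiter (Spark's actual class differs in detail; the class name here is made up):

import com.google.common.util.concurrent.{RateLimiter => GuavaRateLimiter}

// Sketch of receiver-side throttling: acquire() blocks the calling thread
// until a permit is available, capping records-per-second pushed into the buffer.
class SimpleRateLimiter(maxRatePerSecond: Double) {
  private val limiter = GuavaRateLimiter.create(maxRatePerSecond)
  def waitToPush(): Unit = limiter.acquire()
}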
So data streams in continuously: every 200 ms (the default spark.streaming.blockInterval) the contents of currentBuffer are turned into a block and put on the blocksForPushing queue, and a fresh currentBuffer is allocated. Meanwhile the block pushing thread polls blocksForPushing every 10 ms and writes each block to the BlockManager.
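These two intervals also determine parallelism, since each block becomes one partition of the batch's RDD. A tuning sketch (values and app name are illustrative):

import org.apache.spark.SparkConf

// With a 1 s batch interval and the default 200 ms block interval, each batch
// holds roughly 1000 / 200 = 5 blocks, i.e. 5 partitions. Lowering the block
// interval raises parallelism at the cost of more, smaller blocks.
val conf = new SparkConf()
  .setAppName("BlockIntervalDemo")                 // placeholder app name
  .set("spark.streaming.blockInterval", "100ms")   // ~10 blocks per 1 s batch
  .set("spark.streaming.blockQueueSize", "10")     // queue capacity (default 10)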