@@ -18,21 +18,22 @@
 package org.apache.spark.streaming.kafka

 import java.util.Properties
-import java.util.concurrent.{ConcurrentHashMap, Executors}
+import java.util.concurrent.ConcurrentHashMap

 import scala.collection.Map
 import scala.collection.mutable
 import scala.reflect.{classTag, ClassTag}

 import kafka.common.TopicAndPartition
-import kafka.consumer.{Consumer, ConsumerConfig, ConsumerConnector}
+import kafka.consumer.{Consumer, ConsumerConfig, ConsumerConnector, KafkaStream}
 import kafka.serializer.Decoder
 import kafka.utils.{ZkUtils, ZKGroupTopicDirs, ZKStringSerializer, VerifiableProperties}
 import org.I0Itec.zkclient.ZkClient

 import org.apache.spark.{SparkEnv, Logging}
 import org.apache.spark.storage.{StreamBlockId, StorageLevel}
 import org.apache.spark.streaming.receiver.{BlockGeneratorListener, BlockGenerator, Receiver}
+import org.apache.spark.util.Utils

 private[streaming]
 class ReliableKafkaReceiver[
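One of the smaller changes in this hunk swaps `java.util.concurrent.Executors` for Spark's internal `Utils.newDaemonFixedThreadPool`, so the message-handler threads are daemons and cannot keep the executor JVM alive during shutdown. A minimal sketch of what such a helper does (the implementation below is illustrative, not Spark's actual code):

```scala
import java.util.concurrent.{Executors, ExecutorService, ThreadFactory}
import java.util.concurrent.atomic.AtomicInteger

// Sketch of a daemon fixed thread pool factory: each thread is marked
// setDaemon(true) so it will not block JVM exit, and gets a recognizable
// name prefix for debugging.
def newDaemonFixedThreadPool(nThreads: Int, prefix: String): ExecutorService = {
  val factory = new ThreadFactory {
    private val count = new AtomicInteger(0)
    override def newThread(r: Runnable): Thread = {
      val t = new Thread(r, s"$prefix-${count.incrementAndGet()}")
      t.setDaemon(true)
      t
    }
  }
  Executors.newFixedThreadPool(nThreads, factory)
}
```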
@@ -45,27 +46,33 @@ class ReliableKafkaReceiver[
     storageLevel: StorageLevel)
   extends Receiver[Any](storageLevel) with Logging {

-  /** High level consumer to connect to Kafka */
+  /** High level consumer to connect to Kafka. */
   private var consumerConnector: ConsumerConnector = null

-  /** zkClient to connect to Zookeeper to commit the offsets */
+  /** zkClient to connect to Zookeeper to commit the offsets. */
   private var zkClient: ZkClient = null

   private val groupId = kafkaParams("group.id")

-  private lazy val env = SparkEnv.get
+  private def conf() = SparkEnv.get.conf

   private val AUTO_OFFSET_COMMIT = "auto.commit.enable"

   /** A HashMap to manage the offset for each topic/partition, this HashMap is called in
-   * synchronized block, so mutable HashMap will not meet concurrency issue */
-  private lazy val topicPartitionOffsetMap = new mutable.HashMap[TopicAndPartition, Long]
+   * synchronized block, so mutable HashMap will not meet concurrency issue.
+   */
+  private var topicPartitionOffsetMap: mutable.HashMap[TopicAndPartition, Long] = null

-  /** A concurrent HashMap to store the stream block id and related offset snapshot */
-  private lazy val blockOffsetMap =
-    new ConcurrentHashMap[StreamBlockId, Map[TopicAndPartition, Long]]
+  /** A concurrent HashMap to store the stream block id and related offset snapshot. */
+  private var blockOffsetMap: ConcurrentHashMap[StreamBlockId, Map[TopicAndPartition, Long]] = null

-  private lazy val blockGeneratorListener = new BlockGeneratorListener {
+  /** Manage the BlockGenerator in receiver itself for better managing block store and offset
+   * commit.
+   */
+  private var blockGenerator: BlockGenerator = null
+
+  /** Kafka offsets checkpoint listener to register into BlockGenerator for offsets checkpoint. */
+  private final class OffsetCheckpointListener extends BlockGeneratorListener {
     override def onStoreData(data: Any, metadata: Any): Unit = {
       if (metadata != null) {
         val kafkaMetadata = metadata.asInstanceOf[(TopicAndPartition, Long)]
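For context on the listener's role in the reliable-receive design: `onStoreData` records the latest offset seen for each topic/partition, and (in parts of the file outside this hunk) those offsets are presumably snapshotted per block id and committed to ZooKeeper only after the block is safely stored. A simplified, self-contained sketch of that bookkeeping, with hypothetical names and method bodies:

```scala
// Illustrative sketch of the offset bookkeeping, not the exact code from
// this commit. Offsets flow: message -> per-partition map ->
// per-block snapshot -> commit after the block is durably stored.
class OffsetBookkeeping {
  private val partitionOffsets = new scala.collection.mutable.HashMap[(String, Int), Long]
  private val blockOffsets =
    new java.util.concurrent.ConcurrentHashMap[String, Map[(String, Int), Long]]

  // Called for every received message: remember the latest offset.
  def onStoreData(topic: String, partition: Int, offset: Long): Unit = synchronized {
    partitionOffsets.put((topic, partition), offset)
  }

  // Called when a block is sealed: freeze the current offsets for that block.
  def onGenerateBlock(blockId: String): Unit = synchronized {
    blockOffsets.put(blockId, partitionOffsets.toMap)
    partitionOffsets.clear()
  }

  // Called after the block is reliably stored: only now is it safe to commit.
  def onPushBlock(blockId: String, commit: Map[(String, Int), Long] => Unit): Unit = {
    Option(blockOffsets.remove(blockId)).foreach(commit)
  }
}
```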
@@ -96,10 +103,6 @@ class ReliableKafkaReceiver[
     }
   }

-  /** Manage the BlockGenerator in receiver itself for better managing block store and offset
-   * commit */
-  private var blockGenerator: BlockGenerator = null
-
   override def onStop(): Unit = {
     if (consumerConnector != null) {
       consumerConnector.shutdown()
@@ -111,13 +114,33 @@ class ReliableKafkaReceiver[
       zkClient = null
     }

-    blockGenerator.stop()
+    if (blockGenerator != null) {
+      blockGenerator.stop()
+      blockGenerator = null
+    }
+
+    if (topicPartitionOffsetMap != null) {
+      topicPartitionOffsetMap.clear()
+      topicPartitionOffsetMap = null
+    }
+
+    if (blockOffsetMap != null) {
+      blockOffsetMap.clear()
+      blockOffsetMap = null
+    }
   }

   override def onStart(): Unit = {
     logInfo(s"Starting Kafka Consumer Stream with group: $groupId")

-    blockGenerator = new BlockGenerator(blockGeneratorListener, streamId, env.conf)
+    // Initialize the topic-partition / offset hash map.
+    topicPartitionOffsetMap = new mutable.HashMap[TopicAndPartition, Long]
+
+    // Initialize the stream block id / offset snapshot hash map.
+    blockOffsetMap = new ConcurrentHashMap[StreamBlockId, Map[TopicAndPartition, Long]]()
+
+    // Initialize the block generator for storing Kafka message.
+    blockGenerator = new BlockGenerator(new OffsetCheckpointListener, streamId, conf())

     if (kafkaParams.contains(AUTO_OFFSET_COMMIT) && kafkaParams(AUTO_OFFSET_COMMIT) == "true") {
       logWarning(s"$AUTO_OFFSET_COMMIT should be set to false in ReliableKafkaReceiver, " +
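The receiver requires auto offset commit to be disabled so that it, not the Kafka high-level consumer, decides when offsets are written. A hypothetical driver-side parameter map consistent with this check might look like:

```scala
// Hypothetical configuration: group.id is required by the receiver, and
// auto.commit.enable must be "false" so offsets are only committed after
// blocks are reliably stored. Host names here are placeholders.
val kafkaParams = Map(
  "zookeeper.connect" -> "zkhost:2181",   // assumption: your ZK quorum
  "group.id" -> "reliable-receiver-test", // consumer group for this stream
  "auto.commit.enable" -> "false"         // receiver commits offsets itself
)
```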
@@ -133,7 +156,7 @@ class ReliableKafkaReceiver[

     val consumerConfig = new ConsumerConfig(props)

-    assert(consumerConfig.autoCommitEnable == false)
+    assert(!consumerConfig.autoCommitEnable)

     logInfo(s"Connecting to Zookeeper: ${consumerConfig.zkConnect}")
     consumerConnector = Consumer.create(consumerConfig)
@@ -156,41 +179,45 @@ class ReliableKafkaReceiver[
     val topicMessageStreams = consumerConnector.createMessageStreams(
       topics, keyDecoder, valueDecoder)

-    val executorPool = Executors.newFixedThreadPool(topics.values.sum)
+    val executorPool = Utils.newDaemonFixedThreadPool(topics.values.sum, "KafkaMessageHandler")

     try {
       topicMessageStreams.values.foreach { streams =>
         streams.foreach { stream =>
-          executorPool.submit(new Runnable {
-            override def run(): Unit = {
-              logInfo(s"Starting message process thread ${Thread.currentThread().getId}.")
-              try {
-                for (msgAndMetadata <- stream) {
-                  val topicAndPartition = TopicAndPartition(
-                    msgAndMetadata.topic, msgAndMetadata.partition)
-                  val metadata = (topicAndPartition, msgAndMetadata.offset)
-
-                  blockGenerator += ((msgAndMetadata.key, msgAndMetadata.message), metadata)
-                }
-              } catch {
-                case e: Throwable => logError("Error handling message; existing", e)
-              }
-            }
-          })
+          executorPool.submit(new MessageHandler(stream))
         }
       }
     } finally {
       executorPool.shutdown()
     }
   }

+  /** An inner class to handle received Kafka message. */
+  private final class MessageHandler(stream: KafkaStream[K, V]) extends Runnable {
+    override def run(): Unit = {
+      logInfo(s"Starting message process thread ${Thread.currentThread().getId}.")
+      try {
+        for (msgAndMetadata <- stream) {
+          val topicAndPartition = TopicAndPartition(
+            msgAndMetadata.topic, msgAndMetadata.partition)
+          val metadata = (topicAndPartition, msgAndMetadata.offset)
+
+          blockGenerator += ((msgAndMetadata.key, msgAndMetadata.message), metadata)
+        }
+      } catch {
+        case e: Throwable => logError("Error handling message; exiting", e)
+      }
+    }
+  }
+
   /**
    * Commit the offset of Kafka's topic/partition; the commit mechanism follows Kafka 0.8.x's
    * metadata schema in Zookeeper.
    */
   private def commitOffset(offsetMap: Map[TopicAndPartition, Long]): Unit = {
     if (zkClient == null) {
-      logError(s"zkClient $zkClient should be initialized at started")
+      val thrown = new IllegalStateException("Zookeeper client is unexpectedly null")
+      stop("Zookeeper client is not initialized before commit offsets to ZK", thrown)
       return
     }

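The commit path follows the Kafka 0.8 high-level consumer's ZooKeeper layout, where offsets live under `/consumers/<group>/offsets/<topic>/<partition>`. A sketch of writing a single offset with the helpers this file imports (illustrative; the real method iterates over `offsetMap` and logs failures, as the next hunk shows):

```scala
// Sketch of one offset commit under Kafka 0.8's ZooKeeper schema.
// Path written: /consumers/<groupId>/offsets/<topic>/<partition>
import kafka.common.TopicAndPartition
import kafka.utils.{ZkUtils, ZKGroupTopicDirs}
import org.I0Itec.zkclient.ZkClient

def commitOne(
    zkClient: ZkClient,
    groupId: String,
    topicAndPart: TopicAndPartition,
    offset: Long): Unit = {
  val topicDirs = new ZKGroupTopicDirs(groupId, topicAndPart.topic)
  val zkPath = s"${topicDirs.consumerOffsetDir}/${topicAndPart.partition}"
  // Creates the node if missing, otherwise overwrites its data.
  ZkUtils.updatePersistentPath(zkClient, zkPath, offset.toString)
}
```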
@@ -205,7 +232,7 @@ class ReliableKafkaReceiver[
           s"${topicAndPart.topic}, partition ${topicAndPart.partition}", t)
       }

-      logInfo(s"Committed offset ${offset} for topic ${topicAndPart.topic}, " +
+      logInfo(s"Committed offset $offset for topic ${topicAndPart.topic}, " +
         s"partition ${topicAndPart.partition}")
     }
   }
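For completeness, a receiver like this is wired up through `KafkaUtils` on the driver; the choice between the plain and reliable Kafka receivers is internal to the connector. A hypothetical end-to-end sketch, assuming reliable receiving also requires the write-ahead log setting from the same line of work:

```scala
// Illustrative driver-side usage; host names, paths, and topic names are
// placeholders, not values from this commit.
import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka.KafkaUtils

val conf = new SparkConf()
  .setAppName("ReliableKafkaExample")
  // Assumption: the receiver WAL must be enabled for reliable receiving.
  .set("spark.streaming.receiver.writeAheadLog.enable", "true")
val ssc = new StreamingContext(conf, Seconds(2))
ssc.checkpoint("/tmp/reliable-kafka-checkpoint") // illustrative path

val stream = KafkaUtils.createStream(
  ssc,
  "zkhost:2181",            // ZooKeeper quorum (assumption)
  "reliable-receiver-test", // consumer group id
  Map("topicA" -> 1),       // topic -> number of consumer threads
  StorageLevel.MEMORY_AND_DISK_SER)

stream.map { case (_, value) => value }.print()
ssc.start()
ssc.awaitTermination()
```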