2
2
3
3
import logging
4
4
import time
5
- from multiprocessing import Process , Queue as MPQueue , Event , Value
5
+
6
+ from collections import namedtuple
7
+ from multiprocessing import Process , Manager as MPManager
6
8
7
9
try :
8
- from Queue import Empty
10
+ from Queue import Empty , Full
9
11
except ImportError : # python 2
10
- from queue import Empty
12
+ from queue import Empty , Full
11
13
12
14
from .base import (
13
15
AUTO_COMMIT_MSG_COUNT , AUTO_COMMIT_INTERVAL ,
14
- NO_MESSAGES_WAIT_TIME_SECONDS
16
+ NO_MESSAGES_WAIT_TIME_SECONDS ,
17
+ FULL_QUEUE_WAIT_TIME_SECONDS
15
18
)
16
19
from .simple import Consumer , SimpleConsumer
17
20
18
- log = logging . getLogger ( "kafka" )
21
+ Events = namedtuple ( "Events" , [ "start" , "pause" , "exit" ] )
19
22
23
+ log = logging .getLogger ("kafka" )
20
24
21
- def _mp_consume (client , group , topic , chunk , queue , start , exit , pause , size ):
25
+ def _mp_consume (client , group , topic , queue , size , events , ** consumer_options ):
22
26
"""
23
27
A child process worker which consumes messages based on the
24
28
notifications given by the controller process
@@ -34,20 +38,20 @@ def _mp_consume(client, group, topic, chunk, queue, start, exit, pause, size):
34
38
# We will start consumers without auto-commit. Auto-commit will be
35
39
# done by the master controller process.
36
40
consumer = SimpleConsumer (client , group , topic ,
37
- partitions = chunk ,
38
41
auto_commit = False ,
39
42
auto_commit_every_n = None ,
40
- auto_commit_every_t = None )
43
+ auto_commit_every_t = None ,
44
+ ** consumer_options )
41
45
42
46
# Ensure that the consumer provides the partition information
43
47
consumer .provide_partition_info ()
44
48
45
49
while True :
46
50
# Wait till the controller indicates us to start consumption
47
- start .wait ()
51
+ events . start .wait ()
48
52
49
53
# If we are asked to quit, do so
50
- if exit .is_set ():
54
+ if events . exit .is_set ():
51
55
break
52
56
53
57
# Consume messages and add them to the queue. If the controller
@@ -56,7 +60,13 @@ def _mp_consume(client, group, topic, chunk, queue, start, exit, pause, size):
56
60
57
61
message = consumer .get_message ()
58
62
if message :
59
- queue .put (message )
63
+ while True :
64
+ try :
65
+ queue .put (message , timeout = FULL_QUEUE_WAIT_TIME_SECONDS )
66
+ break
67
+ except Full :
68
+ if events .exit .is_set (): break
69
+
60
70
count += 1
61
71
62
72
# We have reached the required size. The controller might have
@@ -65,7 +75,7 @@ def _mp_consume(client, group, topic, chunk, queue, start, exit, pause, size):
65
75
# loop consuming all available messages before the controller
66
76
# can reset the 'start' event
67
77
if count == size .value :
68
- pause .wait ()
78
+ events . pause .wait ()
69
79
70
80
else :
71
81
# In case we did not receive any message, give up the CPU for
@@ -105,7 +115,8 @@ class MultiProcessConsumer(Consumer):
105
115
def __init__ (self , client , group , topic , auto_commit = True ,
106
116
auto_commit_every_n = AUTO_COMMIT_MSG_COUNT ,
107
117
auto_commit_every_t = AUTO_COMMIT_INTERVAL ,
108
- num_procs = 1 , partitions_per_proc = 0 ):
118
+ num_procs = 1 , partitions_per_proc = 0 ,
119
+ ** simple_consumer_options ):
109
120
110
121
# Initiate the base consumer class
111
122
super (MultiProcessConsumer , self ).__init__ (
@@ -117,11 +128,13 @@ def __init__(self, client, group, topic, auto_commit=True,
117
128
118
129
# Variables for managing and controlling the data flow from
119
130
# consumer child process to master
120
- self .queue = MPQueue (1024 ) # Child consumers dump messages into this
121
- self .start = Event () # Indicates the consumers to start fetch
122
- self .exit = Event () # Requests the consumers to shutdown
123
- self .pause = Event () # Requests the consumers to pause fetch
124
- self .size = Value ('i' , 0 ) # Indicator of number of messages to fetch
131
+ manager = MPManager ()
132
+ self .queue = manager .Queue (1024 ) # Child consumers dump messages into this
133
+ self .events = Events (
134
+ start = manager .Event (), # Indicates the consumers to start fetch
135
+ exit = manager .Event (), # Requests the consumers to shutdown
136
+ pause = manager .Event ()) # Requests the consumers to pause fetch
137
+ self .size = manager .Value ('i' , 0 ) # Indicator of number of messages to fetch
125
138
126
139
# dict.keys() returns a view in py3 + it's not a thread-safe operation
127
140
# http://blog.labix.org/2008/06/27/watch-out-for-listdictkeys-in-python-3
@@ -143,12 +156,14 @@ def __init__(self, client, group, topic, auto_commit=True,
143
156
144
157
self .procs = []
145
158
for chunk in chunks :
146
- args = (client .copy (),
147
- group , topic , chunk ,
148
- self .queue , self .start , self .exit ,
149
- self .pause , self .size )
150
-
151
- proc = Process (target = _mp_consume , args = args )
159
+ options = {'partitions' : list (chunk )}
160
+ if simple_consumer_options :
161
+ simple_consumer_options .pop ('partitions' , None )
162
+ options .update (simple_consumer_options )
163
+
164
+ args = (client .copy (), group , topic , self .queue ,
165
+ self .size , self .events )
166
+ proc = Process (target = _mp_consume , args = args , kwargs = options )
152
167
proc .daemon = True
153
168
proc .start ()
154
169
self .procs .append (proc )
@@ -159,9 +174,9 @@ def __repr__(self):
159
174
160
175
def stop (self ):
161
176
# Set exit and start off all waiting consumers
162
- self .exit .set ()
163
- self .pause .set ()
164
- self .start .set ()
177
+ self .events . exit .set ()
178
+ self .events . pause .set ()
179
+ self .events . start .set ()
165
180
166
181
for proc in self .procs :
167
182
proc .join ()
@@ -176,10 +191,10 @@ def __iter__(self):
176
191
# Trigger the consumer procs to start off.
177
192
# We will iterate till there are no more messages available
178
193
self .size .value = 0
179
- self .pause .set ()
194
+ self .events . pause .set ()
180
195
181
196
while True :
182
- self .start .set ()
197
+ self .events . start .set ()
183
198
try :
184
199
# We will block for a small while so that the consumers get
185
200
# a chance to run and put some messages in the queue
@@ -191,12 +206,12 @@ def __iter__(self):
191
206
192
207
# Count, check and commit messages if necessary
193
208
self .offsets [partition ] = message .offset + 1
194
- self .start .clear ()
209
+ self .events . start .clear ()
195
210
self .count_since_commit += 1
196
211
self ._auto_commit ()
197
212
yield message
198
213
199
- self .start .clear ()
214
+ self .events . start .clear ()
200
215
201
216
def get_messages (self , count = 1 , block = True , timeout = 10 ):
202
217
"""
@@ -216,7 +231,7 @@ def get_messages(self, count=1, block=True, timeout=10):
216
231
# necessary, but these will not be committed to kafka. Also, the extra
217
232
# messages can be provided in subsequent runs
218
233
self .size .value = count
219
- self .pause .clear ()
234
+ self .events . pause .clear ()
220
235
221
236
if timeout is not None :
222
237
max_time = time .time () + timeout
@@ -228,7 +243,7 @@ def get_messages(self, count=1, block=True, timeout=10):
228
243
# go into overdrive and keep consuming thousands of
229
244
# messages when the user might need only a few
230
245
if self .queue .empty ():
231
- self .start .set ()
246
+ self .events . start .set ()
232
247
233
248
try :
234
249
partition , message = self .queue .get (block , timeout )
@@ -242,8 +257,8 @@ def get_messages(self, count=1, block=True, timeout=10):
242
257
timeout = max_time - time .time ()
243
258
244
259
self .size .value = 0
245
- self .start .clear ()
246
- self .pause .set ()
260
+ self .events . start .clear ()
261
+ self .events . pause .set ()
247
262
248
263
# Update and commit offsets if necessary
249
264
self .offsets .update (new_offsets )
0 commit comments