Skip to content

Commit 630ad78

Browse files
committed
.
1 parent fe6a567 commit 630ad78

4 files changed

Lines changed: 143 additions & 7 deletions

File tree

docs/DESIGN.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,20 @@ Unlike earlier versions, the consumer now fully supports dynamic resharding:
4444
- **Rate limiting**: Configurable per-shard bandwidth and record rate limits
4545
- **Async buffering**: Non-blocking `put()` operations with configurable queue sizes
4646

47+
### Checkpoint Safety
48+
49+
Checkpoints use a **deferred execution** model to prevent data loss:
50+
51+
1. **Deferred commit**: When a `__CHECKPOINT__` sentinel is dequeued from the internal queue, it is stored as pending but not committed. The checkpoint fires only at the start of the *next* `__anext__()` call, which proves the user's code survived processing the preceding records. If the consumer crashes between receiving a record and calling `__anext__()` again, the checkpoint is never committed and those records replay on restart (at-least-once delivery).
52+
53+
2. **Queue put timeout**: If enqueueing a parsed record times out (the bounded queue remains full for 30s), `LastSequenceNumber` advances only to the last fully-enqueued record. The remaining rows in the Kinesis batch are abandoned; processing them would create a non-contiguous sequence gap, and the skipped records would be lost on restart.
54+
55+
3. **Shard deallocation ordering**: When a shard iterator is exhausted (`NextShardIterator=None`), all pending checkpoints for that shard are flushed *before* `deallocate()` releases ownership. No checkpoint sentinel is enqueued for the terminal batch (it would race with deallocation); instead, those records replay on restart. Checkpoint sentinels that were already queued before deallocation are silently skipped via a `_deallocated_shards` set.
56+
57+
4. **`checkpoint_interval` debouncing**: When set, checkpoint writes are buffered in `_pending_checkpoints` and flushed by a background task every N seconds, reducing backend write pressure. The flusher uses compare-and-delete to avoid dropping a newer sequence that arrives during the `await` on the checkpoint backend. On `close()`, deferred checkpoints are committed, the flusher is cancelled (triggering a final flush), and any remaining buffered checkpoints are flushed before the checkpointer is closed.
58+
59+
These guarantees hold under single-process asyncio concurrency. For multi-process coordination, the `CheckPointer` implementation (e.g. a Redis backend with distributed locking) must handle ownership contention itself.
60+
4761
## Integration Points
4862

4963
- **Checkpointing**: Pluggable checkpointer interface (Memory, Redis) for multi-consumer coordination

kinesis/consumer.py

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,7 @@ def __init__(
128128
self._parent_shards = set() # Track shards that are parents
129129
self._child_shards = set() # Track shards that are children
130130
self._exhausted_parents = set() # Track parent shards that are fully consumed
131+
self._deallocated_shards: set = set() # Shards already deallocated (skip stale sentinels)
131132

132133
def __aiter__(self) -> AsyncIterator[Any]:
133134
return self
@@ -295,7 +296,11 @@ async def fetch(self):
295296
log.warning("Queue put timed out, skipping record")
296297
dropped = True
297298
break
298-
if item_count > 0 and not dropped:
299+
if dropped:
300+
# Stop processing this batch — remaining rows would
301+
# create a non-contiguous sequence gap on restart.
302+
break
303+
if item_count > 0:
299304
last_enqueued_sequence = row["SequenceNumber"]
300305
total_items += item_count
301306

@@ -384,6 +389,7 @@ async def fetch(self):
384389
# restart (at-least-once safe, no data loss).
385390

386391
await self.checkpointer.deallocate(shard_id)
392+
self._deallocated_shards.add(shard_id)
387393

388394
# Remove shard iterator to stop fetching from this shard
389395
shard.pop("ShardIterator", None)
@@ -867,11 +873,15 @@ async def _flush_pending_checkpoints(self):
867873
868874
Entries are removed individually on success so that a failure
869875
mid-loop preserves the remaining (unflushed) checkpoints for
870-
the next attempt.
876+
the next attempt. Uses compare-and-delete to avoid dropping a
877+
newer sequence written by _maybe_checkpoint during the await.
871878
"""
872879
for shard_id in list(self._pending_checkpoints):
873-
await self.checkpointer.checkpoint(shard_id, self._pending_checkpoints[shard_id])
874-
del self._pending_checkpoints[shard_id]
880+
seq = self._pending_checkpoints[shard_id]
881+
await self.checkpointer.checkpoint(shard_id, seq)
882+
# Only delete if the value hasn't been superseded during the await
883+
if self._pending_checkpoints.get(shard_id) == seq:
884+
del self._pending_checkpoints[shard_id]
875885

876886
async def __anext__(self):
877887

@@ -906,9 +916,13 @@ async def __anext__(self):
906916
raise StopAsyncIteration from None
907917

908918
if item and isinstance(item, dict) and "__CHECKPOINT__" in item:
909-
if self.checkpointer:
919+
cp_shard = item["__CHECKPOINT__"]["ShardId"]
920+
if cp_shard in self._deallocated_shards:
921+
# Stale sentinel queued before deallocation; skip silently
922+
log.debug("Skipping checkpoint for deallocated shard %s", cp_shard)
923+
elif self.checkpointer:
910924
# Don't execute now — defer to next __anext__ call
911-
self._deferred_checkpoints[item["__CHECKPOINT__"]["ShardId"]] = item[
925+
self._deferred_checkpoints[cp_shard] = item[
912926
"__CHECKPOINT__"
913927
]["SequenceNumber"]
914928
checkpoint_count += 1

tests/conftest.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,15 @@ def _factory(**kwargs):
191191

192192
for c in consumers:
193193
for task in [getattr(c, "_checkpoint_flusher_task", None), c.fetch_task]:
194-
if task and not task.done():
194+
if task is None:
195+
continue
196+
if task.done():
197+
# Surface exceptions from tasks that finished during the test
198+
if not task.cancelled() and task.exception():
199+
log.warning(
200+
"Task finished with error during test: %s", task.exception(), exc_info=task.exception()
201+
)
202+
else:
195203
task.cancel()
196204
try:
197205
await task

tests/test_checkpoint_ordering.py

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,58 @@ async def test_no_checkpointer_skips_checkpoint_processing(self, mock_consumer):
141141
class TestQueuePutTimeoutFix:
142142
"""Verify LastSequenceNumber only advances to last successfully enqueued record."""
143143

144+
@pytest.mark.asyncio
145+
async def test_queue_put_timeout_breaks_outer_loop(self, mock_consumer):
146+
"""When queue.put() times out on row 2 of a 3-row batch, row 3 is NOT
147+
processed — prevents a non-contiguous sequence gap on restart."""
148+
consumer = mock_consumer(sleep_time_no_records=0)
149+
consumer.checkpointer = MemoryCheckPointer(name="test")
150+
await consumer.checkpointer.allocate("shard-0")
151+
consumer.refresh_shards = AsyncMock()
152+
consumer.get_records = AsyncMock(return_value=None)
153+
154+
# Fail on the second row's put (row 2), succeed on row 1
155+
put_count = 0
156+
original_put = consumer.queue.put
157+
158+
async def failing_put(item):
159+
nonlocal put_count
160+
put_count += 1
161+
if put_count == 2: # Row 2's first output
162+
raise asyncio.TimeoutError("simulated queue full")
163+
await original_put(item)
164+
165+
consumer.queue.put = failing_put
166+
167+
shard = consumer.shards[0]
168+
shard["ShardIterator"] = "iter-0"
169+
shard["stats"] = ShardStats()
170+
shard["throttler"] = Throttler(rate_limit=1, period=1)
171+
172+
fetch_result = {
173+
"Records": [
174+
{"SequenceNumber": "100", "Data": b'{"msg": "r1"}'},
175+
{"SequenceNumber": "200", "Data": b'{"msg": "r2"}'},
176+
{"SequenceNumber": "300", "Data": b'{"msg": "r3"}'},
177+
],
178+
"NextShardIterator": "iter-next",
179+
}
180+
fut = asyncio.get_running_loop().create_future()
181+
fut.set_result(fetch_result)
182+
shard["fetch"] = fut
183+
184+
await consumer.fetch()
185+
186+
# Row 1 succeeded, row 2 dropped → outer loop breaks, row 3 never tried
187+
assert shard.get("LastSequenceNumber") == "100"
188+
# Only 1 data item should be in the queue (row 1)
189+
items = []
190+
while not consumer.queue.empty():
191+
items.append(consumer.queue.get_nowait())
192+
data_items = [i for i in items if isinstance(i, dict) and "msg" in i]
193+
assert len(data_items) == 1
194+
assert data_items[0]["msg"] == "r1"
195+
144196
@pytest.mark.asyncio
145197
async def test_queue_put_timeout_no_sequence_advance(self, mock_consumer):
146198
"""When queue.put() times out mid-batch, LastSequenceNumber tracks only
@@ -272,6 +324,28 @@ async def test_shard_exhaustion_with_records_no_sentinel_enqueued(self, mock_con
272324
assert len(data_items) == 2
273325
assert checkpoint_items == [], "No sentinel for terminal batch (would race with deallocate)"
274326

327+
@pytest.mark.asyncio
328+
async def test_stale_sentinel_skipped_after_deallocation(self, mock_consumer):
329+
"""A checkpoint sentinel queued before deallocation is silently skipped
330+
in __anext__ rather than checkpointing a deallocated shard."""
331+
consumer = mock_consumer()
332+
checkpointer = _make_mock_checkpointer()
333+
consumer.checkpointer = checkpointer
334+
335+
# Simulate shard-0 already deallocated (e.g. shard exhaustion ran)
336+
consumer._deallocated_shards.add("shard-0")
337+
338+
# Stale sentinel arrives for deallocated shard, then real data
339+
await consumer.queue.put({"__CHECKPOINT__": {"ShardId": "shard-0", "SequenceNumber": "100"}})
340+
await consumer.queue.put({"msg": "from-other-shard"})
341+
342+
item = await consumer.__anext__()
343+
assert item == {"msg": "from-other-shard"}
344+
345+
# Sentinel must NOT have been deferred or committed
346+
assert "shard-0" not in consumer._deferred_checkpoints
347+
checkpointer.checkpoint.assert_not_awaited()
348+
275349
@pytest.mark.asyncio
276350
async def test_close_flushes_then_deallocates(self, mock_consumer):
277351
"""close() flushes all pending checkpoints before deallocating shards."""
@@ -395,6 +469,32 @@ async def test_checkpoint_interval_with_auto_checkpoint_false_raises(self):
395469
checkpointer=AsyncMock(auto_checkpoint=False),
396470
)
397471

472+
@pytest.mark.asyncio
473+
async def test_flush_preserves_newer_sequence_written_during_await(self, mock_consumer):
474+
"""If _maybe_checkpoint writes a newer sequence for a shard while
475+
_flush_pending_checkpoints is awaiting the backend, the newer value
476+
must survive (compare-and-delete, not unconditional delete)."""
477+
consumer = mock_consumer(checkpoint_interval=60.0)
478+
consumer.checkpointer = _make_mock_checkpointer()
479+
480+
# Simulate: flusher reads seq "100" for shard-0, then during the
481+
# checkpoint() await, _maybe_checkpoint overwrites with "200".
482+
original_checkpoint = consumer.checkpointer.checkpoint
483+
484+
async def checkpoint_with_concurrent_write(shard_id, seq):
485+
# Simulate _maybe_checkpoint writing a newer value during this await
486+
if seq == "100":
487+
consumer._pending_checkpoints["shard-0"] = "200"
488+
await original_checkpoint(shard_id, seq)
489+
490+
consumer.checkpointer.checkpoint = AsyncMock(side_effect=checkpoint_with_concurrent_write)
491+
492+
consumer._pending_checkpoints["shard-0"] = "100"
493+
await consumer._flush_pending_checkpoints()
494+
495+
# "100" was flushed but "200" must still be pending (not deleted)
496+
assert consumer._pending_checkpoints.get("shard-0") == "200"
497+
398498
@pytest.mark.asyncio
399499
async def test_checkpoint_backend_raises_during_flush(self, mock_consumer):
400500
"""Exception from checkpoint backend during flush propagates from close()."""

0 commit comments

Comments
 (0)