sync wait before L1 and L2 flush (pytorch#791)

duduyi2013 · facebook-github-bot · commit b22ead666a01 · 2025-02-18T22:35:50.000-08:00
Summary: X-link: pytorch#3709 Pull Request resolved: facebookresearch/FBGEMM#791 During flush, make sure we block and wait for all pending kernels to finish before doing the synchronous flush of L1 and L2. Reviewed By: q10, sryap Differential Revision: D69557437 fbshipit-source-id: 04d4a7850709f94055f8b2d5beab0fe622903378
diff --git a/fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py b/fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py
@@ -1831,12 +1831,12 @@ def flush(self) -> None:
 
         torch.cuda.current_stream().wait_stream(self.ssd_eviction_stream)
 
+        torch.cuda.synchronize()
         self.ssd_db.set(
             active_ids_cpu,
             active_weights_cpu,
             torch.tensor([active_ids_cpu.numel()]),
         )
-
         self.ssd_db.flush()
 
     def prepare_inputs(