fix: respect null values in inner join in get_rows_to_update

mdwint · mdwint · commit 6a7aa3f127ac · 2025-09-05T11:38:21.000+02:00
diff --git a/pyiceberg/table/upsert_util.py b/pyiceberg/table/upsert_util.py
@@ -121,13 +121,31 @@ def get_rows_to_update(source_table: pa.Table, target_table: pa.Table, join_cols
     target_index = target_table.select(join_cols_set).append_column(TARGET_INDEX_COLUMN_NAME, pa.array(range(len(target_table))))
 
     # Step 3: Perform an inner join to find which rows from source exist in target
-    matching_indices = source_index.join(target_index, keys=list(join_cols_set), join_type="inner")
+    # PyArrow joins never match null keys, but we want null == null to hold, so we compute the join in Python.
+    # Apart from matching null keys (and assuming join keys are unique on each side), this is equivalent to:
+    # matching_indices = source_index.join(target_index, keys=list(join_cols_set), join_type="inner")
+    source_indices = {}
+    target_indices = {}
+
+    for row in source_index.to_pylist():
+        idx = row.pop(SOURCE_INDEX_COLUMN_NAME)
+        key = tuple(row.values())
+        source_indices[key] = idx
+
+    for row in target_index.to_pylist():
+        idx = row.pop(TARGET_INDEX_COLUMN_NAME)
+        key = tuple(row.values())
+        target_indices[key] = idx
+
+    matching_indices = [
+        (source_idx, target_idx)
+        for key, source_idx in source_indices.items()
+        if (target_idx := target_indices.get(key)) is not None
+    ]
 
     # Step 4: Compare all rows using Python
     to_update_indices = []
-    for source_idx, target_idx in zip(
-        matching_indices[SOURCE_INDEX_COLUMN_NAME].to_pylist(), matching_indices[TARGET_INDEX_COLUMN_NAME].to_pylist()
-    ):
+    for source_idx, target_idx in matching_indices:
         source_row = source_table.slice(source_idx, 1)
         target_row = target_table.slice(target_idx, 1)