
Commit 2fd78db

Create indexes as needed
1 parent 137cc60 commit 2fd78db

2 files changed: +70 -13 lines


subsetter/common.py

Lines changed: 33 additions & 1 deletion
@@ -1,6 +1,6 @@
 import logging
 import os
-from typing import Any, Dict, List, Literal, Optional, Tuple, Union
+from typing import Any, Dict, Iterable, List, Literal, Optional, Tuple, Union
 
 import sqlalchemy as sa
 from pydantic import BaseModel
@@ -217,3 +217,35 @@ def _set_session_sqls(dbapi_connection, _):
     cursor.close()
 
     return engine
+
+
+def pydantic_search(root: Any) -> Iterable[BaseModel]:
+    """
+    A generator that yields all sub-models found underneath the passed root object (including the
+    root object itself). Searches model fields as well as through lists and dicts found in those
+    fields.
+    """
+    vis = set()
+    stack = []
+
+    def _push(key: Any, value: Any):
+        if isinstance(value, (BaseModel, list, dict)):
+            if id(value) not in vis:
+                vis.add(id(value))
+                stack.append(value)
+
+    _push(None, root)
+    while stack:
+        data = stack.pop()
+        if isinstance(data, BaseModel):
+            yield data
+            for field, _ in data.model_fields.items():
+                _push(field, getattr(data, field))
+
+        if isinstance(data, list):
+            for idx, elem in enumerate(data):
+                _push(idx, elem)
+
+        if isinstance(data, dict):
+            for key, elem in data.items():
+                _push(key, elem)
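
A hedged sketch of how pydantic_search can be used; the Plan and Step models below are hypothetical stand-ins for illustration, not types from the subsetter codebase:

from typing import Dict, List

from pydantic import BaseModel

from subsetter.common import pydantic_search


class Step(BaseModel):
    name: str


class Plan(BaseModel):
    steps: List[Step]
    extras: Dict[str, Step]


plan = Plan(steps=[Step(name="a")], extras={"x": Step(name="b")})

# Yields the Plan itself plus every Step reachable through its list and dict fields.
for model in pydantic_search(plan):
    print(type(model).__name__)  # Plan, Step, Step (traversal order may vary)

Because visited models and containers are tracked by id(), shared or cyclic references are traversed only once.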

subsetter/sampler.py

Lines changed: 37 additions & 12 deletions
@@ -1,4 +1,5 @@
 import abc
+import collections
 import functools
 import json
 import logging
@@ -12,7 +13,7 @@
 from sqlalchemy.sql.compiler import SQLCompiler
 from sqlalchemy.sql.expression import ClauseElement, Executable
 
-from subsetter.common import DatabaseConfig, parse_table_name
+from subsetter.common import DatabaseConfig, parse_table_name, pydantic_search
 from subsetter.config_model import (
     ConflictStrategy,
     DatabaseOutputConfig,
@@ -21,7 +22,7 @@
 )
 from subsetter.filters import FilterOmit, FilterView, FilterViewChain
 from subsetter.metadata import DatabaseMetadata
-from subsetter.plan_model import SQLTableIdentifier
+from subsetter.plan_model import SQLLeftJoin, SQLTableIdentifier
 from subsetter.planner import SubsetPlan
 from subsetter.solver import toposort
 
@@ -69,7 +70,7 @@ def create(
         select: sa.Select,
         *,
         name: str = "",
-        primary_key: Tuple[str, ...] = (),
+        indexes: Iterable[Tuple[str, ...]] = (),
     ) -> Tuple[sa.Table, int]:
         """
         Create a temporary table on the passed connection generated by the passed
@@ -82,9 +83,8 @@ def create(
             schema: The schema to create the temporary table within. For some dialects
                 temporary tables always exist in their own schema and this parameter
                 will be ignored.
-            primary_key: If set will mark the set of columns passed as primary keys in
-                the temporary table. This tuple should match a subset of the
-                column names in the select query.
+            indexes: creates an index on each tuple of columns listed. This is useful
+                if future queries are likely to reference these columns.
 
         Returns a tuple containing the generated table object and the number of rows that
         were inserted in the table.
@@ -106,10 +106,7 @@ def create(
             metadata,
             schema=temp_schema,
             prefixes=["TEMPORARY"],
-            *(
-                sa.Column(col.name, col.type, primary_key=col.name in primary_key)
-                for col in select.selected_columns
-            ),
+            *(sa.Column(col.name, col.type) for col in select.selected_columns),
         )
         try:
             metadata.create_all(conn)
@@ -122,6 +119,22 @@ def create(
             if "--read-only" not in str(exc):
                 raise
 
+        for idx, index_cols in enumerate(indexes):
+            # For some dialects/data types we may not be able to construct an index. We just do our
+            # best here instead of hard failing.
+            try:
+                sa.Index(
+                    f"{temp_name}_idx_{idx}",
+                    *(table_obj.columns[col_name] for col_name in index_cols),
+                ).create(bind=conn)
+            except sa.exc.OperationalError:
+                LOGGER.warning(
+                    "Failed to create index %s on temporary table %s",
+                    index_cols,
+                    temp_name,
+                    exc_info=True,
+                )
+
         # Copy data into the temporary table
         stmt = table_obj.insert().from_select(list(table_obj.columns), select)
         LOGGER.debug(
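
The index-creation loop above is deliberately best-effort: some dialects or column types reject certain indexes, so failures are logged and skipped rather than aborting the copy. A minimal self-contained sketch of the same pattern, assuming an in-memory SQLite database and illustrative table and column names:

import logging

import sqlalchemy as sa

logging.basicConfig()
LOGGER = logging.getLogger(__name__)

engine = sa.create_engine("sqlite://")
metadata = sa.MetaData()
events = sa.Table(
    "events",
    metadata,
    sa.Column("user_id", sa.Integer),
    sa.Column("created_at", sa.DateTime),
)

with engine.begin() as conn:
    metadata.create_all(conn)
    # Try each candidate column tuple; keep going if the dialect refuses one.
    for idx, cols in enumerate([("user_id",), ("user_id", "created_at")]):
        try:
            sa.Index(
                f"events_idx_{idx}",
                *(events.columns[c] for c in cols),
            ).create(bind=conn)
        except sa.exc.OperationalError:
            LOGGER.warning("Failed to create index %s", cols, exc_info=True)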
@@ -834,6 +847,18 @@ def _materialize_tables(
         conn: sa.Connection,
         plan: SubsetPlan,
     ) -> None:
+        # Figure out what sets of columns are going to be queried for our materialized tables.
+        joined_columns = collections.defaultdict(set)
+        for data in pydantic_search(plan):
+            if not isinstance(data, SQLLeftJoin):
+                continue
+            table_id = data.right
+            if not table_id.sampled:
+                continue
+            joined_columns[(table_id.table_schema, table_id.table_name)].add(
+                tuple(data.right_columns)
+            )
+
         materialization_order = self._materialization_order(meta, plan)
         for schema, table_name, ref_count in materialization_order:
             table = meta.tables[(schema, table_name)]
@@ -866,7 +891,7 @@ def _materialize_tables(
                         schema,
                         table_q,
                         name=table_name,
-                        primary_key=table.primary_key,
+                        indexes=joined_columns[(schema, table_name)],
                     )
                 )
                 self.cached_table_sizes[(schema, table_name)] = rowcount
@@ -889,7 +914,7 @@ def _materialize_tables(
                         schema,
                         meta.temp_tables[(schema, table_name, 0)].select(),
                         name=table_name,
-                        primary_key=table.primary_key,
+                        indexes=joined_columns[(schema, table_name)],
                     )
                 )
         LOGGER.info(
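
To make the joined_columns bookkeeping concrete, here is a simplified, runnable sketch; SQLTableIdentifier and SQLLeftJoin below are stripped-down stand-ins for subsetter's actual plan models, keeping only the fields the loop above reads:

import collections
from typing import List

from pydantic import BaseModel


class SQLTableIdentifier(BaseModel):
    table_schema: str
    table_name: str
    sampled: bool = False


class SQLLeftJoin(BaseModel):
    right: SQLTableIdentifier
    right_columns: List[str]


joins = [
    SQLLeftJoin(
        right=SQLTableIdentifier(table_schema="app", table_name="users", sampled=True),
        right_columns=["id"],
    ),
    SQLLeftJoin(
        right=SQLTableIdentifier(table_schema="app", table_name="users", sampled=True),
        right_columns=["org_id", "id"],
    ),
]

# Group each joined column tuple under its (schema, table) key; the resulting
# sets are what gets passed to create() as the indexes argument.
joined_columns = collections.defaultdict(set)
for join in joins:
    if join.right.sampled:
        joined_columns[(join.right.table_schema, join.right.table_name)].add(
            tuple(join.right_columns)
        )

print(dict(joined_columns))
# {('app', 'users'): {('id',), ('org_id', 'id')}}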
