
Commit 71c734b

Merge pull request #50 from UniverseTBD/feat/layerwise-extraction
feat: generic module-level embedding extraction + HF Hub upload
2 parents 9e55ce2 + efcbd9b commit 71c734b

10 files changed

Lines changed: 1572 additions & 106 deletions

src/pu/__main__.py

Lines changed: 50 additions & 0 deletions
@@ -47,6 +47,29 @@ def main():
     parser_percentiles.add_argument("--resize-mode", type=str, default="match", choices=["match", "fill"], help="Resize strategy (default: match).")
     parser_percentiles.add_argument("--output", type=str, default="data/percentiles.json", help="Output JSON path (default: data/percentiles.json).")
 
+    # Subparser for layerwise extraction
+    parser_extract = subparsers.add_parser("extract-layers", help="Extract embeddings from all layers of a model.")
+    parser_extract.add_argument("--model", required=True, help="Model to extract (e.g., 'vit', 'dino').")
+    parser_extract.add_argument("--mode", required=True, help="Dataset mode (e.g., 'jwst', 'desi').")
+    parser_extract.add_argument("--batch-size", type=int, default=64, help="Batch size (default: 64, lower than run due to layerwise memory).")
+    parser_extract.add_argument("--num-workers", type=int, default=0, help="Number of data loader workers.")
+    parser_extract.add_argument("--no-resize", dest="resize", action="store_false", help="Disable galaxy resizing.")
+    parser_extract.add_argument("--resize-mode", type=str, default="match", choices=["match", "fill"], help="Resize strategy (default: match).")
+    parser_extract.add_argument("--test", action="store_true", help="Quick test run using only 1000 samples.")
+    parser_extract.add_argument("--test-10k", action="store_true", help="Test run using only 10000 samples.")
+    parser_extract.add_argument("--hf-repo", type=str, default=os.environ.get("PU_HF_REPO"), help="HuggingFace dataset repo ID for upload. Default: $PU_HF_REPO.")
+    parser_extract.add_argument("--hf-token", type=str, default=None, help="HuggingFace token. Default: $HF_TOKEN env var.")
+    parser_extract.add_argument("--no-upload", action="store_true", help="Disable HuggingFace upload (upload is on by default when --hf-repo is set).")
+    parser_extract.add_argument("--delete-after-upload", action="store_true", help="Delete local parquet file after successful upload to HuggingFace. Saves disk space.")
+    parser_extract.add_argument("--output-dir", type=str, default="data", help="Directory to write parquet files (default: data/).")
+
+    # Subparser for pushing parquet files to HuggingFace Hub
+    parser_push = subparsers.add_parser("push", help="Upload parquet files to a HuggingFace dataset repo.")
+    parser_push.add_argument("file", nargs="?", help="Path to a .parquet file to upload.")
+    parser_push.add_argument("--all", action="store_true", help="Upload all .parquet files in data/.")
+    parser_push.add_argument("--repo", required=True, help="HuggingFace dataset repo ID (e.g., 'org/dataset-name').")
+    parser_push.add_argument("--token", type=str, default=None, help="HuggingFace token. Default: $HF_TOKEN env var.")
+
     # Subparser for benchmarking performance optimizations
     parser_benchmark = subparsers.add_parser("benchmark", help="Run performance benchmarks with optimization flags.")
     parser_benchmark.add_argument("--model", required=True, help="Model to benchmark (e.g., 'vit', 'dino').")
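
For orientation, a hedged sketch of exercising the two new subcommands through this entry point rather than from a shell. It assumes the package is importable as pu and that main() reads sys.argv; the model, mode, and repo values are illustrative, and running it would start the actual extraction/upload.

import sys
from pu.__main__ import main

# Roughly: python -m pu extract-layers --model vit --mode jwst --test --no-upload
sys.argv = ["pu", "extract-layers", "--model", "vit", "--mode", "jwst", "--test", "--no-upload"]
main()

# Roughly: python -m pu push --all --repo org/dataset-name
sys.argv = ["pu", "push", "--all", "--repo", "org/dataset-name"]
main()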
@@ -162,6 +185,33 @@ def main():
             resize_mode=args.resize_mode,
             output_path=args.output,
         )
+    elif args.command == "extract-layers":
+        from pu.experiments_layerwise import extract_all_layers
+        if args.mode in PAIRED_MODES and args.num_workers > 0:
+            print(f"Warning: Setting num_workers=0 for paired mode '{args.mode}' because multiple workers can change draw order and break pairing.")
+            args.num_workers = 0
+        extract_all_layers(
+            args.model,
+            args.mode,
+            batch_size=args.batch_size,
+            num_workers=args.num_workers,
+            max_samples=1000 if args.test else 10000 if args.test_10k else None,
+            resize=args.resize,
+            resize_mode=args.resize_mode,
+            output_dir=args.output_dir,
+            hf_repo=args.hf_repo,
+            hf_token=args.hf_token,
+            upload=not args.no_upload,
+            delete_after_upload=args.delete_after_upload,
+        )
+    elif args.command == "push":
+        from pu.hub import push_parquet, push_all
+        if args.all:
+            push_all("data", args.repo, token=args.token)
+        elif args.file:
+            push_parquet(args.file, args.repo, token=args.token)
+        else:
+            parser.error("Specify a file or --all")
     elif args.command == "benchmark":
         from pu.benchmark import run_benchmark, BenchmarkConfig
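
The same extraction can also be driven without the CLI. A minimal sketch that mirrors the keyword arguments this branch forwards to extract_all_layers; the model/mode strings, paths, and repo handling are illustrative values, not project defaults.

from pu.experiments_layerwise import extract_all_layers

extract_all_layers(
    "vit",                      # model alias, as for --model
    "jwst",                     # dataset mode, as for --mode
    batch_size=64,
    num_workers=0,              # keep 0 for paired modes, as the CLI enforces
    max_samples=1000,           # --test; 10000 for --test-10k, None for a full run
    resize=True,
    resize_mode="match",
    output_dir="data",
    hf_repo=None,               # an "org/dataset-name" repo ID enables upload
    hf_token=None,
    upload=False,               # equivalent to --no-upload
    delete_after_upload=False,
)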

src/pu/arch_map.py

Lines changed: 166 additions & 0 deletions
@@ -0,0 +1,166 @@
+"""Map the full module tree of any PyTorch model to a machine-readable JSON.
+
+Every nn.Module in the tree is a valid hook point. This module:
+1. Walks the full named_modules() graph
+2. Probes each module with a dummy forward to get output shapes
+3. Dumps a JSON file describing every extractable point
+
+Usage:
+    from pu.arch_map import map_architecture
+    arch = map_architecture(model, dummy_input)
+    # arch is a list of dicts, each with:
+    # name, class, output_shape, num_params, depth, is_leaf
+"""
+
+import json
+from pathlib import Path
+
+import torch
+import torch.nn as nn
+
+
+def map_architecture(model, dummy_input, device="cuda"):
+    """Walk the full module tree and probe output shapes.
+
+    Args:
+        model: Any nn.Module (already on device, in eval mode).
+        dummy_input: A tensor that can be passed to model(dummy_input).
+            For CLIP, pass pixel_values; for VLMs, pass a dict.
+        device: Device string.
+
+    Returns:
+        List of dicts, one per named module (excluding root).
+    """
+    shapes = {}
+    hooks = []
+
+    def _make_hook(name):
+        def hook(module, input, output):
+            if isinstance(output, tuple):
+                t = output[0]
+            elif isinstance(output, dict):
+                # Some modules return dicts (e.g., BaseModelOutput)
+                t = next(iter(output.values())) if output else None
+            else:
+                t = output
+            if isinstance(t, torch.Tensor):
+                shapes[name] = list(t.shape)
+            else:
+                shapes[name] = None
+        return hook
+
+    # Register hooks on every module
+    for name, mod in model.named_modules():
+        if name:
+            h = mod.register_forward_hook(_make_hook(name))
+            hooks.append(h)
+
+    # Forward pass to capture all shapes
+    with torch.no_grad():
+        try:
+            if isinstance(dummy_input, dict):
+                model(**dummy_input)
+            else:
+                model(dummy_input)
+        except Exception as e:
+            print(f"Warning: forward pass raised {e.__class__.__name__}: {e}")
+
+    # Remove all hooks
+    for h in hooks:
+        h.remove()
+
+    # Build architecture map
+    arch = []
+    for name, mod in model.named_modules():
+        if not name:
+            continue
+        # Count depth by dots
+        depth = name.count(".") + 1
+        # Is leaf = has no children
+        is_leaf = len(list(mod.children())) == 0
+        # Parameter count (non-recursive to avoid double counting)
+        num_params = sum(p.numel() for p in mod.parameters(recurse=False))
+
+        entry = {
+            "name": name,
+            "class": mod.__class__.__name__,
+            "output_shape": shapes.get(name),
+            "num_params": num_params,
+            "depth": depth,
+            "is_leaf": is_leaf,
+        }
+        arch.append(entry)
+
+    return arch
+
+
+def map_all_models(output_dir="data/architectures", batch_size=2, image_size=224):
+    """Map architectures for all registered models and save as JSON files.
+
+    Loads each model, runs a dummy forward, and saves the full module tree.
+    """
+    from pu.models import get_adapter
+    from pu.experiments_layerwise import MODEL_MAP
+
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    dummy_img = torch.randn(batch_size, 3, image_size, image_size)
+
+    for alias, (sizes, model_names) in MODEL_MAP.items():
+        # Just map the first (smallest) size
+        size, model_name = sizes[0], model_names[0]
+        out_path = output_dir / f"{alias}_{size}.json"
+
+        if out_path.exists():
+            print(f"[skip] {out_path} already exists")
+            continue
+
+        print(f"\n[{alias} {size}] Loading {model_name}...")
+        try:
+            adapter_cls = get_adapter(alias)
+            adapter = adapter_cls(model_name, size, alias=alias)
+            adapter.load()
+        except Exception as e:
+            print(f" [error] Could not load: {e}")
+            continue
+
+        model = adapter.model
+        device = next(model.parameters()).device
+
+        # Determine the right input for this model type
+        if alias in ("clip",):
+            dummy = dummy_img.to(device)
+            # CLIP needs pixel_values kwarg for full model, but we map vision_model
+            model_to_map = model.vision_model
+            dummy_for_map = dummy
+        else:
+            model_to_map = model
+            dummy_for_map = dummy_img.to(device)
+
+        print(f" Mapping {sum(1 for _ in model_to_map.named_modules()) - 1} modules...")
+        arch = map_architecture(model_to_map, dummy_for_map, device=str(device))
+
+        with open(out_path, "w") as f:
+            json.dump({
+                "model_alias": alias,
+                "model_size": size,
+                "model_name": model_name,
+                "num_modules": len(arch),
+                "num_leaf_modules": sum(1 for a in arch if a["is_leaf"]),
+                "total_params": sum(a["num_params"] for a in arch),
+                "modules": arch,
+            }, f, indent=2)
+
+        print(f" Saved to {out_path} ({len(arch)} modules)")
+
+        # Cleanup
+        del adapter, model, model_to_map
+        import gc
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
+
+if __name__ == "__main__":
+    map_all_models()
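
As a quick sanity check of the hook-based probing in map_architecture, a minimal sketch (only torch is needed besides the package itself; the toy Sequential is illustrative) of mapping a small model and inspecting the entries it returns:

import torch
import torch.nn as nn
from pu.arch_map import map_architecture

toy = nn.Sequential(
    nn.Conv2d(3, 8, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.Flatten(),
    nn.Linear(8 * 32 * 32, 10),
).eval()

arch = map_architecture(toy, torch.randn(1, 3, 32, 32), device="cpu")
for entry in arch:
    print(entry["name"], entry["class"], entry["output_shape"], entry["num_params"])
# e.g. 0 Conv2d [1, 8, 32, 32] 224 / 1 ReLU [1, 8, 32, 32] 0 / ...

These per-module dicts are the same entries that map_all_models wraps, together with model metadata and aggregate counts (num_modules, num_leaf_modules, total_params), into one JSON file per model alias under data/architectures/.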
