feat: update workspace paths and enhance gitignore

- Updated the stablediffusion crate path from "../stable-diffusion-burn" to "./crates/stable-diffusion-burn" for proper workspace resolution
- Enhanced .gitignore to include generated model files (.mpk, .pt, .bin, .safetensors, .ckpt) and the user_data directory
- Added Cargo.lock to .gitignore with an explanatory comment
- Reorganized the IDE files section in .gitignore for better clarity
- Added a newline at the end of the file for proper formatting
2026-03-05 19:39:14 +01:00
parent 4bb7ca9074
commit 3a67c0979c
1605 changed files with 537032 additions and 2 deletions
--- a/crates/stable-diffusion-burn/burn-crates/burn-store/benches/download_resnet18.py
+++ b/crates/stable-diffusion-burn/burn-crates/burn-store/benches/download_resnet18.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+# /// script
+# requires-python = ">=3.8"
+# dependencies = [
+#     "torch",
+#     "torchvision",
+# ]
+# ///
+"""
+Download ResNet18 PyTorch model for benchmarking.
+This script downloads a pre-trained ResNet18 model from PyTorch Hub
+and saves it in a format suitable for benchmarking.
+"""
+
+import os
+import sys
+import tempfile
+from pathlib import Path
+
+import torch
+import torchvision.models as models
+
+def download_resnet18():
+    """Download ResNet18 model and save to temp directory."""
+
+    # Create a temporary directory for the model
+    temp_dir = Path(tempfile.gettempdir()) / "burn_resnet18_benchmark"
+    temp_dir.mkdir(parents=True, exist_ok=True)
+
+    output_path = temp_dir / "resnet18.pth"
+
+    # Check if already downloaded
+    if output_path.exists():
+        file_size_mb = output_path.stat().st_size / (1024 * 1024)
+        print(f"✅ ResNet18 already exists at: {output_path}")
+        print(f"   Size: {file_size_mb:.1f} MB")
+        return str(output_path)
+
+    print("📥 Downloading ResNet18 model...")
+
+    try:
+        # Download pre-trained ResNet18 model
+        model = models.resnet18(pretrained=True)
+
+        # Save the model state dict (this is what burn-store reads)
+        # Using the legacy format for compatibility
+        torch.save(model.state_dict(), output_path, _use_new_zipfile_serialization=False)
+
+        file_size_mb = output_path.stat().st_size / (1024 * 1024)
+        print(f"✅ Successfully downloaded ResNet18 to: {output_path}")
+        print(f"   Size: {file_size_mb:.1f} MB")
+        print(f"   Format: PyTorch legacy format")
+
+        # Verify it's readable
+        state_dict = torch.load(output_path, map_location='cpu')
+        print(f"   Tensors: {len(state_dict)} tensors")
+
+        # Print a few tensor names and shapes for verification
+        print("\n   Sample tensors:")
+        for i, (name, tensor) in enumerate(state_dict.items()):
+            if i < 3:
+                print(f"     - {name}: {list(tensor.shape)}")
+
+        return str(output_path)
+
+    except Exception as e:
+        print(f"❌ Failed to download ResNet18: {e}")
+        sys.exit(1)
+
+def main():
+    """Main entry point."""
+    path = download_resnet18()
+
+    # Write the path to a file that the benchmark can read
+    bench_config = Path(tempfile.gettempdir()) / "burn_resnet18_benchmark" / "path.txt"
+    bench_config.write_text(path)
+
+    print(f"\n💡 Model ready for benchmarking")
+    print(f"   Run: cargo bench --bench resnet18_loading")
+
+if __name__ == "__main__":
+    main()
--- a/crates/stable-diffusion-burn/burn-crates/burn-store/benches/generate_unified_models.py
+++ b/crates/stable-diffusion-burn/burn-crates/burn-store/benches/generate_unified_models.py
@@ -0,0 +1,175 @@
+#!/usr/bin/env python3
+# /// script
+# requires-python = ">=3.8"
+# dependencies = [
+#     "torch",
+#     "safetensors",
+#     "packaging",
+#     "numpy",
+# ]
+# ///
+"""
+Generate a large model (~312MB) in both PyTorch and SafeTensors formats for unified benchmarking.
+
+Usage:
+    uv run benches/generate_unified_models.py
+
+The script creates the model files in the simple_bench_models/ directory under the
+system temp directory (for example /tmp/simple_bench_models/).
+"""
+
+import torch
+import torch.nn as nn
+import os
+from pathlib import Path
+import tempfile
+from safetensors.torch import save_file
+
+def get_temp_dir():
+    """Get the appropriate temp directory."""
+    temp_dir = Path(tempfile.gettempdir()) / "simple_bench_models"
+    temp_dir.mkdir(parents=True, exist_ok=True)
+    return temp_dir
+
+class LargeModel(nn.Module):
+    """Large model with 20 layers to match Rust benchmark."""
+    def __init__(self):
+        super().__init__()
+        self.layers = nn.ModuleList()
+
+        # Create a model with 20 layers matching the Rust LargeModel
+        for i in range(20):
+            in_size = 1024 if i == 0 else 2048
+            out_size = 2048
+            self.layers.append(nn.Linear(in_size, out_size))
+
+        print(f"Created model with {len(self.layers)} layers")
+
+    def forward(self, x):
+        for layer in self.layers:
+            x = layer(x)
+        return x
+
+def calculate_model_size(model):
+    """Calculate the size of the model in MB."""
+    total_params = sum(p.numel() for p in model.parameters())
+    size_mb = (total_params * 4) / (1024 * 1024)  # 4 bytes per float32
+    return total_params, size_mb
+
+def initialize_weights(model):
+    """Initialize model weights with random values."""
+    for param in model.parameters():
+        if param.dim() > 1:
+            nn.init.xavier_uniform_(param)
+        else:
+            nn.init.zeros_(param)
+
+def save_pytorch_format(model, output_dir):
+    """Save model in PyTorch format."""
+    pt_path = output_dir / "large_model.pt"
+
+    # Save as checkpoint with model_state_dict (common format)
+    checkpoint = {
+        'model_state_dict': model.state_dict(),
+        'metadata': {
+            'model_type': 'large_benchmark_model',
+            'num_layers': len(model.layers),
+        }
+    }
+    torch.save(checkpoint, pt_path)
+
+    return pt_path
+
+def save_safetensors_format(model, output_dir):
+    """Save model in SafeTensors format."""
+    st_path = output_dir / "large_model.safetensors"
+
+    # Convert state dict to safetensors format
+    state_dict = model.state_dict()
+    # Ensure all tensors are contiguous and on CPU
+    state_dict = {k: v.contiguous().cpu() for k, v in state_dict.items()}
+
+    # Save with metadata
+    metadata = {
+        'model_type': 'large_benchmark_model',
+        'num_layers': str(len(model.layers)),
+    }
+    save_file(state_dict, st_path, metadata=metadata)
+
+    return st_path
+
+def verify_files(pt_path, st_path):
+    """Verify the saved files can be loaded."""
+    # Verify PyTorch file
+    checkpoint = torch.load(pt_path, map_location='cpu')
+    pt_keys = set(checkpoint['model_state_dict'].keys())
+    print(f"  PyTorch file: {len(pt_keys)} tensors")
+
+    # Verify SafeTensors file
+    from safetensors import safe_open
+    with safe_open(st_path, framework="pt", device="cpu") as f:
+        st_keys = set(f.keys())
+        print(f"  SafeTensors file: {len(st_keys)} tensors")
+
+    # Check keys match
+    if pt_keys != st_keys:
+        print("  ⚠️ Warning: Keys don't match between formats!")
+    else:
+        print("  ✓ Keys match between formats")
+
+def main():
+    print("🔧 Generating unified benchmark model files...")
+    print("")
+
+    output_dir = get_temp_dir()
+    print(f"📁 Output directory: {output_dir}")
+    print("")
+
+    # Set random seed for reproducibility
+    torch.manual_seed(42)
+
+    # Create the large model
+    print("📝 Creating large model...")
+    model = LargeModel()
+
+    # Calculate and display model size
+    total_params, size_mb = calculate_model_size(model)
+    print(f"  Total parameters: {total_params:,}")
+    print(f"  Model size: {size_mb:.2f} MB")
+    print("")
+
+    # Initialize weights
+    print("🎲 Initializing weights...")
+    initialize_weights(model)
+
+    # Save in PyTorch format
+    print("💾 Saving PyTorch format...")
+    pt_path = save_pytorch_format(model, output_dir)
+    pt_size_mb = pt_path.stat().st_size / (1024 * 1024)
+    print(f"  Saved: {pt_path}")
+    print(f"  File size: {pt_size_mb:.2f} MB")
+    print("")
+
+    # Save in SafeTensors format
+    print("💾 Saving SafeTensors format...")
+    st_path = save_safetensors_format(model, output_dir)
+    st_size_mb = st_path.stat().st_size / (1024 * 1024)
+    print(f"  Saved: {st_path}")
+    print(f"  File size: {st_size_mb:.2f} MB")
+    print("")
+
+    # Verify files
+    print("🔍 Verifying saved files...")
+    verify_files(pt_path, st_path)
+    print("")
+
+    print(f"✅ Model files generated successfully!")
+    print("")
+    print("📊 Summary:")
+    print(f"  PyTorch file: {pt_path.name} ({pt_size_mb:.2f} MB)")
+    print(f"  SafeTensors file: {st_path.name} ({st_size_mb:.2f} MB)")
+    print("")
+    print("💡 To run the unified benchmark:")
+    print("   cargo bench --bench unified_loading")
+
+if __name__ == "__main__":
+    main()
--- a/crates/stable-diffusion-burn/burn-crates/burn-store/benches/resnet18_loading.rs
+++ b/crates/stable-diffusion-burn/burn-crates/burn-store/benches/resnet18_loading.rs
@@ -0,0 +1,213 @@
+//! Benchmark for ResNet18 loading to verify lazy loading memory usage.
+//!
+//! resnet18.pth is pytorch's legacy file format.
+//!
+//! This benchmark loads a ResNet18 model and materializes all tensors
+//! to ensure memory usage stays reasonable with lazy loading.
+//!
+//! Run the benchmark:
+//! ```bash
+//! cargo bench --bench resnet18_loading
+//! ```
+
+use burn_store::pytorch::PytorchReader;
+use divan::{AllocProfiler, Bencher};
+use std::path::PathBuf;
+
+#[global_allocator]
+static ALLOC: AllocProfiler = AllocProfiler::system();
+
+fn main() {
+    // Without the model file there is nothing to benchmark: explain how to get it and stop.
+    let model_path = resnet18_path();
+    if !model_path.exists() {
+        eprintln!("❌ ResNet18 model not found!");
+        eprintln!();
+        eprintln!("Please download it first by running:");
+        eprintln!("  python benches/download_resnet18.py");
+        eprintln!();
+        eprintln!("Or if you don't have Python/PyTorch installed:");
+        eprintln!("  uv run benches/download_resnet18.py");
+        eprintln!();
+        eprintln!("Expected location: {}", model_path.display());
+        std::process::exit(1);
+    }
+
+    // A size outside 40-50 MB only triggers a warning; the benchmarks still run.
+    let byte_len = std::fs::metadata(&model_path)
+        .expect("Failed to read file metadata")
+        .len();
+    let size_mb = byte_len as f64 / 1_048_576.0;
+
+    if !(40.0..=50.0).contains(&size_mb) {
+        eprintln!("⚠️ Warning: ResNet18 file size ({size_mb:.1} MB) seems unusual");
+        eprintln!("Expected size is around 45 MB");
+    }
+
+    println!("✅ Found ResNet18 model at: {}", model_path.display());
+    println!("📦 File size: {size_mb:.1} MB");
+    println!("📊 Running ResNet18 loading benchmarks...\n");
+
+    // Hand over to divan, which runs the registered benchmarks.
+    divan::main();
+}
+
+/// Locate the ResNet18 weights file used by the benchmarks.
+///
+/// The path recorded in `path.txt` (inside the `burn_resnet18_benchmark`
+/// folder of the system temp directory) is used when that file can be read
+/// and the trimmed path it contains exists. Otherwise the default
+/// `resnet18.pth` in the same folder is returned, whether or not it exists.
+fn resnet18_path() -> PathBuf {
+    let bench_dir = std::env::temp_dir().join("burn_resnet18_benchmark");
+
+    std::fs::read_to_string(bench_dir.join("path.txt"))
+        .ok()
+        .map(|recorded| PathBuf::from(recorded.trim()))
+        .filter(|candidate| candidate.exists())
+        .unwrap_or_else(|| bench_dir.join("resnet18.pth"))
+}
+
+#[divan::bench(sample_count = 10)]
+fn load_resnet18_metadata(bencher: Bencher) {
+    let model_path = resnet18_path();
+
+    // Each iteration opens the file and reads the metadata only;
+    // no tensor data is requested here.
+    bencher.bench_local(|| {
+        let reader = PytorchReader::new(&model_path).expect("Failed to load ResNet18");
+        assert_eq!(reader.metadata().tensor_count, 122);
+    });
+}
+
+#[divan::bench(sample_count = 5)]
+fn load_resnet18_materialize_all(bencher: Bencher) {
+    let model_path = resnet18_path();
+
+    bencher.bench_local(|| {
+        let reader = PytorchReader::new(&model_path).expect("Failed to load ResNet18");
+        let names = reader.keys();
+
+        // Running total of `data_len()` over every tensor that was materialized.
+        let mut materialized_bytes = 0usize;
+
+        for name in &names {
+            let entry = reader.get(name).expect("Failed to get tensor");
+            // Force the tensor data to be produced; it is held until the end of this iteration.
+            let _materialized = entry
+                .to_data()
+                .expect("Failed to materialize tensor data");
+            materialized_bytes += entry.data_len();
+        }
+
+        // Sanity check that the whole model (~45MB) was walked.
+        assert!(materialized_bytes > 40_000_000);
+    });
+}
+
+#[divan::bench(sample_count = 5)]
+fn load_resnet18_materialize_sequential(bencher: Bencher) {
+    let model_path = resnet18_path();
+
+    bencher.bench_local(|| {
+        let reader = PytorchReader::new(&model_path).expect("Failed to load ResNet18");
+        let names = reader.keys();
+
+        // One tensor at a time: each materialized buffer goes out of scope
+        // before the next one is requested.
+        for name in &names {
+            let entry = reader.get(name).expect("Failed to get tensor");
+            let data = entry
+                .to_data()
+                .expect("Failed to materialize tensor data");
+
+            // Touch the values with a cheap reduction; dtypes other than
+            // F32/F64 and failed slice views count as 0.0.
+            let checksum: f64 = match data.dtype {
+                burn_tensor::DType::F32 => f64::from(
+                    data.as_slice::<f32>()
+                        .map_or(0.0, |values| values.iter().sum::<f32>()),
+                ),
+                burn_tensor::DType::F64 => data
+                    .as_slice::<f64>()
+                    .map_or(0.0, |values| values.iter().sum::<f64>()),
+                _ => 0.0,
+            };
+
+            // Keep the optimizer from discarding the reduction.
+            std::hint::black_box(checksum);
+        }
+    });
+}
+
+#[divan::bench(sample_count = 10)]
+fn load_resnet18_largest_tensor(bencher: Bencher) {
+    let model_path = resnet18_path();
+
+    bencher.bench_local(|| {
+        let reader = PytorchReader::new(&model_path).expect("Failed to load ResNet18");
+
+        // Pass 1: scan every entry's `data_len()` and remember the first
+        // key with the strictly largest value.
+        let names = reader.keys();
+        let mut biggest = (String::new(), 0usize);
+
+        for name in &names {
+            let entry = reader.get(name).expect("Failed to get tensor");
+            let len = entry.data_len();
+            if len > biggest.1 {
+                biggest = (name.clone(), len);
+            }
+        }
+        let (biggest_name, biggest_len) = biggest;
+
+        // Pass 2: materialize only that one tensor.
+        let entry = reader
+            .get(&biggest_name)
+            .expect("Failed to get largest tensor");
+        let _materialized = entry
+            .to_data()
+            .expect("Failed to materialize tensor data");
+
+        // Should be ~9MB for layer4.0.conv2.weight
+        assert!(biggest_len > 9_000_000);
+    });
+}
+
+#[divan::bench(sample_count = 10)]
+fn load_resnet18_memory_profile(bencher: Bencher) {
+    let model_path = resnet18_path();
+
+    bencher
+        .with_inputs(|| model_path.clone())
+        .bench_local_values(|path| {
+            let reader = PytorchReader::new(&path).expect("Failed to load ResNet18");
+            let names = reader.keys();
+
+            // Largest single `data_len()` seen, and the sum over all tensors.
+            let mut largest_seen = 0usize;
+            let mut bytes_seen = 0usize;
+
+            for name in &names {
+                let entry = reader.get(name).expect("Failed to get tensor");
+                let len = entry.data_len();
+                largest_seen = largest_seen.max(len);
+
+                let data = entry
+                    .to_data()
+                    .expect("Failed to materialize tensor data");
+                bytes_seen += len;
+
+                // Release the buffer right away so at most one tensor's
+                // materialized data is held by this loop at a time.
+                drop(data);
+            }
+
+            // Returned so the computed statistics are part of the benchmark's result.
+            (largest_seen, bytes_seen)
+        });
+}
--- a/crates/stable-diffusion-burn/burn-crates/burn-store/benches/unified_loading.rs
+++ b/crates/stable-diffusion-burn/burn-crates/burn-store/benches/unified_loading.rs
@@ -0,0 +1,332 @@
+#![recursion_limit = "256"]
+
+//! Unified benchmark comparing all loading methods:
+//! - BurnpackStore (new native format)
+//! - NamedMpkFileRecorder (old native format)
+//! - SafetensorsStore (new)
+//! - SafetensorsFileRecorder (old)
+//! - PytorchStore (new)
+//! - PyTorchFileRecorder (old)
+//!
+//! Before running this benchmark, generate the model files:
+//! ```bash
+//! cd crates/burn-store
+//! uv run benches/generate_unified_models.py
+//! ```
+//!
+//! Then run the benchmark:
+//! ```bash
+//! cargo bench --bench unified_loading
+//! ```
+
+use burn_core as burn;
+
+use burn_core::module::Module;
+use burn_core::prelude::*;
+use burn_core::record::{FullPrecisionSettings, NamedMpkFileRecorder, Recorder};
+// use burn_import::pytorch::{LoadArgs, PyTorchFileRecorder};
+// use burn_import::safetensors::SafetensorsFileRecorder;
+use burn_nn as nn;
+use burn_store::{
+    BurnpackStore, ModuleSnapshot, PyTorchToBurnAdapter, PytorchStore, SafetensorsStore,
+};
+use divan::{AllocProfiler, Bencher};
+use std::fs;
+use std::path::{Path, PathBuf};
+
+#[global_allocator]
+static ALLOC: AllocProfiler = AllocProfiler::system();
+
+// Backend type aliases
+type NdArrayBackend = burn_ndarray::NdArray<f32>;
+
+#[cfg(feature = "wgpu")]
+type WgpuBackend = burn_wgpu::Wgpu;
+
+#[cfg(feature = "cuda")]
+type CudaBackend = burn_cuda::Cuda<f32, i32>;
+
+#[cfg(feature = "tch")]
+type TchBackend = burn_tch::LibTorch<f32>;
+
+#[cfg(feature = "metal")]
+type MetalBackend = burn_wgpu::Metal;
+
+// Same layer layout as the LargeModel in the other benchmarks, for a fair comparison.
+#[derive(Module, Debug)]
+struct LargeModel<B: Backend> {
+    layers: Vec<nn::Linear<B>>,
+}
+
+impl<B: Backend> LargeModel<B> {
+    /// Builds 20 linear layers on `device`: 1024 -> 2048 for the first one,
+    /// 2048 -> 2048 for each of the rest.
+    fn new(device: &B::Device) -> Self {
+        let layers = (0..20)
+            .map(|index| {
+                let inputs = if index == 0 { 1024 } else { 2048 };
+                nn::LinearConfig::new(inputs, 2048).init(device)
+            })
+            .collect();
+        Self { layers }
+    }
+}
+
+/// Directory holding the benchmark model files: `simple_bench_models` under the system temp dir.
+fn get_model_dir() -> PathBuf {
+    let mut dir = std::env::temp_dir();
+    dir.push("simple_bench_models");
+    dir
+}
+
+/// Derive the Burnpack and NamedMpk files from the SafeTensors file.
+///
+/// The model is always loaded from `st_path` on the NdArray backend; each
+/// output (`bp_path`, `mpk_path`) is then written only if it does not exist
+/// yet. Panics if loading or either save fails.
+fn generate_burn_formats(st_path: &Path, bp_path: &Path, mpk_path: &Path) {
+    type TestBackend = NdArrayBackend;
+    let device = Default::default();
+
+    // Populate a freshly built model with the SafeTensors weights.
+    let mut model = LargeModel::<TestBackend>::new(&device);
+    let mut source = SafetensorsStore::from_file(st_path).with_from_adapter(PyTorchToBurnAdapter);
+    model
+        .load_from(&mut source)
+        .expect("Failed to load from SafeTensors");
+
+    // Burnpack output
+    let burnpack_missing = !bp_path.exists();
+    if burnpack_missing {
+        println!("  Creating Burnpack file...");
+        let mut target = BurnpackStore::from_file(bp_path);
+        model
+            .save_into(&mut target)
+            .expect("Failed to save as Burnpack");
+    }
+
+    // NamedMpk output
+    let mpk_missing = !mpk_path.exists();
+    if mpk_missing {
+        println!("  Creating NamedMpk file...");
+        let mpk_recorder = NamedMpkFileRecorder::<FullPrecisionSettings>::default();
+        model
+            .save_file(mpk_path, &mpk_recorder)
+            .expect("Failed to save as NamedMpk");
+    }
+}
+
+/// Paths of the four model files, in the order
+/// `(burnpack, named_mpk, safetensors, pytorch)`.
+fn get_model_paths() -> (PathBuf, PathBuf, PathBuf, PathBuf) {
+    let model_dir = get_model_dir();
+    let file = |name: &str| model_dir.join(name);
+
+    (
+        file("large_model.bpk"),
+        file("large_model.mpk"),
+        file("large_model.safetensors"),
+        file("large_model.pt"),
+    )
+}
+
+/// Verify that the SafeTensors and PyTorch input files are present.
+///
+/// Only these two are checked; the Burnpack and NamedMpk paths are ignored
+/// here. On failure the `Err` carries a ready-to-print message with the
+/// generation instructions and the two expected paths.
+fn check_model_files() -> Result<(), String> {
+    let (_, _, st_path, pt_path) = get_model_paths();
+
+    if st_path.exists() && pt_path.exists() {
+        return Ok(());
+    }
+
+    Err(format!(
+        "\n❌ Model files not found!\n\
+        \n\
+        Please generate the model files first by running:\n\
+        \n\
+        cd crates/burn-store\n\
+        uv run benches/generate_unified_models.py\n\
+        \n\
+        Expected files:\n\
+        - {}\n\
+        - {}\n",
+        st_path.display(),
+        pt_path.display()
+    ))
+}
+
+/// Entry point: validates the inputs, derives the Burn-native files when
+/// needed, prints a banner describing the run, then hands over to divan.
+///
+/// Exits with status 1 (after printing instructions) when the SafeTensors or
+/// PyTorch input files are missing.
+fn main() {
+    // Check if model files exist before running benchmarks
+    if let Err(msg) = check_model_files() {
+        eprintln!("{}", msg);
+        std::process::exit(1);
+    }
+
+    let (bp_path, mpk_path, st_path, pt_path) = get_model_paths();
+
+    // First, generate Burnpack and MPK files if they don't exist
+    if !bp_path.exists() || !mpk_path.exists() {
+        println!("⏳ Generating Burnpack and NamedMpk files from SafeTensors...");
+        generate_burn_formats(&st_path, &bp_path, &mpk_path);
+    }
+
+    // Size in MB, or `None` when the file's metadata cannot be read.
+    let size_mb = |path: &Path| {
+        fs::metadata(path)
+            .ok()
+            .map(|m| m.len() as f64 / 1_048_576.0)
+    };
+
+    let bp_size = size_mb(&bp_path);
+    let mpk_size = size_mb(&mpk_path);
+    // These two files were just confirmed to exist, so a metadata failure here is unexpected.
+    let st_size = size_mb(&st_path).expect("SafeTensors file metadata should be readable");
+    let pt_size = size_mb(&pt_path).expect("PyTorch file metadata should be readable");
+
+    println!("✅ Found model files:");
+    if let Some(size) = bp_size {
+        println!("  Burnpack: {} ({:.1} MB)", bp_path.display(), size);
+    }
+    if let Some(size) = mpk_size {
+        println!("  NamedMpk: {} ({:.1} MB)", mpk_path.display(), size);
+    }
+    println!("  SafeTensors: {} ({:.1} MB)", st_path.display(), st_size);
+    println!("  PyTorch: {} ({:.1} MB)", pt_path.display(), pt_size);
+    println!();
+    println!("🚀 Running unified loading benchmarks...");
+    println!();
+    // Only the benchmarks that are actually compiled are announced; the two
+    // legacy recorder benchmarks are commented out in `bench_backend!`.
+    println!("Comparing 4 loading methods:");
+    println!("  1. BurnpackStore (new native format - lazy loading)");
+    println!("  2. NamedMpkFileRecorder (old native format - loads all to memory)");
+    println!("  3. SafetensorsStore (new)");
+    println!("  4. PytorchStore (new)");
+    println!();
+    println!("Currently disabled (not benchmarked):");
+    println!("  - SafetensorsFileRecorder (old)");
+    println!("  - PyTorchFileRecorder (old)");
+    println!();
+    println!("Available backends:");
+    println!("  - NdArray (CPU)");
+    #[cfg(feature = "wgpu")]
+    println!("  - WGPU (GPU)");
+    #[cfg(feature = "cuda")]
+    println!("  - CUDA (NVIDIA GPU)");
+    #[cfg(feature = "tch")]
+    println!("  - LibTorch");
+    #[cfg(feature = "metal")]
+    println!("  - Metal (Apple GPU)");
+    println!();
+
+    divan::main();
+}
+
+// Macro to generate benchmarks for each backend
+//
+// Arguments:
+//   $backend      - backend type the model is instantiated with
+//   $mod_name     - name of the module that will contain the benchmarks
+//   $backend_name - display name of the divan benchmark group
+//
+// Each invocation expands to one divan bench group (10 samples) holding one
+// benchmark per loading method. Every benchmark reports the size of the file
+// it loads as a bytes counter and builds a fresh model on the default device
+// inside the measured closure.
+macro_rules! bench_backend {
+    ($backend:ty, $mod_name:ident, $backend_name:literal) => {
+        #[divan::bench_group(name = $backend_name, sample_count = 10)]
+        mod $mod_name {
+            use super::*;
+
+            type TestBackend = $backend;
+            type TestDevice = <TestBackend as Backend>::Device;
+
+            // Load the Burnpack file through BurnpackStore.
+            #[divan::bench]
+            fn burnpack_store(bencher: Bencher) {
+                let (bp_path, _, _, _) = get_model_paths();
+                let file_size = fs::metadata(&bp_path).unwrap().len();
+
+                bencher
+                    .counter(divan::counter::BytesCount::new(file_size))
+                    .bench(|| {
+                        let device: TestDevice = Default::default();
+                        let mut model = LargeModel::<TestBackend>::new(&device);
+                        let mut store = BurnpackStore::from_file(bp_path.clone());
+                        model.load_from(&mut store).expect("Failed to load");
+                    });
+            }
+
+            // Load the NamedMpk file through the recorder API, then apply the record to a new model.
+            #[divan::bench]
+            fn namedmpk_recorder(bencher: Bencher) {
+                let (_, mpk_path, _, _) = get_model_paths();
+                let file_size = fs::metadata(&mpk_path).unwrap().len();
+
+                bencher
+                    .counter(divan::counter::BytesCount::new(file_size))
+                    .bench(|| {
+                        let device: TestDevice = Default::default();
+                        let recorder = NamedMpkFileRecorder::<FullPrecisionSettings>::default();
+                        let record = recorder
+                            .load(mpk_path.clone().into(), &device)
+                            .expect("Failed to load");
+                        let _model = LargeModel::<TestBackend>::new(&device).load_record(record);
+                    });
+            }
+
+            // Load the SafeTensors file through SafetensorsStore with the PyTorch-to-Burn adapter.
+            #[divan::bench]
+            fn safetensors_store(bencher: Bencher) {
+                let (_, _, st_path, _) = get_model_paths();
+                let file_size = fs::metadata(&st_path).unwrap().len();
+
+                bencher
+                    .counter(divan::counter::BytesCount::new(file_size))
+                    .bench(|| {
+                        let device: TestDevice = Default::default();
+                        let mut model = LargeModel::<TestBackend>::new(&device);
+                        let mut store = SafetensorsStore::from_file(st_path.clone())
+                            .with_from_adapter(PyTorchToBurnAdapter);
+                        model.load_from(&mut store).expect("Failed to load");
+                    });
+            }
+
+            // Disabled: uses SafetensorsFileRecorder, whose `burn_import` import is
+            // commented out at the top of this file.
+            // #[divan::bench]
+            // fn safetensors_recorder(bencher: Bencher) {
+            //     let (_, _, st_path, _) = get_model_paths();
+            //     let file_size = fs::metadata(&st_path).unwrap().len();
+
+            //     bencher
+            //         .counter(divan::counter::BytesCount::new(file_size))
+            //         .bench(|| {
+            //             let device: TestDevice = Default::default();
+            //             let recorder = SafetensorsFileRecorder::<FullPrecisionSettings>::default();
+            //             let record = recorder
+            //                 .load(st_path.clone().into(), &device)
+            //                 .expect("Failed to load");
+            //             let _model = LargeModel::<TestBackend>::new(&device).load_record(record);
+            //         });
+            // }
+
+            // Load the PyTorch checkpoint through PytorchStore, reading the tensors
+            // under the `model_state_dict` top-level key with partial loading allowed.
+            #[divan::bench]
+            fn pytorch_store(bencher: Bencher) {
+                let (_, _, _, pt_path) = get_model_paths();
+                let file_size = fs::metadata(&pt_path).unwrap().len();
+
+                bencher
+                    .counter(divan::counter::BytesCount::new(file_size))
+                    .bench(|| {
+                        let device: TestDevice = Default::default();
+                        let mut model = LargeModel::<TestBackend>::new(&device);
+                        let mut store = PytorchStore::from_file(pt_path.clone())
+                            .with_top_level_key("model_state_dict")
+                            .allow_partial(true);
+                        model.load_from(&mut store).expect("Failed to load");
+                    });
+            }
+
+            // Disabled: uses PyTorchFileRecorder and LoadArgs, whose `burn_import`
+            // import is commented out at the top of this file.
+            // #[divan::bench]
+            // fn pytorch_recorder(bencher: Bencher) {
+            //     let (_, _, _, pt_path) = get_model_paths();
+            //     let file_size = fs::metadata(&pt_path).unwrap().len();
+
+            //     bencher
+            //         .counter(divan::counter::BytesCount::new(file_size))
+            //         .bench(|| {
+            //             let device: TestDevice = Default::default();
+            //             let recorder = PyTorchFileRecorder::<FullPrecisionSettings>::default();
+            //             let load_args =
+            //                 LoadArgs::new(pt_path.clone()).with_top_level_key("model_state_dict");
+            //             let record = recorder.load(load_args, &device).expect("Failed to load");
+            //             let _model = LargeModel::<TestBackend>::new(&device).load_record(record);
+            //         });
+            // }
+        }
+    };
+}
+
+// Generate benchmarks for each backend
+bench_backend!(NdArrayBackend, ndarray_backend, "NdArray Backend (CPU)");
+
+#[cfg(feature = "wgpu")]
+bench_backend!(WgpuBackend, wgpu_backend, "WGPU Backend (GPU)");
+
+#[cfg(feature = "cuda")]
+bench_backend!(CudaBackend, cuda_backend, "CUDA Backend (NVIDIA GPU)");
+
+#[cfg(feature = "tch")]
+bench_backend!(TchBackend, tch_backend, "LibTorch Backend");
+
+#[cfg(feature = "metal")]
+bench_backend!(MetalBackend, metal_backend, "Metal Backend (Apple GPU)");
--- a/crates/stable-diffusion-burn/burn-crates/burn-store/benches/unified_saving.rs
+++ b/crates/stable-diffusion-burn/burn-crates/burn-store/benches/unified_saving.rs
@@ -0,0 +1,183 @@
+#![recursion_limit = "256"]
+
+//! Unified benchmark comparing all saving methods:
+//! - BurnpackStore (new native format)
+//! - NamedMpkFileRecorder (old native format)
+//! - SafetensorsStore (new)
+//!
+//! No manual setup is needed: the output directory (`simple_bench_models_saving`
+//! inside the system temp directory) is created automatically at startup, and
+//! every benchmark removes the file it wrote.
+//!
+//! Run the benchmark with:
+//! ```bash
+//! cargo bench --bench unified_saving
+//! ```
+use burn_core as burn;
+
+use burn_core::module::Module;
+use burn_core::prelude::*;
+use burn_core::record::{FullPrecisionSettings, NamedMpkFileRecorder};
+use burn_nn as nn;
+use burn_store::{BurnpackStore, ModuleSnapshot, SafetensorsStore};
+use divan::{AllocProfiler, Bencher};
+use std::fs;
+use std::path::PathBuf;
+
+// Install divan's `AllocProfiler` (built with `AllocProfiler::system()`) as the
+// global allocator for this benchmark binary.
+#[global_allocator]
+static ALLOC: AllocProfiler = AllocProfiler::system();
+
+// Backend type aliases
+// `NdArrayBackend` is always defined; each of the other aliases exists only
+// when the Cargo feature named in its `cfg` attribute is enabled.
+type NdArrayBackend = burn_ndarray::NdArray<f32>;
+
+#[cfg(feature = "wgpu")]
+type WgpuBackend = burn_wgpu::Wgpu;
+
+#[cfg(feature = "cuda")]
+type CudaBackend = burn_cuda::Cuda<f32, i32>;
+
+#[cfg(feature = "tch")]
+type TchBackend = burn_tch::LibTorch<f32>;
+
+#[cfg(feature = "metal")]
+type MetalBackend = burn_wgpu::Metal;
+
+// Use the same LargeModel as other benchmarks for fair comparison
+/// Benchmark model: a plain stack of linear layers held in `layers`.
+#[derive(Module, Debug)]
+struct LargeModel<B: Backend> {
+    layers: Vec<nn::Linear<B>>,
+}
+
+impl<B: Backend> LargeModel<B> {
+    /// Builds the 20-layer model on `device` - same as loading benchmarks.
+    ///
+    /// The first layer maps 1024 inputs to 2048 outputs; each of the remaining
+    /// 19 layers maps 2048 inputs to 2048 outputs.
+    fn new(device: &B::Device) -> Self {
+        let layers = (0..20)
+            .map(|index| {
+                let input_width = if index == 0 { 1024 } else { 2048 };
+                nn::LinearConfig::new(input_width, 2048).init(device)
+            })
+            .collect();
+        Self { layers }
+    }
+}
+
+/// Get the path to the output directory
+///
+/// Returns `simple_bench_models_saving` appended to the system temp directory.
+/// The directory itself is not created here.
+fn get_output_dir() -> PathBuf {
+    let mut output_dir = std::env::temp_dir();
+    output_dir.push("simple_bench_models_saving");
+    output_dir
+}
+
+/// Ensure output directory exists
+///
+/// Creates the directory returned by `get_output_dir()` together with any
+/// missing parents.
+///
+/// # Errors
+///
+/// Returns a message prefixed with `Failed to create output directory: ` when
+/// the directory cannot be created, including the case where the path is
+/// already occupied by something that is not a directory.
+fn ensure_output_dir() -> Result<(), String> {
+    // `create_dir_all` already succeeds when the directory is present, so no
+    // `exists()` probe is needed. Dropping the probe also stops a regular file
+    // at this path from being mistaken for a ready output directory.
+    fs::create_dir_all(get_output_dir())
+        .map_err(|e| format!("Failed to create output directory: {}", e))
+}
+
+/// Entry point: prepares the output directory, prints which saving methods are
+/// compared and which backends are compiled in, then hands control to divan.
+fn main() {
+    // Nothing can be saved without the output directory: report and exit with status 1.
+    if let Err(msg) = ensure_output_dir() {
+        eprintln!("❌ {}", msg);
+        std::process::exit(1);
+    }
+
+    println!("✅ Output directory ready: {}", get_output_dir().display());
+    // Fixed part of the banner, one entry per output line (empty entry = blank line).
+    for line in [
+        "",
+        "🚀 Running unified saving benchmarks...",
+        "",
+        "Comparing 3 saving methods:",
+        "  1. BurnpackStore (new native format)",
+        "  2. NamedMpkFileRecorder (old native format)",
+        "  3. SafetensorsStore (new)",
+        "",
+        "Available backends:",
+        "  - NdArray (CPU)",
+    ] {
+        println!("{}", line);
+    }
+    // Optional backends are listed only when their Cargo feature is enabled.
+    #[cfg(feature = "wgpu")]
+    println!("  - WGPU (GPU)");
+    #[cfg(feature = "cuda")]
+    println!("  - CUDA (NVIDIA GPU)");
+    #[cfg(feature = "tch")]
+    println!("  - LibTorch");
+    #[cfg(feature = "metal")]
+    println!("  - Metal (Apple GPU)");
+    println!();
+
+    divan::main();
+}
+
+// Macro to generate benchmarks for each backend
+//
+// Arguments: the backend type, the name of the generated module, and the label
+// of the divan bench group (configured with `sample_count = 10`). Every
+// benchmark iteration builds a fresh `LargeModel`, saves it into the output
+// directory and then deletes the written file, ignoring deletion errors.
+macro_rules! bench_backend {
+    ($backend:ty, $mod_name:ident, $backend_name:literal) => {
+        #[divan::bench_group(name = $backend_name, sample_count = 10)]
+        mod $mod_name {
+            use super::*;
+
+            type TestBackend = $backend;
+            type TestDevice = <TestBackend as Backend>::Device;
+
+            /// Saves the model through `BurnpackStore` with overwriting enabled.
+            #[divan::bench]
+            fn burnpack_store(bencher: Bencher) {
+                bencher.bench(|| {
+                    let target = get_output_dir().join("test_burnpack.bpk");
+                    let device: TestDevice = Default::default();
+                    let model = LargeModel::<TestBackend>::new(&device);
+                    let mut sink = BurnpackStore::from_file(target.clone()).overwrite(true);
+                    model
+                        .save_into(&mut sink)
+                        .expect("Failed to save with BurnpackStore");
+                    // Best-effort removal of the file written above.
+                    let _ = fs::remove_file(target);
+                });
+            }
+
+            /// Saves the model with `NamedMpkFileRecorder` at full precision.
+            #[divan::bench]
+            fn namedmpk_recorder(bencher: Bencher) {
+                bencher.bench(|| {
+                    let target = get_output_dir().join("test_namedmpk.mpk");
+                    let file_recorder = NamedMpkFileRecorder::<FullPrecisionSettings>::default();
+                    let device: TestDevice = Default::default();
+                    let model = LargeModel::<TestBackend>::new(&device);
+                    model
+                        .save_file(target.clone(), &file_recorder)
+                        .expect("Failed to save with NamedMpkFileRecorder");
+                    // Best-effort removal of the file written above.
+                    let _ = fs::remove_file(target);
+                });
+            }
+
+            /// Saves the model through `SafetensorsStore`.
+            #[divan::bench]
+            fn safetensors_store(bencher: Bencher) {
+                bencher.bench(|| {
+                    let target = get_output_dir().join("test_safetensors_store.safetensors");
+                    let device: TestDevice = Default::default();
+                    let model = LargeModel::<TestBackend>::new(&device);
+                    let mut sink = SafetensorsStore::from_file(target.clone());
+                    model
+                        .save_into(&mut sink)
+                        .expect("Failed to save with SafetensorsStore");
+                    // Best-effort removal of the file written above.
+                    let _ = fs::remove_file(target);
+                });
+            }
+        }
+    };
+}
+
+// Generate benchmarks for each backend
+// NdArray is always benchmarked; each remaining invocation is compiled only
+// when its Cargo feature is enabled, matching the feature-gated type aliases.
+bench_backend!(NdArrayBackend, ndarray_backend, "NdArray Backend (CPU)");
+
+#[cfg(feature = "wgpu")]
+bench_backend!(WgpuBackend, wgpu_backend, "WGPU Backend (GPU)");
+
+#[cfg(feature = "cuda")]
+bench_backend!(CudaBackend, cuda_backend, "CUDA Backend (NVIDIA GPU)");
+
+#[cfg(feature = "tch")]
+bench_backend!(TchBackend, tch_backend, "LibTorch Backend");
+
+#[cfg(feature = "metal")]
+bench_backend!(MetalBackend, metal_backend, "Metal Backend (Apple GPU)");
--- a/crates/stable-diffusion-burn/burn-crates/burn-store/benches/zero_copy_loading.rs
+++ b/crates/stable-diffusion-burn/burn-crates/burn-store/benches/zero_copy_loading.rs
@@ -0,0 +1,596 @@
+#![recursion_limit = "256"]
+
+//! Benchmark comparing zero-copy vs copy loading modes for BurnpackStore.
+//!
+//! This benchmark measures the performance difference between:
+//! - `zero_copy(false)` - Default mode, copies tensor data into new allocations
+//! - `zero_copy(true)` - Zero-copy mode, slices tensor data without copying
+//!
+//! ## Understanding the Results
+//!
+//! **IMPORTANT**: For NdArray backend, you'll see similar allocation numbers because:
+//! - NdArray uses `ndarray::ArrayD` which MUST own data as `Vec<T>`
+//! - Even with zero-copy, the backend eventually copies data into its own format
+//!
+//! The zero-copy benefit is:
+//! - **Without zero-copy**: File → Copy to heap (Bytes) → Copy to Vec (backend)
+//! - **With zero-copy**: File → Zero-copy slice → Copy to Vec (backend)
+//!
+//! So zero-copy saves ONE memory copy at the store level. The `store_only_*` benchmarks
+//! show the raw store performance without backend allocation overhead.
+//!
+//! GPU backends that can consume `Bytes` directly will show larger benefits.
+//!
+//! ## Running the benchmark
+//!
+//! Before running this benchmark, generate the model files:
+//! ```bash
+//! cd crates/burn-store
+//! uv run benches/generate_unified_models.py
+//! ```
+//!
+//! Then run the benchmark:
+//! ```bash
+//! cargo bench --bench zero_copy_loading
+//! ```
+
+use burn_core as burn;
+
+use burn_core::module::Module;
+use burn_core::prelude::*;
+use burn_nn as nn;
+use burn_store::{
+    BurnpackStore, ModuleSnapshot, ModuleStore, PyTorchToBurnAdapter, SafetensorsStore,
+};
+use burn_tensor::{AllocationProperty, Bytes};
+use divan::{AllocProfiler, Bencher};
+use std::fs;
+use std::path::PathBuf;
+use std::sync::OnceLock;
+
+// Install divan's `AllocProfiler` (built with `AllocProfiler::system()`) as the
+// global allocator for this benchmark binary.
+#[global_allocator]
+static ALLOC: AllocProfiler = AllocProfiler::system();
+
+// Static storage for embedded model bytes (simulating include_bytes!)
+// Filled at most once by `get_static_model_bytes`, which reads the Burnpack
+// file and leaks the buffer to obtain a `'static` slice.
+static STATIC_MODEL_BYTES: OnceLock<&'static [u8]> = OnceLock::new();
+
+// Backend type aliases
+// `NdArrayBackend` is always defined; each of the other aliases exists only
+// when the Cargo feature named in its `cfg` attribute is enabled.
+type NdArrayBackend = burn_ndarray::NdArray<f32>;
+
+#[cfg(feature = "wgpu")]
+type WgpuBackend = burn_wgpu::Wgpu;
+
+#[cfg(feature = "cuda")]
+type CudaBackend = burn_cuda::Cuda<f32, i32>;
+
+#[cfg(feature = "tch")]
+type TchBackend = burn_tch::LibTorch<f32>;
+
+#[cfg(feature = "metal")]
+type MetalBackend = burn_wgpu::Metal;
+
+// Use the same LargeModel as other benchmarks for fair comparison
+/// Benchmark model: a plain stack of linear layers held in `layers`.
+#[derive(Module, Debug)]
+struct LargeModel<B: Backend> {
+    layers: Vec<nn::Linear<B>>,
+}
+
+impl<B: Backend> LargeModel<B> {
+    /// Builds the 20-layer model on `device` - same as unified_loading benchmark.
+    ///
+    /// The first layer maps 1024 inputs to 2048 outputs; each of the remaining
+    /// 19 layers maps 2048 inputs to 2048 outputs.
+    fn new(device: &B::Device) -> Self {
+        let layers = (0..20)
+            .map(|index| {
+                let input_width = if index == 0 { 1024 } else { 2048 };
+                nn::LinearConfig::new(input_width, 2048).init(device)
+            })
+            .collect();
+        Self { layers }
+    }
+}
+
+/// Get the path to the model files
+///
+/// Returns `simple_bench_models` appended to the system temp directory. The
+/// directory is neither created nor checked here.
+fn get_model_dir() -> PathBuf {
+    let mut model_dir = std::env::temp_dir();
+    model_dir.push("simple_bench_models");
+    model_dir
+}
+
+/// Get path to Burnpack model file
+///
+/// Returns `large_model.bpk` inside the directory given by `get_model_dir()`.
+fn get_burnpack_path() -> PathBuf {
+    let mut burnpack_path = get_model_dir();
+    burnpack_path.push("large_model.bpk");
+    burnpack_path
+}
+
+/// Generate Burnpack file from existing SafeTensors file if needed
+///
+/// Does nothing when the Burnpack file already exists. Otherwise loads
+/// `large_model.safetensors` from the model directory into a `LargeModel` on
+/// the NdArray backend (through `PyTorchToBurnAdapter`) and saves that model
+/// to the Burnpack path.
+///
+/// # Panics
+///
+/// Panics with setup instructions when the SafeTensors file is missing, and
+/// when loading or saving fails.
+fn ensure_burnpack_file() {
+    let burnpack_file = get_burnpack_path();
+    if burnpack_file.exists() {
+        return;
+    }
+
+    let safetensors_file = get_model_dir().join("large_model.safetensors");
+    assert!(
+        safetensors_file.exists(),
+        "\n❌ SafeTensors model file not found!\n\
+        \n\
+        Please generate the model files first by running:\n\
+        \n\
+        cd crates/burn-store\n\
+        uv run benches/generate_unified_models.py\n\
+        \n\
+        Expected file: {}\n",
+        safetensors_file.display()
+    );
+
+    println!("⏳ Generating Burnpack file from SafeTensors...");
+
+    // Load from SafeTensors (the conversion always uses the NdArray backend)
+    let device = Default::default();
+    let mut model = LargeModel::<NdArrayBackend>::new(&device);
+    let mut source =
+        SafetensorsStore::from_file(&safetensors_file).with_from_adapter(PyTorchToBurnAdapter);
+    model
+        .load_from(&mut source)
+        .expect("Failed to load from SafeTensors");
+
+    // Save as Burnpack
+    let mut sink = BurnpackStore::from_file(&burnpack_file);
+    model
+        .save_into(&mut sink)
+        .expect("Failed to save as Burnpack");
+
+    println!("✅ Created Burnpack file: {}", burnpack_file.display());
+}
+
+/// Initialize static model bytes (simulating include_bytes! at runtime for benchmarks)
+///
+/// The first call reads the whole Burnpack file and stores it in
+/// `STATIC_MODEL_BYTES`; later calls return the same slice.
+///
+/// # Panics
+///
+/// Panics when the Burnpack file cannot be read.
+fn get_static_model_bytes() -> &'static [u8] {
+    *STATIC_MODEL_BYTES.get_or_init(|| {
+        let contents = fs::read(get_burnpack_path()).expect("Failed to read Burnpack file");
+        // Deliberately leak the buffer to get a 'static lifetime (acceptable for benchmarks)
+        let leaked: &'static [u8] = Box::leak(contents.into_boxed_slice());
+        leaked
+    })
+}
+
+/// Entry point: makes sure the Burnpack model file exists, prints a summary of
+/// the benchmarked loading modes and compiled-in backends, caches the model
+/// bytes, then hands control to divan.
+///
+/// # Panics
+///
+/// Panics when the model file cannot be generated, inspected or read.
+fn main() {
+    // Ensure Burnpack file exists
+    ensure_burnpack_file();
+
+    let bp_path = get_burnpack_path();
+    // The file was ensured just above; name the failure if it is gone or unreadable.
+    let file_size = fs::metadata(&bp_path)
+        .expect("Failed to read Burnpack file metadata")
+        .len() as f64
+        / 1_048_576.0;
+
+    println!("✅ Found Burnpack model file:");
+    println!("  Path: {}", bp_path.display());
+    println!("  Size: {:.1} MB", file_size);
+    println!();
+    println!("🚀 Running zero-copy loading benchmarks...");
+    println!();
+    // One entry per benchmark generated by `bench_backend!`.
+    println!("Comparing loading modes:");
+    println!("  1. file_copy        - from_file().zero_copy(false) - copies tensor data");
+    println!("  2. file_zero_copy   - from_file().zero_copy(true)  - zero-copy via mmap");
+    println!("  3. static_copy      - from_bytes() with Vec copy   - copies from static");
+    println!("  4. static_zero_copy - from_static()                - zero-copy from static");
+    println!("  5. memory_shared_zero_copy - from_bytes() on shared Bytes with zero_copy(true)");
+    println!();
+    println!("Available backends:");
+    println!("  - NdArray (CPU)");
+    #[cfg(feature = "wgpu")]
+    println!("  - WGPU (GPU)");
+    #[cfg(feature = "cuda")]
+    println!("  - CUDA (NVIDIA GPU)");
+    #[cfg(feature = "tch")]
+    println!("  - LibTorch");
+    #[cfg(feature = "metal")]
+    println!("  - Metal (Apple GPU)");
+    println!();
+
+    // Pre-initialize static bytes before benchmarks so the file read is not
+    // performed inside a benchmark.
+    let _ = get_static_model_bytes();
+
+    divan::main();
+}
+
+// Macro to generate benchmarks for each backend
+//
+// Arguments: the backend type, the name of the generated module, and the label
+// of the divan bench group (configured with `sample_count = 10`). Every
+// benchmark reports the model size as a byte counter and, per iteration,
+// builds a store plus a fresh `LargeModel` and loads the store into the model.
+macro_rules! bench_backend {
+    ($backend:ty, $mod_name:ident, $backend_name:literal) => {
+        #[divan::bench_group(name = $backend_name, sample_count = 10)]
+        mod $mod_name {
+            use super::*;
+
+            type TestBackend = $backend;
+            type TestDevice = <TestBackend as Backend>::Device;
+
+            /// Builds a fresh model on the default device and loads `store` into it.
+            fn load_fresh_model(mut store: BurnpackStore) {
+                let device: TestDevice = Default::default();
+                let mut model = LargeModel::<TestBackend>::new(&device);
+                model.load_from(&mut store).expect("Failed to load");
+            }
+
+            /// File-based loading with `zero_copy(false)`
+            #[divan::bench]
+            fn file_copy(bencher: Bencher) {
+                let path = get_burnpack_path();
+                let byte_len = fs::metadata(&path).unwrap().len();
+
+                bencher
+                    .counter(divan::counter::BytesCount::new(byte_len))
+                    .bench(|| {
+                        load_fresh_model(BurnpackStore::from_file(&path).zero_copy(false));
+                    });
+            }
+
+            /// File-based loading with `zero_copy(true)`
+            #[divan::bench]
+            fn file_zero_copy(bencher: Bencher) {
+                let path = get_burnpack_path();
+                let byte_len = fs::metadata(&path).unwrap().len();
+
+                bencher
+                    .counter(divan::counter::BytesCount::new(byte_len))
+                    .bench(|| {
+                        load_fresh_model(BurnpackStore::from_file(&path).zero_copy(true));
+                    });
+            }
+
+            /// Static bytes with copy mode (simulating old behavior)
+            #[divan::bench]
+            fn static_copy(bencher: Bencher) {
+                let source = get_static_model_bytes();
+                let byte_len = source.len() as u64;
+
+                bencher
+                    .counter(divan::counter::BytesCount::new(byte_len))
+                    .bench(|| {
+                        // Simulate old behavior: copy static bytes to Vec, then load
+                        let owned = Bytes::from_bytes_vec(source.to_vec());
+                        load_fresh_model(BurnpackStore::from_bytes(Some(owned)).zero_copy(false));
+                    });
+            }
+
+            /// Static bytes loaded through `from_static`
+            #[divan::bench]
+            fn static_zero_copy(bencher: Bencher) {
+                let source = get_static_model_bytes();
+                let byte_len = source.len() as u64;
+
+                bencher
+                    .counter(divan::counter::BytesCount::new(byte_len))
+                    .bench(|| {
+                        // The store is handed the `'static` slice directly, without copying it first.
+                        load_fresh_model(BurnpackStore::from_static(source));
+                    });
+            }
+
+            /// In-memory shared bytes with `zero_copy(true)`
+            #[divan::bench]
+            fn memory_shared_zero_copy(bencher: Bencher) {
+                let source = get_static_model_bytes();
+                let byte_len = source.len() as u64;
+
+                // Pre-create shared bytes outside the benchmark loop
+                let shared = bytes::Bytes::from_static(source);
+
+                bencher
+                    .counter(divan::counter::BytesCount::new(byte_len))
+                    .bench(|| {
+                        // Wrap a clone of the shared handle for every iteration.
+                        let wrapped = Bytes::from_shared(shared.clone(), AllocationProperty::Other);
+                        load_fresh_model(BurnpackStore::from_bytes(Some(wrapped)).zero_copy(true));
+                    });
+            }
+        }
+    };
+}
+
+// =============================================================================
+// Zero-copy verification (proves operations use static region data)
+// =============================================================================
+
+/// Checks that loading through `BurnpackStore::from_static` yields borrowed,
+/// usable tensor storage. Registered as a divan bench group with a single
+/// sample; the functions assert properties instead of measuring speed.
+#[divan::bench_group(name = "Zero-Copy Verification", sample_count = 1)]
+mod verification {
+    use super::*;
+    use burn_ndarray::NdArray;
+
+    type B = NdArray<f32>;
+
+    /// Builds a fresh `LargeModel` and loads the static model bytes into it
+    /// through `BurnpackStore::from_static`.
+    fn load_static_model() -> LargeModel<B> {
+        let source = get_static_model_bytes();
+        let device = Default::default();
+        let mut model = LargeModel::<B>::new(&device);
+        let mut store = BurnpackStore::from_static(source);
+        model.load_from(&mut store).expect("Failed to load");
+        model
+    }
+
+    /// Sums every element of `tensor` and reads the result back as an `f32`.
+    fn scalar_sum(tensor: Tensor<B, 2>) -> f32 {
+        let total: f32 = tensor.sum().to_data().to_vec().unwrap()[0];
+        total
+    }
+
+    /// Verify zero-copy: the first layer's weight storage is borrowed (not owned)
+    #[divan::bench]
+    fn verify_storage_is_borrowed() {
+        let model = load_static_model();
+
+        // .into_primitive() returns TensorPrimitive<B>, .tensor() extracts B::FloatTensorPrimitive
+        let first_weight = model.layers[0].weight.val().into_primitive().tensor();
+
+        assert!(
+            first_weight.is_borrowed(),
+            "ZERO-COPY FAILURE: Tensor storage is NOT borrowed. \
+             Data was copied instead of being zero-copy!"
+        );
+
+        println!("✅ Verified: Tensor storage is borrowed (zero-copy from static region)");
+    }
+
+    /// Verify ALL layers use borrowed (zero-copy) storage and report the total
+    /// weight size.
+    #[divan::bench]
+    fn verify_all_layers_borrowed() {
+        let model = load_static_model();
+
+        let mut element_count = 0usize;
+        for (index, layer) in model.layers.iter().enumerate() {
+            let layer_weight = layer.weight.val();
+            element_count += layer_weight.shape().num_elements();
+
+            assert!(
+                layer_weight.into_primitive().tensor().is_borrowed(),
+                "Layer {} weight should be borrowed (zero-copy)",
+                index
+            );
+        }
+
+        // 4 bytes per element: the weights are f32 on this backend alias.
+        let size_mb = (element_count * 4) as f64 / 1_048_576.0;
+        println!(
+            "✅ Verified: All {} layers use borrowed storage",
+            model.layers.len()
+        );
+        println!(
+            "   - Model size: {:.2} MB - all pointing to static region",
+            size_mb
+        );
+    }
+
+    /// Verify data is readable using sum().into_scalar(): the sum of the
+    /// first layer's weight must be finite.
+    #[divan::bench]
+    fn verify_ops_produce_correct_results() {
+        let model = load_static_model();
+
+        let first_weight = model.layers[0].weight.val();
+        let sum: f32 = first_weight.sum().into_scalar();
+
+        assert!(sum.is_finite(), "Sum should be finite");
+        println!("✅ Verified: Operations on zero-copy data produce valid results");
+        println!("   - First layer sum: {:.4}", sum);
+    }
+
+    /// Verify sum, matmul and element-wise scaling behave sensibly on the
+    /// first layer's zero-copy weight.
+    #[divan::bench]
+    fn verify_operations_on_static_data() {
+        let model = load_static_model();
+
+        let weight = model.layers[0].weight.val();
+        let shape = weight.shape();
+
+        // Test 1: Sum should be finite (not NaN or Inf)
+        let sum = scalar_sum(weight.clone());
+        assert!(
+            sum.is_finite(),
+            "Operation failed: sum is not finite ({})",
+            sum
+        );
+
+        // Test 2: Matrix multiply with itself transposed (W @ W.T)
+        let weight_t = weight.clone().transpose();
+        let matmul_sum = scalar_sum(weight.clone().matmul(weight_t));
+        assert!(
+            matmul_sum.is_finite(),
+            "Matmul failed: result sum is not finite ({})",
+            matmul_sum
+        );
+
+        // Test 3: Element-wise operations - doubling every element doubles the sum
+        let doubled_sum = scalar_sum(weight.clone() * 2.0);
+        assert!(
+            (doubled_sum - sum * 2.0).abs() < 1e-3,
+            "Element-wise op failed: doubled_sum ({}) != sum*2 ({})",
+            doubled_sum,
+            sum * 2.0
+        );
+
+        println!("✅ Verified: Operations on zero-copy data produce correct results");
+        println!("   - Weight shape: {:?}", shape.as_slice());
+        println!("   - Sum: {:.4}", sum);
+        println!("   - Matmul result sum: {:.4}", matmul_sum);
+    }
+
+    /// Compare zero-copy vs copy loading: every layer must have the same
+    /// weight shape and (within 1e-6) the same weight sum in both models.
+    #[divan::bench]
+    fn verify_copy_vs_zero_copy_equality() {
+        let static_bytes = get_static_model_bytes();
+        let device: <B as Backend>::Device = Default::default();
+
+        // Load with zero-copy
+        let mut zero_copy_model = LargeModel::<B>::new(&device);
+        let mut zero_copy_store = BurnpackStore::from_static(static_bytes);
+        zero_copy_model
+            .load_from(&mut zero_copy_store)
+            .expect("Failed to load zero-copy");
+
+        // Load with copy (simulate old behavior)
+        let mut copied_model = LargeModel::<B>::new(&device);
+        let owned = Bytes::from_bytes_vec(static_bytes.to_vec());
+        let mut copied_store = BurnpackStore::from_bytes(Some(owned)).zero_copy(false);
+        copied_model
+            .load_from(&mut copied_store)
+            .expect("Failed to load copy");
+
+        // Walk both models layer by layer
+        let layer_pairs = zero_copy_model.layers.iter().zip(copied_model.layers.iter());
+        for (index, (zero_copy_layer, copied_layer)) in layer_pairs.enumerate() {
+            let zero_copy_weight = zero_copy_layer.weight.val();
+            let copied_weight = copied_layer.weight.val();
+
+            assert_eq!(
+                zero_copy_weight.shape(),
+                copied_weight.shape(),
+                "Layer {} weight shapes don't match",
+                index
+            );
+
+            // Values are compared through their sums, used as a proxy
+            let zero_copy_sum = scalar_sum(zero_copy_weight.clone());
+            let copied_sum = scalar_sum(copied_weight.clone());
+            assert!(
+                (zero_copy_sum - copied_sum).abs() < 1e-6,
+                "Layer {} weight sums don't match: zero-copy={}, copy={}",
+                index,
+                zero_copy_sum,
+                copied_sum
+            );
+        }
+
+        println!(
+            "✅ Verified: Zero-copy and copy loading produce identical results for all {} layers",
+            zero_copy_model.layers.len()
+        );
+    }
+}
+
+// =============================================================================
+// Store-only benchmarks (no backend allocation overhead)
+// These show the TRUE zero-copy benefit at the store level
+// =============================================================================
+
+/// Store-level benchmarks: each one only asks the store for its tensor
+/// snapshots and materializes their data, without building a model.
+#[divan::bench_group(name = "Store Only (no backend)", sample_count = 10)]
+mod store_only {
+    use super::*;
+
+    /// Fetches every snapshot from `store` and calls `to_data()` on each one,
+    /// which forces the store to read and materialize all tensor data.
+    fn materialize_all(mut store: BurnpackStore) {
+        let snapshots = store.get_all_snapshots().expect("Failed to get snapshots");
+        for snapshot in snapshots.values() {
+            let _data = snapshot.to_data().expect("Failed to get tensor data");
+        }
+    }
+
+    /// File-based store with `zero_copy(false)` - measures store overhead only
+    #[divan::bench]
+    fn file_copy(bencher: Bencher) {
+        let path = get_burnpack_path();
+        let byte_len = fs::metadata(&path).unwrap().len();
+
+        bencher
+            .counter(divan::counter::BytesCount::new(byte_len))
+            .bench(|| materialize_all(BurnpackStore::from_file(&path).zero_copy(false)));
+    }
+
+    /// File-based store with `zero_copy(true)` - measures store overhead only
+    #[divan::bench]
+    fn file_zero_copy(bencher: Bencher) {
+        let path = get_burnpack_path();
+        let byte_len = fs::metadata(&path).unwrap().len();
+
+        bencher
+            .counter(divan::counter::BytesCount::new(byte_len))
+            .bench(|| materialize_all(BurnpackStore::from_file(&path).zero_copy(true)));
+    }
+
+    /// Static bytes with copy mode - measures store overhead only
+    #[divan::bench]
+    fn static_copy(bencher: Bencher) {
+        let source = get_static_model_bytes();
+        let byte_len = source.len() as u64;
+
+        bencher
+            .counter(divan::counter::BytesCount::new(byte_len))
+            .bench(|| {
+                // Simulate old behavior: copy static bytes to Vec
+                let owned = Bytes::from_bytes_vec(source.to_vec());
+                materialize_all(BurnpackStore::from_bytes(Some(owned)).zero_copy(false));
+            });
+    }
+
+    /// Static bytes through `from_static` - measures store overhead only
+    #[divan::bench]
+    fn static_zero_copy(bencher: Bencher) {
+        let source = get_static_model_bytes();
+        let byte_len = source.len() as u64;
+
+        bencher
+            .counter(divan::counter::BytesCount::new(byte_len))
+            .bench(|| materialize_all(BurnpackStore::from_static(source)));
+    }
+}
+
+// =============================================================================
+// Full model loading benchmarks (includes backend allocation)
+// =============================================================================
+
+// Generate benchmarks for each backend
+// NdArray is always benchmarked; each remaining invocation is compiled only
+// when its Cargo feature is enabled, matching the feature-gated type aliases.
+bench_backend!(NdArrayBackend, ndarray_backend, "NdArray Backend (CPU)");
+
+#[cfg(feature = "wgpu")]
+bench_backend!(WgpuBackend, wgpu_backend, "WGPU Backend (GPU)");
+
+#[cfg(feature = "cuda")]
+bench_backend!(CudaBackend, cuda_backend, "CUDA Backend (NVIDIA GPU)");
+
+#[cfg(feature = "tch")]
+bench_backend!(TchBackend, tch_backend, "LibTorch Backend");
+
+#[cfg(feature = "metal")]
+bench_backend!(MetalBackend, metal_backend, "Metal Backend (Apple GPU)");