feat: update workspace paths and enhance gitignore

- Updated stablediffusion crate path from "../stable-diffusion-burn" to "./crates/stable-diffusion-burn" for proper workspace resolution
- Enhanced .gitignore to include generated model files (.mpk, .pt, .bin, .safetensors, .ckpt) and user_data directory
- Added Cargo.lock to gitignore with appropriate comment
- Reorganized IDE files section in gitignore for better clarity
- Added newline at end of file for proper formatting
This commit is contained in:
2026-03-05 19:39:14 +01:00
parent 4bb7ca9074
commit 3a67c0979c
1605 changed files with 537032 additions and 2 deletions

View File

@@ -0,0 +1,82 @@
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.8"
# dependencies = [
# "torch",
# "torchvision",
# ]
# ///
"""
Download ResNet18 PyTorch model for benchmarking.
This script downloads a pre-trained ResNet18 model from PyTorch Hub
and saves it in a format suitable for benchmarking.
"""
import os
import sys
import tempfile
from pathlib import Path
import torch
import torchvision.models as models
def download_resnet18():
    """Download the pre-trained ResNet18 weights and save them for benchmarking.

    The model's state dict is written to
    ``<tempdir>/burn_resnet18_benchmark/resnet18.pth`` using PyTorch's legacy
    (non-zip) serialization. If that file already exists it is reused and
    nothing is downloaded.

    Returns:
        str: Path of the saved ``resnet18.pth`` file.

    Exits the process with status 1 if downloading, saving, or the read-back
    verification fails.
    """
    # Create a temporary directory for the model
    temp_dir = Path(tempfile.gettempdir()) / "burn_resnet18_benchmark"
    temp_dir.mkdir(parents=True, exist_ok=True)
    output_path = temp_dir / "resnet18.pth"

    # Check if already downloaded
    if output_path.exists():
        file_size_mb = output_path.stat().st_size / (1024 * 1024)
        print(f"✅ ResNet18 already exists at: {output_path}")
        print(f" Size: {file_size_mb:.1f} MB")
        return str(output_path)

    print("📥 Downloading ResNet18 model...")

    # The file is first written under a temporary name and only renamed to
    # resnet18.pth once it has been read back successfully. Otherwise an
    # interrupted save would leave a truncated resnet18.pth that the
    # "already downloaded" check above would accept on the next run.
    partial_path = temp_dir / "resnet18.pth.partial"
    try:
        # `pretrained=True` is deprecated in newer torchvision releases in
        # favour of `weights=`; fall back to the old flag when the weights
        # enum is not available.
        weights_enum = getattr(models, "ResNet18_Weights", None)
        if weights_enum is not None:
            model = models.resnet18(weights=weights_enum.DEFAULT)
        else:
            model = models.resnet18(pretrained=True)

        # Save the model state dict (this is what burn-store reads)
        # Using the legacy format for compatibility
        torch.save(model.state_dict(), partial_path, _use_new_zipfile_serialization=False)

        # Verify it's readable before publishing it under the final name
        state_dict = torch.load(partial_path, map_location='cpu')
        os.replace(partial_path, output_path)

        file_size_mb = output_path.stat().st_size / (1024 * 1024)
        print(f"✅ Successfully downloaded ResNet18 to: {output_path}")
        print(f" Size: {file_size_mb:.1f} MB")
        print(f" Format: PyTorch legacy format")
        print(f" Tensors: {len(state_dict)} tensors")

        # Print a few tensor names and shapes for verification
        print("\n Sample tensors:")
        for name, tensor in list(state_dict.items())[:3]:
            print(f" - {name}: {list(tensor.shape)}")
        return str(output_path)
    except Exception as e:
        # Best-effort cleanup so a failed run leaves no partial file behind.
        try:
            partial_path.unlink()
        except OSError:
            pass
        print(f"❌ Failed to download ResNet18: {e}")
        sys.exit(1)
def main():
    """Fetch the model, then record its location where the benchmark looks for it."""
    model_path = download_resnet18()

    # The benchmark reads this marker file to locate the model.
    marker_file = Path(tempfile.gettempdir()).joinpath("burn_resnet18_benchmark", "path.txt")
    marker_file.write_text(model_path)

    print("\n💡 Model ready for benchmarking")
    print(" Run: cargo bench --bench resnet18_loading")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,175 @@
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.8"
# dependencies = [
# "torch",
# "safetensors",
# "packaging",
# "numpy",
# ]
# ///
"""
Generate a large model (~312MB) in both PyTorch and SafeTensors formats for unified benchmarking.
Usage:
uv run benches/generate_unified_models.py
The script creates the model files in the simple_bench_models/ directory inside the system temp directory (e.g. /tmp/simple_bench_models/).
"""
import torch
import torch.nn as nn
import os
from pathlib import Path
import tempfile
from safetensors.torch import save_file
def get_temp_dir():
    """Return the benchmark model directory, creating it if it does not exist.

    Returns:
        Path: ``<system temp dir>/simple_bench_models``.
    """
    bench_dir = Path(tempfile.gettempdir(), "simple_bench_models")
    bench_dir.mkdir(parents=True, exist_ok=True)
    return bench_dir
class LargeModel(nn.Module):
    """Stack of 20 linear layers, mirroring the Rust benchmark's LargeModel."""

    def __init__(self):
        super().__init__()
        # The first layer maps 1024 -> 2048; the other 19 map 2048 -> 2048.
        input_widths = [1024] + [2048] * 19
        self.layers = nn.ModuleList([nn.Linear(width, 2048) for width in input_widths])
        print(f"Created model with {len(self.layers)} layers")

    def forward(self, x):
        """Apply every layer in order and return the final output."""
        out = x
        for linear in self.layers:
            out = linear(out)
        return out
def calculate_model_size(model):
    """Count the model's parameters and estimate their size in MB.

    The estimate assumes 4 bytes per parameter (float32).

    Returns:
        tuple: ``(total parameter count, size in MB)``.
    """
    param_count = 0
    for param in model.parameters():
        param_count += param.numel()
    byte_count = param_count * 4  # 4 bytes per float32
    return param_count, byte_count / (1024 * 1024)
def initialize_weights(model):
    """Re-initialize every parameter of ``model`` in place.

    Parameters with more than one dimension get Xavier-uniform values;
    all others (e.g. biases) are set to zero.
    """
    for tensor in model.parameters():
        init_fn = nn.init.xavier_uniform_ if tensor.dim() > 1 else nn.init.zeros_
        init_fn(tensor)
def save_pytorch_format(model, output_dir):
    """Write ``model`` to ``output_dir/large_model.pt`` as a PyTorch checkpoint.

    The checkpoint is a dict holding the state dict under ``model_state_dict``
    plus a small ``metadata`` dict.

    Returns:
        Path: Location of the written file.
    """
    destination = output_dir / "large_model.pt"
    info = {
        'model_type': 'large_benchmark_model',
        'num_layers': len(model.layers),
    }
    # Nest the weights under 'model_state_dict' (common checkpoint layout).
    torch.save({'model_state_dict': model.state_dict(), 'metadata': info}, destination)
    return destination
def save_safetensors_format(model, output_dir):
    """Write ``model`` to ``output_dir/large_model.safetensors``.

    Returns:
        Path: Location of the written file.
    """
    destination = output_dir / "large_model.safetensors"

    # Every tensor is made contiguous and moved to the CPU before saving.
    tensors = {}
    for key, value in model.state_dict().items():
        tensors[key] = value.contiguous().cpu()

    # The layer count is stored as a string in the metadata.
    info = {
        'model_type': 'large_benchmark_model',
        'num_layers': str(len(model.layers)),
    }
    save_file(tensors, destination, metadata=info)
    return destination
def verify_files(pt_path, st_path):
    """Load both saved files and report whether their tensor names agree.

    Only prints the outcome; a mismatch produces a warning line rather than
    an exception.
    """
    # Tensor names stored in the PyTorch checkpoint
    torch_names = set(torch.load(pt_path, map_location='cpu')['model_state_dict'].keys())
    print(f" PyTorch file: {len(torch_names)} tensors")

    # Tensor names stored in the SafeTensors file
    from safetensors import safe_open
    with safe_open(st_path, framework="pt", device="cpu") as handle:
        safetensors_names = set(handle.keys())
    print(f" SafeTensors file: {len(safetensors_names)} tensors")

    # Compare the two name sets
    if torch_names == safetensors_names:
        print(" ✓ Keys match between formats")
    else:
        print(" ⚠️ Warning: Keys don't match between formats!")
def main():
    """Build the large model, save it in both formats, verify, and print a summary."""

    def size_in_mb(path):
        # On-disk size of `path` in MB.
        return path.stat().st_size / (1024 * 1024)

    print("🔧 Generating unified benchmark model files...")
    print()

    out_dir = get_temp_dir()
    print(f"📁 Output directory: {out_dir}")
    print()

    # Fixed seed so every run produces the same weights
    torch.manual_seed(42)

    print("📝 Creating large model...")
    net = LargeModel()

    param_count, model_mb = calculate_model_size(net)
    print(f" Total parameters: {param_count:,}")
    print(f" Model size: {model_mb:.2f} MB")
    print()

    print("🎲 Initializing weights...")
    initialize_weights(net)

    print("💾 Saving PyTorch format...")
    torch_file = save_pytorch_format(net, out_dir)
    torch_mb = size_in_mb(torch_file)
    print(f" Saved: {torch_file}")
    print(f" File size: {torch_mb:.2f} MB")
    print()

    print("💾 Saving SafeTensors format...")
    safetensors_file = save_safetensors_format(net, out_dir)
    safetensors_mb = size_in_mb(safetensors_file)
    print(f" Saved: {safetensors_file}")
    print(f" File size: {safetensors_mb:.2f} MB")
    print()

    print("🔍 Verifying saved files...")
    verify_files(torch_file, safetensors_file)
    print()

    print("✅ Model files generated successfully!")
    print()
    print("📊 Summary:")
    print(f" PyTorch file: {torch_file.name} ({torch_mb:.2f} MB)")
    print(f" SafeTensors file: {safetensors_file.name} ({safetensors_mb:.2f} MB)")
    print()
    print("💡 To run the unified benchmark:")
    print(" cargo bench --bench unified_loading")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,213 @@
//! Benchmark for ResNet18 loading to verify lazy loading memory usage.
//!
//! `resnet18.pth` is stored in PyTorch's legacy (non-zip) serialization format.
//!
//! This benchmark loads a ResNet18 model and materializes all tensors
//! to ensure memory usage stays reasonable with lazy loading.
//!
//! Run the benchmark:
//! ```bash
//! cargo bench --bench resnet18_loading
//! ```
use burn_store::pytorch::PytorchReader;
use divan::{AllocProfiler, Bencher};
use std::path::PathBuf;
#[global_allocator]
static ALLOC: AllocProfiler = AllocProfiler::system();
/// Checks that the ResNet18 file is present, prints a short summary, and
/// hands control to divan.
///
/// Exits with status 1 (after printing download instructions) when the model
/// file is missing.
fn main() {
    let model_path = resnet18_path();

    // Without the model there is nothing to benchmark.
    if !model_path.exists() {
        eprintln!("❌ ResNet18 model not found!");
        eprintln!();
        eprintln!("Please download it first by running:");
        eprintln!(" python benches/download_resnet18.py");
        eprintln!();
        eprintln!("Or if you don't have Python/PyTorch installed:");
        eprintln!(" uv run benches/download_resnet18.py");
        eprintln!();
        eprintln!("Expected location: {}", model_path.display());
        std::process::exit(1);
    }

    // Sanity-check the size; an unexpected size only produces a warning.
    let byte_len = std::fs::metadata(&model_path)
        .expect("Failed to read file metadata")
        .len();
    let size_mb = byte_len as f64 / 1_048_576.0;
    if !(40.0..=50.0).contains(&size_mb) {
        eprintln!("⚠️ Warning: ResNet18 file size ({:.1} MB) seems unusual", size_mb);
        eprintln!("Expected size is around 45 MB");
    }

    println!("✅ Found ResNet18 model at: {}", model_path.display());
    println!("📦 File size: {:.1} MB", size_mb);
    println!("📊 Running ResNet18 loading benchmarks...\n");

    // Run divan benchmarks
    divan::main();
}
/// Get the path to the ResNet18 model file.
///
/// Looks for `burn_resnet18_benchmark/path.txt` in the system temp directory
/// (written by the download script). If it can be read and the path it
/// contains (after trimming whitespace) exists, that path is returned.
/// Otherwise the default location `burn_resnet18_benchmark/resnet18.pth`
/// in the temp directory is returned, whether or not it exists.
fn resnet18_path() -> PathBuf {
    let bench_dir = std::env::temp_dir().join("burn_resnet18_benchmark");

    // Read the file directly instead of probing with `exists()` first: the
    // read already fails when the file is missing, and a separate probe only
    // adds a window in which the file can disappear between check and read.
    if let Ok(recorded) = std::fs::read_to_string(bench_dir.join("path.txt")) {
        let candidate = PathBuf::from(recorded.trim());
        if candidate.exists() {
            return candidate;
        }
    }

    // Fallback to default location
    bench_dir.join("resnet18.pth")
}
/// Benchmarks opening the file and reading its metadata only; no tensor data
/// is materialized.
#[divan::bench(sample_count = 10)]
fn load_resnet18_metadata(bencher: Bencher) {
    let model_file = resnet18_path();
    bencher.bench_local(|| {
        let reader = PytorchReader::new(&model_file).expect("Failed to load ResNet18");
        // Touch the metadata so the open is not optimized into a no-op.
        let tensor_count = reader.metadata().tensor_count;
        assert_eq!(tensor_count, 122);
    });
}
/// Benchmarks opening the file and materializing every tensor in key order,
/// summing the reported data lengths along the way.
#[divan::bench(sample_count = 5)]
fn load_resnet18_materialize_all(bencher: Bencher) {
    let model_file = resnet18_path();
    bencher.bench_local(|| {
        let reader = PytorchReader::new(&model_file).expect("Failed to load ResNet18");
        let names = reader.keys();

        let mut materialized_bytes = 0usize;
        for name in &names {
            let entry = reader.get(name).expect("Failed to get tensor");
            // Force the tensor data to be read
            let _materialized = entry
                .to_data()
                .expect("Failed to materialize tensor data");
            materialized_bytes += entry.data_len();
        }

        // Should be ~45MB in total
        assert!(materialized_bytes > 40_000_000);
    });
}
/// Benchmarks materializing tensors one at a time, dropping each before the
/// next is read, and summing the elements of F32/F64 tensors so the loads
/// cannot be optimized away.
#[divan::bench(sample_count = 5)]
fn load_resnet18_materialize_sequential(bencher: Bencher) {
    let model_file = resnet18_path();
    bencher.bench_local(|| {
        let reader = PytorchReader::new(&model_file).expect("Failed to load ResNet18");
        let names = reader.keys();

        // Each iteration's data goes out of scope before the next tensor is
        // loaded, simulating sequential processing.
        for name in &names {
            let tensor = reader.get(name).expect("Failed to get tensor");
            let data = tensor.to_data().expect("Failed to materialize tensor data");

            // Minimal work on the data; other dtypes contribute 0.0.
            let checksum: f64 = match data.dtype {
                burn_tensor::DType::F32 => {
                    let partial = data.as_slice::<f32>().map(|vals| vals.iter().sum::<f32>());
                    partial.unwrap_or(0.0) as f64
                }
                burn_tensor::DType::F64 => {
                    let partial = data.as_slice::<f64>().map(|vals| vals.iter().sum::<f64>());
                    partial.unwrap_or(0.0)
                }
                _ => 0.0,
            };

            // Keep the result observable to prevent dead code elimination
            std::hint::black_box(checksum);
        }
    });
}
/// Benchmarks locating the tensor with the largest reported data length and
/// materializing only that one.
#[divan::bench(sample_count = 10)]
fn load_resnet18_largest_tensor(bencher: Bencher) {
    let model_file = resnet18_path();
    bencher.bench_local(|| {
        let reader = PytorchReader::new(&model_file).expect("Failed to load ResNet18");
        let names = reader.keys();

        // Scan every entry; on ties the first tensor seen is kept.
        let mut best_name = String::new();
        let mut best_len = 0usize;
        for name in &names {
            let len = reader.get(name).expect("Failed to get tensor").data_len();
            if len > best_len {
                best_len = len;
                best_name = name.clone();
            }
        }

        // Materialize only the winner
        let largest = reader
            .get(&best_name)
            .expect("Failed to get largest tensor");
        let _materialized = largest
            .to_data()
            .expect("Failed to materialize tensor data");

        // Should be ~9MB for layer4.0.conv2.weight
        assert!(best_len > 9_000_000);
    });
}
/// Benchmarks materializing every tensor and dropping it immediately, while
/// tracking the largest single tensor and the total data length seen.
#[divan::bench(sample_count = 10)]
fn load_resnet18_memory_profile(bencher: Bencher) {
    let model_file = resnet18_path();
    bencher
        .with_inputs(|| model_file.clone())
        .bench_local_values(|path| {
            let reader = PytorchReader::new(&path).expect("Failed to load ResNet18");
            let names = reader.keys();

            let mut largest_seen = 0usize;
            let mut bytes_seen = 0usize;
            for name in &names {
                let tensor = reader.get(name).expect("Failed to get tensor");
                let len = tensor.data_len();
                largest_seen = largest_seen.max(len);

                // Materialize, account for it, then release it right away.
                let materialized = tensor.to_data().expect("Failed to materialize tensor data");
                bytes_seen += len;
                drop(materialized);
            }

            // Returned so the computed stats stay observable
            (largest_seen, bytes_seen)
        });
}

View File

@@ -0,0 +1,332 @@
#![recursion_limit = "256"]
//! Unified benchmark comparing all loading methods:
//! - BurnpackStore (new native format)
//! - NamedMpkFileRecorder (old native format)
//! - SafetensorsStore (new)
//! - SafetensorsFileRecorder (old; benchmark currently commented out)
//! - PytorchStore (new)
//! - PyTorchFileRecorder (old; benchmark currently commented out)
//!
//! Before running this benchmark, generate the model files:
//! ```bash
//! cd crates/burn-store
//! uv run benches/generate_unified_models.py
//! ```
//!
//! Then run the benchmark:
//! ```bash
//! cargo bench --bench unified_loading
//! ```
use burn_core as burn;
use burn_core::module::Module;
use burn_core::prelude::*;
use burn_core::record::{FullPrecisionSettings, NamedMpkFileRecorder, Recorder};
// use burn_import::pytorch::{LoadArgs, PyTorchFileRecorder};
// use burn_import::safetensors::SafetensorsFileRecorder;
use burn_nn as nn;
use burn_store::{
BurnpackStore, ModuleSnapshot, PyTorchToBurnAdapter, PytorchStore, SafetensorsStore,
};
use divan::{AllocProfiler, Bencher};
use std::fs;
use std::path::{Path, PathBuf};
#[global_allocator]
static ALLOC: AllocProfiler = AllocProfiler::system();
// Backend type aliases
type NdArrayBackend = burn_ndarray::NdArray<f32>;
#[cfg(feature = "wgpu")]
type WgpuBackend = burn_wgpu::Wgpu;
#[cfg(feature = "cuda")]
type CudaBackend = burn_cuda::Cuda<f32, i32>;
#[cfg(feature = "tch")]
type TchBackend = burn_tch::LibTorch<f32>;
#[cfg(feature = "metal")]
type MetalBackend = burn_wgpu::Metal;
// Use the same LargeModel as other benchmarks for fair comparison
/// Benchmark model: a stack of 20 linear layers.
#[derive(Module, Debug)]
struct LargeModel<B: Backend> {
    layers: Vec<nn::Linear<B>>,
}

impl<B: Backend> LargeModel<B> {
    /// Builds the 20 layers on `device`: the first maps 1024 -> 2048, the
    /// rest map 2048 -> 2048 (same as the safetensor_loading benchmark).
    fn new(device: &B::Device) -> Self {
        let layers = (0..20)
            .map(|index| {
                let input_width = if index == 0 { 1024 } else { 2048 };
                nn::LinearConfig::new(input_width, 2048).init(device)
            })
            .collect();
        Self { layers }
    }
}
/// Directory holding the benchmark model files: `simple_bench_models` inside
/// the system temp directory.
fn get_model_dir() -> PathBuf {
    let mut dir = std::env::temp_dir();
    dir.push("simple_bench_models");
    dir
}
/// Creates the Burnpack and NamedMpk files from the SafeTensors file.
///
/// The model is always loaded from `st_path` on the NdArray backend; each
/// output file is then written only if it does not already exist.
///
/// Panics if loading or either save fails.
fn generate_burn_formats(st_path: &Path, bp_path: &Path, mpk_path: &Path) {
    type TestBackend = NdArrayBackend;
    let device = Default::default();

    // Populate a fresh model with the SafeTensors weights
    let mut source_model = LargeModel::<TestBackend>::new(&device);
    let mut safetensors =
        SafetensorsStore::from_file(st_path).with_from_adapter(PyTorchToBurnAdapter);
    source_model
        .load_from(&mut safetensors)
        .expect("Failed to load from SafeTensors");

    // Burnpack output
    if !bp_path.exists() {
        println!(" Creating Burnpack file...");
        let mut burnpack = BurnpackStore::from_file(bp_path);
        source_model
            .save_into(&mut burnpack)
            .expect("Failed to save as Burnpack");
    }

    // NamedMpk output
    if !mpk_path.exists() {
        println!(" Creating NamedMpk file...");
        let mpk_recorder = NamedMpkFileRecorder::<FullPrecisionSettings>::default();
        source_model
            .save_file(mpk_path, &mpk_recorder)
            .expect("Failed to save as NamedMpk");
    }
}
/// Paths of the four model files, in the order
/// `(burnpack, named mpk, safetensors, pytorch)`.
fn get_model_paths() -> (PathBuf, PathBuf, PathBuf, PathBuf) {
    let base = get_model_dir();
    let in_base = |file_name: &str| base.join(file_name);
    (
        in_base("large_model.bpk"),
        in_base("large_model.mpk"),
        in_base("large_model.safetensors"),
        in_base("large_model.pt"),
    )
}
/// Verifies that the SafeTensors and PyTorch model files exist.
///
/// The Burnpack and NamedMpk files are not checked here; they are generated
/// later when missing.
///
/// Returns `Err` with a user-facing message (including generation
/// instructions) when either file is absent.
fn check_model_files() -> Result<(), String> {
    let (_, _, st_path, pt_path) = get_model_paths();

    if st_path.exists() && pt_path.exists() {
        return Ok(());
    }

    Err(format!(
        "\n❌ Model files not found!\n\
        \n\
        Please generate the model files first by running:\n\
        \n\
        cd crates/burn-store\n\
        uv run benches/generate_unified_models.py\n\
        \n\
        Expected files:\n\
        - {}\n\
        - {}\n",
        st_path.display(),
        pt_path.display()
    ))
}
/// Validates the input files, generates the Burnpack/NamedMpk files when
/// missing, prints a summary of what will be benchmarked, and runs divan.
///
/// Exits with status 1 when the SafeTensors or PyTorch file is missing.
fn main() {
    // Check if model files exist before running benchmarks
    if let Err(msg) = check_model_files() {
        eprintln!("{}", msg);
        std::process::exit(1);
    }

    let (bp_path, mpk_path, st_path, pt_path) = get_model_paths();

    // First, generate Burnpack and MPK files if they don't exist
    if !bp_path.exists() || !mpk_path.exists() {
        println!("⏳ Generating Burnpack and NamedMpk files from SafeTensors...");
        generate_burn_formats(&st_path, &bp_path, &mpk_path);
    }

    // The generated files are reported only if their metadata can be read;
    // the two input files were verified above, so failing here is a bug.
    let size_in_mb = |len: u64| len as f64 / 1_048_576.0;
    let bp_size = fs::metadata(&bp_path).ok().map(|m| size_in_mb(m.len()));
    let mpk_size = fs::metadata(&mpk_path).ok().map(|m| size_in_mb(m.len()));
    let st_size = size_in_mb(fs::metadata(&st_path).unwrap().len());
    let pt_size = size_in_mb(fs::metadata(&pt_path).unwrap().len());

    println!("✅ Found model files:");
    if let Some(size) = bp_size {
        println!(" Burnpack: {} ({:.1} MB)", bp_path.display(), size);
    }
    if let Some(size) = mpk_size {
        println!(" NamedMpk: {} ({:.1} MB)", mpk_path.display(), size);
    }
    println!(" SafeTensors: {} ({:.1} MB)", st_path.display(), st_size);
    println!(" PyTorch: {} ({:.1} MB)", pt_path.display(), pt_size);
    println!();
    println!("🚀 Running unified loading benchmarks...");
    println!();

    // Only list what actually runs: the SafetensorsFileRecorder and
    // PyTorchFileRecorder benchmarks are commented out in this file, so the
    // banner must not announce six methods.
    println!("Comparing 4 loading methods:");
    println!(" 1. BurnpackStore (new native format - lazy loading)");
    println!(" 2. NamedMpkFileRecorder (old native format - loads all to memory)");
    println!(" 3. SafetensorsStore (new)");
    println!(" 4. PytorchStore (new)");
    println!();
    println!("Currently disabled (benchmarks commented out):");
    println!(" - SafetensorsFileRecorder (old)");
    println!(" - PyTorchFileRecorder (old)");
    println!();
    println!("Available backends:");
    println!(" - NdArray (CPU)");
    #[cfg(feature = "wgpu")]
    println!(" - WGPU (GPU)");
    #[cfg(feature = "cuda")]
    println!(" - CUDA (NVIDIA GPU)");
    #[cfg(feature = "tch")]
    println!(" - LibTorch");
    #[cfg(feature = "metal")]
    println!(" - Metal (Apple GPU)");
    println!();

    divan::main();
}
// Macro to generate benchmarks for each backend
//
// Arguments:
//   $backend      - backend type the benchmarks are instantiated for
//   $mod_name     - name of the generated module
//   $backend_name - display name passed to the divan bench group
//
// Each generated module contains one benchmark per loading method. Every
// benchmark reports the size of the file it loads as a byte counter and
// builds a fresh `LargeModel` inside the timed closure.
macro_rules! bench_backend {
    ($backend:ty, $mod_name:ident, $backend_name:literal) => {
        #[divan::bench_group(name = $backend_name, sample_count = 10)]
        mod $mod_name {
            use super::*;
            type TestBackend = $backend;
            type TestDevice = <TestBackend as Backend>::Device;

            // Load the Burnpack file through BurnpackStore.
            #[divan::bench]
            fn burnpack_store(bencher: Bencher) {
                let (bp_path, _, _, _) = get_model_paths();
                let file_size = fs::metadata(&bp_path).unwrap().len();
                bencher
                    .counter(divan::counter::BytesCount::new(file_size))
                    .bench(|| {
                        let device: TestDevice = Default::default();
                        let mut model = LargeModel::<TestBackend>::new(&device);
                        let mut store = BurnpackStore::from_file(bp_path.clone());
                        model.load_from(&mut store).expect("Failed to load");
                    });
            }

            // Load the NamedMpk file through the recorder API and apply the
            // resulting record to a fresh model.
            #[divan::bench]
            fn namedmpk_recorder(bencher: Bencher) {
                let (_, mpk_path, _, _) = get_model_paths();
                let file_size = fs::metadata(&mpk_path).unwrap().len();
                bencher
                    .counter(divan::counter::BytesCount::new(file_size))
                    .bench(|| {
                        let device: TestDevice = Default::default();
                        let recorder = NamedMpkFileRecorder::<FullPrecisionSettings>::default();
                        let record = recorder
                            .load(mpk_path.clone().into(), &device)
                            .expect("Failed to load");
                        let _model = LargeModel::<TestBackend>::new(&device).load_record(record);
                    });
            }

            // Load the SafeTensors file through SafetensorsStore, using the
            // PyTorch-to-Burn adapter.
            #[divan::bench]
            fn safetensors_store(bencher: Bencher) {
                let (_, _, st_path, _) = get_model_paths();
                let file_size = fs::metadata(&st_path).unwrap().len();
                bencher
                    .counter(divan::counter::BytesCount::new(file_size))
                    .bench(|| {
                        let device: TestDevice = Default::default();
                        let mut model = LargeModel::<TestBackend>::new(&device);
                        let mut store = SafetensorsStore::from_file(st_path.clone())
                            .with_from_adapter(PyTorchToBurnAdapter);
                        model.load_from(&mut store).expect("Failed to load");
                    });
            }

            // Disabled: SafetensorsFileRecorder benchmark (its import is
            // commented out at the top of the file as well).
            // #[divan::bench]
            // fn safetensors_recorder(bencher: Bencher) {
            // let (_, _, st_path, _) = get_model_paths();
            // let file_size = fs::metadata(&st_path).unwrap().len();
            // bencher
            // .counter(divan::counter::BytesCount::new(file_size))
            // .bench(|| {
            // let device: TestDevice = Default::default();
            // let recorder = SafetensorsFileRecorder::<FullPrecisionSettings>::default();
            // let record = recorder
            // .load(st_path.clone().into(), &device)
            // .expect("Failed to load");
            // let _model = LargeModel::<TestBackend>::new(&device).load_record(record);
            // });
            // }

            // Load the PyTorch checkpoint through PytorchStore. The weights
            // are read from the "model_state_dict" top-level key, matching
            // the checkpoint layout written by generate_unified_models.py.
            #[divan::bench]
            fn pytorch_store(bencher: Bencher) {
                let (_, _, _, pt_path) = get_model_paths();
                let file_size = fs::metadata(&pt_path).unwrap().len();
                bencher
                    .counter(divan::counter::BytesCount::new(file_size))
                    .bench(|| {
                        let device: TestDevice = Default::default();
                        let mut model = LargeModel::<TestBackend>::new(&device);
                        let mut store = PytorchStore::from_file(pt_path.clone())
                            .with_top_level_key("model_state_dict")
                            .allow_partial(true);
                        model.load_from(&mut store).expect("Failed to load");
                    });
            }

            // Disabled: PyTorchFileRecorder benchmark (its import is
            // commented out at the top of the file as well).
            // #[divan::bench]
            // fn pytorch_recorder(bencher: Bencher) {
            // let (_, _, _, pt_path) = get_model_paths();
            // let file_size = fs::metadata(&pt_path).unwrap().len();
            // bencher
            // .counter(divan::counter::BytesCount::new(file_size))
            // .bench(|| {
            // let device: TestDevice = Default::default();
            // let recorder = PyTorchFileRecorder::<FullPrecisionSettings>::default();
            // let load_args =
            // LoadArgs::new(pt_path.clone()).with_top_level_key("model_state_dict");
            // let record = recorder.load(load_args, &device).expect("Failed to load");
            // let _model = LargeModel::<TestBackend>::new(&device).load_record(record);
            // });
            // }
        }
    };
}

// Generate benchmarks for each backend
// NdArray is always built; the other backends are gated on their cargo feature.
bench_backend!(NdArrayBackend, ndarray_backend, "NdArray Backend (CPU)");
#[cfg(feature = "wgpu")]
bench_backend!(WgpuBackend, wgpu_backend, "WGPU Backend (GPU)");
#[cfg(feature = "cuda")]
bench_backend!(CudaBackend, cuda_backend, "CUDA Backend (NVIDIA GPU)");
#[cfg(feature = "tch")]
bench_backend!(TchBackend, tch_backend, "LibTorch Backend");
#[cfg(feature = "metal")]
bench_backend!(MetalBackend, metal_backend, "Metal Backend (Apple GPU)");

View File

@@ -0,0 +1,183 @@
#![recursion_limit = "256"]
//! Unified benchmark comparing all saving methods:
//! - BurnpackStore (new native format)
//! - NamedMpkFileRecorder (old native format)
//! - SafetensorsStore (new)
//!
//! No setup is required: on startup the benchmark creates its output
//! directory, `simple_bench_models_saving`, inside the system temp directory.
//!
//! Run the benchmark:
//! ```bash
//! cargo bench --bench unified_saving
//! ```
use burn_core as burn;
use burn_core::module::Module;
use burn_core::prelude::*;
use burn_core::record::{FullPrecisionSettings, NamedMpkFileRecorder};
use burn_nn as nn;
use burn_store::{BurnpackStore, ModuleSnapshot, SafetensorsStore};
use divan::{AllocProfiler, Bencher};
use std::fs;
use std::path::PathBuf;
#[global_allocator]
static ALLOC: AllocProfiler = AllocProfiler::system();
// Backend type aliases
type NdArrayBackend = burn_ndarray::NdArray<f32>;
#[cfg(feature = "wgpu")]
type WgpuBackend = burn_wgpu::Wgpu;
#[cfg(feature = "cuda")]
type CudaBackend = burn_cuda::Cuda<f32, i32>;
#[cfg(feature = "tch")]
type TchBackend = burn_tch::LibTorch<f32>;
#[cfg(feature = "metal")]
type MetalBackend = burn_wgpu::Metal;
// Use the same LargeModel as other benchmarks for fair comparison
/// Benchmark model: a stack of 20 linear layers.
#[derive(Module, Debug)]
struct LargeModel<B: Backend> {
    layers: Vec<nn::Linear<B>>,
}

impl<B: Backend> LargeModel<B> {
    /// Builds the 20 layers on `device`, matching the loading benchmarks:
    /// the first layer takes 1024 inputs, all later ones 2048; every layer
    /// has 2048 outputs.
    fn new(device: &B::Device) -> Self {
        let build_layer = |position: usize| {
            let input_width = if position == 0 { 1024 } else { 2048 };
            nn::LinearConfig::new(input_width, 2048).init(device)
        };
        Self {
            layers: (0..20).map(build_layer).collect(),
        }
    }
}
/// Directory the saving benchmarks write into: `simple_bench_models_saving`
/// inside the system temp directory.
fn get_output_dir() -> PathBuf {
    let mut dir = std::env::temp_dir();
    dir.push("simple_bench_models_saving");
    dir
}
/// Creates the output directory (and any missing parents) unless the path
/// already exists.
///
/// Returns `Err` with a descriptive message when creation fails.
fn ensure_output_dir() -> Result<(), String> {
    let output_dir = get_output_dir();
    if output_dir.exists() {
        return Ok(());
    }
    match fs::create_dir_all(&output_dir) {
        Ok(()) => Ok(()),
        Err(err) => Err(format!("Failed to create output directory: {}", err)),
    }
}
/// Prepares the output directory, prints what will be benchmarked, and runs
/// divan. Exits with status 1 when the directory cannot be created.
fn main() {
    if let Err(msg) = ensure_output_dir() {
        eprintln!("{}", msg);
        std::process::exit(1);
    }

    println!("✅ Output directory ready: {}", get_output_dir().display());
    println!();
    println!("🚀 Running unified saving benchmarks...");
    println!();
    println!("Comparing 3 saving methods:");
    println!(" 1. BurnpackStore (new native format)");
    println!(" 2. NamedMpkFileRecorder (old native format)");
    println!(" 3. SafetensorsStore (new)");
    println!();

    // NdArray is always listed; the rest depend on enabled cargo features.
    println!("Available backends:");
    println!(" - NdArray (CPU)");
    #[cfg(feature = "wgpu")]
    println!(" - WGPU (GPU)");
    #[cfg(feature = "cuda")]
    println!(" - CUDA (NVIDIA GPU)");
    #[cfg(feature = "tch")]
    println!(" - LibTorch");
    #[cfg(feature = "metal")]
    println!(" - Metal (Apple GPU)");
    println!();

    divan::main();
}
// Macro to generate benchmarks for each backend
//
// Arguments:
//   $backend      - backend type the benchmarks are instantiated for
//   $mod_name     - name of the generated module
//   $backend_name - display name passed to the divan bench group
//
// Note that each timed closure covers model construction, the save itself,
// and removal of the written file.
macro_rules! bench_backend {
    ($backend:ty, $mod_name:ident, $backend_name:literal) => {
        #[divan::bench_group(name = $backend_name, sample_count = 10)]
        mod $mod_name {
            use super::*;
            type TestBackend = $backend;
            type TestDevice = <TestBackend as Backend>::Device;

            // Save a fresh model through BurnpackStore (overwrite enabled).
            #[divan::bench]
            fn burnpack_store(bencher: Bencher) {
                bencher.bench(|| {
                    let device: TestDevice = Default::default();
                    let model = LargeModel::<TestBackend>::new(&device);
                    let output_path = get_output_dir().join("test_burnpack.bpk");
                    let mut store = BurnpackStore::from_file(output_path.clone()).overwrite(true);
                    model
                        .save_into(&mut store)
                        .expect("Failed to save with BurnpackStore");
                    // Clean up (best effort: a removal error is ignored)
                    let _ = fs::remove_file(output_path);
                });
            }

            // Save a fresh model through NamedMpkFileRecorder.
            #[divan::bench]
            fn namedmpk_recorder(bencher: Bencher) {
                bencher.bench(|| {
                    let device: TestDevice = Default::default();
                    let model = LargeModel::<TestBackend>::new(&device);
                    let output_path = get_output_dir().join("test_namedmpk.mpk");
                    let recorder = NamedMpkFileRecorder::<FullPrecisionSettings>::default();
                    model
                        .save_file(output_path.clone(), &recorder)
                        .expect("Failed to save with NamedMpkFileRecorder");
                    // Clean up (best effort: a removal error is ignored)
                    let _ = fs::remove_file(output_path);
                });
            }

            // Save a fresh model through SafetensorsStore.
            #[divan::bench]
            fn safetensors_store(bencher: Bencher) {
                bencher.bench(|| {
                    let device: TestDevice = Default::default();
                    let model = LargeModel::<TestBackend>::new(&device);
                    let output_path = get_output_dir().join("test_safetensors_store.safetensors");
                    let mut store = SafetensorsStore::from_file(output_path.clone());
                    model
                        .save_into(&mut store)
                        .expect("Failed to save with SafetensorsStore");
                    // Clean up (best effort: a removal error is ignored)
                    let _ = fs::remove_file(output_path);
                });
            }
        }
    };
}

// Generate benchmarks for each backend
// NdArray is always built; the other backends are gated on their cargo feature.
bench_backend!(NdArrayBackend, ndarray_backend, "NdArray Backend (CPU)");
#[cfg(feature = "wgpu")]
bench_backend!(WgpuBackend, wgpu_backend, "WGPU Backend (GPU)");
#[cfg(feature = "cuda")]
bench_backend!(CudaBackend, cuda_backend, "CUDA Backend (NVIDIA GPU)");
#[cfg(feature = "tch")]
bench_backend!(TchBackend, tch_backend, "LibTorch Backend");
#[cfg(feature = "metal")]
bench_backend!(MetalBackend, metal_backend, "Metal Backend (Apple GPU)");

View File

@@ -0,0 +1,596 @@
#![recursion_limit = "256"]
//! Benchmark comparing zero-copy vs copy loading modes for BurnpackStore.
//!
//! This benchmark measures the performance difference between:
//! - `zero_copy(false)` - Default mode, copies tensor data into new allocations
//! - `zero_copy(true)` - Zero-copy mode, slices tensor data without copying
//!
//! ## Understanding the Results
//!
//! **IMPORTANT**: For NdArray backend, you'll see similar allocation numbers because:
//! - NdArray uses `ndarray::ArrayD` which MUST own data as `Vec<T>`
//! - Even with zero-copy, the backend eventually copies data into its own format
//!
//! The zero-copy benefit is:
//! - **Without zero-copy**: File → Copy to heap (Bytes) → Copy to Vec (backend)
//! - **With zero-copy**: File → Zero-copy slice → Copy to Vec (backend)
//!
//! So zero-copy saves ONE memory copy at the store level. The `store_only_*` benchmarks
//! show the raw store performance without backend allocation overhead.
//!
//! GPU backends that can consume `Bytes` directly will show larger benefits.
//!
//! ## Running the benchmark
//!
//! Before running this benchmark, generate the model files:
//! ```bash
//! cd crates/burn-store
//! uv run benches/generate_unified_models.py
//! ```
//!
//! Then run the benchmark:
//! ```bash
//! cargo bench --bench zero_copy_loading
//! ```
use burn_core as burn;
use burn_core::module::Module;
use burn_core::prelude::*;
use burn_nn as nn;
use burn_store::{
BurnpackStore, ModuleSnapshot, ModuleStore, PyTorchToBurnAdapter, SafetensorsStore,
};
use burn_tensor::{AllocationProperty, Bytes};
use divan::{AllocProfiler, Bencher};
use std::fs;
use std::path::PathBuf;
use std::sync::OnceLock;
#[global_allocator]
static ALLOC: AllocProfiler = AllocProfiler::system();
// Static storage for embedded model bytes (simulating include_bytes!)
static STATIC_MODEL_BYTES: OnceLock<&'static [u8]> = OnceLock::new();
// Backend type aliases
type NdArrayBackend = burn_ndarray::NdArray<f32>;
#[cfg(feature = "wgpu")]
type WgpuBackend = burn_wgpu::Wgpu;
#[cfg(feature = "cuda")]
type CudaBackend = burn_cuda::Cuda<f32, i32>;
#[cfg(feature = "tch")]
type TchBackend = burn_tch::LibTorch<f32>;
#[cfg(feature = "metal")]
type MetalBackend = burn_wgpu::Metal;
// Use the same LargeModel as other benchmarks for fair comparison
/// Benchmark model: a stack of 20 linear layers.
#[derive(Module, Debug)]
struct LargeModel<B: Backend> {
    layers: Vec<nn::Linear<B>>,
}

impl<B: Backend> LargeModel<B> {
    /// Builds the same 20-layer model as the unified_loading benchmark on
    /// `device`: input width 1024 for the first layer and 2048 afterwards,
    /// output width 2048 throughout.
    fn new(device: &B::Device) -> Self {
        let layers = (0..20)
            .map(|layer_index| match layer_index {
                0 => 1024,
                _ => 2048,
            })
            .map(|input_width| nn::LinearConfig::new(input_width, 2048).init(device))
            .collect();
        Self { layers }
    }
}
/// Directory holding the benchmark model files: `simple_bench_models` inside
/// the system temp directory.
fn get_model_dir() -> PathBuf {
    let temp_root = std::env::temp_dir();
    temp_root.as_path().join("simple_bench_models")
}
/// Location of the Burnpack model file (`large_model.bpk`) inside the model
/// directory.
fn get_burnpack_path() -> PathBuf {
    let mut path = get_model_dir();
    path.push("large_model.bpk");
    path
}
/// Makes sure the Burnpack model file exists, converting it from the
/// SafeTensors file when necessary.
///
/// Does nothing when the Burnpack file is already present. Panics with
/// generation instructions when the SafeTensors source is missing, or when
/// loading/saving fails.
fn ensure_burnpack_file() {
    let burnpack_file = get_burnpack_path();
    let safetensors_file = get_model_dir().join("large_model.safetensors");

    // Already generated by an earlier run
    if burnpack_file.exists() {
        return;
    }

    if !safetensors_file.exists() {
        panic!(
            "\n❌ SafeTensors model file not found!\n\
            \n\
            Please generate the model files first by running:\n\
            \n\
            cd crates/burn-store\n\
            uv run benches/generate_unified_models.py\n\
            \n\
            Expected file: {}\n",
            safetensors_file.display()
        );
    }

    println!("⏳ Generating Burnpack file from SafeTensors...");

    type TestBackend = NdArrayBackend;
    let device = Default::default();

    // Read the weights into a fresh model on the NdArray backend
    let mut converted = LargeModel::<TestBackend>::new(&device);
    let mut source =
        SafetensorsStore::from_file(&safetensors_file).with_from_adapter(PyTorchToBurnAdapter);
    converted
        .load_from(&mut source)
        .expect("Failed to load from SafeTensors");

    // Write them back out as Burnpack
    let mut target = BurnpackStore::from_file(&burnpack_file);
    converted
        .save_into(&mut target)
        .expect("Failed to save as Burnpack");

    println!("✅ Created Burnpack file: {}", burnpack_file.display());
}
/// Returns the Burnpack file contents as a `'static` byte slice, reading the
/// file on the first call only (a runtime stand-in for `include_bytes!`).
///
/// Panics if the Burnpack file cannot be read on that first call.
fn get_static_model_bytes() -> &'static [u8] {
    *STATIC_MODEL_BYTES.get_or_init(|| {
        let file_contents =
            fs::read(get_burnpack_path()).expect("Failed to read Burnpack file");
        // The buffer is intentionally leaked to obtain a 'static lifetime
        // (acceptable for benchmarks).
        let leaked: &'static [u8] = Box::leak(file_contents.into_boxed_slice());
        leaked
    })
}
/// Benchmark entry point.
///
/// Ensures the Burnpack model file exists, prints a summary (file location and size,
/// the loading modes being compared, the compiled-in backends), initializes the static
/// model bytes up front so no benchmark pays for that first file read, and then hands
/// control to divan.
///
/// # Panics
///
/// Panics if the Burnpack file cannot be generated, inspected, or read.
fn main() {
    // Ensure Burnpack file exists
    ensure_burnpack_file();

    let bp_path = get_burnpack_path();
    let file_size = fs::metadata(&bp_path)
        .expect("Failed to read Burnpack file metadata")
        .len() as f64
        / 1_048_576.0;
    println!("✅ Found Burnpack model file:");
    println!(" Path: {}", bp_path.display());
    println!(" Size: {:.1} MB", file_size);
    println!();
    println!("🚀 Running zero-copy loading benchmarks...");
    println!();

    // Keep this list in sync with the benchmarks generated by `bench_backend!`.
    println!("Comparing loading modes:");
    println!(" 1. file_copy - from_file().zero_copy(false) - copies tensor data");
    println!(" 2. file_zero_copy - from_file().zero_copy(true) - zero-copy via mmap");
    println!(" 3. static_copy - from_bytes() with Vec copy - copies from static");
    println!(" 4. static_zero_copy - from_static() - zero-copy from static");
    println!(" 5. memory_shared_zero_copy - from_bytes() with shared bytes::Bytes - zero-copy from memory");
    println!();

    // Only backends whose Cargo feature is enabled are compiled in and listed.
    println!("Available backends:");
    println!(" - NdArray (CPU)");
    #[cfg(feature = "wgpu")]
    println!(" - WGPU (GPU)");
    #[cfg(feature = "cuda")]
    println!(" - CUDA (NVIDIA GPU)");
    #[cfg(feature = "tch")]
    println!(" - LibTorch");
    #[cfg(feature = "metal")]
    println!(" - Metal (Apple GPU)");
    println!();

    // Pre-initialize static bytes before benchmarks
    let _ = get_static_model_bytes();

    divan::main();
}
// Macro to generate benchmarks for each backend.
//
// Expands to one divan benchmark group (a module) for a single backend:
// - `$backend`: backend type, aliased to `TestBackend` inside the generated module
// - `$mod_name`: identifier of the generated module
// - `$backend_name`: string literal used as the divan group name
//
// Every generated benchmark builds a fresh `LargeModel` and loads the Burnpack weights
// into it inside the timed closure, and attaches a `BytesCount` counter equal to the
// size of the Burnpack data.
macro_rules! bench_backend {
($backend:ty, $mod_name:ident, $backend_name:literal) => {
#[divan::bench_group(name = $backend_name, sample_count = 10)]
mod $mod_name {
use super::*;
type TestBackend = $backend;
type TestDevice = <TestBackend as Backend>::Device;
/// File-based loading with copy mode (default)
#[divan::bench]
fn file_copy(bencher: Bencher) {
// Path and file size are resolved once, outside the timed closure.
let bp_path = get_burnpack_path();
let file_size = fs::metadata(&bp_path).unwrap().len();
bencher
.counter(divan::counter::BytesCount::new(file_size))
.bench(|| {
// Timed: model construction + store creation + load.
let device: TestDevice = Default::default();
let mut model = LargeModel::<TestBackend>::new(&device);
let mut store = BurnpackStore::from_file(&bp_path).zero_copy(false);
model.load_from(&mut store).expect("Failed to load");
});
}
/// File-based loading with zero-copy mode (mmap + bytes::Bytes)
#[divan::bench]
fn file_zero_copy(bencher: Bencher) {
let bp_path = get_burnpack_path();
let file_size = fs::metadata(&bp_path).unwrap().len();
bencher
.counter(divan::counter::BytesCount::new(file_size))
.bench(|| {
// Same as `file_copy`, except the store is created with `zero_copy(true)`.
let device: TestDevice = Default::default();
let mut model = LargeModel::<TestBackend>::new(&device);
let mut store = BurnpackStore::from_file(&bp_path).zero_copy(true);
model.load_from(&mut store).expect("Failed to load");
});
}
/// Static bytes with copy mode (simulating old behavior)
#[divan::bench]
fn static_copy(bencher: Bencher) {
// The static bytes are fetched once, outside the timed closure.
let static_bytes = get_static_model_bytes();
let file_size = static_bytes.len() as u64;
bencher
.counter(divan::counter::BytesCount::new(file_size))
.bench(|| {
let device: TestDevice = Default::default();
let mut model = LargeModel::<TestBackend>::new(&device);
// Simulate old behavior: copy static bytes to Vec, then load
// (the `to_vec()` copy is part of the timed work).
let bytes = Bytes::from_bytes_vec(static_bytes.to_vec());
let mut store = BurnpackStore::from_bytes(Some(bytes)).zero_copy(false);
model.load_from(&mut store).expect("Failed to load");
});
}
/// Static bytes with zero-copy mode (new from_static)
#[divan::bench]
fn static_zero_copy(bencher: Bencher) {
let static_bytes = get_static_model_bytes();
let file_size = static_bytes.len() as u64;
bencher
.counter(divan::counter::BytesCount::new(file_size))
.bench(|| {
let device: TestDevice = Default::default();
let mut model = LargeModel::<TestBackend>::new(&device);
// Zero-copy: use from_static on the 'static slice. With include_bytes! the data
// would live in .rodata; in this benchmark it is a leaked heap buffer
// (see get_static_model_bytes).
let mut store = BurnpackStore::from_static(static_bytes);
model.load_from(&mut store).expect("Failed to load");
});
}
/// In-memory shared bytes with zero-copy
#[divan::bench]
fn memory_shared_zero_copy(bencher: Bencher) {
let static_bytes = get_static_model_bytes();
let file_size = static_bytes.len() as u64;
// Pre-create shared bytes outside the benchmark loop
let shared = bytes::Bytes::from_static(static_bytes);
bencher
.counter(divan::counter::BytesCount::new(file_size))
.bench(|| {
let device: TestDevice = Default::default();
let mut model = LargeModel::<TestBackend>::new(&device);
// Create Bytes from shared (cheap clone of Arc)
// NOTE(review): `shared` is built with `bytes::Bytes::from_static`; confirm whether
// its clone really involves an Arc or is just a pointer/length copy.
let bytes = Bytes::from_shared(shared.clone(), AllocationProperty::Other);
let mut store = BurnpackStore::from_bytes(Some(bytes)).zero_copy(true);
model.load_from(&mut store).expect("Failed to load");
});
}
}
};
}
// =============================================================================
// Zero-copy verification (proves operations use static region data)
// =============================================================================
/// Verify that zero-copy loading actually uses data from the static region.
///
/// The checks are registered as a divan bench group with `sample_count = 1` and run on
/// the NdArray `f32` backend. Each check panics (via `assert!`/`expect`) when it fails.
/// NOTE(review): the original comment states this runs once at startup, before
/// benchmarking; the execution order is decided by divan - confirm.
#[divan::bench_group(name = "Zero-Copy Verification", sample_count = 1)]
mod verification {
use super::*;
use burn_ndarray::NdArray;
// Backend used by every check in this module.
type B = NdArray<f32>;
/// Verify zero-copy: tensor storage is borrowed (not owned)
///
/// Loads the model via `BurnpackStore::from_static` and asserts that the first
/// layer's weight reports `is_borrowed()`.
#[divan::bench]
fn verify_storage_is_borrowed() {
let static_bytes = get_static_model_bytes();
// Load model with zero-copy from static bytes
let device = Default::default();
let mut model = LargeModel::<B>::new(&device);
let mut store = BurnpackStore::from_static(static_bytes);
model.load_from(&mut store).expect("Failed to load");
// Get the first layer's weight tensor and verify it uses borrowed storage
let weight = model.layers[0].weight.val();
// .into_primitive() returns TensorPrimitive<B>, .tensor() extracts B::FloatTensorPrimitive
let ndarray_tensor = weight.into_primitive().tensor();
// Verify the storage is borrowed (zero-copy from static region)
assert!(
ndarray_tensor.is_borrowed(),
"ZERO-COPY FAILURE: Tensor storage is NOT borrowed. \
Data was copied instead of being zero-copy!"
);
println!("✅ Verified: Tensor storage is borrowed (zero-copy from static region)");
}
/// Verify ALL layers use borrowed (zero-copy) storage.
/// This is the key proof that loaded weights point to static memory.
///
/// Only the `weight` of each layer is checked; the reported size is derived from the
/// weight element counts.
#[divan::bench]
fn verify_all_layers_borrowed() {
let static_bytes = get_static_model_bytes();
// Load model with zero-copy
let device = Default::default();
let mut model = LargeModel::<B>::new(&device);
let mut store = BurnpackStore::from_static(static_bytes);
model.load_from(&mut store).expect("Failed to load");
// Check ALL layers have borrowed storage
let mut total_elements = 0usize;
for (i, layer) in model.layers.iter().enumerate() {
let weight = layer.weight.val();
// Count elements before `into_primitive()` consumes the tensor.
total_elements += weight.shape().num_elements();
assert!(
weight.into_primitive().tensor().is_borrowed(),
"Layer {} weight should be borrowed (zero-copy)",
i
);
}
// 4 bytes per element: the module's backend is `NdArray<f32>`.
let total_mb = (total_elements * 4) as f64 / 1_048_576.0;
println!(
"✅ Verified: All {} layers use borrowed storage",
model.layers.len()
);
println!(
" - Model size: {:.2} MB - all pointing to static region",
total_mb
);
}
/// Verify data is readable and correct using sum().into_scalar().
/// Note: sum() triggers COW copy, so this shows ops work correctly on zero-copy data.
///
/// The only assertion is that the sum of the first layer's weight is finite.
/// NOTE(review): the copy-on-write remark above is not verifiable from this file.
#[divan::bench]
fn verify_ops_produce_correct_results() {
let static_bytes = get_static_model_bytes();
let device = Default::default();
let mut model = LargeModel::<B>::new(&device);
let mut store = BurnpackStore::from_static(static_bytes);
model.load_from(&mut store).expect("Failed to load");
// Compute sum of first layer weight - proves data is valid
let weight = model.layers[0].weight.val();
let sum: f32 = weight.sum().into_scalar();
assert!(sum.is_finite(), "Sum should be finite");
println!("✅ Verified: Operations on zero-copy data produce valid results");
println!(" - First layer sum: {:.4}", sum);
}
/// Verify operations produce correct results on zero-copy data
///
/// Runs three checks on the first layer's weight: the sum is finite, the sum of
/// `W @ W.T` is finite, and the sum of `W * 2.0` is within 1e-3 of twice the sum of `W`.
#[divan::bench]
fn verify_operations_on_static_data() {
let static_bytes = get_static_model_bytes();
// Load model with zero-copy
let device = Default::default();
let mut model = LargeModel::<B>::new(&device);
let mut store = BurnpackStore::from_static(static_bytes);
model.load_from(&mut store).expect("Failed to load");
// Perform operations on the loaded weights
let weight = model.layers[0].weight.val();
let shape = weight.shape();
// Test 1: Sum should be finite (not NaN or Inf)
let sum: f32 = weight.clone().sum().to_data().to_vec().unwrap()[0];
assert!(
sum.is_finite(),
"Operation failed: sum is not finite ({})",
sum
);
// Test 2: Matrix multiply with itself transposed (W @ W.T)
let transposed = weight.clone().transpose();
let matmul_result = weight.clone().matmul(transposed);
let matmul_sum: f32 = matmul_result.sum().to_data().to_vec().unwrap()[0];
assert!(
matmul_sum.is_finite(),
"Matmul failed: result sum is not finite ({})",
matmul_sum
);
// Test 3: Element-wise operations
let doubled = weight.clone() * 2.0;
let doubled_sum: f32 = doubled.sum().to_data().to_vec().unwrap()[0];
// Absolute tolerance of 1e-3 between sum(2W) and 2*sum(W).
assert!(
(doubled_sum - sum * 2.0).abs() < 1e-3,
"Element-wise op failed: doubled_sum ({}) != sum*2 ({})",
doubled_sum,
sum * 2.0
);
println!("✅ Verified: Operations on zero-copy data produce correct results");
println!(" - Weight shape: {:?}", shape.as_slice());
println!(" - Sum: {:.4}", sum);
println!(" - Matmul result sum: {:.4}", matmul_sum);
}
/// Compare zero-copy vs copy: verify both produce identical results
///
/// Loads the same bytes twice (once via `from_static`, once via a `Vec` copy with
/// `zero_copy(false)`) and compares each layer's weight shape and weight sum.
/// Biases are not compared, and the sum is only a proxy for element-wise equality.
#[divan::bench]
fn verify_copy_vs_zero_copy_equality() {
let static_bytes = get_static_model_bytes();
let device: <B as Backend>::Device = Default::default();
// Load with zero-copy
let mut model_zc = LargeModel::<B>::new(&device);
let mut store_zc = BurnpackStore::from_static(static_bytes);
model_zc
.load_from(&mut store_zc)
.expect("Failed to load zero-copy");
// Load with copy (simulate old behavior)
let mut model_copy = LargeModel::<B>::new(&device);
let bytes = Bytes::from_bytes_vec(static_bytes.to_vec());
let mut store_copy = BurnpackStore::from_bytes(Some(bytes)).zero_copy(false);
model_copy
.load_from(&mut store_copy)
.expect("Failed to load copy");
// Compare weights from both models
for (i, (layer_zc, layer_copy)) in model_zc
.layers
.iter()
.zip(model_copy.layers.iter())
.enumerate()
{
let weight_zc = layer_zc.weight.val();
let weight_copy = layer_copy.weight.val();
// Check shapes match
assert_eq!(
weight_zc.shape(),
weight_copy.shape(),
"Layer {} weight shapes don't match",
i
);
// Check values match (using sum as a proxy)
let sum_zc: f32 = weight_zc.clone().sum().to_data().to_vec().unwrap()[0];
let sum_copy: f32 = weight_copy.clone().sum().to_data().to_vec().unwrap()[0];
// Absolute tolerance of 1e-6 between the two sums.
assert!(
(sum_zc - sum_copy).abs() < 1e-6,
"Layer {} weight sums don't match: zero-copy={}, copy={}",
i,
sum_zc,
sum_copy
);
}
println!(
"✅ Verified: Zero-copy and copy loading produce identical results for all {} layers",
model_zc.layers.len()
);
}
}
// =============================================================================
// Store-only benchmarks (no backend allocation overhead)
// These show the TRUE zero-copy benefit at the store level
// =============================================================================
#[divan::bench_group(name = "Store Only (no backend)", sample_count = 10)]
mod store_only {
use super::*;
/// File-based store with copy mode - measures store overhead only
#[divan::bench]
fn file_copy(bencher: Bencher) {
let bp_path = get_burnpack_path();
let file_size = fs::metadata(&bp_path).unwrap().len();
bencher
.counter(divan::counter::BytesCount::new(file_size))
.bench(|| {
let mut store = BurnpackStore::from_file(&bp_path).zero_copy(false);
// Just iterate through all tensor snapshots, calling to_data() on each
// This forces the store to read and materialize all tensor data
let snapshots = store.get_all_snapshots().expect("Failed to get snapshots");
for snapshot in snapshots.values() {
let _data = snapshot.to_data().expect("Failed to get tensor data");
}
});
}
/// File-based store with zero-copy mode - measures store overhead only
#[divan::bench]
fn file_zero_copy(bencher: Bencher) {
let bp_path = get_burnpack_path();
let file_size = fs::metadata(&bp_path).unwrap().len();
bencher
.counter(divan::counter::BytesCount::new(file_size))
.bench(|| {
let mut store = BurnpackStore::from_file(&bp_path).zero_copy(true);
let snapshots = store.get_all_snapshots().expect("Failed to get snapshots");
for snapshot in snapshots.values() {
let _data = snapshot.to_data().expect("Failed to get tensor data");
}
});
}
/// Static bytes with copy mode - measures store overhead only
#[divan::bench]
fn static_copy(bencher: Bencher) {
let static_bytes = get_static_model_bytes();
let file_size = static_bytes.len() as u64;
bencher
.counter(divan::counter::BytesCount::new(file_size))
.bench(|| {
// Simulate old behavior: copy static bytes to Vec
let bytes = Bytes::from_bytes_vec(static_bytes.to_vec());
let mut store = BurnpackStore::from_bytes(Some(bytes)).zero_copy(false);
let snapshots = store.get_all_snapshots().expect("Failed to get snapshots");
for snapshot in snapshots.values() {
let _data = snapshot.to_data().expect("Failed to get tensor data");
}
});
}
/// Static bytes with zero-copy mode - measures store overhead only
#[divan::bench]
fn static_zero_copy(bencher: Bencher) {
let static_bytes = get_static_model_bytes();
let file_size = static_bytes.len() as u64;
bencher
.counter(divan::counter::BytesCount::new(file_size))
.bench(|| {
let mut store = BurnpackStore::from_static(static_bytes);
let snapshots = store.get_all_snapshots().expect("Failed to get snapshots");
for snapshot in snapshots.values() {
let _data = snapshot.to_data().expect("Failed to get tensor data");
}
});
}
}
// =============================================================================
// Full model loading benchmarks (includes backend allocation)
// =============================================================================
// Generate benchmarks for each backend.
// The NdArray group is always generated; every other group is compiled only when the
// matching Cargo feature ("wgpu", "cuda", "tch", "metal") is enabled.
bench_backend!(NdArrayBackend, ndarray_backend, "NdArray Backend (CPU)");
#[cfg(feature = "wgpu")]
bench_backend!(WgpuBackend, wgpu_backend, "WGPU Backend (GPU)");
#[cfg(feature = "cuda")]
bench_backend!(CudaBackend, cuda_backend, "CUDA Backend (NVIDIA GPU)");
#[cfg(feature = "tch")]
bench_backend!(TchBackend, tch_backend, "LibTorch Backend");
#[cfg(feature = "metal")]
bench_backend!(MetalBackend, metal_backend, "Metal Backend (Apple GPU)");