feat: update workspace paths and enhance gitignore
- Updated stablediffusion crate path from "../stable-diffusion-burn" to "./crates/stable-diffusion-burn" for proper workspace resolution - Enhanced .gitignore to include generated model files (.mpk, .pt, .bin, .safetensors, .ckpt) and user_data directory - Added Cargo.lock to gitignore with appropriate comment - Reorganized IDE files section in gitignore for better clarity - Added newline at end of file for proper formatting
This commit is contained in:
@@ -0,0 +1,82 @@
|
||||
#!/usr/bin/env python3
|
||||
# /// script
|
||||
# requires-python = ">=3.8"
|
||||
# dependencies = [
|
||||
# "torch",
|
||||
# "torchvision",
|
||||
# ]
|
||||
# ///
|
||||
"""
|
||||
Download ResNet18 PyTorch model for benchmarking.
|
||||
This script downloads a pre-trained ResNet18 model from PyTorch Hub
|
||||
and saves it in a format suitable for benchmarking.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
import torchvision.models as models
|
||||
|
||||
def download_resnet18():
    """Download the pre-trained ResNet18 weights and cache them in the temp directory.

    The model's state dict is stored at
    ``<tempdir>/burn_resnet18_benchmark/resnet18.pth`` using PyTorch's legacy
    (non-zipfile) serialization. If that file already exists it is reused and
    nothing is downloaded.

    The weights are first written to a scratch file and only renamed to the
    final name after they have been saved and re-loaded successfully, so an
    interrupted or failed run never leaves a truncated ``resnet18.pth`` behind
    that a later run would mistake for a finished download.

    Returns:
        str: Path of the cached ``resnet18.pth`` file.

    Exits the process with status 1 if downloading, saving or verifying fails.
    """
    # Create a temporary directory for the model
    temp_dir = Path(tempfile.gettempdir()) / "burn_resnet18_benchmark"
    temp_dir.mkdir(parents=True, exist_ok=True)

    output_path = temp_dir / "resnet18.pth"

    # Check if already downloaded
    if output_path.exists():
        file_size_mb = output_path.stat().st_size / (1024 * 1024)
        print(f"✅ ResNet18 already exists at: {output_path}")
        print(f" Size: {file_size_mb:.1f} MB")
        return str(output_path)

    print("📥 Downloading ResNet18 model...")

    # Scratch location; promoted to output_path only after verification.
    partial_path = temp_dir / "resnet18.pth.partial"

    try:
        # Prefer the `weights=` API; `pretrained=True` is deprecated in newer
        # torchvision releases. IMAGENET1K_V1 is what `pretrained=True` selects.
        # Fall back to the old keyword on torchvision versions without the enum.
        weights_enum = getattr(models, "ResNet18_Weights", None)
        if weights_enum is not None:
            model = models.resnet18(weights=weights_enum.IMAGENET1K_V1)
        else:
            model = models.resnet18(pretrained=True)

        # Save the model state dict (this is what burn-store reads)
        # Using the legacy format for compatibility
        torch.save(model.state_dict(), partial_path, _use_new_zipfile_serialization=False)

        # Verify it's readable before publishing it under the final name
        state_dict = torch.load(partial_path, map_location='cpu')
        partial_path.replace(output_path)

        file_size_mb = output_path.stat().st_size / (1024 * 1024)
        print(f"✅ Successfully downloaded ResNet18 to: {output_path}")
        print(f" Size: {file_size_mb:.1f} MB")
        print(" Format: PyTorch legacy format")
        print(f" Tensors: {len(state_dict)} tensors")

        # Print a few tensor names and shapes for verification
        print("\n Sample tensors:")
        for name, tensor in list(state_dict.items())[:3]:
            print(f" - {name}: {list(tensor.shape)}")

        return str(output_path)

    except Exception as e:
        # Best-effort cleanup of the scratch file; it may not exist yet.
        try:
            partial_path.unlink()
        except OSError:
            pass
        print(f"❌ Failed to download ResNet18: {e}")
        sys.exit(1)
|
||||
|
||||
def main():
    """Fetch the model, then record where it lives for the Rust benchmark."""
    model_path = download_resnet18()

    # The benchmark reads this file to locate the downloaded model.
    marker_dir = Path(tempfile.gettempdir()) / "burn_resnet18_benchmark"
    (marker_dir / "path.txt").write_text(model_path)

    print("\n💡 Model ready for benchmarking")
    print(" Run: cargo bench --bench resnet18_loading")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,175 @@
|
||||
#!/usr/bin/env python3
|
||||
# /// script
|
||||
# requires-python = ">=3.8"
|
||||
# dependencies = [
|
||||
# "torch",
|
||||
# "safetensors",
|
||||
# "packaging",
|
||||
# "numpy",
|
||||
# ]
|
||||
# ///
|
||||
"""
|
||||
Generate a large model (~312MB) in both PyTorch and SafeTensors formats for unified benchmarking.
|
||||
|
||||
Usage:
|
||||
uv run benches/generate_unified_models.py
|
||||
|
||||
The script creates the model files in the simple_bench_models/ directory under the system temp directory (e.g. /tmp/simple_bench_models/ on Linux).
|
||||
"""
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import os
|
||||
from pathlib import Path
|
||||
import tempfile
|
||||
from safetensors.torch import save_file
|
||||
|
||||
def get_temp_dir():
    """Return ``<system tempdir>/simple_bench_models``, creating it if it is missing."""
    target = Path(tempfile.gettempdir(), "simple_bench_models")
    target.mkdir(parents=True, exist_ok=True)
    return target
|
||||
|
||||
class LargeModel(nn.Module):
    """Stack of 20 linear layers mirroring the Rust benchmark's ``LargeModel``.

    The first layer maps 1024 -> 2048 features; every later layer maps
    2048 -> 2048.
    """

    def __init__(self):
        super().__init__()

        # Input width per layer: 1024 for the first, 2048 for the other 19.
        input_widths = [1024] + [2048] * 19
        self.layers = nn.ModuleList([nn.Linear(width, 2048) for width in input_widths])

        print(f"Created model with {len(self.layers)} layers")

    def forward(self, x):
        """Apply every layer in order and return the final activation."""
        out = x
        for linear in self.layers:
            out = linear(out)
        return out
|
||||
|
||||
def calculate_model_size(model):
    """Return ``(parameter_count, size_in_mb)`` for ``model``.

    The size is computed at 4 bytes per parameter (float32), in units of
    1024 * 1024 bytes.
    """
    param_count = 0
    for tensor in model.parameters():
        param_count += tensor.numel()

    byte_count = param_count * 4  # 4 bytes per float32
    return param_count, byte_count / (1024 * 1024)
|
||||
|
||||
def initialize_weights(model):
    """Re-initialize every parameter of ``model`` in place.

    Parameters with more than one dimension get Xavier-uniform values;
    all others are filled with zeros.
    """
    for tensor in model.parameters():
        init_fn = nn.init.xavier_uniform_ if tensor.dim() > 1 else nn.init.zeros_
        init_fn(tensor)
|
||||
|
||||
def save_pytorch_format(model, output_dir):
    """Write ``model`` to ``output_dir / "large_model.pt"`` and return that path.

    The file is a checkpoint dict holding the weights under
    ``'model_state_dict'`` (a common layout) plus a small ``'metadata'`` dict.
    """
    destination = output_dir / "large_model.pt"

    torch.save(
        {
            'model_state_dict': model.state_dict(),
            'metadata': {
                'model_type': 'large_benchmark_model',
                'num_layers': len(model.layers),
            },
        },
        destination,
    )

    return destination
|
||||
|
||||
def save_safetensors_format(model, output_dir):
    """Write ``model`` to ``output_dir / "large_model.safetensors"`` and return that path."""
    destination = output_dir / "large_model.safetensors"

    # Every tensor is made contiguous and moved to the CPU before saving.
    tensors = {}
    for name, value in model.state_dict().items():
        tensors[name] = value.contiguous().cpu()

    # The layer count is passed as a string in the metadata.
    save_file(
        tensors,
        destination,
        metadata={
            'model_type': 'large_benchmark_model',
            'num_layers': str(len(model.layers)),
        },
    )

    return destination
|
||||
|
||||
def verify_files(pt_path, st_path):
    """Re-open both saved files and report whether their tensor names agree.

    Only the key sets are compared; a mismatch is printed as a warning and
    does not raise.
    """
    # Tensor names stored in the PyTorch checkpoint
    torch_names = set(torch.load(pt_path, map_location='cpu')['model_state_dict'])
    print(f" PyTorch file: {len(torch_names)} tensors")

    # Tensor names stored in the SafeTensors file
    from safetensors import safe_open
    with safe_open(st_path, framework="pt", device="cpu") as handle:
        st_names = set(handle.keys())
    print(f" SafeTensors file: {len(st_names)} tensors")

    if torch_names == st_names:
        print(" ✓ Keys match between formats")
    else:
        print(" ⚠️ Warning: Keys don't match between formats!")
|
||||
|
||||
def main():
    """Build the benchmark model, save it in both formats, verify, and print a summary."""
    print("🔧 Generating unified benchmark model files...")
    print()

    output_dir = get_temp_dir()
    print(f"📁 Output directory: {output_dir}")
    print()

    # Fixed seed so repeated runs generate the same weights
    torch.manual_seed(42)

    print("📝 Creating large model...")
    model = LargeModel()

    total_params, size_mb = calculate_model_size(model)
    print(f" Total parameters: {total_params:,}")
    print(f" Model size: {size_mb:.2f} MB")
    print()

    print("🎲 Initializing weights...")
    initialize_weights(model)

    # Save once per format, remembering (label, path, size) for the summary.
    bytes_per_mb = 1024 * 1024
    saved = []
    for label, saver in (
        ("PyTorch", save_pytorch_format),
        ("SafeTensors", save_safetensors_format),
    ):
        print(f"💾 Saving {label} format...")
        path = saver(model, output_dir)
        file_mb = path.stat().st_size / bytes_per_mb
        print(f" Saved: {path}")
        print(f" File size: {file_mb:.2f} MB")
        print()
        saved.append((label, path, file_mb))

    print("🔍 Verifying saved files...")
    verify_files(saved[0][1], saved[1][1])
    print()

    print("✅ Model files generated successfully!")
    print()
    print("📊 Summary:")
    for label, path, file_mb in saved:
        print(f" {label} file: {path.name} ({file_mb:.2f} MB)")
    print()
    print("💡 To run the unified benchmark:")
    print(" cargo bench --bench unified_loading")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,213 @@
|
||||
//! Benchmark for ResNet18 loading to verify lazy loading memory usage.
|
||||
//!
|
||||
//! resnet18.pth is pytorch's legacy file format.
|
||||
//!
|
||||
//! This benchmark loads a ResNet18 model and materializes all tensors
|
||||
//! to ensure memory usage stays reasonable with lazy loading.
|
||||
//!
|
||||
//! Run the benchmark:
|
||||
//! ```bash
|
||||
//! cargo bench --bench resnet18_loading
|
||||
//! ```
|
||||
|
||||
use burn_store::pytorch::PytorchReader;
|
||||
use divan::{AllocProfiler, Bencher};
|
||||
use std::path::PathBuf;
|
||||
|
||||
#[global_allocator]
|
||||
static ALLOC: AllocProfiler = AllocProfiler::system();
|
||||
|
||||
#[allow(clippy::manual_range_contains)]
|
||||
fn main() {
|
||||
// Check if ResNet18 file exists
|
||||
let path = resnet18_path();
|
||||
if !path.exists() {
|
||||
eprintln!("❌ ResNet18 model not found!");
|
||||
eprintln!();
|
||||
eprintln!("Please download it first by running:");
|
||||
eprintln!(" python benches/download_resnet18.py");
|
||||
eprintln!();
|
||||
eprintln!("Or if you don't have Python/PyTorch installed:");
|
||||
eprintln!(" uv run benches/download_resnet18.py");
|
||||
eprintln!();
|
||||
eprintln!("Expected location: {}", path.display());
|
||||
std::process::exit(1);
|
||||
}
|
||||
|
||||
// Verify file size is reasonable
|
||||
let metadata = std::fs::metadata(&path).expect("Failed to read file metadata");
|
||||
let size_mb = metadata.len() as f64 / 1_048_576.0;
|
||||
|
||||
if size_mb < 40.0 || size_mb > 50.0 {
|
||||
eprintln!(
|
||||
"⚠️ Warning: ResNet18 file size ({:.1} MB) seems unusual",
|
||||
size_mb
|
||||
);
|
||||
eprintln!("Expected size is around 45 MB");
|
||||
}
|
||||
|
||||
println!("✅ Found ResNet18 model at: {}", path.display());
|
||||
println!("📦 File size: {:.1} MB", size_mb);
|
||||
println!("📊 Running ResNet18 loading benchmarks...\n");
|
||||
|
||||
// Run divan benchmarks
|
||||
divan::main();
|
||||
}
|
||||
|
||||
/// Locate the ResNet18 model file.
///
/// Prefers the path recorded in `<temp>/burn_resnet18_benchmark/path.txt` (written by
/// the download script) when that path exists; otherwise returns the default
/// `<temp>/burn_resnet18_benchmark/resnet18.pth`, whether or not it exists.
fn resnet18_path() -> PathBuf {
    let bench_dir = std::env::temp_dir().join("burn_resnet18_benchmark");

    // An unreadable or missing path.txt, or a stale path inside it, falls through
    // to the default location.
    std::fs::read_to_string(bench_dir.join("path.txt"))
        .ok()
        .map(|contents| PathBuf::from(contents.trim()))
        .filter(|candidate| candidate.exists())
        .unwrap_or_else(|| bench_dir.join("resnet18.pth"))
}
|
||||
|
||||
#[divan::bench(sample_count = 10)]
|
||||
fn load_resnet18_metadata(bencher: Bencher) {
|
||||
let path = resnet18_path();
|
||||
|
||||
bencher.bench_local(|| {
|
||||
let reader = PytorchReader::new(&path).expect("Failed to load ResNet18");
|
||||
let metadata = reader.metadata();
|
||||
|
||||
// Just access metadata without materializing tensors
|
||||
assert_eq!(metadata.tensor_count, 122);
|
||||
});
|
||||
}
|
||||
|
||||
#[divan::bench(sample_count = 5)]
|
||||
fn load_resnet18_materialize_all(bencher: Bencher) {
|
||||
let path = resnet18_path();
|
||||
|
||||
bencher.bench_local(|| {
|
||||
let reader = PytorchReader::new(&path).expect("Failed to load ResNet18");
|
||||
let keys = reader.keys();
|
||||
|
||||
let mut total_bytes = 0usize;
|
||||
|
||||
// Materialize all tensors one by one
|
||||
for key in &keys {
|
||||
let tensor = reader.get(key).expect("Failed to get tensor");
|
||||
// Materialize the tensor data
|
||||
let _data = tensor.to_data().expect("Failed to materialize tensor data");
|
||||
total_bytes += tensor.data_len();
|
||||
}
|
||||
|
||||
// Verify we processed all the data
|
||||
assert!(total_bytes > 40_000_000); // Should be ~45MB
|
||||
});
|
||||
}
|
||||
|
||||
#[divan::bench(sample_count = 5)]
|
||||
fn load_resnet18_materialize_sequential(bencher: Bencher) {
|
||||
let path = resnet18_path();
|
||||
|
||||
bencher.bench_local(|| {
|
||||
let reader = PytorchReader::new(&path).expect("Failed to load ResNet18");
|
||||
let keys = reader.keys();
|
||||
|
||||
// Materialize tensors one at a time, letting previous ones be dropped
|
||||
// This simulates processing tensors sequentially without keeping all in memory
|
||||
for key in &keys {
|
||||
let tensor = reader.get(key).expect("Failed to get tensor");
|
||||
let data = tensor.to_data().expect("Failed to materialize tensor data");
|
||||
|
||||
// Do minimal work with the data to prevent optimization
|
||||
let sum = match data.dtype {
|
||||
burn_tensor::DType::F32 => data
|
||||
.as_slice::<f32>()
|
||||
.map(|s| s.iter().sum::<f32>())
|
||||
.unwrap_or(0.0) as f64,
|
||||
burn_tensor::DType::F64 => data
|
||||
.as_slice::<f64>()
|
||||
.map(|s| s.iter().sum::<f64>())
|
||||
.unwrap_or(0.0),
|
||||
_ => 0.0,
|
||||
};
|
||||
|
||||
// Use the sum to prevent dead code elimination
|
||||
std::hint::black_box(sum);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
#[divan::bench(sample_count = 10)]
|
||||
fn load_resnet18_largest_tensor(bencher: Bencher) {
|
||||
let path = resnet18_path();
|
||||
|
||||
bencher.bench_local(|| {
|
||||
let reader = PytorchReader::new(&path).expect("Failed to load ResNet18");
|
||||
|
||||
// Find and materialize only the largest tensor
|
||||
// This tests peak memory for a single tensor operation
|
||||
let keys = reader.keys();
|
||||
let mut largest_key = String::new();
|
||||
let mut largest_size = 0usize;
|
||||
|
||||
for key in &keys {
|
||||
let tensor = reader.get(key).expect("Failed to get tensor");
|
||||
let size = tensor.data_len();
|
||||
if size > largest_size {
|
||||
largest_size = size;
|
||||
largest_key = key.clone();
|
||||
}
|
||||
}
|
||||
|
||||
// Materialize the largest tensor
|
||||
let tensor = reader
|
||||
.get(&largest_key)
|
||||
.expect("Failed to get largest tensor");
|
||||
let _data = tensor.to_data().expect("Failed to materialize tensor data");
|
||||
|
||||
assert!(largest_size > 9_000_000); // Should be ~9MB for layer4.0.conv2.weight
|
||||
});
|
||||
}
|
||||
|
||||
#[divan::bench(sample_count = 10)]
|
||||
fn load_resnet18_memory_profile(bencher: Bencher) {
|
||||
let path = resnet18_path();
|
||||
|
||||
bencher
|
||||
.with_inputs(|| path.clone())
|
||||
.bench_local_values(|path| {
|
||||
let reader = PytorchReader::new(&path).expect("Failed to load ResNet18");
|
||||
let keys = reader.keys();
|
||||
|
||||
let mut peak_single_tensor = 0usize;
|
||||
let mut total_data = 0usize;
|
||||
|
||||
// Process each tensor and track memory
|
||||
for key in &keys {
|
||||
let tensor = reader.get(key).expect("Failed to get tensor");
|
||||
let tensor_size = tensor.data_len();
|
||||
|
||||
// Track largest single tensor
|
||||
if tensor_size > peak_single_tensor {
|
||||
peak_single_tensor = tensor_size;
|
||||
}
|
||||
|
||||
// Materialize the tensor
|
||||
let data = tensor.to_data().expect("Failed to materialize tensor data");
|
||||
total_data += tensor_size;
|
||||
|
||||
// Drop data immediately to test lazy loading memory efficiency
|
||||
drop(data);
|
||||
}
|
||||
|
||||
// Return stats for verification
|
||||
(peak_single_tensor, total_data)
|
||||
});
|
||||
}
|
||||
@@ -0,0 +1,332 @@
|
||||
#![recursion_limit = "256"]
|
||||
|
||||
//! Unified benchmark comparing all loading methods:
|
||||
//! - BurnpackStore (new native format)
|
||||
//! - NamedMpkFileRecorder (old native format)
|
||||
//! - SafetensorsStore (new)
|
||||
//! - SafetensorsFileRecorder (old; its benchmark is currently commented out)
|
||||
//! - PytorchStore (new)
|
||||
//! - PyTorchFileRecorder (old; its benchmark is currently commented out)
|
||||
//!
|
||||
//! Before running this benchmark, generate the model files:
|
||||
//! ```bash
|
||||
//! cd crates/burn-store
|
||||
//! uv run benches/generate_unified_models.py
|
||||
//! ```
|
||||
//!
|
||||
//! Then run the benchmark:
|
||||
//! ```bash
|
||||
//! cargo bench --bench unified_loading
|
||||
//! ```
|
||||
|
||||
use burn_core as burn;
|
||||
|
||||
use burn_core::module::Module;
|
||||
use burn_core::prelude::*;
|
||||
use burn_core::record::{FullPrecisionSettings, NamedMpkFileRecorder, Recorder};
|
||||
// use burn_import::pytorch::{LoadArgs, PyTorchFileRecorder};
|
||||
// use burn_import::safetensors::SafetensorsFileRecorder;
|
||||
use burn_nn as nn;
|
||||
use burn_store::{
|
||||
BurnpackStore, ModuleSnapshot, PyTorchToBurnAdapter, PytorchStore, SafetensorsStore,
|
||||
};
|
||||
use divan::{AllocProfiler, Bencher};
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
#[global_allocator]
|
||||
static ALLOC: AllocProfiler = AllocProfiler::system();
|
||||
|
||||
// Backend type aliases
|
||||
type NdArrayBackend = burn_ndarray::NdArray<f32>;
|
||||
|
||||
#[cfg(feature = "wgpu")]
|
||||
type WgpuBackend = burn_wgpu::Wgpu;
|
||||
|
||||
#[cfg(feature = "cuda")]
|
||||
type CudaBackend = burn_cuda::Cuda<f32, i32>;
|
||||
|
||||
#[cfg(feature = "tch")]
|
||||
type TchBackend = burn_tch::LibTorch<f32>;
|
||||
|
||||
#[cfg(feature = "metal")]
|
||||
type MetalBackend = burn_wgpu::Metal;
|
||||
|
||||
// Use the same LargeModel as other benchmarks for fair comparison
|
||||
#[derive(Module, Debug)]
|
||||
struct LargeModel<B: Backend> {
|
||||
layers: Vec<nn::Linear<B>>,
|
||||
}
|
||||
|
||||
impl<B: Backend> LargeModel<B> {
|
||||
fn new(device: &B::Device) -> Self {
|
||||
let mut layers = Vec::new();
|
||||
// Create a model with 20 layers - same as safetensor_loading benchmark
|
||||
for i in 0..20 {
|
||||
let in_size = if i == 0 { 1024 } else { 2048 };
|
||||
layers.push(nn::LinearConfig::new(in_size, 2048).init(device));
|
||||
}
|
||||
Self { layers }
|
||||
}
|
||||
}
|
||||
|
||||
/// Directory holding the benchmark model files: `<system temp dir>/simple_bench_models`.
fn get_model_dir() -> PathBuf {
    let mut dir = std::env::temp_dir();
    dir.push("simple_bench_models");
    dir
}
|
||||
|
||||
/// Generate Burnpack and NamedMpk files from existing SafeTensors file
|
||||
fn generate_burn_formats(st_path: &Path, bp_path: &Path, mpk_path: &Path) {
|
||||
type TestBackend = NdArrayBackend;
|
||||
let device = Default::default();
|
||||
|
||||
// Load the model from SafeTensors
|
||||
let mut model = LargeModel::<TestBackend>::new(&device);
|
||||
let mut store = SafetensorsStore::from_file(st_path).with_from_adapter(PyTorchToBurnAdapter);
|
||||
model
|
||||
.load_from(&mut store)
|
||||
.expect("Failed to load from SafeTensors");
|
||||
|
||||
// Save as Burnpack
|
||||
if !bp_path.exists() {
|
||||
println!(" Creating Burnpack file...");
|
||||
let mut burnpack_store = BurnpackStore::from_file(bp_path);
|
||||
model
|
||||
.save_into(&mut burnpack_store)
|
||||
.expect("Failed to save as Burnpack");
|
||||
}
|
||||
|
||||
// Save as NamedMpk
|
||||
if !mpk_path.exists() {
|
||||
println!(" Creating NamedMpk file...");
|
||||
let recorder = NamedMpkFileRecorder::<FullPrecisionSettings>::default();
|
||||
model
|
||||
.save_file(mpk_path, &recorder)
|
||||
.expect("Failed to save as NamedMpk");
|
||||
}
|
||||
}
|
||||
|
||||
/// Get paths to the model files
|
||||
fn get_model_paths() -> (PathBuf, PathBuf, PathBuf, PathBuf) {
|
||||
let dir = get_model_dir();
|
||||
(
|
||||
dir.join("large_model.bpk"),
|
||||
dir.join("large_model.mpk"),
|
||||
dir.join("large_model.safetensors"),
|
||||
dir.join("large_model.pt"),
|
||||
)
|
||||
}
|
||||
|
||||
/// Check if model files exist
|
||||
fn check_model_files() -> Result<(), String> {
|
||||
let (_, _, st_path, pt_path) = get_model_paths();
|
||||
|
||||
// For now, only check safetensors and pytorch files (will generate burnpack/mpk later)
|
||||
if !st_path.exists() || !pt_path.exists() {
|
||||
return Err(format!(
|
||||
"\n❌ Model files not found!\n\
|
||||
\n\
|
||||
Please generate the model files first by running:\n\
|
||||
\n\
|
||||
cd crates/burn-store\n\
|
||||
uv run benches/generate_unified_models.py\n\
|
||||
\n\
|
||||
Expected files:\n\
|
||||
- {}\n\
|
||||
- {}\n",
|
||||
st_path.display(),
|
||||
pt_path.display()
|
||||
));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn main() {
|
||||
// Check if model files exist before running benchmarks
|
||||
match check_model_files() {
|
||||
Ok(()) => {
|
||||
let (bp_path, mpk_path, st_path, pt_path) = get_model_paths();
|
||||
|
||||
// First, generate Burnpack and MPK files if they don't exist
|
||||
if !bp_path.exists() || !mpk_path.exists() {
|
||||
println!("⏳ Generating Burnpack and NamedMpk files from SafeTensors...");
|
||||
generate_burn_formats(&st_path, &bp_path, &mpk_path);
|
||||
}
|
||||
|
||||
let bp_size = fs::metadata(&bp_path)
|
||||
.ok()
|
||||
.map(|m| m.len() as f64 / 1_048_576.0);
|
||||
let mpk_size = fs::metadata(&mpk_path)
|
||||
.ok()
|
||||
.map(|m| m.len() as f64 / 1_048_576.0);
|
||||
let st_size = fs::metadata(&st_path).unwrap().len() as f64 / 1_048_576.0;
|
||||
let pt_size = fs::metadata(&pt_path).unwrap().len() as f64 / 1_048_576.0;
|
||||
|
||||
println!("✅ Found model files:");
|
||||
if let Some(size) = bp_size {
|
||||
println!(" Burnpack: {} ({:.1} MB)", bp_path.display(), size);
|
||||
}
|
||||
if let Some(size) = mpk_size {
|
||||
println!(" NamedMpk: {} ({:.1} MB)", mpk_path.display(), size);
|
||||
}
|
||||
println!(" SafeTensors: {} ({:.1} MB)", st_path.display(), st_size);
|
||||
println!(" PyTorch: {} ({:.1} MB)", pt_path.display(), pt_size);
|
||||
println!();
|
||||
println!("🚀 Running unified loading benchmarks...");
|
||||
println!();
|
||||
println!("Comparing 6 loading methods:");
|
||||
println!(" 1. BurnpackStore (new native format - lazy loading)");
|
||||
println!(" 2. NamedMpkFileRecorder (old native format - loads all to memory)");
|
||||
println!(" 3. SafetensorsStore (new)");
|
||||
println!(" 4. SafetensorsFileRecorder (old)");
|
||||
println!(" 5. PytorchStore (new)");
|
||||
println!(" 6. PyTorchFileRecorder (old)");
|
||||
println!();
|
||||
println!("Available backends:");
|
||||
println!(" - NdArray (CPU)");
|
||||
#[cfg(feature = "wgpu")]
|
||||
println!(" - WGPU (GPU)");
|
||||
#[cfg(feature = "cuda")]
|
||||
println!(" - CUDA (NVIDIA GPU)");
|
||||
#[cfg(feature = "tch")]
|
||||
println!(" - LibTorch");
|
||||
#[cfg(feature = "metal")]
|
||||
println!(" - Metal (Apple GPU)");
|
||||
println!();
|
||||
|
||||
divan::main();
|
||||
}
|
||||
Err(msg) => {
|
||||
eprintln!("{}", msg);
|
||||
std::process::exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Macro to generate benchmarks for each backend
|
||||
macro_rules! bench_backend {
|
||||
($backend:ty, $mod_name:ident, $backend_name:literal) => {
|
||||
#[divan::bench_group(name = $backend_name, sample_count = 10)]
|
||||
mod $mod_name {
|
||||
use super::*;
|
||||
|
||||
type TestBackend = $backend;
|
||||
type TestDevice = <TestBackend as Backend>::Device;
|
||||
|
||||
#[divan::bench]
|
||||
fn burnpack_store(bencher: Bencher) {
|
||||
let (bp_path, _, _, _) = get_model_paths();
|
||||
let file_size = fs::metadata(&bp_path).unwrap().len();
|
||||
|
||||
bencher
|
||||
.counter(divan::counter::BytesCount::new(file_size))
|
||||
.bench(|| {
|
||||
let device: TestDevice = Default::default();
|
||||
let mut model = LargeModel::<TestBackend>::new(&device);
|
||||
let mut store = BurnpackStore::from_file(bp_path.clone());
|
||||
model.load_from(&mut store).expect("Failed to load");
|
||||
});
|
||||
}
|
||||
|
||||
#[divan::bench]
|
||||
fn namedmpk_recorder(bencher: Bencher) {
|
||||
let (_, mpk_path, _, _) = get_model_paths();
|
||||
let file_size = fs::metadata(&mpk_path).unwrap().len();
|
||||
|
||||
bencher
|
||||
.counter(divan::counter::BytesCount::new(file_size))
|
||||
.bench(|| {
|
||||
let device: TestDevice = Default::default();
|
||||
let recorder = NamedMpkFileRecorder::<FullPrecisionSettings>::default();
|
||||
let record = recorder
|
||||
.load(mpk_path.clone().into(), &device)
|
||||
.expect("Failed to load");
|
||||
let _model = LargeModel::<TestBackend>::new(&device).load_record(record);
|
||||
});
|
||||
}
|
||||
|
||||
#[divan::bench]
|
||||
fn safetensors_store(bencher: Bencher) {
|
||||
let (_, _, st_path, _) = get_model_paths();
|
||||
let file_size = fs::metadata(&st_path).unwrap().len();
|
||||
|
||||
bencher
|
||||
.counter(divan::counter::BytesCount::new(file_size))
|
||||
.bench(|| {
|
||||
let device: TestDevice = Default::default();
|
||||
let mut model = LargeModel::<TestBackend>::new(&device);
|
||||
let mut store = SafetensorsStore::from_file(st_path.clone())
|
||||
.with_from_adapter(PyTorchToBurnAdapter);
|
||||
model.load_from(&mut store).expect("Failed to load");
|
||||
});
|
||||
}
|
||||
|
||||
// #[divan::bench]
|
||||
// fn safetensors_recorder(bencher: Bencher) {
|
||||
// let (_, _, st_path, _) = get_model_paths();
|
||||
// let file_size = fs::metadata(&st_path).unwrap().len();
|
||||
|
||||
// bencher
|
||||
// .counter(divan::counter::BytesCount::new(file_size))
|
||||
// .bench(|| {
|
||||
// let device: TestDevice = Default::default();
|
||||
// let recorder = SafetensorsFileRecorder::<FullPrecisionSettings>::default();
|
||||
// let record = recorder
|
||||
// .load(st_path.clone().into(), &device)
|
||||
// .expect("Failed to load");
|
||||
// let _model = LargeModel::<TestBackend>::new(&device).load_record(record);
|
||||
// });
|
||||
// }
|
||||
|
||||
#[divan::bench]
|
||||
fn pytorch_store(bencher: Bencher) {
|
||||
let (_, _, _, pt_path) = get_model_paths();
|
||||
let file_size = fs::metadata(&pt_path).unwrap().len();
|
||||
|
||||
bencher
|
||||
.counter(divan::counter::BytesCount::new(file_size))
|
||||
.bench(|| {
|
||||
let device: TestDevice = Default::default();
|
||||
let mut model = LargeModel::<TestBackend>::new(&device);
|
||||
let mut store = PytorchStore::from_file(pt_path.clone())
|
||||
.with_top_level_key("model_state_dict")
|
||||
.allow_partial(true);
|
||||
model.load_from(&mut store).expect("Failed to load");
|
||||
});
|
||||
}
|
||||
|
||||
// #[divan::bench]
|
||||
// fn pytorch_recorder(bencher: Bencher) {
|
||||
// let (_, _, _, pt_path) = get_model_paths();
|
||||
// let file_size = fs::metadata(&pt_path).unwrap().len();
|
||||
|
||||
// bencher
|
||||
// .counter(divan::counter::BytesCount::new(file_size))
|
||||
// .bench(|| {
|
||||
// let device: TestDevice = Default::default();
|
||||
// let recorder = PyTorchFileRecorder::<FullPrecisionSettings>::default();
|
||||
// let load_args =
|
||||
// LoadArgs::new(pt_path.clone()).with_top_level_key("model_state_dict");
|
||||
// let record = recorder.load(load_args, &device).expect("Failed to load");
|
||||
// let _model = LargeModel::<TestBackend>::new(&device).load_record(record);
|
||||
// });
|
||||
// }
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// Generate benchmarks for each backend
|
||||
bench_backend!(NdArrayBackend, ndarray_backend, "NdArray Backend (CPU)");
|
||||
|
||||
#[cfg(feature = "wgpu")]
|
||||
bench_backend!(WgpuBackend, wgpu_backend, "WGPU Backend (GPU)");
|
||||
|
||||
#[cfg(feature = "cuda")]
|
||||
bench_backend!(CudaBackend, cuda_backend, "CUDA Backend (NVIDIA GPU)");
|
||||
|
||||
#[cfg(feature = "tch")]
|
||||
bench_backend!(TchBackend, tch_backend, "LibTorch Backend");
|
||||
|
||||
#[cfg(feature = "metal")]
|
||||
bench_backend!(MetalBackend, metal_backend, "Metal Backend (Apple GPU)");
|
||||
@@ -0,0 +1,183 @@
|
||||
#![recursion_limit = "256"]
|
||||
|
||||
//! Unified benchmark comparing all saving methods:
|
||||
//! - BurnpackStore (new native format)
|
||||
//! - NamedMpkFileRecorder (old native format)
|
||||
//! - SafetensorsStore (new)
|
||||
//!
|
||||
//! The output directory is created automatically at startup; to create it by hand instead:
|
||||
//! ```bash
|
||||
//! mkdir -p /tmp/simple_bench_models_saving
|
||||
//! ```
|
||||
//!
|
||||
//! Then run the benchmark:
|
||||
//! ```bash
|
||||
//! cargo bench --bench unified_saving
|
||||
//! ```
|
||||
use burn_core as burn;
|
||||
|
||||
use burn_core::module::Module;
|
||||
use burn_core::prelude::*;
|
||||
use burn_core::record::{FullPrecisionSettings, NamedMpkFileRecorder};
|
||||
use burn_nn as nn;
|
||||
use burn_store::{BurnpackStore, ModuleSnapshot, SafetensorsStore};
|
||||
use divan::{AllocProfiler, Bencher};
|
||||
use std::fs;
|
||||
use std::path::PathBuf;
|
||||
|
||||
#[global_allocator]
|
||||
static ALLOC: AllocProfiler = AllocProfiler::system();
|
||||
|
||||
// Backend type aliases
|
||||
type NdArrayBackend = burn_ndarray::NdArray<f32>;
|
||||
|
||||
#[cfg(feature = "wgpu")]
|
||||
type WgpuBackend = burn_wgpu::Wgpu;
|
||||
|
||||
#[cfg(feature = "cuda")]
|
||||
type CudaBackend = burn_cuda::Cuda<f32, i32>;
|
||||
|
||||
#[cfg(feature = "tch")]
|
||||
type TchBackend = burn_tch::LibTorch<f32>;
|
||||
|
||||
#[cfg(feature = "metal")]
|
||||
type MetalBackend = burn_wgpu::Metal;
|
||||
|
||||
// Use the same LargeModel as other benchmarks for fair comparison
|
||||
#[derive(Module, Debug)]
|
||||
struct LargeModel<B: Backend> {
|
||||
layers: Vec<nn::Linear<B>>,
|
||||
}
|
||||
|
||||
impl<B: Backend> LargeModel<B> {
|
||||
fn new(device: &B::Device) -> Self {
|
||||
let mut layers = Vec::new();
|
||||
// Create a model with 20 layers - same as loading benchmarks
|
||||
for i in 0..20 {
|
||||
let in_size = if i == 0 { 1024 } else { 2048 };
|
||||
layers.push(nn::LinearConfig::new(in_size, 2048).init(device));
|
||||
}
|
||||
Self { layers }
|
||||
}
|
||||
}
|
||||
|
||||
/// Directory the saving benchmarks write into: `<system temp dir>/simple_bench_models_saving`.
fn get_output_dir() -> PathBuf {
    let mut dir = std::env::temp_dir();
    dir.push("simple_bench_models_saving");
    dir
}
|
||||
|
||||
/// Ensure output directory exists
|
||||
fn ensure_output_dir() -> Result<(), String> {
|
||||
let dir = get_output_dir();
|
||||
if !dir.exists() {
|
||||
fs::create_dir_all(&dir)
|
||||
.map_err(|e| format!("Failed to create output directory: {}", e))?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn main() {
|
||||
match ensure_output_dir() {
|
||||
Ok(()) => {
|
||||
println!("✅ Output directory ready: {}", get_output_dir().display());
|
||||
println!();
|
||||
println!("🚀 Running unified saving benchmarks...");
|
||||
println!();
|
||||
println!("Comparing 3 saving methods:");
|
||||
println!(" 1. BurnpackStore (new native format)");
|
||||
println!(" 2. NamedMpkFileRecorder (old native format)");
|
||||
println!(" 3. SafetensorsStore (new)");
|
||||
println!();
|
||||
println!("Available backends:");
|
||||
println!(" - NdArray (CPU)");
|
||||
#[cfg(feature = "wgpu")]
|
||||
println!(" - WGPU (GPU)");
|
||||
#[cfg(feature = "cuda")]
|
||||
println!(" - CUDA (NVIDIA GPU)");
|
||||
#[cfg(feature = "tch")]
|
||||
println!(" - LibTorch");
|
||||
#[cfg(feature = "metal")]
|
||||
println!(" - Metal (Apple GPU)");
|
||||
println!();
|
||||
|
||||
divan::main();
|
||||
}
|
||||
Err(msg) => {
|
||||
eprintln!("❌ {}", msg);
|
||||
std::process::exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Generates one divan benchmark group for a backend.
//
// Arguments: the backend type, the identifier of the generated module, and the
// group label handed to divan. Each group holds one benchmark per saving method.
//
// NOTE(review): model construction and file removal both sit inside the timed
// closure, so they are part of every measurement.
macro_rules! bench_backend {
    ($backend:ty, $mod_name:ident, $backend_name:literal) => {
        #[divan::bench_group(name = $backend_name, sample_count = 10)]
        mod $mod_name {
            use super::*;

            type TestBackend = $backend;
            type TestDevice = <TestBackend as Backend>::Device;

            /// Saves a freshly built `LargeModel` through `BurnpackStore`.
            #[divan::bench]
            fn burnpack_store(bencher: Bencher) {
                bencher.bench(|| {
                    let target = get_output_dir().join("test_burnpack.bpk");
                    let device = TestDevice::default();
                    let model = LargeModel::<TestBackend>::new(&device);

                    let mut store = BurnpackStore::from_file(target.clone()).overwrite(true);
                    model
                        .save_into(&mut store)
                        .expect("Failed to save with BurnpackStore");

                    // Best-effort cleanup; a removal failure is ignored.
                    let _ = fs::remove_file(target);
                });
            }

            /// Saves a freshly built `LargeModel` with `NamedMpkFileRecorder`.
            #[divan::bench]
            fn namedmpk_recorder(bencher: Bencher) {
                bencher.bench(|| {
                    let target = get_output_dir().join("test_namedmpk.mpk");
                    let device = TestDevice::default();
                    let model = LargeModel::<TestBackend>::new(&device);

                    let recorder = NamedMpkFileRecorder::<FullPrecisionSettings>::default();
                    model
                        .save_file(target.clone(), &recorder)
                        .expect("Failed to save with NamedMpkFileRecorder");

                    // Best-effort cleanup; a removal failure is ignored.
                    let _ = fs::remove_file(target);
                });
            }

            /// Saves a freshly built `LargeModel` through `SafetensorsStore`.
            #[divan::bench]
            fn safetensors_store(bencher: Bencher) {
                bencher.bench(|| {
                    let target = get_output_dir().join("test_safetensors_store.safetensors");
                    let device = TestDevice::default();
                    let model = LargeModel::<TestBackend>::new(&device);

                    let mut store = SafetensorsStore::from_file(target.clone());
                    model
                        .save_into(&mut store)
                        .expect("Failed to save with SafetensorsStore");

                    // Best-effort cleanup; a removal failure is ignored.
                    let _ = fs::remove_file(target);
                });
            }
        }
    };
}
|
||||
|
||||
// Generate benchmarks for each backend
|
||||
bench_backend!(NdArrayBackend, ndarray_backend, "NdArray Backend (CPU)");
|
||||
|
||||
#[cfg(feature = "wgpu")]
|
||||
bench_backend!(WgpuBackend, wgpu_backend, "WGPU Backend (GPU)");
|
||||
|
||||
#[cfg(feature = "cuda")]
|
||||
bench_backend!(CudaBackend, cuda_backend, "CUDA Backend (NVIDIA GPU)");
|
||||
|
||||
#[cfg(feature = "tch")]
|
||||
bench_backend!(TchBackend, tch_backend, "LibTorch Backend");
|
||||
|
||||
#[cfg(feature = "metal")]
|
||||
bench_backend!(MetalBackend, metal_backend, "Metal Backend (Apple GPU)");
|
||||
@@ -0,0 +1,596 @@
|
||||
#![recursion_limit = "256"]
|
||||
|
||||
//! Benchmark comparing zero-copy vs copy loading modes for BurnpackStore.
|
||||
//!
|
||||
//! This benchmark measures the performance difference between:
|
||||
//! - `zero_copy(false)` - Default mode, copies tensor data into new allocations
|
||||
//! - `zero_copy(true)` - Zero-copy mode, slices tensor data without copying
|
||||
//!
|
||||
//! ## Understanding the Results
|
||||
//!
|
||||
//! **IMPORTANT**: For NdArray backend, you'll see similar allocation numbers because:
|
||||
//! - NdArray uses `ndarray::ArrayD` which MUST own data as `Vec<T>`
|
||||
//! - Even with zero-copy, the backend eventually copies data into its own format
|
||||
//!
|
||||
//! The zero-copy benefit is:
|
||||
//! - **Without zero-copy**: File → Copy to heap (Bytes) → Copy to Vec (backend)
|
||||
//! - **With zero-copy**: File → Zero-copy slice → Copy to Vec (backend)
|
||||
//!
|
||||
//! So zero-copy saves ONE memory copy at the store level. The benchmarks in the
//! `store_only` module show the raw store performance without backend allocation overhead.
|
||||
//!
|
||||
//! GPU backends that can consume `Bytes` directly will show larger benefits.
|
||||
//!
|
||||
//! ## Running the benchmark
|
||||
//!
|
||||
//! Before running this benchmark, generate the model files:
|
||||
//! ```bash
|
||||
//! cd crates/burn-store
|
||||
//! uv run benches/generate_unified_models.py
|
||||
//! ```
|
||||
//!
|
||||
//! Then run the benchmark:
|
||||
//! ```bash
|
||||
//! cargo bench --bench zero_copy_loading
|
||||
//! ```
|
||||
|
||||
use burn_core as burn;
|
||||
|
||||
use burn_core::module::Module;
|
||||
use burn_core::prelude::*;
|
||||
use burn_nn as nn;
|
||||
use burn_store::{
|
||||
BurnpackStore, ModuleSnapshot, ModuleStore, PyTorchToBurnAdapter, SafetensorsStore,
|
||||
};
|
||||
use burn_tensor::{AllocationProperty, Bytes};
|
||||
use divan::{AllocProfiler, Bencher};
|
||||
use std::fs;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::OnceLock;
|
||||
|
||||
#[global_allocator]
|
||||
static ALLOC: AllocProfiler = AllocProfiler::system();
|
||||
|
||||
// Static storage for embedded model bytes (simulating include_bytes!)
|
||||
static STATIC_MODEL_BYTES: OnceLock<&'static [u8]> = OnceLock::new();
|
||||
|
||||
// Backend type aliases
|
||||
type NdArrayBackend = burn_ndarray::NdArray<f32>;
|
||||
|
||||
#[cfg(feature = "wgpu")]
|
||||
type WgpuBackend = burn_wgpu::Wgpu;
|
||||
|
||||
#[cfg(feature = "cuda")]
|
||||
type CudaBackend = burn_cuda::Cuda<f32, i32>;
|
||||
|
||||
#[cfg(feature = "tch")]
|
||||
type TchBackend = burn_tch::LibTorch<f32>;
|
||||
|
||||
#[cfg(feature = "metal")]
|
||||
type MetalBackend = burn_wgpu::Metal;
|
||||
|
||||
// Use the same LargeModel as other benchmarks for fair comparison
|
||||
#[derive(Module, Debug)]
|
||||
struct LargeModel<B: Backend> {
|
||||
layers: Vec<nn::Linear<B>>,
|
||||
}
|
||||
|
||||
impl<B: Backend> LargeModel<B> {
|
||||
fn new(device: &B::Device) -> Self {
|
||||
let mut layers = Vec::new();
|
||||
// Create a model with 20 layers - same as unified_loading benchmark
|
||||
for i in 0..20 {
|
||||
let in_size = if i == 0 { 1024 } else { 2048 };
|
||||
layers.push(nn::LinearConfig::new(in_size, 2048).init(device));
|
||||
}
|
||||
Self { layers }
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the directory holding the benchmark model files:
/// `simple_bench_models` under the system temporary directory.
fn get_model_dir() -> PathBuf {
    let mut dir = std::env::temp_dir();
    dir.push("simple_bench_models");
    dir
}
|
||||
|
||||
/// Get path to Burnpack model file
|
||||
fn get_burnpack_path() -> PathBuf {
|
||||
get_model_dir().join("large_model.bpk")
|
||||
}
|
||||
|
||||
/// Generate Burnpack file from existing SafeTensors file if needed
|
||||
fn ensure_burnpack_file() {
|
||||
let bp_path = get_burnpack_path();
|
||||
let st_path = get_model_dir().join("large_model.safetensors");
|
||||
|
||||
if bp_path.exists() {
|
||||
return;
|
||||
}
|
||||
|
||||
if !st_path.exists() {
|
||||
panic!(
|
||||
"\n❌ SafeTensors model file not found!\n\
|
||||
\n\
|
||||
Please generate the model files first by running:\n\
|
||||
\n\
|
||||
cd crates/burn-store\n\
|
||||
uv run benches/generate_unified_models.py\n\
|
||||
\n\
|
||||
Expected file: {}\n",
|
||||
st_path.display()
|
||||
);
|
||||
}
|
||||
|
||||
println!("⏳ Generating Burnpack file from SafeTensors...");
|
||||
|
||||
type TestBackend = NdArrayBackend;
|
||||
let device = Default::default();
|
||||
|
||||
// Load from SafeTensors
|
||||
let mut model = LargeModel::<TestBackend>::new(&device);
|
||||
let mut store = SafetensorsStore::from_file(&st_path).with_from_adapter(PyTorchToBurnAdapter);
|
||||
model
|
||||
.load_from(&mut store)
|
||||
.expect("Failed to load from SafeTensors");
|
||||
|
||||
// Save as Burnpack
|
||||
let mut burnpack_store = BurnpackStore::from_file(&bp_path);
|
||||
model
|
||||
.save_into(&mut burnpack_store)
|
||||
.expect("Failed to save as Burnpack");
|
||||
|
||||
println!("✅ Created Burnpack file: {}", bp_path.display());
|
||||
}
|
||||
|
||||
/// Initialize static model bytes (simulating include_bytes! at runtime for benchmarks)
|
||||
fn get_static_model_bytes() -> &'static [u8] {
|
||||
STATIC_MODEL_BYTES.get_or_init(|| {
|
||||
let bp_path = get_burnpack_path();
|
||||
let bytes = fs::read(&bp_path).expect("Failed to read Burnpack file");
|
||||
// Leak the bytes to get a 'static lifetime (acceptable for benchmarks)
|
||||
Box::leak(bytes.into_boxed_slice())
|
||||
})
|
||||
}
|
||||
|
||||
fn main() {
|
||||
// Ensure Burnpack file exists
|
||||
ensure_burnpack_file();
|
||||
|
||||
let bp_path = get_burnpack_path();
|
||||
let file_size = fs::metadata(&bp_path).unwrap().len() as f64 / 1_048_576.0;
|
||||
|
||||
println!("✅ Found Burnpack model file:");
|
||||
println!(" Path: {}", bp_path.display());
|
||||
println!(" Size: {:.1} MB", file_size);
|
||||
println!();
|
||||
println!("🚀 Running zero-copy loading benchmarks...");
|
||||
println!();
|
||||
println!("Comparing loading modes:");
|
||||
println!(" 1. file_copy - from_file().zero_copy(false) - copies tensor data");
|
||||
println!(" 2. file_zero_copy - from_file().zero_copy(true) - zero-copy via mmap");
|
||||
println!(" 3. static_copy - from_bytes() with Vec copy - copies from static");
|
||||
println!(" 4. static_zero_copy - from_static() - zero-copy from static");
|
||||
println!();
|
||||
println!("Available backends:");
|
||||
println!(" - NdArray (CPU)");
|
||||
#[cfg(feature = "wgpu")]
|
||||
println!(" - WGPU (GPU)");
|
||||
#[cfg(feature = "cuda")]
|
||||
println!(" - CUDA (NVIDIA GPU)");
|
||||
#[cfg(feature = "tch")]
|
||||
println!(" - LibTorch");
|
||||
#[cfg(feature = "metal")]
|
||||
println!(" - Metal (Apple GPU)");
|
||||
println!();
|
||||
|
||||
// Pre-initialize static bytes before benchmarks
|
||||
let _ = get_static_model_bytes();
|
||||
|
||||
divan::main();
|
||||
}
|
||||
|
||||
// Generates one divan benchmark group for a backend.
//
// Arguments: the backend type, the identifier of the generated module, and the
// group label handed to divan. Each benchmark builds a fresh `LargeModel` and
// loads the Burnpack weights into it through a differently configured
// `BurnpackStore`. The model size in bytes is attached as a divan counter.
//
// NOTE(review): model construction happens inside the timed closure, so it is
// part of every measurement.
macro_rules! bench_backend {
    ($backend:ty, $mod_name:ident, $backend_name:literal) => {
        #[divan::bench_group(name = $backend_name, sample_count = 10)]
        mod $mod_name {
            use super::*;

            type TestBackend = $backend;
            type TestDevice = <TestBackend as Backend>::Device;

            /// Loads from the Burnpack file with `zero_copy(false)`.
            #[divan::bench]
            fn file_copy(bencher: Bencher) {
                let model_file = get_burnpack_path();
                let byte_len = fs::metadata(&model_file).unwrap().len();
                let throughput = divan::counter::BytesCount::new(byte_len);

                bencher.counter(throughput).bench(|| {
                    let device = TestDevice::default();
                    let mut model = LargeModel::<TestBackend>::new(&device);
                    let mut store = BurnpackStore::from_file(&model_file).zero_copy(false);
                    model.load_from(&mut store).expect("Failed to load");
                });
            }

            /// Loads from the Burnpack file with `zero_copy(true)`.
            #[divan::bench]
            fn file_zero_copy(bencher: Bencher) {
                let model_file = get_burnpack_path();
                let byte_len = fs::metadata(&model_file).unwrap().len();
                let throughput = divan::counter::BytesCount::new(byte_len);

                bencher.counter(throughput).bench(|| {
                    let device = TestDevice::default();
                    let mut model = LargeModel::<TestBackend>::new(&device);
                    let mut store = BurnpackStore::from_file(&model_file).zero_copy(true);
                    model.load_from(&mut store).expect("Failed to load");
                });
            }

            /// Copies the static bytes into a `Vec` first, then loads with
            /// `zero_copy(false)` (simulating the old behavior).
            #[divan::bench]
            fn static_copy(bencher: Bencher) {
                let model_bytes = get_static_model_bytes();
                let throughput = divan::counter::BytesCount::new(model_bytes.len() as u64);

                bencher.counter(throughput).bench(|| {
                    let device = TestDevice::default();
                    let mut model = LargeModel::<TestBackend>::new(&device);

                    // The `to_vec()` copy is intentionally part of the measurement.
                    let owned = Bytes::from_bytes_vec(model_bytes.to_vec());
                    let mut store = BurnpackStore::from_bytes(Some(owned)).zero_copy(false);
                    model.load_from(&mut store).expect("Failed to load");
                });
            }

            /// Loads straight from the `'static` slice through `from_static`.
            #[divan::bench]
            fn static_zero_copy(bencher: Bencher) {
                let model_bytes = get_static_model_bytes();
                let throughput = divan::counter::BytesCount::new(model_bytes.len() as u64);

                bencher.counter(throughput).bench(|| {
                    let device = TestDevice::default();
                    let mut model = LargeModel::<TestBackend>::new(&device);

                    // `from_static` receives the `'static` slice itself; no `Vec` is built here.
                    let mut store = BurnpackStore::from_static(model_bytes);
                    model.load_from(&mut store).expect("Failed to load");
                });
            }

            /// Loads from a pre-built shared `bytes::Bytes` with `zero_copy(true)`.
            #[divan::bench]
            fn memory_shared_zero_copy(bencher: Bencher) {
                let model_bytes = get_static_model_bytes();
                let throughput = divan::counter::BytesCount::new(model_bytes.len() as u64);

                // Built once, outside the timed closure.
                let shared = bytes::Bytes::from_static(model_bytes);

                bencher.counter(throughput).bench(|| {
                    let device = TestDevice::default();
                    let mut model = LargeModel::<TestBackend>::new(&device);

                    // Each iteration wraps a clone of the shared handle.
                    let wrapped = Bytes::from_shared(shared.clone(), AllocationProperty::Other);
                    let mut store = BurnpackStore::from_bytes(Some(wrapped)).zero_copy(true);
                    model.load_from(&mut store).expect("Failed to load");
                });
            }
        }
    };
}
|
||||
|
||||
// =============================================================================
|
||||
// Zero-copy verification (proves operations use static region data)
|
||||
// =============================================================================
|
||||
|
||||
/// Verify that zero-copy loading actually uses data from the static region.
|
||||
/// This runs once at startup to prove correctness before benchmarking.
|
||||
#[divan::bench_group(name = "Zero-Copy Verification", sample_count = 1)]
|
||||
mod verification {
|
||||
use super::*;
|
||||
use burn_ndarray::NdArray;
|
||||
|
||||
type B = NdArray<f32>;
|
||||
|
||||
/// Verify zero-copy: tensor storage is borrowed (not owned)
|
||||
#[divan::bench]
|
||||
fn verify_storage_is_borrowed() {
|
||||
let static_bytes = get_static_model_bytes();
|
||||
|
||||
// Load model with zero-copy from static bytes
|
||||
let device = Default::default();
|
||||
let mut model = LargeModel::<B>::new(&device);
|
||||
let mut store = BurnpackStore::from_static(static_bytes);
|
||||
model.load_from(&mut store).expect("Failed to load");
|
||||
|
||||
// Get the first layer's weight tensor and verify it uses borrowed storage
|
||||
let weight = model.layers[0].weight.val();
|
||||
// .into_primitive() returns TensorPrimitive<B>, .tensor() extracts B::FloatTensorPrimitive
|
||||
let ndarray_tensor = weight.into_primitive().tensor();
|
||||
|
||||
// Verify the storage is borrowed (zero-copy from static region)
|
||||
assert!(
|
||||
ndarray_tensor.is_borrowed(),
|
||||
"ZERO-COPY FAILURE: Tensor storage is NOT borrowed. \
|
||||
Data was copied instead of being zero-copy!"
|
||||
);
|
||||
|
||||
println!("✅ Verified: Tensor storage is borrowed (zero-copy from static region)");
|
||||
}
|
||||
|
||||
/// Verify ALL layers use borrowed (zero-copy) storage.
|
||||
/// This is the key proof that loaded weights point to static memory.
|
||||
#[divan::bench]
|
||||
fn verify_all_layers_borrowed() {
|
||||
let static_bytes = get_static_model_bytes();
|
||||
|
||||
// Load model with zero-copy
|
||||
let device = Default::default();
|
||||
let mut model = LargeModel::<B>::new(&device);
|
||||
let mut store = BurnpackStore::from_static(static_bytes);
|
||||
model.load_from(&mut store).expect("Failed to load");
|
||||
|
||||
// Check ALL layers have borrowed storage
|
||||
let mut total_elements = 0usize;
|
||||
for (i, layer) in model.layers.iter().enumerate() {
|
||||
let weight = layer.weight.val();
|
||||
total_elements += weight.shape().num_elements();
|
||||
|
||||
assert!(
|
||||
weight.into_primitive().tensor().is_borrowed(),
|
||||
"Layer {} weight should be borrowed (zero-copy)",
|
||||
i
|
||||
);
|
||||
}
|
||||
|
||||
let total_mb = (total_elements * 4) as f64 / 1_048_576.0;
|
||||
println!(
|
||||
"✅ Verified: All {} layers use borrowed storage",
|
||||
model.layers.len()
|
||||
);
|
||||
println!(
|
||||
" - Model size: {:.2} MB - all pointing to static region",
|
||||
total_mb
|
||||
);
|
||||
}
|
||||
|
||||
/// Verify data is readable and correct using sum().into_scalar().
|
||||
/// Note: sum() triggers COW copy, so this shows ops work correctly on zero-copy data.
|
||||
#[divan::bench]
|
||||
fn verify_ops_produce_correct_results() {
|
||||
let static_bytes = get_static_model_bytes();
|
||||
|
||||
let device = Default::default();
|
||||
let mut model = LargeModel::<B>::new(&device);
|
||||
let mut store = BurnpackStore::from_static(static_bytes);
|
||||
model.load_from(&mut store).expect("Failed to load");
|
||||
|
||||
// Compute sum of first layer weight - proves data is valid
|
||||
let weight = model.layers[0].weight.val();
|
||||
let sum: f32 = weight.sum().into_scalar();
|
||||
|
||||
assert!(sum.is_finite(), "Sum should be finite");
|
||||
println!("✅ Verified: Operations on zero-copy data produce valid results");
|
||||
println!(" - First layer sum: {:.4}", sum);
|
||||
}
|
||||
|
||||
/// Verify operations produce correct results on zero-copy data
|
||||
#[divan::bench]
|
||||
fn verify_operations_on_static_data() {
|
||||
let static_bytes = get_static_model_bytes();
|
||||
|
||||
// Load model with zero-copy
|
||||
let device = Default::default();
|
||||
let mut model = LargeModel::<B>::new(&device);
|
||||
let mut store = BurnpackStore::from_static(static_bytes);
|
||||
model.load_from(&mut store).expect("Failed to load");
|
||||
|
||||
// Perform operations on the loaded weights
|
||||
let weight = model.layers[0].weight.val();
|
||||
let shape = weight.shape();
|
||||
|
||||
// Test 1: Sum should be finite (not NaN or Inf)
|
||||
let sum: f32 = weight.clone().sum().to_data().to_vec().unwrap()[0];
|
||||
assert!(
|
||||
sum.is_finite(),
|
||||
"Operation failed: sum is not finite ({})",
|
||||
sum
|
||||
);
|
||||
|
||||
// Test 2: Matrix multiply with itself transposed (W @ W.T)
|
||||
let transposed = weight.clone().transpose();
|
||||
let matmul_result = weight.clone().matmul(transposed);
|
||||
let matmul_sum: f32 = matmul_result.sum().to_data().to_vec().unwrap()[0];
|
||||
assert!(
|
||||
matmul_sum.is_finite(),
|
||||
"Matmul failed: result sum is not finite ({})",
|
||||
matmul_sum
|
||||
);
|
||||
|
||||
// Test 3: Element-wise operations
|
||||
let doubled = weight.clone() * 2.0;
|
||||
let doubled_sum: f32 = doubled.sum().to_data().to_vec().unwrap()[0];
|
||||
assert!(
|
||||
(doubled_sum - sum * 2.0).abs() < 1e-3,
|
||||
"Element-wise op failed: doubled_sum ({}) != sum*2 ({})",
|
||||
doubled_sum,
|
||||
sum * 2.0
|
||||
);
|
||||
|
||||
println!("✅ Verified: Operations on zero-copy data produce correct results");
|
||||
println!(" - Weight shape: {:?}", shape.as_slice());
|
||||
println!(" - Sum: {:.4}", sum);
|
||||
println!(" - Matmul result sum: {:.4}", matmul_sum);
|
||||
}
|
||||
|
||||
/// Compare zero-copy vs copy: verify both produce identical results
|
||||
#[divan::bench]
|
||||
fn verify_copy_vs_zero_copy_equality() {
|
||||
let static_bytes = get_static_model_bytes();
|
||||
let device: <B as Backend>::Device = Default::default();
|
||||
|
||||
// Load with zero-copy
|
||||
let mut model_zc = LargeModel::<B>::new(&device);
|
||||
let mut store_zc = BurnpackStore::from_static(static_bytes);
|
||||
model_zc
|
||||
.load_from(&mut store_zc)
|
||||
.expect("Failed to load zero-copy");
|
||||
|
||||
// Load with copy (simulate old behavior)
|
||||
let mut model_copy = LargeModel::<B>::new(&device);
|
||||
let bytes = Bytes::from_bytes_vec(static_bytes.to_vec());
|
||||
let mut store_copy = BurnpackStore::from_bytes(Some(bytes)).zero_copy(false);
|
||||
model_copy
|
||||
.load_from(&mut store_copy)
|
||||
.expect("Failed to load copy");
|
||||
|
||||
// Compare weights from both models
|
||||
for (i, (layer_zc, layer_copy)) in model_zc
|
||||
.layers
|
||||
.iter()
|
||||
.zip(model_copy.layers.iter())
|
||||
.enumerate()
|
||||
{
|
||||
let weight_zc = layer_zc.weight.val();
|
||||
let weight_copy = layer_copy.weight.val();
|
||||
|
||||
// Check shapes match
|
||||
assert_eq!(
|
||||
weight_zc.shape(),
|
||||
weight_copy.shape(),
|
||||
"Layer {} weight shapes don't match",
|
||||
i
|
||||
);
|
||||
|
||||
// Check values match (using sum as a proxy)
|
||||
let sum_zc: f32 = weight_zc.clone().sum().to_data().to_vec().unwrap()[0];
|
||||
let sum_copy: f32 = weight_copy.clone().sum().to_data().to_vec().unwrap()[0];
|
||||
assert!(
|
||||
(sum_zc - sum_copy).abs() < 1e-6,
|
||||
"Layer {} weight sums don't match: zero-copy={}, copy={}",
|
||||
i,
|
||||
sum_zc,
|
||||
sum_copy
|
||||
);
|
||||
}
|
||||
|
||||
println!(
|
||||
"✅ Verified: Zero-copy and copy loading produce identical results for all {} layers",
|
||||
model_zc.layers.len()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// Store-only benchmarks (no backend allocation overhead)
|
||||
// These show the TRUE zero-copy benefit at the store level
|
||||
// =============================================================================
|
||||
|
||||
#[divan::bench_group(name = "Store Only (no backend)", sample_count = 10)]
|
||||
mod store_only {
|
||||
use super::*;
|
||||
|
||||
/// File-based store with copy mode - measures store overhead only
|
||||
#[divan::bench]
|
||||
fn file_copy(bencher: Bencher) {
|
||||
let bp_path = get_burnpack_path();
|
||||
let file_size = fs::metadata(&bp_path).unwrap().len();
|
||||
|
||||
bencher
|
||||
.counter(divan::counter::BytesCount::new(file_size))
|
||||
.bench(|| {
|
||||
let mut store = BurnpackStore::from_file(&bp_path).zero_copy(false);
|
||||
// Just iterate through all tensor snapshots, calling to_data() on each
|
||||
// This forces the store to read and materialize all tensor data
|
||||
let snapshots = store.get_all_snapshots().expect("Failed to get snapshots");
|
||||
for snapshot in snapshots.values() {
|
||||
let _data = snapshot.to_data().expect("Failed to get tensor data");
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/// File-based store with zero-copy mode - measures store overhead only
|
||||
#[divan::bench]
|
||||
fn file_zero_copy(bencher: Bencher) {
|
||||
let bp_path = get_burnpack_path();
|
||||
let file_size = fs::metadata(&bp_path).unwrap().len();
|
||||
|
||||
bencher
|
||||
.counter(divan::counter::BytesCount::new(file_size))
|
||||
.bench(|| {
|
||||
let mut store = BurnpackStore::from_file(&bp_path).zero_copy(true);
|
||||
let snapshots = store.get_all_snapshots().expect("Failed to get snapshots");
|
||||
for snapshot in snapshots.values() {
|
||||
let _data = snapshot.to_data().expect("Failed to get tensor data");
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/// Static bytes with copy mode - measures store overhead only
|
||||
#[divan::bench]
|
||||
fn static_copy(bencher: Bencher) {
|
||||
let static_bytes = get_static_model_bytes();
|
||||
let file_size = static_bytes.len() as u64;
|
||||
|
||||
bencher
|
||||
.counter(divan::counter::BytesCount::new(file_size))
|
||||
.bench(|| {
|
||||
// Simulate old behavior: copy static bytes to Vec
|
||||
let bytes = Bytes::from_bytes_vec(static_bytes.to_vec());
|
||||
let mut store = BurnpackStore::from_bytes(Some(bytes)).zero_copy(false);
|
||||
let snapshots = store.get_all_snapshots().expect("Failed to get snapshots");
|
||||
for snapshot in snapshots.values() {
|
||||
let _data = snapshot.to_data().expect("Failed to get tensor data");
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/// Static bytes with zero-copy mode - measures store overhead only
|
||||
#[divan::bench]
|
||||
fn static_zero_copy(bencher: Bencher) {
|
||||
let static_bytes = get_static_model_bytes();
|
||||
let file_size = static_bytes.len() as u64;
|
||||
|
||||
bencher
|
||||
.counter(divan::counter::BytesCount::new(file_size))
|
||||
.bench(|| {
|
||||
let mut store = BurnpackStore::from_static(static_bytes);
|
||||
let snapshots = store.get_all_snapshots().expect("Failed to get snapshots");
|
||||
for snapshot in snapshots.values() {
|
||||
let _data = snapshot.to_data().expect("Failed to get tensor data");
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// Full model loading benchmarks (includes backend allocation)
|
||||
// =============================================================================
|
||||
|
||||
// Generate benchmarks for each backend
|
||||
bench_backend!(NdArrayBackend, ndarray_backend, "NdArray Backend (CPU)");
|
||||
|
||||
#[cfg(feature = "wgpu")]
|
||||
bench_backend!(WgpuBackend, wgpu_backend, "WGPU Backend (GPU)");
|
||||
|
||||
#[cfg(feature = "cuda")]
|
||||
bench_backend!(CudaBackend, cuda_backend, "CUDA Backend (NVIDIA GPU)");
|
||||
|
||||
#[cfg(feature = "tch")]
|
||||
bench_backend!(TchBackend, tch_backend, "LibTorch Backend");
|
||||
|
||||
#[cfg(feature = "metal")]
|
||||
bench_backend!(MetalBackend, metal_backend, "Metal Backend (Apple GPU)");
|
||||
Reference in New Issue
Block a user