mirror of
https://gitea.hainer-ernst.de/rasmus/burn-stablediffusion-vibecode.git
synced 2026-06-11 02:09:21 +00:00
Add first successful sampling implementation
This commit is contained in:
@@ -56,9 +56,6 @@ class ResnetBlock:
|
|||||||
|
|
||||||
def __call__(self, x):
|
def __call__(self, x):
|
||||||
h = self.conv1(self.norm1(x).swish())
|
h = self.conv1(self.norm1(x).swish())
|
||||||
'''v = h
|
|
||||||
print(v.shape)
|
|
||||||
print(v[0, 0:10, :, :].numpy())'''
|
|
||||||
h = self.conv2(self.norm2(h).swish())
|
h = self.conv2(self.norm2(h).swish())
|
||||||
return self.nin_shortcut(x) + h
|
return self.nin_shortcut(x) + h
|
||||||
|
|
||||||
@@ -145,7 +142,6 @@ class AutoencoderKL:
|
|||||||
latent = self.encoder(x)
|
latent = self.encoder(x)
|
||||||
latent = self.quant_conv(latent)
|
latent = self.quant_conv(latent)
|
||||||
latent = latent[:, 0:4] # only the means
|
latent = latent[:, 0:4] # only the means
|
||||||
print("latent", latent.shape)
|
|
||||||
latent = self.post_quant_conv(latent)
|
latent = self.post_quant_conv(latent)
|
||||||
return self.decoder(latent)
|
return self.decoder(latent)
|
||||||
|
|
||||||
@@ -339,15 +335,12 @@ class UNetModel:
|
|||||||
|
|
||||||
saved_inputs = []
|
saved_inputs = []
|
||||||
for i,b in enumerate(self.input_blocks):
|
for i,b in enumerate(self.input_blocks):
|
||||||
#print("input block", i)
|
|
||||||
print(x.numpy())
|
|
||||||
for bb in b:
|
for bb in b:
|
||||||
x = run(x, bb)
|
x = run(x, bb)
|
||||||
saved_inputs.append(x)
|
saved_inputs.append(x)
|
||||||
for bb in self.middle_block:
|
for bb in self.middle_block:
|
||||||
x = run(x, bb)
|
x = run(x, bb)
|
||||||
for i,b in enumerate(self.output_blocks):
|
for i,b in enumerate(self.output_blocks):
|
||||||
#print("output block", i)
|
|
||||||
x = x.cat(saved_inputs.pop(), dim=1)
|
x = x.cat(saved_inputs.pop(), dim=1)
|
||||||
for bb in b:
|
for bb in b:
|
||||||
x = run(x, bb)
|
x = run(x, bb)
|
||||||
@@ -644,7 +637,9 @@ if __name__ == "__main__":
|
|||||||
download_file('https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/resolve/main/sd-v1-4.ckpt', FILENAME)
|
download_file('https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/resolve/main/sd-v1-4.ckpt', FILENAME)
|
||||||
load_state_dict(model, torch_load(FILENAME)['state_dict'], strict=False)
|
load_state_dict(model, torch_load(FILENAME)['state_dict'], strict=False)
|
||||||
|
|
||||||
|
print('Saving model...')
|
||||||
sdsave.save_stable_diffusion(model, "params")
|
sdsave.save_stable_diffusion(model, "params")
|
||||||
|
print('Model saved.')
|
||||||
|
|
||||||
|
|
||||||
'''parser = argparse.ArgumentParser(description='Run Stable Diffusion', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
'''parser = argparse.ArgumentParser(description='Run Stable Diffusion', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
import pathlib
|
||||||
from autoencoder import save_autoencoder
|
from autoencoder import save_autoencoder
|
||||||
from unet import save_unet_model
|
from unet import save_unet_model
|
||||||
from clip import save_clip_text_transformer
|
from clip import save_clip_text_transformer
|
||||||
@@ -5,8 +6,9 @@ from clip import save_clip_text_transformer
|
|||||||
from save import save_scalar, save_tensor
|
from save import save_scalar, save_tensor
|
||||||
|
|
||||||
def save_stable_diffusion(stable_diffusion, path):
|
def save_stable_diffusion(stable_diffusion, path):
|
||||||
|
pathlib.Path(path).mkdir(parents=True, exist_ok=True)
|
||||||
save_scalar(stable_diffusion.alphas_cumprod.shape[0], "n_steps", path)
|
save_scalar(stable_diffusion.alphas_cumprod.shape[0], "n_steps", path)
|
||||||
save_tensor(stable_diffusion.alphas_cumprod, 'alphas_cumprod', path)
|
save_tensor(stable_diffusion.alphas_cumprod, 'alphas_cumprod', path)
|
||||||
save_autoencoder(stable_diffusion.autoencoder, 'autoencoder', path)
|
save_autoencoder(stable_diffusion.first_stage_model, pathlib.Path(path, 'autoencoder'))
|
||||||
save_unet_model(stable_diffusion.diffusion, 'unet', path)
|
save_unet_model(stable_diffusion.model.diffusion_model, pathlib.Path(path, 'unet'))
|
||||||
save_clip_text_transformer(stable_diffusion.clip, 'clip', path)
|
save_clip_text_transformer(stable_diffusion.cond_stage_model.transformer.text_model, pathlib.Path(path, 'clip'))
|
||||||
@@ -1,5 +1,3 @@
|
|||||||
#![feature(generic_const_exprs)]
|
|
||||||
|
|
||||||
pub mod model;
|
pub mod model;
|
||||||
pub mod tokenizer;
|
pub mod tokenizer;
|
||||||
pub mod helper;
|
pub mod helper;
|
||||||
11
src/main.rs
11
src/main.rs
@@ -65,16 +65,23 @@ fn main() {
|
|||||||
let output = unet.forward(input, timesteps, context);*/
|
let output = unet.forward(input, timesteps, context);*/
|
||||||
//print_tensor(output);
|
//print_tensor(output);
|
||||||
|
|
||||||
|
println!("Loading tokenizer...");
|
||||||
let tokenizer = SimpleTokenizer::new().unwrap();
|
let tokenizer = SimpleTokenizer::new().unwrap();
|
||||||
|
|
||||||
|
println!("Loading Stable Diffusion...");
|
||||||
let sd: StableDiffusion<Backend> = load_stable_diffusion("params", &device).unwrap();
|
let sd: StableDiffusion<Backend> = load_stable_diffusion("params", &device).unwrap();
|
||||||
|
let sd = sd.to_device(&device);
|
||||||
|
|
||||||
let unconditional_guidance_scale = 7.5;
|
let unconditional_guidance_scale = 7.5;
|
||||||
let unconditional_context = sd.unconditional_context(&tokenizer);
|
let unconditional_context = sd.unconditional_context(&tokenizer);
|
||||||
let context = sd.context(&tokenizer, "A rainbow pony is flying.").unsqueeze();
|
let context = sd.context(&tokenizer, "A wine glass filled with pink flower petals.").unsqueeze();
|
||||||
|
|
||||||
let n_steps = 5;
|
let n_steps = 100;
|
||||||
|
|
||||||
|
println!("Sampling images...");
|
||||||
let images = sd.sample_image(context, unconditional_context, unconditional_guidance_scale, n_steps);
|
let images = sd.sample_image(context, unconditional_context, unconditional_guidance_scale, n_steps);
|
||||||
|
|
||||||
|
println!("Saving images...");
|
||||||
save_images(&images, "image_samples/", 512, 512).unwrap();
|
save_images(&images, "image_samples/", 512, 512).unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -35,7 +35,7 @@ pub fn qkv_attention<B: Backend>(q: Tensor<B, 3>, k: Tensor<B, 3>, v: Tensor<B,
|
|||||||
return o;
|
return o;
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn attn_decoder_mask<B: Backend>(seq_length: usize) -> Tensor<B, 2> {
|
pub fn attn_decoder_mask<B: Backend>(seq_length: usize, device: &B::Device) -> Tensor<B, 2> {
|
||||||
let mut mask = Tensor::<B, 2>::zeros([seq_length, seq_length]);
|
let mut mask = Tensor::<B, 2>::zeros([seq_length, seq_length]);
|
||||||
|
|
||||||
for i in 0..(seq_length - 1) {
|
for i in 0..(seq_length - 1) {
|
||||||
@@ -43,5 +43,5 @@ pub fn attn_decoder_mask<B: Backend>(seq_length: usize) -> Tensor<B, 2> {
|
|||||||
mask = mask.slice_assign([i..i + 1, i + 1..seq_length], values);
|
mask = mask.slice_assign([i..i + 1, i + 1..seq_length], values);
|
||||||
}
|
}
|
||||||
|
|
||||||
return mask;
|
return mask.to_device(device);
|
||||||
}
|
}
|
||||||
@@ -43,14 +43,6 @@ impl AutoencoderConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
fn print_tensor<B: Backend>(x: Tensor<B, 4>) {
|
|
||||||
let [_, channels, height, width] = x.dims();
|
|
||||||
let channels = channels.min(10);
|
|
||||||
let data = x.slice([0..1, 0..channels, 0..height, 0..width]).into_data();
|
|
||||||
println!("{:?}", data);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
#[derive(Module, Debug)]
|
#[derive(Module, Debug)]
|
||||||
pub struct Autoencoder<B: Backend> {
|
pub struct Autoencoder<B: Backend> {
|
||||||
encoder: Encoder<B>,
|
encoder: Encoder<B>,
|
||||||
|
|||||||
@@ -59,7 +59,7 @@ impl<B: Backend> CLIP<B> {
|
|||||||
pub fn forward(&self, x: Tensor<B, 2, Int>) -> Tensor<B, 3> {
|
pub fn forward(&self, x: Tensor<B, 2, Int>) -> Tensor<B, 3> {
|
||||||
let [n_batch, seq_len] = x.dims();
|
let [n_batch, seq_len] = x.dims();
|
||||||
|
|
||||||
let mask = attn_decoder_mask(seq_len);
|
let mask = attn_decoder_mask(seq_len, &x.device());
|
||||||
|
|
||||||
let embedded = self.token_embedding.forward(x)
|
let embedded = self.token_embedding.forward(x)
|
||||||
+ self.position_embedding.val().slice([0..seq_len]).unsqueeze();
|
+ self.position_embedding.val().slice([0..seq_len]).unsqueeze();
|
||||||
|
|||||||
@@ -85,19 +85,21 @@ impl<B: Backend> StableDiffusion<B> {
|
|||||||
let start = b * num_elements_per_image;
|
let start = b * num_elements_per_image;
|
||||||
let end = start + num_elements_per_image;
|
let end = start + num_elements_per_image;
|
||||||
|
|
||||||
flattened[start..end].into_iter().map(|v| v.to_u8().unwrap()).collect()
|
flattened[start..end].into_iter().map(|v| v.to_f64().unwrap().min(255.0).max(0.0).to_u8().unwrap()).collect()
|
||||||
}).collect()
|
}).collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn sample_latent(&self, context: Tensor<B, 3>, unconditional_context: Tensor<B, 2>, unconditional_guidance_scale: f64, n_steps: usize) -> Tensor<B, 4> {
|
pub fn sample_latent(&self, context: Tensor<B, 3>, unconditional_context: Tensor<B, 2>, unconditional_guidance_scale: f64, n_steps: usize) -> Tensor<B, 4> {
|
||||||
assert!(self.n_steps % n_steps == 0);
|
assert!(self.n_steps % n_steps == 0);
|
||||||
|
|
||||||
|
let device = context.device();
|
||||||
|
|
||||||
let step_size = self.n_steps / n_steps;
|
let step_size = self.n_steps / n_steps;
|
||||||
|
|
||||||
let [n_batches, _, _] = context.dims();
|
let [n_batches, _, _] = context.dims();
|
||||||
|
|
||||||
let gen_noise = || {
|
let gen_noise = || {
|
||||||
Tensor::random([n_batches, 4, 64, 64], Distribution::Normal(0.0, 1.0) )
|
Tensor::random([n_batches, 4, 64, 64], Distribution::Normal(0.0, 1.0)).to_device(&device)
|
||||||
};
|
};
|
||||||
|
|
||||||
let sigma = 0.0; // Use deterministic diffusion
|
let sigma = 0.0; // Use deterministic diffusion
|
||||||
@@ -114,7 +116,7 @@ impl<B: Backend> StableDiffusion<B> {
|
|||||||
|
|
||||||
let sqrt_noise = (1.0 - current_alpha).sqrt();
|
let sqrt_noise = (1.0 - current_alpha).sqrt();
|
||||||
|
|
||||||
let timestep = Tensor::from_ints([t as i32]);
|
let timestep = Tensor::from_ints([t as i32]).to_device(&device);
|
||||||
let pred_noise = self.forward_diffuser(latent.clone(), timestep, context.clone(), unconditional_context.clone(), unconditional_guidance_scale);
|
let pred_noise = self.forward_diffuser(latent.clone(), timestep, context.clone(), unconditional_context.clone(), unconditional_guidance_scale);
|
||||||
|
|
||||||
let predx0 = (latent - pred_noise.clone() * sqrt_noise) / current_alpha.sqrt();
|
let predx0 = (latent - pred_noise.clone() * sqrt_noise) / current_alpha.sqrt();
|
||||||
@@ -128,17 +130,29 @@ impl<B: Backend> StableDiffusion<B> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn forward_diffuser(&self, latent: Tensor<B, 4>, timestep: Tensor<B, 1, Int>, context: Tensor<B, 3>, unconditional_context: Tensor<B, 2>, unconditional_guidance_scale: f64) -> Tensor<B, 4> {
|
fn forward_diffuser(&self, latent: Tensor<B, 4>, timestep: Tensor<B, 1, Int>, context: Tensor<B, 3>, unconditional_context: Tensor<B, 2>, unconditional_guidance_scale: f64) -> Tensor<B, 4> {
|
||||||
let [n_batch, n_channel, height, width] = latent.dims();
|
///let [n_batch, n_channel, height, width] = latent.dims();
|
||||||
let latent = latent.repeat(0, 2);
|
//let latent = latent.repeat(0, 2);
|
||||||
|
|
||||||
let latent = self.diffusion.forward(
|
let unconditional_latent = self.diffusion.forward(
|
||||||
|
latent.clone(),
|
||||||
|
timestep.clone(),
|
||||||
|
unconditional_context.unsqueeze()
|
||||||
|
);
|
||||||
|
|
||||||
|
let conditional_latent = self.diffusion.forward(
|
||||||
|
latent,
|
||||||
|
timestep,
|
||||||
|
context
|
||||||
|
);
|
||||||
|
|
||||||
|
/*let latent = self.diffusion.forward(
|
||||||
latent.repeat(0, 2),
|
latent.repeat(0, 2),
|
||||||
timestep.repeat(0, 2),
|
timestep.repeat(0, 2),
|
||||||
Tensor::cat(vec![unconditional_context.unsqueeze::<3>(), context], 0)
|
Tensor::cat(vec![unconditional_context.unsqueeze::<3>(), context], 0)
|
||||||
);
|
);
|
||||||
|
|
||||||
let unconditional_latent = latent.clone().slice([0..n_batch]);
|
let unconditional_latent = latent.clone().slice([0..n_batch]);
|
||||||
let conditional_latent = latent.slice([n_batch..2 * n_batch]);
|
let conditional_latent = latent.slice([n_batch..2 * n_batch]);*/
|
||||||
|
|
||||||
unconditional_latent.clone() + (conditional_latent - unconditional_latent) * unconditional_guidance_scale
|
unconditional_latent.clone() + (conditional_latent - unconditional_latent) * unconditional_guidance_scale
|
||||||
}
|
}
|
||||||
@@ -148,10 +162,11 @@ impl<B: Backend> StableDiffusion<B> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn context(&self, tokenizer: &SimpleTokenizer, text: &str) -> Tensor<B, 3> {
|
pub fn context(&self, tokenizer: &SimpleTokenizer, text: &str) -> Tensor<B, 3> {
|
||||||
|
let device = &self.devices()[0];
|
||||||
let text = format!("<|startoftext|>{}<|endoftext|>", text);
|
let text = format!("<|startoftext|>{}<|endoftext|>", text);
|
||||||
let tokenized: Vec<_> = tokenizer.encode(&text).into_iter().map(|v| v as i32).collect();
|
let tokenized: Vec<_> = tokenizer.encode(&text).into_iter().map(|v| v as i32).collect();
|
||||||
|
|
||||||
self.clip.forward(Tensor::from_ints(&tokenized[..]).unsqueeze())
|
self.clip.forward(Tensor::from_ints(&tokenized[..]).to_device(device).unsqueeze())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -113,7 +113,6 @@ impl<B: Backend> UNet<B> {
|
|||||||
|
|
||||||
// input blocks
|
// input blocks
|
||||||
for block in self.input_blocks.as_array() {
|
for block in self.input_blocks.as_array() {
|
||||||
println!("{:?}", x.clone().flatten::<1>(0, 3).slice([0..100]).into_data());
|
|
||||||
x = block.forward(x, emb.clone(), context.clone());
|
x = block.forward(x, emb.clone(), context.clone());
|
||||||
saved_inputs.push(x.clone())
|
saved_inputs.push(x.clone())
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user