Python script to run it below 16GB VRAM

#50 by snapo - opened

Unfortunately I don't recall which GitHub gist most of this code came from, but with it I am able to run FLUX.1-dev in q8 on less than 16GB of VRAM.

Might help some of you.

On my 22GB RTX 2080 Ti it takes approx. 7.5 min per image.
I have two cards and am still looking into how to split the work across both of them (it seems diffusion image models are far behind LLMs both in quantization and in sharing VRAM across multiple GPUs for a single image).

import torch

from optimum.quanto import freeze, qfloat8, quantize

from diffusers import FlowMatchEulerDiscreteScheduler, AutoencoderKL
from diffusers.models.transformers.transformer_flux import FluxTransformer2DModel
from diffusers.pipelines.flux.pipeline_flux import FluxPipeline
from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast

dtype = torch.bfloat16
bfl_repo = "black-forest-labs/FLUX.1-dev"
revision = "refs/heads/main"

scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(bfl_repo, subfolder="scheduler", revision=revision)
text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=dtype)
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=dtype)
text_encoder_2 = T5EncoderModel.from_pretrained(bfl_repo, subfolder="text_encoder_2", torch_dtype=dtype, revision=revision)
tokenizer_2 = T5TokenizerFast.from_pretrained(bfl_repo, subfolder="tokenizer_2", torch_dtype=dtype, revision=revision)
vae = AutoencoderKL.from_pretrained(bfl_repo, subfolder="vae", torch_dtype=dtype, revision=revision)
transformer = FluxTransformer2DModel.from_pretrained(bfl_repo, subfolder="transformer", torch_dtype=dtype, revision=revision)

# Quantize the two largest components (the transformer and the T5 encoder) to 8-bit float and freeze the weights
quantize(transformer, weights=qfloat8)
freeze(transformer)

quantize(text_encoder_2, weights=qfloat8)
freeze(text_encoder_2)

# Build the pipeline without the two quantized components, then attach them afterwards
pipe = FluxPipeline(
    scheduler=scheduler,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    text_encoder_2=None,
    tokenizer_2=tokenizer_2,
    vae=vae,
    transformer=None,
)
pipe.text_encoder_2 = text_encoder_2
pipe.transformer = transformer
pipe.enable_model_cpu_offload()  # keep idle components in CPU RAM to stay under 16GB of VRAM

generator = torch.Generator().manual_seed(42)
image = pipe(
    prompt='Photorealistic nekomusume cat girl selfie, university graduation gown, high heels, holding "Richard" card, cat ears, cinematic lighting, shallow depth of field, campus back',
    width=1024,
    height=1024,
    num_inference_steps=50,
    generator=generator,
    guidance_scale=3.5,
    max_sequence_length=512,
).images[0]
image.save('test_flux_distilled.png')
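
If you want to check how close this actually gets to 16GB on your own card, a quick sanity check with PyTorch's CUDA memory statistics can be appended after the generation call. This is just a sketch and assumes a single CUDA device (index 0); it only reports the peak memory PyTorch itself allocated, not driver overhead.

# Peak VRAM allocated by PyTorch on GPU 0 during this run (sketch only;
# call torch.cuda.reset_peak_memory_stats(0) before pipe(...) for a cleaner number)
peak_gib = torch.cuda.max_memory_allocated(0) / 1024**3
print(f"Peak allocated VRAM on cuda:0: {peak_gib:.2f} GiB")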

How far below 16GB of VRAM does this go? Would it work on 8GB?

Hold my beer: 3070 mobile, 8 GB VRAM: https://github.com/InServiceOfX/InServiceOfX/blob/master/PythonLibraries/HuggingFace/MoreDiffusers/morediffusers/Applications/terminal_only_finite_loop_flux.py Note that you use a .yml file to configure the input values dynamically, and the script will prompt you for the prompt.

An image of a couple walking on the road

How do I get the InServiceOfX script running? Would I clone the entire repo?

-Thanks!

I found that, as with SD3, it is possible to get FLUX.1-schnell and FLUX.1-dev running on a 12GB Nvidia RTX 3060 just by 8-bit quantizing the big T5 text encoder (text_encoder_2 for FLUX) in conjunction with accelerate's device_map. Note that I have accelerate and bitsandbytes installed in my Python environment (pip install).

import torch
from diffusers import DiffusionPipeline
from transformers import T5EncoderModel, BitsAndBytesConfig

#https://huggingface.co./black-forest-labs/FLUX.1-dev                                                                                              
#https://huggingface.co./docs/diffusers/en/api/pipelines/flux                                                                                      
#https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/flux/pipeline_flux.py                                                 

quantization_config = BitsAndBytesConfig(load_in_8bit=True)

model_id = "black-forest-labs/FLUX.1-schnell"    #needs 4 steps only - it is faster than the dev version as the name implies                                        
#model_id = "black-forest-labs/FLUX.1-dev"       #needs a lot more steps - eg 50                                                                  
text_encoder = T5EncoderModel.from_pretrained(
    model_id,
    subfolder="text_encoder_2",
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16   #bfloat16 and normal float16 both work - former gives a warning but seems to work                                
)

pipe = DiffusionPipeline.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,   #bfloat16 and float16 both work, must match the T5
    text_encoder_2=text_encoder,
    device_map="balanced",
    max_memory={0: "11GiB", "cpu": "20GiB"},
)
pipe.vae.enable_tiling()   #less memory usage at VAE time                                                                                         

prompt = "A cat holding a sign that says hello world"
prompt2 = "A photorealistic image of a relaxed looking tabby cat holding up a brown cardboard sign that says 'hello world' written in blue felt marker pen, ensure that the paws look correct"
image = pipe(
    prompt,
    prompt_2=prompt2,
    num_images_per_prompt=1,
    guidance_scale=0.0,    #must be 0.0 for schnell version, dev version can be as per SD                                                         
    num_inference_steps=4,  #only need 4 for schnell version, dev version needs 50 or so                                                      
    max_sequence_length=256,  #relates to the T5 encoder - text_encoder_2 - max 256 for schnell                                                   
    generator=torch.Generator("cpu").manual_seed(0)
).images[0]
image.save("flux-schnell.png")

You may need to tweak the max_memory values to suit your hardware. Note that adding a second GPU involves adding additional entries to the dict, e.g. 1:"24GiB" or whatever.
This code works for me for both the schnell and dev versions on a 12GB Nvidia card. I don't know if it will support smaller cards.
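
As a rough sketch of what the two-GPU case might look like (the 24GiB figure is just a placeholder for whatever your second card has; accelerate decides the actual placement), reusing model_id and text_encoder from the snippet above:

# Hypothetical two-GPU split: both GPUs are listed in max_memory and
# device_map="balanced" lets accelerate spread the submodules across them,
# with CPU RAM as overflow. Adjust the GiB values to your actual hardware.
max_memory = {0: "11GiB", 1: "24GiB", "cpu": "20GiB"}
pipe = DiffusionPipeline.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    text_encoder_2=text_encoder,
    device_map="balanced",
    max_memory=max_memory,
)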

@gptninja my bad for the late response, so much going on in life. Yeah, you'd clone the whole repo, because you first (1) build a Docker container and then (2) run that script from within the repo; I was trying to roll my own library of wrappers. I also made everything configurable in the repo's Configuration subdirectory. If you're really interested, let me know, because I need to start writing documentation for the code I'm writing.

Run it in Colab on a T4 (16GB VRAM):
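
Assuming a fresh Colab runtime, you first need the libraries the script imports; something like the following cell (versions unpinned, packages taken straight from the imports below; torch is preinstalled on Colab):

# Install the packages used by the script: diffusers, transformers,
# accelerate for device_map, bitsandbytes for the 4-bit T5
!pip install -q -U diffusers transformers accelerate bitsandbytes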

import torch
from diffusers import DiffusionPipeline
from transformers import T5EncoderModel, BitsAndBytesConfig

#https://huggingface.co./black-forest-labs/FLUX.1-dev
#https://huggingface.co./docs/diffusers/en/api/pipelines/flux
#https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/flux/pipeline_flux.py

quantization_config = BitsAndBytesConfig(load_in_4bit=True)

#model_id = "black-forest-labs/FLUX.1-schnell" #needs 4 steps only - it is faster than the dev version as the name implies
model_id = "black-forest-labs/FLUX.1-dev" #needs a lot more steps - eg 50
text_encoder = T5EncoderModel.from_pretrained(
    model_id,
    subfolder="text_encoder_2",
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16   #bfloat16 and normal float16 both work - former gives a warning but seems to work
)

pipe = DiffusionPipeline.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,   #bfloat16 and float16 both work, must match the T5
    text_encoder_2=text_encoder,
    device_map="balanced",
    max_memory={0: "11GiB", "cpu": "20GiB"},
)
pipe.vae.enable_tiling()   #less memory usage at VAE time

prompt = "A cat holding a sign that says hello world"
prompt2 = "A photorealistic image of a relaxed looking tabby cat holding up a brown cardboard sign that says 'hello world' written in blue felt marker pen, ensure that the paws look correct"
image = pipe(
    prompt,
    prompt_2=prompt2,
    num_images_per_prompt=1,
    guidance_scale=0.0,    #must be 0.0 for schnell version, dev version can be as per SD
    num_inference_steps=4,  #only need 4 for schnell version, dev version needs 50 or so
    max_sequence_length=256,  #relates to the T5 encoder - text_encoder_2 - max 256 for schnell
    generator=torch.Generator("cpu").manual_seed(0)
).images[0]
image.save("flux-schnell.png")
It works: output image attached (شغال_flux.png).

I forgot I even listed my GPU in my profile, thank you kindly
