Skip to content

Add CUDA support #31

Open
taf2 wants to merge 1 commit into antirez:main from
taf2:pr/cuda-support-860f829
Open

Add CUDA support #31
taf2 wants to merge 1 commit into antirez:main from
taf2:pr/cuda-support-860f829

Conversation

@taf2
Copy link

@taf2 taf2 commented Feb 9, 2026

I worked with Codex on this one, so it is possible that I missed something critical, but the testing environment passes, and it ran for most of the weekend with only a few inputs from me. Here is the output of `make cuda`:
rm -f flux.o flux_kernels.o flux_tokenizer.o flux_vae.o flux_transformer.o flux_sample.o flux_image.o jpeg.o flux_safetensors.o flux_qwen3.o flux_qwen3_tokenizer.o terminals.o flux_cli.o linenoise.o embcache.o *.mps.o flux_metal.o flux_cuda.o main.o flux libflux.a
rm -f flux_shaders_source.h
gcc -Wall -Wextra -O3 -march=native -ffast-math -DUSE_BLAS -DUSE_OPENBLAS -DUSE_CUDA -I/usr/include/openblas -I/usr/local/cuda/include -c -o flux.o flux.c
flux.c: In function ‘flux_img2img_debug_py’:
flux.c:1405:5: warning: ignoring return value of ‘fread’ declared with attribute ‘warn_unused_result’ [-Wunused-result]
1405 | fread(noise, sizeof(float), noise_size, f_noise);
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
flux.c:1420:5: warning: ignoring return value of ‘fread’ declared with attribute ‘warn_unused_result’ [-Wunused-result]
1420 | fread(ref_latent, sizeof(float), ref_size, f_ref);
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
flux.c:1436:5: warning: ignoring return value of ‘fread’ declared with attribute ‘warn_unused_result’ [-Wunused-result]
1436 | fread(text_emb, sizeof(float), txt_size, f_txt);
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
gcc -Wall -Wextra -O3 -march=native -ffast-math -DUSE_BLAS -DUSE_OPENBLAS -DUSE_CUDA -I/usr/include/openblas -I/usr/local/cuda/include -c -o flux_kernels.o flux_kernels.c
gcc -Wall -Wextra -O3 -march=native -ffast-math -DUSE_BLAS -DUSE_OPENBLAS -DUSE_CUDA -I/usr/include/openblas -I/usr/local/cuda/include -c -o flux_tokenizer.o flux_tokenizer.c
gcc -Wall -Wextra -O3 -march=native -ffast-math -DUSE_BLAS -DUSE_OPENBLAS -DUSE_CUDA -I/usr/include/openblas -I/usr/local/cuda/include -c -o flux_vae.o flux_vae.c
gcc -Wall -Wextra -O3 -march=native -ffast-math -DUSE_BLAS -DUSE_OPENBLAS -DUSE_CUDA -I/usr/include/openblas -I/usr/local/cuda/include -c -o flux_transformer.o flux_transformer.c
gcc -Wall -Wextra -O3 -march=native -ffast-math -DUSE_BLAS -DUSE_OPENBLAS -DUSE_CUDA -I/usr/include/openblas -I/usr/local/cuda/include -c -o flux_sample.o flux_sample.c
gcc -Wall -Wextra -O3 -march=native -ffast-math -DUSE_BLAS -DUSE_OPENBLAS -DUSE_CUDA -I/usr/include/openblas -I/usr/local/cuda/include -c -o flux_image.o flux_image.c
gcc -Wall -Wextra -O3 -march=native -ffast-math -DUSE_BLAS -DUSE_OPENBLAS -DUSE_CUDA -I/usr/include/openblas -I/usr/local/cuda/include -c -o jpeg.o jpeg.c
In file included from jpeg.c:9:
jpeg.h: In function ‘jpeg_extend’:
jpeg.h:377:21: warning: left shift of negative value [-Wshift-negative-value]
377 | v = v + (-1 << bits) + 1;
| ^~
gcc -Wall -Wextra -O3 -march=native -ffast-math -DUSE_BLAS -DUSE_OPENBLAS -DUSE_CUDA -I/usr/include/openblas -I/usr/local/cuda/include -c -o flux_safetensors.o flux_safetensors.c
gcc -Wall -Wextra -O3 -march=native -ffast-math -DUSE_BLAS -DUSE_OPENBLAS -DUSE_CUDA -I/usr/include/openblas -I/usr/local/cuda/include -c -o flux_qwen3.o flux_qwen3.c
flux_qwen3.c: In function ‘open_safetensors_shards’:
flux_qwen3.c:1460:9: warning: ignoring return value of ‘fread’ declared with attribute ‘warn_unused_result’ [-Wunused-result]
1460 | fread(buf, 1, fsize, f);
| ^~~~~~~~~~~~~~~~~~~~~~~
flux_qwen3.c: In function ‘open_safetensors_shards.constprop’:
flux_qwen3.c:1501:50: warning: ‘%s’ directive output may be truncated writing up to 2047 bytes into a region of size 1023 [-Wformat-truncation=]
1501 | snprintf(path, sizeof(path), "%s/%s", model_dir, shard_names[i]);
| ^~
In file included from /usr/include/stdio.h:970,
from flux_qwen3.c:15:
In function ‘snprintf’,
inlined from ‘open_safetensors_shards.constprop’ at flux_qwen3.c:1501:17:
/usr/include/x86_64-linux-gnu/bits/stdio2.h:68:10: note: ‘__builtin___snprintf_chk’ output 2 or more bytes (assuming 2049) into a destination of size 1024
68 | return __builtin___snprintf_chk (__s, __n, __USE_FORTIFY_LEVEL - 1,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
69 | __glibc_objsize (__s), __fmt,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
70 | __va_arg_pack ());
| ~~~~~~~~~~~~~~~~~
gcc -Wall -Wextra -O3 -march=native -ffast-math -DUSE_BLAS -DUSE_OPENBLAS -DUSE_CUDA -I/usr/include/openblas -I/usr/local/cuda/include -c -o flux_qwen3_tokenizer.o flux_qwen3_tokenizer.c
flux_qwen3_tokenizer.c: In function ‘qwen3_tokenizer_load’:
flux_qwen3_tokenizer.c:334:5: warning: ignoring return value of ‘fread’ declared with attribute ‘warn_unused_result’ [-Wunused-result]
334 | fread(json, 1, size, f);
| ^~~~~~~~~~~~~~~~~~~~~~~
gcc -Wall -Wextra -O3 -march=native -ffast-math -DUSE_BLAS -DUSE_OPENBLAS -DUSE_CUDA -I/usr/include/openblas -I/usr/local/cuda/include -c -o terminals.o terminals.c
gcc -Wall -Wextra -O3 -march=native -ffast-math -DUSE_BLAS -DUSE_OPENBLAS -DUSE_CUDA -I/usr/include/openblas -I/usr/local/cuda/include -c -o flux_cli.o flux_cli.c
gcc -Wall -Wextra -O3 -march=native -ffast-math -DUSE_BLAS -DUSE_OPENBLAS -DUSE_CUDA -I/usr/include/openblas -I/usr/local/cuda/include -c -o linenoise.o linenoise.c
gcc -Wall -Wextra -O3 -march=native -ffast-math -DUSE_BLAS -DUSE_OPENBLAS -DUSE_CUDA -I/usr/include/openblas -I/usr/local/cuda/include -c -o embcache.o embcache.c
gcc -Wall -Wextra -O3 -march=native -ffast-math -DUSE_BLAS -DUSE_OPENBLAS -DUSE_CUDA -I/usr/include/openblas -I/usr/local/cuda/include -c -o main.o main.c
nvcc -O3 -U_GNU_SOURCE -c -o flux_cuda.o flux_cuda.cu
gcc -Wall -Wextra -O3 -march=native -ffast-math -DUSE_BLAS -DUSE_OPENBLAS -DUSE_CUDA -I/usr/include/openblas -I/usr/local/cuda/include -o flux flux.o flux_kernels.o flux_tokenizer.o flux_vae.o flux_transformer.o flux_sample.o flux_image.o jpeg.o flux_safetensors.o flux_qwen3.o flux_qwen3_tokenizer.o terminals.o flux_cli.o linenoise.o embcache.o main.o flux_cuda.o -L/usr/local/cuda/lib64 -Wl,-rpath,/usr/local/cuda/lib64 -lcublasLt -lcublas -lcudart -lopenblas -lstdc++ -lm

Built with CUDA backend (cuBLAS GPU acceleration)
broodlord/var/db/work/flux2.c (main)> /usr/bin/time -f 'ELAPSED=%e' ./flux --no-mmap -v -S 42 -d flux-klein-model -p "A woman wearing sunglasses smiling with her mouth open" -o output.png
CUDA: cuBLAS GPU acceleration enabled
Seed: 42
FLUX.2 klein 4B Image Generator

Model: flux-klein-model
Prompt: A woman wearing sunglasses smiling with her mouth open
Output: output.png
Size: 256x256
Steps: 0

Loading VAE... done (0.1s)
Model: FLUX.2-klein-4B v1.0 (distilled, 4 steps, guidance 1.0)
Loading Qwen3 encoder... Qwen3 tokenizer loaded (151669 vocab)
done (3.8s)
Encoding text... done (2.2s)
Loading FLUX.2 transformer... done (4.1s)
Denoising (d=double block, s=single blocks, F=final):
Step 1/4 dddddssssF
Step 2/4 dddddssssF
Step 3/4 dddddssssF
Step 4/4 dddddssssF
Denoising timing breakdown:
Step 1: 1315.9 ms
Step 2: 1329.1 ms
Step 3: 1300.0 ms
Step 4: 1297.1 ms
Total denoising: 5242.1 ms (5.24 s)
Transformer breakdown:
Double blocks: 1769.4 ms (34.2%)
Single blocks: 3408.7 ms (65.8%)
Final layer: 0.2 ms (0.0%)
Total: 5178.4 ms

Decoding image... done (0.1s)
Generated in 15.9s total
Output: 256x256, 3 channels
Saving... output.png 256x256 (0.0s)
Total generation time: 16.0 seconds
ELAPSED=16.89
broodlord/var/db/work/flux2.c (main)> make blas
rm -f flux.o flux_kernels.o flux_tokenizer.o flux_vae.o flux_transformer.o flux_sample.o flux_image.o jpeg.o flux_safetensors.o flux_qwen3.o flux_qwen3_tokenizer.o terminals.o flux_cli.o linenoise.o embcache.o *.mps.o flux_metal.o flux_cuda.o main.o flux libflux.a
rm -f flux_shaders_source.h
gcc -Wall -Wextra -O3 -march=native -ffast-math -DUSE_BLAS -DUSE_OPENBLAS -I/usr/include/openblas -c -o flux.o flux.c
flux.c: In function ‘flux_img2img_debug_py’:
flux.c:1405:5: warning: ignoring return value of ‘fread’ declared with attribute ‘warn_unused_result’ [-Wunused-result]
1405 | fread(noise, sizeof(float), noise_size, f_noise);
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
flux.c:1420:5: warning: ignoring return value of ‘fread’ declared with attribute ‘warn_unused_result’ [-Wunused-result]
1420 | fread(ref_latent, sizeof(float), ref_size, f_ref);
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
flux.c:1436:5: warning: ignoring return value of ‘fread’ declared with attribute ‘warn_unused_result’ [-Wunused-result]
1436 | fread(text_emb, sizeof(float), txt_size, f_txt);
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
gcc -Wall -Wextra -O3 -march=native -ffast-math -DUSE_BLAS -DUSE_OPENBLAS -I/usr/include/openblas -c -o flux_kernels.o flux_kernels.c
gcc -Wall -Wextra -O3 -march=native -ffast-math -DUSE_BLAS -DUSE_OPENBLAS -I/usr/include/openblas -c -o flux_tokenizer.o flux_tokenizer.c
gcc -Wall -Wextra -O3 -march=native -ffast-math -DUSE_BLAS -DUSE_OPENBLAS -I/usr/include/openblas -c -o flux_vae.o flux_vae.c
gcc -Wall -Wextra -O3 -march=native -ffast-math -DUSE_BLAS -DUSE_OPENBLAS -I/usr/include/openblas -c -o flux_transformer.o flux_transformer.c
gcc -Wall -Wextra -O3 -march=native -ffast-math -DUSE_BLAS -DUSE_OPENBLAS -I/usr/include/openblas -c -o flux_sample.o flux_sample.c
gcc -Wall -Wextra -O3 -march=native -ffast-math -DUSE_BLAS -DUSE_OPENBLAS -I/usr/include/openblas -c -o flux_image.o flux_image.c
gcc -Wall -Wextra -O3 -march=native -ffast-math -DUSE_BLAS -DUSE_OPENBLAS -I/usr/include/openblas -c -o jpeg.o jpeg.c
In file included from jpeg.c:9:
jpeg.h: In function ‘jpeg_extend’:
jpeg.h:377:21: warning: left shift of negative value [-Wshift-negative-value]
377 | v = v + (-1 << bits) + 1;
| ^~
gcc -Wall -Wextra -O3 -march=native -ffast-math -DUSE_BLAS -DUSE_OPENBLAS -I/usr/include/openblas -c -o flux_safetensors.o flux_safetensors.c
gcc -Wall -Wextra -O3 -march=native -ffast-math -DUSE_BLAS -DUSE_OPENBLAS -I/usr/include/openblas -c -o flux_qwen3.o flux_qwen3.c
flux_qwen3.c: In function ‘open_safetensors_shards’:
flux_qwen3.c:1460:9: warning: ignoring return value of ‘fread’ declared with attribute ‘warn_unused_result’ [-Wunused-result]
1460 | fread(buf, 1, fsize, f);
| ^~~~~~~~~~~~~~~~~~~~~~~
flux_qwen3.c: In function ‘open_safetensors_shards.constprop’:
flux_qwen3.c:1501:50: warning: ‘%s’ directive output may be truncated writing up to 2047 bytes into a region of size 1023 [-Wformat-truncation=]
1501 | snprintf(path, sizeof(path), "%s/%s", model_dir, shard_names[i]);
| ^~
In file included from /usr/include/stdio.h:970,
from flux_qwen3.c:15:
In function ‘snprintf’,
inlined from ‘open_safetensors_shards.constprop’ at flux_qwen3.c:1501:17:
/usr/include/x86_64-linux-gnu/bits/stdio2.h:68:10: note: ‘__builtin___snprintf_chk’ output 2 or more bytes (assuming 2049) into a destination of size 1024
68 | return __builtin___snprintf_chk (__s, __n, __USE_FORTIFY_LEVEL - 1,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
69 | __glibc_objsize (__s), __fmt,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
70 | __va_arg_pack ());
| ~~~~~~~~~~~~~~~~~
gcc -Wall -Wextra -O3 -march=native -ffast-math -DUSE_BLAS -DUSE_OPENBLAS -I/usr/include/openblas -c -o flux_qwen3_tokenizer.o flux_qwen3_tokenizer.c
flux_qwen3_tokenizer.c: In function ‘qwen3_tokenizer_load’:
flux_qwen3_tokenizer.c:334:5: warning: ignoring return value of ‘fread’ declared with attribute ‘warn_unused_result’ [-Wunused-result]
334 | fread(json, 1, size, f);
| ^~~~~~~~~~~~~~~~~~~~~~~
gcc -Wall -Wextra -O3 -march=native -ffast-math -DUSE_BLAS -DUSE_OPENBLAS -I/usr/include/openblas -c -o terminals.o terminals.c
gcc -Wall -Wextra -O3 -march=native -ffast-math -DUSE_BLAS -DUSE_OPENBLAS -I/usr/include/openblas -c -o flux_cli.o flux_cli.c
gcc -Wall -Wextra -O3 -march=native -ffast-math -DUSE_BLAS -DUSE_OPENBLAS -I/usr/include/openblas -c -o linenoise.o linenoise.c
gcc -Wall -Wextra -O3 -march=native -ffast-math -DUSE_BLAS -DUSE_OPENBLAS -I/usr/include/openblas -c -o embcache.o embcache.c
gcc -Wall -Wextra -O3 -march=native -ffast-math -DUSE_BLAS -DUSE_OPENBLAS -I/usr/include/openblas -c -o main.o main.c
gcc -Wall -Wextra -O3 -march=native -ffast-math -DUSE_BLAS -DUSE_OPENBLAS -I/usr/include/openblas -o flux flux.o flux_kernels.o flux_tokenizer.o flux_vae.o flux_transformer.o flux_sample.o flux_image.o jpeg.o flux_safetensors.o flux_qwen3.o flux_qwen3_tokenizer.o terminals.o flux_cli.o linenoise.o embcache.o main.o -lm -lopenblas

Built with BLAS backend (~30x faster than generic)
broodlord/var/db/work/flux2.c (main)> /usr/bin/time -f 'ELAPSED=%e' ./flux --no-mmap -v -S 42 -d flux-klein-model -p "A woman wearing sunglasses smiling with her mouth open" -o output.png
BLAS: CPU acceleration enabled (Accelerate/OpenBLAS)
Seed: 42
FLUX.2 klein 4B Image Generator

Model: flux-klein-model
Prompt: A woman wearing sunglasses smiling with her mouth open
Output: output.png
Size: 256x256
Steps: 0

Loading VAE... done (0.3s)
Model: FLUX.2-klein-4B v1.0 (distilled, 4 steps, guidance 1.0)
Loading Qwen3 encoder... Qwen3 tokenizer loaded (151669 vocab)
done (3.7s)
Encoding text... done (6.3s)
Loading FLUX.2 transformer... done (4.4s)
Denoising (d=double block, s=single blocks, F=final):
Step 1/4 dddddssssF
Step 2/4 dddddssssF
Step 3/4 dddddssssF
Step 4/4 dddddssssF
Denoising timing breakdown:
Step 1: 6608.0 ms
Step 2: 6846.5 ms
Step 3: 6696.1 ms
Step 4: 6529.1 ms
Total denoising: 26679.8 ms (26.68 s)
Transformer breakdown:
Double blocks: 6187.5 ms (23.4%)
Single blocks: 20190.0 ms (76.5%)
Final layer: 28.1 ms (0.1%)
Total: 26405.5 ms

Decoding image... done (3.2s)
Generated in 44.9s total
Output: 256x256, 3 channels
Saving... output.png 256x256 (0.0s)
Total generation time: 45.2 seconds
ELAPSED=45.77

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

1 participant

Comments