# Qwen 3.6-35B-A3B FP8 (MoE, 3B active) on DGX Spark GB10

> Source: <https://gist.github.com/wshobson/d32c98a5537ca4d51c92fea7e54aef40>
> Published: 2026-05-01 17:05:58+00:00

```yaml
# Qwen 3.6-35B-A3B FP8 (MoE, 3B active) on DGX Spark GB10
# API: http://localhost:8888/v1
services:
  vllm-qwen36-35b:
    image: vllm/vllm-openai:v0.20.0-aarch64-cu130-ubuntu2404
    container_name: qwen36-35b-vllm
    runtime: nvidia
    ipc: host
    shm_size: "64gb"
    ulimits:
      memlock: -1
      stack: 67108864
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
      - HF_TOKEN=${HF_TOKEN}
      - CUDA_MANAGED_FORCE_DEVICE_ALLOC=1
      - PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
      - VLLM_MARLIN_USE_ATOMIC_ADD=1
      - TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
      - OMP_NUM_THREADS=4
    volumes:
      - ${HOME}/.cache/huggingface:/root/.cache/huggingface
      - ${HOME}/.cache/vllm:/root/.cache/vllm
    ports:
      - "8888:8000"
    command:
      - "--model"
      - "Qwen/Qwen3.6-35B-A3B-FP8"
      - "--served-model-name"
      - "qwen3.6-35b-a3b"
      - "--host"
      - "0.0.0.0"
      - "--port"
      - "8000"
      - "--attention-backend"
      - "flashinfer"
      - "--max-model-len"
      - "262144"
      # 0.20.0+ CUDA-graph memory profiling shaves ~1pp; 0.7069 restores the pre-0.20.0 0.70 KV budget.
      - "--gpu-memory-utilization"
      - "0.7069"
      - "--kv-cache-dtype"
      - "fp8"
      - "--max-num-seqs"
      - "20"
      - "--max-num-batched-tokens"
      - "32768"
      - "--enable-prefix-caching"
      - "--enable-auto-tool-choice"
      - "--tool-call-parser"
      - "qwen3_coder"
      - "--reasoning-parser"
      - "qwen3"
      - "--speculative-config"
      - '{"method":"qwen3_next_mtp","num_speculative_tokens":2}'
      - "--trust-remote-code"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
        limits:
          memory: 100g
    healthcheck:
      test: ["CMD-SHELL", "python3 -c \"import urllib.request; urllib.request.urlopen('http://localhost:8000/health')\""]
      interval: 30s
      timeout: 10s
      retries: 30
      start_period: 900s
    restart: unless-stopped
```
