# Qwen 3.6-35B-A3B FP8 (MoE, 3B active) on DGX Spark GB10
# API: http://localhost:8888/v1
#
# Single-service Compose file: runs the vLLM OpenAI-compatible server image
# with the model and serving flags given under `command`.
services:
  vllm-qwen36-35b:
    image: vllm/vllm-openai:v0.20.0-aarch64-cu130-ubuntu2404
    container_name: qwen36-35b-vllm
    runtime: nvidia
    ipc: host
    shm_size: "64gb"
    ulimits:
      # -1 means "unlimited" for the locked-memory limit.
      memlock: -1
      # 67108864 bytes = 64 MiB stack limit.
      stack: 67108864
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
      # Interpolated by Compose from the invoking shell or an .env file.
      - HF_TOKEN=${HF_TOKEN}
      - CUDA_MANAGED_FORCE_DEVICE_ALLOC=1
      - PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
      - VLLM_MARLIN_USE_ATOMIC_ADD=1
      - TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
      - OMP_NUM_THREADS=4
    volumes:
      # Bind-mount host cache directories so their contents survive
      # container re-creation.
      - ${HOME}/.cache/huggingface:/root/.cache/huggingface
      - ${HOME}/.cache/vllm:/root/.cache/vllm
    ports:
      # Host 8888 -> container 8000 (must match the --port flag below).
      - "8888:8000"
    command:
      - "--model"
      - "Qwen/Qwen3.6-35B-A3B-FP8"
      - "--served-model-name"
      - "qwen3.6-35b-a3b"
      - "--host"
      - "0.0.0.0"
      - "--port"
      - "8000"
      - "--attention-backend"
      - "flashinfer"
      - "--max-model-len"
      - "262144"
      # 0.20.0+ CUDA-graph memory profiling shaves ~1pp; 0.7069 restores the pre-0.20.0 0.70 KV budget.
      - "--gpu-memory-utilization"
      - "0.7069"
      - "--kv-cache-dtype"
      - "fp8"
      - "--max-num-seqs"
      - "20"
      - "--max-num-batched-tokens"
      - "32768"
      - "--enable-prefix-caching"
      - "--enable-auto-tool-choice"
      - "--tool-call-parser"
      - "qwen3_coder"
      - "--reasoning-parser"
      - "qwen3"
      - "--speculative-config"
      # Single-quoted so the embedded JSON double quotes need no escaping.
      - '{"method":"qwen3_next_mtp","num_speculative_tokens":2}'
      - "--trust-remote-code"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
        limits:
          memory: 100g
    healthcheck:
      # Probes the server's /health endpoint from inside the container.
      # The inner double quotes are escaped so the whole shell command stays
      # a single, valid YAML double-quoted string.
      test: ["CMD-SHELL", "python3 -c \"import urllib.request; urllib.request.urlopen('http://localhost:8000/health')\""]
      interval: 30s
      timeout: 10s
      retries: 30
      # Probe failures during the first 900s do not count toward `retries`.
      start_period: 900s
    restart: unless-stopped