{"slug": "benchmark-qwen3-6-27b-on-modal", "title": "Benchmark Qwen3.6 27B on Modal", "summary": "A developer benchmarked the Qwen3.6 27B model on Modal using llama.cpp, deploying a serverless pipeline that downloads GGUF shards from Hugging Face and runs perplexity evaluation on an A100-80GB GPU. The setup includes a custom CUDA container, a Modal volume for model caching, and automated logit dumping for comparison against a BF16 reference.", "body_md": "| import os | |\n| import modal | |\n| import subprocess | |\n| app = modal.App(\"llama-logits-kld\") | |\n| LLAMA_CUDA_URL = \"https://github.com/ai-dock/llama.cpp-cuda/releases/download/b9279/llama.cpp-b9279-cuda-12.8-amd64.tar.gz\" | |\n| vol = modal.Volume.from_name(\"qwen3-6-kld-vol\", create_if_missing=True) | |\n| image = ( | |\n| modal.Image.from_registry( | |\n| \"nvidia/cuda:12.4.0-runtime-ubuntu22.04\", | |\n| add_python=\"3.11\", | |\n| ) | |\n| .apt_install(\"wget\", \"tar\", \"libgomp1\", \"unzip\") | |\n| .run_commands( | |\n| f\"wget -q {LLAMA_CUDA_URL} -O /tmp/llama-cuda.tar.gz\", | |\n| \"mkdir -p /llama-bin && tar -xzf /tmp/llama-cuda.tar.gz -C /llama-bin --strip-components=1\", | |\n| \"echo '/llama-bin' >> /etc/ld.so.conf.d/llama.conf && ldconfig || true\", | |\n| \"echo '/llama-bin/lib' >> /etc/ld.so.conf.d/llama.conf && ldconfig || true\", | |\n| ) | |\n| .pip_install(\"huggingface_hub\") | |\n| ) | |\n| HF_REPO = \"unsloth/Qwen3.6-27B-GGUF\" | |\n| MODEL_DIR = \"/vol/model\" | |\n| MODEL_SHARDS = [ | |\n| \"Qwen3.6-27B-BF16-00001-of-00002.gguf\", | |\n| \"Qwen3.6-27B-BF16-00002-of-00002.gguf\", | |\n| ] | |\n| BF16_MODEL_FILE = MODEL_SHARDS[0] | |\n| LOGITS_FILE = \"/vol/logits-bf16.dat\" | |\n| def ensure_base_model(): | |\n| from huggingface_hub import hf_hub_download | |\n| model_dir = os.path.join(MODEL_DIR, \"BF16\") | |\n| os.makedirs(model_dir, exist_ok=True) | |\n| download_occurred = False | |\n| for shard in MODEL_SHARDS: | |\n| shard_path = os.path.join(model_dir, shard) | |\n| if not os.path.exists(shard_path): | |\n| print(f\"Downloading {shard} from {HF_REPO}...\") | |\n| hf_hub_download( | |\n| repo_id=HF_REPO, | |\n| filename=\"BF16/\" + shard, | |\n| local_dir=MODEL_DIR, | |\n| local_dir_use_symlinks=False, | |\n| ) | |\n| download_occurred = True | |\n| else: | |\n| print(f\"{shard} already exists in volume. Skipping download.\") | |\n| if download_occurred: | |\n| print(\"Committing downloaded model shards to volume...\") | |\n| vol.commit() | |\n| def resolve_model_path(source: str, value: str) -> str: | |\n| if source == \"volume\": | |\n| return value | |\n| if source == \"hf\": | |\n| from huggingface_hub import hf_hub_download | |\n| if \"::\" not in value: | |\n| raise ValueError(\"HF target must be formatted as 'repo_id::filename'\") | |\n| repo_id, filename = value.split(\"::\", 1) | |\n| local_dir = f\"/tmp/hf_models/{repo_id.replace('/', '__')}\" | |\n| print(f\"Downloading {filename} from {repo_id} ...\") | |\n| path = hf_hub_download( | |\n| repo_id=repo_id, | |\n| filename=filename, | |\n| local_dir=local_dir, | |\n| local_dir_use_symlinks=False, | |\n| ) | |\n| print(f\"Downloaded to {path}\") | |\n| return path | |\n| raise ValueError(f\"Unknown source: {source!r}. Use 'volume' or 'hf'.\") | |\n| @app.function( | |\n| image=image, | |\n| gpu=\"A100-80GB\", | |\n| timeout=14400, | |\n| volumes={\"/vol\": vol}, | |\n| ) | |\n| def run_kld_compare(source: str, value: str, label: str): | |\n| import glob | |\n| matches = glob.glob(\"/llama-bin/**/llama-perplexity\", recursive=True) | |\n| assert matches, \"llama-perplexity binary not found!\" | |\n| binary = matches[0] | |\n| # Download wikitext-2 test set | |\n| subprocess.run( | |\n| [ | |\n| \"wget\", \"-q\", | |\n| \"https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip\", | |\n| \"-O\", \"/tmp/wikitext.zip\", | |\n| ], | |\n| check=True, | |\n| ) | |\n| subprocess.run([\"unzip\", \"-q\", \"/tmp/wikitext.zip\", \"-d\", \"/tmp/wikitext\"], check=True) | |\n| dataset = \"/tmp/wikitext/wikitext-2-raw/wiki.test.raw\" | |\n| # Step 1: ensure base model exists on volume, then dump reference logits | |\n| ensure_base_model() | |\n| if not os.path.exists(LOGITS_FILE): | |\n| print(\"=== Step 1: Dumping BF16 reference logits ===\") | |\n| result = subprocess.run( | |\n| [ | |\n| binary, | |\n| \"-m\", f\"{MODEL_DIR}/BF16/{BF16_MODEL_FILE}\", | |\n| \"-f\", dataset, | |\n| \"-c\", \"8192\", | |\n| \"-ngl\", \"99\", | |\n| \"-ctk\", \"q8_0\", | |\n| \"-ctv\", \"q8_0\", | |\n| \"--save-all-logits\", LOGITS_FILE, | |\n| ], | |\n| capture_output=True, | |\n| text=True, | |\n| ) | |\n| print(result.stdout) | |\n| print(result.stderr) | |\n| vol.commit() | |\n| else: | |\n| print(f\"=== Step 1: Skipping — logits already at {LOGITS_FILE} ===\") | |\n| # Step 2: resolve model and run KLD | |\n| print(f\"\\n=== Step 2 [{label}]: Resolving model path ===\") | |\n| model_path = resolve_model_path(source, value) | |\n| print(f\"=== Step 2 [{label}]: Computing KL divergence ===\") | |\n| result = subprocess.run( | |\n| [ | |\n| binary, | |\n| \"-m\", model_path, | |\n| \"-f\", dataset, | |\n| \"-c\", \"8192\", | |\n| \"-ngl\", \"99\", | |\n| \"-ctk\", \"q8_0\", | |\n| \"-ctv\", \"q8_0\", | |\n| \"--kl-divergence-base\", LOGITS_FILE, | |\n| \"--kl-divergence\", | |\n| ], | |\n| capture_output=True, | |\n| text=True, | |\n| ) | |\n| print(result.stdout) | |\n| print(result.stderr) | |\n| # Persist results to volume | |\n| kld_output = f\"/vol/kld-{label}.txt\" | |\n| with open(kld_output, \"w\") as f: | |\n| f.write(result.stdout) | |\n| f.write(result.stderr) | |\n| vol.commit() | |\n| print(f\"Results saved to {kld_output}\") | |\n| return result.stdout + result.stderr | |\n| @app.local_entrypoint() | |\n| def main( | |\n| source: str, | |\n| value: str, | |\n| label: str = \"\", | |\n| ): | |\n| \"\"\" | |\n| Examples: | |\n| # volume model | |\n| modal run kld.py --source volume --value /vol/model/Qwen3.6-27B-attn_q5_q3.gguf --label attn_q5_q3 | |\n| # huggingface model (repo_id::filename) | |\n| modal run kld.py --source hf --value \"unsloth/Qwen3.6-27B-GGUF::Qwen3.6-27B-Q4_K_M.gguf\" --label Q4_K_M | |\n| \"\"\" | |\n| if not label: | |\n| label = value.split(\"::\")[-1].replace(\".gguf\", \"\") | |\n| output = run_kld_compare.remote(source=source, value=value, label=label) | |\n| print(output) |", "url": "https://wpnews.pro/news/benchmark-qwen3-6-27b-on-modal", "canonical_source": "https://gist.github.com/huytd/ac6457b4581598a198c027e4051380de", "published_at": "2026-05-29 18:13:57+00:00", "updated_at": "2026-06-05 03:14:21.368204+00:00", "lang": "en", "topics": ["large-language-models", "artificial-intelligence", "ai-infrastructure", "ai-tools"], "entities": ["Modal", "Qwen", "Unsloth", "Hugging Face", "llama.cpp", "NVIDIA", "CUDA"], "alternates": {"html": "https://wpnews.pro/news/benchmark-qwen3-6-27b-on-modal", "markdown": "https://wpnews.pro/news/benchmark-qwen3-6-27b-on-modal.md", "text": "https://wpnews.pro/news/benchmark-qwen3-6-27b-on-modal.txt", "jsonld": "https://wpnews.pro/news/benchmark-qwen3-6-27b-on-modal.jsonld"}}