"""Benchmark Qwen3.6 27B on Modal.

A developer benchmarked the Qwen3.6 27B model on Modal using llama.cpp,
deploying a serverless pipeline that downloads GGUF shards from Hugging Face
and runs perplexity evaluation on an A100-80GB GPU. The setup includes a
custom CUDA container, a Modal volume for model caching, and automated logit
dumping for comparison against a BF16 reference.

Pipeline, as implemented below:

1. Make sure the BF16 reference shards are present on the Modal volume.
2. Dump the BF16 reference logits over the wikitext-2 test set (once; the
   dump is cached on the volume).
3. Resolve the candidate model (a path on the volume or a Hugging Face
   ``repo_id::filename`` target) and compute its KL divergence against the
   reference logits with ``llama-perplexity``.
"""

import os
import subprocess

import modal

app = modal.App("llama-logits-kld")

LLAMA_CUDA_URL = (
    "https://github.com/ai-dock/llama.cpp-cuda/releases/download/b9279/"
    "llama.cpp-b9279-cuda-12.8-amd64.tar.gz"
)

vol = modal.Volume.from_name("qwen3-6-kld-vol", create_if_missing=True)

# NOTE(review): the llama.cpp archive is a CUDA 12.8 build while the base image
# ships the CUDA 12.4 runtime -- confirm the archive bundles the libraries it
# needs, or bump the base image tag.
image = (
    modal.Image.from_registry(
        "nvidia/cuda:12.4.0-runtime-ubuntu22.04",
        add_python="3.11",
    )
    .apt_install("wget", "tar", "libgomp1", "unzip")
    .run_commands(
        f"wget -q {LLAMA_CUDA_URL} -O /tmp/llama-cuda.tar.gz",
        "mkdir -p /llama-bin && tar -xzf /tmp/llama-cuda.tar.gz -C /llama-bin --strip-components=1",
        # Register the extracted directories with the dynamic linker. The
        # second entry is appended (>>) so it does not overwrite the first.
        "echo '/llama-bin' > /etc/ld.so.conf.d/llama.conf && ldconfig || true",
        "echo '/llama-bin/lib' >> /etc/ld.so.conf.d/llama.conf && ldconfig || true",
    )
    .pip_install("huggingface_hub")
)

HF_REPO = "unsloth/Qwen3.6-27B-GGUF"
MODEL_DIR = "/vol/model"
MODEL_SHARDS = [
    "Qwen3.6-27B-BF16-00001-of-00002.gguf",
    "Qwen3.6-27B-BF16-00002-of-00002.gguf",
]
# llama-perplexity is pointed at the first shard of the split model.
BF16_MODEL_FILE = MODEL_SHARDS[0]
LOGITS_FILE = "/vol/logits-bf16.dat"

WIKITEXT_URL = (
    "https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip"
)
WIKITEXT_DIR = "/tmp/wikitext"
WIKITEXT_TEST_FILE = f"{WIKITEXT_DIR}/wikitext-2-raw/wiki.test.raw"

VALID_SOURCES = ("volume", "hf")


def _validate_target(source: str, value: str) -> None:
    """Raise ``ValueError`` if ``source``/``value`` cannot be resolved."""
    if source not in VALID_SOURCES:
        raise ValueError(f"Unknown source: {source!r}. Use 'volume' or 'hf'.")
    if source == "hf" and "::" not in value:
        raise ValueError("HF target must be formatted as 'repo_id::filename'")


def ensure_base_model():
    """Download any missing BF16 reference shards onto the volume.

    Shards already present under ``MODEL_DIR/BF16`` are skipped. The volume
    is committed only when at least one shard was downloaded.
    """
    from huggingface_hub import hf_hub_download

    model_dir = os.path.join(MODEL_DIR, "BF16")
    os.makedirs(model_dir, exist_ok=True)

    download_occurred = False
    for shard in MODEL_SHARDS:
        shard_path = os.path.join(model_dir, shard)
        if os.path.exists(shard_path):
            print(f"{shard} already exists in volume. Skipping download.")
            continue

        print(f"Downloading {shard} from {HF_REPO}...")
        # The repo stores the shards under "BF16/", so with local_dir=MODEL_DIR
        # they land in MODEL_DIR/BF16/<shard>, matching shard_path above.
        # `local_dir_use_symlinks` is deprecated and ignored since
        # huggingface_hub 0.23, so it is no longer passed.
        hf_hub_download(
            repo_id=HF_REPO,
            filename="BF16/" + shard,
            local_dir=MODEL_DIR,
        )
        download_occurred = True

    if download_occurred:
        print("Committing downloaded model shards to volume...")
        vol.commit()


def resolve_model_path(source: str, value: str) -> str:
    """Return a local filesystem path for the model to evaluate.

    Args:
        source: ``"volume"`` to use ``value`` as a path as-is, or ``"hf"`` to
            download from Hugging Face.
        value: A filesystem path (``volume``) or ``repo_id::filename`` (``hf``).

    Returns:
        Path to the model file inside the container.

    Raises:
        ValueError: If ``source`` is unknown or an ``hf`` value lacks ``::``.
    """
    _validate_target(source, value)

    if source == "volume":
        return value

    from huggingface_hub import hf_hub_download

    repo_id, filename = value.split("::", 1)
    local_dir = f"/tmp/hf_models/{repo_id.replace('/', '_')}"
    print(f"Downloading {filename} from {repo_id} ...")
    path = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        local_dir=local_dir,
    )
    print(f"Downloaded to {path}")
    return path


def _find_perplexity_binary() -> str:
    """Locate the ``llama-perplexity`` executable under ``/llama-bin``."""
    import glob

    matches = glob.glob("/llama-bin/**/llama-perplexity", recursive=True)
    if not matches:
        # Explicit raise rather than assert: asserts are stripped under -O.
        raise FileNotFoundError("llama-perplexity binary not found")
    return matches[0]


def _ensure_dataset() -> str:
    """Download and unpack the wikitext-2 test set; return the test file path.

    The download is skipped when the file is already present, so a reused
    container does not re-fetch it or trip over existing files in ``unzip``.
    """
    if not os.path.exists(WIKITEXT_TEST_FILE):
        subprocess.run(
            ["wget", "-q", WIKITEXT_URL, "-O", "/tmp/wikitext.zip"],
            check=True,
        )
        subprocess.run(
            ["unzip", "-q", "-o", "/tmp/wikitext.zip", "-d", WIKITEXT_DIR],
            check=True,
        )
    return WIKITEXT_TEST_FILE


def _run_perplexity(binary: str, model_path: str, dataset: str, extra_args: list):
    """Run ``llama-perplexity`` with the shared settings plus ``extra_args``.

    Output is captured, echoed to the container log, and returned as the
    ``subprocess.CompletedProcess``; the caller decides how to treat a
    non-zero exit status.
    """
    result = subprocess.run(
        [
            binary,
            "-m", model_path,
            "-f", dataset,
            "-c", "8192",
            "-ngl", "99",
            "-ctk", "q8_0",
            "-ctv", "q8_0",
            *extra_args,
        ],
        capture_output=True,
        text=True,
    )
    print(result.stdout)
    print(result.stderr)
    return result


@app.function(
    image=image,
    gpu="A100-80GB",
    timeout=14400,
    volumes={"/vol": vol},
)
def run_kld_compare(source: str, value: str, label: str):
    """Compute the KL divergence of a model against the BF16 reference.

    Args:
        source: ``"volume"`` or ``"hf"`` (see ``resolve_model_path``).
        value: Model path on the volume, or ``repo_id::filename``.
        label: Tag used in log lines and in the results filename.

    Returns:
        Combined stdout and stderr of the KL-divergence run.

    Raises:
        ValueError: If ``source``/``value`` are malformed.
        FileNotFoundError: If the ``llama-perplexity`` binary is missing.
        RuntimeError: If either ``llama-perplexity`` run exits non-zero.
    """
    # Fail on a malformed target before spending GPU time on the reference dump.
    _validate_target(source, value)

    binary = _find_perplexity_binary()

    # Download wikitext-2 test set
    dataset = _ensure_dataset()

    # Step 1: ensure base model exists on volume, then dump reference logits
    ensure_base_model()
    if not os.path.exists(LOGITS_FILE):
        print("=== Step 1: Dumping BF16 reference logits ===")
        result = _run_perplexity(
            binary,
            f"{MODEL_DIR}/BF16/{BF16_MODEL_FILE}",
            dataset,
            ["--save-all-logits", LOGITS_FILE],
        )
        if result.returncode != 0:
            # Never leave a partial dump behind: later runs skip Step 1
            # whenever the file exists and would compare against garbage.
            try:
                os.remove(LOGITS_FILE)
            except FileNotFoundError:
                pass
            raise RuntimeError(
                f"Reference logits dump failed with exit code {result.returncode}"
            )
        vol.commit()
    else:
        print(f"=== Step 1: Skipping — logits already at {LOGITS_FILE} ===")

    # Step 2: resolve model and run KLD
    print(f"\n=== Step 2 ({label}): Resolving model path ===")
    model_path = resolve_model_path(source, value)

    print(f"=== Step 2 ({label}): Computing KL divergence ===")
    result = _run_perplexity(
        binary,
        model_path,
        dataset,
        ["--kl-divergence-base", LOGITS_FILE, "--kl-divergence"],
    )

    # Persist results to volume. Path separators in the label are replaced so
    # the output always lands directly under /vol.
    safe_label = label.replace("/", "_")
    kld_output = f"/vol/kld-{safe_label}.txt"
    with open(kld_output, "w") as f:
        f.write(result.stdout)
        f.write(result.stderr)
    vol.commit()
    print(f"Results saved to {kld_output}")

    if result.returncode != 0:
        # The log is saved above for debugging, but a failed run must not be
        # reported to the caller as if it were a result.
        raise RuntimeError(
            f"KL divergence run failed with exit code {result.returncode}; "
            f"see {kld_output}"
        )

    return result.stdout + result.stderr


@app.local_entrypoint()
def main(
    source: str,
    value: str,
    label: str = "",
):
    """
    Examples:
      # volume model
      modal run kld.py --source volume --value /vol/model/Qwen3.6-27B-attn_q5_q3.gguf --label attn_q5_q3
      # huggingface model (repo_id::filename)
      modal run kld.py --source hf --value "unsloth/Qwen3.6-27B-GGUF::Qwen3.6-27B-Q4_K_M.gguf" --label Q4_K_M
    """
    # Reject bad arguments locally, before a GPU container is started.
    _validate_target(source, value)

    if not label:
        # Default label: the model's file name without directories or the
        # ".gguf" suffix, so it is safe to embed in the results filename.
        label = os.path.basename(value.split("::")[-1])
        if label.endswith(".gguf"):
            label = label[: -len(".gguf")]

    output = run_kld_compare.remote(source=source, value=value, label=label)
    print(output)