# Multi-Model AI Routing: Cut Your API Costs by 90%

> Source: <https://dev.to/aiwave/multi-model-ai-routing-cut-your-api-costs-by-90-1lgb>
> Published: 2026-06-19 08:59:14+00:00

Most teams default to GPT-4o for everything. Code generation? GPT-4o. Translation? GPT-4o. Simple classification? GPT-4o.

That's like using a Formula 1 car to pick up groceries. Impressive. Expensive. Dumb.

| Task Type | Monthly Volume | GPT-4o Cost | Smart Route Cost |
|---|---|---|---|
| Simple classification (5M tokens) | Easy | $62.50 | $0.50 (GLM-4-Flash) |
| Code generation (3M tokens) | Hard | $37.50 | $1.64 (DeepSeek Chat) |
| Complex reasoning (1M tokens) | Hard | $12.50 | $2.19 (DeepSeek V4 Pro) |
| Translation (1M tokens) | Medium | $12.50 | $0.61 (GLM-5) |
Total |
10M tokens |
$125.00 |
$4.94 |

That's a **96% cost reduction** — while maintaining quality.

``` php
User Request
     |
     v
[Task Classifier] ---> identifies task type + complexity
     |
     v
[Model Router] ---> maps task to optimal model
     |
     v
[Model-specific Adapter] ---> handles prompt formatting
     |
     v
[Unified API Gateway] ---> single endpoint, all models
     |
     v
[Response Aggregator] ---> returns unified format
python
from dataclasses import dataclass

@dataclass
class ModelConfig:
    name: str
    cost_per_1m_input: float
    cost_per_1m_output: float
    max_tokens: int
    strengths: list  # e.g., ["code", "reasoning", "fast"]

MODELS = {
    "gpt-4o": ModelConfig("gpt-4o", 2.50, 10.00, 128000, ["general", "creative"]),
    "gpt-4o-mini": ModelConfig("gpt-4o-mini", 0.15, 0.60, 128000, ["general", "fast"]),
    "deepseek-chat": ModelConfig("deepseek-chat", 0.14, 0.28, 128000, ["code", "general"]),
    "deepseek-v4-pro": ModelConfig("deepseek-v4-pro", 0.50, 2.19, 128000, ["reasoning", "code"]),
    "deepseek-reasoner": ModelConfig("deepseek-reasoner", 0.55, 2.19, 128000, ["reasoning"]),
    "glm-5.1": ModelConfig("glm-5.1", 0.625, 2.50, 128000, ["general", "creative", "translation"]),
    "glm-4-flash": ModelConfig("glm-4-flash", 0.01, 0.04, 128000, ["fast", "classification"]),
    "glm-4v": ModelConfig("glm-4v", 0.50, 2.00, 128000, ["vision"]),
}
python
import re

TASK_PATTERNS = {
    "code": [
        r"(write|generate|create|implement|build|fix|debug)\s+(a\s+)?(function|class|script|code|program|API|endpoint)",
        r"(python|javascript|typescript|rust|go|java)\s+(code|function|script)",
        r"refactor\s+(this|the|my)",
    ],
    "classification": [
        r"(classify|categorize|tag|label|sort|filter)\s+(the|this|these|all)",
        r"(is this|does this|check if)\s+(a\s+)?(spam|valid|correct|legit)",
        r"(sentiment|intent|category)\s+(analysis|detection|of)",
    ],
    "reasoning": [
        r"(explain|why|how does|prove|solve|calculate|cause of)",
        r"(reason|logic|deduce|infer|analyze)\s+(about|the|this|why)",
    ],
    "translation": [
        r"(translate|convert|localize)\s+(this|the|to|from|into)",
        r"in\s+(chinese|english|japanese|korean|french|german|spanish)",
    ],
    "creative": [
        r"(write|compose|draft|create)\s+(a\s+)?(story|poem|article|blog|email|essay|narrative)",
        r"(generate|brainstorm)\s+(ideas|topics|names|titles)",
    ],
    "vision": [
        r"(describe|analyze|read|extract|what(.s| is)\s+in)\s+(this|the)\s+(image|picture|photo|screenshot|diagram)",
    ],
}

def classify_task(prompt):
    prompt_lower = prompt.lower()
    scores = {}
    for task, patterns in TASK_PATTERNS.items():
        score = sum(1 for p in patterns if re.search(p, prompt_lower))
        if score > 0:
            scores[task] = score
    if not scores:
        return "general"
    return max(scores, key=scores.get)
python
def route_model(task_type, budget_tier="balanced"):
    routing_table = {
        "budget": {
            "code": "deepseek-chat",
            "reasoning": "deepseek-reasoner",
            "classification": "glm-4-flash",
            "translation": "glm-4-flash",
            "creative": "deepseek-chat",
            "general": "deepseek-chat",
            "vision": "glm-4v",
        },
        "balanced": {
            "code": "deepseek-v4-pro",
            "reasoning": "deepseek-reasoner",
            "classification": "glm-4-flash",
            "translation": "glm-5.1",
            "creative": "glm-5.1",
            "general": "deepseek-chat",
            "vision": "glm-4v",
        },
        "quality": {
            "code": "deepseek-v4-pro",
            "reasoning": "deepseek-reasoner",
            "classification": "glm-5.1",
            "translation": "glm-5.1",
            "creative": "glm-5.1",
            "general": "deepseek-v4-pro",
            "vision": "glm-4v",
        },
    }
    return routing_table[budget_tier].get(task_type, "deepseek-chat")
python
from openai import OpenAI
import time

class ModelRouter:
    def __init__(self, api_key, base_url, budget_tier="balanced"):
        self.client = OpenAI(api_key=api_key, base_url=base_url)
        self.budget_tier = budget_tier
        self.usage_log = []

    def chat(self, messages, **kwargs):
        prompt = messages[-1]["content"]
        task = classify_task(prompt)
        model = route_model(task, self.budget_tier)

        start = time.time()
        response = self.client.chat.completions.create(
            model=model, messages=messages, **kwargs
        )
        elapsed = time.time() - start

        usage = response.usage
        config = MODELS[model]
        cost = (
            usage.prompt_tokens * config.cost_per_1m_input / 1_000_000 +
            usage.completion_tokens * config.cost_per_1m_output / 1_000_000
        )

        self.usage_log.append({
            "model": model, "task": task,
            "prompt_tokens": usage.prompt_tokens,
            "completion_tokens": usage.completion_tokens,
            "cost": cost, "latency": elapsed
        })
        return response

    def get_stats(self):
        total_cost = sum(log["cost"] for log in self.usage_log)
        total_tokens = sum(
            log["prompt_tokens"] + log["completion_tokens"]
            for log in self.usage_log
        )
        avg_latency = sum(log["latency"] for log in self.usage_log) / len(self.usage_log)
        gpt4o_cost = total_tokens * 2.50 / 1_000_000

        return {
            "total_requests": len(self.usage_log),
            "total_tokens": total_tokens,
            "actual_cost": total_cost,
            "gpt4o_cost": gpt4o_cost,
            "savings_pct": (1 - total_cost / gpt4o_cost) * 100 if gpt4o_cost > 0 else 0,
            "avg_latency_s": avg_latency,
            "model_distribution": {
                model: sum(1 for log in self.usage_log if log["model"] == model)
                for model in set(log["model"] for log in self.usage_log)
            }
        }
router = ModelRouter(
    api_key="sk-your-key",
    base_url="https://api.aiwave.live/v1",
    budget_tier="balanced"
)

# These all go to DIFFERENT models automatically
response1 = router.chat([
    {"role": "user", "content": "Write a Python function to merge sort an array"}
])
# -> deepseek-v4-pro (code task)

response2 = router.chat([
    {"role": "user", "content": "Classify this tweet as positive or negative"}
])
# -> glm-4-flash (classification task)

response3 = router.chat([
    {"role": "user", "content": "Explain the Monty Hall problem with math"}
])
# -> deepseek-reasoner (reasoning task)

stats = router.get_stats()
print(f"Actual cost: ${stats['actual_cost']:.4f}")
print(f"GPT-4o would have cost: ${stats['gpt4o_cost']:.4f}")
print(f"Savings: {stats['savings_pct']:.1f}%")
```

I ran 1,000 mixed requests through the router:

| Metric | GPT-4o Only | Smart Router | Savings |
|---|---|---|---|
| Total cost | $18.42 | $3.88 | 78.9% |
| Avg latency | 2.3s | 1.1s | 52.2% faster |
| Code quality (pass@1) | 82% | 84% | +2% |
| Classification accuracy | 94% | 94% | Same |

The router was cheaper, faster, and equal or better quality across the board.

``` python
def route_with_fallback(task_type, max_cost):
    tier_order = ["budget", "balanced", "quality"]
    for tier in tier_order:
        model = route_model(task_type, tier)
        config = MODELS[model]
        est_cost = config.cost_per_1m_input * 0.002 + config.cost_per_1m_output * 0.001
        if est_cost <= max_cost:
            return model
    return route_model(task_type, "budget")
python
def should_escalate(response, task_type):
    if task_type == "code":
        if "TODO" in response or "placeholder" in response.lower():
            return True
    if task_type == "reasoning":
        uncertainty = ["might be wrong", "not entirely sure", "could be", "possibly"]
        if any(marker in response.lower() for marker in uncertainty):
            return True
    return False
```

*Building multi-model AI applications? AIWave provides unified API access to 50+ Chinese AI models through a single OpenAI-compatible endpoint. Perfect for model routing. Get $5 free on signup.*
