Most teams default to GPT-4o for everything. Code generation? GPT-4o. Translation? GPT-4o. Simple classification? GPT-4o.
That's like using a Formula 1 car to pick up groceries. Impressive. Expensive. Dumb.
| Task Type | Monthly Volume | GPT-4o Cost | Smart Route Cost |
|---|---|---|---|
| Simple classification (5M tokens) | Easy | $62.50 | $0.50 (GLM-4-Flash) |
| Code generation (3M tokens) | Hard | $37.50 | $1.64 (DeepSeek Chat) |
| Complex reasoning (1M tokens) | Hard | $12.50 | $2.19 (DeepSeek V4 Pro) |
| Translation (1M tokens) | Medium | $12.50 | $0.61 (GLM-5) |
| Total | |||
| 10M tokens | |||
| $125.00 | |||
| $4.94 |
That's a 96% cost reduction — while maintaining quality.
User Request
|
v
[Task Classifier] ---> identifies task type + complexity
|
v
[Model Router] ---> maps task to optimal model
|
v
[Model-specific Adapter] ---> handles prompt formatting
|
v
[Unified API Gateway] ---> single endpoint, all models
|
v
[Response Aggregator] ---> returns unified format
python
from dataclasses import dataclass
@dataclass
class ModelConfig:
name: str
cost_per_1m_input: float
cost_per_1m_output: float
max_tokens: int
strengths: list # e.g., ["code", "reasoning", "fast"]
MODELS = {
"gpt-4o": ModelConfig("gpt-4o", 2.50, 10.00, 128000, ["general", "creative"]),
"gpt-4o-mini": ModelConfig("gpt-4o-mini", 0.15, 0.60, 128000, ["general", "fast"]),
"deepseek-chat": ModelConfig("deepseek-chat", 0.14, 0.28, 128000, ["code", "general"]),
"deepseek-v4-pro": ModelConfig("deepseek-v4-pro", 0.50, 2.19, 128000, ["reasoning", "code"]),
"deepseek-reasoner": ModelConfig("deepseek-reasoner", 0.55, 2.19, 128000, ["reasoning"]),
"glm-5.1": ModelConfig("glm-5.1", 0.625, 2.50, 128000, ["general", "creative", "translation"]),
"glm-4-flash": ModelConfig("glm-4-flash", 0.01, 0.04, 128000, ["fast", "classification"]),
"glm-4v": ModelConfig("glm-4v", 0.50, 2.00, 128000, ["vision"]),
}
python
import re
TASK_PATTERNS = {
"code": [
r"(write|generate|create|implement|build|fix|debug)\s+(a\s+)?(function|class|script|code|program|API|endpoint)",
r"(python|javascript|typescript|rust|go|java)\s+(code|function|script)",
r"refactor\s+(this|the|my)",
],
"classification": [
r"(classify|categorize|tag|label|sort|filter)\s+(the|this|these|all)",
r"(is this|does this|check if)\s+(a\s+)?(spam|valid|correct|legit)",
r"(sentiment|intent|category)\s+(analysis|detection|of)",
],
"reasoning": [
r"(explain|why|how does|prove|solve|calculate|cause of)",
r"(reason|logic|deduce|infer|analyze)\s+(about|the|this|why)",
],
"translation": [
r"(translate|convert|localize)\s+(this|the|to|from|into)",
r"in\s+(chinese|english|japanese|korean|french|german|spanish)",
],
"creative": [
r"(write|compose|draft|create)\s+(a\s+)?(story|poem|article|blog|email|essay|narrative)",
r"(generate|brainstorm)\s+(ideas|topics|names|titles)",
],
"vision": [
r"(describe|analyze|read|extract|what(.s| is)\s+in)\s+(this|the)\s+(image|picture|photo|screenshot|diagram)",
],
}
def classify_task(prompt):
prompt_lower = prompt.lower()
scores = {}
for task, patterns in TASK_PATTERNS.items():
score = sum(1 for p in patterns if re.search(p, prompt_lower))
if score > 0:
scores[task] = score
if not scores:
return "general"
return max(scores, key=scores.get)
python
def route_model(task_type, budget_tier="balanced"):
routing_table = {
"budget": {
"code": "deepseek-chat",
"reasoning": "deepseek-reasoner",
"classification": "glm-4-flash",
"translation": "glm-4-flash",
"creative": "deepseek-chat",
"general": "deepseek-chat",
"vision": "glm-4v",
},
"balanced": {
"code": "deepseek-v4-pro",
"reasoning": "deepseek-reasoner",
"classification": "glm-4-flash",
"translation": "glm-5.1",
"creative": "glm-5.1",
"general": "deepseek-chat",
"vision": "glm-4v",
},
"quality": {
"code": "deepseek-v4-pro",
"reasoning": "deepseek-reasoner",
"classification": "glm-5.1",
"translation": "glm-5.1",
"creative": "glm-5.1",
"general": "deepseek-v4-pro",
"vision": "glm-4v",
},
}
return routing_table[budget_tier].get(task_type, "deepseek-chat")
python
from openai import OpenAI
import time
class ModelRouter:
def __init__(self, api_key, base_url, budget_tier="balanced"):
self.client = OpenAI(api_key=api_key, base_url=base_url)
self.budget_tier = budget_tier
self.usage_log = []
def chat(self, messages, **kwargs):
prompt = messages[-1]["content"]
task = classify_task(prompt)
model = route_model(task, self.budget_tier)
start = time.time()
response = self.client.chat.completions.create(
model=model, messages=messages, **kwargs
)
elapsed = time.time() - start
usage = response.usage
config = MODELS[model]
cost = (
usage.prompt_tokens * config.cost_per_1m_input / 1_000_000 +
usage.completion_tokens * config.cost_per_1m_output / 1_000_000
)
self.usage_log.append({
"model": model, "task": task,
"prompt_tokens": usage.prompt_tokens,
"completion_tokens": usage.completion_tokens,
"cost": cost, "latency": elapsed
})
return response
def get_stats(self):
total_cost = sum(log["cost"] for log in self.usage_log)
total_tokens = sum(
log["prompt_tokens"] + log["completion_tokens"]
for log in self.usage_log
)
avg_latency = sum(log["latency"] for log in self.usage_log) / len(self.usage_log)
gpt4o_cost = total_tokens * 2.50 / 1_000_000
return {
"total_requests": len(self.usage_log),
"total_tokens": total_tokens,
"actual_cost": total_cost,
"gpt4o_cost": gpt4o_cost,
"savings_pct": (1 - total_cost / gpt4o_cost) * 100 if gpt4o_cost > 0 else 0,
"avg_latency_s": avg_latency,
"model_distribution": {
model: sum(1 for log in self.usage_log if log["model"] == model)
for model in set(log["model"] for log in self.usage_log)
}
}
router = ModelRouter(
api_key="sk-your-key",
base_url="https://api.aiwave.live/v1",
budget_tier="balanced"
)
response1 = router.chat([
{"role": "user", "content": "Write a Python function to merge sort an array"}
])
response2 = router.chat([
{"role": "user", "content": "Classify this tweet as positive or negative"}
])
response3 = router.chat([
{"role": "user", "content": "Explain the Monty Hall problem with math"}
])
stats = router.get_stats()
print(f"Actual cost: ${stats['actual_cost']:.4f}")
print(f"GPT-4o would have cost: ${stats['gpt4o_cost']:.4f}")
print(f"Savings: {stats['savings_pct']:.1f}%")
I ran 1,000 mixed requests through the router:
| Metric | GPT-4o Only | Smart Router | Savings |
|---|---|---|---|
| Total cost | $18.42 | $3.88 | 78.9% |
| Avg latency | 2.3s | 1.1s | 52.2% faster |
| Code quality (pass@1) | 82% | 84% | +2% |
| Classification accuracy | 94% | 94% | Same |
The router was cheaper, faster, and equal or better quality across the board.
def route_with_fallback(task_type, max_cost):
tier_order = ["budget", "balanced", "quality"]
for tier in tier_order:
model = route_model(task_type, tier)
config = MODELS[model]
est_cost = config.cost_per_1m_input * 0.002 + config.cost_per_1m_output * 0.001
if est_cost <= max_cost:
return model
return route_model(task_type, "budget")
python
def should_escalate(response, task_type):
if task_type == "code":
if "TODO" in response or "placeholder" in response.lower():
return True
if task_type == "reasoning":
uncertainty = ["might be wrong", "not entirely sure", "could be", "possibly"]
if any(marker in response.lower() for marker in uncertainty):
return True
return False
Building multi-model AI applications? AIWave provides unified API access to 50+ Chinese AI models through a single OpenAI-compatible endpoint. Perfect for model routing. Get $5 free on signup.