OpenAI-compatible proxy for DeepSeek V4 Flash with intelligent auto context compression features

This article describes a Python script that acts as an OpenAI-compatible proxy for the DeepSeek V4 Flash model and reduces API usage through automatic context compression. The proxy compresses system prompts, deduplicates fenced Markdown blocks and repeated boilerplate segments in user messages (identified by SHA-256 fingerprints), and summarizes the oldest part of the conversation when the token budget is exceeded. It also caches assistant reasoning so it can be re-attached on later turns, and it overrides the client-supplied model, output-token limit, and thinking mode with a fixed global configuration.

#!/usr/bin/env python3
"""
Zero-dependency OpenAI-compatible proxy for DeepSeek V4 Flash.

Author: g023
License: MIT

The client-supplied model, max output tokens and thinking mode are
**ignored**.  The proxy always uses the values defined in the global
configuration (see --help and the constants below).

Optimisations:
  - System prompt compression (auto-summarized via DeepSeek API; originals
    stored in ./pre_sys/, summaries cached in ./post_sys/)
  - Markdown block deduplication (keeps only the latest occurrence full)
  - Conversation summarisation (triggers when token budget is exceeded)
  - Assistant reasoning is cached to avoid redundant re-generation
  - Inter-message content fingerprinting & deduplication (Feature F-1)
      - Removes repeated boilerplate segments (environment_info, userMemory,
        reminderInstructions, etc.) from user messages across conversation
        turns.
      - Segments are hashed (SHA-256), duplicates replaced with an empty
        string (or a minimal placeholder if the message becomes empty).
      - Per-conversation fingerprint storage with oldest-first eviction.

Reads from local file K.dat for API key if DEEPSEEK_API_KEY env var is not set.

Just a proof of concept pet project. Do not expose this server to the internet.
"""

import argparse
import collections
import copy
import hashlib
import http.server
import json
import logging
import os
import re
import signal
import socketserver
import sys
import threading
import time
import urllib.error
import urllib.request
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

# ==============================================================================
# Global configuration – these forcibly override every client request
# ==============================================================================
DEEPSEEK_BASE = "https://api.deepseek.com"
DEFAULT_MODEL = "deepseek-v4-flash"  # "deepseek-v4-flash" | "deepseek-v4-pro"; model that will always be used
MAX_CACHE_SIZE = 500                 # LRU cache for assistant reasoning
MAX_CONTEXT = 128000                 # tokens (context size)
SUMMARY_RATIO = 0.8                  # trigger summarisation at 80 % of MAX_CONTEXT
SUMMARY_MODEL = DEFAULT_MODEL        # model used for the summarisation call
MAX_OUTPUT_TOKENS = 128000           # max tokens to generate (overrides client)
THINKING_MODE = "auto"               # "enabled", "disabled", or "auto" (default)

# --------------------------------------------------------------------------
# Local file save toggles – set to False to disable disk writes
# --------------------------------------------------------------------------
SAVE_PREPOST_MSGS = False    # save pre/post message dumps to ./pre_msg/ and ./post_msg/
SAVE_PREPOST_SYSTEM = True   # save original/summarized system prompts to ./pre_sys/ and ./post_sys/

# --------------------------------------------------------------------------
# Retry configuration for summarisation calls
# --------------------------------------------------------------------------
SUMMARISE_MAX_RETRIES = 3
SUMMARISE_RETRY_BASE_SLEEP = 2.0   # seconds, doubled each attempt

# --------------------------------------------------------------------------
# Feature F-1: Inter-message content fingerprinting & deduplication
# --------------------------------------------------------------------------
MAX_FINGERPRINT_HISTORY = 100   # max number of segments stored per conversation

# Known boilerplate XML tags – each as (open_tag, close_tag)
BOILERPLATE_PATTERNS = {
    "environment_info": ("<environment_info>", "</environment_info>"),
    "workspace_info": ("<workspace_info>", "</workspace_info>"),
    "userMemory": ("<userMemory>", "</userMemory>"),
    "sessionMemory": ("<sessionMemory>", "</sessionMemory>"),
    "repoMemory": ("<repoMemory>", "</repoMemory>"),
    "context": ("<context>", "</context>"),
    "reminderInstructions": ("<reminderInstructions>", "</reminderInstructions>"),
    "additional_skills_reminder": ("<additional_skills_reminder>", "</additional_skills_reminder>"),
    "editorContext": ("<editorContext>", "</editorContext>"),
}

# Fingerprint storage: conv_id -> { segment_hash: (first_index, full_segment_text) }
segment_fingerprints: Dict[str, Dict[str, Tuple[int, str]]] = {}
segment_fp_lock = threading.Lock()


def _read_api_key() -> str:
    """Return the API key from DEEPSEEK_API_KEY or ./K.dat, or "" if neither yields one."""
    key = os.environ.get("DEEPSEEK_API_KEY", "").strip()
    if key:
        return key
    try:
        with open("K.dat", "r", encoding="utf-8") as f:
            return f.read().strip()
    except (OSError, UnicodeDecodeError):
        return ""


# Read the DeepSeek API key
DSEEK_KEY = _read_api_key()
if not DSEEK_KEY:
    print("ERROR: DEEPSEEK_API_KEY environment variable not set and K.dat not found.", file=sys.stderr)
    # Abort only when executed as a script, so that importing this module
    # (e.g. to unit-test the helpers) does not kill the interpreter.
    if __name__ == "__main__":
        sys.exit(1)

# ==============================================================================
# Logging / dev feedback – directories for pre/post messages
# ==============================================================================
PRE_MSG_DIR = Path("./pre_msg")
POST_MSG_DIR = Path("./post_msg")


def save_json_message(dir_path: Path, prefix: str, messages: list, msg_hash: str):
    """Save a message list as a tab-indented JSON file.

    The file is named ``<timestamp>_<msg_hash>.json`` inside *dir_path*.
    *prefix* is only used in the log line emitted on failure.  Best effort:
    any error is logged and swallowed so a failed dump never breaks a request.
    """
    try:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        filepath = dir_path / f"{timestamp}_{msg_hash}.json"
        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(messages, f, ensure_ascii=False, indent="\t")
    except Exception:
        logging.exception(f"Failed to save {prefix} message dump")


def md5_of_messages(messages: list) -> str:
    """Short 8-char MD5 hash of the messages list (stable).

    Used only as a log/file-name correlation id, not for security.
    """
    # "surrogatepass" keeps hashing from raising on lone surrogates, which
    # json.loads can produce from escapes such as "\ud800".
    data = json.dumps(messages, sort_keys=True, ensure_ascii=False).encode("utf-8", "surrogatepass")
    return hashlib.md5(data).hexdigest()[:8]


# ==============================================================================
# Improved token estimation – CJK-aware heuristic
# ==============================================================================
CJK_RANGES = [
    (0x4E00, 0x9FFF),    # CJK Unified Ideographs
    (0x3400, 0x4DBF),    # CJK Unified Ideographs Extension A
    (0x20000, 0x2A6DF),  # Extension B
    (0x2A700, 0x2B73F),  # Extension C
    (0x2B740, 0x2B81F),  # Extension D
    (0x2B820, 0x2CEAF),  # Extension E
    (0x2CEB0, 0x2EBEF),  # Extension F
    (0x30000, 0x3134F),  # Extension G
    (0x31350, 0x323AF),  # Extension H
]
CJK_PUNCT = {0x3000, 0x3001, 0x3002, 0xFF0C, 0xFF0E, 0xFF1A, 0xFF1B, 0xFF01,
             0xFF1F, 0x300C, 0x300D, 0x300E, 0x300F, 0x3010, 0x3011, 0x300A, 0x300B}


def is_cjk(cp: int) -> bool:
    """Return True if code point *cp* is a CJK ideograph or listed CJK punctuation."""
    if cp in CJK_PUNCT:
        return True
    return any(lo <= cp <= hi for lo, hi in CJK_RANGES)


def estimate_tokens(text: str) -> int:
    """
    Token count heuristic:
      - For CJK-heavy text (> 50% CJK characters): 1 token per character.
      - Otherwise: 1 token per 3.5 characters (conservative for code/English).

    Returns 0 for an empty string and for non-string input (e.g. the
    ``None`` content of an assistant message that only carries tool calls).
    """
    if not isinstance(text, str) or not text:
        return 0
    total = len(text)
    cjk_count = sum(1 for ch in text if is_cjk(ord(ch)))
    if cjk_count / total > 0.5:
        # Mostly CJK: each character is roughly one token
        return max(1, total)
    return max(1, int(total // 3.5))


# ==============================================================================
# LRU Cache for assistant reasoning
# ==============================================================================
class LRUCache:
    """Small thread-safe least-recently-used mapping bounded to *maxsize* entries."""

    def __init__(self, maxsize: int):
        self.maxsize = maxsize
        self._cache = collections.OrderedDict()
        self._lock = threading.Lock()

    def get(self, key: str):
        """Return the value stored under *key* (marking it most recent), or None."""
        with self._lock:
            if key in self._cache:
                self._cache.move_to_end(key)
                return self._cache[key]
            return None

    def set(self, key: str, value: dict):
        """Store *value* under *key*, evicting the least recently used entry when full."""
        with self._lock:
            if key in self._cache:
                self._cache.move_to_end(key)
            self._cache[key] = value
            if len(self._cache) > self.maxsize:
                self._cache.popitem(last=False)


assistant_cache = LRUCache(MAX_CACHE_SIZE)


# ==============================================================================
# Hashing utilities
# ==============================================================================
def stable_hash(obj: Any) -> str:
    """Stable SHA256 hash of a JSON-serialisable object (dict or list)."""
    data = json.dumps(obj, sort_keys=True, ensure_ascii=False).encode("utf-8", "surrogatepass")
    return hashlib.sha256(data).hexdigest()


def conv_hash(messages: List[dict]) -> str:
    """Hash of a message list – only fields that affect the conversation identity."""
    important_keys = {"role", "content", "tool_calls", "name", "tool_call_id"}
    cleaned = [{k: v for k, v in m.items() if k in important_keys} for m in messages]
    return stable_hash(cleaned)


# ==============================================================================
# System prompt compression – auto-summarize via DeepSeek API
# ==============================================================================
PRE_SYS_DIR = Path("./pre_sys")
POST_SYS_DIR = Path("./post_sys")
sys_lock = threading.Lock()


def ensure_sys_dirs():
    """Create the original/summarized system prompt directories if missing."""
    PRE_SYS_DIR.mkdir(parents=True, exist_ok=True)
    POST_SYS_DIR.mkdir(parents=True, exist_ok=True)


def sys_original_path(sys_hash: str) -> Path:
    """Path of the stored original system prompt for *sys_hash*."""
    return PRE_SYS_DIR / f"{sys_hash}.txt"


def sys_summary_path(sys_hash: str) -> Path:
    """Path of the cached summarized system prompt for *sys_hash*."""
    return POST_SYS_DIR / f"{sys_hash}.txt"


def load_summarized_prompt(sys_hash: str) -> Optional[str]:
    """Return the cached summarized prompt if it exists, else None."""
    try:
        return sys_summary_path(sys_hash).read_text(encoding="utf-8")
    except FileNotFoundError:
        return None
    except (OSError, UnicodeDecodeError):
        logging.warning(f"Failed to read summarized prompt {sys_hash}")
        return None


def _atomic_write(path: Path, content: str) -> None:
    """Write *content* to *path* via a temp file + rename, under ``sys_lock``."""
    tmp_path = path.with_suffix(".tmp")
    with sys_lock:
        # The directory may not exist yet (e.g. ensure_sys_dirs was skipped).
        path.parent.mkdir(parents=True, exist_ok=True)
        tmp_path.write_text(content, encoding="utf-8")
        # replace() overwrites an existing target on every platform;
        # rename() raises on Windows when the target already exists.
        tmp_path.replace(path)


def save_original_prompt(sys_hash: str, content: str):
    """Atomically save the original system prompt to disk (thread-safe).

    Does nothing if a file for *sys_hash* already exists.  Failures are
    logged as warnings, never raised.
    """
    path = sys_original_path(sys_hash)
    if path.exists():
        return  # already saved
    try:
        _atomic_write(path, content)
    except Exception as e:
        logging.warning(f"Failed to save original prompt {sys_hash}: {e}")


def save_summarized_prompt(sys_hash: str, content: str):
    """Atomically write a summarized prompt to disk (thread-safe).

    Failures are logged as warnings, never raised.
    """
    try:
        _atomic_write(sys_summary_path(sys_hash), content)
    except Exception as e:
        logging.warning(f"Failed to write summarized prompt {sys_hash}: {e}")


def summarize_system_prompt(original: str, api_key: str) -> str:
    """Call DeepSeek to produce a concise summary of a system prompt.

    Retries up to SUMMARISE_MAX_RETRIES times with exponential backoff.
    An empty or non-string summary counts as a failed attempt, so a blank
    reply can never replace the real system prompt.

    Raises:
        RuntimeError: if every attempt fails.
    """
    summary_prompt = (
        "You are a prompt compression assistant. Summarize the following system prompt "
        "as concisely as possible while preserving ALL critical instructions, constraints, "
        "formatting rules, and behavioral guidelines. Remove redundancy, examples, and "
        "verbose explanations. Output ONLY the compressed prompt — no commentary.\n\n"
        f"{original}"
    )
    payload = {
        "model": SUMMARY_MODEL,
        "messages": [{"role": "user", "content": summary_prompt}],
        "max_tokens": 2000,
        "temperature": 0.0,
        "thinking": {"type": "disabled"},
    }
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    last_exc = None
    for attempt in range(1, SUMMARISE_MAX_RETRIES + 1):
        try:
            req = urllib.request.Request(
                f"{DEEPSEEK_BASE}/chat/completions",
                data=json.dumps(payload).encode(),
                headers=headers,
                method="POST",
            )
            with urllib.request.urlopen(req, timeout=120) as resp:
                body = json.loads(resp.read().decode())
            summary = body["choices"][0]["message"]["content"]
            if not isinstance(summary, str) or not summary.strip():
                raise ValueError("upstream returned an empty summary")
            return summary
        except Exception as e:
            last_exc = e
            if attempt < SUMMARISE_MAX_RETRIES:
                sleep_time = SUMMARISE_RETRY_BASE_SLEEP * (2 ** (attempt - 1))
                logging.warning(
                    "System prompt summarization attempt %d failed, retrying in %.1fs: %s",
                    attempt, sleep_time, e,
                )
                time.sleep(sleep_time)
            else:
                logging.error(
                    "System prompt summarization failed after %d attempts",
                    SUMMARISE_MAX_RETRIES,
                )
    raise RuntimeError(f"System prompt summarization failed: {last_exc}")
# ==============================================================================
# Markdown block deduplication
# ==============================================================================
FENCE_PATTERN = re.compile(r"(```|~~~)(\w*)\n(.*?)\1", re.DOTALL)

# Text put in place of an older duplicate block body.  The trailing newline
# keeps the closing fence on its own line so the fence still terminates.
_CODE_OMITTED_PLACEHOLDER = ".. code omitted, see later version ..\n"


def deduplicate_markdown_blocks(messages: List[dict]) -> int:
    """
    For each fenced code block, keep the last occurrence full; replace earlier
    occurrences with a placeholder. Modifies messages in-place.

    Blocks whose body is not longer than the placeholder are left alone,
    because replacing them would not save anything.

    Returns the count of blocks replaced.
    """
    def _digest(text: str) -> str:
        return hashlib.sha256(text.encode("utf-8", "surrogatepass")).hexdigest()

    # Pass 1: remember, per body hash, the ordinal of its LAST occurrence.
    last_ordinal: Dict[str, int] = {}
    ordinal = 0
    for msg in messages:
        content = msg.get("content")
        if not isinstance(content, str):
            continue
        for match in FENCE_PATTERN.finditer(content):
            last_ordinal[_digest(match.group(3))] = ordinal
            ordinal += 1
    if ordinal == len(last_ordinal):
        return 0  # every block is unique

    # Pass 2: re.sub visits matches in the same order as finditer above, so
    # the running ordinal lines up with the one recorded in pass 1.
    replaced_count = 0
    ordinal = 0

    def _replace(match) -> str:
        nonlocal replaced_count, ordinal
        current = ordinal
        ordinal += 1
        inner = match.group(3)
        if last_ordinal[_digest(inner)] == current:
            return match.group(0)
        if len(inner) <= len(_CODE_OMITTED_PLACEHOLDER):
            return match.group(0)
        replaced_count += 1
        fence = match.group(1)
        return f"{fence}{match.group(2)}\n{_CODE_OMITTED_PLACEHOLDER}{fence}"

    for msg in messages:
        content = msg.get("content")
        if not isinstance(content, str):
            continue
        new_content = FENCE_PATTERN.sub(_replace, content)
        if new_content != content:
            msg["content"] = new_content
    return replaced_count


# ==============================================================================
# Feature F-1: Inter-message content fingerprinting & deduplication
# ==============================================================================
# Upper bound on the number of conversations tracked in segment_fingerprints.
MAX_FINGERPRINT_CONVERSATIONS = 500


def get_conversation_base_id(messages: List[dict]) -> str:
    """
    Derive a stable conversation identifier from the first system message
    and the first user message (excluding boilerplate tags).
    This ID persists across requests of the same conversation, even as
    new messages are added.
    """
    # Find first system message (if any)
    sys_content = ""
    for msg in messages:
        if msg.get("role") == "system" and isinstance(msg.get("content"), str):
            sys_content = msg["content"]
            break

    # Find first user message (if any)
    user_content = ""
    for msg in messages:
        if msg.get("role") == "user" and isinstance(msg.get("content"), str):
            # Strip known boilerplate tags to get the "core" user content
            content = msg["content"]
            for open_tag, close_tag in BOILERPLATE_PATTERNS.values():
                # Remove all occurrences of this tag pair and their contents
                # (crude but sufficient for fingerprinting)
                pattern = re.escape(open_tag) + r".*?" + re.escape(close_tag)
                content = re.sub(pattern, "", content, flags=re.DOTALL)
            user_content = content.strip()
            break

    combined = f"{sys_content}\n{user_content}".strip()
    if not combined:
        # Fallback: use the full conversation hash (will change each turn,
        # but still isolates turns that are completely boilerplate)
        return conv_hash(messages)
    return hashlib.sha256(combined.encode("utf-8", "surrogatepass")).hexdigest()


def deduplicate_user_message_segments(
    messages: List[dict], conv_id: str
) -> Tuple[List[dict], int]:
    """
    For each user message, identify boilerplate segments and omit those that
    repeat a segment already present earlier in the same request.
    Modifies messages in-place. Returns (messages, segments_removed).

    The first occurrence of every segment is always kept.  Clients resend
    the whole history on each turn, so deciding from fingerprints stored by
    a previous request would strip the boilerplate from *every* message of
    every later request and the model would never see it.

    Fingerprints are still recorded per conversation in
    ``segment_fingerprints``, bounded by MAX_FINGERPRINT_HISTORY segments per
    conversation and MAX_FINGERPRINT_CONVERSATIONS conversations.

    If a user message becomes empty after removals, it is replaced with
    a minimal placeholder "(no new content)".
    """
    segments_removed = 0
    seen_in_request = set()

    with segment_fp_lock:
        # pop + re-insert moves this conversation to the "most recent" end.
        history = segment_fingerprints.pop(conv_id, None)
        if history is None:
            history = {}
        segment_fingerprints[conv_id] = history
        while len(segment_fingerprints) > MAX_FINGERPRINT_CONVERSATIONS:
            del segment_fingerprints[next(iter(segment_fingerprints))]

        for msg in messages:
            if msg.get("role") != "user":
                continue
            content = msg.get("content", "")
            if not isinstance(content, str):
                continue

            new_content = content
            # Process each boilerplate pattern
            for open_tag, close_tag in BOILERPLATE_PATTERNS.values():
                # Find all non-overlapping occurrences
                idx = 0
                while True:
                    start = new_content.find(open_tag, idx)
                    if start == -1:
                        break
                    end = new_content.find(close_tag, start + len(open_tag))
                    if end == -1:
                        break
                    end += len(close_tag)
                    segment = new_content[start:end]
                    seg_hash = hashlib.sha256(
                        segment.encode("utf-8", "surrogatepass")
                    ).hexdigest()

                    if seg_hash in seen_in_request:
                        # Duplicate segment: remove it entirely
                        new_content = new_content[:start] + new_content[end:]
                        segments_removed += 1
                        # Continue scanning from the same start position
                        idx = start
                        continue

                    # First time this request contains the segment: keep it
                    seen_in_request.add(seg_hash)
                    if seg_hash not in history:
                        history[seg_hash] = (len(history), segment)
                        # Prune the oldest entries (dicts keep insertion order)
                        while len(history) > MAX_FINGERPRINT_HISTORY:
                            del history[next(iter(history))]
                    idx = end

            # After processing, check if the message became empty
            new_content = new_content.strip()
            if new_content == "":
                new_content = "(no new content)"
            msg["content"] = new_content

    return messages, segments_removed


# ==============================================================================
# Conversation summarisation – with token-aware split & retries
# ==============================================================================
def _content_text(content: Any) -> str:
    """Return the plain text of a message ``content`` (string or list of parts)."""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        return "\n".join(
            part["text"]
            for part in content
            if isinstance(part, dict) and isinstance(part.get("text"), str)
        )
    return ""


def _message_tokens(msg: dict) -> int:
    """Estimate the tokens of one message: its text plus tool-call arguments."""
    tokens = estimate_tokens(_content_text(msg.get("content")))
    # ``tool_calls`` may be present but null, hence ``or []``.
    for tc in msg.get("tool_calls") or []:
        function = tc.get("function") if isinstance(tc, dict) else None
        arguments = function.get("arguments", "") if isinstance(function, dict) else ""
        tokens += estimate_tokens(json.dumps(arguments))
    return tokens


def total_tokens(messages: List[dict]) -> int:
    """Estimate total token count for a list of messages."""
    return sum(_message_tokens(m) for m in messages)


def summarise_messages_with_retry(summarise_payload: dict, api_key: str) -> str:
    """
    Call DeepSeek summarisation with retries (exponential backoff).
    An empty or non-string reply counts as a failed attempt.
    Raises RuntimeError if all attempts fail.
    """
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    last_exc = None
    for attempt in range(1, SUMMARISE_MAX_RETRIES + 1):
        try:
            req = urllib.request.Request(
                f"{DEEPSEEK_BASE}/chat/completions",
                data=json.dumps(summarise_payload).encode(),
                headers=headers,
                method="POST",
            )
            with urllib.request.urlopen(req, timeout=120) as resp:
                body = json.loads(resp.read().decode())
            summary = body["choices"][0]["message"]["content"]
            if not isinstance(summary, str) or not summary.strip():
                raise ValueError("upstream returned an empty summary")
            return summary
        except Exception as e:
            last_exc = e
            if attempt < SUMMARISE_MAX_RETRIES:
                sleep_time = SUMMARISE_RETRY_BASE_SLEEP * (2 ** (attempt - 1))
                logging.warning(
                    "Summarisation attempt %d failed, retrying in %.1fs: %s",
                    attempt, sleep_time, e,
                )
                time.sleep(sleep_time)
            else:
                logging.error("Summarisation failed after %d attempts", SUMMARISE_MAX_RETRIES)
    raise RuntimeError(f"Summarisation API call failed: {last_exc}")


def merge_system_messages(messages: List[dict]) -> Tuple[Optional[str], List[dict]]:
    """Extract and merge all system messages into a single string.

    System messages whose content is a list of parts contribute the text of
    those parts instead of being silently discarded.

    Returns the merged content (or None if none) and the remaining
    non-system messages.
    """
    systems: List[str] = []
    others: List[dict] = []
    for m in messages:
        if m.get("role") != "system":
            others.append(m)
            continue
        content = m.get("content")
        if isinstance(content, str):
            systems.append(content)
        else:
            text = _content_text(content)
            if text:
                systems.append(text)
    merged = "\n".join(systems) if systems else None
    return merged, others


def maybe_summarize(
    messages: List[dict],
    api_key: str,
    max_context: Optional[int] = None,
    ratio: Optional[float] = None,
) -> Tuple[List[dict], bool]:
    """
    If total estimated tokens > max_context * ratio, summarise oldest messages
    (except system) and replace them with a condensed summary message.
    Handles multiple system messages by merging them.

    ``max_context`` / ``ratio`` default to the *current* MAX_CONTEXT /
    SUMMARY_RATIO globals, so values changed after import (e.g. by the
    command line) are honoured.

    Returns a new message list (does not modify original) and a boolean
    indicating whether summarisation was performed.  Guarantees at most ONE
    system message, and never summarises away the final message.
    """
    if max_context is None:
        max_context = MAX_CONTEXT
    if ratio is None:
        ratio = SUMMARY_RATIO

    def _system_prefix(merged: Optional[str]) -> List[dict]:
        return [{"role": "system", "content": merged}] if merged else []

    threshold = int(max_context * ratio)
    total = total_tokens(messages)
    merged_sys, non_system = merge_system_messages(messages)

    if total <= threshold:
        if merged_sys is not None:
            return [{"role": "system", "content": merged_sys}] + non_system, False
        return messages, False

    if not non_system:
        return _system_prefix(merged_sys), False

    # Token-aware split: accumulate messages from the start until the token
    # deficit (total - threshold) is covered and at least half of the
    # messages have been consumed.
    deficit = total - threshold
    accumulated = 0
    idx = 0
    half = len(non_system) // 2
    for i, msg in enumerate(non_system):
        accumulated += _message_tokens(msg)
        if accumulated >= deficit and i >= half:
            idx = i + 1  # summarise up to and including this message
            break
    else:
        # Not enough tokens? Fallback to half the messages
        idx = max(1, half)

    # Never split a tool message from its preceding assistant message with
    # tool_calls: walk forward until idx is a safe split point.
    while idx < len(non_system):
        msg = non_system[idx]
        if msg.get("role") == "tool":
            # This tool message belongs to a preceding assistant; move past it
            idx += 1
            continue
        if msg.get("role") == "assistant" and msg.get("tool_calls"):
            if idx + 1 < len(non_system) and non_system[idx + 1].get("role") == "tool":
                # Move split past both the assistant and its tool response(s)
                idx += 1
                while idx < len(non_system) and non_system[idx].get("role") == "tool":
                    idx += 1
                continue
        break

    # Keep at least the final exchange: if everything would be summarised,
    # back up to the start of the last message group (a trailing run of
    # tool messages stays attached to the assistant that requested them).
    if idx >= len(non_system):
        idx = len(non_system) - 1
        while idx > 0 and non_system[idx].get("role") == "tool":
            idx -= 1

    to_summarise = non_system[:idx]
    to_keep = non_system[idx:]

    if not to_summarise:
        return _system_prefix(merged_sys) + non_system, False

    summary_prompt = (
        "Summarise the following conversation excerpt. "
        "Retain all critical facts, decisions, and code fragments. "
        "Be concise but complete.\n\n"
    )
    lines = []
    for m in to_summarise:
        content = m.get("content", "")
        text = _content_text(content)
        if isinstance(content, str) or text:
            lines.append(f"[{m.get('role', 'unknown')}]: {text}\n")
    summarise_text = "".join(lines)

    payload = {
        "model": SUMMARY_MODEL,
        "messages": [{"role": "user", "content": summary_prompt + summarise_text}],
        "max_tokens": 1000,
        "temperature": 0.0,
        "thinking": {"type": "disabled"},
    }

    try:
        summary = summarise_messages_with_retry(payload, api_key)
    except RuntimeError as e:
        logging.warning(f"Summarisation failed, falling back to truncation: {e}")
        summary = "[Earlier conversation truncated due to length]"

    new_messages: List[dict] = []
    if merged_sys:
        new_sys_content = merged_sys + "\n\n[Earlier conversation summary]\n" + summary
        new_messages.append({"role": "system", "content": new_sys_content})
    else:
        new_messages.append(
            {"role": "system", "content": f"[Earlier conversation summary]\n{summary}"}
        )
    new_messages.extend(to_keep)
    return new_messages, True


# ==============================================================================
# Reasoning injection helpers
# ==============================================================================
def cache_assistant_message(original_msgs: List[dict], assistant_msg: dict):
    """Cache assistant message (with reasoning) so it can be reused on subsequent turns.

    The key is the hash of *original_msgs* followed by the assistant message
    without its ``reasoning_content``, i.e. what a client is expected to send
    back on the next turn.  Messages with neither tool calls nor reasoning
    are not cached.
    """
    if not assistant_msg.get("tool_calls") and not assistant_msg.get("reasoning_content"):
        return
    prefix = [m.copy() for m in original_msgs]
    clean_asst = {k: v for k, v in assistant_msg.items() if k != "reasoning_content"}
    prefix.append(clean_asst)
    assistant_cache.set(conv_hash(prefix), assistant_msg)


def inject_reasoning(messages: List[dict]):
    """Look up cached reasoning_content for tool-call assistant messages and inject it.

    Must be called on the messages exactly as the client sent them: the
    lookup key is the hash of the message prefix, so any earlier rewriting
    (deduplication, summarisation, prompt compression) would make it miss.
    """
    for i, msg in enumerate(messages):
        if msg.get("role") != "assistant":
            continue
        if not msg.get("tool_calls"):
            continue
        if "reasoning_content" in msg:
            continue
        cached = assistant_cache.get(conv_hash(messages[: i + 1]))
        if cached and "reasoning_content" in cached:
            msg["reasoning_content"] = cached["reasoning_content"]


def should_disable_thinking(messages: List[dict]) -> bool:
    """Return True if thinking should be disabled for the current conversation
    (i.e. there is a tool-call assistant message WITHOUT reasoning_content)."""
    return any(
        m.get("role") == "assistant"
        and m.get("tool_calls")
        and "reasoning_content" not in m
        for m in messages
    )


# ==============================================================================
# DeepSeek API helpers
# ==============================================================================
def make_deepseek_request(payload: dict, stream: bool) -> urllib.request.Request:
    """Build the POST request for the DeepSeek chat completions endpoint."""
    headers = {
        "Authorization": f"Bearer {DSEEK_KEY}",
        "Content-Type": "application/json",
        "Accept": "text/event-stream" if stream else "application/json",
    }
    return urllib.request.Request(
        f"{DEEPSEEK_BASE}/chat/completions",
        data=json.dumps(payload).encode(),
        headers=headers,
        method="POST",
    )


def deepseek_nonstream(payload: dict) -> dict:
    """Perform a non-streaming request.

    Raises RuntimeError on HTTP errors and on connection-level failures, so
    the handler reports both as an upstream (502) problem.
    """
    req = make_deepseek_request(payload, stream=False)
    try:
        with urllib.request.urlopen(req, timeout=600) as resp:
            return json.loads(resp.read().decode())
    except urllib.error.HTTPError as e:
        error_body = e.read().decode(errors="replace") if e.fp else ""
        raise RuntimeError(f"DeepSeek HTTP {e.code}: {error_body}") from e
    except OSError as e:  # URLError, timeouts, connection resets
        raise RuntimeError(f"DeepSeek connection error: {e}") from e


# ==============================================================================
# Streaming buffer for accumulating tool calls
# ==============================================================================
class StreamBuffer:
    """Accumulates streamed chunks into one complete assistant message."""

    def __init__(self):
        self.reasoning = ""
        self.content = ""
        self.tool_calls: Dict[int, dict] = {}
        self.finish_reason: Optional[str] = None
        self.usage: Optional[dict] = None

    def process_chunk(self, chunk: dict) -> None:
        """Fold one parsed stream chunk into the buffer.

        Deltas whose ``reasoning_content`` / ``content`` is null are ignored
        rather than treated as a reset, so text accumulated so far survives
        chunks that carry an explicit null for the field they do not update.
        """
        for choice in chunk.get("choices") or []:
            delta = choice.get("delta") or {}

            rc = delta.get("reasoning_content")
            if isinstance(rc, str):
                self.reasoning += rc

            ct = delta.get("content")
            if isinstance(ct, str):
                self.content += ct

            for tc in delta.get("tool_calls") or []:
                idx = tc.get("index")
                if idx is None:
                    continue
                if idx not in self.tool_calls:
                    self.tool_calls[idx] = {
                        "id": tc.get("id", ""),
                        "type": tc.get("type", "function"),
                        "function": {
                            "name": "",
                            "arguments": "",
                        },
                    }
                cur = self.tool_calls[idx]

                tid = tc.get("id")
                if tid is not None:
                    cur["id"] = tid
                ttype = tc.get("type")
                if ttype is not None:
                    cur["type"] = ttype

                func_raw = tc.get("function")
                func = func_raw if isinstance(func_raw, dict) else {}
                name = func.get("name")
                if name is not None:
                    cur["function"]["name"] = name
                args = func.get("arguments")
                if args is not None:
                    # Arguments arrive as JSON fragments; concatenate them.
                    cur["function"]["arguments"] += args

            # Some chunks carry a complete "message" instead of a delta.
            msg = choice.get("message")
            if isinstance(msg, dict):
                rc_msg = msg.get("reasoning_content")
                ct_msg = msg.get("content")
                if rc_msg is not None:
                    self.reasoning = rc_msg
                if ct_msg is not None:
                    self.content = ct_msg

            finish = choice.get("finish_reason")
            if finish is not None:
                self.finish_reason = finish

        if "usage" in chunk:
            self.usage = chunk["usage"]

    def build_assistant_message(self) -> dict:
        """Return the accumulated assistant message; empty fields are omitted."""
        msg: Dict[str, Any] = {"role": "assistant"}
        if self.reasoning:
            msg["reasoning_content"] = self.reasoning
        if self.content:
            msg["content"] = self.content
        if self.tool_calls:
            msg["tool_calls"] = [self.tool_calls[k] for k in sorted(self.tool_calls)]
        return msg


# ==============================================================================
# HTTP Request Handler
# ==============================================================================
# Set by main() from --disable-compression; when True, system prompts are
# forwarded unchanged.
DISABLE_SYSTEM_COMPRESSION = False


class ProxyHandler(http.server.BaseHTTPRequestHandler):
    """Handles POST /v1/chat/completions and forwards it to DeepSeek."""

    def log_message(self, format, *args):
        logging.info("%s - %s", self.client_address[0], format % args)

    def do_POST(self):
        """Validate the request, run the compression pipeline and dispatch upstream."""
        if self.path != "/v1/chat/completions":
            self.send_error(404)
            return

        try:
            content_length = int(self.headers.get("Content-Length", 0))
        except ValueError:
            self.send_error(400, "Invalid Content-Length")
            return
        if content_length <= 0:
            self.send_error(400, "Empty body")
            return

        body = self.rfile.read(content_length)
        try:
            client_req = json.loads(body)
        except ValueError:  # JSONDecodeError and UnicodeDecodeError
            self.send_error(400, "Invalid JSON")
            return
        if not isinstance(client_req, dict):
            self.send_error(400, "Invalid JSON")
            return

        messages = client_req.get("messages", [])
        # --- Early validation: messages must be a non-empty list of objects ---
        if (
            not isinstance(messages, list)
            or not messages
            or not all(isinstance(m, dict) for m in messages)
        ):
            self.send_error(400, "Empty or invalid 'messages' array")
            return

        stream = bool(client_req.get("stream", False))

        # Deep copy before any in-place mutations
        original_messages = copy.deepcopy(messages)

        # ---------- Pre-processing metrics ----------
        original_msg_count = len(original_messages)
        original_tokens = total_tokens(original_messages)
        msg_hash = md5_of_messages(original_messages)

        # Save original message for developer introspection
        if SAVE_PREPOST_MSGS:
            save_json_message(PRE_MSG_DIR, "pre", original_messages, msg_hash)

        # ---- Pipeline ----
        # 0. Reasoning injection.  Done first, while the messages are still
        #    identical to what the client sent: cache keys are hashes of the
        #    original messages and would not match after the steps below.
        inject_reasoning(messages)

        # 1. Markdown block deduplication
        replaced_blocks = deduplicate_markdown_blocks(messages)

        # 2. Inter-message content fingerprinting & deduplication (Feature F-1)
        conv_id = get_conversation_base_id(original_messages)
        messages, deduped_segments = deduplicate_user_message_segments(messages, conv_id)

        # 3. Conversation summarisation (token-aware split)
        messages, summarized = maybe_summarize(
            messages,
            api_key=DSEEK_KEY,
            max_context=MAX_CONTEXT,
            ratio=SUMMARY_RATIO,
        )

        # 4. System prompt compression – auto-summarize & cache
        compressed_prompts_found = 0
        if not DISABLE_SYSTEM_COMPRESSION:
            for msg in messages:
                if msg.get("role") == "system" and isinstance(msg.get("content"), str):
                    if self._compress_system_message(msg):
                        compressed_prompts_found += 1

        # 5. Build final payload
        payload = dict(client_req)
        payload["messages"] = messages
        payload["model"] = DEFAULT_MODEL
        payload["max_tokens"] = MAX_OUTPUT_TOKENS

        if THINKING_MODE == "enabled":
            payload["thinking"] = {"type": "enabled"}
        elif THINKING_MODE == "disabled":
            payload["thinking"] = {"type": "disabled"}
        else:
            payload["thinking"] = {
                "type": "disabled" if should_disable_thinking(messages) else "enabled"
            }

        payload["stream"] = stream

        # ---------- Post-processing metrics ----------
        final_msg_count = len(messages)
        final_tokens = total_tokens(messages)
        compression_pct = (
            (1 - final_tokens / original_tokens) * 100 if original_tokens > 0 else 0.0
        )

        if SAVE_PREPOST_MSGS:
            save_json_message(POST_MSG_DIR, "post", messages, msg_hash)

        logging.info(
            f"REQ {msg_hash} | msgs: {original_msg_count} → {final_msg_count} "
            f"| tokens: {original_tokens} → {final_tokens} ({compression_pct:+.1f}%) "
            f"| blocks dedup'd: {replaced_blocks} "
            f"| dedup'd segments: {deduped_segments} "
            f"| summarized: {'yes' if summarized else 'no'} "
            f"| compressed prompts: {compressed_prompts_found} "
            f"| stream: {stream} "
            f"| thinking: {payload['thinking']['type']}"
        )

        # 6. Dispatch
        try:
            if stream:
                self._handle_stream(payload, original_messages)
            else:
                self._handle_nonstream(payload, original_messages)
        except RuntimeError as e:
            logging.error("Upstream error: %s", e)
            # Detail goes into the body (explain); the status line must stay
            # a short single line.
            self.send_error(502, "Upstream error", str(e))
        except Exception:
            logging.exception("Unexpected proxy error")
            try:
                self.send_error(500, "Internal proxy error")
            except Exception:
                pass

    def _compress_system_message(self, msg: dict) -> bool:
        """Replace a system message's content with its summary, in place.

        Uses the on-disk cache when available, otherwise asks DeepSeek.  The
        summary is adopted only if it is shorter than the original.  Returns
        True when the content was replaced.
        """
        sys_content = msg["content"]
        h = stable_hash({"content": sys_content})

        # Save original if not already saved
        if SAVE_PREPOST_SYSTEM:
            save_original_prompt(h, sys_content)

        # Check for cached summarized version
        summarized_sys = load_summarized_prompt(h)
        if summarized_sys:
            logging.debug("Using summarized system prompt for hash %s", h[:12])
        else:
            # Generate summary via DeepSeek API
            try:
                logging.info("Generating summarized system prompt for hash %s", h[:12])
                summarized_sys = summarize_system_prompt(sys_content, DSEEK_KEY)
            except RuntimeError as e:
                logging.warning("Failed to summarize system prompt, using original: %s", e)
                return False
            if SAVE_PREPOST_SYSTEM:
                save_summarized_prompt(h, summarized_sys)

        if not summarized_sys or len(summarized_sys) >= len(sys_content):
            return False  # no gain – keep the original wording
        msg["content"] = summarized_sys
        return True

    def _handle_nonstream(self, payload, original_msgs):
        """Forward a non-streaming request and relay the JSON response.

        The assistant message is cached with its reasoning, then
        ``reasoning_content`` is removed before the reply reaches the client.
        """
        resp = deepseek_nonstream(payload)
        choices = resp.get("choices")
        if not choices:
            logging.error("DeepSeek returned empty choices array")
            self.send_error(502, "Empty response from upstream")
            return

        choice = choices[0]
        assistant_msg = (choice.get("message") or {}).copy()
        cache_assistant_message(original_msgs, assistant_msg)

        assistant_msg.pop("reasoning_content", None)
        choice["message"] = assistant_msg

        body = json.dumps(resp).encode()
        try:
            self.send_response(200)
            self.send_header("Content-Type", "application/json")
            self.send_header("Content-Length", str(len(body)))
            self.end_headers()
            self.wfile.write(body)
        except (BrokenPipeError, ConnectionResetError, OSError) as e:
            logging.warning("Client disconnected while sending non‑stream response: %s", e)

    def _handle_stream(self, payload, original_msgs):
        """Forward a streaming request, relaying SSE events without reasoning_content.

        Chunks are accumulated in a StreamBuffer; the assembled assistant
        message is cached only when the stream was relayed without error.
        """
        req = make_deepseek_request(payload, stream=True)
        try:
            with urllib.request.urlopen(req, timeout=600) as upstream:
                self.send_response(200)
                self.send_header("Content-Type", "text/event-stream")
                self.send_header("Cache-Control", "no-cache")
                self.end_headers()

                buffer = StreamBuffer()
                done = False      # True once [DONE] was sent or must not be sent
                failed = False    # True if relaying was interrupted
                try:
                    for line in upstream:
                        line_str = line.decode() if isinstance(line, bytes) else line
                        if line_str.startswith("data:"):
                            data_part = line_str[5:].strip()
                            if data_part == "[DONE]":
                                self.wfile.write(b"data: [DONE]\n\n")
                                self.wfile.flush()
                                done = True
                                break
                            try:
                                chunk = json.loads(data_part)
                            except json.JSONDecodeError:
                                # Not JSON: pass the line through untouched
                                self.wfile.write(line_str.encode() + b"\n")
                                self.wfile.flush()
                                continue

                            buffer.process_chunk(chunk)

                            # Hide reasoning from the client
                            for choice in chunk.get("choices") or []:
                                if isinstance(choice.get("delta"), dict):
                                    choice["delta"].pop("reasoning_content", None)
                                if isinstance(choice.get("message"), dict):
                                    choice["message"].pop("reasoning_content", None)

                            self.wfile.write(f"data: {json.dumps(chunk)}\n\n".encode())
                            self.wfile.flush()
                        else:
                            self.wfile.write(line_str.encode() + b"\n")
                            self.wfile.flush()
                except (BrokenPipeError, ConnectionResetError, OSError) as e:
                    logging.warning("Client disconnected during stream: %s", e)
                    done = True
                    failed = True
                except Exception:
                    logging.exception("Unexpected error while streaming response")
                    done = True
                    failed = True

                if not done:
                    try:
                        self.wfile.write(b"data: [DONE]\n\n")
                        self.wfile.flush()
                    except (BrokenPipeError, ConnectionResetError, OSError):
                        pass

                if not failed:
                    assistant_msg = buffer.build_assistant_message()
                    if assistant_msg.get("tool_calls") or assistant_msg.get("reasoning_content"):
                        cache_assistant_message(original_msgs, assistant_msg)
        except (urllib.error.HTTPError, urllib.error.URLError) as e:
            logging.error("Upstream connection error: %s", e)
            error_detail = ""
            if isinstance(e, urllib.error.HTTPError):
                try:
                    error_detail = e.read().decode(errors="replace")
                except Exception:
                    pass
            self.send_error(502, "Upstream error", error_detail or str(e))


# ==============================================================================
# Main – with fixed Ctrl+C handling (avoid deadlock) and dev directories
# ==============================================================================
def main():
    """Parse command-line options, apply them to the globals and serve until stopped."""
    global MAX_CONTEXT, SUMMARY_RATIO, MAX_OUTPUT_TOKENS, THINKING_MODE
    global DISABLE_SYSTEM_COMPRESSION

    parser = argparse.ArgumentParser(
        description="DeepSeek V4 Flash OpenAI Proxy (globals forced)"
    )
    parser.add_argument("--port", type=int, default=8080, help="Listening port")
    parser.add_argument("--host", default="0.0.0.0", help="Bind address")
    parser.add_argument("--max-context", type=int, default=128000,
                        help="Max context tokens")
    parser.add_argument("--summarize-ratio", type=float, default=0.8,
                        help="Trigger summarisation when tokens exceed ratio * max-context")
    parser.add_argument("--disable-compression", action="store_true",
                        help="Do not auto-summarize system prompts (use originals as-is)")
    parser.add_argument("--max-output-tokens", type=int, default=128000,
                        help="Force this many max tokens for generation (overrides client)")
    parser.add_argument("--thinking", choices=["enabled", "disabled", "auto"],
                        default="auto",
                        help="Force thinking mode (default: auto)")
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s",
    )

    DISABLE_SYSTEM_COMPRESSION = args.disable_compression
    if not args.disable_compression and SAVE_PREPOST_SYSTEM:
        ensure_sys_dirs()

    MAX_CONTEXT = args.max_context
    SUMMARY_RATIO = args.summarize_ratio
    MAX_OUTPUT_TOKENS = args.max_output_tokens
    THINKING_MODE = args.thinking

    # Ensure developer dump directories exist
    if SAVE_PREPOST_MSGS:
        PRE_MSG_DIR.mkdir(parents=True, exist_ok=True)
        POST_MSG_DIR.mkdir(parents=True, exist_ok=True)

    server = socketserver.ThreadingTCPServer((args.host, args.port), ProxyHandler)
    server.daemon_threads = True

    # ---- Graceful shutdown WITHOUT deadlocking the main thread ----
    shutdown_lock = threading.Lock()
    shutting_down = False

    def shutdown(signum, frame):
        nonlocal shutting_down
        with shutdown_lock:
            if shutting_down:
                return
            shutting_down = True
        logging.info("Received signal %s, shutting down.", signum)
        # server.shutdown() blocks until serve_forever() returns, and the
        # signal handler runs on the thread executing serve_forever(), so it
        # has to be called from another thread.
        threading.Thread(target=server.shutdown, daemon=True).start()

    signal.signal(signal.SIGTERM, shutdown)
    signal.signal(signal.SIGINT, shutdown)

    logging.info(f"Proxy listening on {args.host}:{args.port}")
    logging.info(
        f"Forced model: {DEFAULT_MODEL}, max_tokens: {MAX_OUTPUT_TOKENS}, "
        f"thinking: {THINKING_MODE}"
    )
    logging.info(f"Pre/post message dumps: {PRE_MSG_DIR} / {POST_MSG_DIR}")

    try:
        server.serve_forever()
    except KeyboardInterrupt:
        pass
    finally:
        server.server_close()
        logging.info("Server stopped.")


if __name__ == "__main__":
    main()