Transplant MTP block from one GGUF file into another

A developer has released a Python script that transplants extra tensors—such as Multi-Token Prediction (MTP) layers—from one GGUF file into another, enabling the creation of mixed-quantization models. The tool preserves the exact on-disk layout, including per-row metadata critical for GPU inference, and supports using smaller "tensors-only" donor files to save bandwidth. Example usage includes transplanting an MTP block from a Q8_0 quantized file into an IQ4_KS base model.

#!/usr/bin/env python3
"""
Transplant extra tensors (e.g. MTP layers) from one GGUF file into another,
producing a mixed-quantization GGUF.

Note: Tested with ik_llama.cpp GGUF Python module.

Usage:
    python convert.py <target.gguf> <source.gguf> <output.gguf>

Arguments:
    target  — base GGUF (tensors + metadata kept as-is)
    source  — GGUF with extra blocks to transplant (e.g. blk.64.* for MTP)
    output  — resulting mixed-quantization GGUF

The script preserves the exact on-disk layout including per-row metadata
for quantization types like IQ4_KS that have row_meta_size > 0. This is
critical for GPU inference to work correctly.

Donor Models:
    To save bandwidth, you can use 'tensors-only' GGUFs as the source file.
    Credits to AzerbaijanNyan for the extraction:
    https://www.reddit.com/r/LocalLLaMA/comments/1t6r1ny/extracted_mtp_tensor_ggufs_smaller_donor_models/

    Available donors:
    - Qwen3.6-35A3B: https://huggingface.co/IHaveNoClueAndIMustPost/Qwen3.6-35A3B-MTP-TENSORS-ONLY
    - Qwen3.6-27b: https://huggingface.co/IHaveNoClueAndIMustPost/Qwen3.6-27b-MTP-TENSORS-ONLY

Example:
    # Transplant MTP block from Q8_0 into IQ4_KS base model
    python convert.py Qwen3.6-27B-IQ4_KS.gguf Qwen3.6-27B-MTP-Q8_0.gguf Qwen3.6-27B-MTP-IQ4_KS.gguf
"""

import hashlib
import struct
import sys
from pathlib import Path

from gguf import GGUFReader, GGUFValueType

# GGUF container version written into the output header.
GGUF_VERSION = 3
# Alignment used when the target has no "general.alignment" key.
DEFAULT_ALIGNMENT = 32
# Tensor data is streamed in chunks so multi-GB tensors never sit in RAM.
COPY_CHUNK_SIZE = 16 * 1024 * 1024

# Little-endian struct format for every fixed-width GGUF value type.
# Signed types use signed codes so negative values serialize correctly.
_SCALAR_FORMATS = {
    GGUFValueType.UINT8: "<B",
    GGUFValueType.INT8: "<b",
    GGUFValueType.UINT16: "<H",
    GGUFValueType.INT16: "<h",
    GGUFValueType.UINT32: "<I",
    GGUFValueType.INT32: "<i",
    GGUFValueType.FLOAT32: "<f",
    GGUFValueType.BOOL: "<?",
    GGUFValueType.UINT64: "<Q",
    GGUFValueType.INT64: "<q",
    GGUFValueType.FLOAT64: "<d",
}


def get_field_value(reader: GGUFReader, key: str):
    """Return the decoded value of metadata *key*, or None if it is absent."""
    field = reader.get_field(key)
    return None if field is None else field.contents()


def calculate_on_disk_sizes(tensors, file_size):
    """Calculate the on-disk size of each tensor (data + per-row metadata/padding).

    The size of a tensor is the distance from its data offset to the next
    higher data offset; the tensor stored last extends to *file_size*.
    Sizes are returned in the same order as *tensors*, independent of the
    order in which the tensors are laid out in the file.
    """
    offsets = [int(t.data_offset) for t in tensors]
    by_offset = sorted(range(len(offsets)), key=offsets.__getitem__)
    sizes = [0] * len(offsets)
    for pos, idx in enumerate(by_offset):
        if pos + 1 < len(by_offset):
            end = offsets[by_offset[pos + 1]]
        else:
            end = int(file_size)
        sizes[idx] = end - offsets[idx]
    return sizes


def _scalar_format(value_type):
    """Return the struct format for a fixed-width GGUF value type.

    Raises:
        ValueError: if *value_type* is not a fixed-width scalar type.
    """
    try:
        return _SCALAR_FORMATS[value_type]
    except KeyError:
        raise ValueError(f"Unsupported GGUF value type: {value_type!r}") from None


def _write_string(fout, text):
    """Write a GGUF string: uint64 byte length followed by UTF-8 bytes."""
    data = text.encode("utf-8")
    fout.write(struct.pack("<Q", len(data)))
    fout.write(data)


def write_kv_value(fout, kv_type, value):
    """Write a scalar or string KV value to the output file.

    ARRAY values are intentionally not written here (use
    ``write_array_value``), so an ARRAY type is a no-op.

    Raises:
        ValueError: if *kv_type* is neither STRING, ARRAY nor a known scalar.
    """
    if kv_type == GGUFValueType.ARRAY:
        return
    if kv_type == GGUFValueType.STRING:
        _write_string(fout, value)
    else:
        fout.write(struct.pack(_scalar_format(kv_type), value))


def write_array_value(fout, sub_type, arr):
    """Write an array KV value: element type, element count, then elements.

    Raises:
        ValueError: if a non-empty array has an unsupported element type.
    """
    fout.write(struct.pack("<I", int(sub_type)))
    fout.write(struct.pack("<Q", len(arr)))
    if not arr:
        return
    if sub_type == GGUFValueType.STRING:
        for elem in arr:
            _write_string(fout, elem)
    else:
        pack = struct.Struct(_scalar_format(sub_type)).pack
        fout.write(b"".join(pack(elem) for elem in arr))


def _write_field(fout, key, field):
    """Write one complete KV entry (key, value type, value) from a reader field."""
    _write_string(fout, key)
    kv_type = field.types[0]
    fout.write(struct.pack("<I", int(kv_type)))
    if kv_type == GGUFValueType.ARRAY:
        sub_type = field.types[1] if len(field.types) > 1 else GGUFValueType.FLOAT32
        write_array_value(fout, sub_type, field.contents())
    else:
        write_kv_value(fout, kv_type, field.contents())


def _write_uint32_kv(fout, key, value):
    """Write one complete KV entry holding a UINT32 value."""
    _write_string(fout, key)
    fout.write(struct.pack("<I", int(GGUFValueType.UINT32)))
    fout.write(struct.pack("<I", int(value)))


def _block_index(name):
    """Return N for tensor names of the form 'blk.N.<rest>', otherwise None."""
    parts = name.split(".", 2)
    if len(parts) == 3 and parts[0] == "blk" and parts[1].isdigit():
        return int(parts[1])
    return None


def _align_up(value, alignment):
    """Round *value* up to the next multiple of *alignment*."""
    return (value + alignment - 1) // alignment * alignment


def _copy_bytes(fin, fout, offset, size):
    """Stream *size* bytes starting at *offset* of *fin* into *fout*.

    Raises:
        OSError: if *fin* ends before *size* bytes could be read.
    """
    fin.seek(offset)
    remaining = size
    while remaining > 0:
        chunk = fin.read(min(COPY_CHUNK_SIZE, remaining))
        if not chunk:
            raise OSError(
                f"Unexpected end of file: {remaining} bytes missing at offset {offset}"
            )
        fout.write(chunk)
        remaining -= len(chunk)


def _fail(message):
    """Print an error message to stderr and exit with status 1."""
    print(f"ERROR: {message}", file=sys.stderr)
    sys.exit(1)


def _short_hash(tensor):
    """Return the first 16 hex digits of the SHA-256 of a tensor's data."""
    return hashlib.sha256(tensor.data.tobytes()).hexdigest()[:16]


def main() -> None:
    """Command-line entry point: transplant extra blocks from source into target."""
    if len(sys.argv) != 4:
        print(
            f"Usage: {sys.argv[0]} <target.gguf> <source.gguf> <output.gguf>",
            file=sys.stderr,
        )
        sys.exit(1)

    target_path, source_path, output_path = sys.argv[1], sys.argv[2], sys.argv[3]

    # Opening the output with "wb" truncates it, so it must not be an input.
    resolved_output = Path(output_path).resolve()
    if resolved_output in (Path(target_path).resolve(), Path(source_path).resolve()):
        _fail("Output path must differ from the target and source paths")

    # ------------------------------------------------------------------
    # 1. Open both files
    # ------------------------------------------------------------------
    print(f"Reading target: {target_path}")
    target_reader = GGUFReader(target_path)
    print(f"Reading source: {source_path}")
    source_reader = GGUFReader(source_path)

    target_file_size = Path(target_path).stat().st_size
    source_file_size = Path(source_path).stat().st_size

    target_kv_total = sum(1 for k in target_reader.fields if not k.startswith("GGUF."))
    source_kv_total = sum(1 for k in source_reader.fields if not k.startswith("GGUF."))
    print(f"  Target tensors: {len(target_reader.tensors)}, KVs: {target_kv_total}")
    print(f"  Source tensors: {len(source_reader.tensors)}, KVs: {source_kv_total}")

    # ------------------------------------------------------------------
    # 2. Read architecture and MTP metadata from source
    # ------------------------------------------------------------------
    arch = get_field_value(target_reader, "general.architecture")
    if arch is None:
        _fail("Target GGUF has no general.architecture key")

    block_count_key = f"{arch}.block_count"
    nextn_key = f"{arch}.nextn_predict_layers"

    source_block_count = get_field_value(source_reader, block_count_key)
    source_nextn = get_field_value(source_reader, nextn_key)
    target_block_count = get_field_value(target_reader, block_count_key)
    if source_nextn is None:
        _fail("Source GGUF has no nextn_predict_layers key")
    if source_block_count is None:
        _fail("Source GGUF has no block_count key")
    if target_block_count is None:
        _fail("Target GGUF has no block_count key")

    print(f"\n  Arch: {arch}")
    print(f"  Target block_count: {target_block_count}")
    print(
        f"  Source block_count: {source_block_count}, nextn_predict_layers: {source_nextn}"
    )

    # Identify extra tensors: every source block at or beyond the target's
    # block count (a source with several extra blocks contributes all of them).
    source_extra = []
    for tensor in source_reader.tensors:
        index = _block_index(tensor.name)
        if index is not None and index >= target_block_count:
            source_extra.append(tensor)

    print(f"\n  Extra tensors to transplant: {len(source_extra)}")
    if not source_extra:
        _fail(
            f"No tensors found in blocks >= {target_block_count} "
            f"(prefix 'blk.{target_block_count}.' and above) in source"
        )

    target_by_name = {t.name: t for t in target_reader.tensors}
    clashes = [t.name for t in source_extra if t.name in target_by_name]
    if clashes:
        _fail(f"Target already contains tensor(s) to transplant, e.g. {clashes[0]}")

    # ------------------------------------------------------------------
    # 3. Prepare tensor lists and calculate sizes
    # ------------------------------------------------------------------
    # On-disk sizes include per-row metadata, which n_bytes does not capture.
    target_on_disk_sizes = calculate_on_disk_sizes(
        target_reader.tensors, target_file_size
    )
    source_on_disk_sizes = calculate_on_disk_sizes(
        source_reader.tensors, source_file_size
    )
    source_size_by_name = {
        t.name: size for t, size in zip(source_reader.tensors, source_on_disk_sizes)
    }

    # Copy plan: (tensor, read_from_target, source_offset, on_disk_size),
    # all target tensors first, then the extra source tensors.
    plan = [
        (t, True, int(t.data_offset), size)
        for t, size in zip(target_reader.tensors, target_on_disk_sizes)
    ]
    plan.extend(
        (t, False, int(t.data_offset), source_size_by_name[t.name])
        for t in source_extra
    )

    alignment = int(
        get_field_value(target_reader, "general.alignment") or DEFAULT_ALIGNMENT
    )

    # Every tensor starts on an aligned offset. Sizes taken from offset
    # differences are already aligned; the size of a file's final tensor
    # (file_size - offset) may not be, so round each slot up.
    tensor_offsets = []
    current_offset = 0
    for _, _, _, size in plan:
        tensor_offsets.append(current_offset)
        current_offset += _align_up(size, alignment)

    # KV entries: target KVs (minus the two overridden keys), then
    # block_count and nextn_predict_layers from the source, then KVs that
    # only exist in the source. Building the lists first keeps the header
    # count and the written entries in sync and avoids duplicate keys.
    overridden = {block_count_key, nextn_key}
    target_kvs = [
        (key, field)
        for key, field in target_reader.fields.items()
        if not key.startswith("GGUF.") and key not in overridden
    ]
    written_keys = {key for key, _ in target_kvs} | overridden
    source_only_kvs = [
        (key, field)
        for key, field in source_reader.fields.items()
        if not key.startswith("GGUF.") and key not in written_keys
    ]
    kv_count = len(target_kvs) + len(overridden) + len(source_only_kvs)

    # ------------------------------------------------------------------
    # 4. Write output file
    # ------------------------------------------------------------------
    print(f"\nWriting output: {output_path}")

    with open(target_path, "rb") as target_fin, open(
        source_path, "rb"
    ) as source_fin, open(output_path, "wb") as fout:
        # 4.1 Write header: magic, version, tensor count, KV count
        fout.write(b"GGUF")
        fout.write(struct.pack("<I", GGUF_VERSION))
        fout.write(struct.pack("<Q", len(plan)))
        fout.write(struct.pack("<Q", kv_count))

        # 4.2 Write KV data
        for key, field in target_kvs:
            _write_field(fout, key, field)
        _write_uint32_kv(fout, block_count_key, source_block_count)
        _write_uint32_kv(fout, nextn_key, source_nextn)
        for key, field in source_only_kvs:
            _write_field(fout, key, field)

        # 4.3 Write tensor info
        for (tensor, _, _, _), offset in zip(plan, tensor_offsets):
            _write_string(fout, tensor.name)
            # Dimensions (in GGUF file order: fastest-varying first)
            shape = tensor.shape.tolist()
            fout.write(struct.pack("<I", len(shape)))
            for dim in shape:
                fout.write(struct.pack("<Q", dim))
            fout.write(struct.pack("<I", int(tensor.tensor_type)))
            fout.write(struct.pack("<Q", offset))

        # 4.4 Pad to alignment so the tensor data section starts aligned
        padding_needed = (alignment - (fout.tell() % alignment)) % alignment
        if padding_needed:
            fout.write(b"\x00" * padding_needed)

        # 4.5 Copy tensor data
        total = len(plan)
        print(f"Copying {total} tensors...")
        for i, (_, from_target, src_offset, size) in enumerate(plan):
            fin = target_fin if from_target else source_fin
            _copy_bytes(fin, fout, src_offset, size)
            slot_padding = _align_up(size, alignment) - size
            if slot_padding:
                fout.write(b"\x00" * slot_padding)

            if (i + 1) % 50 == 0 or i == total - 1:
                print(f"  Copied {i + 1}/{total} tensors")

    # ------------------------------------------------------------------
    # 5. Verify output
    # ------------------------------------------------------------------
    output_size = Path(output_path).stat().st_size
    print(f"\nOutput: {output_path}")
    print(f"  Size: {output_size / 1_000_000_000:.2f} GB")
    print(f"  Tensors: {len(plan)}")

    print("\nValidating output...")
    errors = []
    try:
        out_reader = GGUFReader(output_path)

        # Check block_count
        out_block_count = get_field_value(out_reader, block_count_key)
        if out_block_count != source_block_count:
            errors.append(
                f"block_count: expected {source_block_count}, got {out_block_count}"
            )

        # Check nextn_predict_layers
        out_nextn = get_field_value(out_reader, nextn_key)
        if out_nextn != source_nextn:
            errors.append(
                f"nextn_predict_layers: expected {source_nextn}, got {out_nextn}"
            )

        # Check extra tensors exist
        out_tensors = {t.name: t for t in out_reader.tensors}
        for tensor in source_extra:
            if tensor.name not in out_tensors:
                errors.append(f"Missing tensor: {tensor.name}")

        # Spot-check tensor data integrity
        print("  Spot-checking tensor data integrity...")

        # Check a target tensor
        for name in ["token_embd.weight"]:
            target_t = target_by_name.get(name)
            out_t = out_tensors.get(name)
            if target_t is not None and out_t is not None:
                out_hash = _short_hash(out_t)
                if _short_hash(target_t) == out_hash:
                    print(f"  {name}: OK ({out_hash})")
                else:
                    errors.append(f"Data mismatch: {name}")

        # Check an extra tensor
        extra_t = source_extra[0]
        out_t = out_tensors.get(extra_t.name)
        if out_t is not None:
            out_hash = _short_hash(out_t)
            if _short_hash(extra_t) == out_hash:
                print(f"  {extra_t.name}: OK ({out_hash})")
            else:
                errors.append(f"Data mismatch: {extra_t.name}")

    except Exception as e:
        # Validation boundary: any failure to re-read the output is reported
        # as a validation error rather than an unhandled traceback.
        errors.append(f"Failed to read output: {e}")

    if errors:
        print("\nVALIDATION FAILED:", file=sys.stderr)
        for err in errors:
            print(f"  - {err}", file=sys.stderr)
        sys.exit(1)
    else:
        print("  OK — all checks passed")

    print(f"\nDone. Output: {output_path}")


if __name__ == "__main__":
    main()