{"slug": "transplant-mtp-block-from-one-gguf-file-into-another", "title": "Transplant MTP block from one GGUF file into another", "summary": "A developer has released a Python script that transplants extra tensors—such as Multi-Token Prediction (MTP) layers—from one GGUF file into another, enabling the creation of mixed-quantization models. The tool preserves the exact on-disk layout, including per-row metadata critical for GPU inference, and supports using smaller \"tensors-only\" donor files to save bandwidth. Example usage includes transplanting an MTP block from a Q8_0 quantized file into an IQ4_KS base model.", "body_md": "| #!/usr/bin/env python3 | |\n| \"\"\" | |\n| Transplant extra tensors (e.g. MTP layers) from one GGUF file into another, | |\n| producing a mixed-quantization GGUF. | |\n| Note: Tested with ik_llama.cpp GGUF Python module. | |\n| Usage: | |\n| python convert.py <target.gguf> <source.gguf> <output.gguf> | |\n| Arguments: | |\n| target — base GGUF (tensors + metadata kept as-is) | |\n| source — GGUF with extra blocks to transplant (e.g. blk.64.* for MTP) | |\n| output — resulting mixed-quantization GGUF | |\n| The script preserves the exact on-disk layout including per-row metadata | |\n| for quantization types like IQ4_KS that have row_meta_size > 0. This is | |\n| critical for GPU inference to work correctly. | |\n| Donor Models: | |\n| To save bandwidth, you can use 'tensors-only' GGUFs as the source file. | |\n| Credits to AzerbaijanNyan for the extraction: | |\n| https://www.reddit.com/r/LocalLLaMA/comments/1t6r1ny/extracted_mtp_tensor_ggufs_smaller_donor_models/ | |\n| Available donors: | |\n| - Qwen3.6-35A3B: https://huggingface.co/IHaveNoClueAndIMustPost/Qwen3.6-35A3B-MTP-TENSORS-ONLY | |\n| - Qwen3.6-27b: https://huggingface.co/IHaveNoClueAndIMustPost/Qwen3.6-27b-MTP-TENSORS-ONLY | |\n| Example: | |\n| # Transplant MTP block from Q8_0 into IQ4_KS base model | |\n| python convert.py Qwen3.6-27B-IQ4_KS.gguf Qwen3.6-27B-MTP-Q8_0.gguf Qwen3.6-27B-MTP-IQ4_KS.gguf | |\n| \"\"\" | |\n| import hashlib | |\n| import sys | |\n| import struct | |\n| from pathlib import Path | |\n| from gguf import GGUFReader, GGUFValueType | |\n| def get_field_value(reader: GGUFReader, key: str): | |\n| \"\"\"Safely get a field value from GGUFReader.\"\"\" | |\n| field = reader.get_field(key) | |\n| return field.contents() if field else None | |\n| def calculate_on_disk_sizes(tensors, file_size): | |\n| \"\"\"Calculate on-disk size for each tensor (including per-row metadata/padding).\"\"\" | |\n| n_tensors = len(tensors) | |\n| sizes = [] | |\n| for i in range(n_tensors): | |\n| if i < n_tensors - 1: | |\n| sizes.append(tensors[i + 1].data_offset - tensors[i].data_offset) | |\n| else: | |\n| sizes.append(file_size - tensors[i].data_offset) | |\n| return sizes | |\n| def write_kv_value(fout, kv_type, value): | |\n| \"\"\"Write a KV value to the output file.\"\"\" | |\n| if kv_type == GGUFValueType.STRING: | |\n| value_bytes = value.encode(\"utf-8\") | |\n| fout.write(struct.pack(\"<Q\", len(value_bytes))) | |\n| fout.write(value_bytes) | |\n| elif kv_type == GGUFValueType.ARRAY: | |\n| # This is handled separately in the main code | |\n| pass | |\n| elif kv_type in (GGUFValueType.UINT8, GGUFValueType.INT8, GGUFValueType.BOOL): | |\n| fout.write(struct.pack(\"<B\", value)) | |\n| elif kv_type in (GGUFValueType.UINT16, GGUFValueType.INT16): | |\n| fout.write(struct.pack(\"<H\", value)) | |\n| elif kv_type in (GGUFValueType.UINT32, GGUFValueType.INT32): | |\n| fout.write(struct.pack(\"<I\", value)) | |\n| elif kv_type == GGUFValueType.FLOAT32: | |\n| fout.write(struct.pack(\"<f\", value)) | |\n| elif kv_type in (GGUFValueType.UINT64, GGUFValueType.INT64): | |\n| fout.write(struct.pack(\"<Q\", value)) | |\n| elif kv_type == GGUFValueType.FLOAT64: | |\n| fout.write(struct.pack(\"<d\", value)) | |\n| def write_array_value(fout, sub_type, arr): | |\n| \"\"\"Write an array KV value to the output file.\"\"\" | |\n| fout.write(struct.pack(\"<I\", int(sub_type))) | |\n| fout.write(struct.pack(\"<Q\", len(arr))) | |\n| for elem in arr: | |\n| if sub_type == GGUFValueType.STRING: | |\n| elem_bytes = elem.encode(\"utf-8\") | |\n| fout.write(struct.pack(\"<Q\", len(elem_bytes))) | |\n| fout.write(elem_bytes) | |\n| elif sub_type in (GGUFValueType.UINT8, GGUFValueType.INT8, GGUFValueType.BOOL): | |\n| fout.write(struct.pack(\"<B\", elem)) | |\n| elif sub_type in (GGUFValueType.UINT16, GGUFValueType.INT16): | |\n| fout.write(struct.pack(\"<H\", elem)) | |\n| elif sub_type in (GGUFValueType.UINT32, GGUFValueType.INT32): | |\n| fout.write(struct.pack(\"<I\", elem)) | |\n| elif sub_type == GGUFValueType.FLOAT32: | |\n| fout.write(struct.pack(\"<f\", elem)) | |\n| elif sub_type in (GGUFValueType.UINT64, GGUFValueType.INT64): | |\n| fout.write(struct.pack(\"<Q\", elem)) | |\n| elif sub_type == GGUFValueType.FLOAT64: | |\n| fout.write(struct.pack(\"<d\", elem)) | |\n| def main() -> None: | |\n| if len(sys.argv) != 4: | |\n| print( | |\n| f\"Usage: {sys.argv[0]} <target.gguf> <source.gguf> <output.gguf>\", | |\n| file=sys.stderr, | |\n| ) | |\n| sys.exit(1) | |\n| target_path, source_path, output_path = sys.argv[1], sys.argv[2], sys.argv[3] | |\n| # ------------------------------------------------------------------ | |\n| # 1. Open both files | |\n| # ------------------------------------------------------------------ | |\n| print(f\"Reading target: {target_path}\") | |\n| target_reader = GGUFReader(target_path) | |\n| print(f\"Reading source: {source_path}\") | |\n| source_reader = GGUFReader(source_path) | |\n| target_file_size = Path(target_path).stat().st_size | |\n| source_file_size = Path(source_path).stat().st_size | |\n| print( | |\n| f\" Target tensors: {len(target_reader.tensors)}, KVs: {len([k for k in target_reader.fields if not k.startswith('GGUF.')])}\" | |\n| ) | |\n| print( | |\n| f\" Source tensors: {len(source_reader.tensors)}, KVs: {len([k for k in source_reader.fields if not k.startswith('GGUF.')])}\" | |\n| ) | |\n| # ------------------------------------------------------------------ | |\n| # 2. Read architecture and MTP metadata from source | |\n| # ------------------------------------------------------------------ | |\n| arch = get_field_value(target_reader, \"general.architecture\") | |\n| if arch is None: | |\n| print(\"ERROR: Target GGUF has no general.architecture key\") | |\n| sys.exit(1) | |\n| source_block_count = get_field_value(source_reader, f\"{arch}.block_count\") | |\n| source_nextn = get_field_value(source_reader, f\"{arch}.nextn_predict_layers\") | |\n| if source_nextn is None: | |\n| print(\"ERROR: Source GGUF has no nextn_predict_layers key\") | |\n| sys.exit(1) | |\n| target_block_count = get_field_value(target_reader, f\"{arch}.block_count\") | |\n| print(f\"\\n Arch: {arch}\") | |\n| print(f\" Target block_count: {target_block_count}\") | |\n| print( | |\n| f\" Source block_count: {source_block_count}, nextn_predict_layers: {source_nextn}\" | |\n| ) | |\n| # Identify extra tensors in the source (blocks beyond target's count) | |\n| source_extra = [ | |\n| t | |\n| for t in source_reader.tensors | |\n| if t.name.startswith(f\"blk.{target_block_count}.\") | |\n| ] | |\n| print(f\"\\n Extra tensors to transplant: {len(source_extra)}\") | |\n| if not source_extra: | |\n| print( | |\n| f\"ERROR: No tensors found with prefix 'blk.{target_block_count}.' in source\" | |\n| ) | |\n| sys.exit(1) | |\n| # ------------------------------------------------------------------ | |\n| # 3. Prepare tensor lists and calculate sizes | |\n| # ------------------------------------------------------------------ | |\n| # Combine tensors: all from target + extra from source | |\n| all_tensors = list(target_reader.tensors) + source_extra | |\n| # Calculate on-disk sizes for source tensors (including per-row metadata) | |\n| target_on_disk_sizes = calculate_on_disk_sizes( | |\n| target_reader.tensors, target_file_size | |\n| ) | |\n| source_on_disk_sizes = calculate_on_disk_sizes( | |\n| source_reader.tensors, source_file_size | |\n| ) | |\n| # Create mapping for source tensors | |\n| source_tensor_map = { | |\n| t.name: (t, size) | |\n| for t, size in zip(source_reader.tensors, source_on_disk_sizes) | |\n| } | |\n| # ------------------------------------------------------------------ | |\n| # 4. Write output file | |\n| # ------------------------------------------------------------------ | |\n| print(f\"\\nWriting output: {output_path}\") | |\n| with ( | |\n| open(target_path, \"rb\") as target_fin, | |\n| open(source_path, \"rb\") as source_fin, | |\n| open(output_path, \"wb\") as fout, | |\n| ): | |\n| # 4.1 Write header | |\n| # Magic (4 bytes) | |\n| fout.write(b\"GGUF\") | |\n| # Version (4 bytes) | |\n| fout.write(struct.pack(\"<I\", 3)) | |\n| # Tensor count (8 bytes) | |\n| fout.write(struct.pack(\"<Q\", len(all_tensors))) | |\n| # Calculate KV count | |\n| kv_count = len( | |\n| [k for k in target_reader.fields.keys() if not k.startswith(\"GGUF.\")] | |\n| ) | |\n| kv_count += 1 # block_count override | |\n| # Add source-only KVs (excluding block_count and nextn_predict_layers) | |\n| for key in source_reader.fields: | |\n| if ( | |\n| not key.startswith(\"GGUF.\") | |\n| and key not in target_reader.fields | |\n| and key != f\"{arch}.block_count\" | |\n| and key != f\"{arch}.nextn_predict_layers\" | |\n| ): | |\n| kv_count += 1 | |\n| # KV count (8 bytes) | |\n| fout.write(struct.pack(\"<Q\", kv_count)) | |\n| # 4.2 Write KV data from target (with block_count override) | |\n| written_keys = set() | |\n| for key, field in target_reader.fields.items(): | |\n| if key.startswith(\"GGUF.\"): | |\n| continue | |\n| # Skip block_count (we'll override it) | |\n| if key == f\"{arch}.block_count\": | |\n| continue | |\n| # Write key | |\n| key_bytes = key.encode(\"utf-8\") | |\n| fout.write(struct.pack(\"<Q\", len(key_bytes))) | |\n| fout.write(key_bytes) | |\n| # Write type | |\n| kv_type = field.types[0] | |\n| fout.write(struct.pack(\"<I\", int(kv_type))) | |\n| # Write value | |\n| if kv_type == GGUFValueType.STRING: | |\n| write_kv_value(fout, kv_type, field.contents()) | |\n| elif kv_type == GGUFValueType.ARRAY: | |\n| sub_type = ( | |\n| field.types[1] if len(field.types) > 1 else GGUFValueType.FLOAT32 | |\n| ) | |\n| write_array_value(fout, sub_type, field.contents()) | |\n| else: | |\n| write_kv_value(fout, kv_type, field.contents()) | |\n| written_keys.add(key) | |\n| # Add block_count from source | |\n| key = f\"{arch}.block_count\" | |\n| key_bytes = key.encode(\"utf-8\") | |\n| fout.write(struct.pack(\"<Q\", len(key_bytes))) | |\n| fout.write(key_bytes) | |\n| fout.write(struct.pack(\"<I\", int(GGUFValueType.UINT32))) | |\n| fout.write(struct.pack(\"<I\", source_block_count)) | |\n| written_keys.add(key) | |\n| # Add nextn_predict_layers from source | |\n| key = f\"{arch}.nextn_predict_layers\" | |\n| key_bytes = key.encode(\"utf-8\") | |\n| fout.write(struct.pack(\"<Q\", len(key_bytes))) | |\n| fout.write(key_bytes) | |\n| fout.write(struct.pack(\"<I\", int(GGUFValueType.UINT32))) | |\n| fout.write(struct.pack(\"<I\", source_nextn)) | |\n| written_keys.add(key) | |\n| # Copy source-only KVs | |\n| for key, field in source_reader.fields.items(): | |\n| if ( | |\n| key.startswith(\"GGUF.\") | |\n| or key in written_keys | |\n| or key == f\"{arch}.nextn_predict_layers\" | |\n| ): | |\n| continue | |\n| # Write key | |\n| key_bytes = key.encode(\"utf-8\") | |\n| fout.write(struct.pack(\"<Q\", len(key_bytes))) | |\n| fout.write(key_bytes) | |\n| # Write type | |\n| kv_type = field.types[0] | |\n| fout.write(struct.pack(\"<I\", int(kv_type))) | |\n| # Write value | |\n| if kv_type == GGUFValueType.STRING: | |\n| write_kv_value(fout, kv_type, field.contents()) | |\n| elif kv_type == GGUFValueType.ARRAY: | |\n| sub_type = ( | |\n| field.types[1] if len(field.types) > 1 else GGUFValueType.FLOAT32 | |\n| ) | |\n| write_array_value(fout, sub_type, field.contents()) | |\n| else: | |\n| write_kv_value(fout, kv_type, field.contents()) | |\n| # 4.3 Write tensor info | |\n| # Calculate offsets for all tensors | |\n| current_offset = 0 | |\n| tensor_offsets = [] | |\n| for i, tensor in enumerate(all_tensors): | |\n| if i < len(target_reader.tensors): | |\n| size = target_on_disk_sizes[i] | |\n| else: | |\n| _, size = source_tensor_map[tensor.name] | |\n| tensor_offsets.append(current_offset) | |\n| current_offset += size | |\n| # Write tensor info for each tensor | |\n| for i, tensor in enumerate(all_tensors): | |\n| # Tensor name | |\n| name_bytes = tensor.name.encode(\"utf-8\") | |\n| fout.write(struct.pack(\"<Q\", len(name_bytes))) | |\n| fout.write(name_bytes) | |\n| # Dimensions (in GGUF file order: fastest-varying first) | |\n| shape = tensor.shape.tolist() | |\n| fout.write(struct.pack(\"<I\", len(shape))) | |\n| for dim in shape: | |\n| fout.write(struct.pack(\"<Q\", dim)) | |\n| # Quantization type | |\n| fout.write(struct.pack(\"<I\", int(tensor.tensor_type))) | |\n| # Offset | |\n| fout.write(struct.pack(\"<Q\", tensor_offsets[i])) | |\n| # 4.4 Pad to alignment if needed | |\n| current_pos = fout.tell() | |\n| alignment = get_field_value(target_reader, \"general.alignment\") or 32 | |\n| padding_needed = (alignment - (current_pos % alignment)) % alignment | |\n| if padding_needed: | |\n| fout.write(b\"\\x00\" * padding_needed) | |\n| # 4.5 Copy tensor data | |\n| print(f\"Copying {len(all_tensors)} tensors...\") | |\n| for i, tensor in enumerate(all_tensors): | |\n| if i < len(target_reader.tensors): | |\n| # Target tensor | |\n| offset = target_reader.tensors[i].data_offset | |\n| size = target_on_disk_sizes[i] | |\n| fin = target_fin | |\n| else: | |\n| # Source extra tensor | |\n| src_tensor, size = source_tensor_map[tensor.name] | |\n| offset = src_tensor.data_offset | |\n| fin = source_fin | |\n| fin.seek(offset) | |\n| raw_data = fin.read(size) | |\n| fout.write(raw_data) | |\n| if (i + 1) % 50 == 0 or i == len(all_tensors) - 1: | |\n| print(f\" Copied {i + 1}/{len(all_tensors)} tensors\") | |\n| # ------------------------------------------------------------------ | |\n| # 5. Verify output | |\n| # ------------------------------------------------------------------ | |\n| output_size = Path(output_path).stat().st_size | |\n| print(f\"\\nOutput: {output_path}\") | |\n| print(f\" Size: {output_size / 1_000_000_000:.2f} GB\") | |\n| print(f\" Tensors: {len(all_tensors)}\") | |\n| # Validate | |\n| print(\"\\nValidating output...\") | |\n| errors = [] | |\n| try: | |\n| out_reader = GGUFReader(output_path) | |\n| # Check block_count | |\n| out_block_count = get_field_value(out_reader, f\"{arch}.block_count\") | |\n| if out_block_count != source_block_count: | |\n| errors.append( | |\n| f\"block_count: expected {source_block_count}, got {out_block_count}\" | |\n| ) | |\n| # Check nextn_predict_layers | |\n| out_nextn = get_field_value(out_reader, f\"{arch}.nextn_predict_layers\") | |\n| if out_nextn != source_nextn: | |\n| errors.append( | |\n| f\"nextn_predict_layers: expected {source_nextn}, got {out_nextn}\" | |\n| ) | |\n| # Check extra tensors exist | |\n| out_tensor_names = {t.name for t in out_reader.tensors} | |\n| for tensor in source_extra: | |\n| if tensor.name not in out_tensor_names: | |\n| errors.append(f\"Missing tensor: {tensor.name}\") | |\n| # Spot-check tensor data integrity | |\n| print(\" Spot-checking tensor data integrity...\") | |\n| out_tensors = {t.name: t for t in out_reader.tensors} | |\n| # Check a target tensor | |\n| for name in [\"token_embd.weight\"]: | |\n| if name in out_tensors and name in {t.name for t in target_reader.tensors}: | |\n| target_t = next( | |\n| (t for t in target_reader.tensors if t.name == name), None | |\n| ) | |\n| out_t = out_tensors.get(name) | |\n| if target_t and out_t: | |\n| target_hash = hashlib.sha256(target_t.data.tobytes()).hexdigest()[ | |\n| :16 | |\n| ] | |\n| out_hash = hashlib.sha256(out_t.data.tobytes()).hexdigest()[:16] | |\n| if target_hash == out_hash: | |\n| print(f\" {name}: OK ({out_hash})\") | |\n| else: | |\n| errors.append(f\"Data mismatch: {name}\") | |\n| # Check an extra tensor | |\n| if source_extra: | |\n| extra_name = source_extra[0].name | |\n| source_t = source_tensor_map[extra_name][0] | |\n| out_t = out_tensors.get(extra_name) | |\n| if out_t: | |\n| source_hash = hashlib.sha256(source_t.data.tobytes()).hexdigest()[:16] | |\n| out_hash = hashlib.sha256(out_t.data.tobytes()).hexdigest()[:16] | |\n| if source_hash == out_hash: | |\n| print(f\" {extra_name}: OK ({out_hash})\") | |\n| else: | |\n| errors.append(f\"Data mismatch: {extra_name}\") | |\n| except Exception as e: | |\n| errors.append(f\"Failed to read output: {e}\") | |\n| if errors: | |\n| print(\"\\nVALIDATION FAILED:\") | |\n| for err in errors: | |\n| print(f\" - {err}\") | |\n| sys.exit(1) | |\n| else: | |\n| print(\" OK — all checks passed\") | |\n| print(f\"\\nDone. Output: {output_path}\") | |\n| if __name__ == \"__main__\": | |\n| main() |", "url": "https://wpnews.pro/news/transplant-mtp-block-from-one-gguf-file-into-another", "canonical_source": "https://gist.github.com/buzz/1c439684d5e3f36492ae9f64ef7e3f67", "published_at": "2026-05-01 23:30:18+00:00", "updated_at": "2026-05-27 23:24:08.000014+00:00", "lang": "en", "topics": ["large-language-models", "artificial-intelligence", "machine-learning", "ai-tools", "ai-infrastructure"], "entities": ["AzerbaijanNyan", "Qwen3.6-35A3B", "Qwen3.6-27b", "Hugging Face", "GGUF", "llama.cpp"], "alternates": {"html": "https://wpnews.pro/news/transplant-mtp-block-from-one-gguf-file-into-another", "markdown": "https://wpnews.pro/news/transplant-mtp-block-from-one-gguf-file-into-another.md", "text": "https://wpnews.pro/news/transplant-mtp-block-from-one-gguf-file-into-another.txt", "jsonld": "https://wpnews.pro/news/transplant-mtp-block-from-one-gguf-file-into-another.jsonld"}}