{"slug": "microgpt", "title": "microgpt", "summary": "`microgpt.py` is a minimal, dependency-free Python implementation of a GPT (Generative Pre-trained Transformer) model for training and inference. It includes a complete autograd system, a character-level tokenizer, and a transformer with one layer, an embedding dimension of 16, and a context window of 16 tokens. The code is designed to be the most atomic version of the algorithm, prioritizing clarity over efficiency.", "body_md": "\"\"\"\nThe most atomic way to train and run inference for a GPT in pure, dependency-free Python.\nThis file is the complete algorithm.\nEverything else is just efficiency.\n\n@karpathy\n\"\"\"\n\nimport os       # os.path.exists\nimport math     # math.log, math.exp\nimport random   # random.seed, random.choices, random.gauss, random.shuffle\nrandom.seed(42) # Let there be order among chaos\n\n# Let there be a Dataset `docs`: list[str] of documents (e.g. a list of names)\nif not os.path.exists('input.txt'):\n    import urllib.request\n    names_url = 'https://raw.githubusercontent.com/karpathy/makemore/988aa59/names.txt'\n    urllib.request.urlretrieve(names_url, 'input.txt')\ndocs = [line.strip() for line in open('input.txt') if line.strip()]\nrandom.shuffle(docs)\nprint(f\"num docs: {len(docs)}\")\n\n# Let there be a Tokenizer to translate strings to sequences of integers (\"tokens\") and back\nuchars = sorted(set(''.join(docs))) # unique characters in the dataset become token ids 0..n-1\nBOS = len(uchars) # token id for a special Beginning of Sequence (BOS) token\nvocab_size = len(uchars) + 1 # total number of unique tokens, +1 is for BOS\nprint(f\"vocab size: {vocab_size}\")\n\n# Let there be Autograd to recursively apply the chain rule through a computation graph\nclass Value:\n    __slots__ = ('data', 'grad', '_children', '_local_grads') # Python optimization for memory usage\n\n    def __init__(self, data, children=(), local_grads=()):\n        self.data = data                # 
scalar value of this node calculated during forward pass\n        self.grad = 0                   # derivative of the loss w.r.t. this node, calculated in backward pass\n        self._children = children       # children of this node in the computation graph\n        self._local_grads = local_grads # local derivative of this node w.r.t. its children\n\n    def __add__(self, other):\n        other = other if isinstance(other, Value) else Value(other)\n        return Value(self.data + other.data, (self, other), (1, 1))\n\n    def __mul__(self, other):\n        other = other if isinstance(other, Value) else Value(other)\n        return Value(self.data * other.data, (self, other), (other.data, self.data))\n\n    def __pow__(self, other): return Value(self.data**other, (self,), (other * self.data**(other-1),))\n    def log(self): return Value(math.log(self.data), (self,), (1/self.data,))\n    def exp(self): return Value(math.exp(self.data), (self,), (math.exp(self.data),))\n    def relu(self): return Value(max(0, self.data), (self,), (float(self.data > 0),))\n    def __neg__(self): return self * -1\n    def __radd__(self, other): return self + other\n    def __sub__(self, other): return self + (-other)\n    def __rsub__(self, other): return other + (-self)\n    def __rmul__(self, other): return self * other\n    def __truediv__(self, other): return self * other**-1\n    def __rtruediv__(self, other): return other * self**-1\n\n    def backward(self):\n        topo = []\n        visited = set()\n        def build_topo(v):\n            if v not in visited:\n                visited.add(v)\n                for child in v._children:\n                    build_topo(child)\n                topo.append(v)\n        build_topo(self)\n        self.grad = 1\n        for v in reversed(topo):\n            for child, local_grad in zip(v._children, v._local_grads):\n                child.grad += local_grad * v.grad\n\n# Initialize the parameters, to store the knowledge of the 
model\nn_layer = 1     # depth of the transformer neural network (number of layers)\nn_embd = 16     # width of the network (embedding dimension)\nblock_size = 16 # maximum context length of the attention window (note: the longest name is 15 characters)\nn_head = 4      # number of attention heads\nhead_dim = n_embd // n_head # derived dimension of each head\nmatrix = lambda nout, nin, std=0.08: [[Value(random.gauss(0, std)) for _ in range(nin)] for _ in range(nout)]\nstate_dict = {'wte': matrix(vocab_size, n_embd), 'wpe': matrix(block_size, n_embd), 'lm_head': matrix(vocab_size, n_embd)}\nfor i in range(n_layer):\n    state_dict[f'layer{i}.attn_wq'] = matrix(n_embd, n_embd)\n    state_dict[f'layer{i}.attn_wk'] = matrix(n_embd, n_embd)\n    state_dict[f'layer{i}.attn_wv'] = matrix(n_embd, n_embd)\n    state_dict[f'layer{i}.attn_wo'] = matrix(n_embd, n_embd)\n    state_dict[f'layer{i}.mlp_fc1'] = matrix(4 * n_embd, n_embd)\n    state_dict[f'layer{i}.mlp_fc2'] = matrix(n_embd, 4 * n_embd)\nparams = [p for mat in state_dict.values() for row in mat for p in row] # flatten params into a single list[Value]\nprint(f\"num params: {len(params)}\")\n\n# Define the model architecture: a function mapping tokens and parameters to logits over what comes next\n# Follow GPT-2, blessed among the GPTs, with minor differences: layernorm -> rmsnorm, no biases, GeLU -> ReLU\ndef linear(x, w):\n    return [sum(wi * xi for wi, xi in zip(wo, x)) for wo in w]\n\ndef softmax(logits):\n    max_val = max(val.data for val in logits)\n    exps = [(val - max_val).exp() for val in logits]\n    total = sum(exps)\n    return [e / total for e in exps]\n\ndef rmsnorm(x):\n    ms = sum(xi * xi for xi in x) / len(x)\n    scale = (ms + 1e-5) ** -0.5\n    return [xi * scale for xi in x]\n\ndef gpt(token_id, pos_id, keys, values):\n    tok_emb = state_dict['wte'][token_id] # token embedding\n    pos_emb = state_dict['wpe'][pos_id] # position embedding\n    x = [t + p for t, p in zip(tok_emb, pos_emb)] # 
joint token and position embedding\n    x = rmsnorm(x) # note: not redundant due to backward pass via the residual connection\n\n    for li in range(n_layer):\n        # 1) Multi-head Attention block\n        x_residual = x\n        x = rmsnorm(x)\n        q = linear(x, state_dict[f'layer{li}.attn_wq'])\n        k = linear(x, state_dict[f'layer{li}.attn_wk'])\n        v = linear(x, state_dict[f'layer{li}.attn_wv'])\n        keys[li].append(k)\n        values[li].append(v)\n        x_attn = []\n        for h in range(n_head):\n            hs = h * head_dim\n            q_h = q[hs:hs+head_dim]\n            k_h = [ki[hs:hs+head_dim] for ki in keys[li]]\n            v_h = [vi[hs:hs+head_dim] for vi in values[li]]\n            attn_logits = [sum(q_h[j] * k_h[t][j] for j in range(head_dim)) / head_dim**0.5 for t in range(len(k_h))]\n            attn_weights = softmax(attn_logits)\n            head_out = [sum(attn_weights[t] * v_h[t][j] for t in range(len(v_h))) for j in range(head_dim)]\n            x_attn.extend(head_out)\n        x = linear(x_attn, state_dict[f'layer{li}.attn_wo'])\n        x = [a + b for a, b in zip(x, x_residual)]\n        # 2) MLP block\n        x_residual = x\n        x = rmsnorm(x)\n        x = linear(x, state_dict[f'layer{li}.mlp_fc1'])\n        x = [xi.relu() for xi in x]\n        x = linear(x, state_dict[f'layer{li}.mlp_fc2'])\n        x = [a + b for a, b in zip(x, x_residual)]\n\n    logits = linear(x, state_dict['lm_head'])\n    return logits\n\n# Let there be Adam, the blessed optimizer and its buffers\nlearning_rate, beta1, beta2, eps_adam = 0.01, 0.85, 0.99, 1e-8\nm = [0.0] * len(params) # first moment buffer\nv = [0.0] * len(params) # second moment buffer\n\n# Repeat in sequence\nnum_steps = 1000 # number of training steps\nfor step in range(num_steps):\n\n    # Take single document, tokenize it, surround it with BOS special token on both sides\n    doc = docs[step % len(docs)]\n    tokens = [BOS] + [uchars.index(ch) for ch in doc] + 
[BOS]\n    n = min(block_size, len(tokens) - 1)\n\n    # Forward the token sequence through the model, building up the computation graph all the way to the loss\n    keys, values = [[] for _ in range(n_layer)], [[] for _ in range(n_layer)]\n    losses = []\n    for pos_id in range(n):\n        token_id, target_id = tokens[pos_id], tokens[pos_id + 1]\n        logits = gpt(token_id, pos_id, keys, values)\n        probs = softmax(logits)\n        loss_t = -probs[target_id].log()\n        losses.append(loss_t)\n    loss = (1 / n) * sum(losses) # final average loss over the document sequence. May yours be low.\n\n    # Backward the loss, calculating the gradients with respect to all model parameters\n    loss.backward()\n\n    # Adam optimizer update: update the model parameters based on the corresponding gradients\n    lr_t = learning_rate * (1 - step / num_steps) # linear learning rate decay\n    for i, p in enumerate(params):\n        m[i] = beta1 * m[i] + (1 - beta1) * p.grad\n        v[i] = beta2 * v[i] + (1 - beta2) * p.grad ** 2\n        m_hat = m[i] / (1 - beta1 ** (step + 1))\n        v_hat = v[i] / (1 - beta2 ** (step + 1))\n        p.data -= lr_t * m_hat / (v_hat ** 0.5 + eps_adam)\n        p.grad = 0\n\n    print(f\"step {step+1:4d} / {num_steps:4d} | loss {loss.data:.4f}\", end='\\r')\n\n# Inference: may the model babble back to us\ntemperature = 0.5 # in (0, 1], control the \"creativity\" of generated text, low to high\nprint(\"\\n--- inference (new, hallucinated names) ---\")\nfor sample_idx in range(20):\n    keys, values = [[] for _ in range(n_layer)], [[] for _ in range(n_layer)]\n    token_id = BOS\n    sample = []\n    for pos_id in range(block_size):\n        logits = gpt(token_id, pos_id, keys, values)\n        probs = softmax([l / temperature for l in logits])\n        token_id = random.choices(range(vocab_size), weights=[p.data for p in probs])[0]\n        if token_id == BOS:\n            break\n        sample.append(uchars[token_id])\n    
print(f\"sample {sample_idx+1:2d}: {''.join(sample)}\")", "url": "https://wpnews.pro/news/microgpt", "canonical_source": "https://gist.github.com/karpathy/8627fe009c40f57531cb18360106ce95", "published_at": "2026-02-11 21:08:51+00:00", "updated_at": "2026-05-21 15:43:50.250686+00:00", "lang": "en", "topics": ["machine-learning", "large-language-models", "open-source", "developer-tools", "research"], "entities": ["Karpathy"], "alternates": {"html": "https://wpnews.pro/news/microgpt", "markdown": "https://wpnews.pro/news/microgpt.md", "text": "https://wpnews.pro/news/microgpt.txt", "jsonld": "https://wpnews.pro/news/microgpt.jsonld"}}