{"slug": "gist-6a5e632583c67dadf84d68d339cdf799", "title": "gist:6a5e632583c67dadf84d68d339cdf799", "summary": "A developer built an AUR malware monitor that performs static analysis on Arch User Repository packages. The tool fetches AUR metadata, applies regex pre-filters, and retrieves git diffs via cgit to identify potentially malicious packages without executing any code. It stages suspicious candidates for human review or automated classification.", "body_md": "| #!/usr/bin/env python3 | |\n| \"\"\"AUR malware monitor — candidate dropper. | |\n| Static analysis only. Never runs makepkg, never sources PKGBUILDs, never | |\n| executes anything fetched. Fetches AUR bulk metadata, selects newly created / | |\n| recently modified packages, runs a cheap offline regex pre-filter, fetches the | |\n| git diff of each candidate over cgit (HTTP, no clones), and stages the diff + | |\n| metadata into per-candidate folders for a human (or `claude -p`) to classify. | |\n| See design.md for the threat model and pipeline. | |\n| \"\"\" | |\n| from __future__ import annotations | |\n| import argparse | |\n| import gzip | |\n| import json | |\n| import re | |\n| import shutil | |\n| import subprocess | |\n| import sys | |\n| import time | |\n| from concurrent.futures import ThreadPoolExecutor | |\n| from dataclasses import dataclass, field | |\n| from datetime import datetime, timezone | |\n| from pathlib import Path | |\n| import requests | |\n| # Known-bad indicators. If any appears in a candidate's diff it's marked | |\n| # malicious outright — no need to spend a classification call. (atomic-lockfile | |\n| # + execa were the payload pulled by the 2026 ledger-udev-bin specimen.) | |\n| KNOWN_BAD_IOCS = [ | |\n| \"atomic-lockfile\", | |\n| ] | |\n| # Verdicts the analyst may return, ordered clean -> worst. Anything at or above | |\n| # ALERT_THRESHOLD rings the terminal bell. 
| |\n| VERDICT_ORDER = [\"clean\", \"review\", \"suspicious\", \"malicious\"] | |\n| ALERT_THRESHOLD = \"suspicious\" | |\n| VERDICT_RANK = {v: i for i, v in enumerate(VERDICT_ORDER)} | |\n| USER_AGENT = \"aur-malware-monitor/0.1 (static-analysis; +https://github.com/local/aur-monitor)\" | |\n| META_URL = \"https://aur.archlinux.org/packages-meta-ext-v1.json.gz\" | |\n| # cgit patch view: latest commit as a git-format patch, including its SHA in the | |\n| # \"From <sha>\" header. The pkgbase and `id2` lower bound are passed as query | |\n| # params so requests URL-encodes them (pkgbases can contain `+`, which would | |\n| # otherwise be read as a space). | |\n| CGIT_PATCH_URL = \"https://aur.archlinux.org/cgit/aur.git/patch/\" | |\n| PKG_PAGE_URL = \"https://aur.archlinux.org/packages/{name}\" | |\n| STATE_DIR = Path.home() / \".cache\" / \"aur-watch\" | |\n| STATE_FILE = STATE_DIR / \"state.json\" | |\n| MAX_DIFF_BYTES = 512 * 1024 # cap on a single staged diff | |\n| # --- Regex pre-filter (cheap, offline) ------------------------------------- | |\n| # Each rule is (signal_name, compiled_regex). A hit prioritises the candidate | |\n| # for review and is recorded in meta.json. See design.md \"Regex pre-filter\". 
| |\n| BRAND_KEYWORDS = ( | |\n| \"firefox|chrome|brave|librewolf|zen|tor|vivaldi|opera|telegram|\" | |\n| \"signal|discord|spotify|zoom|vscode\" | |\n| ) | |\n| PREFILTER_RULES: list[tuple[str, re.Pattern]] = [ | |\n| (\"pipe-to-shell\", re.compile(r\"(curl|wget|fetch)[^\\n]*\\|\\s*(bash|sh|zsh)\", re.I)), | |\n| (\"exec-process-substitution\", re.compile(r\"(bash|sh)\\s+<\\(\\s*(curl|wget)\", re.I)), | |\n| (\"python-remote-exec\", re.compile( | |\n| r\"python[0-9.]*\\s+-c[^\\n]*(urlopen|requests\\.get|urllib)\", re.I)), | |\n| (\"base64-to-shell\", re.compile(r\"base64\\s+-d[^\\n]*\\|\\s*(ba)?sh\", re.I)), | |\n| (\"hardcoded-ip-port\", re.compile(r\"\\b(?:\\d{1,3}\\.){3}\\d{1,3}:\\d+\")), | |\n| (\"drop-host-or-paste\", re.compile( | |\n| r\"pastebin|gist\\.githubusercontent|0x0\\.st|transfer\\.sh|anonfiles|\" | |\n| r\"file\\.io|ngrok|bit\\.ly|tinyurl\", re.I)), | |\n| (\"tmp-write-exec\", re.compile(r\"chmod\\s+\\+x[^\\n]*/tmp\", re.I)), | |\n| (\"tmp-unit\", re.compile(r\"(systemd|cron|timer)[^\\n]*/tmp\", re.I)), | |\n| (\"sudo\", re.compile(r\"\\bsudo\\b\")), | |\n| (\"long-base64-or-hex\", re.compile(r\"[A-Za-z0-9+/]{120,}={0,2}\")), | |\n| # `npm install` / `npx` pulling packages at build or install time fetches | |\n| # and runs arbitrary remote code — flag it even for ostensibly non-JS | |\n| # packages (e.g. a \"udev rules\" pkg that npm-installs in a /tmp hook). | |\n| (\"npm-install-exec\", re.compile(r\"\\b(npm\\s+(install|i|exec)|npx)\\b\", re.I)), | |\n| ] | |\n| # Match a brand keyword only as a whole token (bounded by start/end or a | |\n| # non-alphanumeric separator), so \"tor\" doesn't fire inside \"alligator\". 
| |\n| BRAND_RE = re.compile(rf\"(?:^|[^a-z0-9])(?:{BRAND_KEYWORDS})(?:[^a-z0-9]|$)\", re.I) | |\n| BIN_SUFFIX_RE = re.compile(r\"-bin$\", re.I) | |\n| PATCH_HINT_RE = re.compile(r\"-(patch|fix|patched|mod|hotfix)\\b\", re.I) | |\n| # --- Data model ------------------------------------------------------------- | |\n| @dataclass | |\n| class Candidate: | |\n| pkgbase: str | |\n| name: str | |\n| maintainer: str | None | |\n| num_votes: int | |\n| popularity: float | |\n| first_submitted: int | |\n| last_modified: int | |\n| version: str | |\n| reason: str # \"new\" | \"modified\" | \"since-hours\" | |\n| signals: list[str] = field(default_factory=list) | |\n| priority: int = 0 | |\n| # --- State ------------------------------------------------------------------ | |\n| def load_state() -> dict: | |\n| if STATE_FILE.exists(): | |\n| try: | |\n| return json.loads(STATE_FILE.read_text()) | |\n| except (json.JSONDecodeError, OSError): | |\n| pass | |\n| # seen maps pkgbase -> {\"version\": str, \"sha\": str} | |\n| return {\"last_run\": 0, \"etag\": None, \"seen\": {}} | |\n| def save_state(state: dict) -> None: | |\n| STATE_DIR.mkdir(parents=True, exist_ok=True) | |\n| STATE_FILE.write_text(json.dumps(state, indent=2)) | |\n| # --- Fetch ------------------------------------------------------------------ | |\n| def session() -> requests.Session: | |\n| s = requests.Session() | |\n| s.headers[\"User-Agent\"] = USER_AGENT | |\n| return s | |\n| def fetch_metadata(s: requests.Session, etag: str | None) -> tuple[list[dict] | None, str | None]: | |\n| \"\"\"Return (entries, new_etag). 
entries is None if 304 Not Modified.\"\"\" | |\n| headers = {} | |\n| if etag: | |\n| headers[\"If-None-Match\"] = etag | |\n| resp = s.get(META_URL, headers=headers, timeout=60) | |\n| if resp.status_code == 304: | |\n| return None, etag | |\n| resp.raise_for_status() | |\n| raw = gzip.decompress(resp.content) | |\n| return json.loads(raw), resp.headers.get(\"ETag\", etag) | |\n| SHA_HEADER_RE = re.compile(r\"^From ([0-9a-f]{7,40}) \", re.M) | |\n| def fetch_diff(s: requests.Session, pkgbase: str, since_sha: str | None | |\n| ) -> tuple[str, str | None]: | |\n| \"\"\"Fetch the git diff for a package over cgit. | |\n| Returns (diff_text, new_head_sha). If `since_sha` is given, the diff spans | |\n| since_sha..HEAD; otherwise it's the latest commit (or, for a brand-new | |\n| package, its initial commit — full file contents as additions). | |\n| \"\"\" | |\n| params = {\"h\": pkgbase} | |\n| if since_sha: | |\n| params[\"id2\"] = since_sha | |\n| resp = s.get(CGIT_PATCH_URL, params=params, timeout=60) | |\n| resp.raise_for_status() | |\n| text = resp.text | |\n| m = SHA_HEADER_RE.search(text) | |\n| new_sha = m.group(1) if m else None | |\n| return text, new_sha | |\n| # --- Candidate selection ---------------------------------------------------- | |\n| def select_candidates(entries: list[dict], last_run: int, since_cutoff: int | None) -> list[Candidate]: | |\n| # Lower bound a package must beat to qualify. last_run drives the | |\n| # incremental trigger; since_cutoff (if set) tightens it to a recent window. 
| |\n| cutoff = max(last_run, since_cutoff or 0) | |\n| cands: list[Candidate] = [] | |\n| for e in entries: | |\n| first = e.get(\"FirstSubmitted\", 0) or 0 | |\n| modified = e.get(\"LastModified\", 0) or 0 | |\n| is_new = first > cutoff | |\n| is_mod = modified > cutoff | |\n| if not (is_new or is_mod): | |\n| continue | |\n| reason = \"new\" if is_new else \"modified\" | |\n| cands.append(Candidate( | |\n| pkgbase=e.get(\"PackageBase\") or e.get(\"Name\", \"\"), | |\n| name=e.get(\"Name\", \"\"), | |\n| maintainer=e.get(\"Maintainer\"), | |\n| num_votes=e.get(\"NumVotes\", 0) or 0, | |\n| popularity=float(e.get(\"Popularity\", 0) or 0), | |\n| first_submitted=first, | |\n| last_modified=modified, | |\n| version=e.get(\"Version\", \"\"), | |\n| reason=reason, | |\n| )) | |\n| return cands | |\n| def rank_metadata(c: Candidate) -> None: | |\n| \"\"\"Apply metadata-only signals and a priority score (higher = check first).\"\"\" | |\n| prio = 0 | |\n| if c.reason == \"new\": | |\n| prio += 2 | |\n| if c.num_votes == 0: | |\n| prio += 1 | |\n| if c.popularity < 0.01: | |\n| prio += 1 | |\n| if BRAND_RE.search(c.name) and c.reason == \"new\": | |\n| c.signals.append(\"brand-name-new\") | |\n| prio += 3 | |\n| if BIN_SUFFIX_RE.search(c.name) and PATCH_HINT_RE.search(c.name): | |\n| c.signals.append(\"bin-patch-suffix\") | |\n| prio += 3 | |\n| c.priority = prio | |\n| def run_prefilter(text: str) -> list[str]: | |\n| return [name for name, rx in PREFILTER_RULES if rx.search(text)] | |\n| # --- Staging ---------------------------------------------------------------- | |\n| PROMPT_TEXT = \"\"\"\\ | |\n| You are a supply-chain security analyst reviewing the git diff of one Arch Linux | |\n| AUR package for malware. Judge ONLY the code in the diff — what it would *do* if | |\n| built or installed. The package's name, age, vote count, or popularity is NOT | |\n| evidence of anything; ignore it. 
For a brand-new package the diff is the initial | |\n| commit (the full PKGBUILD/scripts as `+` additions); otherwise it is just the | |\n| change. Focus on added lines (`+`). | |\n| Flag a diff only for concrete malicious behaviour: | |\n| - remote fetch-and-execute (`curl|bash`, `bash <(curl)`, `python -c` fetching, | |\n| `base64 -d | sh`, `eval` of downloaded data) | |\n| - hardcoded IP:port / C2; sources from paste sites, raw gists, IP URLs, | |\n| file-drop hosts, URL shorteners, ngrok | |\n| - write + `chmod +x` + exec in `/tmp`; systemd unit / cron / timer pointing at | |\n| dropped files; `sudo` | |\n| - obfuscation (long base64/hex, `\\\\x` escapes) | |\n| - a binary for a well-known app sourced from a non-official domain | |\n| (typosquat / masquerade) | |\n| - installing/running unrelated packages (e.g. `npm install`/` npx`) in build() | |\n| or an install hook | |\n| Most diffs are ordinary packaging and are `clean`. Do NOT downgrade a clean diff | |\n| to `review` just because the package is new, unpopular, or sparsely documented — | |\n| that is not a security signal. Legitimate `-bin` packages pulling from the | |\n| vendor's official release URL are clean. | |\n| Verdict scale (pick the lowest that fits): | |\n| - clean : nothing concerning in the diff. This is the common case. | |\n| - review : a specific line is genuinely ambiguous and a human should look — | |\n| not \"could be anything\", but \"this exact thing might be bad\". | |\n| - suspicious : a concrete pattern above is present and not clearly legitimate. | |\n| - malicious : clear fetch-and-execute, C2, or obfuscated payload. | |\n| Respond with ONLY a JSON object (no prose, no markdown fences). 
For a clean | |\n| diff, leave iocs/reasons/evidence empty: | |\n| { | |\n| \"verdict\": \"clean | review | suspicious | malicious\", | |\n| \"confidence\": 0.0, | |\n| \"iocs\": [\"130.162.225.47:8080\", \"https://...\"], | |\n| \"reasons\": [\"one-line findings tied to a specific line\"], | |\n| \"evidence\": [{\"snippet\": \"exact line\", \"why\": \"what's wrong\"}] | |\n| } | |\n| \"\"\" | |\n| # JSON schema for the verdict, used to grammar-constrain the local llama-server | |\n| # so it can only emit schema-valid JSON — small quantized models otherwise tend | |\n| # to ignore the \"respond with JSON\" instruction and parrot the diff or loop. | |\n| VERDICT_SCHEMA = { | |\n| \"type\": \"object\", | |\n| \"properties\": { | |\n| \"verdict\": {\"type\": \"string\", \"enum\": VERDICT_ORDER}, | |\n| \"confidence\": {\"type\": \"number\"}, | |\n| \"iocs\": {\"type\": \"array\", \"items\": {\"type\": \"string\"}}, | |\n| \"reasons\": {\"type\": \"array\", \"items\": {\"type\": \"string\"}}, | |\n| \"evidence\": { | |\n| \"type\": \"array\", | |\n| \"items\": { | |\n| \"type\": \"object\", | |\n| \"properties\": { | |\n| \"snippet\": {\"type\": \"string\"}, | |\n| \"why\": {\"type\": \"string\"}, | |\n| }, | |\n| \"required\": [\"snippet\", \"why\"], | |\n| }, | |\n| }, | |\n| }, | |\n| \"required\": [\"verdict\", \"confidence\", \"reasons\"], | |\n| } | |\n| def stage(out_dir: Path, candidate: Candidate, diff: str, new_sha: str | None) -> None: | |\n| cdir = out_dir / candidate.pkgbase | |\n| cdir.mkdir(parents=True, exist_ok=True) | |\n| (cdir / \"change.diff\").write_text(diff) | |\n| meta = { | |\n| \"pkgbase\": candidate.pkgbase, | |\n| \"name\": candidate.name, | |\n| \"maintainer\": candidate.maintainer, | |\n| \"num_votes\": candidate.num_votes, | |\n| \"popularity\": candidate.popularity, | |\n| \"first_submitted\": iso(candidate.first_submitted), | |\n| \"last_modified\": iso(candidate.last_modified), | |\n| \"version\": candidate.version, | |\n| \"reason\": 
candidate.reason, | |\n| \"head_sha\": new_sha, | |\n| \"prefilter_signals\": candidate.signals, | |\n| \"priority\": candidate.priority, | |\n| \"aur_page\": PKG_PAGE_URL.format(name=candidate.name), | |\n| } | |\n| (cdir / \"meta.json\").write_text(json.dumps(meta, indent=2)) | |\n| def iso(ts: int) -> str: | |\n| if not ts: | |\n| return \"\" | |\n| return datetime.fromtimestamp(ts, tz=timezone.utc).isoformat() | |\n| # --- Classification (claude -p) -------------------------------------------- | |\n| def _extract_json(text: str) -> dict | None: | |\n| \"\"\"Pull the first JSON object out of claude's stdout, fences or not.\"\"\" | |\n| text = text.strip() | |\n| fence = re.search(r\"```(?:json)?\\s*(\\{.*?\\})\\s*```\", text, re.S) | |\n| if fence: | |\n| text = fence.group(1) | |\n| start = text.find(\"{\") | |\n| if start == -1: | |\n| return None | |\n| depth = 0 | |\n| for i in range(start, len(text)): | |\n| if text[i] == \"{\": | |\n| depth += 1 | |\n| elif text[i] == \"}\": | |\n| depth -= 1 | |\n| if depth == 0: | |\n| try: | |\n| return json.loads(text[start:i + 1]) | |\n| except json.JSONDecodeError: | |\n| return None | |\n| return None | |\n| def match_known_bad(diff: str) -> list[str]: | |\n| \"\"\"Return any known-bad IOCs present in the diff (case-insensitive).\"\"\" | |\n| low = diff.lower() | |\n| return [ioc for ioc in KNOWN_BAD_IOCS if ioc.lower() in low] | |\n| def trim_diff(diff: str) -> str: | |\n| \"\"\"Shrink a unified diff for classification, keeping the signal. | |\n| Removes unchanged context lines (those starting with a single space), | |\n| keeping only added/removed lines plus the structural headers that say which | |\n| file and hunk a change belongs to. The full diff stays on disk untouched — | |\n| this only affects what we hand to Claude. | |\n| \"\"\" | |\n| out: list[str] = [] | |\n| for line in diff.splitlines(): | |\n| # Drop unchanged context lines (single leading space). 
Keep everything | |\n| # else: headers, hunk markers (@@), file markers (+++/---), and the | |\n| # actual +/- changes. | |\n| if line.startswith(\" \"): | |\n| continue | |\n| out.append(line) | |\n| return \"\\n\".join(out) + \"\\n\" | |\n| def build_payload(name: str, diff: str) -> str: | |\n| \"\"\"The model-facing payload. PROMPT_TEXT comes first and is byte-identical | |\n| across candidates, so a prefix-caching backend (claude -p, or llama-server | |\n| with cache_prompt) reuses it and only reprocesses the per-candidate tail. | |\n| We send only the package name (so the model can spot typosquats), not the | |\n| reputation metadata (votes/popularity/dates) — that biased the model into | |\n| downgrading clean diffs to \"review\". The staged change.diff stays full; only | |\n| what we hand the model is trimmed. | |\n| \"\"\" | |\n| return ( | |\n| f\"{PROMPT_TEXT}\\n\\n\" | |\n| f\"--- package name ---\\n{name}\\n\\n\" | |\n| f\"--- git diff (unchanged context lines removed) ---\\n{trim_diff(diff)}\\n\" | |\n| ) | |\n| def call_claude(payload: str, model: str, timeout: int) -> str: | |\n| \"\"\"Run `claude -p`; return raw stdout. Raises ClassifyError on failure.\"\"\" | |\n| cmd = [\"claude\", \"-p\", \"--model\", model] | |\n| try: | |\n| proc = subprocess.run( | |\n| cmd, input=payload, capture_output=True, text=True, timeout=timeout) | |\n| except FileNotFoundError: | |\n| raise ClassifyError(\"`claude` CLI not found on PATH\") | |\n| except subprocess.TimeoutExpired: | |\n| raise ClassifyError(f\"claude timed out after {timeout}s\") | |\n| if proc.returncode != 0: | |\n| err = (proc.stderr or proc.stdout or \"\").strip().splitlines() | |\n| raise ClassifyError(f\"claude exited {proc.returncode}: {err[-1] if err else '?'}\") | |\n| return proc.stdout | |\n| def call_llama(payload: str, url: str, timeout: int) -> str: | |\n| \"\"\"POST to a llama-server /completion endpoint; return generated text. 
| |\n| `cache_prompt: true` tells llama-server to keep the KV cache for the shared | |\n| prefix (PROMPT_TEXT) across requests, so repeat candidates only reprocess | |\n| their diff. `json_schema` grammar-constrains the output so the model can | |\n| only emit schema-valid JSON — small quantized models otherwise tend to | |\n| ignore the instruction, parrot the diff, and loop. Run the server with e.g.: | |\n| taskset -c 0,2,4,6,8,10 llama-server \\\\ | |\n| -m ~/models/gemma4-26b/gemma-4-26B-A4B-it-qat-UD-Q4_K_XL.gguf \\\\ | |\n| -t 6 -c 8192 --swa-full --cache-reuse 256 --host 127.0.0.1 --port 8080 | |\n| \"\"\" | |\n| body = { | |\n| \"prompt\": payload, | |\n| \"cache_prompt\": True, # reuse KV cache for the shared PROMPT_TEXT prefix | |\n| \"json_schema\": VERDICT_SCHEMA, # force schema-valid JSON output | |\n| \"temperature\": 0, | |\n| \"n_predict\": 512, | |\n| } | |\n| try: | |\n| resp = requests.post(url.rstrip(\"/\") + \"/completion\", json=body, timeout=timeout) | |\n| resp.raise_for_status() | |\n| except requests.RequestException as exc: | |\n| raise ClassifyError(f\"llama-server request failed: {exc}\") | |\n| return resp.json().get(\"content\", \"\") | |\n| class ClassifyError(Exception): | |\n| \"\"\"A backend call failed; the candidate gets an \"error\" verdict.\"\"\" | |\n| def classify(cdir: Path, backend: str, model: str, llama_url: str, | |\n| timeout: int = 180) -> dict: | |\n| \"\"\"Classify one staged candidate folder; return a verdict dict. | |\n| A diff containing a known-bad IOC is marked malicious immediately, with no | |\n| model call. Otherwise the configured backend (claude | llama) judges the | |\n| diff. On any failure we return a verdict of \"error\" so the caller flags it | |\n| for manual review rather than silently dropping it. Pre-filter signals are | |\n| carried onto the verdict so they stay visible in the summary even when the | |\n| model rates the diff clean. 
| |\n| \"\"\" | |\n| meta = json.loads((cdir / \"meta.json\").read_text()) | |\n| diff = (cdir / \"change.diff\").read_text() | |\n| signals = meta.get(\"prefilter_signals\") or [] | |\n| hits = match_known_bad(diff) | |\n| if hits: | |\n| return { | |\n| \"verdict\": \"malicious\", | |\n| \"confidence\": 1.0, | |\n| \"iocs\": hits, | |\n| \"reasons\": [f\"known-bad indicator in diff: {', '.join(hits)}\"], | |\n| \"evidence\": [], | |\n| \"source\": \"known-bad-ioc\", | |\n| \"prefilter_signals\": signals, | |\n| } | |\n| payload = build_payload(meta.get(\"name\", cdir.name), diff) | |\n| try: | |\n| if backend == \"llama\": | |\n| raw = call_llama(payload, llama_url, timeout) | |\n| else: | |\n| raw = call_claude(payload, model, timeout) | |\n| except ClassifyError as exc: | |\n| return {\"verdict\": \"error\", \"reasons\": [str(exc)], \"prefilter_signals\": signals} | |\n| verdict = _extract_json(raw) | |\n| if verdict is None: | |\n| # Keep the full model output so the failure is debuggable: it lands in | |\n| # the candidate's verdict.json, and a snippet shows in the summary. | |\n| snippet = \" \".join(raw.split())[:200] | |\n| return {\"verdict\": \"error\", | |\n| \"reasons\": [f\"could not parse JSON from {backend} output: {snippet}\" | |\n| if snippet else f\"{backend} returned empty output\"], | |\n| \"raw_output\": raw, | |\n| \"prefilter_signals\": signals} | |\n| verdict.setdefault(\"verdict\", \"error\") | |\n| if verdict[\"verdict\"] not in VERDICT_ORDER: | |\n| verdict.setdefault(\"reasons\", []).append( | |\n| f\"unexpected verdict value: {verdict['verdict']!r}\") | |\n| verdict[\"verdict\"] = \"error\" | |\n| verdict[\"prefilter_signals\"] = signals | |\n| return verdict | |\n| def is_bad(verdict: str) -> bool: | |\n| return VERDICT_RANK.get(verdict, -1) >= VERDICT_RANK[ALERT_THRESHOLD] | |\n| def summarize_and_alert(results: list[dict]) -> bool: | |\n| \"\"\"Print a verdict table sorted worst-first; ring the bell on bad ones. 
| |\n| `results` is a list of {pkgbase, verdict, reasons, ...}. Returns True if | |\n| anything is at or above the alert threshold (suspicious/malicious). | |\n| \"\"\" | |\n| if not results: | |\n| print(\"No new candidates this cycle.\", file=sys.stderr) | |\n| return False | |\n| def sort_key(r: dict) -> tuple[int, str]: | |\n| # errors sort just below malicious so they're visible at the top | |\n| rank = VERDICT_RANK.get(r[\"verdict\"], len(VERDICT_ORDER)) | |\n| return (-rank, r[\"pkgbase\"]) | |\n| results = sorted(results, key=sort_key) | |\n| bad = [r for r in results if is_bad(r[\"verdict\"])] | |\n| errors = [r for r in results if r[\"verdict\"] == \"error\"] | |\n| # Candidates the offline regex pre-filter flagged but Claude did not rate | |\n| # bad — easy to overlook, so call them out separately. | |\n| flagged_clean = [r for r in results | |\n| if r.get(\"prefilter_signals\") and not is_bad(r[\"verdict\"]) | |\n| and r[\"verdict\"] != \"error\"] | |\n| print(\"\\n=== verdict summary ===\", file=sys.stderr) | |\n| for r in results: | |\n| sig = r.get(\"prefilter_signals\") or [] | |\n| if is_bad(r[\"verdict\"]): | |\n| mark = \"!!\" | |\n| elif r[\"verdict\"] == \"error\": | |\n| mark = \"??\" | |\n| elif sig: | |\n| mark = \"* \" # clean-ish, but regex flagged it | |\n| else: | |\n| mark = \" \" | |\n| note = \"; \".join(r.get(\"reasons\") or []) | |\n| if sig: | |\n| note = (note + \" \" if note else \"\") + f\"[regex: {','.join(sig)}]\" | |\n| print(f\" {mark} {r['verdict']:10s} {r['pkgbase']:40s} {note[:120]}\", file=sys.stderr) | |\n| if bad: | |\n| # Ring the terminal bell once per bad candidate (capped), and shout. 
| |\n| sys.stderr.write(\"\\a\" * min(len(bad), 5)) | |\n| sys.stderr.flush() | |\n| names = \", \".join(r[\"pkgbase\"] for r in bad) | |\n| print(f\"\\n*** ALERT: {len(bad)} candidate(s) >= {ALERT_THRESHOLD}: {names} ***\", | |\n| file=sys.stderr) | |\n| if flagged_clean: | |\n| names = \", \".join(r[\"pkgbase\"] for r in flagged_clean) | |\n| print(f\"(*) {len(flagged_clean)} candidate(s) regex-flagged but rated \" | |\n| f\"{'/'.join(sorted({r['verdict'] for r in flagged_clean}))} — worth a glance: \" | |\n| f\"{names}\", file=sys.stderr) | |\n| if errors: | |\n| print(f\"({len(errors)} candidate(s) could not be classified — review manually)\", | |\n| file=sys.stderr) | |\n| if not bad and not errors and not flagged_clean: | |\n| print(\"All clear.\", file=sys.stderr) | |\n| return bool(bad) | |\n| # --- Pipeline --------------------------------------------------------------- | |\n| def run_once(args) -> list[Path]: | |\n| \"\"\"Fetch metadata, select + stage candidates. Returns staged folder paths.\"\"\" | |\n| state = {\"last_run\": 0, \"etag\": None, \"seen\": {}} if args.no_state else load_state() | |\n| last_run = state.get(\"last_run\", 0) | |\n| now = int(time.time()) | |\n| since_cutoff = (now - args.since_hours * 3600) if args.since_hours else None | |\n| if last_run == 0 and since_cutoff is None: | |\n| # First run with no window would select the entire AUR. Default to 24h. | |\n| since_cutoff = now - 24 * 3600 | |\n| print(\"First run and no --since-hours: defaulting to last 24h.\", file=sys.stderr) | |\n| s = session() | |\n| print(\"Fetching metadata archive...\", file=sys.stderr) | |\n| # --force skips the stored ETag so the server can't 304 us, while still | |\n| # using/updating last_run and seen (unlike --no-state). 
| |\n| etag = None if args.force else state.get(\"etag\") | |\n| entries, new_etag = fetch_metadata(s, etag) | |\n| if entries is None: | |\n| print(\"Metadata unchanged since last run (304); use --force to refetch.\", | |\n| file=sys.stderr) | |\n| return [] | |\n| print(f\" {len(entries)} packages in archive.\", file=sys.stderr) | |\n| cands = select_candidates(entries, last_run, since_cutoff) | |\n| for c in cands: | |\n| rank_metadata(c) | |\n| cands.sort(key=lambda c: c.priority, reverse=True) | |\n| print(f\"Selected {len(cands)} candidate(s) (new/modified/since-window).\", file=sys.stderr) | |\n| if args.dry_run: | |\n| for c in cands[:args.max_candidates]: | |\n| print(f\" [{c.priority:2d}] {c.reason:9s} {c.pkgbase} \" | |\n| f\"votes={c.num_votes} pop={c.popularity:.3f} signals={c.signals}\") | |\n| return [] | |\n| args.out.mkdir(parents=True, exist_ok=True) | |\n| (args.out / \"PROMPT.md\").write_text(PROMPT_TEXT) | |\n| seen = state.get(\"seen\", {}) | |\n| # Candidates whose diff we still need (skip already-seen exact versions). | |\n| todo: list[Candidate] = [] | |\n| for c in cands[:args.max_candidates]: | |\n| prev = seen.get(c.pkgbase) or {} | |\n| if prev.get(\"version\") == c.version and not args.no_state: | |\n| continue | |\n| todo.append(c) | |\n| def fetch(c: Candidate) -> tuple[Candidate, str | None, str | None, str | None]: | |\n| \"\"\"Worker: fetch one diff. Returns (candidate, diff, sha, error).\"\"\" | |\n| prev = seen.get(c.pkgbase) or {} | |\n| since_sha = None if args.no_state else prev.get(\"sha\") | |\n| try: | |\n| diff, new_sha = fetch_diff(s, c.pkgbase, since_sha) | |\n| return c, diff, new_sha, None | |\n| except requests.RequestException as exc: | |\n| return c, None, None, str(exc) | |\n| # Fetch diffs concurrently (bounded). The pool size is the politeness knob — | |\n| # a handful of parallel requests, not a flood. 
| |\n| print(f\"Fetching {len(todo)} diff(s) with {args.workers} worker(s)...\", file=sys.stderr) | |\n| fetched: list[tuple[Candidate, str | None, str | None, str | None]] = [] | |\n| if todo: | |\n| with ThreadPoolExecutor(max_workers=max(1, args.workers)) as pool: | |\n| for res in pool.map(fetch, todo): | |\n| fetched.append(res) | |\n| # Process results sequentially: prefilter, stage, index. Preserve priority | |\n| # order (todo was already sorted; pool.map keeps input order). | |\n| index: list[dict] = [] | |\n| staged_dirs: list[Path] = [] | |\n| flagged = 0 | |\n| for c, diff, new_sha, err in fetched: | |\n| if err is not None: | |\n| print(f\" ! {c.pkgbase}: diff fetch failed: {err}\", file=sys.stderr) | |\n| continue | |\n| if len(diff.encode(\"utf-8\", \"replace\")) > MAX_DIFF_BYTES: | |\n| diff = diff[:MAX_DIFF_BYTES] + \"\\n\\n[... diff truncated ...]\\n\" | |\n| c.signals.extend(run_prefilter(diff)) | |\n| if c.signals: | |\n| flagged += 1 | |\n| stage(args.out, c, diff, new_sha) | |\n| staged_dirs.append(args.out / c.pkgbase) | |\n| seen[c.pkgbase] = {\"version\": c.version, \"sha\": new_sha} | |\n| index.append({ | |\n| \"pkgbase\": c.pkgbase, | |\n| \"name\": c.name, | |\n| \"priority\": c.priority, | |\n| \"reason\": c.reason, | |\n| \"signals\": c.signals, | |\n| }) | |\n| index.sort(key=lambda d: (bool(d[\"signals\"]), d[\"priority\"]), reverse=True) | |\n| (args.out / \"candidates.json\").write_text(json.dumps(index, indent=2)) | |\n| state[\"last_run\"] = now | |\n| state[\"etag\"] = new_etag | |\n| state[\"seen\"] = seen | |\n| if not args.no_state: | |\n| save_state(state) | |\n| print(f\"Staged {len(staged_dirs)} candidate(s) into {args.out}/ \" | |\n| f\"({flagged} with pre-filter hits).\", file=sys.stderr) | |\n| return staged_dirs | |\n| def classify_staged(staged_dirs: list[Path], backend: str, model: str, | |\n| llama_url: str) -> list[dict]: | |\n| \"\"\"Classify each staged folder with the chosen backend; write verdict.json.\"\"\" | 
|\n| results: list[dict] = [] | |\n| for i, cdir in enumerate(staged_dirs, 1): | |\n| pkgbase = cdir.name | |\n| print(f\" [{i}/{len(staged_dirs)}] classifying {pkgbase} ...\", file=sys.stderr) | |\n| verdict = classify(cdir, backend, model, llama_url) | |\n| verdict[\"pkgbase\"] = pkgbase | |\n| (cdir / \"verdict.json\").write_text(json.dumps(verdict, indent=2)) | |\n| results.append(verdict) | |\n| return results | |\n| # --- Main ------------------------------------------------------------------- | |\n| def cycle(args) -> bool: | |\n| \"\"\"One full pass: fetch -> stage -> classify -> summarize. Returns True if bad.\"\"\" | |\n| staged = run_once(args) | |\n| if args.dry_run: | |\n| return False | |\n| if not staged: | |\n| print(\"No new candidates to classify.\", file=sys.stderr) | |\n| return False | |\n| if args.no_classify: | |\n| print(f\"\\nStaged only (--no-classify). Point Claude at {args.out}/:\\n\" | |\n| f\" cd {args.out} && claude -p \\\"$(cat PROMPT.md)\\\"\", file=sys.stderr) | |\n| return False | |\n| results = classify_staged(staged, args.backend, args.model, args.llama_url) | |\n| return summarize_and_alert(results) | |\n| def main(argv: list[str] | None = None) -> int: | |\n| ap = argparse.ArgumentParser(description=__doc__, | |\n| formatter_class=argparse.RawDescriptionHelpFormatter) | |\n| ap.add_argument(\"--out\", type=Path, default=Path(\"staging\"), | |\n| help=\"staging directory for candidate folders (default: ./staging)\") | |\n| ap.add_argument(\"--since-hours\", type=float, default=None, | |\n| help=\"also include packages created/modified within the last N hours\") | |\n| ap.add_argument(\"--max-candidates\", type=int, default=200, | |\n| help=\"cap on candidates to fetch diffs for (default: 200)\") | |\n| ap.add_argument(\"--workers\", type=int, default=4, | |\n| help=\"concurrent diff downloads (default: 4; keep it modest \" | |\n| \"to be polite to the AUR)\") | |\n| ap.add_argument(\"--backend\", choices=(\"claude\", 
\"llama\"), default=\"claude\", | |\n| help=\"classifier backend: claude -p, or a local llama-server (default: claude)\") | |\n| ap.add_argument(\"--model\", default=\"claude-haiku-4-5\", | |\n| help=\"model for the claude backend (default: claude-haiku-4-5)\") | |\n| ap.add_argument(\"--llama-url\", default=\"http://127.0.0.1:8080\", | |\n| help=\"llama-server base URL for the llama backend \" | |\n| \"(default: http://127.0.0.1:8080)\") | |\n| ap.add_argument(\"--no-classify\", action=\"store_true\", | |\n| help=\"stage only; don't classify (review manually later)\") | |\n| ap.add_argument(\"--loop\", action=\"store_true\", | |\n| help=\"run continuously, sleeping --interval-hours between cycles\") | |\n| ap.add_argument(\"--interval-hours\", type=float, default=1.0, | |\n| help=\"hours to sleep between cycles in --loop mode (default: 1)\") | |\n| ap.add_argument(\"--no-state\", action=\"store_true\", | |\n| help=\"ignore/don't update state; diff against latest commit only\") | |\n| ap.add_argument(\"--force\", action=\"store_true\", | |\n| help=\"skip the stored ETag and refetch metadata even if \" | |\n| \"unchanged (still uses/updates last_run and seen)\") | |\n| ap.add_argument(\"--dry-run\", action=\"store_true\", | |\n| help=\"select + rank from metadata only; don't fetch diffs or stage\") | |\n| args = ap.parse_args(argv) | |\n| if (not args.no_classify and not args.dry_run and args.backend == \"claude\" | |\n| and shutil.which(\"claude\") is None): | |\n| print(\"warning: `claude` not found on PATH — classification will report \" | |\n| \"errors. Use --no-classify to stage only, or --backend llama.\", | |\n| file=sys.stderr) | |\n| if not args.loop: | |\n| bad = cycle(args) | |\n| # Non-zero exit if anything is suspicious/malicious (for cron/alerting). | |\n| return 2 if bad else 0 | |\n| interval = args.interval_hours * 3600 | |\n| print(f\"Loop mode: cycle every {args.interval_hours}h. 
Ctrl-C to stop.\", file=sys.stderr) | |\n| while True: | |\n| ts = datetime.now().strftime(\"%Y-%m-%d %H:%M:%S\") | |\n| print(f\"\\n========== cycle @ {ts} ==========\", file=sys.stderr) | |\n| try: | |\n| cycle(args) | |\n| except requests.RequestException as exc: | |\n| print(f\"cycle failed (network): {exc} — retrying next interval\", file=sys.stderr) | |\n| except Exception as exc: # keep the loop alive across unexpected errors | |\n| print(f\"cycle failed: {exc!r} — retrying next interval\", file=sys.stderr) | |\n| try: | |\n| time.sleep(interval) | |\n| except KeyboardInterrupt: | |\n| print(\"\\nStopped.\", file=sys.stderr) | |\n| return 0 | |\n| if __name__ == \"__main__\": | |\n| raise SystemExit(main()) |", "url": "https://wpnews.pro/news/gist-6a5e632583c67dadf84d68d339cdf799", "canonical_source": "https://gist.github.com/drinkcat/6a5e632583c67dadf84d68d339cdf799", "published_at": "2026-06-13 07:27:32+00:00", "updated_at": "2026-06-13 11:18:20.676732+00:00", "lang": "en", "topics": ["developer-tools"], "entities": ["Arch Linux", "AUR", "cgit", "Python"], "alternates": {"html": "https://wpnews.pro/news/gist-6a5e632583c67dadf84d68d339cdf799", "markdown": "https://wpnews.pro/news/gist-6a5e632583c67dadf84d68d339cdf799.md", "text": "https://wpnews.pro/news/gist-6a5e632583c67dadf84d68d339cdf799.txt", "jsonld": "https://wpnews.pro/news/gist-6a5e632583c67dadf84d68d339cdf799.jsonld"}}