Crawlee for Python: Build a Web Crawling Pipeline with Robots Handling, Link Graphs, and RAG Chunk Export

Apify released a tutorial for Crawlee for Python that demonstrates how to build a web crawling pipeline with robots.txt handling, link graphs, and RAG chunk export. The tutorial covers environment setup, static and dynamic crawling, structured extraction, and downstream data processing using tools like BeautifulSoupCrawler, ParselCrawler, and PlaywrightCrawler. This enables developers to efficiently scrape and process web data for AI applications.

In this tutorial, we build a full Crawlee-for-Python (https://github.com/apify/crawlee-python) workflow that covers environment setup, local website generation, static crawling, dynamic crawling, structured extraction, and downstream data processing. We begin by configuring a compatible Crawlee runtime with pinned Pydantic support, Playwright browser installation, persistent storage directories, and Colab-safe execution handling. We then generate a realistic local demo website containing product pages, documentation pages, blog content, internal links, robots.txt rules, JSON-LD metadata, and JavaScript-rendered catalog items. Using BeautifulSoupCrawler, we perform fast recursive HTML crawling and extract page titles, metadata, text previews, outgoing links, product attributes, documentation headings, code blocks, and blog tags. With ParselCrawler, we run precise CSS- and XPath-based extraction on product detail pages. With PlaywrightCrawler, we render JavaScript content in a headless Chromium browser, wait for dynamic DOM elements to appear, extract client-side data, and capture full-page screenshots.
Setting Up the Crawlee Python Runtime and Helpers

python
import os
import sys
import re
import csv
import json
import time
import math
import shutil
import socket
import hashlib
import asyncio
import textwrap
import subprocess
import threading
from pathlib import Path
from functools import partial
from http.server import ThreadingHTTPServer, SimpleHTTPRequestHandler
from importlib.metadata import version, PackageNotFoundError

SETUP_SENTINEL = "/content/.crawlee_python_tutorial_setup_done_v2"


def sh(command, check=True, quiet=False):
    print(f"\n$ {command}")
    result = subprocess.run(
        command,
        shell=True,
        text=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    )
    if not quiet and result.stdout:
        print(result.stdout[-5000:])
    if check and result.returncode != 0:
        raise RuntimeError(f"Command failed with exit code {result.returncode}: {command}")
    return result.returncode == 0


def package_version(package_name):
    try:
        return version(package_name)
    except PackageNotFoundError:
        return None


def is_good_pydantic_version(v):
    if not v:
        return False
    m = re.match(r"^(\d+)\.(\d+)", v)
    if not m:
        return False
    major, minor = int(m.group(1)), int(m.group(2))
    return major == 2 and minor == 11


current_crawlee = package_version("crawlee")
current_pydantic = package_version("pydantic")

needs_setup = (
    not os.path.exists(SETUP_SENTINEL)
    or current_crawlee is None
    or not is_good_pydantic_version(current_pydantic)
)

if needs_setup:
    print("PHASE 1: Installing compatible Crawlee + Pydantic + Playwright dependencies.")
    print("After this finishes, Colab will restart automatically. Then run this same cell again.")
    sh(f'{sys.executable} -m pip uninstall -y crawlee pydantic pydantic-core', check=False)
    sh(
        f'{sys.executable} -m pip install -q -U '
        f'"pydantic>=2.11,<2.12" '
        f'"crawlee[all]" '
        f'pandas matplotlib networkx nest_asyncio beautifulsoup4 parsel'
    )
    sh(f'{sys.executable} -m playwright install --with-deps chromium', check=False)
    Path(SETUP_SENTINEL).write_text("done", encoding="utf-8")
    print("\nInstalled versions:")
    sh(f'{sys.executable} -m pip show crawlee pydantic pydantic-core', check=False)
    try:
        import google.colab
        print("\nRestarting Colab runtime now. After it reconnects, run this same cell again.")
        os.kill(os.getpid(), 9)
    except Exception:
        raise SystemExit("Setup complete. Restart the runtime/kernel manually, then run this cell again.")

print("PHASE 2: Dependencies are ready. Running the Crawlee tutorial.")

import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import nest_asyncio

nest_asyncio.apply()

TUTORIAL_ROOT = Path("/content/crawlee_python_advanced_tutorial")
SITE_DIR = TUTORIAL_ROOT / "demo_site"
OUTPUT_DIR = TUTORIAL_ROOT / "outputs"
STORAGE_DIR = TUTORIAL_ROOT / "crawlee_storage"
SCREENSHOT_DIR = OUTPUT_DIR / "screenshots"

for path in [SITE_DIR, OUTPUT_DIR, STORAGE_DIR]:
    if path.exists():
        shutil.rmtree(path)

for path in [SITE_DIR, OUTPUT_DIR, STORAGE_DIR, SCREENSHOT_DIR]:
    path.mkdir(parents=True, exist_ok=True)

os.environ["CRAWLEE_STORAGE_DIR"] = str(STORAGE_DIR)
os.environ["CRAWLEE_LOG_LEVEL"] = "INFO"
os.environ["CRAWLEE_PURGE_ON_START"] = "true"

from crawlee import Glob, ConcurrencySettings
from crawlee.crawlers import (
    BeautifulSoupCrawler,
    BeautifulSoupCrawlingContext,
    ParselCrawler,
    ParselCrawlingContext,
    PlaywrightCrawler,
    PlaywrightCrawlingContext,
)

try:
    import crawlee
    print("Crawlee version:", crawlee.__version__)
except Exception:
    print("Crawlee imported successfully.")
print("Pydantic version:", package_version("pydantic"))


def safe_slug(value):
    value = re.sub(r"[^a-zA-Z0-9]+", "-", str(value)).strip("-").lower()
    return value or "item"


def money_to_float(value):
    if value is None:
        return None
    cleaned = re.sub(r"[^0-9.]", "", str(value))
    return float(cleaned) if cleaned else None


def normalize_text(value, max_len=None):
    value = re.sub(r"\s+", " ", value or "").strip()
    return value[:max_len] if max_len else value


def write_file(path, content):
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(textwrap.dedent(content).strip() + "\n", encoding="utf-8")

We begin by preparing the complete Colab runtime for the Crawlee tutorial. We install compatible versions of Crawlee, Pydantic, Playwright, and the required analysis libraries, and handle the automatic restart required after setup. We then configure storage folders, environment variables, crawler imports, and helper functions to ensure the rest of the workflow runs smoothly.

Generating the Demo Website and Product Catalog

PRODUCTS = [
    {
        "sku": "CRW-101",
        "name": "Crawler Reliability Kit",
        "category": "automation",
        "price": 149.0,
        "rating": 4.8,
        "stock": 18,
        "features": ["retry policy", "queue replay", "structured logs"],
        "related": ["CRW-202", "CRW-303"],
    },
    {
        "sku": "CRW-202",
        "name": "Playwright Rendering Pack",
        "category": "browser",
        "price": 249.0,
        "rating": 4.7,
        "stock": 9,
        "features": ["headless chromium", "screenshots", "dynamic DOM extraction"],
        "related": ["CRW-101", "CRW-404"],
    },
    {
        "sku": "CRW-303",
        "name": "RAG Extraction Bundle",
        "category": "ai-data",
        "price": 199.0,
        "rating": 4.9,
        "stock": 13,
        "features": ["clean text chunks", "metadata capture", "JSONL export"],
        "related": ["CRW-101", "CRW-505"],
    },
    {
        "sku": "CRW-404",
        "name": "Anti-Fragile Session Toolkit",
        "category": "resilience",
        "price": 299.0,
        "rating": 4.6,
        "stock": 5,
        "features": ["session rotation", "state recovery", "graceful failures"],
        "related": ["CRW-202", "CRW-505"],
    },
    {
        "sku": "CRW-505",
        "name": "Data Export Control Plane",
        "category": "storage",
        "price": 179.0,
        "rating": 4.5,
        "stock": 21,
        "features": ["datasets", "key-value store", "CSV and JSON export"],
        "related": ["CRW-303", "CRW-404"],
    },
]


def layout(title, body, extra_head="", extra_script=""):
    css = """