In this exploration, we'll see how to turn raw, unstructured documents into structured knowledge graphs using Gemini. We'll start by prototyping to develop our intuition. Then, we'll optimize our prompts and outputs, and finally scale up to process entire books or dense legal contracts. By the end, we'll even visualize extracted book narratives and contractual network graphs!
A few notes before we start:
Documents are everywhere. We use them for business, daily operations, legal matters, technical docs, education, and even just for fun. However, documents are not databases. They're generally unstructured, and fully understanding them requires multiple reading passes.
So, can we extract structured knowledge from documents using only the following?
Let's try with Gemini…
We'll use the following packages:
- `google-genai` for calling Gemini with the Google Gen AI SDK
- `networkx` for graph management

We'll also need:
- `tenacity` for request management (a dependency of `google-genai`)
- `matplotlib` and `pillow` for data visualization (dependencies of `networkx`)
%pip install --quiet "google-genai>=2.6.0" "networkx[default]"
To use the Gemini API, we have two main options:
🛠️ Option 1 - Gemini API via Agent Platform
Requirements:
Gen AI SDK environment variables:
GOOGLE_GENAI_USE_ENTERPRISE="True"
GOOGLE_CLOUD_PROJECT="<PROJECT_ID>"
GOOGLE_CLOUD_LOCATION="<LOCATION>"
💡 For preview models, the location must be set to `global`. For generally available models, we can choose the closest location among the [Google model endpoint locations].
ℹ️ Learn more about [setting up a project and a development environment].
🛠️ Option 2 - Gemini API via Google AI Studio
Requirement:
Gen AI SDK environment variables:
GOOGLE_GENAI_USE_ENTERPRISE="False"
GOOGLE_API_KEY="<API_KEY>"
ℹ️ Learn more about
[getting a Gemini API key from Google AI Studio].
💡 You can store your environment configuration outside of the source code:
| Environment | Method |
|---|---|
| IDE | `.env` file (or equivalent) |
| Colab | Colab Secrets (🗝️ icon in left panel, see code below) |
| Colab Enterprise | Google Cloud project and location are automatically defined |
| Workbench | Google Cloud project and location are automatically defined |
import os
import sys
from collections.abc import Callable
from google import genai
# Manual configuration, read by check_manual_setup(). The "# @param" markers
# are form-field annotations. When neither a project nor an API key is filled
# in, check_environment() falls through to the other detection methods.
GOOGLE_GENAI_USE_ENTERPRISE = True # @param {type: "boolean"}
GOOGLE_CLOUD_PROJECT = "" # @param {type: "string"}
GOOGLE_CLOUD_LOCATION = "global" # @param {type: "string"}
GOOGLE_API_KEY = "" # @param {type: "string"}
def check_environment() -> bool:
    """Configure the Gen AI SDK environment variables from the first usable source.

    Sources are tried in order: manual constants, enterprise notebook
    environment, Colab secrets, then local environment variables.

    Returns:
        True if one of the sources provided a usable configuration.
    """
    # Authenticate first (no-op outside Colab); the checks below run after it.
    check_colab_user_authentication()
    return check_manual_setup() or check_enterprise() or check_colab() or check_local()
def check_manual_setup() -> bool:
    """Try to configure the environment from the module-level constants.

    Returns:
        True if the manually entered values form a usable configuration.
    """
    # Values might have been pasted with a newline or surrounding spaces
    return check_define_env_vars(
        GOOGLE_GENAI_USE_ENTERPRISE,
        GOOGLE_CLOUD_PROJECT.strip(),
        GOOGLE_CLOUD_LOCATION.strip(),
        GOOGLE_API_KEY.strip(),
    )
def check_enterprise() -> bool:
match os.getenv("VERTEX_PRODUCT", ""):
case "WORKBENCH_INSTANCE":
pass
case "COLAB_ENTERPRISE":
if not running_in_colab_env():
return False
case _:
return False
return check_define_env_vars(
True,
os.getenv("GOOGLE_CLOUD_PROJECT", ""),
os.getenv("GOOGLE_CLOUD_REGION", ""),
"",
)
def check_colab() -> bool:
    """Try to configure the environment from Colab secrets.

    Returns:
        True if running in Colab and the secrets form a usable configuration.
    """
    if not running_in_colab_env():
        return False

    # Imported lazily: the module only exists inside a Colab runtime
    from google.colab import auth as colab_auth  # type: ignore

    colab_auth.authenticate_user()
    enterprise, project, location, api_key = get_vars(get_colab_secret)
    return check_define_env_vars(enterprise, project, location, api_key)
def check_local() -> bool:
    """Try to configure the environment from the process environment variables.

    Returns:
        True if the existing environment variables form a usable configuration.
    """
    enterprise, project, location, api_key = get_vars(os.getenv)
    return check_define_env_vars(enterprise, project, location, api_key)
def running_in_colab_env() -> bool:
    """Return True if the ``google.colab`` module is already loaded."""
    return "google.colab" in sys.modules
def check_colab_user_authentication() -> None:
    """Authenticate the current user when running in Colab (no-op elsewhere)."""
    if running_in_colab_env():
        # Imported lazily: the module only exists inside a Colab runtime
        from google.colab import auth as colab_auth  # type: ignore

        colab_auth.authenticate_user()
def get_colab_secret(secret_name: str, default: str) -> str:
    """Return a Colab secret, or ``default`` if the secret does not exist.

    Has the same ``(name, default)`` signature as ``os.getenv`` so that both
    can be passed to ``get_vars``.
    """
    # Imported lazily: the module only exists inside a Colab runtime
    from google.colab import errors, userdata  # type: ignore

    try:
        return userdata.get(secret_name)
    except errors.SecretNotFoundError:
        return default
def disable_colab_cell_scrollbar() -> None:
    """Disable vertical scrolling of the cell output in Colab (no-op elsewhere)."""
    if running_in_colab_env():
        # Imported lazily: the module only exists inside a Colab runtime
        from google.colab import output  # type: ignore

        output.no_vertical_scroll()
def get_vars(getenv: Callable[[str, str], str]) -> tuple[bool, str, str, str]:
    """Read the Gen AI SDK settings through a ``getenv``-like accessor.

    Args:
        getenv: Callable taking ``(name, default)`` and returning the value,
            e.g. ``os.getenv`` or ``get_colab_secret``.

    Returns:
        A ``(enterprise, project, location, api_key)`` tuple. The project is
        only read in enterprise mode, the location only when a project is
        set, and the API key only when no project is set.
    """
    # The newer variable name wins; the legacy one is the fallback
    enterprise_str = getenv("GOOGLE_GENAI_USE_ENTERPRISE", "") or getenv(
        "GOOGLE_GENAI_USE_VERTEXAI", ""
    )
    if enterprise_str:
        enterprise = enterprise_str.strip().lower() in {"true", "1"}
    else:
        # No explicit flag: a defined project implies enterprise mode
        enterprise = bool(getenv("GOOGLE_CLOUD_PROJECT", ""))

    project = getenv("GOOGLE_CLOUD_PROJECT", "") if enterprise else ""
    location = getenv("GOOGLE_CLOUD_LOCATION", "") if project else ""
    api_key = getenv("GOOGLE_API_KEY", "") if not project else ""

    return enterprise, project, location, api_key
def check_define_env_vars(
enterprise: bool,
project: str,
location: str,
api_key: str,
) -> bool:
match (enterprise, bool(project), bool(location), bool(api_key)):
case (True, True, _, _):
location = location or "global"
define_env_vars(enterprise, project, location, "")
case (True, False, _, True):
define_env_vars(enterprise, "", "", api_key)
case (False, _, _, True):
define_env_vars(enterprise, "", "", api_key)
case _:
return False
return True
def define_env_vars(
    enterprise: bool,
    project: str,
    location: str,
    api_key: str,
) -> None:
    """Write the Gen AI SDK configuration into ``os.environ``.

    All five variables are always overwritten (possibly with an empty string).
    The enterprise flag is written under both its newer and legacy names.
    """
    os.environ["GOOGLE_GENAI_USE_ENTERPRISE"] = str(enterprise)
    os.environ["GOOGLE_GENAI_USE_VERTEXAI"] = str(enterprise)
    os.environ["GOOGLE_CLOUD_PROJECT"] = project
    os.environ["GOOGLE_CLOUD_LOCATION"] = location
    os.environ["GOOGLE_API_KEY"] = api_key
def check_configuration(client: genai.Client) -> None:
    """Print which API backend and (truncated) credentials the client uses.

    Args:
        client: The Gen AI client to inspect.
    """
    api_client = client._api_client
    service = "Agent Platform" if client.vertexai else "Google AI Studio"
    print(f"✅ Using the {service} API", end="")

    if api_client.project:
        # Only the first characters of the project ID are shown
        print(f' with project "{api_client.project[:7]}…"', end="")
        print(f' in location "{api_client.location}"')
    elif api_client.api_key:
        api_key = api_client.api_key
        # Only the first and last characters of the key are shown
        print(f' with API key "{api_key[:5]}…{api_key[-5:]}"', end="")
        print(f" (in case of error, make sure it was created for {service})")
    else:
        # Terminate the status line started above
        print()
print("✅ Environment functions defined")
✅ Environment functions defined
To send Gemini requests, we'll use a `google.genai` client:
from google import genai
# Define the SDK environment variables before the client is created.
check_environment()
# The client is created without arguments here; `client` is then used as a
# module-level global by the helpers defined later in this file.
client = genai.Client()
check_configuration(client)
✅ Using the Agent Platform API with project "lpdemo-…" in location "global"
We need a suite of test data to develop our solution.
Multimodality
We'll test the following types:
- Text (`text/plain`): Classic books are good text sources of varying lengths and languages.
- PDF (`application/pdf`): Legal agreements are also great examples of complex and dense documents.

Gemini is natively multimodal, which means it can process different types of inputs. Once we've built knowledge graphs from text or PDF inputs, the solution will also naturally support the following formats:
- Images (`image/*`)
- Audio (`audio/*`)
- Video (`video/*`)

General knowledge
⚠️ LLMs are trained on general knowledge, which becomes part of their "long-term memory". To avoid generating memorized information, we'll explicitly instruct the model to use only the provided inputs.
Multilinguality
Gemini is also natively multilingual, which lets us process inputs and generate outputs in 100+ languages.
To keep things general, we'll use English for prompts and knowledge graphs, but you can use any of the 100+ supported languages, as long as your prompts remain clear and explicit.
Let's define a few data sources and helpers: 🔽
import mimetypes
from collections.abc import Iterator
from enum import Enum
from pathlib import Path
from google.genai.types import Part
# URI scheme prefixes used to decide how a source is loaded.
GOOGLE_CLOUD_STORAGE_PREFIX = "gs://"
HTTPS_PREFIX = "https://"
FILE_PREFIX = "file://"
# Folder prepended to filenames by local_file().
LOCAL_FOLDER = "./"
class Source(Enum):
    """Base enum for input data sources; each member's value is a file URI.

    Subclasses define the members. URIs starting with ``gs://`` or
    ``https://`` are passed by reference, ``file://`` URIs are read locally.
    """

    def yield_contents(self) -> Iterator[Part]:
        """Yield the request part(s) holding this source's content."""
        file_uri = self.value
        if not client.vertexai:
            # Outside the Vertex AI backend, gs:// URIs are sent as HTTPS URLs
            file_uri = convert_to_https_url_if_cloud_storage_uri(file_uri)

        mime_type, _ = mimetypes.guess_type(file_uri)
        assert mime_type is not None, f"❌ Could not determine mime type: {file_uri=}"

        if file_uri.startswith((GOOGLE_CLOUD_STORAGE_PREFIX, HTTPS_PREFIX)):
            yield Part.from_uri(file_uri=file_uri, mime_type=mime_type)
            return

        if file_uri.startswith(FILE_PREFIX):
            file = Path(file_uri.removeprefix(FILE_PREFIX))
            assert file.exists(), f"❌ File does not exist: {file=}"
            if mime_type == "text/plain":
                yield Part.from_text(text=file.read_text(encoding="utf-8"))
            else:
                yield Part.from_bytes(data=file.read_bytes(), mime_type=mime_type)
            return

    def yield_source_names(self) -> Iterator[str]:
        """Yield the member name of this source."""
        yield self.name

    def yield_source_links(self) -> Iterator[str]:
        """Yield a displayable link (HTTPS URL or local path) for this source."""
        file_uri = convert_to_https_url_if_cloud_storage_uri(self.value)
        if file_uri.startswith(HTTPS_PREFIX):
            yield file_uri
            return

        if file_uri.startswith(FILE_PREFIX):
            yield file_uri.removeprefix(FILE_PREFIX)
            return
def convert_to_https_url_if_cloud_storage_uri(uri: str) -> str:
    """Return the ``storage.googleapis.com`` HTTPS URL for a ``gs://`` URI.

    Any other URI is returned unchanged.
    """
    if not uri.startswith(GOOGLE_CLOUD_STORAGE_PREFIX):
        return uri

    object_path = uri.removeprefix(GOOGLE_CLOUD_STORAGE_PREFIX)
    return f"{HTTPS_PREFIX}storage.googleapis.com/{object_path}"
def local_file(filename: str) -> str:
    """Return the ``file://`` URI of a file located in ``LOCAL_FOLDER``."""
    return f"{FILE_PREFIX}{LOCAL_FOLDER}{filename}"
def project_gutenberg_txt_url(id: int) -> str:
    """Return the plain-text URL of a Project Gutenberg ebook.

    Args:
        id: The ebook number, used in both the folder and the file name.
    """
    return f"{HTTPS_PREFIX}gutenberg.org/cache/epub/{id}/pg{id}.txt"
class Classic(Source):
    """Classic books, as plain-text Project Gutenberg URLs.

    Member names start with the language code of the text (``en``/``fr``).
    """

    en_hugo_les_misérables = project_gutenberg_txt_url(135)
    en_dumas_count_of_monte_cristo = project_gutenberg_txt_url(1184)
    fr_zola_thérèse_raquin = project_gutenberg_txt_url(7461)
    fr_dumas_trois_mousquetaires = project_gutenberg_txt_url(13951)
    fr_dumas_vingt_ans_après = project_gutenberg_txt_url(13952)
    fr_dumas_comte_de_monte_cristo_1 = project_gutenberg_txt_url(17989)
    fr_dumas_comte_de_monte_cristo_2 = project_gutenberg_txt_url(17990)
    fr_dumas_comte_de_monte_cristo_3 = project_gutenberg_txt_url(17991)
    fr_dumas_comte_de_monte_cristo_4 = project_gutenberg_txt_url(17992)
class Document(Source):
    """PDF documents, referenced by Cloud Storage URI."""

    en_pharma_dev_agreement = "gs://cloud-samples-data/documentai/ContractDocAI/CUAD_v1/Part_I/Development/PhasebioPharmaceuticalsInc_20200330_10-K_EX-10.21_12086810_EX-10.21_Development Agreement.pdf"
class Collection(Source):
    """Groups of sources processed together; each value is a list of sources.

    The three ``yield_*`` methods are overridden to chain the results of every
    source in the list, in order.
    """

    fr_dumas_comte_de_monte_cristo = [
        Classic.fr_dumas_comte_de_monte_cristo_1,
        Classic.fr_dumas_comte_de_monte_cristo_2,
        Classic.fr_dumas_comte_de_monte_cristo_3,
        Classic.fr_dumas_comte_de_monte_cristo_4,
    ]
    fr_dumas_trois_mousquetaires_vingt_ans_après = [
        Classic.fr_dumas_trois_mousquetaires,
        Classic.fr_dumas_vingt_ans_après,
    ]

    def yield_contents(self) -> Iterator[Part]:
        """Yield the content parts of every source in the collection."""
        for source in self.value:
            yield from source.yield_contents()

    def yield_source_names(self) -> Iterator[str]:
        """Yield the name of every source in the collection."""
        for source in self.value:
            yield from source.yield_source_names()

    def yield_source_links(self) -> Iterator[str]:
        """Yield the link of every source in the collection."""
        for source in self.value:
            yield from source.yield_source_links()
def display_input_data_caption(source: Source) -> None:
    """Display a Markdown caption linking to each input of ``source``.

    Args:
        source: A single source or a collection of sources.
    """
    names = list(source.yield_source_names())
    links = list(source.yield_source_links())
    # strict=True: every name must have exactly one link.
    # The <...> form keeps links containing spaces valid in Markdown.
    markdown_links = ", ".join(
        f"[{name}](<{link}>)" for name, link in zip(names, links, strict=True)
    )
    display_markdown(f"**Input data** ({markdown_links})")
print("✅ Data helpers defined")
✅ Data helpers defined
Gemini comes in different versions and sizes (Flash-Lite, Flash, and Pro).
Let's get started with Gemini 3.1 Flash-Lite, as it offers high performance, low latency, and very high output speed:
GEMINI_3_1_FLASH_LITE = "gemini-3.1-flash-lite"
Gemini can be used in different ways, ranging from factual to creative modes. We're essentially dealing with a data-extraction use case. We want the results to be as factual and deterministic as possible. To achieve this, we can adjust the content generation parameters.
We'll set the `temperature`, `top_p`, and `seed` parameters to minimize randomness:
- `temperature=0.0`
- `top_p=0.0`
- `seed=42` (arbitrary fixed value)
from enum import StrEnum, auto
import IPython.display
import tenacity
from google.genai.errors import ClientError
from google.genai.types import (
FinishReason,
GenerateContentConfig,
GenerateContentResponse,
ThinkingConfig,
ThinkingLevel,
)
class Model(Enum):
    """Gemini model IDs available to ``generate_content``."""

    GEMINI_3_1_FLASH_LITE = "gemini-3.1-flash-lite"
    GEMINI_3_5_FLASH = "gemini-3.5-flash"
    GEMINI_2_5_FLASH = "gemini-2.5-flash"
    GEMINI_2_5_PRO = "gemini-2.5-pro"
    GEMINI_3_1_PRO = "gemini-3.1-pro-preview"
    # Same value as GEMINI_3_1_FLASH_LITE, so this is an enum alias of it
    DEFAULT = GEMINI_3_1_FLASH_LITE
# Shared sampling parameters, copied into each request configuration by
# get_generate_content_config().
DEFAULT_CONFIG = GenerateContentConfig(
temperature=0.0,
top_p=0.0,
seed=42, # Arbitrary fixed value
)
class ShowAs(StrEnum):
DONT_SHOW = auto()
TEXT = auto()
MARKDOWN = auto()
def generate_content(
    prompt: str,
    source: Source | str | None = None,
    *,
    model: Model | None = None,
    config: GenerateContentConfig | None = None,
    system_instruction: str | None = None,
    show_prompt: ShowAs = ShowAs.DONT_SHOW,
    show_response: ShowAs = ShowAs.MARKDOWN,
    only_show_prompt: bool = False,
    return_response: bool = False,
) -> GenerateContentResponse | None:
    """Send a prompt (and optional input data) to Gemini and display the result.

    Args:
        prompt: The user prompt.
        source: Optional input data, as raw text or as a ``Source``.
        model: Model to use (defaults to ``Model.DEFAULT``).
        config: Full request configuration. When provided, it is used as is
            and ``system_instruction`` is not applied.
        system_instruction: System instruction for the default configuration.
        show_prompt: How to display the prompt.
        show_response: How to display the response.
        only_show_prompt: Display the prompt and return without any request.
        return_response: Return the response instead of ``None``.

    Returns:
        The response if ``return_response`` is set and a request was sent,
        otherwise ``None``.
    """
    disable_colab_cell_scrollbar()

    model = model or Model.DEFAULT
    model_id = model.value

    prompt_contents = get_prompt_contents(prompt, source, show_prompt, only_show_prompt)
    if only_show_prompt:
        return None

    config = config or get_generate_content_config(model, system_instruction)
    # Local name on purpose: may differ from the global client (see helper)
    client = check_client_for_model(model)

    response = None
    display_request_header(model_id, source)
    # The retrier re-raises the last error once its attempts are exhausted
    for attempt in get_retrier():
        with attempt:
            response = client.models.generate_content(
                model=model_id,
                contents=prompt_contents,  # type: ignore
                config=config,
            )

    display_response_info(response)
    display_response(response, show_response)

    return response if return_response else None
def get_prompt_contents(
    prompt: str,
    source: Source | str | None,
    show_prompt: ShowAs,
    only_show_prompt: bool,
) -> list[str | Part]:
    """Build the request contents and optionally display them.

    Without a source, the contents are just the stripped prompt. With a
    source, the input data comes first and the prompt last, each section
    wrapped in explicit ``==Start of …==`` / ``==End of …==`` markers.

    Returns:
        The list of strings and parts to send as request contents.
    """

    def yield_prompt_contents() -> Iterator[str | Part]:
        if not source:
            yield prompt.strip()
            return

        yield "==Start of input data==\n"
        if isinstance(source, str):
            yield f"{source.strip()}\n"
        else:
            yield from source.yield_contents()
        yield "==End of input data==\n"
        yield f"==Start of user prompt==\n{prompt.strip()}\n==End of user prompt=="

    prompt_contents = list(yield_prompt_contents())
    display_prompt(prompt_contents, show_prompt, only_show_prompt)

    return prompt_contents
def get_generate_content_config(
    model: Model,
    system_instruction: str | None = None,
) -> GenerateContentConfig:
    """Return the default request configuration for ``model``.

    Combines the shared sampling parameters of ``DEFAULT_CONFIG`` with the
    model-specific thinking configuration and the optional system instruction.
    """
    thinking_config = get_thinking_config_for_model(model)

    return GenerateContentConfig(
        system_instruction=system_instruction,
        temperature=DEFAULT_CONFIG.temperature,
        top_p=DEFAULT_CONFIG.top_p,
        seed=DEFAULT_CONFIG.seed,
        thinking_config=thinking_config,
    )
def get_thinking_config_for_model(model: Model) -> ThinkingConfig | None:
match model:
case Model.GEMINI_2_5_FLASH:
return ThinkingConfig(thinking_budget=0)
case Model.GEMINI_2_5_PRO:
return ThinkingConfig(thinking_budget=128, include_thoughts=False)
case Model.GEMINI_3_1_FLASH_LITE | Model.GEMINI_3_5_FLASH:
return ThinkingConfig(thinking_level=ThinkingLevel.MINIMAL)
case Model.GEMINI_3_1_PRO:
return ThinkingConfig(thinking_level=ThinkingLevel.LOW)
case _:
return None # Default (dynamic thinking is generally enabled)
def check_client_for_model(model: Model) -> genai.Client:
    """Return a client suitable for ``model``.

    Returns:
        A new client for the same project in the "global" location when the
        model ID ends with "-preview" and the global client uses Vertex AI in
        another location; the global client otherwise.
    """
    needs_global_location = (
        model.value.endswith("-preview")
        and client.vertexai
        and client._api_client.location != "global"
    )
    if needs_global_location:
        return genai.Client(
            enterprise=client.vertexai,
            project=client._api_client.project,
            location="global",
        )

    return client
def get_retrier() -> tenacity.Retrying:
    """Return the retry policy used for Gemini requests.

    Up to 7 attempts, with a wait starting at 10 and growing by 1 between
    attempts; only errors accepted by ``should_retry_request`` are retried,
    and the last error is re-raised when attempts are exhausted.
    """
    return tenacity.Retrying(
        stop=tenacity.stop_after_attempt(7),
        wait=tenacity.wait_incrementing(start=10, increment=1),
        retry=tenacity.retry_if_exception(should_retry_request),
        reraise=True,
    )
def should_retry_request(err: BaseException) -> bool:
if not isinstance(err, ClientError):
return False
print(f"❌ ClientError {err.code}: {err.message}")
retry = False
match err.code:
case 400 if err.message is not None and " try again " in err.message:
retry = True
case 429:
retry = True
print(f"🔄 Retry: {retry}")
return retry
# Width and fill character of the printed captions and separator lines.
PRINT_COLUMNS = 80
PRINT_SEPARATOR_CHAR = "-"
PRINT_SEPARATOR = PRINT_COLUMNS * PRINT_SEPARATOR_CHAR
def print_caption(caption: str) -> None:
    """Print ``caption`` centered in a full-width separator line."""
    print(f" {caption} ".center(PRINT_COLUMNS, PRINT_SEPARATOR_CHAR))
def print_separator() -> None:
    """Print a full-width separator line."""
    print(PRINT_SEPARATOR)
def display_response_info(response: GenerateContentResponse) -> None:
    """Print the token usage of a response and flag any anomaly.

    Anomalies reported: no candidates, a finish reason other than ``STOP``,
    and an empty response text.
    """
    if usage_metadata := response.usage_metadata:
        token_counts = (
            ("Input tokens", usage_metadata.prompt_token_count),
            ("Cached tokens", usage_metadata.cached_content_token_count),
            ("Output tokens", usage_metadata.candidates_token_count),
            ("Thoughts tokens", usage_metadata.thoughts_token_count),
        )
        for label, count in token_counts:
            # Missing or zero counts are not shown
            if count:
                print(f"{label} : {count:9,d}")

    if not response.candidates:
        print("❌ No `response.candidates`")
        return

    if (finish_reason := response.candidates[0].finish_reason) != FinishReason.STOP:
        print(f"❌ {finish_reason = }")

    if not response.text:
        print("❌ No `response.text`")
        return
def display_prompt(
contents: list[str | Part],
show_as: ShowAs,
only_show_prompt: bool,
) -> None:
def yield_prompt_strings() -> Iterator[str]:
for content in contents:
if isinstance(content, Part):
yield f"{content!r}\n"
continue
yield content
if only_show_prompt and show_as == ShowAs.DONT_SHOW:
show_as = ShowAs.TEXT
if show_as == ShowAs.DONT_SHOW:
return
separator = "\n" if show_as == ShowAs.MARKDOWN else ""
prompt = separator.join(yield_prompt_strings())
print_caption("Prompt")
match show_as:
case ShowAs.TEXT:
print(prompt)
case ShowAs.MARKDOWN:
display_markdown(prompt)
if only_show_prompt:
print_separator()
def display_request_header(model_id: str, source: Source | str | None = None) -> None:
    """Print the request caption with the model ID.

    ``source`` is accepted but not used by this implementation.
    """
    print_caption(f"Request / {model_id}")
def display_response(response: GenerateContentResponse, show_as: ShowAs) -> None:
if show_as == ShowAs.DONT_SHOW or not (response_text := response.text):
return
print_caption("Start of Response")
response_text = response_text.strip()
match show_as:
case ShowAs.TEXT:
print(response_text)
case ShowAs.MARKDOWN:
display_markdown(response_text)
print_caption("End of Response")
def display_markdown(markdown: str) -> None:
    """Render a Markdown string in the notebook output."""
    IPython.display.display(IPython.display.Markdown(markdown))
print("✅ Helpers defined")
✅ Helpers defined
Before diving into a solution, it helps to start by prototyping to build some intuition about the natural behavior of the model.
Let's define a short text of a few sentences:
text = """
- Henry Jones is a famous archaeologist. He is actually a "Junior" because he is named after his father.
- Sophie is Henry's daughter, shares his last name, and works as a software engineer.
- William Smith is an aerospace engineer and Sophie's lifelong friend. Everybody calls him Bill and Beau is his dog.
- Short Round met Henry as a child. They first became close friends, and Henry officially adopted him a few years later.
- Sophie and Bill both work at Acme Aerospace.
"""
🧪 Let's see if Gemini can spot our characters…
prompt = """
Using only the input data, list all people and animals mentioned.
"""
generate_content(prompt, text)
----------------------- Request / gemini-3.1-flash-lite ------------------------
Input tokens : 148
Output tokens : 67
------------------------------ Start of Response -------------------------------
Based on the input data provided, here are the people and animals mentioned:
**People:**
* Henry Jones (also known as Henry Jones Junior)
* Sophie Jones
* William Smith (also known as Bill)
* Short Round
**Animals:**
* Beau (Bill's dog)
------------------------------- End of Response --------------------------------
💡 All people and animals are detected as expected.
🧪 Now, let's see if it can connect the dots and figure out who's who…
prompt = """
Using only the input data, list all people and animals mentioned, and how they relate to each other.
"""
generate_content(prompt, text)
----------------------- Request / gemini-3.1-flash-lite ------------------------
Input tokens : 156
Output tokens : 168
------------------------------ Start of Response -------------------------------
Based on the input data provided, here are the people and animals mentioned and their relationships:
**People:**
* **Henry Jones (Junior):** A famous archaeologist. He is the father of Sophie, the adoptive father of Short Round, and is named after his own father.
* **Sophie Jones:** A software engineer at Acme Aerospace. She is the daughter of Henry Jones and a lifelong friend of Bill (William Smith).
* **William (Bill) Smith:** An aerospace engineer at Acme Aerospace. He is a lifelong friend of Sophie and the owner of Beau.
* **Short Round:** The adopted son of Henry Jones. He met Henry as a child and they became close friends before the adoption.
**Animals:**
* **Beau:** A dog owned by William (Bill) Smith.
------------------------------- End of Response --------------------------------
💡 Notes
We're not domain experts in the field we're exploring (yet!).
An LLM processes instructions based on the given prompt and its training knowledge. This knowledge is part of its long-term memory, and we can learn a lot directly from the model itself.
🧪 Let's ask Gemini:
prompt = """
What is the terminology used when building a knowledge graph?
Please provide a simple data example in JSON.
"""
generate_content(prompt)
----------------------- Request / gemini-3.1-flash-lite ------------------------
Input tokens : 21
Output tokens : 581
------------------------------ Start of Response -------------------------------
Building a knowledge graph involves representing information as a network of interconnected entities. Here is the core terminology and a simple data example.
### Core Terminology
1. **Entity (Node):** The "things" in your graph (e.g., a person, a place, a product).
2. **Relationship (Edge/Link):** The connection between two entities. It describes how they interact (e.g., "works at," "lives in," "is a friend of").
3. **Property (Attribute):** Key-value pairs that provide more detail about an entity or a relationship (e.g., a person's "age" or a relationship's "start_date").
4. **Label:** A category assigned to a node or edge to define its type (e.g., a node might have the label "Person").
5. **Schema (Ontology):** The formal structure or "blueprint" that defines the types of entities allowed and the rules for how they can be connected.
6. **Triple:** The fundamental unit of a knowledge graph, consisting of a **Subject → Predicate → Object** (e.g., *Alice* → *works at* → *Google*).
---
### Simple Data Example (JSON)
In a knowledge graph, data is often represented as a collection of **Nodes** and **Edges**.
``` json
{
"nodes": [
{
"id": "1",
"label": "Person",
"properties": {
"name": "Alice",
"age": 30
}
},
{
"id": "2",
"label": "Company",
"properties": {
"name": "Google",
"industry": "Technology"
}
}
],
"edges": [
{
"id": "e1",
"source": "1",
"target": "2",
"label": "WORKS_AT",
"properties": {
"since": 2020
}
}
]
}
```
Breakdown of the Example:
- Nodes: We have two entities: "Alice" (a Person) and "Google" (a Company).
- Edge: We have one relationship: "WORKS_AT" connecting Alice to Google.
- Properties: We stored specific details like Alice's age and the year she started working at Google.
- Triple representation: This JSON effectively encodes the triple: (Alice) —[WORKS_AT]—> (Google).
------------------------------- End of Response --------------------------------
💡 We learn that knowledge graphs are made of **entities** and **relationships**, also called **nodes** and **edges**, and we get a nice introduction to the field. Using domain terminology will help make our prompts explicit and precise.
To extract knowledge graphs, we'll reason in terms of entities and relationships, adopting domain terminology.
If we think of the final result as a database, our goal is to generate two linked tables, allowing us to reason in terms of data and fields.
Here is a conceptual view of what we want to achieve:
**Entities**
| `id` | `name` | `label` |
|---|---|---|
| 0 | Henry Jones Jr. | person |
| 1 | Henry Jones Sr. | person |
**Relationships**
| `source_id` | `link` | `target_id` |
|---|---|---|
| 0 | child_of | 1 |
Let's call this approach "tabular extraction" and split our instructions to output two successive tables, while still using a single request…
In our prototype text, the entities we want to extract are characters (like people or animals). We can define an entity data schema with the fields `id` (0, 1, 2…), `name` (full name of the entity), and `label` (`person` | `animal`).
🧪 Let's extract the entities:
prompt = """
Data Schema

Entity:
- `id`: Unique integer identifier (0, 1, 2…).
- `name`: Full name of the entity.
- `label`: `person` | `animal`.

Instructions

- Entity Extraction:
  - Extract every distinct entity from the input data that matches an allowed `label`.
  - Include entities that are explicitly named as well as implied entities whose names can be determined from the context.
- Output the results as a JSON array inside a fenced code block.
"""
generate_content(prompt, text)
----------------------- Request / gemini-3.1-flash-lite ------------------------
Input tokens : 249
Output tokens : 195
------------------------------ Start of Response -------------------------------
[
  { "id": 0, "name": "Henry Jones Jr.", "label": "person" },
  { "id": 1, "name": "Henry Jones Sr.", "label": "person" },
  { "id": 2, "name": "Sophie Jones", "label": "person" },
  { "id": 3, "name": "William Smith", "label": "person" },
  { "id": 4, "name": "Beau", "label": "animal" },
  { "id": 5, "name": "Short Round", "label": "person" }
]
------------------------------- End of Response --------------------------------
💡 Remarks
🧪 Now, let's extract both the entities and their relationships:
prompt = """
Data Schema

Entity:
- `id`: Unique integer identifier (0, 1, 2…).
- `name`: Full name of the entity.
- `label`: `person` | `animal`.

Relationship:
- `source_id`: `id` of the subject entity.
- `link`: `snake_case` predicate describing the relationship.
- `target_id`: `id` of the object entity.

Instructions

- Entity Extraction:
  - Extract every distinct entity from the input data that matches an allowed `label`.
  - Include entities that are explicitly named as well as implied entities whose names can be determined from the context.
- Relationship Extraction:
  - Extract every distinct relationship between them.
  - If a relationship changes over time, make sure to include every distinct stage of the relationship.
- Output a JSON object with keys `entities` and `relationships` inside a fenced code block.
"""
response = generate_content(prompt, text, return_response=True)
----------------------- Request / gemini-3.1-flash-lite ------------------------
Input tokens : 340
Output tokens : 456
------------------------------ Start of Response -------------------------------
{
  "entities": [
    { "id": 0, "name": "Henry Jones Jr.", "label": "person" },
    { "id": 1, "name": "Henry Jones Sr.", "label": "person" },
    { "id": 2, "name": "Sophie Jones", "label": "person" },
    { "id": 3, "name": "William Smith", "label": "person" },
    { "id": 4, "name": "Beau", "label": "animal" },
    { "id": 5, "name": "Short Round", "label": "person" }
  ],
  "relationships": [
    { "source_id": 0, "link": "child_of", "target_id": 1 },
    { "source_id": 2, "link": "child_of", "target_id": 0 },
    { "source_id": 3, "link": "friend_of", "target_id": 2 },
    { "source_id": 4, "link": "pet_of", "target_id": 3 },
    { "source_id": 5, "link": "friend_of", "target_id": 0 },
    { "source_id": 0, "link": "adopted", "target_id": 5 },
    { "source_id": 5, "link": "child_of", "target_id": 0 }
  ]
}
------------------------------- End of Response --------------------------------
💡 Remarks
- … `relationships` array.
- The `link` predicates are completely dynamic (a level of flexibility we left in the prompt). While it's interesting to see this natural behavior, we'll want to make it more deterministic for production since our prompt has too many degrees of freedom.
- "[Animal] `pet_of` [Person]" is an asymmetric relationship that could also be extracted as "[Person] `owner_of` [Animal]". This is another area where the prompt is too open-ended. In the finalization section, we'll see an example that asks the model to extract symmetric and asymmetric relationships in both directions.

We've concluded our prototyping stage with promising results using a data schema.
To move to production, the next step is to control the generation with a specific structured output.
The JSON format has industry-wide support and serves as a core or intermediate format in many use cases.
For the next step, we would typically define classes using the Pydantic library and request a pure JSON output with a response schema in the config parameters:
`response_mime_type="application/json"`
`response_schema="CLASS_DERIVED_FROM_PYDANTIC_BASE_MODEL"`
(⚠️ However, JSON is a pretty verbose format, designed for interoperability but not optimized for size. Even if we generate compact JSON (also called minified JSON), it still has inherent verbosity due to:
ℹ️ When using LLMs, once the first token is generated, the remaining generation time is roughly proportional to the number of output tokens. Similarly, the cost of a request is based on token usage (input + output), with output tokens being significantly more expensive than input tokens.
💡 A better output structure will positively impact both generation speed and request cost.
Let's explore an alternative…
Our tabular-extraction problem clearly calls for table outputs. An interesting possibility is to ask for Tab-Separated Values (TSV) outputs. For example, we can define our output to be formatted as two consecutive TSV tables.
**Example output format**
tsv filename="entities.tsv"
id{TAB}name{TAB}label
[rows]
tsv filename="relationships.tsv"
source_id{TAB}link{TAB}target_id
[rows]
**Will this work?**
Generating structured outputs like TSV will work seamlessly, as Gemini excels at patterns. We just need to be explicit about what's expected.
**Will this be efficient?**
💡 For our use case, this structure looks optimal:
ℹ️ CSV could be another alternative, but common separators like commas are everywhere in natural language and frequently appear in names and descriptions (e.g., if we decide to extend entity fields). If you're interested in this topic, check out the [TOON format](https://github.com/toon-format/toon), which proposes a JSON alternative using a YAML+CSV mix.
To make an informed decision, we should actually compare the number of tokens needed to represent the same data…
``` python
import csv
import io
import json
import re
from typing import Literal
def get_data_from_response(response: GenerateContentResponse) -> dict:
    """Extract the JSON object contained in a response.

    The JSON is taken from the first fenced ``json`` code block if there is
    one, otherwise the whole response text is parsed.

    Returns:
        The parsed dict, or an empty dict (with a printed message) when the
        text is not valid JSON or is not a JSON object.
    """
    response_text = response.text or ""

    pattern = r"```json\s*(.*?)\s*```"
    # DOTALL: the fenced block spans several lines
    fenced_block = re.search(pattern, response_text, re.DOTALL)
    json_str = fenced_block.group(1) if fenced_block else response_text

    try:
        data = json.loads(json_str)
        if not isinstance(data, dict):
            print("❌ Returning empty dict (could not parse response as dict)")
            data = {}
    except (json.JSONDecodeError, TypeError):
        print("❌ Returning empty dict (failed parsing the JSON string)")
        data = {}

    return data
def get_tsv_string_from_data(data: dict) -> str:
    """Convert a dict of row lists into consecutive fenced TSV tables.

    Args:
        data: Maps a table name to a list of row dicts. The columns of a
            table are the keys of its first row.

    Returns:
        One fenced ``tsv filename="<name>.tsv"`` block per key, separated by
        a blank line. A table without rows yields an empty block.
    """
    tables: list[str] = []

    for key, items in data.items():
        rows = ""
        if items:
            with io.StringIO() as out:
                writer = csv.DictWriter(
                    out,
                    fieldnames=list(items[0].keys()),
                    delimiter="\t",
                    lineterminator="\n",
                )
                writer.writeheader()
                writer.writerows(items)
                rows = out.getvalue()
        # `rows` already ends with a newline, so the closing fence follows it
        tables.append(f'```tsv filename="{key}.tsv"\n{rows}```\n')

    return "\n".join(tables)
def print_text_excerpt(title: str, text: str, max_chars: int = 400) -> None:
    """Print ``text`` under a caption, truncated to ``max_chars`` characters.

    Args:
        title: Caption title.
        text: Text to print.
        max_chars: Maximum number of characters shown (must be positive).
    """
    assert max_chars > 0
    chars = len(text)

    if chars <= 0:
        print_caption("❌ Empty text")
        return

    if chars <= max_chars:
        print_caption(f"{title} ({chars} chars)")
        print(text)
        return

    # Longer text: show the leading excerpt and the shown/total ratio
    print_caption(
        f"{title} - First {max_chars}/{chars} chars ({max_chars / chars:.0%})"
    )
    print(f"{text[:max_chars]}…")
def compare_json_vs_tsv(
    response: GenerateContentResponse | None,
    only_show_excerpts: bool = False,
) -> None:
    """Compare formatted JSON, compact JSON, and TSV encodings of a response.

    The JSON data of ``response`` is re-encoded in the three formats, each
    wrapped in a fenced code block.

    Args:
        response: Response holding the JSON data to compare.
        only_show_excerpts: Print an excerpt of each format, cut to the
            length of the TSV version, instead of the gain tables.
    """

    def get_gain(rows: list[tuple[int, int]], col: Literal[0, 1]) -> str:
        # Relative reduction from the first row to the second row
        val_0, val_1 = rows[0][col], rows[1][col]
        return f"**{1 - val_1 / val_0:.1%}**" if val_0 > 0 else "?"

    def yield_table_string_rows(
        source_title: str,
        target_title: str,
        rows: list[tuple[int, int]],
    ) -> Iterator[list[str]]:
        yield ["", "Chars", "Tokens"]
        yield ["-", "-:", "-:"]
        for caption, values in zip([source_title, target_title], rows):
            yield [caption, *[str(value) for value in values]]
        yield ["**Gain**", get_gain(rows, 0), get_gain(rows, 1)]

    def display_gain_table(
        source_title: str,
        target_title: str,
        source_text: str,
        target_text: str,
    ) -> None:
        print_caption(f"{source_title} → {target_title}")
        model = Model.DEFAULT
        model_id = model.value
        client = check_client_for_model(model)

        rows: list[tuple[int, int]] = []
        for s in [source_text, target_text]:
            chars = len(s)
            tokens = client.models.count_tokens(model=model_id, contents=s).total_tokens
            rows.append((chars, tokens or 0))

        markdown = "\n".join(
            "| " + " | ".join(row) + " |"
            for row in yield_table_string_rows(source_title, target_title, rows)
        )
        display_markdown(markdown)

    if response is None:
        print("❌ response is None")
        return

    data = get_data_from_response(response)
    formatted_json = f"```json\n{json.dumps(data, indent=2)}\n```"
    compact_json = f"```json\n{json.dumps(data, separators=(',', ':'))}\n```"
    tsv = get_tsv_string_from_data(data)

    if only_show_excerpts:
        # Same character budget for every format: the length of the TSV text
        max_chars = len(tsv)
        print_text_excerpt("Formatted JSON", formatted_json, max_chars)
        print_text_excerpt("Compact JSON", compact_json, max_chars)
        print_text_excerpt("TSV", tsv, max_chars)
        return

    display_gain_table("Formatted JSON", "Compact JSON", formatted_json, compact_json)
    display_gain_table("Compact JSON", "TSV", compact_json, tsv)
    display_gain_table("Formatted JSON", "TSV", formatted_json, tsv)
    print_separator()
print("✅ JSON vs TSV helpers defined")
✅ JSON vs TSV helpers defined
🧪 First, let's compare how much data we can represent for the same number of characters based on our latest API response:
compare_json_vs_tsv(response, only_show_excerpts=True)
----------------- Formatted JSON - First 335/1122 chars (30%) ------------------
``` json
{
"entities": [
{
"id": 0,
"name": "Henry Jones Jr.",
"label": "person"
},
{
"id": 1,
"name": "Henry Jones Sr.",
"label": "person"
},
{
"id": 2,
"name": "Sophie Jones",
"label": "person"
},
{
"id": 3,
"name": "William Smith",
…
------------------- Compact JSON - First 335/665 chars (50%) -------------------
``` json
{"entities":[{"id":0,"name":"Henry Jones Jr.","label":"person"},{"id":1,"name":"Henry Jones Sr.","label":"person"},{"id":2,"name":"Sophie Jones","label":"person"},{"id":3,"name":"William Smith","label":"person"},{"id":4,"name":"Beau","label":"animal"},{"id":5,"name":"Short Round","label":"person"}],"relationships":[{"source_i…
------------------------------- TSV (335 chars) --------------------------------
```tsv filename="entities.tsv"
id name label
0 Henry Jones Jr. person
1 Henry Jones Sr. person
2 Sophie Jones person
3 William Smith person
4 Beau animal
5 Short Round person
```

```tsv filename="relationships.tsv"
source_id link target_id
0 child_of 1
2 child_of 0
3 friend_of 2
4 pet_of 3
5 friend_of 0
0 adopted 5
5 child_of 0
```
💡 Notice how much more information can be represented in the same number of text characters. This will apply similarly to token counts.
🧪 And now, let's compare the gains, especially for token counts:
compare_json_vs_tsv(response)
------------------------ Formatted JSON → Compact JSON -------------------------
| | Chars | Tokens |
|---|---:|---:|
| Formatted JSON | 1122 | 456 |
| Compact JSON | 665 | 216 |
| **Gain** | **40.7%** | **52.6%** |
------------------------------ Compact JSON → TSV ------------------------------
| | Chars | Tokens |
|---|---:|---:|
| Compact JSON | 665 | 216 |
| TSV | 335 | 137 |
| **Gain** | **49.6%** | **36.6%** |
----------------------------- Formatted JSON → TSV -----------------------------
| | Chars | Tokens |
|---|---:|---:|
| Formatted JSON | 1122 | 456 |
| TSV | 335 | 137 |
| **Gain** | **70.1%** | **70.0%** |
💡 **Savings in output tokens:**
With a double-digit percentage reduction in output tokens, building knowledge graphs with TSV outputs is significantly faster (and cheaper)!
Now, let's finalize our code with optimized structures…
First, it helps to define a structured prompt template, so we can focus on specific parts of our solution using a divide-and-conquer approach.
Here's a possible prompt template:
# Prompt skeleton with three Markdown sections (bold titles), filled in by
# str.format with the `data_schema`, `instructions`, and `output_format` fields.
KNOWLEDGE_GRAPH_PROMPT_TEMPLATE = """
**Data Schema**
{data_schema}

**Instructions**
{instructions}

**Output Format**
{output_format}
"""
Then, here are some possible `Entity`, `Relationship`, and `KnowledgeGraph` data classes with the matching output format:
``` python
from dataclasses import dataclass, field
@dataclass
class Entity:
    """A graph entity; field order matches the `id`/`name`/`label` TSV header."""

    id: int
    name: str
    label: str


@dataclass
class Relationship:
    """A directed link from the entity `source_id` to the entity `target_id`."""

    source_id: int
    link: str
    target_id: int


@dataclass
class KnowledgeGraph:
    """Container for the extracted entities and relationships (empty by default)."""

    entities: list[Entity] = field(default_factory=list)
    relationships: list[Relationship] = field(default_factory=list)
TAB = "\t"

# Output specification appended to the prompt. Each block is a complete fenced
# code block: extract_tsv_block looks for the ```tsv filename="<stem>.tsv"
# opening line and the closing ``` fence.
KNOWLEDGE_GRAPH_OUTPUT_FORMAT = f"""
Format the output strictly as two TSV code blocks (including the header row):

```tsv filename="entities.tsv"
id{TAB}name{TAB}label
[data_rows]
```

```tsv filename="relationships.tsv"
source_id{TAB}link{TAB}target_id
[data_rows]
```
"""
💡 While the Gen AI SDK natively supports Pydantic models for JSON structured outputs, we're using standard Python data classes here and TSV outputs to maximize our token efficiency.
ℹ️ If you use multiple entity or relationship data classes in your solution, you can dynamically generate the output format specification using features of the `dataclasses` package (like class docstrings and field descriptions).
``` python
from dataclasses import fields, is_dataclass
from typing import get_args, get_origin, get_type_hints
def generate_knowledge_graph(
    data_schema: str,
    instructions: str,
    source: Source | str,
    *,
    model: Model | None = None,
    config: GenerateContentConfig | None = None,
    system_instruction: str | None = None,
    show_prompt: ShowAs = ShowAs.DONT_SHOW,
    show_response: ShowAs = ShowAs.DONT_SHOW,
) -> KnowledgeGraph:
    """Request a knowledge graph for *source* and parse the TSV response.

    The prompt is built from *data_schema* and *instructions*; the remaining
    keyword arguments are forwarded to `generate_content`.

    Returns:
        The parsed graph, or an empty `KnowledgeGraph` when `generate_content`
        does not return a `GenerateContentResponse`. Summary info is displayed
        in both cases.
    """
    response = generate_content(
        get_prompt_for_data_schema_and_instructions(data_schema, instructions),
        source,
        model=model,
        config=config,
        system_instruction=system_instruction,
        show_prompt=show_prompt,
        show_response=show_response,
        return_response=True,
    )
    if isinstance(response, GenerateContentResponse):
        knowledge_graph = parse_list_dataclass(KnowledgeGraph, response)
    else:
        knowledge_graph = KnowledgeGraph()
    display_knowledge_graph_info(knowledge_graph)
    return knowledge_graph
def show_knowledge_graph_prompt(
    data_schema: str,
    instructions: str,
    source: Source | str,
    *,
    model: Model | None = None,
    config: GenerateContentConfig | None = None,
    system_instruction: str | None = None,
    show_as: ShowAs = ShowAs.TEXT,
) -> None:
    """Display the knowledge graph prompt for *source*.

    Calls `generate_content` with ``only_show_prompt=True`` and the prompt
    rendered as *show_as*.
    """
    generate_content(
        get_prompt_for_data_schema_and_instructions(data_schema, instructions),
        source,
        model=model,
        config=config,
        system_instruction=system_instruction,
        show_prompt=show_as,
        only_show_prompt=True,
    )
def get_prompt_for_data_schema_and_instructions(
    data_schema: str,
    instructions: str,
) -> str:
    """Fill the prompt template with the schema, instructions and output format.

    Every part, and the final prompt, is stripped of surrounding whitespace.
    """
    sections = {
        "data_schema": data_schema.strip(),
        "instructions": instructions.strip(),
        "output_format": KNOWLEDGE_GRAPH_OUTPUT_FORMAT.strip(),
    }
    prompt = KNOWLEDGE_GRAPH_PROMPT_TEMPLATE.format(**sections)
    return prompt.strip()
def parse_list_dataclass[T](cls: type[T], response: GenerateContentResponse) -> T:
assert is_dataclass(cls)
if not (response_text := response.text):
return cls()
data = {}
for f in fields(cls):
origin, list_types = get_origin(f.type), get_args(f.type)
assert (
origin is list
), f"❌ Field {f.name} must be a list[dataclass] parameterized list"
assert len(list_types) == 1, f"❌ Expected 1 single type: {len(list_types)=}"
data[f.name] = parse_tsv_block(list_types[0], response_text, f.name)
return cls(**data)
def parse_tsv_block[T](cls: type[T], data: str, tsv_filestem: str) -> list[T]:
rows = []
valid_fields = get_type_hints(cls)
tsv_string = extract_tsv_block(data, tsv_filestem)
for row in csv.DictReader(io.StringIO(tsv_string), delimiter="\t"):
casted_data = {}
for key, value in row.items():
if key not in valid_fields or value is None:
continue
field_type = valid_fields[key]
try: # Note: Works only for directly castable types such as int, float, str, enum (e.g., not bool)
casted_data[key] = field_type(value)
except (ValueError, TypeError):
print(f'❌ Could not cast "{value}" to {field_type} → Skipping {row=}')
break
else:
try:
rows.append(cls(**casted_data))
except TypeError as e:
print(f"❌ Could not instantiate {cls.__name__}: {e} → Skipping {row=}")
return rows
def extract_tsv_block(data: str, filestem: str) -> str:
    """Return the content of the fenced TSV block named ``<filestem>.tsv``.

    Args:
        data: Text containing ```tsv filename="<filestem>.tsv" code blocks.
        filestem: File stem identifying the block to extract.

    Returns:
        The text between the opening line and the closing fence, or an empty
        string (after printing a message) when no such block is found.
    """
    # Non-greedy capture up to the first closing fence after the opening line.
    pattern = rf'```tsv filename="{re.escape(filestem)}\.tsv"\s*\n(.*?)```'
    if not (match := re.search(pattern, data, flags=re.DOTALL)):
        print(f'❌ Could not find a TSV block for "{filestem=}"')
        return ""
    return match.group(1)
def display_knowledge_graph_info(kg: KnowledgeGraph) -> None:
    """Print the number of entities and relationships of *kg*."""
    entity_count = len(kg.entities)
    relationship_count = len(kg.relationships)
    print_caption("Knowledge Graph Info")
    print(f"Entities      : {entity_count:3,d}")
    print(f"Relationships : {relationship_count:3,d}")
    print_separator()
print("✅ Knowledge graph generation helpers defined")
✅ Knowledge graph generation helpers defined
And here is a possible data schema with some instructions to generate a knowledge graph for our book analysis use case:
from enum import StrEnum, auto
class BookAnalysisEntityLabel(StrEnum):
PERSON = auto()
ANIMAL = auto()
ORGANIZATION = auto()
def pipe_delimited_union(enum: type[StrEnum]) -> str:
return "|".join(f"`{e.value}`" for e in enum)
# Data schema section of the book analysis prompt: describes the Entity and
# Relationship fields; the allowed labels come from BookAnalysisEntityLabel.
BOOK_ANALYSIS_DATA_SCHEMA = f"""
Entity:
- `id`: Unique integer identifier (0, 1, 2…).
- `name`: Most complete name as exclusively determined from the input data.
- `label`: {pipe_delimited_union(BookAnalysisEntityLabel)}.
Relationship:
- `source_id`: `id` of the subject entity.
- `link`: `snake_case` predicate.
- `target_id`: `id` of the object entity.
"""
# Instructions section of the book analysis prompt.
# NOTE(review): the bullets following a line ending with ":" read as sub-items;
# their nested indentation may have been lost in this copy — confirm against
# the original prompt.
BOOK_ANALYSIS_INSTRUCTIONS = """
- Extract every distinct entity:
- Treat distinct pseudonyms/identities as separate entities.
- Include implied entities whose names can be exclusively determined from the input data.
- Extract every distinct relationship between them:
- Use specific `link` predicates in `snake_case` as needed (e.g., `alias_of`, `son_of`, `fiancée_of`, `friend_of`, `murderer_of`, `employer_of`, `in_love_with`, `rival_of`).
- If a relationship changes over time, make sure to include every distinct stage of the relationship.
- For every asymmetric relationship extracted, make sure to include the logical inverse relationship (e.g., `A husband_of B` AND `B wife_of A`, `A employer_of B` AND `B employee_of A`).
- For every symmetric relationship extracted, make sure to include both directions (e.g., `A friend_of B` AND `B friend_of A`).
"""
Verify the structured prompt:
show_knowledge_graph_prompt(
BOOK_ANALYSIS_DATA_SCHEMA,
BOOK_ANALYSIS_INSTRUCTIONS,
text,
show_as=ShowAs.TEXT,
)
------------------------------------ Prompt ------------------------------------
==Start of input data==
- Henry Jones is a famous archaeologist. He is actually a "Junior" because he is named after his father.
- Sophie is Henry's daughter, shares his last name, and works as a software engineer.
- William Smith is an aerospace engineer and Sophie's lifelong friend. Everybody calls him Bill and Beau is his dog.
- Short Round met Henry as a child. They first became close friends, and Henry officially adopted him a few years later.
- Sophie and Bill both work at Acme Aerospace.
==End of input data==
==Start of user prompt==
**Data Schema**
Entity:
- `id`: Unique integer identifier (0, 1, 2…).
- `name`: Most complete name as exclusively determined from the input data.
- `label`: `person`|`animal`|`organization`.
Relationship:
- `source_id`: `id` of the subject entity.
- `link`: `snake_case` predicate.
- `target_id`: `id` of the object entity.
**Instructions**
- Extract every distinct entity:
- Treat distinct pseudonyms/identities as separate entities.
- Include implied entities whose names can be exclusively determined from the input data.
- Extract every distinct relationship between them:
- Use specific `link` predicates in `snake_case` as needed (e.g., `alias_of`, `son_of`, `fiancée_of`, `friend_of`, `murderer_of`, `employer_of`, `in_love_with`, `rival_of`).
- If a relationship changes over time, make sure to include every distinct stage of the relationship.
- For every asymmetric relationship extracted, make sure to include the logical inverse relationship (e.g., `A husband_of B` AND `B wife_of A`, `A employer_of B` AND `B employee_of A`).
- For every symmetric relationship extracted, make sure to include both directions (e.g., `A friend_of B` AND `B friend_of A`).
**Output Format**
Format the output strictly as two TSV code blocks (including the header row):
```tsv filename="entities.tsv"
id name label
[data_rows]
```

```tsv filename="relationships.tsv"
source_id link target_id
[data_rows]
```
==End of user prompt==
--------------------------------------------------------------------------------
🧪 Let's generate a knowledge graph:
knowledge_graph = generate_knowledge_graph(
    BOOK_ANALYSIS_DATA_SCHEMA,
    BOOK_ANALYSIS_INSTRUCTIONS,
    text,
    show_response=ShowAs.TEXT,
)
print(knowledge_graph.entities)
print(knowledge_graph.relationships)
----------------------- Request / gemini-3.1-flash-lite ------------------------
Input tokens : 534
Output tokens : 244
------------------------------ Start of Response -------------------------------
```tsv filename="entities.tsv"
id name label
0 Henry Jones Jr. person
1 Henry Jones Sr. person
2 Sophie Jones person
3 William Smith person
4 Bill person
5 Beau animal
6 Short Round person
7 Acme Aerospace organization
```

```tsv filename="relationships.tsv"
source_id link target_id
0 son_of 1
1 father_of 0
0 father_of 2
2 daughter_of 0
2 friend_of 3
3 friend_of 2
3 alias_of 4
4 alias_of 3
3 employee_of 7
7 employer_of 3
2 employee_of 7
7 employer_of 2
3 owner_of 5
5 pet_of 3
6 friend_of 0
0 friend_of 6
0 adopted_father_of 6
6 adopted_son_of 0
```
------------------------------- End of Response --------------------------------
----------------------------- Knowledge Graph Info -----------------------------
Entities : 8
Relationships : 18
--------------------------------------------------------------------------------
[Entity(id=0, name='Henry Jones Jr.', label='person'), Entity(id=1, name='Henry Jones Sr.', label='person'), Entity(id=2, name='Sophie Jones', label='person'), Entity(id=3, name='William Smith', label='person'), Entity(id=4, name='Bill', label='person'), Entity(id=5, name='Beau', label='animal'), Entity(id=6, name='Short Round', label='person'), Entity(id=7, name='Acme Aerospace', label='organization')] [Relationship(source_id=0, link='son_of', target_id=1), Relationship(source_id=1, link='father_of', target_id=0), Relationship(source_id=0, link='father_of', target_id=2), Relationship(source_id=2, link='daughter_of', target_id=0), Relationship(source_id=2, link='friend_of', target_id=3), Relationship(source_id=3, link='friend_of', target_id=2), Relationship(source_id=3, link='alias_of', target_id=4), Relationship(source_id=4, link='alias_of', target_id=3), Relationship(source_id=3, link='employee_of', target_id=7), Relationship(source_id=7, link='employer_of', target_id=3), Relationship(source_id=2, link='employee_of', target_id=7), Relationship(source_id=7, link='employer_of', target_id=2), Relationship(source_id=3, link='owner_of', target_id=5), Relationship(source_id=5, link='pet_of', target_id=3), Relationship(source_id=6, link='friend_of', target_id=0), Relationship(source_id=0, link='friend_of', target_id=6), Relationship(source_id=0, link='adopted_father_of', target_id=6), Relationship(source_id=6, link='adopted_son_of', target_id=0)]
💡 This is looking good!
Now, let's go to the next stage and build a network graph from our data…
Now that we have our entities and relationships neatly packed into data classes, let's bring them to life. We'll use `networkx` to build a network graph. Using domain terminology, entities become nodes and relationships become directed edges. We'll also calculate node centralities to identify key nodes and use the Louvain method to detect communities (clusters of closely related nodes)…
``` python
import textwrap
from typing import cast
import networkx as nx
import numpy as np
from networkx.algorithms.community.louvain import louvain_communities
# Attribute keys under which computed values are stored on graph nodes and edges
NODE_CENTRALITY = "node_centrality"
NODE_COMMUNITY_INDEX = "node_community_index"
NODE_COLOR = "node_color"
EDGE_COLOR = "edge_color"
# When enabled, display names are wrapped onto lines of at most MULTILINE_CHARS characters
MULTILINE_NAMES = True
MULTILINE_CHARS = 12
def build_graph(kg: KnowledgeGraph, remove_orphan_nodes: bool) -> nx.DiGraph:
    """Build a directed graph from the entities and relationships of *kg*.

    Each entity becomes a node carrying its display name. Each relationship
    becomes an edge with a `link` label and a `weight`. Relationships sharing
    the same source and target are merged into one edge: labels are joined
    with newlines and weights are summed. Relationships referring to an
    unknown entity id are skipped with a message.

    Args:
        kg: Knowledge graph to convert.
        remove_orphan_nodes: Whether to drop nodes without any edge.
    """
    graph = nx.DiGraph()

    node_name_from_id: dict[int, str] = {}
    for entity in kg.entities:
        node_name, display_name = get_node_and_display_names_for_entity(entity)
        node_name_from_id[entity.id] = node_name
        graph.add_node(node_name, name=display_name)

    for relationship in kg.relationships:
        source_node = node_name_from_id.get(relationship.source_id, "")
        target_node = node_name_from_id.get(relationship.target_id, "")
        if not (source_node and target_node):
            print(f"❌ Skipping relationship due to empty node:\n{relationship}")
            continue
        if not graph.has_edge(source_node, target_node):
            graph.add_edge(source_node, target_node, link=relationship.link, weight=1)
            continue
        # Edge already present: accumulate the label and the weight.
        edge_data = graph[source_node][target_node]
        edge_data["link"] += f"\n{relationship.link}"
        edge_data["weight"] += 1

    if remove_orphan_nodes:
        orphans = list(nx.isolates(graph))
        graph.remove_nodes_from(orphans)
    return graph
def get_node_and_display_names_for_entity(entity: Entity) -> tuple[str, str]:
    """Return the node key and the display name of *entity*.

    The node key is the entity id followed by the lowercased,
    underscore-joined name. The display name is the entity name, wrapped over
    several lines when MULTILINE_NAMES is enabled.
    """
    words = [word.lower() for word in entity.name.split()]
    node_name = f"{entity.id}_{'_'.join(words)}"
    if not MULTILINE_NAMES:
        return node_name, entity.name
    wrapped_lines = textwrap.wrap(entity.name, width=MULTILINE_CHARS)
    return node_name, "\n".join(wrapped_lines)
def color_gen(color_count: int) -> Iterator[str]:
    """Yield *color_count* hex colors, cycling through a 12-color palette."""
    palette = (
        # Blue, red, yellow, green in three intensities (50, 20, 05)
        "#4285F4", "#EA4335", "#FBBC04", "#34A853",
        "#AECBFA", "#F6AEA9", "#FDE293", "#A8DAB5",
        "#E8F0FE", "#FCE8E6", "#FEF7E0", "#E6F4EA",
    )
    palette_size = len(palette)
    yield from (palette[index % palette_size] for index in range(color_count))
# Type aliases used by the graph helpers
Node = str  # key of a node in the graph
Positions = dict[Node, np.ndarray]  # layout coordinates per node
Community = set[Node]
Communities = list[Community]
Nodes = list[Node]
# Color applied to edges whose endpoints belong to different communities
# (presumably a short "#RGBA" hex value, i.e. translucent gray — confirm)
INTER_COMMUNITY_EDGE_COLOR = "#8888"
def init_graph_data(graph: nx.Graph) -> Nodes:
    """Annotate *graph* with centralities, communities and colors.

    Stores on every node its betweenness centrality, community index and
    color, and on every edge its color: the community color when both
    endpoints share a community, the inter-community color otherwise.

    Returns:
        The nodes grouped by community (communities ordered by decreasing
        maximum centrality) and sorted by decreasing centrality within each
        community.
    """

    def node_centrality(node: Node) -> float:
        return graph.nodes[node][NODE_CENTRALITY]

    def community_max_centrality(community: Community) -> float:
        return max(map(node_centrality, community))

    # Node centralities
    centralities = nx.betweenness_centrality(graph, endpoints=True)
    for node_key, centrality in centralities.items():
        graph.nodes[node_key][NODE_CENTRALITY] = centrality

    # Communities, most central first (fixed seed for reproducibility)
    communities = cast(Communities, louvain_communities(graph, seed=42))
    ranked_communities = sorted(
        communities, key=community_max_centrality, reverse=True
    )

    # One color per community
    palette = list(color_gen(len(ranked_communities)))
    for community_index, (community, color) in enumerate(
        zip(ranked_communities, palette)
    ):
        for node_key in community:
            node = graph.nodes[node_key]
            node[NODE_COMMUNITY_INDEX] = community_index
            node[NODE_COLOR] = color

    # Edge colors
    for source_key, target_key, edge_data in graph.edges(data=True):
        source, target = graph.nodes[source_key], graph.nodes[target_key]
        if source[NODE_COMMUNITY_INDEX] == target[NODE_COMMUNITY_INDEX]:
            edge_data[EDGE_COLOR] = source[NODE_COLOR]
        else:
            edge_data[EDGE_COLOR] = INTER_COMMUNITY_EDGE_COLOR

    return [
        node
        for community in ranked_communities
        for node in sorted(community, key=node_centrality, reverse=True)
    ]
def compute_node_positions(graph: nx.DiGraph, entities: Nodes) -> Positions:
    """Compute layout positions for the nodes listed in *entities*.

    An undirected copy is built with the nodes inserted in the given order and
    the edges of *graph*. A circular layout (fewer than 10 nodes) or a
    Kamada-Kawai layout is used as the starting point of the final ARF layout.
    """
    undirected_graph = nx.Graph()
    undirected_graph.add_nodes_from(entities)
    undirected_graph.add_edges_from(graph.edges())

    initial_layout = (
        nx.circular_layout if len(entities) < 10 else nx.kamada_kawai_layout
    )
    initial_positions = initial_layout(undirected_graph)
    return nx.arf_layout(undirected_graph, initial_positions, seed=42)
@dataclass
class GraphData:
    """Network graph derived from a knowledge graph, ready to be drawn.

    `graph`, `nodes` and `positions` are not constructor arguments: they are
    computed from `knowledge_graph` once the instance is created.
    """

    knowledge_graph: KnowledgeGraph
    remove_orphan_nodes: bool = True
    graph: nx.DiGraph = field(init=False)
    nodes: Nodes = field(init=False)
    positions: Positions = field(init=False)

    def __post_init__(self) -> None:
        """Build the graph, annotate it, then compute the node positions."""
        graph = build_graph(self.knowledge_graph, self.remove_orphan_nodes)
        nodes = init_graph_data(graph)
        self.graph = graph
        self.nodes = nodes
        self.positions = compute_node_positions(graph, nodes)
print("✅ Network graph helpers defined")
✅ Network graph helpers defined
Let's test this:
graph_data = GraphData(knowledge_graph)
print(f"{graph_data.graph = !s}")
print(f"{graph_data.nodes = }")
graph_data.graph = DiGraph with 8 nodes and 16 edges
graph_data.nodes = ['2_sophie_jones', '3_william_smith', '7_acme_aerospace', '5_beau', '4_bill', '0_henry_jones_jr.', '1_henry_jones_sr.', '6_short_round']
The extracted data is much easier to understand when you can actually see it! We can use `matplotlib` to draw our network graphs. We'll size the nodes based on their centrality and color-code them by community. To make it even easier to digest, we'll generate an animated sequence highlighting each character's connections one by one…
import typing
from io import BytesIO
import matplotlib.pyplot as plt
from IPython import display
from matplotlib.axes import Axes
from matplotlib.backends.backend_agg import FigureCanvasAgg
from matplotlib.figure import Figure
from PIL import Image as PilImage
class AnimationFormat(StrEnum):
WEBP = auto()
PNG = auto()
GIF = auto()
# Figure settings: 16:9 figures scaled by FIGURE_FACTOR
FIGURE_DPI = 200
FIGURE_FACTOR = 1.0
# Frame durations passed to the image writer (first frame, then following frames)
ANIMATION_INTRO_DURATION = 2500
ANIMATION_FRAME_DURATION = 250
# Connection style shared by edges and edge labels
EDGE_STYLE = "arc3,rad=0.2"
NodeSizes = dict[Node, int]
# Animation format depends on the runtime environment
ANIMATION_FORMAT = (
    AnimationFormat.WEBP if running_in_colab_env() else AnimationFormat.GIF
)
def init_figure(title: str, subtitle: str) -> tuple[Figure, Axes]:
    """Create a 16:9 figure with *title* on the left and *subtitle* on the right.

    The axes are hidden and a tight layout is applied.
    """
    width, height = 16 * FIGURE_FACTOR, 9 * FIGURE_FACTOR
    fig, ax = plt.subplots(figsize=(width, height), dpi=FIGURE_DPI)
    for text, location in ((title, "left"), (subtitle, "right")):
        ax.set_title(text, loc=location)
    ax.axis("off")
    fig.tight_layout(pad=2)
    return fig, ax
def draw_nodes(graph: nx.Graph, positions: Positions, ax: Axes) -> NodeSizes:
    """Draw the nodes and their labels on *ax*.

    Nodes are sized from their centrality (with a minimum size) and colored
    with their community color. For graphs of 10 nodes or more, the axis
    limits are widened to leave margins around the drawing.

    Returns:
        The size used for each node.
    """
    node_sizes: NodeSizes = {}
    node_colors: list[str] = []
    labels = {}
    for node, data in graph.nodes(data=True):
        node_sizes[node] = max(500, int(10000 * data[NODE_CENTRALITY]))
        node_colors.append(str(data[NODE_COLOR]))
        labels[node] = data.get("name", str(node))

    nx.draw_networkx_nodes(
        graph,
        pos=positions,
        node_size=list(node_sizes.values()),
        node_color=node_colors,
        alpha=0.95,
        ax=ax,
        linewidths=3.0,
    )
    nx.draw_networkx_labels(graph, positions, labels=labels, ax=ax)

    if len(node_sizes) >= 10:
        # Read both ranges before changing any of them.
        x_low, x_high = ax.get_xlim()
        y_low, y_high = ax.get_ylim()
        ax.set_xlim(int(x_low - 1.0), int(x_high + 1.0))
        ax.set_ylim(int(y_low - 1.0), int(y_high + 1.0))
    return node_sizes
def draw_edges(
    graph: nx.DiGraph,
    positions: Positions,
    node_sizes: NodeSizes,
    ax: Axes,
    *,
    focused_node: Node | None = None,
) -> None:
    """Draw edges and their `link` labels on *ax*.

    Args:
        graph: Graph whose edges carry the edge color and `link` attributes.
        positions: Node positions.
        node_sizes: Node sizes, as returned by `draw_nodes`.
        ax: Axes to draw on.
        focused_node: When set, only the edges returned by `graph.edges` for
            this node are drawn; otherwise all edges are drawn.
    """
    if focused_node:
        out_edges = list(graph.edges([focused_node], data=True))
    else:
        out_edges = list(graph.edges(data=True))

    edge_list = []
    edge_colors = []
    edge_labels = {}
    for source, target, data in out_edges:
        edge_colors.append(data[EDGE_COLOR])
        edge_list.append((source, target))
        edge_labels[(source, target)] = data["link"]

    # Sizes must follow the node order of the graph.
    ordered_sizes = [node_sizes[node] for node in graph.nodes()]

    nx.draw_networkx_edges(
        graph,
        positions,
        edge_list,
        edge_color=edge_colors,
        style=":",
        alpha=0.9,
        arrowstyle="-|>",
        arrowsize=20,
        ax=ax,
        node_size=ordered_sizes,
        connectionstyle=EDGE_STYLE,
    )
    nx.draw_networkx_edge_labels(
        graph,
        positions,
        edge_labels,
        font_size=8,
        font_family="monospace",
        bbox={"ec": "#FFF8", "fc": "#FFF8"},
        ax=ax,
        node_size=ordered_sizes,  # type: ignore
        connectionstyle=EDGE_STYLE,  # type: ignore
    )
def display_graph(title: str, subtitle: str, graph_data: GraphData) -> None:
    """Draw the nodes and all edges of *graph_data* and show the figure."""
    _, ax = init_figure(title, subtitle)
    graph, positions = graph_data.graph, graph_data.positions
    node_sizes = draw_nodes(graph, positions, ax)
    draw_edges(graph, positions, node_sizes, ax)
    plt.show()
def yield_images(
    title: str,
    subtitle: str,
    graph_data: GraphData,
) -> Iterator[PilImage.Image]:
    """Yield the animation frames of *graph_data* as RGB images.

    The first frame shows the nodes only. Each following frame adds the edges
    of the next node of `graph_data.nodes` on the same axes, so the last frame
    shows every drawn edge.

    The figure is closed when the generator finishes, including when it is
    closed early or when drawing raises.
    """
    fig, ax = init_figure(title, subtitle)
    try:
        canvas = FigureCanvasAgg(fig)
        graph = graph_data.graph
        positions = graph_data.positions
        node_sizes = draw_nodes(graph, positions, ax)
        for focused_node in [None, *graph_data.nodes]:
            if focused_node is not None:
                draw_edges(graph, positions, node_sizes, ax, focused_node=focused_node)
            canvas.draw()
            image_size = canvas.get_width_height()
            image_bytes = canvas.buffer_rgba()
            yield PilImage.frombytes("RGBA", image_size, image_bytes).convert("RGB")
    finally:
        # Always release the figure, even if iteration stops before the end.
        plt.close(fig)
def generate_animation(
    title: str,
    subtitle: str,
    graph_data: GraphData,
    format: AnimationFormat,
) -> BytesIO:
    """Render the frames of *graph_data* into an animated image.

    The last generated frame is saved first, with the intro duration. The
    other frames follow in generation order with the regular frame duration.
    For GIF, every frame is quantized with the palette of the last frame.

    Returns:
        The in-memory buffer holding the encoded animation.
    """
    frames = list(yield_images(title, subtitle, graph_data))
    assert len(frames) >= 1

    if format == AnimationFormat.GIF:
        method = PilImage.Quantize.MEDIANCUT
        palettized = frames[-1].quantize(method=method)
        frames = [frame.quantize(method=method, palette=palettized) for frame in frames]

    *next_frames, first_frame = frames
    durations = [
        ANIMATION_INTRO_DURATION,
        *([ANIMATION_FRAME_DURATION] * len(next_frames)),
    ]
    params: dict[str, typing.Any] = {
        "save_all": True,
        "append_images": next_frames,
        "duration": durations,
        "loop": 0,
    }
    # Format-specific encoder options
    if format == AnimationFormat.GIF:
        params["optimize"] = False
    elif format == AnimationFormat.PNG:
        params["optimize"] = True
    elif format == AnimationFormat.WEBP:
        params["lossless"] = True

    image_io = BytesIO()
    first_frame.save(image_io, str(format).upper(), **params)
    return image_io
def display_graph_animation(title: str, subtitle: str, graph_data: GraphData) -> None:
    """Generate the animation in ANIMATION_FORMAT and display it."""
    animation_bytes = generate_animation(
        title, subtitle, graph_data, ANIMATION_FORMAT
    ).getvalue()
    display.display(display.Image(data=animation_bytes))
def get_graph_title_and_subtitle(
    source: Source | str,
    model: Model | None = None,
    domain: str = "",
) -> tuple[str, str]:
    """Return the figure title and subtitle for a knowledge graph.

    The title is "Knowledge Graph", followed by *domain* when provided. The
    subtitle combines the source name (or the quoted first 20 characters of a
    text source) with the model id, without its "-preview" suffix.
    """
    separator = " • "

    title_parts = ["Knowledge Graph"]
    if domain:
        title_parts.append(domain)

    source_name = (
        f'"{source.strip()[:20]}…"' if isinstance(source, str) else source.name
    )
    model_id = (model or Model.DEFAULT).value.removesuffix("-preview")

    title = separator.join(title_parts)
    subtitle = separator.join([source_name, model_id])
    return title, subtitle
def display_knowledge_graph(
    knowledge_graph: KnowledgeGraph,
    source: Source | str,
    model: Model | None = None,
    animated: bool = False,
    domain: str = "",
    remove_orphan_nodes: bool = True,
) -> None:
    """Display *knowledge_graph* as a static or animated network graph.

    Nothing is displayed when the knowledge graph has no entities.
    """
    if not knowledge_graph.entities:
        return
    title, subtitle = get_graph_title_and_subtitle(source, model, domain)
    graph_data = GraphData(knowledge_graph, remove_orphan_nodes)
    render = display_graph_animation if animated else display_graph
    render(title, subtitle, graph_data)
def extract_knowledge_graph(
    data_schema: str,
    instructions: str,
    source: Source | str,
    model: Model | None = None,
    *,
    domain: str = "",
    animated: bool = False,
) -> None:
    """Generate a knowledge graph for *source* and display it.

    A caption describing the input data is displayed first when *source* is a
    `Source`.
    """
    if isinstance(source, Source):
        display_input_data_caption(source)
    knowledge_graph = generate_knowledge_graph(
        data_schema, instructions, source, model=model
    )
    display_knowledge_graph(
        knowledge_graph,
        source,
        model,
        animated,
        domain,
    )
print("✅ Data visualization helpers defined")
✅ Data visualization helpers defined
Let's test this:
display_knowledge_graph(knowledge_graph, text)
💡 We can now quickly visualize and understand our knowledge graphs. This helps us iterate faster when refining prompts.
ℹ️ While this simple approach is great for a quick overview, you might want to swap it out for more specialized libraries if you want to explore the graph interactively.
Let's define a book analysis function:
def analyze_book(
    source: Source,
    model: Model | None = None,
    animated: bool = False,
) -> None:
    """Extract and display the character connections of a book.

    Uses the book analysis data schema and instructions.
    """
    extract_knowledge_graph(
        data_schema=BOOK_ANALYSIS_DATA_SCHEMA,
        instructions=BOOK_ANALYSIS_INSTRUCTIONS,
        source=source,
        model=model,
        domain="Character Connections",
        animated=animated,
    )
🧪 First, let's build a knowledge graph based on Zola's Thérèse Raquin:
analyze_book(Classic.fr_zola_thérèse_raquin)
Input data (fr_zola_thérèse_raquin)
----------------------- Request / gemini-3.1-flash-lite ------------------------
Input tokens : 119,518
Cached tokens : 118,753
Output tokens : 357
----------------------------- Knowledge Graph Info -----------------------------
Entities : 11
Relationships : 28
--------------------------------------------------------------------------------
💡 We can extract and understand the book's narrative in seconds: The love triangle between Thérèse, Camille, and Laurent clearly stands out. Despite being over 200 pages long (and 100k+ tokens), this novel is incredibly minimalistic, revolving around a limited set of characters, which is reflected in the network graph. Adding locations to the extracted entities would also reveal the claustrophobic atmosphere of the book in the resulting knowledge graph.
ℹ️ We used the original French version with English instructions, which works seamlessly. If you translate the instructions, you can also generate dynamic relationship links in different languages (see the 100+ supported languages).
🧪 Let's see how the model handles the interconnected cast of Hugo's Les Misérables:
analyze_book(
Classic.en_hugo_les_misérables,
model=Model.GEMINI_3_5_FLASH,
)
Input data (en_hugo_les_misérables)
-------------------------- Request / gemini-3.5-flash --------------------------
Input tokens : 803,328
Output tokens : 2,908
----------------------------- Knowledge Graph Info -----------------------------
Entities : 65
Relationships : 225
--------------------------------------------------------------------------------
💡 We quickly get the gist of the novel.
ℹ️ Donald Knuth famously used Les Misérables as an example back in 1994 (see the Stanford GraphBase). It's been consistently used in Natural Language Processing (NLP) because of its massive scale, linguistic complexity, and multiple high-quality translations. At 800k+ tokens, this book is still a true stress test. Note that we used Gemini 3.5 Flash (instead of 3.1 Flash-Lite). Larger models can infer more and perform deeper consolidation.
🧪 Now, let's process just the first volume of Le Comte de Monte-Cristo (in French):
analyze_book(Classic.fr_dumas_comte_de_monte_cristo_1)
Input data (fr_dumas_comte_de_monte_cristo_1)
----------------------- Request / gemini-3.1-flash-lite ------------------------
Input tokens : 215,582
Output tokens : 713
----------------------------- Knowledge Graph Info -----------------------------
Entities : 36
Relationships : 36
--------------------------------------------------------------------------------
💡 The graph shows the initial setup of Edmond Dantès' world, his friends, and his betrayers. The love triangle between Edmond, Mercédès, and Fernand also comes to light.
ℹ️ Note that the antagonist is referred to only as Fernand, which is expected. We learn his full name, Fernand Mondego, only in volume three. In the prompt, we asked the model to "extract" entities and determine names "from the input data" to avoid generating memorized knowledge. To detect regressions when updating the prompt, we can use this as a unit test to ensure that our response actually provides extracted data and not memorized information.
🧪 And finally, let's analyze the entire saga of Le Comte de Monte-Cristo, all four volumes at once (800k+ tokens):
analyze_book(
Collection.fr_dumas_comte_de_monte_cristo,
model=Model.GEMINI_3_5_FLASH,
animated=True,
)
Input data (fr_dumas_comte_de_monte_cristo_1, fr_dumas_comte_de_monte_cristo_2, fr_dumas_comte_de_monte_cristo_3, fr_dumas_comte_de_monte_cristo_4)
-------------------------- Request / gemini-3.5-flash --------------------------
Input tokens : 840,385
Cached tokens : 835,551
Output tokens : 2,659
----------------------------- Knowledge Graph Info -----------------------------
Entities : 50
Relationships : 212
--------------------------------------------------------------------------------
💡 The multiple aliases of Edmond Dantès (and other characters) are nicely extracted. The complex plot of this book is built on characters juggling multiple identities in a psychological chess match.
🎉 This is an example of how we can complete the challenge in a single request, using the fewest tokens possible and zero thinking tokens, while providing multiple levels of consolidation.
ℹ️ Note that this is a proof of concept. For an exhaustive extraction in a fully professional solution, we would probably set up a multi-step workflow and consider the following:
location
, date
…), and then save them to a database.evidence
or excerpt
to serve as direct proof, or chapter
or page
for source attribution). In this step, we could also dig deeper and differentiate literal/figurative relationships (e.g., biological vs. figurative parents).

With our current setup, generalizing to other types of content is as simple as adapting our data schema and instructions.
For example, legal agreements are among the densest types of documents. They're usually made up of many articles and clauses, with every sentence carrying specific legal weight, outlining obligations, or providing definitions. But what kind of knowledge graph do we want to build from a legal agreement?
🧪 Let's test a minimalistic, open-ended prompt:
source = Document.en_pharma_dev_agreement
model = Model.GEMINI_3_5_FLASH
# Open-ended data schema: entity labels are free-form (no fixed label set)
AGREEMENT_OPEN_DATA_SCHEMA = """
Entity:
- `id`: Unique integer identifier (0, 1, 2…).
- `name`: Name of the entity.
- `label`: Entity type.
Relationship:
- `source_id`: `id` of the subject entity.
- `link`: `snake_case` predicate.
- `target_id`: `id` of the object entity.
"""
# Minimal instructions: a single open-ended extraction request
AGREEMENT_OPEN_INSTRUCTIONS = """
Perform a comprehensive, highly-granular entity/relationship extraction.
"""
extract_knowledge_graph(
AGREEMENT_OPEN_DATA_SCHEMA,
AGREEMENT_OPEN_INSTRUCTIONS,
source,
model,
domain="Agreement High-Level Extraction",
)
Input data (en_pharma_dev_agreement)
-------------------------- Request / gemini-3.5-flash --------------------------
Input tokens : 51,276
Output tokens : 221
----------------------------- Knowledge Graph Info -----------------------------
Entities : 10
Relationships : 9
--------------------------------------------------------------------------------
💡 Remarks
🧪 Then, let's try semi-open instructions focusing on legal obligations:
class AgreementEntityLabel(StrEnum):
PARTY = auto()
PERSON = auto()
ROLE = auto()
LOCATION = auto()
JURISDICTION = auto()
FINANCIAL_AMOUNT = auto()
DATE = auto()
EVENT = auto()
ASSET = auto()
PRODUCT = auto()
INTELLECTUAL_PROPERTY = auto()
OBLIGATION_TYPE = auto()
# Experiment 2: semi-open extraction focused on legal obligations.
# The entity `label` is restricted to `AgreementEntityLabel` (interpolated through
# `pipe_delimited_union`, defined elsewhere), while the relationship `link` stays a
# free-form `snake_case` predicate.
AGREEMENT_SEMI_OPEN_DATA_SCHEMA = f"""
Entity:
- `id`: Unique integer identifier (0, 1, 2…).
- `name`: Name of the entity.
- `label`: {pipe_delimited_union(AgreementEntityLabel)}.
Relationship: Obligation, right, or transfer from a source entity to a target entity.
- `source_id`: The `id` of the subject entity.
- `link`: Specific `snake_case` predicate.
- `target_id`: The `id` of the object entity.
"""
# Instructions steer the model toward covenant language and ask it to split a complex
# clause into a primary obligation, its value, and its trigger.
# NOTE(review): the second bullet ends with "extract:", so the three bullets after it
# were presumably nested sub-items whose indentation is missing here — confirm.
AGREEMENT_SEMI_OPEN_INSTRUCTIONS = """
- Analyze the input data for every covenant (e.g., "shall", "will", "must", "is obligated to") and perform an exhaustive extraction.
- Make sure to deconstruct complex obligations: For complex clauses (e.g., "A shall pay B $X Million within Y days of the Effective Date"), extract:
- The primary obligation: `[A] is_obligated_to_pay [B]`
- The value: `[A's obligation] subject_to [$X Million]`
- The trigger: `[A's obligation] triggered_by [Effective Date]`
"""
# Reuses the module-level `source` and `model` assigned for the first agreement experiment.
extract_knowledge_graph(
AGREEMENT_SEMI_OPEN_DATA_SCHEMA,
AGREEMENT_SEMI_OPEN_INSTRUCTIONS,
source,
model,
domain="Agreement Obligations",
)
Input data (en_pharma_dev_agreement)
-------------------------- Request / gemini-3.5-flash --------------------------
Input tokens : 51,466
Output tokens : 2,002
----------------------------- Knowledge Graph Info -----------------------------
Entities : 85
Relationships : 93
--------------------------------------------------------------------------------
💡 Remarks
What if we don't care about legal obligations, but rather the document's architecture? Let's shift our focus to the structure itself and extract how sections, clauses, and defined terms are hierarchically organized…
🧪 And now, let's test closed instructions focusing on the document structure:
class AgreementStructureEntityLabel(StrEnum):
DEFINED_TERM = auto()
DOCUMENT_SECTION = auto()
DOCUMENT = auto()
class AgreementStructureRelationshipType(StrEnum):
DEFINED_IN = auto()
CONTAINS = auto()
# Experiment 3: extraction focused on the document's structure.
# Both the entity `label` and the relationship `link` are restricted to enums
# (`AgreementStructureEntityLabel`, `AgreementStructureRelationshipType`), each
# interpolated through `pipe_delimited_union` (defined elsewhere).
AGREEMENT_STRUCTURAL_DATA_SCHEMA = f"""
Entity:
- `id`: Unique integer identifier (0, 1, 2…).
- `name`: Name of the entity.
- `label`: {pipe_delimited_union(AgreementStructureEntityLabel)}.
Relationship: Connection from a source entity to a target entity.
- `source_id`: The `id` of the subject entity.
- `link`: {pipe_delimited_union(AgreementStructureRelationshipType)}.
- `target_id`: The `id` of the object entity.
"""
# NOTE(review): the constant is named `..._OPEN_INSTRUCTIONS`, yet the surrounding text
# introduces this experiment as "closed instructions" and the schema above restricts both
# labels and link types — consider renaming once all references in the file are known.
# NOTE(review): the second bullet ends with a colon, so the last bullet was presumably a
# nested sub-item whose indentation is missing here — confirm.
AGREEMENT_STRUCTURAL_OPEN_INSTRUCTIONS = """
- Extract every distinct entity that matches an allowed `label`.
- Extract every distinct relationship representing a structural connection (hierarchical organization) between these entities:
- You must be comprehensive and highly granular. If multiple distinct relationships exist between the same pair of entities, create a separate entry for each.
"""
# Reuses the module-level `source` and `model` assigned for the first agreement experiment.
extract_knowledge_graph(
AGREEMENT_STRUCTURAL_DATA_SCHEMA,
AGREEMENT_STRUCTURAL_OPEN_INSTRUCTIONS,
source,
model,
domain="Agreement Structure",
)
Input data (en_pharma_dev_agreement)
-------------------------- Request / gemini-3.5-flash --------------------------
Input tokens : 51,351
Output tokens : 9,108
----------------------------- Knowledge Graph Info -----------------------------
Entities : 314
Relationships : 513
--------------------------------------------------------------------------------
💡 If you extract hundreds of entities from a massive document, your graph will quickly turn into an unreadable hairball. For larger datasets, you'll want to export your nodes and edges to a dedicated graph database, which typically comes with its own visualization and exploration tools.
We successfully extracted data and built knowledge graphs from documents by following these steps:
These principles apply to many other data-extraction domains and will allow you to solve your own complex problems. Have fun and happy solving!
➕ More!