June 23, 2025
30 min read
Hierarchical Indices for RAG
RAG
NLP
AI
Hierarchical Retrieval
Summarization
Hierarchical Indices for RAG
In this notebook, I implement a hierarchical indexing approach for RAG systems. This technique improves retrieval by using a two-tier search method: first identifying relevant document sections through summaries, then retrieving specific details from those sections.
Traditional RAG approaches treat all text chunks equally, which can lead to:
- Lost context when chunks are too small
- Irrelevant results when the document collection is large
- Inefficient searches across the entire corpus
Hierarchical retrieval solves these problems by:
- Creating concise summaries for larger document sections
- First searching these summaries to identify relevant sections
- Then retrieving detailed information only from those sections
- Maintaining context while preserving specific details
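Before building anything, here is a minimal sketch of the two-tier search. The embed() helper, the two stores, and their methods are placeholders standing in for the real components implemented later in this notebook.
# Conceptual sketch only -- embed(), the stores, and similarity_search() are
# placeholders for the components built below.
def hierarchical_search(query, summary_store, chunk_store, k_summaries=3, k_chunks=5):
    # Tier 1: search page-level summaries to find the most promising sections
    top_summaries = summary_store.similarity_search(embed(query), k=k_summaries)
    relevant_pages = {s["metadata"]["page"] for s in top_summaries}

    # Tier 2: search detailed chunks, restricted to the pages found above
    return chunk_store.similarity_search(
        embed(query),
        k=k_chunks,
        filter_func=lambda meta: meta["page"] in relevant_pages,
    )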
Setting Up the Environment
We begin by importing the necessary libraries: PyMuPDF (fitz) for PDF parsing, NumPy for vector math, the OpenAI client for embeddings and chat completions, and pickle for caching the vector stores to disk.
import os
import numpy as np
import json
import fitz  # PyMuPDF, used for PDF text extraction
from openai import OpenAI
import re
import pickle
Setting Up the OpenAI API Client
We initialize an OpenAI-compatible client, pointed here at the Nebius AI Studio endpoint, to generate embeddings and chat responses.
client = OpenAI(
base_url="https://api.studio.nebius.com/v1/",
api_key=os.getenv("OPENAI_API_KEY")
)
Document Processing Functions
def extract_text_from_pdf(pdf_path):
print(f"Extracting text from {pdf_path}...")
pdf = fitz.open(pdf_path)
pages = []
for page_num in range(len(pdf)):
page = pdf[page_num]
text = page.get_text()
if len(text.strip()) > 50:
pages.append({
"text": text,
"metadata": {
"source": pdf_path,
"page": page_num + 1
}
})
print(f"Extracted {len(pages)} pages with content")
return pages
def chunk_text(text, metadata, chunk_size=1000, overlap=200):
    chunks = []
    # Slide a window of chunk_size characters, stepping by chunk_size - overlap
    for i in range(0, len(text), chunk_size - overlap):
        chunk = text[i:i + chunk_size]
        # Skip chunks that are empty or too short to be useful
        if chunk and len(chunk.strip()) > 50:
            chunk_metadata = metadata.copy()
            chunk_metadata.update({
                "chunk_index": len(chunks),
                "start_char": i,
                "end_char": i + len(chunk),
                "is_summary": False
            })
            chunks.append({
                "text": chunk,
                "metadata": chunk_metadata
            })
    return chunks
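Because the loop advances by chunk_size - overlap characters, consecutive chunks share a 200-character window by default. A quick sanity check on synthetic text (the metadata values here are made up) makes the stride visible:
# Demonstrate the chunking stride on synthetic text
sample_text = "x" * 2500
sample_chunks = chunk_text(sample_text, {"source": "demo.pdf", "page": 1})
for c in sample_chunks:
    print(c["metadata"]["chunk_index"], c["metadata"]["start_char"], c["metadata"]["end_char"])
# Chunks start at 0, 800, 1600, 2400: each overlaps the previous one by 200 characters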
Simple Vector Store Implementation
class SimpleVectorStore:
def __init__(self):
self.vectors = []
self.texts = []
self.metadata = []
def add_item(self, text, embedding, metadata=None):
self.vectors.append(np.array(embedding))
self.texts.append(text)
self.metadata.append(metadata or {})
def similarity_search(self, query_embedding, k=5, filter_func=None):
if not self.vectors:
return []
query_vector = np.array(query_embedding)
similarities = []
for i, vector in enumerate(self.vectors):
if filter_func and not filter_func(self.metadata[i]):
continue
similarity = np.dot(query_vector, vector) / (np.linalg.norm(query_vector) * np.linalg.norm(vector))
similarities.append((i, similarity))
similarities.sort(key=lambda x: x[1], reverse=True)
results = []
for i in range(min(k, len(similarities))):
idx, score = similarities[i]
results.append({
"text": self.texts[idx],
"metadata": self.metadata[idx],
"similarity": float(score)
})
return results
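The filter_func hook is what later lets the hierarchical retriever restrict the detailed search to specific pages. A toy example with hand-written 3-dimensional vectors (real embeddings have far more dimensions):
# Toy demonstration of metadata filtering (vectors are hand-written, not real embeddings)
demo_store = SimpleVectorStore()
demo_store.add_item("chunk from page 1", [1.0, 0.0, 0.0], {"page": 1})
demo_store.add_item("chunk from page 2", [0.9, 0.1, 0.0], {"page": 2})
demo_store.add_item("chunk from page 3", [0.0, 1.0, 0.0], {"page": 3})

# Only consider chunks whose page is in the allowed set
hits = demo_store.similarity_search(
    [1.0, 0.0, 0.0],
    k=2,
    filter_func=lambda meta: meta["page"] in {2, 3},
)
print([(h["text"], round(h["similarity"], 3)) for h in hits])
# The page-1 chunk is excluded even though it is the closest match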
Creating Embeddings
def create_embeddings(texts, model="BAAI/bge-en-icl"):
if not texts:
return []
batch_size = 100
all_embeddings = []
for i in range(0, len(texts), batch_size):
batch = texts[i:i + batch_size]
response = client.embeddings.create(
model=model,
input=batch
)
batch_embeddings = [item.embedding for item in response.data]
all_embeddings.extend(batch_embeddings)
return all_embeddings
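Note that create_embeddings expects a list of texts and batches over it; a single query should therefore be wrapped in a one-element list (a bare string would be sliced character by character), which is how the retrieval functions below call it:
# Embedding a single query: wrap it in a list and take the first (and only) vector
query_embedding = create_embeddings(["What is self-attention?"])[0]
print(len(query_embedding))  # dimensionality of the embedding vector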
Summarization Function
def generate_page_summary(page_text):
system_prompt = """You are an expert summarization system.
Create a detailed summary of the provided text.
Focus on capturing the main topics, key information, and important facts.
Your summary should be comprehensive enough to understand what the page contains
but more concise than the original."""
    # Truncate very long pages; this is a rough character-based cap, not a true token count
    max_chars = 6000
    truncated_text = page_text[:max_chars] if len(page_text) > max_chars else page_text
response = client.chat.completions.create(
model="meta-llama/Llama-3.2-3B-Instruct",
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": f"Please summarize this text:\n\n{truncated_text}"}
],
temperature=0.3
)
return response.choices[0].message.content
Hierarchical Document Processing
def process_document_hierarchically(pdf_path, chunk_size=1000, chunk_overlap=200):
pages = extract_text_from_pdf(pdf_path)
print("Generating page summaries...")
summaries = []
for i, page in enumerate(pages):
print(f"Summarizing page {i+1}/{len(pages)}...")
summary_text = generate_page_summary(page["text"])
summary_metadata = page["metadata"].copy()
summary_metadata.update({"is_summary": True})
summaries.append({
"text": summary_text,
"metadata": summary_metadata
})
detailed_chunks = []
for page in pages:
page_chunks = chunk_text(
page["text"],
page["metadata"],
chunk_size,
chunk_overlap
)
detailed_chunks.extend(page_chunks)
print(f"Created {len(detailed_chunks)} detailed chunks")
print("Creating embeddings for summaries...")
summary_texts = [summary["text"] for summary in summaries]
summary_embeddings = create_embeddings(summary_texts)
print("Creating embeddings for detailed chunks...")
chunk_texts = [chunk["text"] for chunk in detailed_chunks]
chunk_embeddings = create_embeddings(chunk_texts)
summary_store = SimpleVectorStore()
detailed_store = SimpleVectorStore()
for i, summary in enumerate(summaries):
summary_store.add_item(
text=summary["text"],
embedding=summary_embeddings[i],
metadata=summary["metadata"]
)
for i, chunk in enumerate(detailed_chunks):
detailed_store.add_item(
text=chunk["text"],
embedding=chunk_embeddings[i],
metadata=chunk["metadata"]
)
print(f"Created vector stores with {len(summaries)} summaries and {len(detailed_chunks)} chunks")
return summary_store, detailed_store
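A usage sketch for building both tiers directly, without the pickle caching that hierarchical_rag adds below; the path matches the sample document used at the end of this notebook:
# One-off index build; hierarchical_rag() below wraps this with on-disk caching
summary_store, detailed_store = process_document_hierarchically("data/AI_Information.pdf")
print(len(summary_store.texts), "summaries,", len(detailed_store.texts), "detailed chunks")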
Hierarchical Retrieval
def retrieve_hierarchically(query, summary_store, detailed_store, k_summaries=3, k_chunks=5):
print(f"Performing hierarchical retrieval for query: {query}")
    # Embed the query (wrapped in a list so create_embeddings batches over texts, not characters)
    query_embedding = create_embeddings([query])[0]
summary_results = summary_store.similarity_search(
query_embedding,
k=k_summaries
)
print(f"Retrieved {len(summary_results)} relevant summaries")
relevant_pages = [result["metadata"]["page"] for result in summary_results]
def page_filter(metadata):
return metadata["page"] in relevant_pages
detailed_results = detailed_store.similarity_search(
query_embedding,
k=k_chunks * len(relevant_pages),
filter_func=page_filter
)
print(f"Retrieved {len(detailed_results)} detailed chunks from relevant pages")
for result in detailed_results:
page = result["metadata"]["page"]
matching_summaries = [s for s in summary_results if s["metadata"]["page"] == page]
if matching_summaries:
result["summary"] = matching_summaries[0]["text"]
return detailed_results
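Each returned chunk carries its page number, similarity score, and (when available) the summary of its page, so you can check where an answer is grounded. A brief usage sketch, assuming the stores built by process_document_hierarchically above:
# Inspect which pages the retrieved chunks come from
results = retrieve_hierarchically(
    "What are the key applications of transformer models?",
    summary_store,
    detailed_store,
)
for r in results[:3]:
    print(f"Page {r['metadata']['page']} (similarity {r['similarity']:.3f})")
    print("Section summary:", r.get("summary", "")[:150], "...")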
Response Generation with Context
def generate_response(query, retrieved_chunks):
context_parts = []
for i, chunk in enumerate(retrieved_chunks):
page_num = chunk["metadata"]["page"]
context_parts.append(f"[Page {page_num}]: {chunk['text']}")
combined_context = "\n\n".join(context_parts)
max_context = 14000
if len(combined_context) > max_context:
combined_context = combined_context[:max_context] + "... [truncated]"
system_message = """You are a helpful AI assistant. Answer the user's question based on the provided context.
Use the information from the context to answer the user's question accurately.
If the context doesn't contain relevant information, acknowledge that.
Include page numbers when referencing specific information."""
response = client.chat.completions.create(
model="meta-llama/Llama-3.2-3B-Instruct",
messages=[
{"role": "system", "content": system_message},
{"role": "user", "content": f"Context:\n\n{combined_context}\n\nQuestion: {query}"}
],
temperature=0.2
)
return response.choices[0].message.content
Complete RAG Pipeline with Hierarchical Retrieval
def hierarchical_rag(query, pdf_path, chunk_size=1000, chunk_overlap=200, k_summaries=3, k_chunks=5, regenerate=False):
summary_store_file = f"{os.path.basename(pdf_path)}_summary_store.pkl"
detailed_store_file = f"{os.path.basename(pdf_path)}_detailed_store.pkl"
if regenerate or not os.path.exists(summary_store_file) or not os.path.exists(detailed_store_file):
print("Processing document and creating vector stores...")
summary_store, detailed_store = process_document_hierarchically(
pdf_path, chunk_size, chunk_overlap
)
with open(summary_store_file, 'wb') as f:
pickle.dump(summary_store, f)
with open(detailed_store_file, 'wb') as f:
pickle.dump(detailed_store, f)
else:
print("Loading existing vector stores...")
with open(summary_store_file, 'rb') as f:
summary_store = pickle.load(f)
with open(detailed_store_file, 'rb') as f:
detailed_store = pickle.load(f)
retrieved_chunks = retrieve_hierarchically(
query, summary_store, detailed_store, k_summaries, k_chunks
)
response = generate_response(query, retrieved_chunks)
return {
"query": query,
"response": response,
"retrieved_chunks": retrieved_chunks,
"summary_count": len(summary_store.texts),
"detailed_count": len(detailed_store.texts)
}
Standard (Non-Hierarchical) RAG for Comparison
def standard_rag(query, pdf_path, chunk_size=1000, chunk_overlap=200, k=15):
pages = extract_text_from_pdf(pdf_path)
chunks = []
for page in pages:
page_chunks = chunk_text(
page["text"],
page["metadata"],
chunk_size,
chunk_overlap
)
chunks.extend(page_chunks)
print(f"Created {len(chunks)} chunks for standard RAG")
store = SimpleVectorStore()
print("Creating embeddings for chunks...")
texts = [chunk["text"] for chunk in chunks]
embeddings = create_embeddings(texts)
for i, chunk in enumerate(chunks):
store.add_item(
text=chunk["text"],
embedding=embeddings[i],
metadata=chunk["metadata"]
)
    # Embed the query (wrapped in a list so create_embeddings batches over texts, not characters)
    query_embedding = create_embeddings([query])[0]
retrieved_chunks = store.similarity_search(query_embedding, k=k)
print(f"Retrieved {len(retrieved_chunks)} chunks with standard RAG")
response = generate_response(query, retrieved_chunks)
return {
"query": query,
"response": response,
"retrieved_chunks": retrieved_chunks
}
Evaluation Functions
def compare_approaches(query, pdf_path, reference_answer=None):
print(f"\n=== Comparing RAG approaches for query: {query} ===")
print("\nRunning hierarchical RAG...")
hierarchical_result = hierarchical_rag(query, pdf_path)
hier_response = hierarchical_result["response"]
print("\nRunning standard RAG...")
standard_result = standard_rag(query, pdf_path)
std_response = standard_result["response"]
comparison = compare_responses(query, hier_response, std_response, reference_answer)
return {
"query": query,
"hierarchical_response": hier_response,
"standard_response": std_response,
"reference_answer": reference_answer,
"comparison": comparison,
"hierarchical_chunks_count": len(hierarchical_result["retrieved_chunks"]),
"standard_chunks_count": len(standard_result["retrieved_chunks"])
}
def compare_responses(query, hierarchical_response, standard_response, reference=None):
system_prompt = """You are an expert evaluator of information retrieval systems.
Compare the two responses to the same query, one generated using hierarchical retrieval
and the other using standard retrieval.
Evaluate them based on:
1. Accuracy: Which response provides more factually correct information?
2. Comprehensiveness: Which response better covers all aspects of the query?
3. Coherence: Which response has better logical flow and organization?
4. Page References: Does either response make better use of page references?
Be specific in your analysis of the strengths and weaknesses of each approach."""
user_prompt = f"""Query: {query}
Response from Hierarchical RAG:
{hierarchical_response}
Response from Standard RAG:
{standard_response}"""
if reference:
user_prompt += f"""
Reference Answer:
{reference}"""
user_prompt += """
Please provide a detailed comparison of these two responses, highlighting which approach performed better and why."""
response = client.chat.completions.create(
model="meta-llama/Llama-3.2-3B-Instruct",
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_prompt}
],
temperature=0
)
return response.choices[0].message.content
def run_evaluation(pdf_path, test_queries, reference_answers=None):
results = []
for i, query in enumerate(test_queries):
print(f"Query: {query}")
reference = None
if reference_answers and i < len(reference_answers):
reference = reference_answers[i]
result = compare_approaches(query, pdf_path, reference)
results.append(result)
print("\n=== Proposition-Based Response ===")
print(result["hierarchical_response"])
print("\n=== Chunk-Based Response ===")
print(result["standard_response"])
print("\n=== Evaluation ===")
print(result["comparison"])
overall_analysis = generate_overall_analysis(results)
return {
"results": results,
"overall_analysis": overall_analysis
}
def generate_overall_analysis(results):
system_prompt = """You are an expert at evaluating information retrieval systems.
Based on multiple test queries, provide an overall analysis comparing hierarchical RAG
with standard RAG.
Focus on:
1. When hierarchical retrieval performs better and why
2. When standard retrieval performs better and why
3. The overall strengths and weaknesses of each approach
4. Recommendations for when to use each approach"""
evaluations_summary = ""
for i, result in enumerate(results):
evaluations_summary += f"Query {i+1}: {result['query']}\n"
evaluations_summary += f"Hierarchical chunks: {result['hierarchical_chunks_count']}, Standard chunks: {result['standard_chunks_count']}\n"
evaluations_summary += f"Comparison summary: {result['comparison'][:200]}...\n\n"
user_prompt = f"""Based on the following evaluations comparing hierarchical vs standard RAG across {len(results)} queries, provide an overall analysis of these two approaches:
{evaluations_summary}
Please provide a comprehensive analysis of the relative strengths and weaknesses of hierarchical RAG compared to standard RAG, with specific focus on retrieval quality and response generation."""
response = client.chat.completions.create(
model="meta-llama/Llama-3.2-3B-Instruct",
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_prompt}
],
temperature=0
)
return response.choices[0].message.content
Evaluation of Hierarchical RAG
pdf_path = "data/AI_Information.pdf"
query = "What are the key applications of transformer models in natural language processing?"
result = hierarchical_rag(query, pdf_path)
print("\n=== Response ===")
print(result["response"])
test_queries = [
"How do transformers handle sequential data compared to RNNs?"
]
reference_answers = [
"Transformers handle sequential data differently from RNNs by using self-attention mechanisms instead of recurrent connections. This allows transformers to process all tokens in parallel rather than sequentially, capturing long-range dependencies more efficiently and enabling better parallelization during training. Unlike RNNs, transformers don't suffer from vanishing gradient problems with long sequences."
]
evaluation_results = run_evaluation(
pdf_path=pdf_path,
test_queries=test_queries,
reference_answers=reference_answers
)
print("\n=== OVERALL ANALYSIS ===")
print(evaluation_results["overall_analysis"])