Document Augmentation RAG with Question Generation
This notebook implements an enhanced RAG approach using document augmentation through question generation. By generating relevant questions for each text chunk, we improve the retrieval process, leading to better responses from the language model.
In this implementation, we follow these steps:
- Data Ingestion: Extract text from a PDF file.
- Chunking: Split the text into manageable chunks.
- Question Generation: Generate relevant questions for each chunk.
- Embedding Creation: Create embeddings for both chunks and generated questions.
- Vector Store Creation: Build a simple vector store using NumPy.
- Semantic Search: Retrieve relevant chunks and questions for user queries.
- Response Generation: Generate answers based on retrieved content.
- Evaluation: Assess the quality of the generated responses.
Setting Up the Environment
We begin by importing necessary libraries.
import fitz  # PyMuPDF
import os
import numpy as np
import json
from openai import OpenAI
import re
from tqdm import tqdm
Extracting Text from a PDF File
To implement RAG, we first need a source of textual data. In this case, we extract text from a PDF file using the PyMuPDF library.
def extract_text_from_pdf(pdf_path):
    """Extract all text from a PDF file using PyMuPDF."""
    mypdf = fitz.open(pdf_path)
    all_text = ""

    # Iterate over every page and append its plain text
    for page_num in range(mypdf.page_count):
        page = mypdf[page_num]
        text = page.get_text("text")
        all_text += text

    return all_text
Chunking the Extracted Text
Once we have the extracted text, we divide it into smaller, overlapping chunks to improve retrieval accuracy.
def chunk_text(text, n, overlap):
    """Split text into chunks of n characters, with consecutive chunks sharing `overlap` characters."""
    chunks = []

    # Step through the text in strides of (n - overlap) so each chunk overlaps the previous one
    for i in range(0, len(text), n - overlap):
        chunks.append(text[i:i + n])

    return chunks
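To see how the overlap behaves, here is a small, purely illustrative check on a toy string (the values are made up for demonstration and are not part of the pipeline):

# Illustrative only: chunks of 10 characters that share 4 characters with their neighbor
sample = "abcdefghijklmnopqrstuvwxyz"
print(chunk_text(sample, n=10, overlap=4))
# ['abcdefghij', 'ghijklmnop', 'mnopqrstuv', 'stuvwxyz', 'yz']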
Setting Up the OpenAI API Client
We initialize the OpenAI client to generate embeddings and responses.
# Nebius AI Studio exposes an OpenAI-compatible endpoint, so the standard OpenAI client works;
# the API key is read from the OPENAI_API_KEY environment variable
client = OpenAI(
    base_url="https://api.studio.nebius.com/v1/",
    api_key=os.getenv("OPENAI_API_KEY")
)
Generating Questions for Text Chunks
This is the key enhancement over simple RAG. For each text chunk we generate questions that the chunk can answer, so a user query can later match either the chunk itself or one of its questions.
def generate_questions(text_chunk, num_questions=5, model="meta-llama/Llama-3.2-3B-Instruct"):
    """Generate questions that can be answered using only the given text chunk."""
    system_prompt = "You are an expert at generating relevant questions from text. Create concise questions that can be answered using only the provided text. Focus on key information and concepts."

    user_prompt = f"""
    Based on the following text, generate {num_questions} different questions that can be answered using only this text:

    {text_chunk}

    Format your response as a numbered list of questions only, with no additional text.
    """

    response = client.chat.completions.create(
        model=model,
        temperature=0.7,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]
    )

    # Parse the numbered list: strip the "1. " prefix and keep only lines that look like questions
    questions_text = response.choices[0].message.content.strip()
    questions = []
    for line in questions_text.split('\n'):
        cleaned_line = re.sub(r'^\d+\.\s*', '', line.strip())
        if cleaned_line and cleaned_line.endswith('?'):
            questions.append(cleaned_line)

    return questions
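The parsing at the end is deliberately defensive: the model is asked for a plain numbered list, but the regex also copes with stray preamble lines. A small offline illustration of just that cleanup step (the simulated reply below is made up; no API call is involved):

# Illustrative only: simulate a model reply and apply the same cleanup logic
sample_reply = """1. What is document augmentation?
2. How does question generation improve retrieval?
Here are the questions you asked for:
3. What embedding model is used?"""

parsed = []
for line in sample_reply.split('\n'):
    cleaned = re.sub(r'^\d+\.\s*', '', line.strip())
    if cleaned and cleaned.endswith('?'):
        parsed.append(cleaned)
print(parsed)
# ['What is document augmentation?', 'How does question generation improve retrieval?', 'What embedding model is used?']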
Creating Embeddings for Text
We generate embeddings for both text chunks and generated questions.
def create_embeddings(text, model="BAAI/bge-en-icl"):
    """Create embeddings for the given text (a single string or a list of strings)."""
    response = client.embeddings.create(
        model=model,
        input=text
    )
    return response
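The function returns the full API response; downstream code pulls the vector out of response.data[0].embedding. A minimal sketch of that access pattern, assuming the client above is configured and the embedding model is reachable:

# Illustrative only: embed a short string and inspect the resulting vector
emb_response = create_embeddings("Retrieval-augmented generation combines search with LLMs.")
vector = emb_response.data[0].embedding   # a plain Python list of floats
print(len(vector))                        # dimensionality depends on the embedding model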
Building a Simple Vector Store
We'll implement a simple vector store using NumPy.
class SimpleVectorStore:
    """A minimal in-memory vector store backed by NumPy arrays."""

    def __init__(self):
        self.vectors = []
        self.texts = []
        self.metadata = []

    def add_item(self, text, embedding, metadata=None):
        """Store a text along with its embedding and optional metadata."""
        self.vectors.append(np.array(embedding))
        self.texts.append(text)
        self.metadata.append(metadata or {})

    def similarity_search(self, query_embedding, k=5):
        """Return the k stored items most similar to the query embedding (cosine similarity)."""
        if not self.vectors:
            return []

        query_vector = np.array(query_embedding)

        # Compute cosine similarity between the query and every stored vector
        similarities = []
        for i, vector in enumerate(self.vectors):
            similarity = np.dot(query_vector, vector) / (np.linalg.norm(query_vector) * np.linalg.norm(vector))
            similarities.append((i, similarity))

        # Sort by similarity (highest first) and keep the top k results
        similarities.sort(key=lambda x: x[1], reverse=True)

        results = []
        for i in range(min(k, len(similarities))):
            idx, score = similarities[i]
            results.append({
                "text": self.texts[idx],
                "metadata": self.metadata[idx],
                "similarity": score
            })
        return results
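A quick, self-contained check of the store with hand-made 2-D vectors (purely illustrative; the real pipeline uses the embedding model above):

# Illustrative only: three toy 2-D vectors
store = SimpleVectorStore()
store.add_item("cats", [1.0, 0.0], {"type": "chunk"})
store.add_item("dogs", [0.9, 0.1], {"type": "chunk"})
store.add_item("stocks", [0.0, 1.0], {"type": "chunk"})

for hit in store.similarity_search([1.0, 0.05], k=2):
    print(hit["text"], round(hit["similarity"], 3))
# "cats" comes first, "dogs" second; "stocks" is not returned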
Processing Documents with Question Augmentation
Now we'll put everything together to process documents, generate questions, and build our augmented vector store.
def process_document(pdf_path, chunk_size=1000, chunk_overlap=200, questions_per_chunk=5):
    """Extract, chunk, and embed a PDF, augmenting each chunk with generated questions."""
    print("Extracting text from PDF...")
    extracted_text = extract_text_from_pdf(pdf_path)

    print("Chunking text...")
    text_chunks = chunk_text(extracted_text, chunk_size, chunk_overlap)
    print(f"Created {len(text_chunks)} text chunks")

    vector_store = SimpleVectorStore()

    print("Processing chunks and generating questions...")
    for i, chunk in enumerate(tqdm(text_chunks, desc="Processing Chunks")):
        # Embed the chunk itself and store it
        chunk_embedding_response = create_embeddings(chunk)
        chunk_embedding = chunk_embedding_response.data[0].embedding
        vector_store.add_item(
            text=chunk,
            embedding=chunk_embedding,
            metadata={"type": "chunk", "index": i}
        )

        # Generate questions for the chunk and store each one with a reference back to its source chunk
        questions = generate_questions(chunk, num_questions=questions_per_chunk)
        for j, question in enumerate(questions):
            question_embedding_response = create_embeddings(question)
            question_embedding = question_embedding_response.data[0].embedding
            vector_store.add_item(
                text=question,
                embedding=question_embedding,
                metadata={"type": "question", "chunk_index": i, "original_chunk": chunk}
            )

    return text_chunks, vector_store
Extracting and Processing the Document
pdf_path = "data/AI_Information.pdf"
text_chunks, vector_store = process_document(
    pdf_path,
    chunk_size=1000,
    chunk_overlap=200,
    questions_per_chunk=3
)
print(f"Vector store contains {len(vector_store.texts)} items")
Performing Semantic Search
We implement a semantic search function similar to the one in simple RAG, but adapted to our augmented vector store, which now contains both document chunks and generated questions.
def semantic_search(query, vector_store, k=5):
    """Embed the query and return the k most similar items from the vector store."""
    query_embedding_response = create_embeddings(query)
    query_embedding = query_embedding_response.data[0].embedding
    results = vector_store.similarity_search(query_embedding, k=k)
    return results
Running a Query on the Augmented Vector Store
# Load the validation data and take the first question as our query
with open('data/val.json') as f:
    data = json.load(f)

query = data[0]['question']

# Retrieve the most similar items (chunks and generated questions) from the augmented store
search_results = semantic_search(query, vector_store, k=5)

print("Query:", query)
print("\nSearch Results:")

# Separate direct chunk matches from matches on generated questions
chunk_results = []
question_results = []
for result in search_results:
    if result["metadata"]["type"] == "chunk":
        chunk_results.append(result)
    else:
        question_results.append(result)

print("\nRelevant Document Chunks:")
for i, result in enumerate(chunk_results):
    print(f"Context {i + 1} (similarity: {result['similarity']:.4f}):")
    print(result["text"][:300] + "...")
    print("=====================================")

print("\nMatched Questions:")
for i, result in enumerate(question_results):
    print(f"Question {i + 1} (similarity: {result['similarity']:.4f}):")
    print(result["text"])
    chunk_idx = result["metadata"]["chunk_index"]
    print(f"From chunk {chunk_idx}")
    print("=====================================")
Generating Context for Response
Now we prepare the context by combining information from relevant chunks and questions.
def prepare_context(search_results):
    """Combine retrieved chunks, and the source chunks of matched questions, into one context string."""
    chunk_indices = set()
    context_chunks = []

    # First, add chunks that were retrieved directly
    for result in search_results:
        if result["metadata"]["type"] == "chunk":
            chunk_indices.add(result["metadata"]["index"])
            context_chunks.append(f"Chunk {result['metadata']['index']}:\n{result['text']}")

    # Then add the source chunks of matched questions, skipping chunks we already have
    for result in search_results:
        if result["metadata"]["type"] == "question":
            chunk_idx = result["metadata"]["chunk_index"]
            if chunk_idx not in chunk_indices:
                chunk_indices.add(chunk_idx)
                context_chunks.append(f"Chunk {chunk_idx} (referenced by question '{result['text']}'):\n{result['metadata']['original_chunk']}")

    full_context = "\n\n".join(context_chunks)
    return full_context
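To see the deduplication at work without calling the API, here is a purely illustrative run with hand-built search results (the texts are made up, but the metadata fields mirror those set in process_document):

# Illustrative only: one direct chunk match plus two question matches, one pointing at the same chunk
mock_results = [
    {"text": "Chunk text about embeddings.", "metadata": {"type": "chunk", "index": 2}, "similarity": 0.91},
    {"text": "What are embeddings?", "metadata": {"type": "question", "chunk_index": 2, "original_chunk": "Chunk text about embeddings."}, "similarity": 0.89},
    {"text": "How is the vector store built?", "metadata": {"type": "question", "chunk_index": 5, "original_chunk": "Chunk text about the vector store."}, "similarity": 0.84},
]
print(prepare_context(mock_results))
# Chunk 2 appears only once; chunk 5 is pulled in via its matched question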
Generating a Response Based on Retrieved Chunks
def generate_response(query, context, model="meta-llama/Llama-3.2-3B-Instruct"):
    """Answer the query using only the provided context."""
    system_prompt = "You are an AI assistant that strictly answers based on the given context. If the answer cannot be derived directly from the provided context, respond with: 'I do not have enough information to answer that.'"

    user_prompt = f"""
    Context:
    {context}

    Question: {query}

    Please answer the question based only on the context provided above. Be concise and accurate.
    """

    # Temperature 0 keeps the answer deterministic and grounded in the context
    response = client.chat.completions.create(
        model=model,
        temperature=0,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]
    )
    return response.choices[0].message.content
Generating and Displaying the Response
context = prepare_context(search_results)
response_text = generate_response(query, context)
print("\nQuery:", query)
print("\nResponse:")
print(response_text)
Evaluating the AI Response
We compare the AI response with the expected answer and assign a score.
def evaluate_response(query, response, reference_answer, model="meta-llama/Llama-3.2-3B-Instruct"):
    """Score the AI response against a reference answer using an LLM judge."""
    evaluate_system_prompt = """You are an intelligent evaluation system tasked with assessing AI responses.

    Compare the AI assistant's response to the true/reference answer, and evaluate based on:
    1. Factual correctness - Does the response contain accurate information?
    2. Completeness - Does it cover all important aspects from the reference?
    3. Relevance - Does it directly address the question?

    Assign a score from 0 to 1:
    - 1.0: Perfect match in content and meaning
    - 0.8: Very good, with minor omissions/differences
    - 0.6: Good, covers main points but misses some details
    - 0.4: Partial answer with significant omissions
    - 0.2: Minimal relevant information
    - 0.0: Incorrect or irrelevant

    Provide your score with justification.
    """

    evaluation_prompt = f"""
    User Query: {query}

    AI Response:
    {response}

    Reference Answer:
    {reference_answer}

    Please evaluate the AI response against the reference answer.
    """

    eval_response = client.chat.completions.create(
        model=model,
        temperature=0,
        messages=[
            {"role": "system", "content": evaluate_system_prompt},
            {"role": "user", "content": evaluation_prompt}
        ]
    )
    return eval_response.choices[0].message.content
reference_answer = data[0]['ideal_answer']
evaluation = evaluate_response(query, response_text, reference_answer)
print("\nEvaluation:")
print(evaluation)
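The evaluation comes back as free text. If a numeric score is needed downstream, for example to average results over the whole validation set, a simple (and admittedly naive) regex can pull out the first 0-1 value; this is an optional extra, not part of the original flow:

# Optional, illustrative helper: extract the first 0-1 score from the evaluator's free-text reply
def extract_score(evaluation_text):
    match = re.search(r'\b(?:0(?:\.\d+)?|1(?:\.0+)?)\b', evaluation_text)
    return float(match.group()) if match else None

print(extract_score("Score: 0.8 - very good, with minor omissions."))  # 0.8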