Oreoluwa
Self-RAG: A Dynamic Approach to RAG
June 29, 2025
35 min read

RAG
NLP
AI
Self-RAG
Dynamic Retrieval


In this notebook, I implement Self-RAG, an advanced RAG system that dynamically decides when and how to use retrieved information. Unlike traditional RAG approaches, Self-RAG introduces reflection points throughout the retrieval and generation process, resulting in higher-quality, more reliable responses.

Key Components of Self-RAG

  1. Retrieval Decision: Determines if retrieval is even necessary for a given query
  2. Document Retrieval: Fetches potentially relevant documents when needed
  3. Relevance Evaluation: Assesses how relevant each retrieved document is
  4. Response Generation: Creates responses based on relevant contexts
  5. Support Assessment: Evaluates if responses are properly grounded in the context
  6. Utility Evaluation: Rates the overall usefulness of generated responses

Setting Up the Environment

We begin by importing necessary libraries.

import os
import numpy as np
import json
import fitz  # PyMuPDF
from openai import OpenAI
import re

Extracting Text from a PDF File

To implement RAG, we first need a source of textual data. In this case, we extract text from a PDF file using the PyMuPDF library.

def extract_text_from_pdf(pdf_path):
    mypdf = fitz.open(pdf_path)
    all_text = ""
    for page_num in range(mypdf.page_count):
        page = mypdf[page_num]
        text = page.get_text("text")
        all_text += text
    return all_text
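
As a quick sanity check, we can run the extractor on the PDF used later in this notebook (data/AI_Information.pdf) and preview what came back; this is a sketch and assumes the file exists locally.

# Sanity check: extract the sample PDF and preview the first 300 characters
pdf_text = extract_text_from_pdf("data/AI_Information.pdf")
print(f"Extracted {len(pdf_text)} characters")
print(pdf_text[:300])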

Chunking the Extracted Text

Once we have the extracted text, we divide it into smaller, overlapping chunks to improve retrieval accuracy.

def chunk_text(text, n, overlap):
    chunks = []
    for i in range(0, len(text), n - overlap):
        chunks.append(text[i:i + n])
    return chunks
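
For intuition, here is a tiny illustrative example with made-up values: each chunk is 10 characters long and repeats the last 4 characters of the previous chunk (the short tail chunks are a side effect of this simple slicing).

# Illustrative only: 10-character chunks with 4 characters of overlap
sample = "abcdefghijklmnopqrstuvwxyz"
for chunk in chunk_text(sample, n=10, overlap=4):
    print(chunk)
# abcdefghij
# ghijklmnop
# mnopqrstuv
# stuvwxyz
# yz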

Setting Up the OpenAI API Client

We initialize an OpenAI-compatible client, pointed at the Nebius AI Studio endpoint, to generate embeddings and chat completions. The API key is read from the OPENAI_API_KEY environment variable.

client = OpenAI(
    base_url="https://api.studio.nebius.com/v1/",
    api_key=os.getenv("OPENAI_API_KEY")
)

Simple Vector Store Implementation

We'll create a basic in-memory vector store that holds document chunks, their embeddings, and optional metadata, and ranks search results by cosine similarity.

class SimpleVectorStore:
    def __init__(self):
        self.vectors = []
        self.texts = []
        self.metadata = []
    
    def add_item(self, text, embedding, metadata=None):
        self.vectors.append(np.array(embedding))
        self.texts.append(text)
        self.metadata.append(metadata or {})
    
    def similarity_search(self, query_embedding, k=5, filter_func=None):
        if not self.vectors:
            return []
        query_vector = np.array(query_embedding)
        similarities = []
        for i, vector in enumerate(self.vectors):
            if filter_func and not filter_func(self.metadata[i]):
                continue
            similarity = np.dot(query_vector, vector) / (np.linalg.norm(query_vector) * np.linalg.norm(vector))  # cosine similarity
            similarities.append((i, similarity))
        similarities.sort(key=lambda x: x[1], reverse=True)
        results = []
        for i in range(min(k, len(similarities))):
            idx, score = similarities[i]
            results.append({
                "text": self.texts[idx],
                "metadata": self.metadata[idx],
                "similarity": score
            })
        return results
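
To make the behaviour concrete, here is a toy example with hand-made 3-dimensional vectors; real embeddings returned by the API have far more dimensions, so treat this purely as an illustration of the interface.

# Toy example with made-up 3-dimensional "embeddings"
toy_store = SimpleVectorStore()
toy_store.add_item("a passage about cats", [1.0, 0.0, 0.0], {"topic": "animals"})
toy_store.add_item("a passage about finance", [0.0, 1.0, 0.0], {"topic": "money"})

hits = toy_store.similarity_search([0.9, 0.1, 0.0], k=1)
print(hits[0]["text"], round(hits[0]["similarity"], 3))  # a passage about cats 0.994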

Creating Embeddings

We generate embeddings for both document chunks and queries using an embedding model served through the same API.

def create_embeddings(text, model="BAAI/bge-en-icl"):
    input_text = text if isinstance(text, list) else [text]
    response = client.embeddings.create(
        model=model,
        input=input_text
    )
    if isinstance(text, str):
        return response.data[0].embedding
    return [item.embedding for item in response.data]
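
The helper accepts either a single string or a list of strings: a single string returns one embedding vector, while a list returns one embedding per input in the same order. A minimal sketch, assuming the client above is configured with a valid API key:

# Single string -> one embedding (a list of floats)
query_vec = create_embeddings("What is Self-RAG?")
print(len(query_vec))

# List of strings -> one embedding per input, in order
chunk_vecs = create_embeddings(["first chunk", "second chunk"])
print(len(chunk_vecs))  # 2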

Document Processing Pipeline

This pipeline ties the previous steps together: extract the PDF text, split it into overlapping chunks, embed each chunk, and load everything into the vector store.

def process_document(pdf_path, chunk_size=1000, chunk_overlap=200):
    print("Extracting text from PDF...")
    extracted_text = extract_text_from_pdf(pdf_path)
    print("Chunking text...")
    chunks = chunk_text(extracted_text, chunk_size, chunk_overlap)
    print(f"Created {len(chunks)} text chunks")
    print("Creating embeddings for chunks...")
    chunk_embeddings = create_embeddings(chunks)
    store = SimpleVectorStore()
    for i, (chunk, embedding) in enumerate(zip(chunks, chunk_embeddings)):
        store.add_item(
            text=chunk,
            embedding=embedding,
            metadata={"index": i, "source": pdf_path}
        )
    print(f"Added {len(chunks)} chunks to the vector store")
    return store
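
Building the searchable store is then a single call; the defaults of 1000-character chunks with 200 characters of overlap can be tuned to the source document.

# Build the vector store once and reuse it for every query
vector_store = process_document("data/AI_Information.pdf")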

Self-RAG Components

1. Retrieval Decision

def determine_if_retrieval_needed(query):
    system_prompt = """You are an AI assistant that determines if retrieval is necessary to answer a query.
    For factual questions, specific information requests, or questions about events, people, or concepts, answer "Yes".
    For opinions, hypothetical scenarios, or simple queries with common knowledge, answer "No".
    Answer with ONLY "Yes" or "No"."""
    user_prompt = f"Query: {query}\n\nIs retrieval necessary to answer this query accurately?"
    response = client.chat.completions.create(
        model="meta-llama/Llama-3.2-3B-Instruct",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        temperature=0
    )
    answer = response.choices[0].message.content.strip().lower()
    return "yes" in answer

2. Relevance Evaluation

def evaluate_relevance(query, context):
    system_prompt = """You are an AI assistant that determines if a document is relevant to a query.
    Consider whether the document contains information that would be helpful in answering the query.
    Answer with ONLY "Relevant" or "Irrelevant"."""
    max_context_length = 2000
    if len(context) > max_context_length:
        context = context[:max_context_length] + "... [truncated]"
    user_prompt = f"""Query: {query}
    Document content:
    {context}

    Is this document relevant to the query? Answer with ONLY "Relevant" or "Irrelevant".
    """
    response = client.chat.completions.create(
        model="meta-llama/Llama-3.2-3B-Instruct",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        temperature=0
    )
    answer = response.choices[0].message.content.strip().lower()
    return answer
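
A brief illustration of the relevance check; again the labels are model-dependent, so the comments show the expected output.

snippet = "AI systems can reflect biases present in their training data, raising fairness concerns."
print(evaluate_relevance("What are the main ethical concerns in AI development?", snippet))  # expected: relevant
print(evaluate_relevance("How do I bake sourdough bread?", snippet))                         # expected: irrelevant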

3. Support Assessment

def assess_support(response, context):
    system_prompt = """You are an AI assistant that determines if a response is supported by the given context.
    Evaluate if the facts, claims, and information in the response are backed by the context.
    Answer with ONLY one of these three options:
    - "Fully supported": All information in the response is directly supported by the context.
    - "Partially supported": Some information in the response is supported by the context, but some is not.
    - "No support": The response contains significant information not found in or contradicting the context.
    """
    max_context_length = 2000
    if len(context) > max_context_length:
        context = context[:max_context_length] + "... [truncated]"
    user_prompt = f"""Context:
    {context}

    Response:
    {response}

    How well is this response supported by the context? Answer with ONLY "Fully supported", "Partially supported", or "No support".
    """
    # Use a separate name for the API result so we don't shadow the `response` parameter
    api_response = client.chat.completions.create(
        model="meta-llama/Llama-3.2-3B-Instruct",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        temperature=0
    )
    answer = api_response.choices[0].message.content.strip().lower()
    return answer

4. Utility Evaluation

def rate_utility(query, response):
    system_prompt = """You are an AI assistant that rates the utility of a response to a query.
    Consider how well the response answers the query, its completeness, correctness, and helpfulness.
    Rate the utility on a scale from 1 to 5, where:
    - 1: Not useful at all
    - 2: Slightly useful
    - 3: Moderately useful
    - 4: Very useful
    - 5: Exceptionally useful
    Answer with ONLY a single number from 1 to 5."""
    user_prompt = f"""Query: {query}
    Response:
    {response}

    Rate the utility of this response on a scale from 1 to 5:"""
    # Use a separate name for the API result so we don't shadow the `response` parameter
    api_response = client.chat.completions.create(
        model="meta-llama/Llama-3.2-3B-Instruct",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        temperature=0
    )
    rating = api_response.choices[0].message.content.strip()
    rating_match = re.search(r'[1-5]', rating)
    if rating_match:
        return int(rating_match.group())
    return 3
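
For example (the rating is model-dependent, so the comment shows a plausible value rather than a guaranteed one):

score = rate_utility(
    "What is the capital of France?",
    "The capital of France is Paris."
)
print(score)  # e.g. 5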

Response Generation

def generate_response(query, context=None):
    system_prompt = """You are a helpful AI assistant. Provide a clear, accurate, and informative response to the query."""
    if context:
        user_prompt = f"""Context:
        {context}

        Query: {query}

        Please answer the query based on the provided context.
        """
    else:
        user_prompt = f"""Query: {query}

        Please answer the query to the best of your ability.
        """
    response = client.chat.completions.create(
        model="meta-llama/Llama-3.2-3B-Instruct",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        temperature=0.2
    )
    return response.choices[0].message.content.strip()
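
The same function serves both branches of the pipeline; a minimal sketch of the two call patterns:

# Without context: the model answers from its own knowledge
print(generate_response("Can you write a short poem about artificial intelligence?"))

# With context: the answer should stay grounded in the supplied text
sample_context = "Self-RAG adds reflection steps that decide when to retrieve and how to use retrieved results."
print(generate_response("When does Self-RAG retrieve documents?", context=sample_context))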

Complete Self-RAG Implementation

The full pipeline chains the components above: decide whether to retrieve, retrieve and filter documents by relevance, generate a candidate response for each relevant context, score each candidate by support and utility, and return the best one.

def self_rag(query, vector_store, top_k=3):
    print(f"\n=== Starting Self-RAG for query: {query} ===\n")
    print("Step 1: Determining if retrieval is necessary...")
    retrieval_needed = determine_if_retrieval_needed(query)
    print(f"Retrieval needed: {retrieval_needed}")
    metrics = {
        "retrieval_needed": retrieval_needed,
        "documents_retrieved": 0,
        "relevant_documents": 0,
        "response_support_ratings": [],
        "utility_ratings": []
    }
    best_response = None
    best_score = -1
    if retrieval_needed:
        print("\nStep 2: Retrieving relevant documents...")
        query_embedding = create_embeddings(query)
        results = vector_store.similarity_search(query_embedding, k=top_k)
        metrics["documents_retrieved"] = len(results)
        print(f"Retrieved {len(results)} documents")
        print("\nStep 3: Evaluating document relevance...")
        relevant_contexts = []
        for i, result in enumerate(results):
            context = result["text"]
            relevance = evaluate_relevance(query, context)
            print(f"Document {i+1} relevance: {relevance}")
            if relevance == "relevant":
                relevant_contexts.append(context)
        metrics["relevant_documents"] = len(relevant_contexts)
        print(f"Found {len(relevant_contexts)} relevant documents")
        if relevant_contexts:
            print("\nStep 4: Processing relevant contexts...")
            for i, context in enumerate(relevant_contexts):
                print(f"\nProcessing context {i+1}/{len(relevant_contexts)}...")
                print("Generating response...")
                response = generate_response(query, context)
                print("Assessing support...")
                support_rating = assess_support(response, context)
                print(f"Support rating: {support_rating}")
                metrics["response_support_ratings"].append(support_rating)
                print("Rating utility...")
                utility_rating = rate_utility(query, response)
                print(f"Utility rating: {utility_rating}/5")
                metrics["utility_ratings"].append(utility_rating)
                support_score = {
                    "fully supported": 3, 
                    "partially supported": 1, 
                    "no support": 0
                }.get(support_rating, 0)
                overall_score = support_score * 5 + utility_rating
                print(f"Overall score: {overall_score}")
                if overall_score > best_score:
                    best_response = response
                    best_score = overall_score
                    print("New best response found!")
            if best_score <= 0:
                print("\nNo well-supported response found, generating without retrieval...")
                best_response = generate_response(query)
        else:
            print("\nNo relevant contexts found, generating response without retrieval...")
            best_response = generate_response(query)
    else:
        print("\nNo retrieval needed, generating response directly...")
        best_response = generate_response(query)
    metrics["best_score"] = best_score
    metrics["used_retrieval"] = retrieval_needed and best_score > 0
    print("\n=== Self-RAG Completed ===\n")
    return {
        "query": query,
        "response": best_response,
        "metrics": metrics
    }
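
The selection rule above weights grounding more heavily than usefulness: the support label maps to 0, 1, or 3 and is multiplied by 5 before the 1-5 utility rating is added, so overall scores range from 1 to 20. A quick worked example with illustrative ratings:

# Worked example of the scoring rule (illustrative ratings)
support_score = {"fully supported": 3, "partially supported": 1, "no support": 0}
candidate_a = support_score["fully supported"] * 5 + 4      # grounded answer, utility 4 -> 19
candidate_b = support_score["partially supported"] * 5 + 5  # partly grounded, utility 5 -> 10
print(candidate_a, candidate_b)  # 19 10 -> the better-grounded response wins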

Running the Complete Self-RAG System

def run_self_rag_example():
    pdf_path = "data/AI_Information.pdf"
    print(f"Processing document: {pdf_path}")
    vector_store = process_document(pdf_path)
    query1 = "What are the main ethical concerns in AI development?"
    print("\n" + "="*80)
    print(f"EXAMPLE 1: {query1}")
    result1 = self_rag(query1, vector_store)
    print("\nFinal response:")
    print(result1["response"])
    print("\nMetrics:")
    print(json.dumps(result1["metrics"], indent=2))
    query2 = "Can you write a short poem about artificial intelligence?"
    print("\n" + "="*80)
    print(f"EXAMPLE 2: {query2}")
    result2 = self_rag(query2, vector_store)
    print("\nFinal response:")
    print(result2["response"])
    print("\nMetrics:")
    print(json.dumps(result2["metrics"], indent=2))
    query3 = "How might AI impact economic growth in developing countries?"
    print("\n" + "="*80)
    print(f"EXAMPLE 3: {query3}")
    result3 = self_rag(query3, vector_store)
    print("\nFinal response:")
    print(result3["response"])
    print("\nMetrics:")
    print(json.dumps(result3["metrics"], indent=2))
    return {
        "example1": result1,
        "example2": result2,
        "example3": result3
    }

Evaluating Self-RAG Against Traditional RAG

To benchmark Self-RAG, we implement a traditional RAG baseline that always retrieves and always conditions on the retrieved context, then compare the two approaches with an LLM judge.

def traditional_rag(query, vector_store, top_k=3):
    print(f"\n=== Running traditional RAG for query: {query} ===\n")
    print("Retrieving documents...")
    query_embedding = create_embeddings(query)
    results = vector_store.similarity_search(query_embedding, k=top_k)
    print(f"Retrieved {len(results)} documents")
    contexts = [result["text"] for result in results]
    combined_context = "\n\n".join(contexts)
    print("Generating response...")
    response = generate_response(query, combined_context)
    return response
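
The baseline can be called directly on the same vector store for a side-by-side comparison; a sketch:

baseline_answer = traditional_rag("What are the main ethical concerns in AI development?", vector_store)
print(baseline_answer)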

def evaluate_rag_approaches(pdf_path, test_queries, reference_answers=None):
    print("=== Evaluating RAG Approaches ===")
    vector_store = process_document(pdf_path)
    results = []
    for i, query in enumerate(test_queries):
        print(f"\nProcessing query {i+1}: {query}")
        self_rag_result = self_rag(query, vector_store)
        self_rag_response = self_rag_result["response"]
        trad_rag_response = traditional_rag(query, vector_store)
        reference = reference_answers[i] if reference_answers and i < len(reference_answers) else None
        comparison = compare_responses(query, self_rag_response, trad_rag_response, reference)
        results.append({
            "query": query,
            "self_rag_response": self_rag_response,
            "traditional_rag_response": trad_rag_response,
            "reference_answer": reference,
            "comparison": comparison,
            "self_rag_metrics": self_rag_result["metrics"]
        })
    overall_analysis = generate_overall_analysis(results)
    return {
        "results": results,
        "overall_analysis": overall_analysis
    }

def compare_responses(query, self_rag_response, trad_rag_response, reference=None):
    system_prompt = """You are an expert evaluator of RAG systems. Your task is to compare responses from two different RAG approaches:
    1. Self-RAG: A dynamic approach that decides if retrieval is needed and evaluates information relevance and response quality
    2. Traditional RAG: Always retrieves documents and uses them to generate a response

    Compare the responses based on:
    - Relevance to the query
    - Factual correctness
    - Completeness and informativeness
    - Conciseness and focus"""
    user_prompt = f"""Query: {query}

    Response from Self-RAG:
    {self_rag_response}

    Response from Traditional RAG:
    {trad_rag_response}
    """
    if reference:
        user_prompt += f"""
        Reference Answer (for factual checking):
        {reference}
        """
    user_prompt += """
    Compare these responses and explain which one is better and why.
    Focus on accuracy, relevance, completeness, and quality.
    """
    response = client.chat.completions.create(
        model="meta-llama/Llama-3.2-3B-Instruct",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        temperature=0
    )
    return response.choices[0].message.content

def generate_overall_analysis(results):
    system_prompt = """You are an expert evaluator of RAG systems. Your task is to provide an overall analysis comparing
    Self-RAG and Traditional RAG based on multiple test queries.

    Focus your analysis on:
    1. When Self-RAG performs better and why
    2. When Traditional RAG performs better and why
    3. The impact of dynamic retrieval decisions in Self-RAG
    4. The value of relevance and support evaluation in Self-RAG
    5. Overall recommendations on which approach to use for different types of queries"""
    comparisons_summary = ""
    for i, result in enumerate(results):
        comparisons_summary += f"Query {i+1}: {result['query']}\n"
        comparisons_summary += f"Self-RAG metrics: Retrieval needed: {result['self_rag_metrics']['retrieval_needed']}, "
        comparisons_summary += f"Relevant docs: {result['self_rag_metrics']['relevant_documents']}/{result['self_rag_metrics']['documents_retrieved']}\n"
        comparisons_summary += f"Comparison summary: {result['comparison'][:200]}...\n\n"
    user_prompt = f"""Based on the following comparison results from {len(results)} test queries, please provide an overall analysis of
    Self-RAG versus Traditional RAG:

    {comparisons_summary}

    Please provide your comprehensive analysis.
    """
    response = client.chat.completions.create(
        model="meta-llama/Llama-3.2-3B-Instruct",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        temperature=0
    )
    return response.choices[0].message.content

Evaluating the Self-RAG System

The final step is to evaluate the Self-RAG system against traditional RAG approaches. We'll compare the quality of responses generated by both systems and analyze the performance of Self-RAG in different scenarios.

pdf_path = "data/AI_Information.pdf"
test_queries = [
    "What are the main ethical concerns in AI development?",
]
reference_answers = [
    "The main ethical concerns in AI development include bias and fairness, privacy, transparency, accountability, safety, and the potential for misuse or harmful applications.",
]

evaluation_results = evaluate_rag_approaches(
    pdf_path=pdf_path,
    test_queries=test_queries,
    reference_answers=reference_answers
)

print(evaluation_results["overall_analysis"])