Oreoluwa
Reranking for Enhanced RAG Systems
June 21, 2025
20 min read

RAG
NLP
AI
Reranking
Search


This notebook implements reranking techniques to improve retrieval quality in RAG systems. Reranking acts as a second filtering step after initial retrieval to ensure the most relevant content is used for response generation.

Key Concepts of Reranking

  1. Initial Retrieval: First pass using basic similarity search (less accurate but faster)
  2. Document Scoring: Evaluating each retrieved document's relevance to the query
  3. Reordering: Sorting documents by their relevance scores
  4. Selection: Using only the most relevant documents for response generation
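
Before implementing each piece, here is the overall retrieve-then-rerank pattern as a minimal sketch. The names retrieve, rerank, and generate are placeholders for the concrete functions we build below:

def answer_with_reranking(query, store):
    candidates = retrieve(query, store, k=10)  # fast, approximate first pass
    best = rerank(query, candidates, top_n=3)  # slower, more accurate scoring
    return generate(query, best)               # answer from only the best chunks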

Setting Up the Environment

We begin by importing necessary libraries.

import os
import re
import json
import numpy as np
import fitz  # PyMuPDF, used for PDF text extraction
from openai import OpenAI

Extracting Text from a PDF File

To implement RAG, we first need a source of textual data. In this case, we extract text from a PDF file using the PyMuPDF library.

def extract_text_from_pdf(pdf_path):
    # Open the PDF and concatenate the plain text of every page
    mypdf = fitz.open(pdf_path)
    all_text = ""
    for page_num in range(mypdf.page_count):
        page = mypdf[page_num]
        all_text += page.get_text("text")
    mypdf.close()
    return all_text
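
As a quick sanity check, assuming the PDF used later in this post sits at data/AI_Information.pdf, we might preview the extracted text like so:

sample_text = extract_text_from_pdf("data/AI_Information.pdf")
print(sample_text[:200])  # first 200 characters of the document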

Chunking the Extracted Text

Once we have the extracted text, we divide it into smaller, overlapping chunks to improve retrieval accuracy.

def chunk_text(text, n, overlap):
    # Slide a window of size n across the text, stepping by (n - overlap)
    # so that consecutive chunks share `overlap` characters
    chunks = []
    for i in range(0, len(text), n - overlap):
        chunks.append(text[i:i + n])
    return chunks
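
A tiny worked example (hypothetical inputs chosen only for illustration) makes the overlap visible:

chunk_text("abcdefghij", n=4, overlap=2)
# -> ['abcd', 'cdef', 'efgh', 'ghij', 'ij']
# each chunk starts 2 characters after the previous one (step = n - overlap)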

Setting Up the OpenAI API Client

We initialize the OpenAI client to generate embeddings and responses.

client = OpenAI(
    base_url="https://api.studio.nebius.com/v1/",  # OpenAI-compatible endpoint (Nebius AI Studio)
    api_key=os.getenv("OPENAI_API_KEY")  # read the key from the environment
)

Building a Simple Vector Store

To demonstrate how reranking integrates with retrieval, let's implement a simple vector store.

class SimpleVectorStore:
    """An in-memory vector store that retrieves by cosine similarity."""

    def __init__(self):
        self.vectors = []
        self.texts = []
        self.metadata = []

    def add_item(self, text, embedding, metadata=None):
        self.vectors.append(np.array(embedding))
        self.texts.append(text)
        self.metadata.append(metadata or {})

    def similarity_search(self, query_embedding, k=5):
        if not self.vectors:
            return []
        query_vector = np.array(query_embedding)
        similarities = []
        for i, vector in enumerate(self.vectors):
            # Cosine similarity: dot product normalized by both vector magnitudes
            similarity = np.dot(query_vector, vector) / (np.linalg.norm(query_vector) * np.linalg.norm(vector))
            similarities.append((i, similarity))
        # Sort by similarity, highest first, and keep the top k matches
        similarities.sort(key=lambda x: x[1], reverse=True)
        results = []
        for i in range(min(k, len(similarities))):
            idx, score = similarities[i]
            results.append({
                "text": self.texts[idx],
                "metadata": self.metadata[idx],
                "similarity": score
            })
        return results
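
As a toy illustration with made-up two-dimensional vectors:

store = SimpleVectorStore()
store.add_item("cats are mammals", [1.0, 0.0], {"index": 0})
store.add_item("stock prices rose", [0.0, 1.0], {"index": 1})
store.similarity_search([0.9, 0.1], k=1)
# -> the "cats are mammals" entry, whose vector points in nearly the same direction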

Creating Embeddings

We use a single helper to generate embeddings for both text chunks and queries.

def create_embeddings(text, model="BAAI/bge-en-icl"):
    # The API expects a list, so wrap a single string before sending
    input_text = text if isinstance(text, list) else [text]
    response = client.embeddings.create(
        model=model,
        input=input_text
    )
    # Return one embedding for a string input, a list of embeddings otherwise
    if isinstance(text, str):
        return response.data[0].embedding
    return [item.embedding for item in response.data]
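
The helper mirrors its input shape, so a hypothetical call might look like:

query_vector = create_embeddings("What is reranking?")          # one embedding
chunk_vectors = create_embeddings(["chunk one", "chunk two"])   # list of two embeddings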

Document Processing Pipeline

Now that we have defined the necessary functions and classes, we can proceed to define the document processing pipeline.

def process_document(pdf_path, chunk_size=1000, chunk_overlap=200):
    print("Extracting text from PDF...")
    extracted_text = extract_text_from_pdf(pdf_path)
    print("Chunking text...")
    chunks = chunk_text(extracted_text, chunk_size, chunk_overlap)
    print(f"Created {len(chunks)} text chunks")
    print("Creating embeddings for chunks...")
    chunk_embeddings = create_embeddings(chunks)
    store = SimpleVectorStore()
    for i, (chunk, embedding) in enumerate(zip(chunks, chunk_embeddings)):
        store.add_item(
            text=chunk,
            embedding=embedding,
            metadata={"index": i, "source": pdf_path}
        )
    print(f"Added {len(chunks)} chunks to the vector store")
    return store

Implementing LLM-based Reranking

Let's implement the LLM-based reranking function using the OpenAI API.

def rerank_with_llm(query, results, top_n=3, model="meta-llama/Llama-3.2-3B-Instruct"):
    print(f"Reranking {len(results)} documents...")
    scored_results = []
    system_prompt = """You are an expert at evaluating document relevance for search queries.
Your task is to rate documents on a scale from 0 to 10 based on how well they answer the given query.
Guidelines:
- Score 0-2: Document is completely irrelevant
- Score 3-5: Document has some relevant information but doesn't directly answer the query
- Score 6-8: Document is relevant and partially answers the query
- Score 9-10: Document is highly relevant and directly answers the query
You MUST respond with ONLY a single integer score between 0 and 10. Do not include ANY other text."""
    for i, result in enumerate(results):
        # Log progress every five documents
        if i % 5 == 0:
            print(f"Scoring document {i+1}/{len(results)}...")
        user_prompt = f"""Query: {query}
Document:
{result['text']}
Rate this document's relevance to the query on a scale from 0 to 10:"""
        response = client.chat.completions.create(
            model=model,
            temperature=0,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]
        )
        score_text = response.choices[0].message.content.strip()
        # Extract a standalone integer from 0 to 10 in the model's reply
        score_match = re.search(r'\b(10|[0-9])\b', score_text)
        if score_match:
            score = float(score_match.group(1))
        else:
            # Fall back to the rescaled similarity if no score could be parsed
            print(f"Warning: Could not extract score from response: '{score_text}', using similarity score instead")
            score = result["similarity"] * 10
        scored_results.append({
            "text": result["text"],
            "metadata": result["metadata"],
            "similarity": result["similarity"],
            "relevance_score": score
        })
    reranked_results = sorted(scored_results, key=lambda x: x["relevance_score"], reverse=True)
    return reranked_results[:top_n]
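
One design note: scoring each candidate with a separate LLM call is simple but costs one request per document, so it is best applied to a small pool, such as the top 10 hits from the initial retrieval. Assuming a vector_store built with process_document, usage might look like:

example_query = "What is explainable AI?"  # hypothetical query
candidates = vector_store.similarity_search(create_embeddings(example_query), k=10)
top_docs = rerank_with_llm(example_query, candidates, top_n=3)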

Simple Keyword-based Reranking

As a cheaper alternative to LLM scoring, we can rerank with a simple heuristic that rewards documents containing the query's significant terms.

def rerank_with_keywords(query, results, top_n=3):
    # Treat query words longer than 3 characters as keywords
    keywords = [word.lower() for word in query.split() if len(word) > 3]
    scored_results = []
    for result in results:
        document_text = result["text"].lower()
        # Start from the vector similarity, downweighted to half
        base_score = result["similarity"] * 0.5
        keyword_score = 0
        for keyword in keywords:
            if keyword in document_text:
                # Reward the keyword's presence
                keyword_score += 0.1
                # Extra credit if it appears in the first quarter of the document
                first_position = document_text.find(keyword)
                if first_position < len(document_text) / 4:
                    keyword_score += 0.1
                # Frequency bonus, capped at 0.2
                frequency = document_text.count(keyword)
                keyword_score += min(0.05 * frequency, 0.2)
        final_score = base_score + keyword_score
        scored_results.append({
            "text": result["text"],
            "metadata": result["metadata"],
            "similarity": result["similarity"],
            "relevance_score": final_score
        })
    reranked_results = sorted(scored_results, key=lambda x: x["relevance_score"], reverse=True)
    return reranked_results[:top_n]
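
To make the arithmetic concrete, consider a hypothetical document with similarity 0.8 that contains one keyword twice, once near the start:

# base_score      = 0.8 * 0.5            = 0.40
# presence bonus  = + 0.1                -> 0.50
# early position  = + 0.1                -> 0.60
# frequency bonus = + min(0.05*2, 0.2)   -> 0.70
# final_score                            = 0.70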

Response Generation

def generate_response(query, context, model="meta-llama/Llama-3.2-3B-Instruct"):
    system_prompt = "You are a helpful AI assistant. Answer the user's question based only on the provided context. If you cannot find the answer in the context, state that you don't have enough information."
    user_prompt = f"""
Context:
{context}
Question: {query}
Please provide a comprehensive answer based only on the context above.
"""
    response = client.chat.completions.create(
        model=model,
        temperature=0,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]
    )
    return response.choices[0].message.content

Full RAG Pipeline with Reranking

So far, we have implemented the core components of the RAG pipeline: document processing, retrieval, reranking, and response generation. Now we combine them into a full pipeline.

def rag_with_reranking(query, vector_store, reranking_method="llm", top_n=3, model="meta-llama/Llama-3.2-3B-Instruct"):
    query_embedding = create_embeddings(query)
    initial_results = vector_store.similarity_search(query_embedding, k=10)
    if reranking_method == "llm":
        reranked_results = rerank_with_llm(query, initial_results, top_n=top_n)
    elif reranking_method == "keywords":
        reranked_results = rerank_with_keywords(query, initial_results, top_n=top_n)
    else:
        reranked_results = initial_results[:top_n]
    # Join the selected chunks with a visible separator so the model can tell them apart
    context = "\n\n===\n\n".join([result["text"] for result in reranked_results])
    response = generate_response(query, context, model)
    return {
        "query": query,
        "reranking_method": reranking_method,
        "initial_results": initial_results[:top_n],
        "reranked_results": reranked_results,
        "context": context,
        "response": response
    }

Evaluating Reranking Quality

Finally, we load a validation query, build the vector store, and compare the three retrieval modes side by side.

with open('data/val.json') as f:
    data = json.load(f)
query = data[0]['question']
reference_answer = data[0]['ideal_answer']
pdf_path = "data/AI_Information.pdf"
vector_store = process_document(pdf_path)
print("Comparing retrieval methods...")
print("\n=== STANDARD RETRIEVAL ===")
standard_results = rag_with_reranking(query, vector_store, reranking_method="none")
print(f"\nQuery: {query}")
print(f"\nResponse:\n{standard_results['response']}")
print("\n=== LLM-BASED RERANKING ===")
llm_results = rag_with_reranking(query, vector_store, reranking_method="llm")
print(f"\nQuery: {query}")
print(f"\nResponse:\n{llm_results['response']}")
print("\n=== KEYWORD-BASED RERANKING ===")
keyword_results = rag_with_reranking(query, vector_store, reranking_method="keywords")
print(f"\nQuery: {query}")
print(f"\nResponse:\n{keyword_results['response']}")
def evaluate_reranking(query, standard_results, reranked_results, reference_answer=None):
    system_prompt = """You are an expert evaluator of RAG systems.
Compare the retrieved contexts and responses from two different retrieval methods.
Assess which one provides better context and a more accurate, comprehensive answer."""
    comparison_text = f"""Query: {query}
Standard Retrieval Context:
{standard_results['context'][:1000]}... [truncated]
Standard Retrieval Answer:
{standard_results['response']}
Reranked Retrieval Context:
{reranked_results['context'][:1000]}... [truncated]
Reranked Retrieval Answer:
{reranked_results['response']}"""
    if reference_answer:
        comparison_text += f"""
Reference Answer:
{reference_answer}"""
    user_prompt = f"""
{comparison_text}
Please evaluate which retrieval method provided:
1. More relevant context
2. More accurate answer
3. More comprehensive answer
4. Better overall performance
Provide a detailed analysis with specific examples.
"""
    response = client.chat.completions.create(
        model="meta-llama/Llama-3.2-3B-Instruct",
        temperature=0,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]
    )
    return response.choices[0].message.content

evaluation = evaluate_reranking(
    query=query,
    standard_results=standard_results,
    reranked_results=llm_results,
    reference_answer=reference_answer
)
print("\n=== EVALUATION RESULTS ===")
print(evaluation)