RAG Architecture for Enterprise Document Processing
A practical guide to designing a Retrieval-Augmented Generation system for querying enterprise internal documents with high accuracy using vector databases and LLMs.
RAG (Retrieval-Augmented Generation) is an AI architecture that enables LLMs to answer questions based on enterprise internal data without expensive fine-tuning. Ventra Rocket has deployed multiple RAG systems helping enterprises efficiently query thousands of PDFs, Word documents, and database records.
1. RAG Architecture Overview
A RAG system consists of two primary pipelines:
Indexing Pipeline — Runs offline, processes and stores documents into a vector database.
Query Pipeline — Runs in real-time when a user submits a question.
Indexing:
Documents → Chunking → Embedding → Vector Store
Query:
User Question → Embedding → Vector Search → Context Assembly → LLM → Answer
2. Document Ingestion and Chunking
Chunking strategy directly impacts retrieval quality. Chunks that are too large dilute the relevant passage with surrounding noise; chunks that are too small lose the context needed to interpret them.
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, Docx2txtLoader
import os
def load_documents(file_path: str):
    """Load a document with the loader matching its file extension.

    Supports PDF (``.pdf``) and modern Word (``.docx``) files.

    Args:
        file_path: Path to the document on disk.

    Returns:
        The list of Document objects produced by the langchain loader.

    Raises:
        ValueError: For unsupported extensions, including legacy binary
            ``.doc`` files — docx2txt only reads the zip-based .docx
            format, so the original routing of .doc to Docx2txtLoader
            would have crashed later with an opaque error.
    """
    ext = os.path.splitext(file_path)[1].lower()
    if ext == '.pdf':
        loader = PyPDFLoader(file_path)
    elif ext == '.docx':
        loader = Docx2txtLoader(file_path)
    elif ext == '.doc':
        # Fail fast with an actionable message instead of letting
        # docx2txt choke on the legacy binary format downstream.
        raise ValueError(
            f"Unsupported file type: {ext} (legacy .doc is not supported; convert to .docx)"
        )
    else:
        raise ValueError(f"Unsupported file type: {ext}")
    return loader.load()
def chunk_documents(documents, chunk_size=800, chunk_overlap=100):
    """Split loaded documents into overlapping chunks for embedding.

    Args:
        documents: Documents returned by a langchain loader.
        chunk_size: Maximum chunk length in CHARACTERS, not tokens —
            ``length_function=len`` measures raw string length. (The
            previous docstring said "tokens", which would mislead
            anyone tuning this value.) 800 characters is a reasonable
            balance for technical documents.
        chunk_overlap: Characters shared between adjacent chunks,
            preventing information loss at chunk boundaries.

    Returns:
        A list of chunked Document objects.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # Most structure-preserving separators are tried first.
        separators=["\n\n", "\n", ".", "!", "?", ",", " "],
        length_function=len,
    )
    return splitter.split_documents(documents)
3. Embedding and Vector Storage with Qdrant
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
from openai import OpenAI
import uuid
# Shared OpenAI client — reads OPENAI_API_KEY from the environment.
client = OpenAI()
# NOTE(review): assumes a locally running Qdrant instance — confirm URL for prod.
qdrant = QdrantClient(url="http://localhost:6333")
COLLECTION_NAME = "enterprise_docs"
EMBEDDING_DIM = 1536 # output dimension of text-embedding-3-small
def create_collection() -> None:
    """(Re)create the enterprise-docs collection.

    Uses cosine distance over embedding-sized vectors. Note that
    ``recreate_collection`` drops any existing collection of the same
    name along with all stored points.
    """
    vector_config = VectorParams(size=EMBEDDING_DIM, distance=Distance.COSINE)
    qdrant.recreate_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=vector_config,
    )
def embed_and_store(chunks: list, metadata: dict, batch_size: int = 256) -> None:
    """Embed chunk texts and upsert them into the Qdrant collection.

    Args:
        chunks: Chunked documents; each must expose ``page_content``.
        metadata: Document-level metadata applied to every chunk; the
            ``source``, ``department`` and ``doc_type`` keys are stored
            in each point's payload.
        batch_size: Number of texts per embeddings request. The OpenAI
            embeddings endpoint enforces a hard 2048-input limit per
            request, so large documents are embedded in slices instead
            of one unbounded call (the previous code sent everything
            at once).
    """
    texts = [chunk.page_content for chunk in chunks]
    if not texts:
        return  # nothing to embed; avoids an empty-input API error
    # Batch embedding reduces API round-trips while staying under limits.
    embeddings: list = []
    for start in range(0, len(texts), batch_size):
        response = client.embeddings.create(
            input=texts[start:start + batch_size],
            model="text-embedding-3-small",
        )
        # The API returns embeddings in input order within a request.
        embeddings.extend(item.embedding for item in response.data)
    points = [
        PointStruct(
            # Random IDs: re-ingesting a document duplicates points
            # rather than overwriting them — TODO confirm intended.
            id=str(uuid.uuid4()),
            vector=embedding,
            payload={
                "text": text,
                "source": metadata.get("source", ""),
                "department": metadata.get("department", ""),
                "doc_type": metadata.get("doc_type", ""),
            },
        )
        for text, embedding in zip(texts, embeddings)
    ]
    qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
4. Retrieval with Filtered Search
from qdrant_client.models import Filter, FieldCondition, MatchValue
def retrieve_context(
    query: str,
    department_filter: str | None = None,
    top_k: int = 5,
    score_threshold: float = 0.72,
) -> list[dict]:
    """Embed the query and return the best-matching chunks from Qdrant.

    Args:
        query: Natural-language user question.
        department_filter: If given, restrict hits to chunks whose
            ``department`` payload field matches exactly.
        top_k: Maximum number of chunks to return.
        score_threshold: Minimum cosine similarity for a hit. This was
            previously hard-coded; it is now tunable per call, with the
            same 0.72 default so existing callers behave identically.

    Returns:
        ``{"text", "source", "score"}`` dicts, best match first. Empty
        if nothing clears the threshold.
    """
    query_embedding = client.embeddings.create(
        input=query,
        model="text-embedding-3-small",
    ).data[0].embedding
    search_filter = None
    if department_filter:
        search_filter = Filter(
            must=[FieldCondition(key="department", match=MatchValue(value=department_filter))]
        )
    results = qdrant.search(
        collection_name=COLLECTION_NAME,
        query_vector=query_embedding,
        query_filter=search_filter,
        limit=top_k,
        score_threshold=score_threshold,
        with_payload=True,
    )
    return [
        {"text": hit.payload["text"], "source": hit.payload["source"], "score": hit.score}
        for hit in results
    ]
5. Generation with Context Assembly
def build_prompt(query: str, contexts: list[dict]) -> str:
    """Assemble the grounded-answering prompt from retrieved chunks.

    Each context dict must carry ``source`` and ``text`` keys. Chunks
    are labelled with their source and separated by a divider so the
    model can cite documents individually.
    """
    labelled_chunks = []
    for ctx in contexts:
        labelled_chunks.append(f"[Source: {ctx['source']}]\n{ctx['text']}")
    context_text = "\n\n---\n\n".join(labelled_chunks)
    prompt = (
        "You are an internal AI assistant. Answer the question based only on the provided documents.\n"
        "If the information is not in the documents, say so clearly rather than guessing.\n"
        "Always cite the source document in your answer.\n"
        "REFERENCE DOCUMENTS:\n"
        f"{context_text}\n"
        f"QUESTION: {query}\n"
        "ANSWER:"
    )
    return prompt
def answer_question(query: str, department: str | None = None) -> dict:
    """Answer a question from internal documents via the RAG pipeline.

    Args:
        query: User question.
        department: Optional department filter forwarded to retrieval.

    Returns:
        ``{"answer": str, "sources": list[str]}``. Sources are now
        deduplicated in retrieval-rank order; the original built an
        unordered set, so the source list varied between identical
        calls, which breaks caching and response diffing.
    """
    contexts = retrieve_context(query, department_filter=department)
    if not contexts:
        return {"answer": "No relevant information found in internal documents.", "sources": []}
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": build_prompt(query, contexts)}],
        temperature=0.1,  # low temperature keeps answers close to the retrieved text
        max_tokens=1000,
    )
    # dict.fromkeys dedupes while preserving best-match-first ordering.
    sources = list(dict.fromkeys(ctx["source"] for ctx in contexts))
    return {
        "answer": response.choices[0].message.content,
        "sources": sources,
    }
6. Evaluating RAG Quality with RAGAS
Measure quality using three RAGAS metrics:
- Faithfulness — Is the answer grounded in the retrieved context?
- Answer Relevancy — Does the answer actually address the question?
- Context Recall — Does the retrieved context contain enough information to answer?
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_recall

# NOTE(review): `test_dataset` is assumed to be a RAGAS-compatible dataset
# (question / answer / contexts / ground-truth records) prepared elsewhere;
# it is not defined in this snippet — confirm against the full pipeline.
results = evaluate(
    dataset=test_dataset,
    metrics=[faithfulness, answer_relevancy, context_recall],
)
# Each metric scores in [0, 1]; higher is better. Example output:
print(results)
# {'faithfulness': 0.91, 'answer_relevancy': 0.87, 'context_recall': 0.84}
Conclusion
RAG is the most practical architecture for bringing AI into enterprise workflows. It requires no model fine-tuning, makes data updates straightforward, and keeps answer scope controllable. Ventra Rocket has successfully deployed RAG systems for multiple enterprises achieving over 90% accuracy on domain-specific queries.
Related Articles
Building Enterprise AI Chatbots: Architecture and Best Practices
How to design production-grade AI chatbots — intent classification, RAG integration, conversation memory, escalation flows, and evaluation metrics.
LLM Integration Best Practices for Enterprise
A battle-tested guide to integrating Large Language Models into enterprise systems — prompt engineering, cost optimisation, safety guardrails, structured output, and evaluation frameworks.