RAG with LangChain 🦜🔗¶
In [1]:
Copied!
# Requirements for this example: Docling (PDF conversion), LangChain splitters /
# HuggingFace embeddings-and-LLM integrations, and Milvus as the vector store.
# %pip (rather than !pip) installs into the active kernel's environment; -qq quiets output.
# requirements for this example:
%pip install -qq docling docling-core python-dotenv langchain-text-splitters langchain-huggingface langchain-milvus
# requirements for this example:
%pip install -qq docling docling-core python-dotenv langchain-text-splitters langchain-huggingface langchain-milvus
Note: you may need to restart the kernel to use updated packages.
In [2]:
Copied!
# Load environment variables from a local .env file (used later for HF_API_KEY
# and, optionally, MILVUS_URI). Returns True when a .env file was found.
import os
from dotenv import load_dotenv
load_dotenv()
import os
from dotenv import load_dotenv
load_dotenv()
Out[2]:
True
Setup¶
Loader and splitter¶
Below we set up:
- a loader, which will be used to create LangChain documents, and
- a splitter, which will be used to split these documents
In [3]:
Copied!
from typing import Iterator
from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document as LCDocument
from docling.document_converter import DocumentConverter
class DoclingPDFLoader(BaseLoader):
    """LangChain document loader backed by Docling.

    Each input source (a local path or a URL) is converted with Docling and
    emitted as a single LangChain ``Document`` whose text is the document's
    Markdown export.
    """

    def __init__(self, file_path: str | list[str]) -> None:
        """Accept one source or a list of sources; store them uniformly as a list."""
        if isinstance(file_path, list):
            self._file_paths = file_path
        else:
            self._file_paths = [file_path]
        self._converter = DocumentConverter()

    def lazy_load(self) -> Iterator[LCDocument]:
        """Lazily yield one Markdown-text Document per configured source."""
        for source in self._file_paths:
            markdown = self._converter.convert(source).document.export_to_markdown()
            yield LCDocument(page_content=markdown)
# NOTE: this is the notebook export's rendered duplicate of the loader cell.
from typing import Iterator
from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document as LCDocument
from docling.document_converter import DocumentConverter
# Custom LangChain loader: converts each source (path or URL) with Docling and
# yields the document's Markdown export as a LangChain Document.
class DoclingPDFLoader(BaseLoader):
def __init__(self, file_path: str | list[str]) -> None:
# Normalize a single path/URL into a one-element list so lazy_load can iterate.
self._file_paths = file_path if isinstance(file_path, list) else [file_path]
self._converter = DocumentConverter()
def lazy_load(self) -> Iterator[LCDocument]:
# Convert lazily, one source at a time; each yields one Document of Markdown text.
for source in self._file_paths:
dl_doc = self._converter.convert(source).document
text = dl_doc.export_to_markdown()
yield LCDocument(page_content=text)
In [4]:
Copied!
# Source document: the DocLayNet paper, fetched by URL (DocumentConverter.convert
# accepts URLs as well as local paths).
FILE_PATH = "https://raw.githubusercontent.com/DS4SD/docling/main/tests/data/2206.01062.pdf" # DocLayNet paper
FILE_PATH = "https://raw.githubusercontent.com/DS4SD/docling/main/tests/data/2206.01062.pdf" # DocLayNet paper
In [5]:
Copied!
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Instantiate the Docling-backed loader for the paper, and a character-based
# splitter: ~1000-character chunks with 200-character overlap so context is
# preserved across chunk boundaries.
loader = DoclingPDFLoader(file_path=FILE_PATH)
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=1000,
chunk_overlap=200,
)
from langchain_text_splitters import RecursiveCharacterTextSplitter
loader = DoclingPDFLoader(file_path=FILE_PATH)
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=1000,
chunk_overlap=200,
)
We now use the above-defined objects to get the document splits:
In [6]:
Copied!
# Eagerly convert the PDF (load() drains lazy_load) and split the resulting
# documents into retrieval-sized chunks.
docs = loader.load()
splits = text_splitter.split_documents(docs)
docs = loader.load()
splits = text_splitter.split_documents(docs)
Embeddings¶
In [7]:
Copied!
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
# Local sentence-embedding model used to vectorize chunks and queries.
HF_EMBED_MODEL_ID = "BAAI/bge-small-en-v1.5"
embeddings = HuggingFaceEmbeddings(model_name=HF_EMBED_MODEL_ID)
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
HF_EMBED_MODEL_ID = "BAAI/bge-small-en-v1.5"
embeddings = HuggingFaceEmbeddings(model_name=HF_EMBED_MODEL_ID)
Vector store¶
In [8]:
Copied!
from tempfile import TemporaryDirectory
from langchain_milvus import Milvus
# Milvus connection URI: honor MILVUS_URI from the environment; otherwise fall
# back to a local demo DB file in a fresh temp directory. The walrus assignment
# binds the TemporaryDirectory to tmp_dir so it is not garbage-collected (and
# its directory deleted) while the store is in use.
MILVUS_URI = os.environ.get(
"MILVUS_URI", f"{(tmp_dir := TemporaryDirectory()).name}/milvus_demo.db"
)
# Embed all splits and index them; drop_old=True replaces any existing
# collection so re-runs start from a clean index.
vectorstore = Milvus.from_documents(
splits,
embeddings,
connection_args={"uri": MILVUS_URI},
drop_old=True,
)
from tempfile import TemporaryDirectory
from langchain_milvus import Milvus
MILVUS_URI = os.environ.get(
"MILVUS_URI", f"{(tmp_dir := TemporaryDirectory()).name}/milvus_demo.db"
)
vectorstore = Milvus.from_documents(
splits,
embeddings,
connection_args={"uri": MILVUS_URI},
drop_old=True,
)
LLM¶
In [9]:
Copied!
from langchain_huggingface import HuggingFaceEndpoint
# API token comes from the environment (loaded from .env earlier) — never
# hardcode credentials in the notebook.
HF_API_KEY = os.environ.get("HF_API_KEY")
HF_LLM_MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"
# Remote generation endpoint on the HuggingFace Inference API.
llm = HuggingFaceEndpoint(
repo_id=HF_LLM_MODEL_ID,
huggingfacehub_api_token=HF_API_KEY,
)
from langchain_huggingface import HuggingFaceEndpoint
HF_API_KEY = os.environ.get("HF_API_KEY")
HF_LLM_MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"
llm = HuggingFaceEndpoint(
repo_id=HF_LLM_MODEL_ID,
huggingfacehub_api_token=HF_API_KEY,
)
The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. Token is valid (permission: write). Your token has been saved to /Users/pva/.cache/huggingface/token Login successful
RAG¶
In [10]:
Copied!
from typing import Iterable
from langchain_core.documents import Document as LCDocument
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
# Concatenate retrieved documents into a single context string, separated by
# blank lines, for insertion into the prompt's {context} slot.
def format_docs(docs: Iterable[LCDocument]) -> str:
return "\n\n".join(doc.page_content for doc in docs)
retriever = vectorstore.as_retriever()
prompt = PromptTemplate.from_template(
"Context information is below.\n---------------------\n{context}\n---------------------\nGiven the context information and not prior knowledge, answer the query.\nQuery: {question}\nAnswer:\n"
)
# LCEL pipeline: the input question is routed both to the retriever (whose hits
# are formatted into {context}) and, via RunnablePassthrough, directly into
# {question}; the filled prompt goes to the LLM and the output is parsed to str.
rag_chain = (
{"context": retriever | format_docs, "question": RunnablePassthrough()}
| prompt
| llm
| StrOutputParser()
)
from typing import Iterable
from langchain_core.documents import Document as LCDocument
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
def format_docs(docs: Iterable[LCDocument]) -> str:
return "\n\n".join(doc.page_content for doc in docs)
retriever = vectorstore.as_retriever()
prompt = PromptTemplate.from_template(
"Context information is below.\n---------------------\n{context}\n---------------------\nGiven the context information and not prior knowledge, answer the query.\nQuery: {question}\nAnswer:\n"
)
rag_chain = (
{"context": retriever | format_docs, "question": RunnablePassthrough()}
| prompt
| llm
| StrOutputParser()
)
In [11]:
Copied!
# Run the full RAG pipeline end-to-end: retrieve relevant chunks from Milvus,
# build the prompt, and generate an answer with the HF endpoint.
rag_chain.invoke("How many pages were human annotated for DocLayNet?")
rag_chain.invoke("How many pages were human annotated for DocLayNet?")
Out[11]:
'- 80,863 pages were human annotated for DocLayNet.'