Python code:
$ vi langchain-semantic-search.py
from langchain_huggingface.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores.faiss import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_core.documents import Document
from uuid import uuid4
import faiss
# Sample corpus: short news blurbs, each wrapped in a Document below and
# tagged with the same "bbc" source metadata.
_bbc_blurbs = [
    "Researchers in Japan and the US have unlocked the 60-year mystery of what gives these cats their orange colour.",
    "Astronomers have spotted around a dozen of these weird, rare blasts. Could they be signs of a special kind of black hole?",
    "The world's largest cloud computing company plans to spend £8bn on new data centres in the UK over the next four years.",
    "The Caribbean island is building a power station that will use steam naturally heated by volcanic rock.",
    "As Barcelona celebrate winning La Liga, Spanish football expert Guillem Balague looks at how manager Hansi Flick turned his young side into champions.",
    "Venezuela's Jhonattan Vegas leads the US PGA Championship with several European players close behind, but Rory McIlroy endures a tough start.",
    "Locals and ecologists are troubled by the potential impacts a looming seawall could have on the biodiverse Japanese island of Amami Ōshima.",
    "The government has made little progress in preparing the UK for rising temperatures, climate watchdog the CCC says.",
    "Half a century after the world's first deep sea mining tests picked nodules from the seafloor off the US east coast, the damage has barely begun to heal.",
    "The Cuyahoga River was so polluted it regularly went up in flames. Images of one dramatic blaze in 1952 shaped the US's nascent environmental movement, long after the flames went out.",
]
# One Document per blurb; each gets its own metadata dict.
documents = [
    Document(page_content=blurb, metadata={"source": "bbc"})
    for blurb in _bbc_blurbs
]
# Embedding model settings: which checkpoint to load, the device to run it
# on, and the encoding options (vectors are left un-normalised).
model_name = "all-MiniLM-L6-v2"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": False}

# Build the embedding wrapper that the vector store will call.
embedding_model = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)
# Embed a throwaway query once: the length of the returned vector is the
# dimensionality the FAISS index must be created with.
probe_vector = embedding_model.embed_query("hello semantic search")
index = faiss.IndexFlatL2(len(probe_vector))

# Vector store backed by the (still empty) index, an in-memory docstore and
# an empty index-position -> docstore-id mapping.
vector_store = FAISS(
    embedding_function=embedding_model,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)
# generate one random UUID string per document (iterate the list directly;
# the index from range(len(...)) was never used)
uuids = [str(uuid4()) for _ in documents]
# add the documents to the vector store under those IDs
vector_store.add_documents(documents=documents, ids=uuids)
# free-text query to match against the stored documents
query = "major achievement"
# Similarity search for the two best matches, restricted by a metadata
# filter. NOTE: every document in this script is tagged source == "bbc",
# so the filter demonstrates the API but does not exclude anything here.
results = vector_store.similarity_search(
    query,
    k=2,
    filter={"source": "bbc"},
)
# print the text of each hit, one per line
for res in results:
    print(res.page_content)
Run the Python script:
$ python3 langchain-semantic-search.py
Output:
As Barcelona celebrate winning La Liga, Spanish football expert Guillem Balague looks at how manager Hansi Flick turned his young side into champions.
The Cuyahoga River was so polluted it regularly went up in flames. Images of one dramatic blaze in 1952 shaped the US's nascent environmental movement, long after the flames went out.