Below is the code
# Ingest per-company HTML pages into a MongoDB Atlas vector store and query
# them through an auto-merging retriever.
#
# NOTE(review): the "ValueError: doc_id ... not found" happens because
# AutoMergingRetriever resolves PARENT node ids through the storage context's
# *docstore*, and StorageContext.from_defaults(vector_store=...) creates an
# empty in-memory docstore — only the vector store was populated. The fix is
# to (a) persist ALL hierarchical nodes (parents + leaves) into the docstore,
# and (b) index only the LEAF nodes in the vector store, which is the
# standard auto-merging setup.

# Adjust the import path to match your installed llama_index version.
from llama_index.node_parser import get_leaf_nodes

vector_store = MongoDBAtlasVectorSearch(
    client,
    db_name=db_name,
    collection_name=collection_name,
    index_name="autoprod_index",
    embeddings=embed_model,
)

# Build the storage context up front so the docstore can be filled during
# ingestion; the SAME context is later handed to the AutoMergingRetriever.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

for url in urls:
    company_name = extract_company_name(url)

    # Pages were saved locally — load the HTML file instead of fetching it.
    # (Keep `url` itself intact; use a separate variable for the local path.)
    path = os.path.join('/content/html_files', url)
    loader = UnstructuredHTMLLoader(path)
    data = loader.load()
    doc = Document(text=data[0].page_content)

    # Hierarchical chunks: 2048-char parents -> 512 -> 128-char leaves.
    parser = HierarchicalNodeParser.from_defaults(chunk_sizes=[2048, 512, 128])
    nodes = parser.get_nodes_from_documents([doc])
    leaf_nodes = get_leaf_nodes(nodes)

    # Metadata goes on every node so merged parents carry it too. The 'url'
    # value is the local path, matching the ExactMatchFilter used below.
    for node in nodes:
        node.metadata['company_name'] = company_name
        node.metadata['url'] = path

    # Only the leaves need embeddings — they are what similarity search hits.
    for node in leaf_nodes:
        node.embedding = embed_model.get_text_embedding(
            node.get_content(metadata_mode=MetadataMode.ALL))

    # ALL nodes (including parents) go into the docstore so the retriever can
    # resolve parent doc_ids when merging...
    storage_context.docstore.add_documents(nodes)
    # ...but only the embedded leaves are indexed for vector search.
    vector_store.add(leaf_nodes)

index = VectorStoreIndex.from_vector_store(
    vector_store, storage_context=storage_context)

postproc = None
reranker = SentenceTransformerRerank(top_n=3)

# Restrict retrieval to a single company's page via metadata filters.
base_retriever = index.as_retriever(
    similarity_top_k=3,
    filters=MetadataFilters(filters=[
        ExactMatchFilter(key="metadata.company_name", value="grail"),
        ExactMatchFilter(key="metadata.url",
                         value="/content/html_files/https.grail.com.html"),
    ]))
retriever = AutoMergingRetriever(
    base_retriever, storage_context=storage_context, verbose=True)

node_postprocessors = [p for p in (postproc, reranker) if p is not None]
query_engine = RetrieverQueryEngine(
    retriever, node_postprocessors=node_postprocessors)

summary_whole = query_engine.query(
    "Who is the CEO of grail? Answer if you are 100 % sure.")
print(summary_whole)
It is giving the following error:
ValueError: doc_id 7109edf3-cbda-4c55-9c34-a5a18c14aea1 not found.
This doc_id is actually present in the MongoDB collection — I am attaching a screenshot as proof. If I use just the base_retriever in the query_engine, I don't get this error, but the results are not good. Also, most of the LlamaIndex + auto-merging retriever tutorials build the index directly from documents/chunks/nodes and/or a vector store.