Below's the code which i used to get the documents
import pandas as pd
import glob
import os
import json
class CustomCSVLoader:
def __init__(self, file_path, metadata_columns, encoding="utf-8"):
self.file_path = file_path
self.metadata_columns = metadata_columns
self.encoding = encoding
def load(self):
# Load the CSV file with the specified encoding
df = pd.read_csv(self.file_path, encoding=self.encoding)
# Filter the dataframe based on the specified metadata columns and add any additional columns if needed
if self.metadata_columns:
df = df[self.metadata_columns + ['Text']]
# Convert the dataframe to a dictionary, then to the desired format
my_dict = json.loads(df.to_json(orient='index'))
documents = self._process_chunks(my_dict)
return documents
def _process_chunks(self, my_dict):
chunks = [item for item in my_dict.values()]
for chunk in chunks:
# Assuming get_token_set is a function you've defined elsewhere
chunk.update({'token_set': get_token_set(chunk['Text'])})
# Assuming Document is a class you've defined elsewhere and removekey is a function to remove keys from a dict
docs = [Document(page_content=item['Text'], metadata=removekey(item, 'Text')) for item in chunks]
return docs
def load_csv_files(folder_path):
csv_files = glob.glob(os.path.join(folder_path, '*.csv'))
metadata_columns = ['x1', 'y1', 'x2', 'y2', 'page_no', 'source_url', 'doc_type', 'company', 'doc_year']
documents = []
for file_path in csv_files:
loader = CustomCSVLoader(file_path=file_path, metadata_columns=metadata_columns)
documents.extend(loader.load())
return documents
folder_path = 'data'
docs= load_csv_files(folder_path)
print(docs[0])
below’s the output of the docs[0]
Document(page_content='what's happening over there.', metadata={'x1': 198.7965924442, 'y1': 204.7538135126, 'x2': 1448.4322473817, 'y2': 1536.8232888756, 'page_no': 1, 'source_url': 'data.pdf', 'doc_type': 'sports', 'company': 'BCCI', 'doc_year': 2022, 'token_set': {'publicly', 'the', 'that'}})
then i wrote a code which will ingest the above data into MongoDBAtlas
db = MongoDBAtlasVectorSearch.from_documents(
documents=docs,
embedding=embedding_model,
collection=mongodb_collection,
index_name="cricket"
)
and below is the output
InvalidDocument: cannot encode object: {{'publicly', 'the', 'that'}})
I successfully imported data into MongoDB Atlas using the CSVLoader from Langchain. However, when I attempted to modify the data by adding custom values and import it again without using CSVLoader, I encountered issues. This happened despite maintaining the same data format. I am looking for guidance on how to import data with custom modifications into MongoDB Atlas without using CSVLoader.