Hello everyone,
I am working on a project where I need to add a custom field (target_id) to the metadata of a large set of documents before inserting them into MongoDB for vector search indexing. Here is the approach I currently use:
for doc in docs:
    doc.metadata["target_id"] = target_id
# Insert docs into MongoDB and proceed with vector search indexing
This method loops over every chunk just to set one field, which seems inefficient for large datasets. So my question is: is there a better way to populate the vector database with a custom metadata field than looping over the docs one by one?
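One idea I had, though I am not sure it is actually better: LangChain's split_documents copies each parent document's metadata onto all of its chunks, so the field could be set on the few loaded documents before splitting instead of on every chunk afterwards. It still loops, just over far fewer objects. A minimal sketch of that variant, with data and text_splitter as in my load_document below:

# Variant: tag the raw loader output before chunking.
# split_documents copies each parent document's metadata onto all of
# its chunks, so every chunk inherits target_id without a per-chunk loop.
for doc in data:
    doc.metadata["target_id"] = target_id
docs = text_splitter.split_documents(data)  # chunks carry target_id already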
Here is the full code I implemented:
from flask import Blueprint, request, jsonify
from langchain.text_splitter import RecursiveCharacterTextSplitter

from .db import MongoDBHandler

load_blueprint = Blueprint('upload', __name__)


class UnsupportedDocumentType(Exception):
    pass


@load_blueprint.route('/load', methods=['POST'])
def load():
    body = request.json
    doc_type = body.get('type')
    url = body.get('url')
    target_id = body.get('target_id')
    try:
        docs = load_document(doc_type, url)
        # Attach the custom metadata field to every chunk before insertion
        for doc in docs:
            doc.metadata["target_id"] = target_id
        db_handler = MongoDBHandler()
        db_handler.vector_search_from_documents(
            "suppalism",                      # database
            "knowledges",                     # collection
            "knowledge_vector_search_index",  # vector search index
            docs
        )
        return jsonify({"message": "Processed " + doc_type + " file"})
    except UnsupportedDocumentType as e:
        return jsonify({"message": str(e)}), 400
    except Exception as e:
        return jsonify({"message": str(e)}), 500


def load_document(doc_type, url):
    # Map each supported type to the dotted path of its loader class
    loaders = {
        'pdf': 'langchain_community.document_loaders.PyPDFLoader',
        'doc': 'langchain_community.document_loaders.UnstructuredURLLoader',
        'video': 'langchain_community.document_loaders.YoutubeLoader',
        'website': 'langchain_community.document_loaders.WebBaseLoader'
    }
    if doc_type not in loaders:
        raise UnsupportedDocumentType('Document format is not supported!')
    # Import the loader class dynamically from its dotted path
    module_name, class_name = loaders[doc_type].rsplit('.', 1)
    LoaderClass = getattr(__import__(module_name, fromlist=[class_name]), class_name)
    # Each loader is constructed slightly differently
    if doc_type == 'video':
        loader = LoaderClass.from_youtube_url(url, add_video_info=True)
    elif doc_type == 'website':
        loader = LoaderClass(url)
    elif doc_type == 'doc':
        loader = LoaderClass(urls=[url])
    else:  # 'pdf'
        loader = LoaderClass(url)
    data = loader.load()
    # Split the loaded documents into small overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=200,
        chunk_overlap=20,
    )
    docs = text_splitter.split_documents(data)
    return docs
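For completeness, this is roughly how I call the endpoint while testing. The host/port and the app that registers load_blueprint are from my local setup and not shown above, and the PDF URL is just a placeholder:

import requests

# Example call to the /load endpoint (assumes the blueprint is registered
# on a Flask app served at localhost:5000; adjust to your setup).
resp = requests.post(
    "http://localhost:5000/load",
    json={
        "type": "pdf",
        "url": "https://example.com/some.pdf",
        "target_id": "abc123",
    },
)
print(resp.json())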