Post processing call after all files completion #716


Merged
Changes from all commits (59 commits)
616eb08
Dev To STAGING (#532)
prakriti-solankey Jul 11, 2024
82661c5
Dev (#535)
kartikpersistent Jul 12, 2024
85e9217
Dev (#537)
kartikpersistent Jul 12, 2024
6cf6834
Fix typo: correct 'josn_obj' to 'json_obj' (#697)
destiny966113 Aug 20, 2024
6274314
Merge branch 'DEV' of https://github.com/neo4j-labs/llm-graph-builder…
kartikpersistent Aug 20, 2024
47d8330
conflict solved
prakriti-solankey Aug 20, 2024
3d749e2
lint fixes
kartikpersistent Aug 20, 2024
2b77b84
Merge branch 'DEV' of https://github.com/neo4j-labs/llm-graph-builder…
prakriti-solankey Aug 20, 2024
b929a20
Fix typo: correct 'josn_obj' to 'json_obj' (#697)
destiny966113 Aug 20, 2024
d18528b
lint fixes
kartikpersistent Aug 20, 2024
8c8bae7
Merge branch 'DEV' of https://github.com/neo4j-labs/llm-graph-builder…
prakriti-solankey Aug 20, 2024
271e8d5
Merge branch 'DEV' of https://github.com/neo4j-labs/llm-graph-builder…
prakriti-solankey Aug 20, 2024
e078e21
connection _check
prakriti-solankey Aug 20, 2024
7c66bf2
Fix typo: correct 'josn_obj' to 'json_obj' (#697)
destiny966113 Aug 20, 2024
477fda0
lint fixes
kartikpersistent Aug 20, 2024
bacbcfc
connection _check
prakriti-solankey Aug 20, 2024
885c345
Merge branch 'DEV' of https://github.com/neo4j-labs/llm-graph-builder…
prakriti-solankey Aug 20, 2024
9599c25
Merge branch 'STAGING' into DEV
kartikpersistent Aug 21, 2024
ad8f7bb
Dev (#701)
kartikpersistent Aug 21, 2024
d3f4661
Merge branch 'STAGING' of https://github.com/neo4j-labs/llm-graph-bui…
prakriti-solankey Aug 21, 2024
2fc3e13
Chatbot changes (#700)
vasanthasaikalluri Aug 21, 2024
c0cca99
Merge branch 'DEV' of https://github.com/neo4j-labs/llm-graph-builder…
prakriti-solankey Aug 21, 2024
279d820
fixed issue delete entities return count
praveshkumar1988 Aug 21, 2024
30f92cd
removed specified version due to dependency clashes between versions
praveshkumar1988 Aug 21, 2024
832afd9
Merge branch 'DEV' of https://github.com/neo4j-labs/llm-graph-builder…
praveshkumar1988 Aug 21, 2024
edcff3b
updated script"integration test cases"
abhishekkumar-27 Aug 21, 2024
71ed29a
decreased the delay for polling API
kartikpersistent Aug 21, 2024
7eb7605
Merge branch 'DEV' of https://github.com/neo4j-labs/llm-graph-builder…
kartikpersistent Aug 21, 2024
1af5877
Graph enhancements (#696)
prakriti-solankey Aug 21, 2024
4575198
changed chat mode names (#702)
vasanthasaikalluri Aug 21, 2024
2d76462
Merge branch 'STAGING' into DEV
prakriti-solankey Aug 21, 2024
2b63cee
DEV to STAGING (#703)
vasanthasaikalluri Aug 21, 2024
a6ee345
env changes
kartikpersistent Aug 21, 2024
dc351a0
used axios instance for network calls
kartikpersistent Aug 22, 2024
a241c6b
disabled the tooltip when dropdown is in open state
kartikpersistent Aug 22, 2024
426906b
format fixes + chat mode naming changes
kartikpersistent Aug 22, 2024
78ec3fd
Merge branch 'DEV' of https://github.com/neo4j-labs/llm-graph-builder…
prakriti-solankey Aug 22, 2024
7d0b431
mode added to info model for entities
prakriti-solankey Aug 22, 2024
e60a6e3
Merge branch 'STAGING' into DEV
prakriti-solankey Aug 22, 2024
0891e2e
Dev (#705)
prakriti-solankey Aug 22, 2024
f2b1e17
Issue fixed: list index out of range while getting status of document node
praveshkumar1988 Aug 26, 2024
8159e2c
default modes in staging
kartikpersistent Aug 26, 2024
94c493e
processing count updated on cancel
kartikpersistent Aug 26, 2024
489b5ae
Merge branch 'DEV' of https://github.com/neo4j-labs/llm-graph-builder…
kartikpersistent Aug 26, 2024
b77ddd0
processing count update fix on cancel
kartikpersistent Aug 26, 2024
3ef88b6
format fixes
kartikpersistent Aug 27, 2024
4e2f909
Merge branch 'STAGING' into DEV
kartikpersistent Aug 27, 2024
dadaa28
remove whitespace for environment variable which due to an error "xxx …
edenbuaa Aug 27, 2024
4c6f676
updated disconnected nodes
abhishekkumar-27 Aug 27, 2024
568db51
updated disconnected nodes
abhishekkumar-27 Aug 27, 2024
501ec6b
fix: Processed count update on failed condition
kartikpersistent Aug 28, 2024
9941474
added disconnected and up nodes
abhishekkumar-27 Aug 28, 2024
cac1963
resetting the alert message on success scenario
kartikpersistent Aug 29, 2024
fd7a4bb
Merge branch 'DEV' of https://github.com/neo4j-labs/llm-graph-builder…
kartikpersistent Aug 29, 2024
d010e41
populate graph schema
abhishekkumar-27 Aug 30, 2024
b3a00ac
not clearing the password when there is error scenario
kartikpersistent Aug 30, 2024
77b06db
fixed the vector index loading issue
kartikpersistent Aug 30, 2024
d166290
fix: empty credentials payload for recreate vector index api
kartikpersistent Aug 30, 2024
68dda74
invoking the post processing after all processing completion
kartikpersistent Sep 2, 2024
2 changes: 1 addition & 1 deletion README.md
@@ -149,7 +149,7 @@ Allow unauthenticated request : Yes
| VITE_LLM_MODELS | Mandatory | diffbot,openai-gpt-3.5,openai-gpt-4o | Models available for selection on the frontend, used for entities extraction and Q&A
| VITE_CHAT_MODES | Mandatory | vector,graph+vector,graph,hybrid | Chat modes available for Q&A
| VITE_ENV | Mandatory | DEV or PROD | Environment variable for the app |
| VITE_TIME_PER_CHUNK | Optional | 4 | Time per chunk for processing |
| VITE_TIME_PER_PAGE | Optional | 50 | Time per page for processing |
| VITE_CHUNK_SIZE | Optional | 5242880 | Size of each chunk of file for upload |
| VITE_GOOGLE_CLIENT_ID | Optional | | Client ID for Google authentication |
| GCS_FILE_CACHE | Optional | False | If set to True, will save the files to process into GCS. If set to False, will save the files locally |
1 change: 1 addition & 0 deletions backend/Performance_test.py
@@ -94,6 +94,7 @@ def performance_main():
for _ in range(CONCURRENT_REQUESTS):
futures.append(executor.submit(post_request_chunk))

# Chatbot request futures
# Chatbot request futures
# for message in CHATBOT_MESSAGES:
# futures.append(executor.submit(chatbot_request, message))
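The performance script above fans extraction requests out over a ThreadPoolExecutor; the behaviour this PR adds — firing post-processing only once every file has finished — can be mimicked in the same script. The following is a hedged sketch, not part of the diff: the task names come from the /post_processing endpoint in score.py below, while the helper name and form-field layout are assumptions.

```python
# Hypothetical helper, not part of this PR: wait for every extraction future,
# then trigger post-processing exactly once.
import requests
from concurrent.futures import as_completed

def post_process_after_all(futures, base_url, form_params):
    # Block until every submitted request has completed (raises if any failed).
    for future in as_completed(futures):
        future.result()
    # Task names mirror the ones handled by /post_processing in score.py;
    # the form-field layout here is an assumption, not the verified API shape.
    tasks = [
        "materialize_text_chunk_similarities",
        "enable_hybrid_search_and_fulltext_search_in_bloom",
        "materialize_entity_similarities",
    ]
    return requests.post(f"{base_url}/post_processing",
                         data={**form_params, "tasks": tasks})
```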
24 changes: 12 additions & 12 deletions backend/requirements.txt
@@ -69,18 +69,18 @@ jsonpath-python==1.0.6
jsonpointer==2.4
json-repair==0.25.2
kiwisolver==1.4.5
langchain==0.2.8
langchain-aws==0.1.9
langchain-anthropic==0.1.19
langchain-fireworks==0.1.4
langchain-google-genai==1.0.7
langchain-community==0.2.7
langchain-core==0.2.19
langchain-experimental==0.0.62
langchain-google-vertexai==1.0.6
langchain-groq==0.1.6
langchain-openai==0.1.14
langchain-text-splitters==0.2.2
langchain
langchain-aws
langchain-anthropic
langchain-fireworks
langchain-google-genai
langchain-community
langchain-core
langchain-experimental
langchain-google-vertexai
langchain-groq
langchain-openai
langchain-text-splitters
langdetect==1.0.9
langsmith==0.1.83
layoutparser==0.3.4
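With the langchain pins removed (per the "removed specified version due to dependency clashes between versions" commit), the versions actually installed can drift between environments. An optional, standard-library-only sketch for logging what was resolved at startup; the package selection is illustrative:

```python
# Optional diagnostic, not part of this PR: print the langchain versions that
# pip actually resolved, since requirements.txt no longer pins them.
from importlib.metadata import PackageNotFoundError, version

for pkg in ("langchain", "langchain-core", "langchain-community", "langchain-openai"):
    try:
        print(f"{pkg}=={version(pkg)}")
    except PackageNotFoundError:
        print(f"{pkg} is not installed")
```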
51 changes: 26 additions & 25 deletions backend/score.py
@@ -105,8 +105,8 @@ async def create_source_knowledge_graph_url(
return create_api_response('Failed',message='source_type is other than accepted source')

message = f"Source Node created successfully for source type: {source_type} and source: {source}"
josn_obj = {'api_name':'url_scan','db_url':uri,'url_scanned_file':lst_file_name, 'source_url':source_url, 'wiki_query':wiki_query, 'logging_time': formatted_time(datetime.now(timezone.utc))}
logger.log_struct(josn_obj)
json_obj = {'api_name':'url_scan','db_url':uri,'url_scanned_file':lst_file_name, 'source_url':source_url, 'wiki_query':wiki_query, 'logging_time': formatted_time(datetime.now(timezone.utc))}
logger.log_struct(json_obj)
return create_api_response("Success",message=message,success_count=success_count,failed_count=failed_count,file_name=lst_file_name)
except Exception as e:
error_message = str(e)
@@ -208,9 +208,9 @@ async def extract_knowledge_graph_from_file(
else:
logging.info(f'Deleted File Path: {merged_file_path} and Deleted File Name : {file_name}')
delete_uploaded_local_file(merged_file_path,file_name)
josn_obj = {'message':message,'error_message':error_message, 'file_name': file_name,'status':'Failed','db_url':uri,'failed_count':1, 'source_type': source_type, 'source_url':source_url, 'wiki_query':wiki_query, 'logging_time': formatted_time(datetime.now(timezone.utc))}
logger.log_struct(josn_obj)
logging.exception(f'File Failed in extraction: {josn_obj}')
json_obj = {'message':message,'error_message':error_message, 'file_name': file_name,'status':'Failed','db_url':uri,'failed_count':1, 'source_type': source_type, 'source_url':source_url, 'wiki_query':wiki_query, 'logging_time': formatted_time(datetime.now(timezone.utc))}
logger.log_struct(json_obj)
logging.exception(f'File Failed in extraction: {json_obj}')
return create_api_response('Failed', message=message + error_message[:100], error=error_message, file_name = file_name)
finally:
gc.collect()
@@ -225,8 +225,8 @@ async def get_source_list(uri:str, userName:str, password:str, database:str=None
if " " in uri:
uri = uri.replace(" ","+")
result = await asyncio.to_thread(get_source_list_from_graph,uri,userName,decoded_password,database)
josn_obj = {'api_name':'sources_list','db_url':uri, 'logging_time': formatted_time(datetime.now(timezone.utc))}
logger.log_struct(josn_obj)
json_obj = {'api_name':'sources_list','db_url':uri, 'logging_time': formatted_time(datetime.now(timezone.utc))}
logger.log_struct(json_obj)
return create_api_response("Success",data=result)
except Exception as e:
job_status = "Failed"
Expand All @@ -243,19 +243,20 @@ async def post_processing(uri=Form(), userName=Form(), password=Form(), database

if "materialize_text_chunk_similarities" in tasks:
await asyncio.to_thread(update_graph, graph)
josn_obj = {'api_name': 'post_processing/materialize_text_chunk_similarities', 'db_url': uri, 'logging_time': formatted_time(datetime.now(timezone.utc))}
logger.log_struct(josn_obj)
json_obj = {'api_name': 'post_processing/update_similarity_graph', 'db_url': uri, 'logging_time': formatted_time(datetime.now(timezone.utc))}
logger.log_struct(json_obj)
logging.info(f'Updated KNN Graph')

if "enable_hybrid_search_and_fulltext_search_in_bloom" in tasks:
await asyncio.to_thread(create_fulltext, uri=uri, username=userName, password=password, database=database,type="entities")
await asyncio.to_thread(create_fulltext, uri=uri, username=userName, password=password, database=database,type="keyword")
# await asyncio.to_thread(create_fulltext, uri=uri, username=userName, password=password, database=database,type="keyword")
josn_obj = {'api_name': 'post_processing/enable_hybrid_search_and_fulltext_search_in_bloom', 'db_url': uri, 'logging_time': formatted_time(datetime.now(timezone.utc))}
logger.log_struct(josn_obj)
logging.info(f'Full Text index created')
if os.environ.get('ENTITY_EMBEDDING','False').upper()=="TRUE" and "materialize_entity_similarities" in tasks:
await asyncio.to_thread(create_entity_embedding, graph)
josn_obj = {'api_name': 'post_processing/materialize_entity_similarities', 'db_url': uri, 'logging_time': formatted_time(datetime.now(timezone.utc))}
logger.log_struct(josn_obj)
json_obj = {'api_name': 'post_processing/create_entity_embedding', 'db_url': uri, 'logging_time': formatted_time(datetime.now(timezone.utc))}
logger.log_struct(json_obj)
logging.info(f'Entity Embeddings created')
return create_api_response('Success', message='All tasks completed successfully')

@@ -284,8 +285,8 @@ async def chat_bot(uri=Form(),model=Form(None),userName=Form(), password=Form(),
logging.info(f"Total Response time is {total_call_time:.2f} seconds")
result["info"]["response_time"] = round(total_call_time, 2)

josn_obj = {'api_name':'chat_bot','db_url':uri,'session_id':session_id, 'logging_time': formatted_time(datetime.now(timezone.utc))}
logger.log_struct(josn_obj)
json_obj = {'api_name':'chat_bot','db_url':uri,'session_id':session_id, 'logging_time': formatted_time(datetime.now(timezone.utc))}
logger.log_struct(json_obj)
return create_api_response('Success',data=result)
except Exception as e:
job_status = "Failed"
@@ -301,8 +302,8 @@ async def chunk_entities(uri=Form(),userName=Form(), password=Form(), chunk_ids=
try:
logging.info(f"URI: {uri}, Username: {userName}, chunk_ids: {chunk_ids}")
result = await asyncio.to_thread(get_entities_from_chunkids,uri=uri, username=userName, password=password, chunk_ids=chunk_ids)
josn_obj = {'api_name':'chunk_entities','db_url':uri, 'logging_time': formatted_time(datetime.now(timezone.utc))}
logger.log_struct(josn_obj)
json_obj = {'api_name':'chunk_entities','db_url':uri, 'logging_time': formatted_time(datetime.now(timezone.utc))}
logger.log_struct(json_obj)
return create_api_response('Success',data=result)
except Exception as e:
job_status = "Failed"
@@ -329,8 +330,8 @@ async def graph_query(
password=password,
document_names=document_names
)
josn_obj = {'api_name':'graph_query','db_url':uri,'document_names':document_names, 'logging_time': formatted_time(datetime.now(timezone.utc))}
logger.log_struct(josn_obj)
json_obj = {'api_name':'graph_query','db_url':uri,'document_names':document_names, 'logging_time': formatted_time(datetime.now(timezone.utc))}
logger.log_struct(json_obj)
return create_api_response('Success', data=result)
except Exception as e:
job_status = "Failed"
@@ -379,8 +380,8 @@ async def upload_large_file_into_chunks(file:UploadFile = File(...), chunkNumber
try:
graph = create_graph_database_connection(uri, userName, password, database)
result = await asyncio.to_thread(upload_file, graph, model, file, chunkNumber, totalChunks, originalname, uri, CHUNK_DIR, MERGED_DIR)
josn_obj = {'api_name':'upload','db_url':uri, 'logging_time': formatted_time(datetime.now(timezone.utc))}
logger.log_struct(josn_obj)
json_obj = {'api_name':'upload','db_url':uri, 'logging_time': formatted_time(datetime.now(timezone.utc))}
logger.log_struct(json_obj)
if int(chunkNumber) == int(totalChunks):
return create_api_response('Success',data=result, message='Source Node Created Successfully')
else:
@@ -401,8 +402,8 @@ async def get_structured_schema(uri=Form(), userName=Form(), password=Form(), da
graph = create_graph_database_connection(uri, userName, password, database)
result = await asyncio.to_thread(get_labels_and_relationtypes, graph)
logging.info(f'Schema result from DB: {result}')
josn_obj = {'api_name':'schema','db_url':uri, 'logging_time': formatted_time(datetime.now(timezone.utc))}
logger.log_struct(josn_obj)
json_obj = {'api_name':'schema','db_url':uri, 'logging_time': formatted_time(datetime.now(timezone.utc))}
logger.log_struct(json_obj)
return create_api_response('Success', data=result)
except Exception as e:
message="Unable to get the labels and relationtypes from neo4j database"
@@ -470,8 +471,8 @@ async def delete_document_and_entities(uri=Form(),
result, files_list_size = await asyncio.to_thread(graphDb_data_Access.delete_file_from_graph, filenames, source_types, deleteEntities, MERGED_DIR, uri)
# entities_count = result[0]['deletedEntities'] if 'deletedEntities' in result[0] else 0
message = f"Deleted {files_list_size} documents with entities from database"
josn_obj = {'api_name':'delete_document_and_entities','db_url':uri, 'logging_time': formatted_time(datetime.now(timezone.utc))}
logger.log_struct(josn_obj)
json_obj = {'api_name':'delete_document_and_entities','db_url':uri, 'logging_time': formatted_time(datetime.now(timezone.utc))}
logger.log_struct(json_obj)
return create_api_response('Success',message=message)
except Exception as e:
job_status = "Failed"
@@ -627,4 +628,4 @@ async def merge_duplicate_nodes(uri=Form(), userName=Form(), password=Form(), da
gc.collect()

if __name__ == "__main__":
uvicorn.run(app)
uvicorn.run(app)
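Beyond the post-processing wiring, most of the score.py changes rename josn_obj to json_obj and adjust the logged api_name values. Since every endpoint rebuilds the same logging dict by hand, a shared helper would make this class of typo harder to reintroduce. The sketch below is a hypothetical refactor, not code from this PR; the project's own formatted_time() helper is stood in by isoformat() to keep it self-contained.

```python
# Hypothetical refactor, not in this PR: one place to build the structured
# log entry that each endpoint currently assembles by hand.
from datetime import datetime, timezone

def build_log_entry(api_name: str, db_url: str, **extra) -> dict:
    return {
        "api_name": api_name,
        "db_url": db_url,
        # score.py uses its own formatted_time() helper; isoformat() stands in here.
        "logging_time": datetime.now(timezone.utc).isoformat(),
        **extra,
    }

# Example usage: logger.log_struct(build_log_entry('sources_list', uri))
```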
28 changes: 14 additions & 14 deletions backend/src/QA_integration_new.py
@@ -41,26 +41,26 @@

def get_neo4j_retriever(graph, retrieval_query,document_names,mode,index_name="vector",keyword_index="keyword", search_k=CHAT_SEARCH_KWARG_K, score_threshold=CHAT_SEARCH_KWARG_SCORE_THRESHOLD):
try:
if mode == "hybrid":
# neo_db = Neo4jVector.from_existing_graph(
# embedding=EMBEDDING_FUNCTION,
# index_name=index_name,
# retrieval_query=retrieval_query,
# graph=graph,
# search_type="hybrid",
# node_label="Chunk",
# embedding_node_property="embedding",
# text_node_properties=["text"]
# # keyword_index_name=keyword_index
# )
neo_db = Neo4jVector.from_existing_index(
if mode == "fulltext" or mode == "graph + vector + fulltext":
neo_db = Neo4jVector.from_existing_graph(
embedding=EMBEDDING_FUNCTION,
index_name=index_name,
retrieval_query=retrieval_query,
graph=graph,
search_type="hybrid",
node_label="Chunk",
embedding_node_property="embedding",
text_node_properties=["text"],
keyword_index_name=keyword_index
)
# neo_db = Neo4jVector.from_existing_index(
# embedding=EMBEDDING_FUNCTION,
# index_name=index_name,
# retrieval_query=retrieval_query,
# graph=graph,
# search_type="hybrid",
# keyword_index_name=keyword_index
# )
logging.info(f"Successfully retrieved Neo4jVector index '{index_name}' and keyword index '{keyword_index}'")
else:
neo_db = Neo4jVector.from_existing_index(
@@ -374,7 +374,7 @@ def QA_RAG(graph, model, question, document_names,session_id, mode):
"user": "chatbot"
}
return result
elif mode == "vector" or mode == "hybrid":
elif mode == "vector" or mode == "fulltext":
retrieval_query = VECTOR_SEARCH_QUERY
else:
retrieval_query = VECTOR_GRAPH_SEARCH_QUERY.format(no_of_entites=VECTOR_GRAPH_SEARCH_ENTITY_LIMIT)
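The retriever change above routes the new "fulltext" and "graph + vector + fulltext" chat modes through Neo4jVector.from_existing_graph with hybrid search, while other modes keep the plain vector index. Below is a condensed, hedged sketch of that branching; parameter values are illustrative and the authoritative code is get_neo4j_retriever in the diff.

```python
# Condensed sketch of the mode handling shown above; not a drop-in replacement.
from langchain_community.vectorstores import Neo4jVector

def build_vector_store(graph, embedding, retrieval_query, mode,
                       index_name="vector", keyword_index="keyword"):
    if mode in ("fulltext", "graph + vector + fulltext"):
        # Hybrid search: combine the chunk vector index with the keyword
        # full-text index so lexical matches surface alongside embedding hits.
        return Neo4jVector.from_existing_graph(
            embedding=embedding,
            index_name=index_name,
            retrieval_query=retrieval_query,
            graph=graph,
            search_type="hybrid",
            node_label="Chunk",
            embedding_node_property="embedding",
            text_node_properties=["text"],
            keyword_index_name=keyword_index,
        )
    # Every other mode falls back to the existing vector index only.
    return Neo4jVector.from_existing_index(
        embedding=embedding,
        index_name=index_name,
        retrieval_query=retrieval_query,
        graph=graph,
    )
```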