Skip to content

Commit 6e29f89

Browse files
aashipandya, prakriti-solankey, and praveshkumar1988
authored
DO NOT MERGE - Document, chunk node labels and relation labels updated with underscore (#626)
* Document and Chunk node label updated
* added underscores for all relationship types
* updated queries from DEV
* updated Document and Chunk nodes after DEV merge
* Label Changes
* Exclude labels from schema API which label starts with __
---------
Co-authored-by: Prakriti Solankey <[email protected]>
Co-authored-by: Pravesh Kumar <[email protected]>
1 parent 0799b31 commit 6e29f89

File tree

15 files changed

+128
-121
lines changed

15 files changed

+128
-121
lines changed

backend/src/QA_integration.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,18 +30,18 @@
3030

3131

3232
# RETRIEVAL_QUERY = """
33-
# WITH node, score, apoc.text.join([ (node)-[:HAS_ENTITY]->(e) | head(labels(e)) + ": "+ e.id],", ") as entities
34-
# MATCH (node)-[:PART_OF]->(d:Document)
33+
# WITH node, score, apoc.text.join([ (node)-[:__HAS_ENTITY__]->(e) | head(labels(e)) + ": "+ e.id],", ") as entities
34+
# MATCH (node)-[:__PART_OF__]->(d:Document)
3535
# WITH d, apoc.text.join(collect(node.text + "\n" + entities),"\n----\n") as text, avg(score) as score
3636
# RETURN text, score, {source: COALESCE(CASE WHEN d.url CONTAINS "None" THEN d.fileName ELSE d.url END, d.fileName)} as metadata
3737
# """
3838

3939
RETRIEVAL_QUERY = """
4040
WITH node as chunk, score
41-
MATCH (chunk)-[:PART_OF]->(d:Document)
41+
MATCH (chunk)-[:__PART_OF__]->(d:__Document__)
4242
CALL { WITH chunk
43-
MATCH (chunk)-[:HAS_ENTITY]->(e)
44-
MATCH path=(e)(()-[rels:!HAS_ENTITY&!PART_OF]-()){0,3}(:!Chunk&!Document)
43+
MATCH (chunk)-[:__HAS_ENTITY__]->(e)
44+
MATCH path=(e)(()-[rels:!__HAS_ENTITY__&!__PART_OF__]-()){0,3}(:!__Chunk__&!__Document__)
4545
UNWIND rels as r
4646
RETURN collect(distinct r) as rels
4747
}

backend/src/QA_optimization.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,13 +46,13 @@ async def _vector_embed_results(self):
4646
t=datetime.now()
4747
print("Vector embeddings start time",t)
4848
# retrieval_query="""
49-
# MATCH (node)-[:PART_OF]->(d:Document)
49+
# MATCH (node)-[:__PART_OF__]->(d:Document)
5050
# WITH d, apoc.text.join(collect(node.text),"\n----\n") as text, avg(score) as score
5151
# RETURN text, score, {source: COALESCE(CASE WHEN d.url CONTAINS "None" THEN d.fileName ELSE d.url END, d.fileName)} as metadata
5252
# """
5353
retrieval_query="""
54-
WITH node, score, apoc.text.join([ (node)-[:HAS_ENTITY]->(e) | head(labels(e)) + ": "+ e.id],", ") as entities
55-
MATCH (node)-[:PART_OF]->(d:Document)
54+
WITH node, score, apoc.text.join([ (node)-[:__HAS_ENTITY__]->(e) | head(labels(e)) + ": "+ e.id],", ") as entities
55+
MATCH (node)-[:__PART_OF__]->(d:__Document__)
5656
WITH d, apoc.text.join(collect(node.text + "\n" + entities),"\n----\n") as text, avg(score) as score
5757
RETURN text, score, {source: COALESCE(CASE WHEN d.url CONTAINS "None" THEN d.fileName ELSE d.url END, d.fileName)} as metadata
5858
"""

backend/src/chunkid_entities.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,12 @@
33
from src.graph_query import *
44

55
CHUNK_QUERY = """
6-
match (chunk:Chunk) where chunk.id IN $chunksIds
6+
match (chunk:__Chunk__) where chunk.id IN $chunksIds
77
8-
MATCH (chunk)-[:PART_OF]->(d:Document)
8+
MATCH (chunk)-[:__PART_OF__]->(d:__Document__)
99
CALL {WITH chunk
10-
MATCH (chunk)-[:HAS_ENTITY]->(e)
11-
MATCH path=(e)(()-[rels:!HAS_ENTITY&!PART_OF]-()){0,2}(:!Chunk&!Document)
10+
MATCH (chunk)-[:__HAS_ENTITY__]->(e)
11+
MATCH path=(e)(()-[rels:!__HAS_ENTITY__&!__PART_OF__]-()){0,2}(:!__Chunk__&!__Document__)
1212
UNWIND rels as r
1313
RETURN collect(distinct r) as rels
1414
}

backend/src/graphDB_dataAccess.py

Lines changed: 22 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ def update_exception_db(self, file_name, exp_msg):
2020
is_cancelled_status = result[0]['is_cancelled']
2121
if bool(is_cancelled_status) == True:
2222
job_status = 'Cancelled'
23-
self.graph.query("""MERGE(d:Document {fileName :$fName}) SET d.status = $status, d.errorMessage = $error_msg""",
23+
self.graph.query("""MERGE(d:__Document__ {fileName :$fName}) SET d.status = $status, d.errorMessage = $error_msg""",
2424
{"fName":file_name, "status":job_status, "error_msg":exp_msg})
2525
except Exception as e:
2626
error_message = str(e)
@@ -31,7 +31,7 @@ def create_source_node(self, obj_source_node:sourceNode):
3131
try:
3232
job_status = "New"
3333
logging.info("creating source node if does not exist")
34-
self.graph.query("""MERGE(d:Document {fileName :$fn}) SET d.fileSize = $fs, d.fileType = $ft ,
34+
self.graph.query("""MERGE(d:__Document__ {fileName :$fn}) SET d.fileSize = $fs, d.fileType = $ft ,
3535
d.status = $st, d.url = $url, d.awsAccessKeyId = $awsacc_key_id,
3636
d.fileSource = $f_source, d.createdAt = $c_at, d.updatedAt = $u_at,
3737
d.processingTime = $pt, d.errorMessage = $e_message, d.nodeCount= $n_count,
@@ -95,7 +95,7 @@ def update_source_node(self, obj_source_node:sourceNode):
9595
param= {"props":params}
9696

9797
print(f'Base Param value 1 : {param}')
98-
query = "MERGE(d:Document {fileName :$props.fileName}) SET d += $props"
98+
query = "MERGE(d:__Document__ {fileName :$props.fileName}) SET d += $props"
9999
logging.info("Update source node properties")
100100
self.graph.query(query,param)
101101
except Exception as e:
@@ -117,7 +117,7 @@ def get_source_list(self):
117117
sorting the list by the last updated date.
118118
"""
119119
logging.info("Get existing files list from graph")
120-
query = "MATCH(d:Document) WHERE d.fileName IS NOT NULL RETURN d ORDER BY d.updatedAt DESC"
120+
query = "MATCH(d:__Document__) WHERE d.fileName IS NOT NULL RETURN d ORDER BY d.updatedAt DESC"
121121
result = self.graph.query(query)
122122
list_of_json_objects = [entry['d'] for entry in result]
123123
return list_of_json_objects
@@ -131,10 +131,10 @@ def update_KNN_graph(self):
131131
knn_min_score = os.environ.get('KNN_MIN_SCORE')
132132
if len(index) > 0:
133133
logging.info('update KNN graph')
134-
self.graph.query("""MATCH (c:Chunk)
135-
WHERE c.embedding IS NOT NULL AND count { (c)-[:SIMILAR]-() } < 5
134+
self.graph.query("""MATCH (c:__Chunk__)
135+
WHERE c.embedding IS NOT NULL AND count { (c)-[:__SIMILAR__]-() } < 5
136136
CALL db.index.vector.queryNodes('vector', 6, c.embedding) yield node, score
137-
WHERE node <> c and score >= $score MERGE (c)-[rel:SIMILAR]-(node) SET rel.score = score
137+
WHERE node <> c and score >= $score MERGE (c)-[rel:__SIMILAR__]-(node) SET rel.score = score
138138
""",
139139
{"score":float(knn_min_score)}
140140
)
@@ -174,7 +174,7 @@ def execute_query(self, query, param=None):
174174

175175
def get_current_status_document_node(self, file_name):
176176
query = """
177-
MATCH(d:Document {fileName : $file_name}) RETURN d.status AS Status , d.processingTime AS processingTime,
177+
MATCH(d:__Document__ {fileName : $file_name}) RETURN d.status AS Status , d.processingTime AS processingTime,
178178
d.nodeCount AS nodeCount, d.model as model, d.relationshipCount as relationshipCount,
179179
d.total_pages AS total_pages, d.total_chunks AS total_chunks , d.fileSize as fileSize,
180180
d.is_cancelled as is_cancelled, d.processed_chunk as processed_chunk, d.fileSource as fileSource
@@ -197,23 +197,23 @@ def delete_file_from_graph(self, filenames, source_types, deleteEntities:str, me
197197
logging.info(f'Deleted File Path: {merged_file_path} and Deleted File Name : {file_name}')
198198
delete_uploaded_local_file(merged_file_path,file_name)
199199
query_to_delete_document="""
200-
MATCH (d:Document) where d.fileName in $filename_list and d.fileSource in $source_types_list
200+
MATCH (d:__Document__) where d.fileName in $filename_list and d.fileSource in $source_types_list
201201
with collect(d) as documents
202202
unwind documents as d
203-
optional match (d)<-[:PART_OF]-(c:Chunk)
203+
optional match (d)<-[:__PART_OF__]-(c:__Chunk__)
204204
detach delete c, d
205205
return count(*) as deletedChunks
206206
"""
207207
query_to_delete_document_and_entities="""
208-
MATCH (d:Document) where d.fileName in $filename_list and d.fileSource in $source_types_list
208+
MATCH (d:__Document__) where d.fileName in $filename_list and d.fileSource in $source_types_list
209209
with collect(d) as documents
210210
unwind documents as d
211-
optional match (d)<-[:PART_OF]-(c:Chunk)
211+
optional match (d)<-[:__PART_OF__]-(c:__Chunk__)
212212
// if delete-entities checkbox is set
213213
call { with c, documents
214-
match (c)-[:HAS_ENTITY]->(e)
214+
match (c)-[:__HAS_ENTITY__]->(e)
215215
// belongs to another document
216-
where not exists { (d2)<-[:PART_OF]-()-[:HAS_ENTITY]->(e) WHERE NOT d2 IN documents }
216+
where not exists { (d2)<-[:__PART_OF__]-()-[:__HAS_ENTITY__]->(e) WHERE NOT d2 IN documents }
217217
detach delete e
218218
return count(*) as entities
219219
}
@@ -231,17 +231,17 @@ def delete_file_from_graph(self, filenames, source_types, deleteEntities:str, me
231231

232232
def list_unconnected_nodes(self):
233233
query = """
234-
MATCH (e:!Chunk&!Document)
235-
WHERE NOT exists { (e)--(:!Chunk&!Document) }
236-
OPTIONAL MATCH (doc:Document)<-[:PART_OF]-(c:Chunk)-[:HAS_ENTITY]->(e)
234+
MATCH (e:!__Chunk__&!__Document__)
235+
WHERE NOT exists { (e)--(:!__Chunk__&!__Document__) }
236+
OPTIONAL MATCH (doc:__Document__)<-[:__PART_OF__]-(c:__Chunk__)-[:__HAS_ENTITY__]->(e)
237237
RETURN e {.*, embedding:null, elementId:elementId(e), labels:labels(e)} as e,
238238
collect(distinct doc.fileName) as documents, count(distinct c) as chunkConnections
239239
ORDER BY e.id ASC
240240
LIMIT 100
241241
"""
242242
query_total_nodes = """
243-
MATCH (e:!Chunk&!Document)
244-
WHERE NOT exists { (e)--(:!Chunk&!Document) }
243+
MATCH (e:!__Chunk__&!__Document__)
244+
WHERE NOT exists { (e)--(:!__Chunk__&!__Document__) }
245245
RETURN count(*) as total
246246
"""
247247
nodes_list = self.execute_query(query)
@@ -261,7 +261,7 @@ def get_duplicate_nodes_list(self):
261261
score_value = float(os.environ.get('DUPLICATE_SCORE_VALUE'))
262262
text_distance = int(os.environ.get('DUPLICATE_TEXT_DISTANCE'))
263263
query_duplicate_nodes = """
264-
MATCH (n:!Chunk&!Document) with n
264+
MATCH (n:!__Chunk__&!__Document__) with n
265265
WHERE n.embedding is not null and n.id is not null // and size(n.id) > 3
266266
WITH n ORDER BY count {{ (n)--() }} DESC, size(n.id) DESC // updated
267267
WITH collect(n) as nodes
@@ -289,7 +289,7 @@ def get_duplicate_nodes_list(self):
289289
where none(other in all where other <> nodes and size(other) > size(nodes) and size(apoc.coll.subtract(nodes, other))=0)
290290
return head(nodes) as n, tail(nodes) as similar
291291
}}
292-
OPTIONAL MATCH (doc:Document)<-[:PART_OF]-(c:Chunk)-[:HAS_ENTITY]->(n)
292+
OPTIONAL MATCH (doc:__Document__)<-[:__PART_OF__]-(c:__Chunk__)-[:__HAS_ENTITY__]->(n)
293293
{return_statement}
294294
"""
295295
return_query_duplicate_nodes = """
@@ -335,7 +335,7 @@ def drop_create_vector_index(self, is_vector_index_recreate):
335335
if is_vector_index_recreate == 'true':
336336
self.graph.query("""drop index vector""")
337337

338-
self.graph.query("""CREATE VECTOR INDEX `vector` if not exists for (c:Chunk) on (c.embedding)
338+
self.graph.query("""CREATE VECTOR INDEX `vector` if not exists for (c:__Chunk__) on (c.embedding)
339339
OPTIONS {indexConfig: {
340340
`vector.dimensions`: $dimensions,
341341
`vector.similarity_function`: 'cosine'

backend/src/graph_query.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ def get_completed_documents(driver):
162162
"""
163163
Retrieves the names of all documents with the status 'Completed' from the database.
164164
"""
165-
docs_query = "MATCH(node:Document {status:'Completed'}) RETURN node"
165+
docs_query = "MATCH(node:__Document__ {status:'Completed'}) RETURN node"
166166

167167
try:
168168
logging.info("Executing query to retrieve completed documents.")

backend/src/main.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -508,11 +508,13 @@ def get_labels_and_relationtypes(graph):
508508
query = """
509509
RETURN collect {
510510
CALL db.labels() yield label
511-
WHERE NOT label IN ['Chunk','_Bloom_Perspective_']
511+
WHERE NOT label IN ['__Chunk__','_Bloom_Perspective_']
512+
AND NOT label STARTS WITH ('__')
513+
AND NOT label ENDS WITH('__')
512514
return label order by label limit 100 } as labels,
513515
collect {
514516
CALL db.relationshipTypes() yield relationshipType as type
515-
WHERE NOT type IN ['PART_OF', 'NEXT_CHUNK', 'HAS_ENTITY', '_Bloom_Perspective_']
517+
WHERE NOT type IN ['__PART_OF__', '__NEXT_CHUNK__', '__HAS_ENTITY__', '_Bloom_Perspective_']
516518
return type order by type LIMIT 100 } as relationshipTypes
517519
"""
518520
graphDb_data_Access = graphDBdataAccess(graph)

backend/src/make_relationships.py

Lines changed: 24 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
def merge_relationship_between_chunk_and_entites(graph: Neo4jGraph, graph_documents_chunk_chunk_Id : list):
1313
batch_data = []
14-
logging.info("Create HAS_ENTITY relationship between chunks and entities")
14+
logging.info("Create __HAS_ENTITY__ relationship between chunks and entities")
1515
chunk_node_id_set = 'id:"{}"'
1616
for graph_doc_chunk_id in graph_documents_chunk_chunk_Id:
1717
for node in graph_doc_chunk_id['graph_doc'].nodes:
@@ -24,14 +24,14 @@ def merge_relationship_between_chunk_and_entites(graph: Neo4jGraph, graph_docume
2424
#node_id = node.id
2525
#Below query is also unable to change as parametrize because we can't make parameter of Label or node type
2626
#https://neo4j.com/docs/cypher-manual/current/syntax/parameters/
27-
#graph.query('MATCH(c:Chunk {'+chunk_node_id_set.format(graph_doc_chunk_id['chunk_id'])+'}) MERGE (n:'+ node.type +'{ id: "'+node_id+'"}) MERGE (c)-[:HAS_ENTITY]->(n)')
27+
#graph.query('MATCH(c:Chunk {'+chunk_node_id_set.format(graph_doc_chunk_id['chunk_id'])+'}) MERGE (n:'+ node.type +'{ id: "'+node_id+'"}) MERGE (c)-[:__HAS_ENTITY__]->(n)')
2828

2929
if batch_data:
3030
unwind_query = """
3131
UNWIND $batch_data AS data
32-
MATCH (c:Chunk {id: data.chunk_id})
32+
MATCH (c:__Chunk__ {id: data.chunk_id})
3333
CALL apoc.merge.node([data.node_type], {id: data.node_id}) YIELD node AS n
34-
MERGE (c)-[:HAS_ENTITY]->(n)
34+
MERGE (c)-[:__HAS_ENTITY__]->(n)
3535
"""
3636
graph.query(unwind_query, params={"batch_data": batch_data})
3737

@@ -55,9 +55,9 @@ def update_embedding_create_vector_index(graph, chunkId_chunkDoc_list, file_name
5555
"chunkId": row['chunk_id'],
5656
"embeddings": embeddings_arr
5757
})
58-
# graph.query("""MATCH (d:Document {fileName : $fileName})
59-
# MERGE (c:Chunk {id:$chunkId}) SET c.embedding = $embeddings
60-
# MERGE (c)-[:PART_OF]->(d)
58+
# graph.query("""MATCH (d:__Document__ {fileName : $fileName})
59+
# MERGE (c:__Chunk__ {id:$chunkId}) SET c.embedding = $embeddings
60+
# MERGE (c)-[:__PART_OF__]->(d)
6161
# """,
6262
# {
6363
# "fileName" : file_name,
@@ -67,7 +67,7 @@ def update_embedding_create_vector_index(graph, chunkId_chunkDoc_list, file_name
6767
# )
6868
# logging.info('create vector index on chunk embedding')
6969

70-
graph.query("""CREATE VECTOR INDEX `vector` if not exists for (c:Chunk) on (c.embedding)
70+
graph.query("""CREATE VECTOR INDEX `vector` if not exists for (c:__Chunk__) on (c.embedding)
7171
OPTIONS {indexConfig: {
7272
`vector.dimensions`: $dimensions,
7373
`vector.similarity_function`: 'cosine'
@@ -80,10 +80,10 @@ def update_embedding_create_vector_index(graph, chunkId_chunkDoc_list, file_name
8080

8181
query_to_create_embedding = """
8282
UNWIND $data AS row
83-
MATCH (d:Document {fileName: $fileName})
84-
MERGE (c:Chunk {id: row.chunkId})
83+
MATCH (d:__Document__ {fileName: $fileName})
84+
MERGE (c:__Chunk__ {id: row.chunkId})
8585
SET c.embedding = row.embeddings
86-
MERGE (c)-[:PART_OF]->(d)
86+
MERGE (c)-[:__PART_OF__]->(d)
8787
"""
8888
graph.query(query_to_create_embedding, params={"fileName":file_name, "data":data_for_query})
8989

@@ -134,44 +134,44 @@ def create_relation_between_chunks(graph, file_name, chunks: List[Document])->li
134134

135135
# create relationships between chunks
136136
if firstChunk:
137-
relationships.append({"type": "FIRST_CHUNK", "chunk_id": current_chunk_id})
137+
relationships.append({"type": "__FIRST_CHUNK__", "chunk_id": current_chunk_id})
138138
else:
139139
relationships.append({
140-
"type": "NEXT_CHUNK",
140+
"type": "__NEXT_CHUNK__",
141141
"previous_chunk_id": previous_chunk_id, # ID of previous chunk
142142
"current_chunk_id": current_chunk_id
143143
})
144144

145145
query_to_create_chunk_and_PART_OF_relation = """
146146
UNWIND $batch_data AS data
147-
MERGE (c:Chunk {id: data.id})
147+
MERGE (c:__Chunk__ {id: data.id})
148148
SET c.text = data.pg_content, c.position = data.position, c.length = data.length, c.fileName=data.f_name, c.content_offset=data.content_offset
149149
WITH data, c
150150
SET c.page_number = CASE WHEN data.page_number IS NOT NULL THEN data.page_number END,
151151
c.start_time = CASE WHEN data.start_time IS NOT NULL THEN data.start_time END,
152152
c.end_time = CASE WHEN data.end_time IS NOT NULL THEN data.end_time END
153153
WITH data, c
154-
MATCH (d:Document {fileName: data.f_name})
155-
MERGE (c)-[:PART_OF]->(d)
154+
MATCH (d:__Document__ {fileName: data.f_name})
155+
MERGE (c)-[:__PART_OF__]->(d)
156156
"""
157157
graph.query(query_to_create_chunk_and_PART_OF_relation, params={"batch_data": batch_data})
158158

159159
query_to_create_FIRST_relation = """
160160
UNWIND $relationships AS relationship
161-
MATCH (d:Document {fileName: $f_name})
162-
MATCH (c:Chunk {id: relationship.chunk_id})
163-
FOREACH(r IN CASE WHEN relationship.type = 'FIRST_CHUNK' THEN [1] ELSE [] END |
164-
MERGE (d)-[:FIRST_CHUNK]->(c))
161+
MATCH (d:__Document__ {fileName: $f_name})
162+
MATCH (c:__Chunk__ {id: relationship.chunk_id})
163+
FOREACH(r IN CASE WHEN relationship.type = '__FIRST_CHUNK__' THEN [1] ELSE [] END |
164+
MERGE (d)-[:__FIRST_CHUNK__]->(c))
165165
"""
166166
graph.query(query_to_create_FIRST_relation, params={"f_name": file_name, "relationships": relationships})
167167

168168
query_to_create_NEXT_CHUNK_relation = """
169169
UNWIND $relationships AS relationship
170-
MATCH (c:Chunk {id: relationship.current_chunk_id})
170+
MATCH (c:__Chunk__ {id: relationship.current_chunk_id})
171171
WITH c, relationship
172-
MATCH (pc:Chunk {id: relationship.previous_chunk_id})
173-
FOREACH(r IN CASE WHEN relationship.type = 'NEXT_CHUNK' THEN [1] ELSE [] END |
174-
MERGE (c)<-[:NEXT_CHUNK]-(pc))
172+
MATCH (pc:__Chunk__ {id: relationship.previous_chunk_id})
173+
FOREACH(r IN CASE WHEN relationship.type = '__NEXT_CHUNK__' THEN [1] ELSE [] END |
174+
MERGE (c)<-[:__NEXT_CHUNK__]-(pc))
175175
"""
176176
graph.query(query_to_create_NEXT_CHUNK_relation, params={"relationships": relationships})
177177

0 commit comments

Comments (0)