Skip to content

Commit 9e5890d

Browse files
Chunks to be created (#1015)
* Make the chunk creation limit configurable * Added chunk_to_be_created env value to the connect endpoint * Added Chunks To Be Processed * Update Content.tsx * Conditional display --------- Co-authored-by: Pravesh Kumar <[email protected]>
1 parent 1ba89d1 commit 9e5890d

File tree

15 files changed

+150
-117
lines changed

15 files changed

+150
-117
lines changed

backend/example.env

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ LLM_MODEL_CONFIG_ollama_llama3="model_name,model_local_url"
4444
YOUTUBE_TRANSCRIPT_PROXY="https://user:pass@domain:port"
4545
EFFECTIVE_SEARCH_RATIO=5
4646
GRAPH_CLEANUP_MODEL="openai_gpt_4o"
47-
CHUNKS_TO_BE_PROCESSED="50"
47+
CHUNKS_TO_BE_CREATED="50"
4848
BEDROCK_EMBEDDING_MODEL="model_name,aws_access_key,aws_secret_key,region_name" #model_name="amazon.titan-embed-text-v1"
4949
LLM_MODEL_CONFIG_bedrock_nova_micro_v1="model_name,aws_access_key,aws_secret_key,region_name" #model_name="amazon.nova-micro-v1:0"
5050
LLM_MODEL_CONFIG_bedrock_nova_lite_v1="model_name,aws_access_key,aws_secret_key,region_name" #model_name="amazon.nova-lite-v1:0"

backend/score.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -503,12 +503,14 @@ async def connect(uri=Form(), userName=Form(), password=Form(), database=Form())
503503
graph = create_graph_database_connection(uri, userName, password, database)
504504
result = await asyncio.to_thread(connection_check_and_get_vector_dimensions, graph, database)
505505
gcs_file_cache = os.environ.get('GCS_FILE_CACHE')
506+
chunk_to_be_created = int(os.environ.get('CHUNKS_TO_BE_CREATED', '50'))
506507
end = time.time()
507508
elapsed_time = end - start
508509
json_obj = {'api_name':'connect','db_url':uri, 'userName':userName, 'database':database, 'count':1, 'logging_time': formatted_time(datetime.now(timezone.utc)), 'elapsed_api_time':f'{elapsed_time:.2f}'}
509510
logger.log_struct(json_obj, "INFO")
510511
result['elapsed_api_time'] = f'{elapsed_time:.2f}'
511512
result['gcs_file_cache'] = gcs_file_cache
513+
result['chunk_to_be_created']= chunk_to_be_created
512514
return create_api_response('Success',data=result)
513515
except Exception as e:
514516
job_status = "Failed"
@@ -981,8 +983,8 @@ async def backend_connection_configuration():
981983
database= os.getenv('NEO4J_DATABASE')
982984
password= os.getenv('NEO4J_PASSWORD')
983985
gcs_file_cache = os.environ.get('GCS_FILE_CACHE')
986+
chunk_to_be_created = int(os.environ.get('CHUNKS_TO_BE_CREATED', '50'))
984987
if all([uri, username, database, password]):
985-
print(f'uri:{uri}, usrName:{username}, database :{database}, password: {password}')
986988
graph = Neo4jGraph()
987989
logging.info(f'login connection status of object: {graph}')
988990
if graph is not None:
@@ -996,6 +998,7 @@ async def backend_connection_configuration():
996998
result["database"] = database
997999
result["password"] = encoded_password
9981000
result['gcs_file_cache'] = gcs_file_cache
1001+
result['chunk_to_be_created']= chunk_to_be_created
9991002
end = time.time()
10001003
elapsed_time = end - start
10011004
result['api_name'] = 'backend_connection_configuration'

backend/src/create_chunks.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import logging
55
from src.document_sources.youtube import get_chunks_with_timestamps, get_calculated_timestamps
66
import re
7+
import os
78

89
logging.basicConfig(format="%(asctime)s - %(message)s", level="INFO")
910

@@ -25,23 +26,28 @@ def split_file_into_chunks(self):
2526
"""
2627
logging.info("Split file into smaller chunks")
2728
text_splitter = TokenTextSplitter(chunk_size=200, chunk_overlap=20)
29+
chunk_to_be_created = int(os.environ.get('CHUNKS_TO_BE_CREATED', '50'))
2830
if 'page' in self.pages[0].metadata:
2931
chunks = []
3032
for i, document in enumerate(self.pages):
3133
page_number = i + 1
32-
for chunk in text_splitter.split_documents([document]):
33-
chunks.append(Document(page_content=chunk.page_content, metadata={'page_number':page_number}))
34+
if len(chunks) >= chunk_to_be_created:
35+
break
36+
else:
37+
for chunk in text_splitter.split_documents([document]):
38+
chunks.append(Document(page_content=chunk.page_content, metadata={'page_number':page_number}))
3439

3540
elif 'length' in self.pages[0].metadata:
3641
if len(self.pages) == 1 or (len(self.pages) > 1 and self.pages[1].page_content.strip() == ''):
3742
match = re.search(r'(?:v=)([0-9A-Za-z_-]{11})\s*',self.pages[0].metadata['source'])
3843
youtube_id=match.group(1)
3944
chunks_without_time_range = text_splitter.split_documents([self.pages[0]])
40-
chunks = get_calculated_timestamps(chunks_without_time_range, youtube_id)
41-
45+
chunks = get_calculated_timestamps(chunks_without_time_range[:chunk_to_be_created], youtube_id)
4246
else:
43-
chunks_without_time_range = text_splitter.split_documents(self.pages)
44-
chunks = get_chunks_with_timestamps(chunks_without_time_range)
47+
chunks_without_time_range = text_splitter.split_documents(self.pages)
48+
chunks = get_chunks_with_timestamps(chunks_without_time_range[:chunk_to_be_created])
4549
else:
4650
chunks = text_splitter.split_documents(self.pages)
51+
52+
chunks = chunks[:chunk_to_be_created]
4753
return chunks

backend/src/main.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -361,14 +361,12 @@ async def processing_source(uri, userName, password, database, model, file_name,
361361

362362
logging.info('Update the status as Processing')
363363
update_graph_chunk_processed = int(os.environ.get('UPDATE_GRAPH_CHUNKS_PROCESSED'))
364-
chunk_to_be_processed = int(os.environ.get('CHUNKS_TO_BE_PROCESSED', '50'))
364+
365365
# selected_chunks = []
366366
is_cancelled_status = False
367367
job_status = "Completed"
368368
for i in range(0, len(chunkId_chunkDoc_list), update_graph_chunk_processed):
369369
select_chunks_upto = i+update_graph_chunk_processed
370-
if select_chunks_upto > chunk_to_be_processed:
371-
break
372370
logging.info(f'Selected Chunks upto: {select_chunks_upto}')
373371
if len(chunkId_chunkDoc_list) <= select_chunks_upto:
374372
select_chunks_upto = len(chunkId_chunkDoc_list)

frontend/src/HOC/WithVisibility.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { VisibilityProps } from "../types";
1+
import { VisibilityProps } from '../types';
22

33
export function withVisibility<P>(WrappedComponent: React.ComponentType<P>) {
44
const VisibityControlled = (props: P & VisibilityProps) => {

frontend/src/components/Content.tsx

Lines changed: 46 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import { useEffect, useState, useMemo, useRef, Suspense, useReducer, useCallback } from 'react';
22
import FileTable from './FileTable';
3-
import { Button, Typography, Flex, StatusIndicator, useMediaQuery } from '@neo4j-ndl/react';
3+
import { Button, Typography, Flex, StatusIndicator, useMediaQuery, Callout } from '@neo4j-ndl/react';
44
import { useCredentials } from '../context/UserCredentials';
55
import { useFileContext } from '../context/UsersFiles';
66
import { extractAPI } from '../utils/FileAPI';
@@ -63,8 +63,15 @@ const Content: React.FC<ContentProps> = ({
6363
const [openGraphView, setOpenGraphView] = useState<boolean>(false);
6464
const [inspectedName, setInspectedName] = useState<string>('');
6565
const [documentName, setDocumentName] = useState<string>('');
66-
const { setUserCredentials, userCredentials, setConnectionStatus, isGdsActive, isReadOnlyUser, isGCSActive } =
67-
useCredentials();
66+
const {
67+
setUserCredentials,
68+
userCredentials,
69+
setConnectionStatus,
70+
isGdsActive,
71+
isReadOnlyUser,
72+
isGCSActive,
73+
chunksToBeProces,
74+
} = useCredentials();
6875
const [showConfirmationModal, setshowConfirmationModal] = useState<boolean>(false);
6976
const [showExpirationModal, setshowExpirationModal] = useState<boolean>(false);
7077
const [extractLoading, setextractLoading] = useState<boolean>(false);
@@ -106,7 +113,7 @@ const Content: React.FC<ContentProps> = ({
106113
);
107114
const [showDeletePopUp, setshowDeletePopUp] = useState<boolean>(false);
108115
const [deleteLoading, setdeleteLoading] = useState<boolean>(false);
109-
const hasSelections = useHasSelections(selectedNodes, selectedRels);
116+
const hasSelections = useHasSelections(selectedNodes, selectedRels);
110117

111118
const { updateStatusForLargeFiles } = useServerSideEvent(
112119
(inMinutes, time, fileName) => {
@@ -150,8 +157,10 @@ const Content: React.FC<ContentProps> = ({
150157
? postProcessingTasks.filter((task) => task !== 'graph_schema_consolidation')
151158
: postProcessingTasks
152159
: hasSelections
153-
? postProcessingTasks.filter((task) => task !== 'graph_schema_consolidation' && task !== 'enable_communities')
154-
: postProcessingTasks.filter((task) => task !== 'enable_communities');
160+
? postProcessingTasks.filter(
161+
(task) => task !== 'graph_schema_consolidation' && task !== 'enable_communities'
162+
)
163+
: postProcessingTasks.filter((task) => task !== 'enable_communities');
155164
const response = await postProcessing(userCredentials as UserCredentials, payload);
156165
if (response.data.status === 'Success') {
157166
const communityfiles = response.data?.data;
@@ -381,7 +390,11 @@ const Content: React.FC<ContentProps> = ({
381390
const addFilesToQueue = async (remainingFiles: CustomFile[]) => {
382391
if (!remainingFiles.length) {
383392
showNormalToast(
384-
<PostProcessingToast isGdsActive={isGdsActive} postProcessingTasks={postProcessingTasks} isSchema={hasSelections} />
393+
<PostProcessingToast
394+
isGdsActive={isGdsActive}
395+
postProcessingTasks={postProcessingTasks}
396+
isSchema={hasSelections}
397+
/>
385398
);
386399
try {
387400
const response = await postProcessing(userCredentials as UserCredentials, postProcessingTasks);
@@ -532,8 +545,9 @@ const Content: React.FC<ContentProps> = ({
532545
const handleOpenGraphClick = () => {
533546
const bloomUrl = process.env.VITE_BLOOM_URL;
534547
const uriCoded = userCredentials?.uri.replace(/:\d+$/, '');
535-
const connectURL = `${uriCoded?.split('//')[0]}//${userCredentials?.userName}@${uriCoded?.split('//')[1]}:${userCredentials?.port ?? '7687'
536-
}`;
548+
const connectURL = `${uriCoded?.split('//')[0]}//${userCredentials?.userName}@${uriCoded?.split('//')[1]}:${
549+
userCredentials?.port ?? '7687'
550+
}`;
537551
const encodedURL = encodeURIComponent(connectURL);
538552
const replacedUrl = bloomUrl?.replace('{CONNECT_URL}', encodedURL);
539553
window.open(replacedUrl, '_blank');
@@ -586,19 +600,19 @@ const Content: React.FC<ContentProps> = ({
586600
(response.data?.message as string).includes('Chunks are not created')
587601
) {
588602
showNormalToast(response.data.message as string);
589-
retryOnclose()
603+
retryOnclose();
590604
} else {
591605
const isStartFromBegining = retryoption === RETRY_OPIONS[0] || retryoption === RETRY_OPIONS[1];
592606
setFilesData((prev) => {
593607
return prev.map((f) => {
594608
return f.name === filename
595609
? {
596-
...f,
597-
status: 'Ready to Reprocess',
598-
processingProgress: isStartFromBegining ? 0 : f.processingProgress,
599-
nodesCount: isStartFromBegining ? 0 : f.nodesCount,
600-
relationshipsCount: isStartFromBegining ? 0 : f.relationshipsCount,
601-
}
610+
...f,
611+
status: 'Ready to Reprocess',
612+
processingProgress: isStartFromBegining ? 0 : f.processingProgress,
613+
nodesCount: isStartFromBegining ? 0 : f.nodesCount,
614+
relationshipsCount: isStartFromBegining ? 0 : f.relationshipsCount,
615+
}
602616
: f;
603617
});
604618
});
@@ -706,7 +720,7 @@ const Content: React.FC<ContentProps> = ({
706720
const selectedRows = childRef.current?.getSelectedRows();
707721
if (selectedRows?.length) {
708722
const expiredFilesExists = selectedRows.some(
709-
(c) => c.status !== 'Ready to Reprocess' && isExpired(c?.createdAt as Date ?? new Date())
723+
(c) => c.status !== 'Ready to Reprocess' && isExpired((c?.createdAt as Date) ?? new Date())
710724
);
711725
const largeFileExists = selectedRows.some(
712726
(c) => isFileReadyToProcess(c, true) && typeof c.size === 'number' && c.size > largeFileSize
@@ -715,15 +729,12 @@ const Content: React.FC<ContentProps> = ({
715729
setshowExpirationModal(true);
716730
} else if (largeFileExists && isGCSActive) {
717731
setshowConfirmationModal(true);
718-
} else if (largeFileExists && isGCSActive) {
719732
setshowExpirationModal(true);
720-
} else {
733+
} else {
721734
handleGenerateGraph(selectedRows.filter((f) => isFileReadyToProcess(f, false)));
722735
}
723736
} else if (filesData.length) {
724-
const expiredFileExists = filesData.some(
725-
(c) => isExpired(c?.createdAt as Date)
726-
);
737+
const expiredFileExists = filesData.some((c) => isExpired(c?.createdAt as Date));
727738
const largeFileExists = filesData.some(
728739
(c) => isFileReadyToProcess(c, true) && typeof c.size === 'number' && c.size > largeFileSize
729740
);
@@ -863,20 +874,12 @@ const Content: React.FC<ContentProps> = ({
863874
uri={userCredentials && userCredentials?.uri}
864875
/>
865876
<div className='pt-1 flex gap-1 items-center'>
877+
<div>{!hasSelections ? <StatusIndicator type='danger' /> : <StatusIndicator type='success' />}</div>
866878
<div>
867-
{!hasSelections ? (
868-
<StatusIndicator type='danger' />
869-
) :
870-
(<StatusIndicator type='success' />
871-
)}
872-
</div>
873-
<div>
874-
{hasSelections? (
879+
{hasSelections ? (
875880
<span className='n-body-small'>
876-
{(hasSelections)} Graph Schema configured
877-
{hasSelections
878-
? `(${selectedNodes.length} Labels + ${selectedRels.length} Rel Types)`
879-
: ''}
881+
{hasSelections} Graph Schema configured
882+
{hasSelections ? `(${selectedNodes.length} Labels + ${selectedRels.length} Rel Types)` : ''}
880883
</span>
881884
) : (
882885
<span className='n-body-small'>No Graph Schema configured</span>
@@ -913,7 +916,15 @@ const Content: React.FC<ContentProps> = ({
913916
)
914917
)}
915918
</div>
919+
{connectionStatus && (
920+
<Callout
921+
className='!w-[93%] m-auto '
922+
type='note'
923+
description={`Large files may be partially processed up to ${chunksToBeProces} chunks due to resource limits. If you need more comprehensive processing, consider splitting larger documents.`}
924+
></Callout>
925+
)}
916926
</Flex>
927+
917928
<FileTable
918929
connectionStatus={connectionStatus}
919930
setConnectionStatus={setConnectionStatus}
@@ -940,6 +951,7 @@ const Content: React.FC<ContentProps> = ({
940951
ref={childRef}
941952
handleGenerateGraph={processWaitingFilesOnRefresh}
942953
></FileTable>
954+
943955
<Flex
944956
className={`p-2.5 mt-1.5 absolute bottom-0 w-full`}
945957
justifyContent='space-between'

frontend/src/components/FileTable.tsx

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1009,7 +1009,9 @@ const FileTable: ForwardRefRenderFunction<ChildRef, FileTableProps> = (props, re
10091009
}}
10101010
isLoading={isLoading}
10111011
rootProps={{
1012-
className: `absolute h-[67%] left-10 filetable ${!largedesktops ? 'top-[17%]' : 'top-[14%]'}`,
1012+
className: `absolute left-10 filetable ${
1013+
!largedesktops && connectionStatus ? 'h-[50%]' : connectionStatus ? 'h-[60%]' : 'h-[67%]'
1014+
} ${!largedesktops && connectionStatus ? 'top-[29%]' : connectionStatus ? 'top-[26%]' : 'top-[14%]'}`,
10131015
}}
10141016
components={{
10151017
Body: () => (

frontend/src/components/Layout/PageLayout.tsx

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ const PageLayout: React.FC = () => {
6262
setShowDisconnectButton,
6363
showDisconnectButton,
6464
setIsGCSActive,
65+
setChunksToBeProces,
6566
} = useCredentials();
6667
const { cancel } = useSpeechSynthesis();
6768

@@ -85,6 +86,7 @@ const PageLayout: React.FC = () => {
8586
setIsGCSActive(credentials.isGCSActive ?? false);
8687
setGdsActive(credentials.isgdsActive);
8788
setIsReadOnlyUser(credentials.isReadonlyUser);
89+
setChunksToBeProces(credentials.chunksTobeProcess);
8890
localStorage.setItem(
8991
'neo4j.connection',
9092
JSON.stringify({
@@ -96,6 +98,7 @@ const PageLayout: React.FC = () => {
9698
isReadOnlyUser: credentials.isReadonlyUser,
9799
isgdsActive: credentials.isgdsActive,
98100
isGCSActive: credentials.isGCSActive,
101+
chunksTobeProcess: credentials.chunksTobeProcess,
99102
})
100103
);
101104
};
@@ -158,7 +161,9 @@ const PageLayout: React.FC = () => {
158161
isReadonlyUser: !connectionData.data.write_access,
159162
isgdsActive: connectionData.data.gds_status,
160163
isGCSActive: connectionData?.data?.gcs_file_cache === 'True',
164+
chunksTobeProcess: parseInt(connectionData.data.chunk_to_be_created),
161165
};
166+
setChunksToBeProces(envCredentials.chunksTobeProcess);
162167
setIsGCSActive(envCredentials.isGCSActive);
163168
if (session) {
164169
const updated = updateSessionIfNeeded(envCredentials, session);

frontend/src/components/Popups/ConnectionModal/ConnectionModal.tsx

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ export default function ConnectionModal({
4949
errorMessage,
5050
setIsGCSActive,
5151
setShowDisconnectButton,
52+
setChunksToBeProces,
5253
} = useCredentials();
5354
const [isLoading, setIsLoading] = useState<boolean>(false);
5455
const [searchParams, setSearchParams] = useSearchParams();
@@ -224,10 +225,11 @@ export default function ConnectionModal({
224225
const isgdsActive = response.data.data.gds_status;
225226
const isReadOnlyUser = !response.data.data.write_access;
226227
const isGCSActive = response.data.data.gcs_file_cache === 'True';
228+
const chunksTobeProcess = parseInt(response.data.data.chunk_to_be_created);
227229
setIsGCSActive(isGCSActive);
228230
setGdsActive(isgdsActive);
229231
setIsReadOnlyUser(isReadOnlyUser);
230-
232+
setChunksToBeProces(chunksTobeProcess);
231233
localStorage.setItem(
232234
'neo4j.connection',
233235
JSON.stringify({
@@ -239,6 +241,7 @@ export default function ConnectionModal({
239241
isgdsActive,
240242
isReadOnlyUser,
241243
isGCSActive,
244+
chunksTobeProcess,
242245
})
243246
);
244247
setUserDbVectorIndex(response.data.data.db_vector_dimension);

frontend/src/components/Popups/GraphEnhancementDialog/EnitityExtraction/EntityExtractionSetting.tsx

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -29,14 +29,8 @@ export default function EntityExtractionSetting({
2929
closeEnhanceGraphSchemaDialog?: () => void;
3030
}) {
3131
const { breakpoints } = tokens;
32-
const {
33-
setSelectedRels,
34-
setSelectedNodes,
35-
selectedNodes,
36-
selectedRels,
37-
selectedSchemas,
38-
setSelectedSchemas,
39-
} = useFileContext();
32+
const { setSelectedRels, setSelectedNodes, selectedNodes, selectedRels, selectedSchemas, setSelectedSchemas } =
33+
useFileContext();
4034
const { userCredentials } = useCredentials();
4135
const [loading, setLoading] = useState<boolean>(false);
4236
const isTablet = useMediaQuery(`(min-width:${breakpoints.xs}) and (max-width: ${breakpoints.lg})`);

0 commit comments

Comments
 (0)