Remove total_pages property. It is not used in the DB. #714


Merged · 2 commits · Sep 4, 2024
2 changes: 0 additions & 2 deletions backend/score.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,7 +445,6 @@ async def generate():
'relationshipCount':result[0]['relationshipCount'],
'model':result[0]['model'],
'total_chunks':result[0]['total_chunks'],
'total_pages':result[0]['total_pages'],
'fileSize':result[0]['fileSize'],
'processed_chunk':result[0]['processed_chunk'],
'fileSource':result[0]['fileSource']
Expand Down Expand Up @@ -504,7 +503,6 @@ async def get_document_status(file_name, url, userName, password, database):
'relationshipCount':result[0]['relationshipCount'],
'model':result[0]['model'],
'total_chunks':result[0]['total_chunks'],
'total_pages':result[0]['total_pages'],
'fileSize':result[0]['fileSize'],
'processed_chunk':result[0]['processed_chunk'],
'fileSource':result[0]['fileSource']
Expand Down
1 change: 0 additions & 1 deletion backend/src/document_sources/gcs_bucket.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,6 @@ def merge_file_gcs(bucket_name, original_file_name: str, folder_name_sha1_hashed
blob.upload_from_file(file_io)
# pdf_reader = PdfReader(file_io)
file_size = len(merged_file)
# total_pages = len(pdf_reader.pages)

return file_size
except Exception as e:
Expand Down
16 changes: 8 additions & 8 deletions backend/src/document_sources/local_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,19 +56,19 @@ def get_pages_with_page_numbers(unstructured_pages):
if page.metadata['page_number']==page_number:
page_content += page.page_content
metadata = {'source':page.metadata['source'],'page_number':page_number, 'filename':page.metadata['filename'],
'filetype':page.metadata['filetype'], 'total_pages':unstructured_pages[-1].metadata['page_number']}
'filetype':page.metadata['filetype']}

if page.metadata['page_number']>page_number:
page_number+=1
if not metadata:
metadata = {'total_pages':unstructured_pages[-1].metadata['page_number']}
pages.append(Document(page_content = page_content, metadata=metadata))
# if not metadata:
# metadata = {'total_pages':unstructured_pages[-1].metadata['page_number']}
pages.append(Document(page_content = page_content))
page_content=''

if page == unstructured_pages[-1]:
if not metadata:
metadata = {'total_pages':unstructured_pages[-1].metadata['page_number']}
pages.append(Document(page_content = page_content, metadata=metadata))
# if not metadata:
# metadata = {'total_pages':unstructured_pages[-1].metadata['page_number']}
pages.append(Document(page_content = page_content))

elif page.metadata['category']=='PageBreak' and page!=unstructured_pages[0]:
page_number+=1
Expand All @@ -80,7 +80,7 @@ def get_pages_with_page_numbers(unstructured_pages):
page_content += page.page_content
metadata_with_custom_page_number = {'source':page.metadata['source'],
'page_number':1, 'filename':page.metadata['filename'],
'filetype':page.metadata['filetype'], 'total_pages':1}
'filetype':page.metadata['filetype']}
if page == unstructured_pages[-1]:
pages.append(Document(page_content = page_content, metadata=metadata_with_custom_page_number))
return pages
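
Note: after this change the Documents appended in the page-number branch are created without metadata, so a caller that still needs a page count has to derive it from the returned list. A minimal caller-side sketch, assuming the import path below; count_pages is illustrative and not part of this PR:

    # Hypothetical helper, not part of this PR: recover the page count that
    # 'total_pages' used to carry. get_pages_with_page_numbers emits one
    # Document per detected page, so the list length is the page count.
    from src.document_sources.local_file import get_pages_with_page_numbers

    def count_pages(unstructured_pages):
        # unstructured_pages: the element list produced by the unstructured loader
        return len(get_pages_with_page_numbers(unstructured_pages))
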
1 change: 0 additions & 1 deletion backend/src/entities/source_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ class sourceNode:
updated_at:datetime=None
processing_time:float=None
error_message:str=None
total_pages:int=None
total_chunks:int=None
language:str=None
is_cancelled:bool=None
Expand Down
9 changes: 3 additions & 6 deletions backend/src/graphDB_dataAccess.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,14 @@ def create_source_node(self, obj_source_node:sourceNode):
d.processingTime = $pt, d.errorMessage = $e_message, d.nodeCount= $n_count,
d.relationshipCount = $r_count, d.model= $model, d.gcsBucket=$gcs_bucket,
d.gcsBucketFolder= $gcs_bucket_folder, d.language= $language,d.gcsProjectId= $gcs_project_id,
d.is_cancelled=False, d.total_chunks=0, d.processed_chunk=0, d.total_pages=$total_pages,
d.is_cancelled=False, d.total_chunks=0, d.processed_chunk=0,
d.access_token=$access_token""",
{"fn":obj_source_node.file_name, "fs":obj_source_node.file_size, "ft":obj_source_node.file_type, "st":job_status,
"url":obj_source_node.url,
"awsacc_key_id":obj_source_node.awsAccessKeyId, "f_source":obj_source_node.file_source, "c_at":obj_source_node.created_at,
"u_at":obj_source_node.created_at, "pt":0, "e_message":'', "n_count":0, "r_count":0, "model":obj_source_node.model,
"gcs_bucket": obj_source_node.gcsBucket, "gcs_bucket_folder": obj_source_node.gcsBucketFolder,
"language":obj_source_node.language, "gcs_project_id":obj_source_node.gcsProjectId, "total_pages": obj_source_node.total_pages,
"language":obj_source_node.language, "gcs_project_id":obj_source_node.gcsProjectId,
"access_token":obj_source_node.access_token})
except Exception as e:
error_message = str(e)
Expand Down Expand Up @@ -80,9 +80,6 @@ def update_source_node(self, obj_source_node:sourceNode):
if obj_source_node.model is not None and obj_source_node.model != '':
params['model'] = obj_source_node.model

if obj_source_node.total_pages is not None and obj_source_node.total_pages != 0:
params['total_pages'] = obj_source_node.total_pages

if obj_source_node.total_chunks is not None and obj_source_node.total_chunks != 0:
params['total_chunks'] = obj_source_node.total_chunks

Expand Down Expand Up @@ -190,7 +187,7 @@ def get_current_status_document_node(self, file_name):
query = """
MATCH(d:Document {fileName : $file_name}) RETURN d.status AS Status , d.processingTime AS processingTime,
d.nodeCount AS nodeCount, d.model as model, d.relationshipCount as relationshipCount,
d.total_pages AS total_pages, d.total_chunks AS total_chunks , d.fileSize as fileSize,
d.total_chunks AS total_chunks , d.fileSize as fileSize,
d.is_cancelled as is_cancelled, d.processed_chunk as processed_chunk, d.fileSource as fileSource
"""
param = {"file_name" : file_name}
Expand Down
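
Note: this removes total_pages from the create, update, and status-read paths, but Document nodes written before this change may still carry the property. If a cleanup is wanted, a one-off migration is simple; a minimal sketch, assuming the langchain Neo4jGraph wrapper this class already queries through (the connection values are placeholders):

    # Hypothetical one-off migration, not part of this PR: drop any leftover
    # total_pages property from previously created Document nodes.
    from langchain_community.graphs import Neo4jGraph

    graph = Neo4jGraph(url="bolt://localhost:7687", username="neo4j", password="password")
    # REMOVE is a no-op on nodes that never had the property.
    graph.query("MATCH (d:Document) REMOVE d.total_pages")
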
8 changes: 1 addition & 7 deletions backend/src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@ def create_source_node_graph_url_s3(graph, model, source_url, aws_access_key_id,
obj_source_node.file_type = 'pdf'
obj_source_node.file_size = file_info['file_size_bytes']
obj_source_node.file_source = source_type
obj_source_node.total_pages = 'N/A'
obj_source_node.model = model
obj_source_node.url = str(source_url+file_name)
obj_source_node.awsAccessKeyId = aws_access_key_id
Expand Down Expand Up @@ -82,7 +81,6 @@ def create_source_node_graph_url_gcs(graph, model, gcs_project_id, gcs_bucket_na
obj_source_node.file_size = file_metadata['fileSize']
obj_source_node.url = file_metadata['url']
obj_source_node.file_source = source_type
obj_source_node.total_pages = 'N/A'
obj_source_node.model = model
obj_source_node.file_type = 'pdf'
obj_source_node.gcsBucket = gcs_bucket_name
Expand Down Expand Up @@ -116,7 +114,6 @@ def create_source_node_graph_web_url(graph, model, source_url, source_type):
obj_source_node.file_type = 'text'
obj_source_node.file_source = source_type
obj_source_node.model = model
obj_source_node.total_pages = 1
obj_source_node.url = urllib.parse.unquote(source_url)
obj_source_node.created_at = datetime.now()
obj_source_node.file_name = pages[0].metadata['title']
Expand All @@ -139,7 +136,6 @@ def create_source_node_graph_url_youtube(graph, model, source_url, source_type):
obj_source_node.file_type = 'text'
obj_source_node.file_source = source_type
obj_source_node.model = model
obj_source_node.total_pages = 1
obj_source_node.url = youtube_url
obj_source_node.created_at = datetime.now()
match = re.search(r'(?:v=)([0-9A-Za-z_-]{11})\s*',obj_source_node.url)
Expand Down Expand Up @@ -177,7 +173,6 @@ def create_source_node_graph_url_wikipedia(graph, model, wiki_query, source_type
obj_source_node.file_type = 'text'
obj_source_node.file_source = source_type
obj_source_node.file_size = sys.getsizeof(pages[0].page_content)
obj_source_node.total_pages = len(pages)
obj_source_node.model = model
obj_source_node.url = urllib.parse.unquote(pages[0].metadata['source'])
obj_source_node.created_at = datetime.now()
Expand Down Expand Up @@ -289,8 +284,7 @@ def processing_source(uri, userName, password, database, model, file_name, pages
status = "Processing"
obj_source_node.file_name = file_name
obj_source_node.status = status
obj_source_node.total_chunks = total_chunks
obj_source_node.total_pages = len(pages)
obj_source_node.total_chunks = len(chunks)
obj_source_node.model = model
if retry_condition == START_FROM_LAST_PROCESSED_POSITION:
node_count = result[0]['nodeCount']
Expand Down
1 change: 0 additions & 1 deletion docs/backend/backend_docs.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -580,7 +580,6 @@ The API provides a continuous update on the extraction status of a specified fil
"relationshipCount": 0,
"model": "OpenAI GPT 3.5",
"total_chunks": 3,
"total_pages": 1,
"fileSize": 92373,
"processed_chunk": 0
}
Expand Down