Commit: logging

Files changed:
- .huggingface-spaces  +15 -0
- README.md  +2 -3
- app.py  +420 -213
- colpali_manager.py  +3 -3
- middleware.py  +25 -5
- milvus_manager.py  +115 -115
- pdf_manager.py  +19 -4
- rag.py  +174 -29
.huggingface-spaces
ADDED
@@ -0,0 +1,15 @@
 1 + # Hugging Face Spaces Configuration
 2 + # This file helps ensure proper deployment and configuration
 3 +
 4 + # Environment variables for Hugging Face Spaces
 5 + SPACE_ID=${SPACE_ID}
 6 + HF_SPACE_ID=${HF_SPACE_ID}
 7 +
 8 + # File path configuration
 9 + BASE_DIR=/tmp/pages
10 + FALLBACK_DIR=pages
11 +
12 + # Ensure proper permissions
13 + chmod 755 /tmp
14 + mkdir -p /tmp/pages
15 + chmod 755 /tmp/pages
README.md
CHANGED
@@ -4,7 +4,7 @@ emoji: π
  4   colorFrom: blue
  5   colorTo: purple
  6   sdk: gradio
  7 - sdk_version:
  8   app_file: app.py
  9   pinned: false
 10   ---
@@ -339,5 +339,4 @@ For support and questions:
339
340   ---
341
342 - **Made by Collar** - Enhanced with Team Management & Chat History
343 -

  4   colorFrom: blue
  5   colorTo: purple
  6   sdk: gradio
  7 + sdk_version: 5.44.1
  8   app_file: app.py
  9   pinned: false
 10   ---

339
340   ---
341
342 + **Made by Collar** - Enhanced with Team Management & Chat History
app.py
CHANGED
@@ -17,6 +17,7 @@ import requests
 17   import base64
 18   from PIL import Image
 19   import io
 20
 21   from middleware import Middleware
 22   from rag import Rag
@@ -28,7 +29,14 @@ from dotenv import load_dotenv, dotenv_values
 28   import dotenv
 29   import platform
 30   import time
 31 -
 32
 33   # Import libraries for DOC and Excel export
 34   try:
@@ -378,31 +386,29 @@ class PDFSearchApp:
378   self.db_manager = db_manager
379   self.session_manager = session_manager
380
381 - def upload_and_convert(self,
382 - """Upload and convert files
383
384   if files is None:
385   return "No file uploaded"
386
387   try:
388 - # Get user info from session if available
389 - user_info = None
390 - team = "default"
391 - if session_id:
392 - session = self.session_manager.get_session(session_id)
393 - if session:
394 - user_info = session['user_info']
395 - team = user_info['team']
396 -
397   total_pages = 0
398   uploaded_files = []
399
400 - # Create
401   if folder_name:
402   folder_name = folder_name.replace(" ", "_").replace("-", "_")
403 - collection_name = f"{
404   else:
405 - collection_name = f"{
406
407   for file in files[:]:
408   # Extract the last part of the path (file name)
@@ -412,13 +418,16 @@ class PDFSearchApp:
|
|
412 |
|
413 |
# Convert PPT to PDF if needed
|
414 |
if ext.lower() in [".ppt", ".pptx"]:
|
415 |
-
|
416 |
-
|
417 |
-
|
418 |
-
|
419 |
-
|
420 |
-
|
421 |
-
|
|
|
|
|
|
|
422 |
|
423 |
# Create unique document ID
|
424 |
doc_id = f"{collection_name}_{name.replace(' ', '_').replace('-', '_')}"
|
@@ -426,31 +435,93 @@ class PDFSearchApp:
|
|
426 |
print(f"Uploading file: {doc_id}")
|
427 |
middleware = Middleware(collection_name, create_collection=True)
|
428 |
|
429 |
-
|
|
|
430 |
total_pages += len(pages) if pages else 0
|
431 |
uploaded_files.append(doc_id)
|
432 |
-
|
433 |
-
self.indexed_docs[doc_id] = True
|
434 |
|
435 |
-
#
|
436 |
-
|
437 |
-
|
438 |
-
|
439 |
-
|
440 |
-
|
441 |
-
len(uploaded_files)
|
442 |
-
)
|
443 |
|
444 |
-
return
|
445 |
|
446 |
except Exception as e:
|
447 |
return f"Error processing files: {str(e)}"
|
448 |
|
|
|
449 |
|
450 |
-
def display_file_list(text):
|
451 |
try:
|
452 |
# Retrieve all entries in the specified directory
|
453 |
-
|
|
|
|
|
454 |
current_working_directory = os.getcwd()
|
455 |
directory_path = os.path.join(current_working_directory, directory_path)
|
456 |
entries = os.listdir(directory_path)
|
@@ -465,39 +536,134 @@ class PDFSearchApp:
|
|
465 |
return str(e)
|
466 |
|
467 |
|
468 |
-
def search_documents(self,
|
469 |
print(f"Searching for query: {query}")
|
470 |
|
471 |
if not query:
|
472 |
print("Please enter a search query")
|
473 |
-
return "Please enter a search query", "--", "Please enter a search query", [], None
|
474 |
|
475 |
try:
|
476 |
-
#
|
477 |
-
|
478 |
-
|
479 |
-
|
480 |
-
|
481 |
-
|
482 |
-
|
483 |
-
|
|
|
484 |
|
485 |
# Enhanced multi-page retrieval with vision-guided chunking approach
|
486 |
# Get more results than requested to allow for intelligent filtering
|
487 |
# Request 3x the number of results for better selection
|
488 |
search_results = middleware.search([query], topk=max(num_results * 3, 20))[0]
|
489 |
|
490 |
-
#
|
491 |
-
print(f"π
|
|
|
492 |
if len(search_results) > 0:
|
493 |
-
print(f"
|
494 |
-
print(f"
|
|
|
495 |
|
496 |
if not search_results:
|
497 |
-
return "No search results found", "--", "No search results found for your query", [], None
|
498 |
|
499 |
# Implement intelligent multi-page selection based on research
|
500 |
-
selected_results = self.
|
|
|
501 |
|
502 |
# Process selected results
|
503 |
cited_pages = []
|
@@ -507,13 +673,22 @@ class PDFSearchApp:
|
|
507 |
|
508 |
print(f"π Processing {len(selected_results)} selected results...")
|
509 |
|
510 |
-
|
511 |
-
|
512 |
-
|
513 |
-
|
514 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
515 |
|
516 |
-
if
|
517 |
img_paths.append(img_path)
|
518 |
all_paths.append(path)
|
519 |
page_scores.append(score)
|
@@ -521,16 +696,78 @@ class PDFSearchApp:
|
|
521 |
print(f"β
Retrieved page {i+1}: {img_path} (Score: {score:.3f})")
|
522 |
else:
|
523 |
print(f"β Image file not found: {img_path}")
|
|
|
524 |
|
525 |
print(f"π Final count: {len(img_paths)} valid pages out of {len(selected_results)} selected")
|
526 |
|
|
|
527 |
if not img_paths:
|
528 |
-
return "No valid image files found", "--", "Error: No valid image files found for the search results", [], None
|
529 |
|
530 |
# Generate RAG response with multiple pages using enhanced approach
|
531 |
-
|
532 |
-
|
533 |
-
|
|
|
|
|
534 |
|
535 |
# Prepare downloads
|
536 |
csv_download = self._prepare_csv_download(csv_filepath)
|
@@ -556,81 +793,42 @@ class PDFSearchApp:
|
|
556 |
|
557 |
except Exception as e:
|
558 |
error_msg = f"Error during search: {str(e)}"
|
|
|
|
|
559 |
return error_msg, "--", error_msg, [], None, None, None, None
|
560 |
-
|
561 |
-
def
|
562 |
"""
|
563 |
-
Intelligent page selection
|
564 |
-
Based on research from M3DocRAG and multi-modal retrieval models
|
565 |
"""
|
566 |
if len(search_results) <= num_results:
|
567 |
return search_results
|
568 |
|
569 |
-
# Detect if query needs multiple pages
|
570 |
-
multi_page_keywords = [
|
571 |
-
'compare', 'difference', 'similarities', 'both', 'multiple', 'various',
|
572 |
-
'different', 'types', 'kinds', 'categories', 'procedures', 'methods',
|
573 |
-
'approaches', 'techniques', 'safety', 'protocols', 'guidelines',
|
574 |
-
'overview', 'summary', 'comprehensive', 'complete', 'all', 'everything'
|
575 |
-
]
|
576 |
-
|
577 |
-
query_lower = query.lower()
|
578 |
-
needs_multiple_pages = any(keyword in query_lower for keyword in multi_page_keywords)
|
579 |
-
|
580 |
# Sort by relevance score
|
581 |
sorted_results = sorted(search_results, key=lambda x: x[0], reverse=True)
|
582 |
|
583 |
-
#
|
584 |
-
|
585 |
-
|
586 |
-
# Strategy 1: Include highest scoring result from each collection (diversity)
|
587 |
-
selected = []
|
588 |
-
seen_collections = set()
|
589 |
-
|
590 |
-
# First pass: get one page from each collection for diversity
|
591 |
-
for score, page_num, coll_num in sorted_results:
|
592 |
-
if coll_num not in seen_collections and len(selected) < min(num_results // 2, len(search_results)):
|
593 |
-
selected.append((score, page_num, coll_num))
|
594 |
-
seen_collections.add(coll_num)
|
595 |
-
|
596 |
-
# Strategy 2: Fill remaining slots with highest scoring results
|
597 |
-
for score, page_num, coll_num in sorted_results:
|
598 |
-
if (score, page_num, coll_num) not in selected and len(selected) < num_results:
|
599 |
-
selected.append((score, page_num, coll_num))
|
600 |
-
|
601 |
-
# Strategy 3: If we still don't have enough, add more from any collection
|
602 |
-
if len(selected) < num_results:
|
603 |
-
for score, page_num, coll_num in sorted_results:
|
604 |
-
if (score, page_num, coll_num) not in selected and len(selected) < num_results:
|
605 |
-
selected.append((score, page_num, coll_num))
|
606 |
-
|
607 |
-
# Strategy 4: If we have too many, trim to exact number requested
|
608 |
-
if len(selected) > num_results:
|
609 |
-
selected = selected[:num_results]
|
610 |
-
|
611 |
-
# Strategy 5: If we have too few, add more from the sorted results
|
612 |
-
if len(selected) < num_results and len(sorted_results) >= num_results:
|
613 |
-
for score, page_num, coll_num in sorted_results:
|
614 |
-
if (score, page_num, coll_num) not in selected and len(selected) < num_results:
|
615 |
-
selected.append((score, page_num, coll_num))
|
616 |
-
|
617 |
-
# Sort selected results by score for consistency
|
618 |
-
selected.sort(key=lambda x: x[0], reverse=True)
|
619 |
-
|
620 |
-
print(f"Requested {num_results} pages, selected {len(selected)} pages from {len(seen_collections)} collections")
|
621 |
|
622 |
-
|
623 |
-
if len(selected) != num_results:
|
624 |
-
print(f"β οΈ Warning: Requested {num_results} pages but selected {len(selected)} pages")
|
625 |
-
if len(selected) < num_results and len(sorted_results) >= num_results:
|
626 |
-
# Add more pages to reach the target
|
627 |
-
for score, page_num, coll_num in sorted_results:
|
628 |
-
if (score, page_num, coll_num) not in selected and len(selected) < num_results:
|
629 |
-
selected.append((score, page_num, coll_num))
|
630 |
-
print(f"Added more pages to reach target: {len(selected)} pages")
|
631 |
|
632 |
return selected
|
633 |
|
|
|
|
|
|
634 |
def _optimize_consecutive_pages(self, selected, all_results, target_count=None):
|
635 |
"""
|
636 |
Optimize selection to include consecutive pages when beneficial
|
@@ -1167,7 +1365,7 @@ The system detected you requested tabular data, but the current response doesn't
|
|
1167 |
cell_str = str(cell)
|
1168 |
if ',' in cell_str or '"' in cell_str or '\n' in cell_str:
|
1169 |
# Escape quotes and wrap in quotes
|
1170 |
-
cell_str =
|
1171 |
escaped_row.append(cell_str)
|
1172 |
csv_lines.append(','.join(escaped_row))
|
1173 |
|
@@ -2798,76 +2996,113 @@ The system detected you requested tabular data, but the current response doesn't
|
|
2798 |
# Fallback to simple response with enhanced prompt
|
2799 |
return rag.get_answer_from_gemini(detailed_prompt, img_paths), None, None, None
|
2800 |
|
2801 |
-
|
2802 |
-
|
2803 |
-
|
2804 |
-
if
|
2805 |
-
|
2806 |
-
|
2807 |
-
|
2808 |
-
|
2809 |
-
|
2810 |
-
def logout_user(self, session_id):
|
2811 |
-
"""Logout user and remove session"""
|
2812 |
-
if session_id:
|
2813 |
-
self.session_manager.remove_session(session_id)
|
2814 |
-
return "Logged out successfully", None, None
|
2815 |
|
|
|
2816 |
|
2817 |
-
def
|
2818 |
-
"""
|
2819 |
-
|
2820 |
-
|
|
|
|
|
|
|
|
|
2821 |
|
2822 |
-
|
2823 |
-
|
2824 |
-
return "Session expired. Please log in again."
|
2825 |
|
2826 |
-
|
2827 |
-
|
|
|
2828 |
|
2829 |
-
|
2830 |
-
|
|
|
|
|
|
|
|
|
2831 |
|
2832 |
-
return
|
|
|
2833 |
|
2834 |
def create_ui():
|
2835 |
app = PDFSearchApp()
|
2836 |
|
2837 |
with gr.Blocks(theme=gr.themes.Ocean(), css="footer{display:none !important}") as demo:
|
2838 |
-
# Session state management
|
2839 |
-
session_state = gr.State(value=None)
|
2840 |
-
user_info_state = gr.State(value=None)
|
2841 |
-
|
2842 |
gr.Markdown("# Collar Multimodal RAG Demo - Streamlined")
|
2843 |
-
gr.Markdown("
|
2844 |
-
|
2845 |
-
#
|
2846 |
-
with gr.Tab("
|
2847 |
-
with gr.Row():
|
2848 |
-
with gr.Column(scale=1):
|
2849 |
-
gr.Markdown("### Login")
|
2850 |
-
username_input = gr.Textbox(label="Username", placeholder="Enter username")
|
2851 |
-
password_input = gr.Textbox(label="Password", type="password", placeholder="Enter password")
|
2852 |
-
login_btn = gr.Button("Login", variant="primary")
|
2853 |
-
logout_btn = gr.Button("Logout")
|
2854 |
-
auth_status = gr.Textbox(label="Authentication Status", interactive=False)
|
2855 |
-
current_team = gr.Textbox(label="Current Team", interactive=False)
|
2856 |
-
|
2857 |
-
with gr.Column(scale=1):
|
2858 |
-
gr.Markdown("### Default Users")
|
2859 |
-
gr.Markdown("""
|
2860 |
-
**Team A:** admin_team_a / admin123_team_a
|
2861 |
-
**Team B:** admin_team_b / admin123_team_b
|
2862 |
-
""")
|
2863 |
-
|
2864 |
-
# Document Management Tab
|
2865 |
-
with gr.Tab("π Document Management"):
|
2866 |
with gr.Column():
|
2867 |
-
gr.Markdown("### Upload Documents
|
2868 |
folder_name_input = gr.Textbox(
|
2869 |
-
label="
|
2870 |
-
placeholder="
|
2871 |
)
|
2872 |
max_pages_input = gr.Slider(
|
2873 |
minimum=1,
|
@@ -2877,19 +3112,11 @@ def create_ui():
|
|
2877 |
label="Max pages to extract and index per document"
|
2878 |
)
|
2879 |
file_input = gr.Files(
|
2880 |
-
label="Upload PPTs/PDFs (Multiple files supported)",
|
2881 |
file_count="multiple"
|
2882 |
)
|
2883 |
-
upload_btn = gr.Button("Upload
|
2884 |
upload_status = gr.Textbox(label="Upload Status", interactive=False)
|
2885 |
-
|
2886 |
-
gr.Markdown("### Team Collections")
|
2887 |
-
refresh_collections_btn = gr.Button("Refresh Collections")
|
2888 |
-
team_collections_display = gr.Textbox(
|
2889 |
-
label="Available Collections",
|
2890 |
-
interactive=False,
|
2891 |
-
lines=5
|
2892 |
-
)
|
2893 |
|
2894 |
# Enhanced Query Tab
|
2895 |
with gr.Tab("π Advanced Query"):
|
@@ -2958,36 +3185,16 @@ def create_ui():
|
|
2958 |
|
2959 |
|
2960 |
# Event handlers
|
2961 |
-
# Authentication events
|
2962 |
-
login_btn.click(
|
2963 |
-
fn=app.authenticate_user,
|
2964 |
-
inputs=[username_input, password_input],
|
2965 |
-
outputs=[auth_status, session_state, current_team]
|
2966 |
-
)
|
2967 |
-
|
2968 |
-
logout_btn.click(
|
2969 |
-
fn=app.logout_user,
|
2970 |
-
inputs=[session_state],
|
2971 |
-
outputs=[auth_status, session_state, current_team]
|
2972 |
-
)
|
2973 |
-
|
2974 |
-
# Document management events
|
2975 |
upload_btn.click(
|
2976 |
fn=app.upload_and_convert,
|
2977 |
-
inputs=[
|
2978 |
outputs=[upload_status]
|
2979 |
)
|
2980 |
|
2981 |
-
refresh_collections_btn.click(
|
2982 |
-
fn=app.get_team_collections,
|
2983 |
-
inputs=[session_state],
|
2984 |
-
outputs=[team_collections_display]
|
2985 |
-
)
|
2986 |
-
|
2987 |
# Query events
|
2988 |
search_btn.click(
|
2989 |
fn=app.search_documents,
|
2990 |
-
inputs=[
|
2991 |
outputs=[path, images, llm_answer, cited_pages_display, csv_download, doc_download, excel_download]
|
2992 |
)
|
2993 |
|
|
|
 17   import base64
 18   from PIL import Image
 19   import io
 20 + import traceback
 21
 22   from middleware import Middleware
 23   from rag import Rag

 29   import dotenv
 30   import platform
 31   import time
 32 + # Only enable PPT/PPTX conversion on Windows where COM is available
 33 + PPT_CONVERT_AVAILABLE = False
 34 + if platform.system() == 'Windows':
 35 + try:
 36 + from pptxtopdf import convert
 37 + PPT_CONVERT_AVAILABLE = True
 38 + except Exception:
 39 + PPT_CONVERT_AVAILABLE = False
 40
 41   # Import libraries for DOC and Excel export
 42   try:
386 |
self.db_manager = db_manager
|
387 |
self.session_manager = session_manager
|
388 |
|
389 |
+
def upload_and_convert(self, files, max_pages, folder_name=None):
|
390 |
+
"""Upload and convert files without authentication or team scoping"""
|
391 |
|
392 |
if files is None:
|
393 |
return "No file uploaded"
|
394 |
|
395 |
try:
|
|
|
|
396 |
total_pages = 0
|
397 |
uploaded_files = []
|
398 |
|
399 |
+
# Create simple collection name
|
400 |
if folder_name:
|
401 |
folder_name = folder_name.replace(" ", "_").replace("-", "_")
|
402 |
+
collection_name = f"{folder_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
|
403 |
else:
|
404 |
+
collection_name = f"documents_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
|
405 |
+
|
406 |
+
# Store the collection name in indexed_docs BEFORE processing files
|
407 |
+
self.indexed_docs[collection_name] = True
|
408 |
+
print(f"π Created collection: {collection_name}")
|
409 |
+
|
410 |
+
# Clear old collections to ensure only the latest upload is referenced
|
411 |
+
self._clear_old_collections(collection_name)
|
412 |
|
413 |
for file in files[:]:
|
414 |
# Extract the last part of the path (file name)
|
|
|
418 |
|
419 |
# Convert PPT to PDF if needed
|
420 |
if ext.lower() in [".ppt", ".pptx"]:
|
421 |
+
if PPT_CONVERT_AVAILABLE:
|
422 |
+
output_file = os.path.splitext(file.name)[0] + '.pdf'
|
423 |
+
output_directory = os.path.dirname(file.name)
|
424 |
+
outfile = os.path.join(output_directory, output_file)
|
425 |
+
convert(file.name, outfile)
|
426 |
+
pdf_path = outfile
|
427 |
+
name = os.path.basename(outfile)
|
428 |
+
name, ext = os.path.splitext(name)
|
429 |
+
else:
|
430 |
+
return "PPT/PPTX conversion is only supported on Windows. Please upload PDFs instead."
|
431 |
|
432 |
# Create unique document ID
|
433 |
doc_id = f"{collection_name}_{name.replace(' ', '_').replace('-', '_')}"
|
|
|
435 |
print(f"Uploading file: {doc_id}")
|
436 |
middleware = Middleware(collection_name, create_collection=True)
|
437 |
|
438 |
+
# Pass collection_name as id to ensure images are saved to the right directory
|
439 |
+
pages = middleware.index(pdf_path, id=collection_name, max_pages=max_pages)
|
440 |
total_pages += len(pages) if pages else 0
|
441 |
uploaded_files.append(doc_id)
|
|
|
|
|
442 |
|
443 |
+
# Get the current active collection after cleanup
|
444 |
+
current_collection = self.get_current_collection()
|
445 |
+
status_message = f"Uploaded {len(uploaded_files)} files with {total_pages} total pages to collection: {collection_name}"
|
446 |
+
|
447 |
+
if current_collection:
|
448 |
+
status_message += f"\nβ
This is now your active collection for searches."
|
|
|
|
|
449 |
|
450 |
+
return status_message
|
451 |
|
452 |
except Exception as e:
|
453 |
return f"Error processing files: {str(e)}"
|
454 |
|
455 |
+
def _clear_old_collections(self, current_collection_name):
|
456 |
+
"""Clear old collections to ensure only the latest upload is referenced"""
|
457 |
+
try:
|
458 |
+
# Get all collections except the current one
|
459 |
+
collections_to_remove = [name for name in self.indexed_docs.keys() if name != current_collection_name]
|
460 |
+
|
461 |
+
if collections_to_remove:
|
462 |
+
print(f"ποΈ Clearing {len(collections_to_remove)} old collections to maintain latest upload reference")
|
463 |
+
|
464 |
+
for old_collection in collections_to_remove:
|
465 |
+
# Remove from indexed_docs
|
466 |
+
del self.indexed_docs[old_collection]
|
467 |
+
|
468 |
+
# Try to drop the collection from Milvus
|
469 |
+
try:
|
470 |
+
middleware = Middleware(old_collection, create_collection=False)
|
471 |
+
if middleware.drop_collection():
|
472 |
+
print(f"ποΈ Successfully dropped Milvus collection '{old_collection}'")
|
473 |
+
else:
|
474 |
+
print(f"β οΈ Failed to drop Milvus collection '{old_collection}'")
|
475 |
+
except Exception as e:
|
476 |
+
print(f"β οΈ Warning: Could not clean up Milvus collection '{old_collection}': {e}")
|
477 |
+
|
478 |
+
print(f"β
Kept only the latest collection: {current_collection_name}")
|
479 |
+
else:
|
480 |
+
print(f"β
No old collections to clear. Current collection: {current_collection_name}")
|
481 |
+
|
482 |
+
except Exception as e:
|
483 |
+
print(f"β οΈ Warning: Error clearing old collections: {e}")
|
484 |
+
# Don't fail the upload if cleanup fails
|
485 |
+
|
486 |
+
def get_current_collection_status(self):
|
487 |
+
"""Get a user-friendly status message about the current collection"""
|
488 |
+
current_collection = self.get_current_collection()
|
489 |
+
if current_collection:
|
490 |
+
return f"β
Currently active collection: {current_collection}"
|
491 |
+
else:
|
492 |
+
return "β No documents uploaded yet. Please upload a document to get started."
|
493 |
+
|
494 |
+
def get_current_collection(self):
|
495 |
+
"""Get the name of the currently active collection (most recent upload)"""
|
496 |
+
if not self.indexed_docs:
|
497 |
+
return None
|
498 |
+
|
499 |
+
available_collections = list(self.indexed_docs.keys())
|
500 |
+
if not available_collections:
|
501 |
+
return None
|
502 |
+
|
503 |
+
# Sort by timestamp to get the most recent one
|
504 |
+
def extract_timestamp(collection_name):
|
505 |
+
try:
|
506 |
+
parts = collection_name.split('_')
|
507 |
+
if len(parts) >= 3:
|
508 |
+
date_part = parts[-2]
|
509 |
+
time_part = parts[-1]
|
510 |
+
timestamp = f"{date_part}_{time_part}"
|
511 |
+
return timestamp
|
512 |
+
return collection_name
|
513 |
+
except:
|
514 |
+
return collection_name
|
515 |
+
|
516 |
+
available_collections.sort(key=extract_timestamp, reverse=True)
|
517 |
+
return available_collections[0]
|
518 |
|
519 |
+
def display_file_list(self, text):
|
520 |
try:
|
521 |
# Retrieve all entries in the specified directory
|
522 |
+
# Use the same base directory logic as PdfManager
|
523 |
+
base_output_dir = self._ensure_base_directory()
|
524 |
+
directory_path = base_output_dir
|
525 |
current_working_directory = os.getcwd()
|
526 |
directory_path = os.path.join(current_working_directory, directory_path)
|
527 |
entries = os.listdir(directory_path)
|
|
|
536 |
return str(e)
|
537 |
|
538 |
|
539 |
+
def search_documents(self, query, num_results):
|
540 |
print(f"Searching for query: {query}")
|
541 |
|
542 |
if not query:
|
543 |
print("Please enter a search query")
|
544 |
+
return "Please enter a search query", "--", "Please enter a search query", [], None, None, None, None
|
545 |
|
546 |
try:
|
547 |
+
# First, check if there are any indexed documents
|
548 |
+
if not self.indexed_docs:
|
549 |
+
return "No documents have been uploaded yet. Please upload some documents first.", "--", "No documents available for search", [], None, None, None, None
|
550 |
+
|
551 |
+
# Clean up any invalid collections first
|
552 |
+
print("π§Ή Cleaning up invalid collections...")
|
553 |
+
removed_count = self._cleanup_invalid_collections()
|
554 |
+
if removed_count > 0:
|
555 |
+
print(f"ποΈ Removed {removed_count} invalid collections")
|
556 |
+
|
557 |
+
# Check again after cleanup
|
558 |
+
if not self.indexed_docs:
|
559 |
+
return "No valid collections found after cleanup. Please re-upload your documents.", "--", "No valid collections available", [], None, None, None, None
|
560 |
+
|
561 |
+
# Get the most recent collection name from indexed docs (latest upload)
|
562 |
+
available_collections = list(self.indexed_docs.keys())
|
563 |
+
print(f"π Available collections after cleanup: {available_collections}")
|
564 |
+
|
565 |
+
if not available_collections:
|
566 |
+
return "No collections available for search. Please upload some documents first.", "--", "No collections available", [], None, None, None, None
|
567 |
+
|
568 |
+
# Sort collections by timestamp to get the most recent one
|
569 |
+
# Collections are named like "documents_20250101_120000" or "folder_20250101_120000"
|
570 |
+
def extract_timestamp(collection_name):
|
571 |
+
try:
|
572 |
+
# Extract the timestamp part after the last underscore
|
573 |
+
parts = collection_name.split('_')
|
574 |
+
if len(parts) >= 3:
|
575 |
+
# Get the last two parts which should be date and time
|
576 |
+
date_part = parts[-2]
|
577 |
+
time_part = parts[-1]
|
578 |
+
timestamp = f"{date_part}_{time_part}"
|
579 |
+
return timestamp
|
580 |
+
return collection_name
|
581 |
+
except:
|
582 |
+
return collection_name
|
583 |
+
|
584 |
+
# Sort by timestamp in descending order (most recent first)
|
585 |
+
available_collections.sort(key=extract_timestamp, reverse=True)
|
586 |
+
collection_name = available_collections[0]
|
587 |
+
print(f"π Available collections sorted by timestamp: {available_collections}")
|
588 |
+
print(f"π Searching in most recent collection: {collection_name}")
|
589 |
+
|
590 |
+
# Add collection info to the search results for user clarity
|
591 |
+
collection_info = f"π Searching in collection: {collection_name}"
|
592 |
+
|
593 |
+
middleware = Middleware(collection_name, create_collection=False)
|
594 |
|
595 |
# Enhanced multi-page retrieval with vision-guided chunking approach
|
596 |
# Get more results than requested to allow for intelligent filtering
|
597 |
# Request 3x the number of results for better selection
|
598 |
search_results = middleware.search([query], topk=max(num_results * 3, 20))[0]
|
599 |
|
600 |
+
# π COMPREHENSIVE SEARCH RESULTS LOGGING
|
601 |
+
print(f"\nπ SEARCH RESULTS SUMMARY")
|
602 |
+
print(f"π Retrieved {len(search_results)} total results from search")
|
603 |
if len(search_results) > 0:
|
604 |
+
print(f"π Top result score: {search_results[0][0]:.4f}")
|
605 |
+
print(f"π Bottom result score: {search_results[-1][0]:.4f}")
|
606 |
+
print(f"π Score range: {search_results[-1][0]:.4f} - {search_results[0][0]:.4f}")
|
607 |
+
|
608 |
+
# Show top 5 results with page numbers
|
609 |
+
print(f"\nπ TOP 5 HIGHEST SCORING PAGES:")
|
610 |
+
for i, (score, doc_id) in enumerate(search_results[:5], 1):
|
611 |
+
page_num = doc_id + 1 # Convert to 1-based page numbering
|
612 |
+
print(f" {i}. Page {page_num} (doc_id: {doc_id}) - Score: {score:.4f}")
|
613 |
+
|
614 |
+
# Calculate and display score statistics
|
615 |
+
scores = [result[0] for result in search_results]
|
616 |
+
avg_score = sum(scores) / len(scores)
|
617 |
+
print(f"\nπ SCORE STATISTICS:")
|
618 |
+
print(f" Average Score: {avg_score:.4f}")
|
619 |
+
print(f" Score Variance: {sum((s - avg_score) ** 2 for s in scores) / len(scores):.4f}")
|
620 |
+
|
621 |
+
# Count pages by relevance level
|
622 |
+
excellent = sum(1 for s in scores if s >= 0.90)
|
623 |
+
very_good = sum(1 for s in scores if 0.80 <= s < 0.90)
|
624 |
+
good = sum(1 for s in scores if 0.70 <= s < 0.80)
|
625 |
+
moderate = sum(1 for s in scores if 0.60 <= s < 0.70)
|
626 |
+
basic = sum(1 for s in scores if 0.50 <= s < 0.60)
|
627 |
+
poor = sum(1 for s in scores if s < 0.50)
|
628 |
+
|
629 |
+
print(f"\nπ RELEVANCE DISTRIBUTION:")
|
630 |
+
print(f" π’ Excellent (β₯0.90): {excellent} pages")
|
631 |
+
print(f" π‘ Very Good (0.80-0.89): {very_good} pages")
|
632 |
+
print(f" π Good (0.70-0.79): {good} pages")
|
633 |
+
print(f" π΅ Moderate (0.60-0.69): {moderate} pages")
|
634 |
+
print(f" π£ Basic (0.50-0.59): {basic} pages")
|
635 |
+
print(f" π΄ Poor (<0.50): {poor} pages")
|
636 |
+
print("-" * 60)
|
637 |
|
638 |
if not search_results:
|
639 |
+
return "No search results found", "--", "No search results found for your query", [], None, None, None, None
|
640 |
|
641 |
# Implement intelligent multi-page selection based on research
|
642 |
+
selected_results = self._select_relevant_pages_new_format(search_results, query, num_results)
|
643 |
+
|
644 |
+
# π SELECTION LOGGING - Show which pages were selected
|
645 |
+
print(f"\nπ― PAGE SELECTION RESULTS")
|
646 |
+
print(f"π Requested: {num_results} pages")
|
647 |
+
print(f"π Selected: {len(selected_results)} pages")
|
648 |
+
print(f"π Selection rate: {len(selected_results)/len(search_results)*100:.1f}% of available results")
|
649 |
+
print("-" * 60)
|
650 |
+
|
651 |
+
print(f"π SELECTED PAGES WITH SCORES:")
|
652 |
+
for i, (score, doc_id) in enumerate(selected_results, 1):
|
653 |
+
page_num = doc_id + 1
|
654 |
+
relevance_level = self._get_relevance_level(score)
|
655 |
+
print(f" {i}. Page {page_num:2d} (doc_id: {doc_id:2d}) | Score: {score:8.4f} | {relevance_level}")
|
656 |
+
|
657 |
+
# Calculate selection statistics
|
658 |
+
if selected_results:
|
659 |
+
selected_scores = [result[0] for result in selected_results]
|
660 |
+
avg_selected_score = sum(selected_scores) / len(selected_scores)
|
661 |
+
print(f"\nπ SELECTION STATISTICS:")
|
662 |
+
print(f" Average selected score: {avg_selected_score:.4f}")
|
663 |
+
print(f" Highest selected score: {selected_scores[0]:.4f}")
|
664 |
+
print(f" Lowest selected score: {selected_scores[-1]:.4f}")
|
665 |
+
print(f" Score improvement over average: {avg_selected_score - avg_score:.4f}")
|
666 |
+
print("-" * 60)
|
667 |
|
668 |
# Process selected results
|
669 |
cited_pages = []
|
|
|
673 |
|
674 |
print(f"π Processing {len(selected_results)} selected results...")
|
675 |
|
676 |
+
# Ensure base directory exists and get the correct path
|
677 |
+
base_output_dir = self._ensure_base_directory()
|
678 |
+
print(f"π Using base directory: {base_output_dir}")
|
679 |
+
print(f"π Collection name: {collection_name}")
|
680 |
+
print(f"π Environment: {'Hugging Face Spaces' if self._is_huggingface_spaces() else 'Local Development'}")
|
681 |
+
|
682 |
+
for i, (score, doc_id) in enumerate(selected_results):
|
683 |
+
# Use the index as page number since doc_id is just an identifier
|
684 |
+
# This ensures we look for page_1.png, page_2.png, etc.
|
685 |
+
display_page_num = i + 1
|
686 |
+
coll_num = collection_name # Use the current collection name
|
687 |
+
|
688 |
+
# Use debug function to get paths and check existence
|
689 |
+
img_path, path, file_exists = self._debug_file_paths(base_output_dir, coll_num, display_page_num)
|
690 |
|
691 |
+
if file_exists:
|
692 |
img_paths.append(img_path)
|
693 |
all_paths.append(path)
|
694 |
page_scores.append(score)
|
|
|
696 |
print(f"β
Retrieved page {i+1}: {img_path} (Score: {score:.3f})")
|
697 |
else:
|
698 |
print(f"β Image file not found: {img_path}")
|
699 |
+
# Try alternative paths with better fallback logic
|
700 |
+
alt_paths = [
|
701 |
+
# Primary path (should work in Hugging Face Spaces)
|
702 |
+
img_path,
|
703 |
+
# Relative paths from app directory
|
704 |
+
os.path.join(os.path.dirname(os.path.abspath(__file__)), "pages", coll_num, f"page_{display_page_num}.png"),
|
705 |
+
# Current working directory paths
|
706 |
+
f"pages/{coll_num}/page_{display_page_num}.png",
|
707 |
+
f"./pages/{coll_num}/page_{display_page_num}.png",
|
708 |
+
os.path.join(os.getcwd(), "pages", coll_num, f"page_{display_page_num}.png"),
|
709 |
+
# Alternative base directories
|
710 |
+
os.path.join("/tmp", "pages", coll_num, f"page_{display_page_num}.png"),
|
711 |
+
os.path.join("/home/user", "pages", coll_num, f"page_{display_page_num}.png")
|
712 |
+
]
|
713 |
+
|
714 |
+
print(f"π Trying alternative paths for page {display_page_num}:")
|
715 |
+
for alt_path in alt_paths:
|
716 |
+
print(f" π Checking: {alt_path}")
|
717 |
+
if os.path.exists(alt_path):
|
718 |
+
print(f"β
Found alternative path: {alt_path}")
|
719 |
+
img_paths.append(alt_path)
|
720 |
+
all_paths.append(alt_path.replace(".png", ""))
|
721 |
+
page_scores.append(score)
|
722 |
+
cited_pages.append(f"Page {display_page_num} from {coll_num}")
|
723 |
+
break
|
724 |
+
else:
|
725 |
+
print(f"β No alternative path found for page {display_page_num}")
|
726 |
|
727 |
print(f"π Final count: {len(img_paths)} valid pages out of {len(selected_results)} selected")
|
728 |
|
729 |
+
# π FINAL RESULTS SUMMARY
|
730 |
+
if img_paths:
|
731 |
+
print(f"\nπ FINAL RETRIEVAL SUMMARY")
|
732 |
+
print(f"π Successfully retrieved: {len(img_paths)} pages")
|
733 |
+
print(f"π Final page scores:")
|
734 |
+
for i, (img_path, score) in enumerate(zip(img_paths, page_scores), 1):
|
735 |
+
# Extract page number from path
|
736 |
+
page_num = img_path.split('page_')[1].split('.png')[0] if 'page_' in img_path else f"Page {i}"
|
737 |
+
print(f" {i}. {page_num} - Score: {score:.4f}")
|
738 |
+
|
739 |
+
if page_scores:
|
740 |
+
final_avg_score = sum(page_scores) / len(page_scores)
|
741 |
+
print(f"\nπ FINAL STATISTICS:")
|
742 |
+
print(f" Average final score: {final_avg_score:.4f}")
|
743 |
+
print(f" Highest final score: {max(page_scores):.4f}")
|
744 |
+
print(f" Lowest final score: {min(page_scores):.4f}")
|
745 |
+
print("=" * 60)
|
746 |
+
|
747 |
if not img_paths:
|
748 |
+
return "No valid image files found", "--", "Error: No valid image files found for the search results", [], None, None, None, None
|
749 |
|
750 |
# Generate RAG response with multiple pages using enhanced approach
|
751 |
+
try:
|
752 |
+
print("π€ Generating RAG response...")
|
753 |
+
rag_response, csv_filepath, doc_filepath, excel_filepath = self._generate_multi_page_response(query, img_paths, cited_pages, page_scores)
|
754 |
+
print("β
RAG response generated successfully")
|
755 |
+
except Exception as e:
|
756 |
+
error_code = "RAG001"
|
757 |
+
error_msg = f"β **Error {error_code}**: Failed to generate RAG response"
|
758 |
+
print(f"{error_msg}: {str(e)}")
|
759 |
+
print(f"β Traceback: {traceback.format_exc()}")
|
760 |
+
|
761 |
+
# Return error response with proper format
|
762 |
+
return (
|
763 |
+
error_msg, # path
|
764 |
+
"--", # images
|
765 |
+
f"{error_msg}\n\n**Details**: {str(e)}\n\n**Error Code**: {error_code}", # llm_answer
|
766 |
+
cited_pages, # cited_pages_display
|
767 |
+
None, # csv_download
|
768 |
+
None, # doc_download
|
769 |
+
None # excel_download
|
770 |
+
)
|
771 |
|
772 |
# Prepare downloads
|
773 |
csv_download = self._prepare_csv_download(csv_filepath)
|
|
|
793 |
|
794 |
except Exception as e:
|
795 |
error_msg = f"Error during search: {str(e)}"
|
796 |
+
print(f"β Search error: {error_msg}")
|
797 |
+
# Return exactly 7 outputs to match Gradio expectations
|
798 |
return error_msg, "--", error_msg, [], None, None, None, None
|
799 |
+
|
800 |
+
def _select_relevant_pages_new_format(self, search_results, query, num_results):
|
801 |
"""
|
802 |
+
Intelligent page selection for new Milvus format: (score, doc_id)
|
|
|
803 |
"""
|
804 |
if len(search_results) <= num_results:
|
805 |
return search_results
|
806 |
|
|
|
807 |
# Sort by relevance score
|
808 |
sorted_results = sorted(search_results, key=lambda x: x[0], reverse=True)
|
809 |
|
810 |
+
# Simple strategy: take top N results
|
811 |
+
selected = sorted_results[:num_results]
|
|
|
|
812 |
|
813 |
+
print(f"Requested {num_results} pages, selected {len(selected)} pages")
|
|
|
|
814 |
|
815 |
return selected
|
816 |
|
817 |
+
def _get_relevance_level(self, score):
|
818 |
+
"""Get human-readable relevance level based on score"""
|
819 |
+
if score >= 0.90:
|
820 |
+
return "π’ EXCELLENT - Highly relevant"
|
821 |
+
elif score >= 0.80:
|
822 |
+
return "π‘ VERY GOOD - Very relevant"
|
823 |
+
elif score >= 0.70:
|
824 |
+
return "π GOOD - Relevant"
|
825 |
+
elif score >= 0.60:
|
826 |
+
return "π΅ MODERATE - Somewhat relevant"
|
827 |
+
elif score >= 0.50:
|
828 |
+
return "π£ BASIC - Minimally relevant"
|
829 |
+
else:
|
830 |
+
return "π΄ POOR - Not relevant"
|
831 |
+
|
832 |
def _optimize_consecutive_pages(self, selected, all_results, target_count=None):
|
833 |
"""
|
834 |
Optimize selection to include consecutive pages when beneficial
|
|
|
1365 |
cell_str = str(cell)
|
1366 |
if ',' in cell_str or '"' in cell_str or '\n' in cell_str:
|
1367 |
# Escape quotes and wrap in quotes
|
1368 |
+
cell_str = '"' + cell_str.replace('"', '""') + '"'
|
1369 |
escaped_row.append(cell_str)
|
1370 |
csv_lines.append(','.join(escaped_row))
|
1371 |
|
|
|
2996 |
# Fallback to simple response with enhanced prompt
|
2997 |
return rag.get_answer_from_gemini(detailed_prompt, img_paths), None, None, None
|
2998 |
|
2999 |
+
# Authentication and team collection methods removed for simplified app
|
3000 |
+
|
3001 |
+
def _is_huggingface_spaces(self):
|
3002 |
+
"""Check if running in Hugging Face Spaces environment"""
|
3003 |
+
return (
|
3004 |
+
os.path.exists("/tmp") and
|
3005 |
+
os.access("/tmp", os.W_OK) and
|
3006 |
+
(os.getenv('SPACE_ID') or os.getenv('HF_SPACE_ID'))
|
3007 |
+
)
|
|
|
3008 |
|
3009 |
+
def _get_optimal_base_dir(self):
|
3010 |
+
"""Get the optimal base directory based on environment"""
|
3011 |
+
if self._is_huggingface_spaces():
|
3012 |
+
base_dir = "/tmp/pages"
|
3013 |
+
print(f"π Detected Hugging Face Spaces environment, using: {base_dir}")
|
3014 |
+
else:
|
3015 |
+
# Use relative path from app directory
|
3016 |
+
app_dir = os.path.dirname(os.path.abspath(__file__))
|
3017 |
+
base_dir = os.path.join(app_dir, "pages")
|
3018 |
+
print(f"π» Using local development path: {base_dir}")
|
3019 |
+
|
3020 |
+
# Ensure directory exists
|
3021 |
+
os.makedirs(base_dir, exist_ok=True)
|
3022 |
+
return base_dir
|
3023 |
+
|
3024 |
+
def _ensure_base_directory(self):
|
3025 |
+
"""Ensure the base directory for storing pages exists"""
|
3026 |
+
base_output_dir = self._get_optimal_base_dir()
|
3027 |
+
|
3028 |
+
# Create the base directory if it doesn't exist
|
3029 |
+
if not os.path.exists(base_output_dir):
|
3030 |
+
try:
|
3031 |
+
os.makedirs(base_output_dir, exist_ok=True)
|
3032 |
+
print(f"β
Created base directory: {base_output_dir}")
|
3033 |
+
except Exception as e:
|
3034 |
+
print(f"β Failed to create base directory {base_output_dir}: {e}")
|
3035 |
+
# Fallback to current working directory
|
3036 |
+
base_output_dir = os.path.join(os.getcwd(), "pages")
|
3037 |
+
os.makedirs(base_output_dir, exist_ok=True)
|
3038 |
+
print(f"β
Using fallback directory: {base_output_dir}")
|
3039 |
+
|
3040 |
+
return base_output_dir
|
3041 |
|
3042 |
+
def _debug_file_paths(self, base_output_dir, coll_num, display_page_num):
|
3043 |
+
"""Helper function to debug file path issues"""
|
3044 |
+
img_path = os.path.join(base_output_dir, coll_num, f"page_{display_page_num}.png")
|
3045 |
+
path = os.path.join(base_output_dir, coll_num, f"page_{display_page_num}")
|
3046 |
+
|
3047 |
+
# Check if directory exists
|
3048 |
+
dir_path = os.path.dirname(img_path)
|
3049 |
+
dir_exists = os.path.exists(dir_path)
|
3050 |
|
3051 |
+
# Check if file exists
|
3052 |
+
file_exists = os.path.exists(img_path)
|
|
|
3053 |
|
3054 |
+
# Get absolute paths for debugging
|
3055 |
+
abs_img_path = os.path.abspath(img_path)
|
3056 |
+
abs_dir_path = os.path.abspath(dir_path)
|
3057 |
|
3058 |
+
print(f"π Path Debug for {coll_num}/page_{display_page_num}:")
|
3059 |
+
print(f" Base dir: {base_output_dir}")
|
3060 |
+
print(f" Directory: {dir_path} (exists: {dir_exists})")
|
3061 |
+
print(f" File: {img_path} (exists: {file_exists})")
|
3062 |
+
print(f" Abs dir: {abs_dir_path}")
|
3063 |
+
print(f" Abs file: {abs_img_path}")
|
3064 |
|
3065 |
+
return img_path, path, file_exists
|
3066 |
+
|
3067 |
+
def _cleanup_invalid_collections(self):
|
3068 |
+
"""Remove collections that no longer exist in Milvus from indexed_docs"""
|
3069 |
+
invalid_collections = []
|
3070 |
+
|
3071 |
+
for collection_name in list(self.indexed_docs.keys()):
|
3072 |
+
try:
|
3073 |
+
# Try to create a middleware instance to check if collection exists
|
3074 |
+
middleware = Middleware(collection_name, create_collection=False)
|
3075 |
+
print(f"οΏ½οΏ½ Collection {collection_name} is valid")
|
3076 |
+
except Exception as e:
|
3077 |
+
print(f"β οΈ Collection {collection_name} not accessible: {e}")
|
3078 |
+
invalid_collections.append(collection_name)
|
3079 |
+
|
3080 |
+
# Remove invalid collections
|
3081 |
+
for collection_name in invalid_collections:
|
3082 |
+
if collection_name in self.indexed_docs:
|
3083 |
+
del self.indexed_docs[collection_name]
|
3084 |
+
print(f"ποΈ Removed invalid collection: {collection_name}")
|
3085 |
+
|
3086 |
+
return len(invalid_collections)
|
3087 |
+
|
3088 |
+
def _check_collections_exist(self):
|
3089 |
+
# This method should be implemented to check if collections exist in Milvus
|
3090 |
+
pass
|
3091 |
|
3092 |
def create_ui():
|
3093 |
app = PDFSearchApp()
|
3094 |
|
3095 |
with gr.Blocks(theme=gr.themes.Ocean(), css="footer{display:none !important}") as demo:
|
|
|
3096 |
gr.Markdown("# Collar Multimodal RAG Demo - Streamlined")
|
3097 |
+
gr.Markdown("Basic document upload and search (no authentication)")
|
3098 |
+
|
3099 |
+
# Document Upload
|
3100 |
+
with gr.Tab("π Document Upload"):
|
|
|
|
3101 |
with gr.Column():
|
3102 |
+
gr.Markdown("### Upload Documents")
|
3103 |
folder_name_input = gr.Textbox(
|
3104 |
+
label="Collection Name (Optional)",
|
3105 |
+
placeholder="Optional name for this document collection"
|
3106 |
)
|
3107 |
max_pages_input = gr.Slider(
|
3108 |
minimum=1,
|
|
|
3112 |
label="Max pages to extract and index per document"
|
3113 |
)
|
3114 |
file_input = gr.Files(
|
3115 |
+
label="Upload PPTs/PDFs (Multiple files supported)",
|
3116 |
file_count="multiple"
|
3117 |
)
|
3118 |
+
upload_btn = gr.Button("Upload", variant="primary")
|
3119 |
upload_status = gr.Textbox(label="Upload Status", interactive=False)
|
|
|
|
|
3120 |
|
3121 |
# Enhanced Query Tab
|
3122 |
with gr.Tab("π Advanced Query"):
|
|
|
3185 |
|
3186 |
|
3187 |
# Event handlers
|
|
|
3188 |
upload_btn.click(
|
3189 |
fn=app.upload_and_convert,
|
3190 |
+
inputs=[file_input, max_pages_input, folder_name_input],
|
3191 |
outputs=[upload_status]
|
3192 |
)
|
3193 |
|
|
|
|
3194 |
# Query events
|
3195 |
search_btn.click(
|
3196 |
fn=app.search_documents,
|
3197 |
+
inputs=[query_input, num_results],
|
3198 |
outputs=[path, images, llm_answer, cited_pages_display, csv_download, doc_download, excel_download]
|
3199 |
)
|
3200 |
|
colpali_manager.py
CHANGED
@@ -25,7 +25,7 @@ import dotenv
 25   dotenv_file = dotenv.find_dotenv()
 26   dotenv.load_dotenv(dotenv_file)
 27
 28 - model_name =
 29   device = get_torch_device("cuda") #try using cpu instead of cuda?
 30
 31   #switch to locally downloading models & loading locally rather than from hf
@@ -97,7 +97,7 @@ class ColpaliManager:
 97   return [Image.open(path) for path in paths]
 98
 99   @spaces.GPU
100 - def process_images(self, image_paths:list[str], batch_size=
101   model.to("cuda")
102   print(f"Processing {len(image_paths)} image_paths")
103
@@ -161,7 +161,7 @@ class ColpaliManager:
161
162   dataloader = DataLoader(
163   dataset=ListDataset[str](texts),
164 - batch_size=
165   shuffle=False,
166   collate_fn=lambda x: processor.process_queries(x),
167   )

 25   dotenv_file = dotenv.find_dotenv()
 26   dotenv.load_dotenv(dotenv_file)
 27
 28 + model_name = 'vidore/colpali-v1.3' #"vidore/colSmol-256M"
 29   device = get_torch_device("cuda") #try using cpu instead of cuda?
 30
 31   #switch to locally downloading models & loading locally rather than from hf

 97   return [Image.open(path) for path in paths]
 98
 99   @spaces.GPU
100 + def process_images(self, image_paths:list[str], batch_size=5):
101   model.to("cuda")
102   print(f"Processing {len(image_paths)} image_paths")
103

161
162   dataloader = DataLoader(
163   dataset=ListDataset[str](texts),
164 + batch_size=5, #OG is 5, try reducing batch size to maximise gpu use
165   shuffle=False,
166   collate_fn=lambda x: processor.process_queries(x),
167   )
middleware.py
CHANGED
@@ -43,20 +43,40 @@ class Middleware:
 43   print("Indexing completed")
 44
 45   return image_paths
 46 -
 47
 48
 49   def search(self, search_queries: list[str], topk: int = 10):
 50 - print(f"
 51
 52   final_res = []
 53
 54 - for query in search_queries:
 55 - print(f"
 56   query_vec = colpali_manager.process_text([query])[0]
 57   search_res = self.milvus_manager.search(query_vec, topk=topk)
 58 -
 59   final_res.append(search_res)
 60
 61   return final_res
 62

 43   print("Indexing completed")
 44
 45   return image_paths
 46 +
 47 + def drop_collection(self):
 48 + """Drop the current collection from Milvus"""
 49 + return self.milvus_manager.drop_collection()
 50
 51
 52   def search(self, search_queries: list[str], topk: int = 10):
 53 + print(f"\nπ MIDDLEWARE SEARCH INITIATED")
 54 + print(f"π Queries to process: {len(search_queries)}")
 55 + print(f"π― Top-k requested: {topk}")
 56 + print("-" * 60)
 57
 58   final_res = []
 59
 60 + for i, query in enumerate(search_queries, 1):
 61 + print(f"\nπ Processing Query {i}/{len(search_queries)}: '{query}'")
 62 + print(f"π Converting query to vector representation...")
 63 +
 64   query_vec = colpali_manager.process_text([query])[0]
 65 + print(f"β Query vector generated (dimension: {len(query_vec)})")
 66 +
 67 + print(f"π Executing vector search in Milvus...")
 68   search_res = self.milvus_manager.search(query_vec, topk=topk)
 69 +
 70 + print(f"β Search completed: {len(search_res)} results retrieved")
 71 + if search_res:
 72 + print(f"π Score range: {search_res[0][0]:.4f} (highest) to {search_res[-1][0]:.4f} (lowest)")
 73 +
 74   final_res.append(search_res)
 75
 76 + print(f"\nπ MIDDLEWARE SEARCH COMPLETED")
 77 + print(f"π Total queries processed: {len(search_queries)}")
 78 + print(f"π Total results across all queries: {sum(len(res) for res in final_res)}")
 79 + print("=" * 60)
 80 +
 81   return final_res
 82
milvus_manager.py
CHANGED
@@ -1,49 +1,24 @@
  1   from pymilvus import MilvusClient, DataType
  2 - try:
  3 - from milvus import default_server # Milvus Lite
  4 - except Exception:
  5 - default_server = None
  6   import numpy as np
  7   import concurrent.futures
  8 -
  9 - import os
 10
 11   class MilvusManager:
 12   def __init__(self, milvus_uri, collection_name, create_collection, dim=128):
 13 -
 14 - #import environ variables from .env
 15 - import dotenv
 16 - # Load the .env file
 17 - dotenv_file = dotenv.find_dotenv()
 18 - dotenv.load_dotenv(dotenv_file)
 19 -
 20 - # Start embedded Milvus Lite server and connect locally
 21 - if default_server is not None:
 22 - try:
 23 - # Optionally set base dir here if desired, e.g. default_server.set_base_dir('volumes/milvus_lite')
 24 - default_server.start()
 25 - except Exception:
 26 - pass
 27 - local_uri = f"http://127.0.0.1:{default_server.listen_port}"
 28 - self.client = MilvusClient(uri=local_uri)
 29 - else:
 30 - # Fallback to standard local server (assumes docker-compose or system service)
 31 - self.client = MilvusClient(uri="http://127.0.0.1:19530")
 32   self.collection_name = collection_name
 33   self.dim = dim
 34
 35 - if
 36 - self.client.load_collection(collection_name=self.collection_name)
 37 - print("Loaded existing collection.")
 38 - elif create_collection:
 39   self.create_collection()
 40   self.create_index()
 41
 42   def create_collection(self):
 43   if self.client.has_collection(collection_name=self.collection_name):
 44 -
 45 - return
 46 -
 47   schema = self.client.create_schema(
 48   auto_id=True,
 49   enable_dynamic_fields=True,
@@ -61,16 +36,19 @@ class MilvusManager:
|
|
61 |
)
|
62 |
|
63 |
def create_index(self):
|
|
|
|
|
|
|
|
|
64 |
index_params = self.client.prepare_index_params()
|
65 |
-
|
66 |
index_params.add_index(
|
67 |
field_name="vector",
|
68 |
index_name="vector_index",
|
69 |
-
index_type="
|
70 |
-
metric_type=
|
71 |
params={
|
72 |
-
"M":
|
73 |
-
"efConstruction":
|
74 |
},
|
75 |
)
|
76 |
|
@@ -78,78 +56,33 @@ class MilvusManager:
|
|
78 |
collection_name=self.collection_name, index_params=index_params, sync=True
|
79 |
)
|
80 |
|
81 |
-
def
|
82 |
-
|
83 |
-
collections = self.client.list_collections()
|
84 |
-
|
85 |
-
# Set search parameters (here, using Inner Product metric).
|
86 |
-
search_params = {"metric_type": os.environ["metrictype"], "params": {}} #default metric type is "IP"
|
87 |
-
|
88 |
-
# Set to store unique (doc_id, collection_name) pairs across all collections.
|
89 |
-
doc_collection_pairs = set()
|
90 |
-
|
91 |
-
# Query each collection individually
|
92 |
-
for collection in collections:
|
93 |
-
self.client.load_collection(collection_name=collection)
|
94 |
-
print("collection loaded:"+ collection)
|
95 |
-
results = self.client.search(
|
96 |
-
collection,
|
97 |
-
data,
|
98 |
-
limit=int(os.environ["topk"]), # Adjust limit per collection as needed. (default is 50)
|
99 |
-
output_fields=["vector", "seq_id", "doc_id"],
|
100 |
-
search_params=search_params,
|
101 |
-
)
|
102 |
-
# Accumulate document IDs along with their originating collection.
|
103 |
-
for r_id in range(len(results)):
|
104 |
-
for r in range(len(results[r_id])):
|
105 |
-
doc_id = results[r_id][r]["entity"]["doc_id"]
|
106 |
-
doc_collection_pairs.add((doc_id, collection))
|
107 |
|
108 |
-
|
|
|
|
|
|
|
|
|
|
|
109 |
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
collection_name=collection_name,
|
114 |
-
filter=f"doc_id in [{doc_id}, {doc_id + 1}]",
|
115 |
-
output_fields=["seq_id", "vector", "doc"],
|
116 |
-
limit=16380,
|
117 |
-
)
|
118 |
-
# Stack the vectors for dot product computation.
|
119 |
-
doc_vecs = np.vstack(
|
120 |
-
[doc_colbert_vecs[i]["vector"] for i in range(len(doc_colbert_vecs))]
|
121 |
-
)
|
122 |
-
# Compute a similarity score via dot product.
|
123 |
-
score = np.dot(data, doc_vecs.T).max(1).sum()
|
124 |
-
return (score, doc_id, collection_name)
|
125 |
|
126 |
-
|
127 |
-
with concurrent.futures.ThreadPoolExecutor(max_workers=300) as executor:
|
128 |
-
futures = {
|
129 |
-
executor.submit(rerank_single_doc, doc_id, data, self.client, collection): (doc_id, collection)
|
130 |
-
for doc_id, collection in doc_collection_pairs
|
131 |
-
}
|
132 |
-
for future in concurrent.futures.as_completed(futures):
|
133 |
-
score, doc_id, collection = future.result()
|
134 |
-
scores.append((score, doc_id, collection))
|
135 |
-
#doc_id is page number!
|
136 |
-
|
137 |
-
# Sort the reranked results by score in descending order.
|
138 |
-
scores.sort(key=lambda x: x[0], reverse=True)
|
139 |
-
# Unload the collection after search to free memory.
|
140 |
-
self.client.release_collection(collection_name=collection)
|
141 |
-
|
142 |
-
return scores[:topk] if len(scores) >= topk else scores #topk is the number of scores to return back
|
143 |
-
"""
|
144 |
search_params = {"metric_type": "IP", "params": {}}
|
145 |
results = self.client.search(
|
146 |
self.collection_name,
|
147 |
data,
|
148 |
-
limit=50,
|
149 |
output_fields=["vector", "seq_id", "doc_id"],
|
150 |
search_params=search_params,
|
151 |
)
|
152 |
-
doc_ids =
|
|
|
|
|
|
|
153 |
|
154 |
scores = []
|
155 |
|
@@ -161,10 +94,10 @@ class MilvusManager:
|
|
161 |
limit=1000,
|
162 |
)
|
163 |
doc_vecs = np.vstack(
|
164 |
-
[
|
165 |
)
|
166 |
score = np.dot(data, doc_vecs.T).max(1).sum()
|
167 |
-
return score, doc_id
|
168 |
|
169 |
with concurrent.futures.ThreadPoolExecutor(max_workers=300) as executor:
|
170 |
futures = {
|
@@ -178,13 +111,59 @@ class MilvusManager:
|
|
178 |
scores.append((score, doc_id))
|
179 |
|
180 |
scores.sort(key=lambda x: x[0], reverse=True)
|
181 |
-
|
182 |
-
|
|
|
|
183 |
|
184 |
def insert(self, data):
|
185 |
-
colbert_vecs = data["colbert_vecs"]
|
186 |
seq_length = len(colbert_vecs)
|
187 |
-
doc_ids = [data["doc_id"]
|
188 |
seq_ids = list(range(seq_length))
|
189 |
docs = [""] * seq_length
|
190 |
docs[0] = data["filepath"]
|
@@ -202,17 +181,38 @@ class MilvusManager:
|
|
202 |
],
|
203 |
)
|
204 |
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
|
|
|
|
|
|
|
|
211 |
}
|
212 |
-
|
213 |
-
|
|
|
|
|
214 |
|
215 |
def insert_images_data(self, image_data):
|
216 |
data = self.get_images_as_doc(image_data)
|
217 |
-
|
218 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
from pymilvus import MilvusClient, DataType
|
|
|
|
|
|
|
|
|
2 |
import numpy as np
|
3 |
import concurrent.futures
|
4 |
+
|
|
|
5 |
|
6 |
class MilvusManager:
|
7 |
def __init__(self, milvus_uri, collection_name, create_collection, dim=128):
|
8 |
+
self.client = MilvusClient(uri=milvus_uri)
|
|
|
|
|
9 |
self.collection_name = collection_name
|
10 |
+
if self.client.has_collection(collection_name=self.collection_name):
|
11 |
+
self.client.load_collection(collection_name)
|
12 |
self.dim = dim
|
13 |
|
14 |
+
if create_collection:
|
|
|
|
|
|
|
15 |
self.create_collection()
|
16 |
self.create_index()
|
17 |
|
18 |
+
|
19 |
def create_collection(self):
|
20 |
if self.client.has_collection(collection_name=self.collection_name):
|
21 |
+
self.client.drop_collection(collection_name=self.collection_name)
|
|
|
|
|
22 |
schema = self.client.create_schema(
|
23 |
auto_id=True,
|
24 |
enable_dynamic_fields=True,
|
|
|
36 |
)
|
37 |
|
38 |
def create_index(self):
|
39 |
+
self.client.release_collection(collection_name=self.collection_name)
|
40 |
+
self.client.drop_index(
|
41 |
+
collection_name=self.collection_name, index_name="vector"
|
42 |
+
)
|
43 |
index_params = self.client.prepare_index_params()
|
|
|
44 |
index_params.add_index(
|
45 |
field_name="vector",
|
46 |
index_name="vector_index",
|
47 |
+
index_type="FLAT",
|
48 |
+
metric_type="IP",
|
49 |
params={
|
50 |
+
"M": 16,
|
51 |
+
"efConstruction": 500,
|
52 |
},
|
53 |
)
|
54 |
|
|
|
56 |
collection_name=self.collection_name, index_params=index_params, sync=True
|
57 |
)
|
58 |
|
59 |
+
def create_scalar_index(self):
|
60 |
+
self.client.release_collection(collection_name=self.collection_name)
|
|
|
|
|
61 |
|
62 |
+
index_params = self.client.prepare_index_params()
|
63 |
+
index_params.add_index(
|
64 |
+
field_name="doc_id",
|
65 |
+
index_name="int32_index",
|
66 |
+
index_type="INVERTED",
|
67 |
+
)
|
68 |
|
69 |
+
self.client.create_index(
|
70 |
+
collection_name=self.collection_name, index_params=index_params, sync=True
|
71 |
+
)
|
|
|
|
|
72 |
|
73 |
+
def search(self, data, topk):
|
|
|
|
|
|
74 |
search_params = {"metric_type": "IP", "params": {}}
|
75 |
results = self.client.search(
|
76 |
self.collection_name,
|
77 |
data,
|
78 |
+
limit=int(50),
|
79 |
output_fields=["vector", "seq_id", "doc_id"],
|
80 |
search_params=search_params,
|
81 |
)
|
82 |
+
doc_ids = set()
|
83 |
+
for r_id in range(len(results)):
|
84 |
+
for r in range(len(results[r_id])):
|
85 |
+
doc_ids.add(results[r_id][r]["entity"]["doc_id"])
|
86 |
|
87 |
scores = []
|
88 |
|
|
|
94 |
limit=1000,
|
95 |
)
|
96 |
doc_vecs = np.vstack(
|
97 |
+
[doc_colbert_vecs[i]["vector"] for i in range(len(doc_colbert_vecs))]
|
98 |
)
|
99 |
score = np.dot(data, doc_vecs.T).max(1).sum()
|
100 |
+
return (score, doc_id)
|
101 |
|
102 |
with concurrent.futures.ThreadPoolExecutor(max_workers=300) as executor:
|
103 |
futures = {
|
|
|
111 |
scores.append((score, doc_id))
|
112 |
|
113 |
scores.sort(key=lambda x: x[0], reverse=True)
|
114 |
+
|
115 |
+
# π DETAILED SCORE LOGGING - Print page numbers with highest scores
|
116 |
+
print("\n" + "="*80)
|
117 |
+
print("π RETRIEVAL SCORES - PAGE NUMBERS WITH HIGHEST SCORES")
|
118 |
+
print("="*80)
|
119 |
+
print(f"π Collection: {self.collection_name}")
|
120 |
+
print(f"π Total documents found: {len(scores)}")
|
121 |
+
print(f"π― Requested top-k: {topk}")
|
122 |
+
print("-"*80)
|
123 |
+
|
124 |
+
# Display top 10 scores with detailed information
|
125 |
+
display_count = min(10, len(scores))
|
126 |
+
for i, (score, doc_id) in enumerate(scores[:display_count]):
|
127 |
+
page_num = doc_id + 1 # Convert doc_id to page number (0-based to 1-based)
|
128 |
+
relevance_level = self._get_relevance_level(score)
|
129 |
+
print(f"π Page {page_num:2d} (doc_id: {doc_id:2d}) | Score: {score:8.4f} | {relevance_level}")
|
130 |
+
|
131 |
+
if len(scores) > display_count:
|
132 |
+
print(f"... and {len(scores) - display_count} more results")
|
133 |
+
|
134 |
+
print("-"*80)
|
135 |
+
print(f"π HIGHEST SCORING PAGES:")
|
136 |
+
top_3 = scores[:3]
|
137 |
+
for i, (score, doc_id) in enumerate(top_3, 1):
|
138 |
+
page_num = doc_id + 1
|
139 |
+
print(f" {i}. Page {page_num} - Score: {score:.4f}")
|
140 |
+
|
141 |
+
print("="*80 + "\n")
|
142 |
+
|
143 |
+
if len(scores) >= topk:
|
144 |
+
return scores[:topk]
|
145 |
+
else:
|
146 |
+
return scores
|
147 |
+
|
148 |
+
def _get_relevance_level(self, score):
|
149 |
+
"""Get human-readable relevance level based on score"""
|
150 |
+
if score >= 0.90:
|
151 |
+
return "π’ EXCELLENT - Highly relevant"
|
152 |
+
elif score >= 0.80:
|
153 |
+
return "π‘ VERY GOOD - Very relevant"
|
154 |
+
elif score >= 0.70:
|
155 |
+
return "π GOOD - Relevant"
|
156 |
+
elif score >= 0.60:
|
157 |
+
return "π΅ MODERATE - Somewhat relevant"
|
158 |
+
elif score >= 0.50:
|
159 |
+
return "π£ BASIC - Minimally relevant"
|
160 |
+
else:
|
161 |
+
return "π΄ POOR - Not relevant"
|
162 |
|
163 |
def insert(self, data):
|
164 |
+
colbert_vecs = [vec for vec in data["colbert_vecs"]]
|
165 |
seq_length = len(colbert_vecs)
|
166 |
+
doc_ids = [data["doc_id"] for i in range(seq_length)]
|
167 |
seq_ids = list(range(seq_length))
|
168 |
docs = [""] * seq_length
|
169 |
docs[0] = data["filepath"]
|
|
|
181 |
],
|
182 |
)
|
183 |
|
184 |
+
|
185 |
+
def get_images_as_doc(self, images_with_vectors:list):
|
186 |
+
|
187 |
+
images_data = []
|
188 |
+
|
189 |
+
for i in range(len(images_with_vectors)):
|
190 |
+
data = {
|
191 |
+
"colbert_vecs": images_with_vectors[i]["colbert_vecs"],
|
192 |
+
"doc_id": i,
|
193 |
+
"filepath": images_with_vectors[i]["filepath"],
|
194 |
}
|
195 |
+
images_data.append(data)
|
196 |
+
|
197 |
+
return images_data
|
198 |
+
|
199 |
|
200 |
def insert_images_data(self, image_data):
|
201 |
data = self.get_images_as_doc(image_data)
|
202 |
+
|
203 |
+
for i in range(len(data)):
|
204 |
+
self.insert(data[i])
|
205 |
+
|
206 |
+
def drop_collection(self):
|
207 |
+
"""Drop the current collection from Milvus"""
|
208 |
+
try:
|
209 |
+
if self.client.has_collection(collection_name=self.collection_name):
|
210 |
+
self.client.drop_collection(collection_name=self.collection_name)
|
211 |
+
print(f"ποΈ Dropped Milvus collection: {self.collection_name}")
|
212 |
+
return True
|
213 |
+
else:
|
214 |
+
print(f"β οΈ Collection {self.collection_name} does not exist in Milvus")
|
215 |
+
return False
|
216 |
+
except Exception as e:
|
217 |
+
print(f"β Error dropping collection {self.collection_name}: {e}")
|
218 |
+
return False
|
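Note on the scoring above: search first pulls candidate doc_ids with a coarse vector search, then reranks each candidate page with a ColBERT-style late-interaction score. The line np.dot(data, doc_vecs.T).max(1).sum() takes, for every query vector, its best inner product over all of a page's stored vectors and sums those maxima. A minimal standalone sketch of that scoring step (illustrative only; maxsim_score and the toy vectors below are not part of the repository):

import numpy as np

def maxsim_score(query_vecs: np.ndarray, doc_vecs: np.ndarray) -> float:
    # query_vecs: (num_query_tokens, dim), doc_vecs: (num_doc_patches, dim)
    # For each query token keep its best inner product over all document
    # vectors, then sum across query tokens - the same computation as
    # np.dot(data, doc_vecs.T).max(1).sum() in MilvusManager.search.
    return float(np.dot(query_vecs, doc_vecs.T).max(axis=1).sum())

# Toy example with dim=4: two query tokens against three document vectors.
query = np.array([[1.0, 0.0, 0.0, 0.0],
                  [0.0, 1.0, 0.0, 0.0]])
doc = np.array([[0.9, 0.1, 0.0, 0.0],
                [0.2, 0.8, 0.0, 0.0],
                [0.0, 0.0, 1.0, 0.0]])
print(maxsim_score(query, doc))  # 0.9 + 0.8 = 1.7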
pdf_manager.py
CHANGED
@@ -4,7 +4,21 @@ import shutil
 
 class PdfManager:
     def __init__(self):
-        ...
+        # Use relative paths for Hugging Face Spaces compatibility
+        # Get the directory where the main application file is located
+        app_dir = os.path.dirname(os.path.abspath(__file__))
+
+        # Use /tmp for Hugging Face Spaces, fallback to relative path
+        if os.path.exists("/tmp") and os.access("/tmp", os.W_OK):
+            self.base_output_dir = "/tmp/pages"
+            print(f"Using /tmp directory for Hugging Face Spaces: {self.base_output_dir}")
+        else:
+            # Fallback to relative path from app directory
+            self.base_output_dir = os.path.join(app_dir, "pages")
+            print(f"Using relative path: {self.base_output_dir}")
+
+        # Ensure the base directory exists
+        os.makedirs(self.base_output_dir, exist_ok=True)
 
     def clear_and_recreate_dir(self, output_folder):
 
@@ -19,7 +33,8 @@ class PdfManager:
         #print("Clearing is unused for now for persistency")
 
     def save_images(self, id, pdf_path, max_pages, pages: list[int] = None) -> list[str]:
-        ...
+        # Use absolute path for Hugging Face Spaces compatibility
+        output_folder = os.path.join(self.base_output_dir, id)
         images = convert_from_path(pdf_path)
 
         print(f"Saving images from {pdf_path} to {output_folder}. Max pages: {max_pages}")
 
@@ -35,7 +50,7 @@ class PdfManager:
             if pages and i not in pages:
                 continue
 
-            full_save_path = f"
+            full_save_path = os.path.join(output_folder, f"page_{i + 1}.png")
 
             #print(f"Saving image to {full_save_path}")
 
@@ -43,4 +58,4 @@
 
             num_page_processed += 1
 
-        return [f"
+        return [os.path.join(output_folder, f"page_{i + 1}.png") for i in range(num_page_processed)]
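The updated PdfManager resolves all page-image paths from one writable base directory instead of a hard-coded relative folder. A small sketch of the same selection logic in isolation (illustrative only; resolve_pages_dir and "demo_doc" are made-up names, not part of the repository):

import os

def resolve_pages_dir(app_dir: str) -> str:
    # Mirrors the __init__ logic above: prefer /tmp on Hugging Face Spaces
    # (the reliably writable location), otherwise fall back to a "pages"
    # folder next to the application code.
    if os.path.exists("/tmp") and os.access("/tmp", os.W_OK):
        return "/tmp/pages"
    return os.path.join(app_dir, "pages")

base = resolve_pages_dir(os.path.dirname(os.path.abspath(__file__)))
# A document with id "demo_doc" would then have its page images written to
# paths like <base>/demo_doc/page_1.png, matching what save_images returns.
print(os.path.join(base, "demo_doc", "page_1.png"))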
rag.py
CHANGED
@@ -5,7 +5,7 @@ import re
 from typing import List
 from utils import encode_image
 from PIL import Image
-from
+from ollama import chat
 import torch
 import subprocess
 import psutil
 
@@ -64,30 +64,28 @@ class Rag:
 
         return response_text
 
-    def get_answer_from_gemini(self, query
-        ...
+    def get_answer_from_gemini(self, query, imagePaths):
+
+
+        print(f"Querying Gemini for query={query}, imagePaths={imagePaths}")
+
         try:
-            ...
-            pass
-
-            chat_session = model.start_chat()
-            response = chat_session.send_message([*images, query])
-            return response.text
+            client = genai.Client(api_key="AIzaSyCwRr9054tCuh2S8yGpwKFvOAxYMT4WNIs")
+
+            images = [Image.open(path) for path in imagePaths]
+
+            response = client.models.generate_content(
+                model="gemini-2.5-flash",
+                contents=[images, query],
+            )
+
+            print(response.text)
+            answer = response.text
+
+            return answer
+
         except Exception as e:
-            print(f"
+            print(f"An error occurred while querying Gemini: {e}")
             return f"Error: {str(e)}"
 
     #os.environ['OPENAI_API_KEY'] = "for the love of Jesus let this work"
 
@@ -100,13 +98,160 @@
         dotenv_file = dotenv.find_dotenv()
         dotenv.load_dotenv(dotenv_file)
 
-        #
-        ...
+        #ollama method below
+
+        torch.cuda.empty_cache() #release cuda so that ollama can use gpu!
+
+
+        os.environ['OLLAMA_FLASH_ATTENTION'] = os.environ['flashattn'] #int "1"
+        if os.environ['ollama'] == "minicpm-v":
+            os.environ['ollama'] = "minicpm-v:8b-2.6-q8_0" #set to quantized version
+        elif os.environ['ollama'] == "gemma3":
+            os.environ['ollama'] = "gemma3:12b" #set to upscaled version 12b when needed
+            # Add specific environment variables for Gemma3 to prevent raw token issues
+            os.environ['OLLAMA_KEEP_ALIVE'] = "5m"
+            os.environ['OLLAMA_ORIGINS'] = "*"
+
+
+        # Close model thread (colpali)
+        print(f"Querying OpenAI for query={query}, imagesPaths={imagesPaths}")
+
+        try:
+
+            # Enhanced prompt for more detailed responses with explicit page usage
+            enhanced_query = f"""
+            Please provide a comprehensive and detailed answer to the following query.
+            Use ALL available information from the provided document images to give a thorough response.
+
+            Query: {query}
+
+            CRITICAL INSTRUCTIONS:
+            - You have been provided with {len(imagesPaths)} document page(s)
+            - You MUST reference information from ALL {len(imagesPaths)} page(s) in your response
+            - Do not skip any pages - each page contains relevant information
+            - If you mention one page, you must also mention the others
+            - Ensure your response reflects the complete information from all pages
+
+            Instructions for detailed response:
+            1. Provide extensive background information and context
+            2. Include specific details, examples, and data points from ALL documents
+            3. Explain concepts thoroughly with step-by-step breakdowns
+            4. Provide comprehensive analysis rather than simple answers when requested
+            5. Explicitly reference each page and what information it contributes
+            6. Cross-reference information between pages when relevant
+            7. Ensure no page is left unmentioned in your analysis
+
+            SPECIAL INSTRUCTIONS FOR TABULAR DATA:
+            - If the query requests a table, list, or structured data, organize your response in a clear, structured format
+            - Use numbered lists, bullet points, or clear categories when appropriate
+            - Include specific data points or comparisons when available
+            - Structure information in a way that can be easily converted to a table format
+
+            IMPORTANT: Respond with natural, human-readable text only. Do not include any special tokens, codes, or technical identifiers in your response.
+
+            Make sure to acknowledge and use information from all {len(imagesPaths)} provided pages.
+            """
+
+            # Try with current model first
+            current_model = os.environ['ollama']
+
+            # Set different options based on the model
+            if "gemma3" in current_model.lower():
+                # Specific options for Gemma3 to prevent raw token issues
+                model_options = {
+                    "num_predict": 1024,  # Shorter responses for Gemma3
+                    "stop": ["<eos>", "<|endoftext|>", "</s>", "<|im_end|>"],  # More stop tokens
+                    "top_k": 20,  # Lower top_k for more focused generation
+                    "top_p": 0.8,  # Lower top_p for more deterministic output
+                    "repeat_penalty": 1.2,  # Higher repeat penalty
+                    "seed": 42,  # Consistent results
+                    "temperature": 0.7,  # Lower temperature for more focused responses
+                }
+            else:
+                # Default options for other models
+                model_options = {
+                    "num_predict": 2048,  # Limit response length
+                    "stop": ["<eos>", "<|endoftext|>", "</s>"],  # Stop at end tokens
+                    "top_k": 40,  # Reduce randomness
+                    "top_p": 0.9,  # Nucleus sampling
+                    "repeat_penalty": 1.1,  # Prevent repetition
+                    "seed": 42,  # Consistent results
+                }
+
+            response = chat(
+                model=current_model,
+                messages=[
+                    {
+                        'role': 'user',
+                        'content': enhanced_query,
+                        'images': imagesPaths,
+                        "temperature": float(os.environ['temperature']),  #test if temp makes a diff
+                    }
+                ],
+                options=model_options
+            )
+
+            answer = response.message.content
+
+            # Clean the response to handle raw token issues
+            cleaned_answer = self._clean_raw_token_response(answer)
+
+            # If the cleaned answer is still problematic, try fallback models
+            if cleaned_answer and "**Model Response Error**" in cleaned_answer:
+                print(f"Primary model {current_model} failed, trying fallback models...")
+
+                # List of fallback models to try
+                fallback_models = [
+                    "llama3.2-vision:latest",
+                    "llava:latest",
+                    "bakllava:latest",
+                    "llama3.2:latest"
+                ]
+
+                for fallback_model in fallback_models:
+                    try:
+                        print(f"Trying fallback model: {fallback_model}")
+                        response = chat(
+                            model=fallback_model,
+                            messages=[
+                                {
+                                    'role': 'user',
+                                    'content': enhanced_query,
+                                    'images': imagesPaths,
+                                    "temperature": float(os.environ['temperature']),
+                                }
+                            ],
+                            options={
+                                "num_predict": 2048,
+                                "stop": ["<eos>", "<|endoftext|>", "</s>"],
+                                "top_k": 40,
+                                "top_p": 0.9,
+                                "repeat_penalty": 1.1,
+                                "seed": 42,
+                            }
+                        )
+
+                        fallback_answer = response.message.content
+                        cleaned_fallback = self._clean_raw_token_response(fallback_answer)
+
+                        if cleaned_fallback and "**Model Response Error**" not in cleaned_fallback:
+                            print(f"Fallback model {fallback_model} succeeded")
+                            return cleaned_fallback
+
+                    except Exception as fallback_error:
+                        print(f"Fallback model {fallback_model} failed: {fallback_error}")
+                        continue
+
+                # If all fallbacks fail, return the original error
+                return cleaned_answer
+
+            print(f"Original response: {answer}")
+            print(f"Cleaned response: {cleaned_answer}")
+
+            return cleaned_answer
+
         except Exception as e:
-            print(f"
+            print(f"An error occurred while querying OpenAI: {e}")
             return None
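The ollama path above relies on self._clean_raw_token_response, whose body lives elsewhere in rag.py and is not part of this hunk. A rough sketch of what such a cleaner might look like, assuming its job is to strip leaked special tokens and to return the "**Model Response Error**" marker that the fallback loop checks for (hypothetical sketch only; not the actual implementation in this commit):

import re

def clean_raw_token_response(text):
    # Hypothetical helper mirroring what _clean_raw_token_response appears to do:
    # strip special/control tokens that some models (e.g. Gemma3) leak into their
    # output, and flag unusable responses with the "**Model Response Error**"
    # marker so the fallback-model loop can retry with another model.
    if not text:
        return "**Model Response Error**: empty response from model"
    cleaned = re.sub(r"<\|?/?(eos|endoftext|im_end|im_start|s)\|?>", "", text)
    cleaned = cleaned.strip()
    if not cleaned:
        return "**Model Response Error**: response contained only special tokens"
    return cleaned

print(clean_raw_token_response("The design load is 250 kN.<|endoftext|>"))
# -> "The design load is 250 kN."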