Commit: logging

Files changed:
- .huggingface-spaces  +15 -0
- README.md  +2 -3
- app.py  +420 -213
- colpali_manager.py  +3 -3
- middleware.py  +25 -5
- milvus_manager.py  +115 -115
- pdf_manager.py  +19 -4
- rag.py  +174 -29
.huggingface-spaces
ADDED
@@ -0,0 +1,15 @@
 1 + # Hugging Face Spaces Configuration
 2 + # This file helps ensure proper deployment and configuration
 3 +
 4 + # Environment variables for Hugging Face Spaces
 5 + SPACE_ID=${SPACE_ID}
 6 + HF_SPACE_ID=${HF_SPACE_ID}
 7 +
 8 + # File path configuration
 9 + BASE_DIR=/tmp/pages
10 + FALLBACK_DIR=pages
11 +
12 + # Ensure proper permissions
13 + chmod 755 /tmp
14 + mkdir -p /tmp/pages
15 + chmod 755 /tmp/pages
README.md
CHANGED
@@ -4,7 +4,7 @@ emoji: π
  4   colorFrom: blue
  5   colorTo: purple
  6   sdk: gradio
  7 - sdk_version:
  8   app_file: app.py
  9   pinned: false
 10   ---
@@ -339,5 +339,4 @@ For support and questions:
339
340   ---
341
342 - **Made by Collar** - Enhanced with Team Management & Chat History
343 -

  4   colorFrom: blue
  5   colorTo: purple
  6   sdk: gradio
  7 + sdk_version: 5.44.1
  8   app_file: app.py
  9   pinned: false
 10   ---

339
340   ---
341
342 + **Made by Collar** - Enhanced with Team Management & Chat History
app.py
CHANGED
@@ -17,6 +17,7 @@ import requests
 17   import base64
 18   from PIL import Image
 19   import io
 20
 21   from middleware import Middleware
 22   from rag import Rag
@@ -28,7 +29,14 @@ from dotenv import load_dotenv, dotenv_values
 28   import dotenv
 29   import platform
 30   import time
 31 -
 32
 33   # Import libraries for DOC and Excel export
 34   try:
@@ -378,31 +386,29 @@ class PDFSearchApp:
378   self.db_manager = db_manager
379   self.session_manager = session_manager
380
381 - def upload_and_convert(self,
382 - """Upload and convert files
383
384   if files is None:
385   return "No file uploaded"
386
387   try:
388 - # Get user info from session if available
389 - user_info = None
390 - team = "default"
391 - if session_id:
392 - session = self.session_manager.get_session(session_id)
393 - if session:
394 - user_info = session['user_info']
395 - team = user_info['team']
396 -
397   total_pages = 0
398   uploaded_files = []
399
400 - # Create
401   if folder_name:
402   folder_name = folder_name.replace(" ", "_").replace("-", "_")
403 - collection_name = f"{
404   else:
405 - collection_name = f"{
406
407   for file in files[:]:
408   # Extract the last part of the path (file name)
@@ -412,13 +418,16 @@ class PDFSearchApp:
|
|
412 |
|
413 |
# Convert PPT to PDF if needed
|
414 |
if ext.lower() in [".ppt", ".pptx"]:
|
415 |
-
|
416 |
-
|
417 |
-
|
418 |
-
|
419 |
-
|
420 |
-
|
421 |
-
|
|
|
|
|
|
|
422 |
|
423 |
# Create unique document ID
|
424 |
doc_id = f"{collection_name}_{name.replace(' ', '_').replace('-', '_')}"
|
@@ -426,31 +435,93 @@ class PDFSearchApp:
|
|
426 |
print(f"Uploading file: {doc_id}")
|
427 |
middleware = Middleware(collection_name, create_collection=True)
|
428 |
|
429 |
-
|
|
|
430 |
total_pages += len(pages) if pages else 0
|
431 |
uploaded_files.append(doc_id)
|
432 |
-
|
433 |
-
self.indexed_docs[doc_id] = True
|
434 |
|
435 |
-
#
|
436 |
-
|
437 |
-
|
438 |
-
|
439 |
-
|
440 |
-
|
441 |
-
len(uploaded_files)
|
442 |
-
)
|
443 |
|
444 |
-
return
|
445 |
|
446 |
except Exception as e:
|
447 |
return f"Error processing files: {str(e)}"
|
448 |
|
|
|
449 |
|
450 |
-
def display_file_list(text):
|
451 |
try:
|
452 |
# Retrieve all entries in the specified directory
|
453 |
-
|
|
|
|
|
454 |
current_working_directory = os.getcwd()
|
455 |
directory_path = os.path.join(current_working_directory, directory_path)
|
456 |
entries = os.listdir(directory_path)
|
@@ -465,39 +536,134 @@ class PDFSearchApp:
|
|
465 |
return str(e)
|
466 |
|
467 |
|
468 |
-
def search_documents(self,
|
469 |
print(f"Searching for query: {query}")
|
470 |
|
471 |
if not query:
|
472 |
print("Please enter a search query")
|
473 |
-
return "Please enter a search query", "--", "Please enter a search query", [], None
|
474 |
|
475 |
try:
|
476 |
-
#
|
477 |
-
|
478 |
-
|
479 |
-
|
480 |
-
|
481 |
-
|
482 |
-
|
483 |
-
|
|
|
484 |
|
485 |
# Enhanced multi-page retrieval with vision-guided chunking approach
|
486 |
# Get more results than requested to allow for intelligent filtering
|
487 |
# Request 3x the number of results for better selection
|
488 |
search_results = middleware.search([query], topk=max(num_results * 3, 20))[0]
|
489 |
|
490 |
-
#
|
491 |
-
print(f"π
|
|
|
492 |
if len(search_results) > 0:
|
493 |
-
print(f"
|
494 |
-
print(f"
|
|
|
495 |
|
496 |
if not search_results:
|
497 |
-
return "No search results found", "--", "No search results found for your query", [], None
|
498 |
|
499 |
# Implement intelligent multi-page selection based on research
|
500 |
-
selected_results = self.
|
|
|
501 |
|
502 |
# Process selected results
|
503 |
cited_pages = []
|
@@ -507,13 +673,22 @@ class PDFSearchApp:
|
|
507 |
|
508 |
print(f"π Processing {len(selected_results)} selected results...")
|
509 |
|
510 |
-
|
511 |
-
|
512 |
-
|
513 |
-
|
514 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
515 |
|
516 |
-
if
|
517 |
img_paths.append(img_path)
|
518 |
all_paths.append(path)
|
519 |
page_scores.append(score)
|
@@ -521,16 +696,78 @@ class PDFSearchApp:
|
|
521 |
print(f"β
Retrieved page {i+1}: {img_path} (Score: {score:.3f})")
|
522 |
else:
|
523 |
print(f"β Image file not found: {img_path}")
|
|
|
524 |
|
525 |
print(f"π Final count: {len(img_paths)} valid pages out of {len(selected_results)} selected")
|
526 |
|
|
|
527 |
if not img_paths:
|
528 |
-
return "No valid image files found", "--", "Error: No valid image files found for the search results", [], None
|
529 |
|
530 |
# Generate RAG response with multiple pages using enhanced approach
|
531 |
-
|
532 |
-
|
533 |
-
|
|
|
|
|
534 |
|
535 |
# Prepare downloads
|
536 |
csv_download = self._prepare_csv_download(csv_filepath)
|
@@ -556,81 +793,42 @@ class PDFSearchApp:
|
|
556 |
|
557 |
except Exception as e:
|
558 |
error_msg = f"Error during search: {str(e)}"
|
|
|
|
|
559 |
return error_msg, "--", error_msg, [], None, None, None, None
|
560 |
-
|
561 |
-
def
|
562 |
"""
|
563 |
-
Intelligent page selection
|
564 |
-
Based on research from M3DocRAG and multi-modal retrieval models
|
565 |
"""
|
566 |
if len(search_results) <= num_results:
|
567 |
return search_results
|
568 |
|
569 |
-
# Detect if query needs multiple pages
|
570 |
-
multi_page_keywords = [
|
571 |
-
'compare', 'difference', 'similarities', 'both', 'multiple', 'various',
|
572 |
-
'different', 'types', 'kinds', 'categories', 'procedures', 'methods',
|
573 |
-
'approaches', 'techniques', 'safety', 'protocols', 'guidelines',
|
574 |
-
'overview', 'summary', 'comprehensive', 'complete', 'all', 'everything'
|
575 |
-
]
|
576 |
-
|
577 |
-
query_lower = query.lower()
|
578 |
-
needs_multiple_pages = any(keyword in query_lower for keyword in multi_page_keywords)
|
579 |
-
|
580 |
# Sort by relevance score
|
581 |
sorted_results = sorted(search_results, key=lambda x: x[0], reverse=True)
|
582 |
|
583 |
-
#
|
584 |
-
|
585 |
-
|
586 |
-
# Strategy 1: Include highest scoring result from each collection (diversity)
|
587 |
-
selected = []
|
588 |
-
seen_collections = set()
|
589 |
-
|
590 |
-
# First pass: get one page from each collection for diversity
|
591 |
-
for score, page_num, coll_num in sorted_results:
|
592 |
-
if coll_num not in seen_collections and len(selected) < min(num_results // 2, len(search_results)):
|
593 |
-
selected.append((score, page_num, coll_num))
|
594 |
-
seen_collections.add(coll_num)
|
595 |
-
|
596 |
-
# Strategy 2: Fill remaining slots with highest scoring results
|
597 |
-
for score, page_num, coll_num in sorted_results:
|
598 |
-
if (score, page_num, coll_num) not in selected and len(selected) < num_results:
|
599 |
-
selected.append((score, page_num, coll_num))
|
600 |
-
|
601 |
-
# Strategy 3: If we still don't have enough, add more from any collection
|
602 |
-
if len(selected) < num_results:
|
603 |
-
for score, page_num, coll_num in sorted_results:
|
604 |
-
if (score, page_num, coll_num) not in selected and len(selected) < num_results:
|
605 |
-
selected.append((score, page_num, coll_num))
|
606 |
-
|
607 |
-
# Strategy 4: If we have too many, trim to exact number requested
|
608 |
-
if len(selected) > num_results:
|
609 |
-
selected = selected[:num_results]
|
610 |
-
|
611 |
-
# Strategy 5: If we have too few, add more from the sorted results
|
612 |
-
if len(selected) < num_results and len(sorted_results) >= num_results:
|
613 |
-
for score, page_num, coll_num in sorted_results:
|
614 |
-
if (score, page_num, coll_num) not in selected and len(selected) < num_results:
|
615 |
-
selected.append((score, page_num, coll_num))
|
616 |
-
|
617 |
-
# Sort selected results by score for consistency
|
618 |
-
selected.sort(key=lambda x: x[0], reverse=True)
|
619 |
-
|
620 |
-
print(f"Requested {num_results} pages, selected {len(selected)} pages from {len(seen_collections)} collections")
|
621 |
|
622 |
-
|
623 |
-
if len(selected) != num_results:
|
624 |
-
print(f"β οΈ Warning: Requested {num_results} pages but selected {len(selected)} pages")
|
625 |
-
if len(selected) < num_results and len(sorted_results) >= num_results:
|
626 |
-
# Add more pages to reach the target
|
627 |
-
for score, page_num, coll_num in sorted_results:
|
628 |
-
if (score, page_num, coll_num) not in selected and len(selected) < num_results:
|
629 |
-
selected.append((score, page_num, coll_num))
|
630 |
-
print(f"Added more pages to reach target: {len(selected)} pages")
|
631 |
|
632 |
return selected
|
633 |
|
|
|
|
|
|
634 |
def _optimize_consecutive_pages(self, selected, all_results, target_count=None):
|
635 |
"""
|
636 |
Optimize selection to include consecutive pages when beneficial
|
@@ -1167,7 +1365,7 @@ The system detected you requested tabular data, but the current response doesn't
|
|
1167 |
cell_str = str(cell)
|
1168 |
if ',' in cell_str or '"' in cell_str or '\n' in cell_str:
|
1169 |
# Escape quotes and wrap in quotes
|
1170 |
-
cell_str =
|
1171 |
escaped_row.append(cell_str)
|
1172 |
csv_lines.append(','.join(escaped_row))
|
1173 |
|
@@ -2798,76 +2996,113 @@ The system detected you requested tabular data, but the current response doesn't
|
|
2798 |
# Fallback to simple response with enhanced prompt
|
2799 |
return rag.get_answer_from_gemini(detailed_prompt, img_paths), None, None, None
|
2800 |
|
2801 |
-
|
2802 |
-
|
2803 |
-
|
2804 |
-
if
|
2805 |
-
|
2806 |
-
|
2807 |
-
|
2808 |
-
|
2809 |
-
|
2810 |
-
def logout_user(self, session_id):
|
2811 |
-
"""Logout user and remove session"""
|
2812 |
-
if session_id:
|
2813 |
-
self.session_manager.remove_session(session_id)
|
2814 |
-
return "Logged out successfully", None, None
|
2815 |
|
|
|
2816 |
|
2817 |
-
def
|
2818 |
-
"""
|
2819 |
-
|
2820 |
-
|
|
|
|
|
|
|
|
|
2821 |
|
2822 |
-
|
2823 |
-
|
2824 |
-
return "Session expired. Please log in again."
|
2825 |
|
2826 |
-
|
2827 |
-
|
|
|
2828 |
|
2829 |
-
|
2830 |
-
|
|
|
|
|
|
|
|
|
2831 |
|
2832 |
-
return
|
|
|
2833 |
|
2834 |
def create_ui():
|
2835 |
app = PDFSearchApp()
|
2836 |
|
2837 |
with gr.Blocks(theme=gr.themes.Ocean(), css="footer{display:none !important}") as demo:
|
2838 |
-
# Session state management
|
2839 |
-
session_state = gr.State(value=None)
|
2840 |
-
user_info_state = gr.State(value=None)
|
2841 |
-
|
2842 |
gr.Markdown("# Collar Multimodal RAG Demo - Streamlined")
|
2843 |
-
gr.Markdown("
|
2844 |
-
|
2845 |
-
#
|
2846 |
-
with gr.Tab("
|
2847 |
-
with gr.Row():
|
2848 |
-
with gr.Column(scale=1):
|
2849 |
-
gr.Markdown("### Login")
|
2850 |
-
username_input = gr.Textbox(label="Username", placeholder="Enter username")
|
2851 |
-
password_input = gr.Textbox(label="Password", type="password", placeholder="Enter password")
|
2852 |
-
login_btn = gr.Button("Login", variant="primary")
|
2853 |
-
logout_btn = gr.Button("Logout")
|
2854 |
-
auth_status = gr.Textbox(label="Authentication Status", interactive=False)
|
2855 |
-
current_team = gr.Textbox(label="Current Team", interactive=False)
|
2856 |
-
|
2857 |
-
with gr.Column(scale=1):
|
2858 |
-
gr.Markdown("### Default Users")
|
2859 |
-
gr.Markdown("""
|
2860 |
-
**Team A:** admin_team_a / admin123_team_a
|
2861 |
-
**Team B:** admin_team_b / admin123_team_b
|
2862 |
-
""")
|
2863 |
-
|
2864 |
-
# Document Management Tab
|
2865 |
-
with gr.Tab("π Document Management"):
|
2866 |
with gr.Column():
|
2867 |
-
gr.Markdown("### Upload Documents
|
2868 |
folder_name_input = gr.Textbox(
|
2869 |
-
label="
|
2870 |
-
placeholder="
|
2871 |
)
|
2872 |
max_pages_input = gr.Slider(
|
2873 |
minimum=1,
|
@@ -2877,19 +3112,11 @@ def create_ui():
|
|
2877 |
label="Max pages to extract and index per document"
|
2878 |
)
|
2879 |
file_input = gr.Files(
|
2880 |
-
label="Upload PPTs/PDFs (Multiple files supported)",
|
2881 |
file_count="multiple"
|
2882 |
)
|
2883 |
-
upload_btn = gr.Button("Upload
|
2884 |
upload_status = gr.Textbox(label="Upload Status", interactive=False)
|
2885 |
-
|
2886 |
-
gr.Markdown("### Team Collections")
|
2887 |
-
refresh_collections_btn = gr.Button("Refresh Collections")
|
2888 |
-
team_collections_display = gr.Textbox(
|
2889 |
-
label="Available Collections",
|
2890 |
-
interactive=False,
|
2891 |
-
lines=5
|
2892 |
-
)
|
2893 |
|
2894 |
# Enhanced Query Tab
|
2895 |
with gr.Tab("π Advanced Query"):
|
@@ -2958,36 +3185,16 @@ def create_ui():
|
|
2958 |
|
2959 |
|
2960 |
# Event handlers
|
2961 |
-
# Authentication events
|
2962 |
-
login_btn.click(
|
2963 |
-
fn=app.authenticate_user,
|
2964 |
-
inputs=[username_input, password_input],
|
2965 |
-
outputs=[auth_status, session_state, current_team]
|
2966 |
-
)
|
2967 |
-
|
2968 |
-
logout_btn.click(
|
2969 |
-
fn=app.logout_user,
|
2970 |
-
inputs=[session_state],
|
2971 |
-
outputs=[auth_status, session_state, current_team]
|
2972 |
-
)
|
2973 |
-
|
2974 |
-
# Document management events
|
2975 |
upload_btn.click(
|
2976 |
fn=app.upload_and_convert,
|
2977 |
-
inputs=[
|
2978 |
outputs=[upload_status]
|
2979 |
)
|
2980 |
|
2981 |
-
refresh_collections_btn.click(
|
2982 |
-
fn=app.get_team_collections,
|
2983 |
-
inputs=[session_state],
|
2984 |
-
outputs=[team_collections_display]
|
2985 |
-
)
|
2986 |
-
|
2987 |
# Query events
|
2988 |
search_btn.click(
|
2989 |
fn=app.search_documents,
|
2990 |
-
inputs=[
|
2991 |
outputs=[path, images, llm_answer, cited_pages_display, csv_download, doc_download, excel_download]
|
2992 |
)
|
2993 |
|
|
|
 17   import base64
 18   from PIL import Image
 19   import io
 20 + import traceback
 21
 22   from middleware import Middleware
 23   from rag import Rag

 29   import dotenv
 30   import platform
 31   import time
 32 + # Only enable PPT/PPTX conversion on Windows where COM is available
 33 + PPT_CONVERT_AVAILABLE = False
 34 + if platform.system() == 'Windows':
 35 + try:
 36 + from pptxtopdf import convert
 37 + PPT_CONVERT_AVAILABLE = True
 38 + except Exception:
 39 + PPT_CONVERT_AVAILABLE = False
 40
 41   # Import libraries for DOC and Excel export
 42   try:
386 |
self.db_manager = db_manager
|
387 |
self.session_manager = session_manager
|
388 |
|
389 |
+
def upload_and_convert(self, files, max_pages, folder_name=None):
|
390 |
+
"""Upload and convert files without authentication or team scoping"""
|
391 |
|
392 |
if files is None:
|
393 |
return "No file uploaded"
|
394 |
|
395 |
try:
|
|
|
|
396 |
total_pages = 0
|
397 |
uploaded_files = []
|
398 |
|
399 |
+
# Create simple collection name
|
400 |
if folder_name:
|
401 |
folder_name = folder_name.replace(" ", "_").replace("-", "_")
|
402 |
+
collection_name = f"{folder_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
|
403 |
else:
|
404 |
+
collection_name = f"documents_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
|
405 |
+
|
406 |
+
# Store the collection name in indexed_docs BEFORE processing files
|
407 |
+
self.indexed_docs[collection_name] = True
|
408 |
+
print(f"π Created collection: {collection_name}")
|
409 |
+
|
410 |
+
# Clear old collections to ensure only the latest upload is referenced
|
411 |
+
self._clear_old_collections(collection_name)
|
412 |
|
413 |
for file in files[:]:
|
414 |
# Extract the last part of the path (file name)
|
|
|
418 |
|
419 |
# Convert PPT to PDF if needed
|
420 |
if ext.lower() in [".ppt", ".pptx"]:
|
421 |
+
if PPT_CONVERT_AVAILABLE:
|
422 |
+
output_file = os.path.splitext(file.name)[0] + '.pdf'
|
423 |
+
output_directory = os.path.dirname(file.name)
|
424 |
+
outfile = os.path.join(output_directory, output_file)
|
425 |
+
convert(file.name, outfile)
|
426 |
+
pdf_path = outfile
|
427 |
+
name = os.path.basename(outfile)
|
428 |
+
name, ext = os.path.splitext(name)
|
429 |
+
else:
|
430 |
+
return "PPT/PPTX conversion is only supported on Windows. Please upload PDFs instead."
|
431 |
|
432 |
# Create unique document ID
|
433 |
doc_id = f"{collection_name}_{name.replace(' ', '_').replace('-', '_')}"
|
|
|
435 |
print(f"Uploading file: {doc_id}")
|
436 |
middleware = Middleware(collection_name, create_collection=True)
|
437 |
|
438 |
+
# Pass collection_name as id to ensure images are saved to the right directory
|
439 |
+
pages = middleware.index(pdf_path, id=collection_name, max_pages=max_pages)
|
440 |
total_pages += len(pages) if pages else 0
|
441 |
uploaded_files.append(doc_id)
|
|
|
|
|
442 |
|
443 |
+
# Get the current active collection after cleanup
|
444 |
+
current_collection = self.get_current_collection()
|
445 |
+
status_message = f"Uploaded {len(uploaded_files)} files with {total_pages} total pages to collection: {collection_name}"
|
446 |
+
|
447 |
+
if current_collection:
|
448 |
+
status_message += f"\nβ
This is now your active collection for searches."
|
|
|
|
|
449 |
|
450 |
+
return status_message
|
451 |
|
452 |
except Exception as e:
|
453 |
return f"Error processing files: {str(e)}"
|
454 |
|
455 |
+
def _clear_old_collections(self, current_collection_name):
|
456 |
+
"""Clear old collections to ensure only the latest upload is referenced"""
|
457 |
+
try:
|
458 |
+
# Get all collections except the current one
|
459 |
+
collections_to_remove = [name for name in self.indexed_docs.keys() if name != current_collection_name]
|
460 |
+
|
461 |
+
if collections_to_remove:
|
462 |
+
print(f"ποΈ Clearing {len(collections_to_remove)} old collections to maintain latest upload reference")
|
463 |
+
|
464 |
+
for old_collection in collections_to_remove:
|
465 |
+
# Remove from indexed_docs
|
466 |
+
del self.indexed_docs[old_collection]
|
467 |
+
|
468 |
+
# Try to drop the collection from Milvus
|
469 |
+
try:
|
470 |
+
middleware = Middleware(old_collection, create_collection=False)
|
471 |
+
if middleware.drop_collection():
|
472 |
+
print(f"ποΈ Successfully dropped Milvus collection '{old_collection}'")
|
473 |
+
else:
|
474 |
+
print(f"β οΈ Failed to drop Milvus collection '{old_collection}'")
|
475 |
+
except Exception as e:
|
476 |
+
print(f"β οΈ Warning: Could not clean up Milvus collection '{old_collection}': {e}")
|
477 |
+
|
478 |
+
print(f"β
Kept only the latest collection: {current_collection_name}")
|
479 |
+
else:
|
480 |
+
print(f"β
No old collections to clear. Current collection: {current_collection_name}")
|
481 |
+
|
482 |
+
except Exception as e:
|
483 |
+
print(f"β οΈ Warning: Error clearing old collections: {e}")
|
484 |
+
# Don't fail the upload if cleanup fails
|
485 |
+
|
486 |
+
def get_current_collection_status(self):
|
487 |
+
"""Get a user-friendly status message about the current collection"""
|
488 |
+
current_collection = self.get_current_collection()
|
489 |
+
if current_collection:
|
490 |
+
return f"β
Currently active collection: {current_collection}"
|
491 |
+
else:
|
492 |
+
return "β No documents uploaded yet. Please upload a document to get started."
|
493 |
+
|
494 |
+
def get_current_collection(self):
|
495 |
+
"""Get the name of the currently active collection (most recent upload)"""
|
496 |
+
if not self.indexed_docs:
|
497 |
+
return None
|
498 |
+
|
499 |
+
available_collections = list(self.indexed_docs.keys())
|
500 |
+
if not available_collections:
|
501 |
+
return None
|
502 |
+
|
503 |
+
# Sort by timestamp to get the most recent one
|
504 |
+
def extract_timestamp(collection_name):
|
505 |
+
try:
|
506 |
+
parts = collection_name.split('_')
|
507 |
+
if len(parts) >= 3:
|
508 |
+
date_part = parts[-2]
|
509 |
+
time_part = parts[-1]
|
510 |
+
timestamp = f"{date_part}_{time_part}"
|
511 |
+
return timestamp
|
512 |
+
return collection_name
|
513 |
+
except:
|
514 |
+
return collection_name
|
515 |
+
|
516 |
+
available_collections.sort(key=extract_timestamp, reverse=True)
|
517 |
+
return available_collections[0]
|
518 |
|
519 |
+
def display_file_list(self, text):
|
520 |
try:
|
521 |
# Retrieve all entries in the specified directory
|
522 |
+
# Use the same base directory logic as PdfManager
|
523 |
+
base_output_dir = self._ensure_base_directory()
|
524 |
+
directory_path = base_output_dir
|
525 |
current_working_directory = os.getcwd()
|
526 |
directory_path = os.path.join(current_working_directory, directory_path)
|
527 |
entries = os.listdir(directory_path)
|
|
|
536 |
return str(e)
|
537 |
|
538 |
|
539 |
+
def search_documents(self, query, num_results):
|
540 |
print(f"Searching for query: {query}")
|
541 |
|
542 |
if not query:
|
543 |
print("Please enter a search query")
|
544 |
+
return "Please enter a search query", "--", "Please enter a search query", [], None, None, None, None
|
545 |
|
546 |
try:
|
547 |
+
# First, check if there are any indexed documents
|
548 |
+
if not self.indexed_docs:
|
549 |
+
return "No documents have been uploaded yet. Please upload some documents first.", "--", "No documents available for search", [], None, None, None, None
|
550 |
+
|
551 |
+
# Clean up any invalid collections first
|
552 |
+
print("π§Ή Cleaning up invalid collections...")
|
553 |
+
removed_count = self._cleanup_invalid_collections()
|
554 |
+
if removed_count > 0:
|
555 |
+
print(f"ποΈ Removed {removed_count} invalid collections")
|
556 |
+
|
557 |
+
# Check again after cleanup
|
558 |
+
if not self.indexed_docs:
|
559 |
+
return "No valid collections found after cleanup. Please re-upload your documents.", "--", "No valid collections available", [], None, None, None, None
|
560 |
+
|
561 |
+
# Get the most recent collection name from indexed docs (latest upload)
|
562 |
+
available_collections = list(self.indexed_docs.keys())
|
563 |
+
print(f"π Available collections after cleanup: {available_collections}")
|
564 |
+
|
565 |
+
if not available_collections:
|
566 |
+
return "No collections available for search. Please upload some documents first.", "--", "No collections available", [], None, None, None, None
|
567 |
+
|
568 |
+
# Sort collections by timestamp to get the most recent one
|
569 |
+
# Collections are named like "documents_20250101_120000" or "folder_20250101_120000"
|
570 |
+
def extract_timestamp(collection_name):
|
571 |
+
try:
|
572 |
+
# Extract the timestamp part after the last underscore
|
573 |
+
parts = collection_name.split('_')
|
574 |
+
if len(parts) >= 3:
|
575 |
+
# Get the last two parts which should be date and time
|
576 |
+
date_part = parts[-2]
|
577 |
+
time_part = parts[-1]
|
578 |
+
timestamp = f"{date_part}_{time_part}"
|
579 |
+
return timestamp
|
580 |
+
return collection_name
|
581 |
+
except:
|
582 |
+
return collection_name
|
583 |
+
|
584 |
+
# Sort by timestamp in descending order (most recent first)
|
585 |
+
available_collections.sort(key=extract_timestamp, reverse=True)
|
586 |
+
collection_name = available_collections[0]
|
587 |
+
print(f"π Available collections sorted by timestamp: {available_collections}")
|
588 |
+
print(f"π Searching in most recent collection: {collection_name}")
|
589 |
+
|
590 |
+
# Add collection info to the search results for user clarity
|
591 |
+
collection_info = f"π Searching in collection: {collection_name}"
|
592 |
+
|
593 |
+
middleware = Middleware(collection_name, create_collection=False)
|
594 |
|
595 |
# Enhanced multi-page retrieval with vision-guided chunking approach
|
596 |
# Get more results than requested to allow for intelligent filtering
|
597 |
# Request 3x the number of results for better selection
|
598 |
search_results = middleware.search([query], topk=max(num_results * 3, 20))[0]
|
599 |
|
600 |
+
# π COMPREHENSIVE SEARCH RESULTS LOGGING
|
601 |
+
print(f"\nπ SEARCH RESULTS SUMMARY")
|
602 |
+
print(f"π Retrieved {len(search_results)} total results from search")
|
603 |
if len(search_results) > 0:
|
604 |
+
print(f"π Top result score: {search_results[0][0]:.4f}")
|
605 |
+
print(f"π Bottom result score: {search_results[-1][0]:.4f}")
|
606 |
+
print(f"π Score range: {search_results[-1][0]:.4f} - {search_results[0][0]:.4f}")
|
607 |
+
|
608 |
+
# Show top 5 results with page numbers
|
609 |
+
print(f"\nπ TOP 5 HIGHEST SCORING PAGES:")
|
610 |
+
for i, (score, doc_id) in enumerate(search_results[:5], 1):
|
611 |
+
page_num = doc_id + 1 # Convert to 1-based page numbering
|
612 |
+
print(f" {i}. Page {page_num} (doc_id: {doc_id}) - Score: {score:.4f}")
|
613 |
+
|
614 |
+
# Calculate and display score statistics
|
615 |
+
scores = [result[0] for result in search_results]
|
616 |
+
avg_score = sum(scores) / len(scores)
|
617 |
+
print(f"\nπ SCORE STATISTICS:")
|
618 |
+
print(f" Average Score: {avg_score:.4f}")
|
619 |
+
print(f" Score Variance: {sum((s - avg_score) ** 2 for s in scores) / len(scores):.4f}")
|
620 |
+
|
621 |
+
# Count pages by relevance level
|
622 |
+
excellent = sum(1 for s in scores if s >= 0.90)
|
623 |
+
very_good = sum(1 for s in scores if 0.80 <= s < 0.90)
|
624 |
+
good = sum(1 for s in scores if 0.70 <= s < 0.80)
|
625 |
+
moderate = sum(1 for s in scores if 0.60 <= s < 0.70)
|
626 |
+
basic = sum(1 for s in scores if 0.50 <= s < 0.60)
|
627 |
+
poor = sum(1 for s in scores if s < 0.50)
|
628 |
+
|
629 |
+
print(f"\nπ RELEVANCE DISTRIBUTION:")
|
630 |
+
print(f" π’ Excellent (β₯0.90): {excellent} pages")
|
631 |
+
print(f" π‘ Very Good (0.80-0.89): {very_good} pages")
|
632 |
+
print(f" π Good (0.70-0.79): {good} pages")
|
633 |
+
print(f" π΅ Moderate (0.60-0.69): {moderate} pages")
|
634 |
+
print(f" π£ Basic (0.50-0.59): {basic} pages")
|
635 |
+
print(f" π΄ Poor (<0.50): {poor} pages")
|
636 |
+
print("-" * 60)
|
637 |
|
638 |
if not search_results:
|
639 |
+
return "No search results found", "--", "No search results found for your query", [], None, None, None, None
|
640 |
|
641 |
# Implement intelligent multi-page selection based on research
|
642 |
+
selected_results = self._select_relevant_pages_new_format(search_results, query, num_results)
|
643 |
+
|
644 |
+
# π SELECTION LOGGING - Show which pages were selected
|
645 |
+
print(f"\nπ― PAGE SELECTION RESULTS")
|
646 |
+
print(f"π Requested: {num_results} pages")
|
647 |
+
print(f"π Selected: {len(selected_results)} pages")
|
648 |
+
print(f"π Selection rate: {len(selected_results)/len(search_results)*100:.1f}% of available results")
|
649 |
+
print("-" * 60)
|
650 |
+
|
651 |
+
print(f"π SELECTED PAGES WITH SCORES:")
|
652 |
+
for i, (score, doc_id) in enumerate(selected_results, 1):
|
653 |
+
page_num = doc_id + 1
|
654 |
+
relevance_level = self._get_relevance_level(score)
|
655 |
+
print(f" {i}. Page {page_num:2d} (doc_id: {doc_id:2d}) | Score: {score:8.4f} | {relevance_level}")
|
656 |
+
|
657 |
+
# Calculate selection statistics
|
658 |
+
if selected_results:
|
659 |
+
selected_scores = [result[0] for result in selected_results]
|
660 |
+
avg_selected_score = sum(selected_scores) / len(selected_scores)
|
661 |
+
print(f"\nπ SELECTION STATISTICS:")
|
662 |
+
print(f" Average selected score: {avg_selected_score:.4f}")
|
663 |
+
print(f" Highest selected score: {selected_scores[0]:.4f}")
|
664 |
+
print(f" Lowest selected score: {selected_scores[-1]:.4f}")
|
665 |
+
print(f" Score improvement over average: {avg_selected_score - avg_score:.4f}")
|
666 |
+
print("-" * 60)
|
667 |
|
668 |
# Process selected results
|
669 |
cited_pages = []
|
|
|
673 |
|
674 |
print(f"π Processing {len(selected_results)} selected results...")
|
675 |
|
676 |
+
# Ensure base directory exists and get the correct path
|
677 |
+
base_output_dir = self._ensure_base_directory()
|
678 |
+
print(f"π Using base directory: {base_output_dir}")
|
679 |
+
print(f"π Collection name: {collection_name}")
|
680 |
+
print(f"π Environment: {'Hugging Face Spaces' if self._is_huggingface_spaces() else 'Local Development'}")
|
681 |
+
|
682 |
+
for i, (score, doc_id) in enumerate(selected_results):
|
683 |
+
# Use the index as page number since doc_id is just an identifier
|
684 |
+
# This ensures we look for page_1.png, page_2.png, etc.
|
685 |
+
display_page_num = i + 1
|
686 |
+
coll_num = collection_name # Use the current collection name
|
687 |
+
|
688 |
+
# Use debug function to get paths and check existence
|
689 |
+
img_path, path, file_exists = self._debug_file_paths(base_output_dir, coll_num, display_page_num)
|
690 |
|
691 |
+
if file_exists:
|
692 |
img_paths.append(img_path)
|
693 |
all_paths.append(path)
|
694 |
page_scores.append(score)
|
|
|
696 |
print(f"β
Retrieved page {i+1}: {img_path} (Score: {score:.3f})")
|
697 |
else:
|
698 |
print(f"β Image file not found: {img_path}")
|
699 |
+
# Try alternative paths with better fallback logic
|
700 |
+
alt_paths = [
|
701 |
+
# Primary path (should work in Hugging Face Spaces)
|
702 |
+
img_path,
|
703 |
+
# Relative paths from app directory
|
704 |
+
os.path.join(os.path.dirname(os.path.abspath(__file__)), "pages", coll_num, f"page_{display_page_num}.png"),
|
705 |
+
# Current working directory paths
|
706 |
+
f"pages/{coll_num}/page_{display_page_num}.png",
|
707 |
+
f"./pages/{coll_num}/page_{display_page_num}.png",
|
708 |
+
os.path.join(os.getcwd(), "pages", coll_num, f"page_{display_page_num}.png"),
|
709 |
+
# Alternative base directories
|
710 |
+
os.path.join("/tmp", "pages", coll_num, f"page_{display_page_num}.png"),
|
711 |
+
os.path.join("/home/user", "pages", coll_num, f"page_{display_page_num}.png")
|
712 |
+
]
|
713 |
+
|
714 |
+
print(f"π Trying alternative paths for page {display_page_num}:")
|
715 |
+
for alt_path in alt_paths:
|
716 |
+
print(f" π Checking: {alt_path}")
|
717 |
+
if os.path.exists(alt_path):
|
718 |
+
print(f"β
Found alternative path: {alt_path}")
|
719 |
+
img_paths.append(alt_path)
|
720 |
+
all_paths.append(alt_path.replace(".png", ""))
|
721 |
+
page_scores.append(score)
|
722 |
+
cited_pages.append(f"Page {display_page_num} from {coll_num}")
|
723 |
+
break
|
724 |
+
else:
|
725 |
+
print(f"β No alternative path found for page {display_page_num}")
|
726 |
|
727 |
print(f"π Final count: {len(img_paths)} valid pages out of {len(selected_results)} selected")
|
728 |
|
729 |
+
# π FINAL RESULTS SUMMARY
|
730 |
+
if img_paths:
|
731 |
+
print(f"\nπ FINAL RETRIEVAL SUMMARY")
|
732 |
+
print(f"π Successfully retrieved: {len(img_paths)} pages")
|
733 |
+
print(f"π Final page scores:")
|
734 |
+
for i, (img_path, score) in enumerate(zip(img_paths, page_scores), 1):
|
735 |
+
# Extract page number from path
|
736 |
+
page_num = img_path.split('page_')[1].split('.png')[0] if 'page_' in img_path else f"Page {i}"
|
737 |
+
print(f" {i}. {page_num} - Score: {score:.4f}")
|
738 |
+
|
739 |
+
if page_scores:
|
740 |
+
final_avg_score = sum(page_scores) / len(page_scores)
|
741 |
+
print(f"\nπ FINAL STATISTICS:")
|
742 |
+
print(f" Average final score: {final_avg_score:.4f}")
|
743 |
+
print(f" Highest final score: {max(page_scores):.4f}")
|
744 |
+
print(f" Lowest final score: {min(page_scores):.4f}")
|
745 |
+
print("=" * 60)
|
746 |
+
|
747 |
if not img_paths:
|
748 |
+
return "No valid image files found", "--", "Error: No valid image files found for the search results", [], None, None, None, None
|
749 |
|
750 |
# Generate RAG response with multiple pages using enhanced approach
|
751 |
+
try:
|
752 |
+
print("π€ Generating RAG response...")
|
753 |
+
rag_response, csv_filepath, doc_filepath, excel_filepath = self._generate_multi_page_response(query, img_paths, cited_pages, page_scores)
|
754 |
+
print("β
RAG response generated successfully")
|
755 |
+
except Exception as e:
|
756 |
+
error_code = "RAG001"
|
757 |
+
error_msg = f"β **Error {error_code}**: Failed to generate RAG response"
|
758 |
+
print(f"{error_msg}: {str(e)}")
|
759 |
+
print(f"β Traceback: {traceback.format_exc()}")
|
760 |
+
|
761 |
+
# Return error response with proper format
|
762 |
+
return (
|
763 |
+
error_msg, # path
|
764 |
+
"--", # images
|
765 |
+
f"{error_msg}\n\n**Details**: {str(e)}\n\n**Error Code**: {error_code}", # llm_answer
|
766 |
+
cited_pages, # cited_pages_display
|
767 |
+
None, # csv_download
|
768 |
+
None, # doc_download
|
769 |
+
None # excel_download
|
770 |
+
)
|
771 |
|
772 |
# Prepare downloads
|
773 |
csv_download = self._prepare_csv_download(csv_filepath)
|
|
|
793 |
|
794 |
except Exception as e:
|
795 |
error_msg = f"Error during search: {str(e)}"
|
796 |
+
print(f"β Search error: {error_msg}")
|
797 |
+
# Return exactly 7 outputs to match Gradio expectations
|
798 |
return error_msg, "--", error_msg, [], None, None, None, None
|
799 |
+
|
800 |
+
def _select_relevant_pages_new_format(self, search_results, query, num_results):
|
801 |
"""
|
802 |
+
Intelligent page selection for new Milvus format: (score, doc_id)
|
|
|
803 |
"""
|
804 |
if len(search_results) <= num_results:
|
805 |
return search_results
|
806 |
|
|
|
807 |
# Sort by relevance score
|
808 |
sorted_results = sorted(search_results, key=lambda x: x[0], reverse=True)
|
809 |
|
810 |
+
# Simple strategy: take top N results
|
811 |
+
selected = sorted_results[:num_results]
|
|
|
|
812 |
|
813 |
+
print(f"Requested {num_results} pages, selected {len(selected)} pages")
|
|
|
|
814 |
|
815 |
return selected
|
816 |
|
817 |
+
def _get_relevance_level(self, score):
|
818 |
+
"""Get human-readable relevance level based on score"""
|
819 |
+
if score >= 0.90:
|
820 |
+
return "π’ EXCELLENT - Highly relevant"
|
821 |
+
elif score >= 0.80:
|
822 |
+
return "π‘ VERY GOOD - Very relevant"
|
823 |
+
elif score >= 0.70:
|
824 |
+
return "π GOOD - Relevant"
|
825 |
+
elif score >= 0.60:
|
826 |
+
return "π΅ MODERATE - Somewhat relevant"
|
827 |
+
elif score >= 0.50:
|
828 |
+
return "π£ BASIC - Minimally relevant"
|
829 |
+
else:
|
830 |
+
return "π΄ POOR - Not relevant"
|
831 |
+
|
832 |
def _optimize_consecutive_pages(self, selected, all_results, target_count=None):
|
833 |
"""
|
834 |
Optimize selection to include consecutive pages when beneficial
|
|
|
1365 |
cell_str = str(cell)
|
1366 |
if ',' in cell_str or '"' in cell_str or '\n' in cell_str:
|
1367 |
# Escape quotes and wrap in quotes
|
1368 |
+
cell_str = '"' + cell_str.replace('"', '""') + '"'
|
1369 |
escaped_row.append(cell_str)
|
1370 |
csv_lines.append(','.join(escaped_row))
|
1371 |
|
|
|
2996 |
# Fallback to simple response with enhanced prompt
|
2997 |
return rag.get_answer_from_gemini(detailed_prompt, img_paths), None, None, None
|
2998 |
|
2999 |
+
# Authentication and team collection methods removed for simplified app
|
3000 |
+
|
3001 |
+
def _is_huggingface_spaces(self):
|
3002 |
+
"""Check if running in Hugging Face Spaces environment"""
|
3003 |
+
return (
|
3004 |
+
os.path.exists("/tmp") and
|
3005 |
+
os.access("/tmp", os.W_OK) and
|
3006 |
+
(os.getenv('SPACE_ID') or os.getenv('HF_SPACE_ID'))
|
3007 |
+
)
|
|
|
3008 |
|
3009 |
+
def _get_optimal_base_dir(self):
|
3010 |
+
"""Get the optimal base directory based on environment"""
|
3011 |
+
if self._is_huggingface_spaces():
|
3012 |
+
base_dir = "/tmp/pages"
|
3013 |
+
print(f"π Detected Hugging Face Spaces environment, using: {base_dir}")
|
3014 |
+
else:
|
3015 |
+
# Use relative path from app directory
|
3016 |
+
app_dir = os.path.dirname(os.path.abspath(__file__))
|
3017 |
+
base_dir = os.path.join(app_dir, "pages")
|
3018 |
+
print(f"π» Using local development path: {base_dir}")
|
3019 |
+
|
3020 |
+
# Ensure directory exists
|
3021 |
+
os.makedirs(base_dir, exist_ok=True)
|
3022 |
+
return base_dir
|
3023 |
+
|
3024 |
+
def _ensure_base_directory(self):
|
3025 |
+
"""Ensure the base directory for storing pages exists"""
|
3026 |
+
base_output_dir = self._get_optimal_base_dir()
|
3027 |
+
|
3028 |
+
# Create the base directory if it doesn't exist
|
3029 |
+
if not os.path.exists(base_output_dir):
|
3030 |
+
try:
|
3031 |
+
os.makedirs(base_output_dir, exist_ok=True)
|
3032 |
+
print(f"β
Created base directory: {base_output_dir}")
|
3033 |
+
except Exception as e:
|
3034 |
+
print(f"β Failed to create base directory {base_output_dir}: {e}")
|
3035 |
+
# Fallback to current working directory
|
3036 |
+
base_output_dir = os.path.join(os.getcwd(), "pages")
|
3037 |
+
os.makedirs(base_output_dir, exist_ok=True)
|
3038 |
+
print(f"β
Using fallback directory: {base_output_dir}")
|
3039 |
+
|
3040 |
+
return base_output_dir
|
3041 |
|
3042 |
+
def _debug_file_paths(self, base_output_dir, coll_num, display_page_num):
|
3043 |
+
"""Helper function to debug file path issues"""
|
3044 |
+
img_path = os.path.join(base_output_dir, coll_num, f"page_{display_page_num}.png")
|
3045 |
+
path = os.path.join(base_output_dir, coll_num, f"page_{display_page_num}")
|
3046 |
+
|
3047 |
+
# Check if directory exists
|
3048 |
+
dir_path = os.path.dirname(img_path)
|
3049 |
+
dir_exists = os.path.exists(dir_path)
|
3050 |
|
3051 |
+
# Check if file exists
|
3052 |
+
file_exists = os.path.exists(img_path)
|
|
|
3053 |
|
3054 |
+
# Get absolute paths for debugging
|
3055 |
+
abs_img_path = os.path.abspath(img_path)
|
3056 |
+
abs_dir_path = os.path.abspath(dir_path)
|
3057 |
|
3058 |
+
print(f"π Path Debug for {coll_num}/page_{display_page_num}:")
|
3059 |
+
print(f" Base dir: {base_output_dir}")
|
3060 |
+
print(f" Directory: {dir_path} (exists: {dir_exists})")
|
3061 |
+
print(f" File: {img_path} (exists: {file_exists})")
|
3062 |
+
print(f" Abs dir: {abs_dir_path}")
|
3063 |
+
print(f" Abs file: {abs_img_path}")
|
3064 |
|
3065 |
+
return img_path, path, file_exists
|
3066 |
+
|
3067 |
+
def _cleanup_invalid_collections(self):
|
3068 |
+
"""Remove collections that no longer exist in Milvus from indexed_docs"""
|
3069 |
+
invalid_collections = []
|
3070 |
+
|
3071 |
+
for collection_name in list(self.indexed_docs.keys()):
|
3072 |
+
try:
|
3073 |
+
# Try to create a middleware instance to check if collection exists
|
3074 |
+
middleware = Middleware(collection_name, create_collection=False)
|
3075 |
+
print(f"οΏ½οΏ½ Collection {collection_name} is valid")
|
3076 |
+
except Exception as e:
|
3077 |
+
print(f"β οΈ Collection {collection_name} not accessible: {e}")
|
3078 |
+
invalid_collections.append(collection_name)
|
3079 |
+
|
3080 |
+
# Remove invalid collections
|
3081 |
+
for collection_name in invalid_collections:
|
3082 |
+
if collection_name in self.indexed_docs:
|
3083 |
+
del self.indexed_docs[collection_name]
|
3084 |
+
print(f"ποΈ Removed invalid collection: {collection_name}")
|
3085 |
+
|
3086 |
+
return len(invalid_collections)
|
3087 |
+
|
3088 |
+
def _check_collections_exist(self):
|
3089 |
+
# This method should be implemented to check if collections exist in Milvus
|
3090 |
+
pass
|
3091 |
|
3092 |
def create_ui():
|
3093 |
app = PDFSearchApp()
|
3094 |
|
3095 |
with gr.Blocks(theme=gr.themes.Ocean(), css="footer{display:none !important}") as demo:
|
|
|
3096 |
gr.Markdown("# Collar Multimodal RAG Demo - Streamlined")
|
3097 |
+
gr.Markdown("Basic document upload and search (no authentication)")
|
3098 |
+
|
3099 |
+
# Document Upload
|
3100 |
+
with gr.Tab("π Document Upload"):
|
|
|
|
3101 |
with gr.Column():
|
3102 |
+
gr.Markdown("### Upload Documents")
|
3103 |
folder_name_input = gr.Textbox(
|
3104 |
+
label="Collection Name (Optional)",
|
3105 |
+
placeholder="Optional name for this document collection"
|
3106 |
)
|
3107 |
max_pages_input = gr.Slider(
|
3108 |
minimum=1,
|
|
|
3112 |
label="Max pages to extract and index per document"
|
3113 |
)
|
3114 |
file_input = gr.Files(
|
3115 |
+
label="Upload PPTs/PDFs (Multiple files supported)",
|
3116 |
file_count="multiple"
|
3117 |
)
|
3118 |
+
upload_btn = gr.Button("Upload", variant="primary")
|
3119 |
upload_status = gr.Textbox(label="Upload Status", interactive=False)
|
|
|
|
|
3120 |
|
3121 |
# Enhanced Query Tab
|
3122 |
with gr.Tab("π Advanced Query"):
|
|
|
3185 |
|
3186 |
|
3187 |
# Event handlers
|
|
|
3188 |
upload_btn.click(
|
3189 |
fn=app.upload_and_convert,
|
3190 |
+
inputs=[file_input, max_pages_input, folder_name_input],
|
3191 |
outputs=[upload_status]
|
3192 |
)
|
3193 |
|
|
|
|
3194 |
# Query events
|
3195 |
search_btn.click(
|
3196 |
fn=app.search_documents,
|
3197 |
+
inputs=[query_input, num_results],
|
3198 |
outputs=[path, images, llm_answer, cited_pages_display, csv_download, doc_download, excel_download]
|
3199 |
)
|
3200 |
|
colpali_manager.py
CHANGED
@@ -25,7 +25,7 @@ import dotenv
 25   dotenv_file = dotenv.find_dotenv()
 26   dotenv.load_dotenv(dotenv_file)
 27
 28 - model_name =
 29   device = get_torch_device("cuda") #try using cpu instead of cuda?
 30
 31   #switch to locally downloading models & loading locally rather than from hf
@@ -97,7 +97,7 @@ class ColpaliManager:
 97   return [Image.open(path) for path in paths]
 98
 99   @spaces.GPU
100 - def process_images(self, image_paths:list[str], batch_size=
101   model.to("cuda")
102   print(f"Processing {len(image_paths)} image_paths")
103
@@ -161,7 +161,7 @@ class ColpaliManager:
161
162   dataloader = DataLoader(
163   dataset=ListDataset[str](texts),
164 - batch_size=
165   shuffle=False,
166   collate_fn=lambda x: processor.process_queries(x),
167   )

 25   dotenv_file = dotenv.find_dotenv()
 26   dotenv.load_dotenv(dotenv_file)
 27
 28 + model_name = 'vidore/colpali-v1.3' #"vidore/colSmol-256M"
 29   device = get_torch_device("cuda") #try using cpu instead of cuda?
 30
 31   #switch to locally downloading models & loading locally rather than from hf

 97   return [Image.open(path) for path in paths]
 98
 99   @spaces.GPU
100 + def process_images(self, image_paths:list[str], batch_size=5):
101   model.to("cuda")
102   print(f"Processing {len(image_paths)} image_paths")
103

161
162   dataloader = DataLoader(
163   dataset=ListDataset[str](texts),
164 + batch_size=5, #OG is 5, try reducing batch size to maximise gpu use
165   shuffle=False,
166   collate_fn=lambda x: processor.process_queries(x),
167   )
middleware.py
CHANGED
@@ -43,20 +43,40 @@ class Middleware:
 43   print("Indexing completed")
 44
 45   return image_paths
 46 -
 47
 48
 49   def search(self, search_queries: list[str], topk: int = 10):
 50 - print(f"
 51
 52   final_res = []
 53
 54 - for query in search_queries:
 55 - print(f"
 56   query_vec = colpali_manager.process_text([query])[0]
 57   search_res = self.milvus_manager.search(query_vec, topk=topk)
 58 -
 59   final_res.append(search_res)
 60
 61   return final_res
 62

 43   print("Indexing completed")
 44
 45   return image_paths
 46 +
 47 + def drop_collection(self):
 48 + """Drop the current collection from Milvus"""
 49 + return self.milvus_manager.drop_collection()
 50
 51
 52   def search(self, search_queries: list[str], topk: int = 10):
 53 + print(f"\nπ MIDDLEWARE SEARCH INITIATED")
 54 + print(f"π Queries to process: {len(search_queries)}")
 55 + print(f"π― Top-k requested: {topk}")
 56 + print("-" * 60)
 57
 58   final_res = []
 59
 60 + for i, query in enumerate(search_queries, 1):
 61 + print(f"\nπ Processing Query {i}/{len(search_queries)}: '{query}'")
 62 + print(f"π Converting query to vector representation...")
 63 +
 64   query_vec = colpali_manager.process_text([query])[0]
 65 + print(f"β Query vector generated (dimension: {len(query_vec)})")
 66 +
 67 + print(f"π Executing vector search in Milvus...")
 68   search_res = self.milvus_manager.search(query_vec, topk=topk)
 69 +
 70 + print(f"β Search completed: {len(search_res)} results retrieved")
 71 + if search_res:
 72 + print(f"π Score range: {search_res[0][0]:.4f} (highest) to {search_res[-1][0]:.4f} (lowest)")
 73 +
 74   final_res.append(search_res)
 75
 76 + print(f"\nπ MIDDLEWARE SEARCH COMPLETED")
 77 + print(f"π Total queries processed: {len(search_queries)}")
 78 + print(f"π Total results across all queries: {sum(len(res) for res in final_res)}")
 79 + print("=" * 60)
 80 +
 81   return final_res
 82
milvus_manager.py
CHANGED
@@ -1,49 +1,24 @@
  1   from pymilvus import MilvusClient, DataType
  2 - try:
  3 - from milvus import default_server # Milvus Lite
  4 - except Exception:
  5 - default_server = None
  6   import numpy as np
  7   import concurrent.futures
  8 -
  9 - import os
 10
 11   class MilvusManager:
 12   def __init__(self, milvus_uri, collection_name, create_collection, dim=128):
 13 -
 14 - #import environ variables from .env
 15 - import dotenv
 16 - # Load the .env file
 17 - dotenv_file = dotenv.find_dotenv()
 18 - dotenv.load_dotenv(dotenv_file)
 19 -
 20 - # Start embedded Milvus Lite server and connect locally
 21 - if default_server is not None:
 22 - try:
 23 - # Optionally set base dir here if desired, e.g. default_server.set_base_dir('volumes/milvus_lite')
 24 - default_server.start()
 25 - except Exception:
 26 - pass
 27 - local_uri = f"http://127.0.0.1:{default_server.listen_port}"
 28 - self.client = MilvusClient(uri=local_uri)
 29 - else:
 30 - # Fallback to standard local server (assumes docker-compose or system service)
 31 - self.client = MilvusClient(uri="http://127.0.0.1:19530")
 32   self.collection_name = collection_name
 33   self.dim = dim
 34
 35 - if
 36 - self.client.load_collection(collection_name=self.collection_name)
 37 - print("Loaded existing collection.")
 38 - elif create_collection:
 39   self.create_collection()
 40   self.create_index()
 41
 42   def create_collection(self):
 43   if self.client.has_collection(collection_name=self.collection_name):
 44 -
 45 - return
 46 -
 47   schema = self.client.create_schema(
 48   auto_id=True,
 49   enable_dynamic_fields=True,
@@ -61,16 +36,19 @@ class MilvusManager:
|
|
61 |
)
|
62 |
|
63 |
def create_index(self):
|
|
|
|
|
|
|
|
|
64 |
index_params = self.client.prepare_index_params()
|
65 |
-
|
66 |
index_params.add_index(
|
67 |
field_name="vector",
|
68 |
index_name="vector_index",
|
69 |
-
index_type="
|
70 |
-
metric_type=
|
71 |
params={
|
72 |
-
"M":
|
73 |
-
"efConstruction":
|
74 |
},
|
75 |
)
|
76 |
|
@@ -78,78 +56,33 @@ class MilvusManager:
|
|
78 |
collection_name=self.collection_name, index_params=index_params, sync=True
|
79 |
)
|
80 |
|
81 |
-
def
|
82 |
-
|
83 |
-
collections = self.client.list_collections()
|
84 |
-
|
85 |
-
# Set search parameters (here, using Inner Product metric).
|
86 |
-
search_params = {"metric_type": os.environ["metrictype"], "params": {}} #default metric type is "IP"
|
87 |
-
|
88 |
-
# Set to store unique (doc_id, collection_name) pairs across all collections.
|
89 |
-
doc_collection_pairs = set()
|
90 |
-
|
91 |
-
# Query each collection individually
|
92 |
-
for collection in collections:
|
93 |
-
self.client.load_collection(collection_name=collection)
|
94 |
-
print("collection loaded:"+ collection)
|
95 |
-
results = self.client.search(
|
96 |
-
collection,
|
97 |
-
data,
|
98 |
-
limit=int(os.environ["topk"]), # Adjust limit per collection as needed. (default is 50)
|
99 |
-
output_fields=["vector", "seq_id", "doc_id"],
|
100 |
-
search_params=search_params,
|
101 |
-
)
|
102 |
-
# Accumulate document IDs along with their originating collection.
|
103 |
-
for r_id in range(len(results)):
|
104 |
-
for r in range(len(results[r_id])):
|
105 |
-
doc_id = results[r_id][r]["entity"]["doc_id"]
|
106 |
-
doc_collection_pairs.add((doc_id, collection))
|
107 |
|
108 |
-
|
|
|
|
|
|
|
|
|
|
|
109 |
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
collection_name=collection_name,
|
114 |
-
filter=f"doc_id in [{doc_id}, {doc_id + 1}]",
|
115 |
-
output_fields=["seq_id", "vector", "doc"],
|
116 |
-
limit=16380,
|
117 |
-
)
|
118 |
-
# Stack the vectors for dot product computation.
|
119 |
-
doc_vecs = np.vstack(
|
120 |
-
[doc_colbert_vecs[i]["vector"] for i in range(len(doc_colbert_vecs))]
|
121 |
-
)
|
122 |
-
# Compute a similarity score via dot product.
|
123 |
-
score = np.dot(data, doc_vecs.T).max(1).sum()
|
124 |
-
return (score, doc_id, collection_name)
|
125 |
|
126 |
-
|
127 |
-
with concurrent.futures.ThreadPoolExecutor(max_workers=300) as executor:
|
128 |
-
futures = {
|
129 |
-
executor.submit(rerank_single_doc, doc_id, data, self.client, collection): (doc_id, collection)
|
130 |
-
for doc_id, collection in doc_collection_pairs
|
131 |
-
}
|
132 |
-
for future in concurrent.futures.as_completed(futures):
|
133 |
-
score, doc_id, collection = future.result()
|
134 |
-
scores.append((score, doc_id, collection))
|
135 |
-
#doc_id is page number!
|
136 |
-
|
137 |
-
# Sort the reranked results by score in descending order.
|
138 |
-
scores.sort(key=lambda x: x[0], reverse=True)
|
139 |
-
# Unload the collection after search to free memory.
|
140 |
-
self.client.release_collection(collection_name=collection)
|
141 |
-
|
142 |
-
return scores[:topk] if len(scores) >= topk else scores #topk is the number of scores to return back
|
143 |
-
"""
|
144 |
search_params = {"metric_type": "IP", "params": {}}
|
145 |
results = self.client.search(
|
146 |
self.collection_name,
|
147 |
data,
|
148 |
-
limit=50,
|
149 |
output_fields=["vector", "seq_id", "doc_id"],
|
150 |
search_params=search_params,
|
151 |
)
|
152 |
-
doc_ids =
|
|
|
|
|
|
|
153 |
|
154 |
scores = []
|
155 |
|
@@ -161,10 +94,10 @@ class MilvusManager:
|
|
161 |
limit=1000,
|
162 |
)
|
163 |
doc_vecs = np.vstack(
|
164 |
-
[
|
165 |
)
|
166 |
score = np.dot(data, doc_vecs.T).max(1).sum()
|
167 |
-
return score, doc_id
|
168 |
|
169 |
with concurrent.futures.ThreadPoolExecutor(max_workers=300) as executor:
|
170 |
futures = {
|
@@ -178,13 +111,59 @@ class MilvusManager:
|
|
178 |
scores.append((score, doc_id))
|
179 |
|
180 |
scores.sort(key=lambda x: x[0], reverse=True)
|
181 |
-
|
182 |
-
|
|
|
|
183 |
|
184 |
def insert(self, data):
|
185 |
-
colbert_vecs = data["colbert_vecs"]
|
186 |
seq_length = len(colbert_vecs)
|
187 |
-
doc_ids = [data["doc_id"]
|
188 |
seq_ids = list(range(seq_length))
|
189 |
docs = [""] * seq_length
|
190 |
docs[0] = data["filepath"]
|
@@ -202,17 +181,38 @@ class MilvusManager:
|
|
202 |
],
|
203 |
)
|
204 |
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
|
|
|
|
|
|
|
|
211 |
}
|
212 |
-
|
213 |
-
|
|
|
|
|
214 |
|
215 |
def insert_images_data(self, image_data):
|
216 |
data = self.get_images_as_doc(image_data)
|
217 |
-
|
218 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
from pymilvus import MilvusClient, DataType
|
|
|
|
|
|
|
|
|
2 |
import numpy as np
|
3 |
import concurrent.futures
|
4 |
+
|
|
|
5 |
|
6 |
class MilvusManager:
|
7 |
def __init__(self, milvus_uri, collection_name, create_collection, dim=128):
|
8 |
+
self.client = MilvusClient(uri=milvus_uri)
|
|
|
|
|
9 |
self.collection_name = collection_name
|
10 |
+
if self.client.has_collection(collection_name=self.collection_name):
|
11 |
+
self.client.load_collection(collection_name)
|
12 |
self.dim = dim
|
13 |
|
14 |
+
if create_collection:
|
|
|
|
|
|
|
15 |
self.create_collection()
|
16 |
self.create_index()
|
17 |
|
18 |
+
|
19 |
def create_collection(self):
|
20 |
if self.client.has_collection(collection_name=self.collection_name):
|
21 |
+
self.client.drop_collection(collection_name=self.collection_name)
|
|
|
|
|
22 |
schema = self.client.create_schema(
|
23 |
auto_id=True,
|
24 |
enable_dynamic_fields=True,
|
|
|
36 |
)
|
37 |
|
38 |
def create_index(self):
|
39 |
+
self.client.release_collection(collection_name=self.collection_name)
|
40 |
+
self.client.drop_index(
|
41 |
+
collection_name=self.collection_name, index_name="vector"
|
42 |
+
)
|
43 |
index_params = self.client.prepare_index_params()
|
|
|
44 |
index_params.add_index(
|
45 |
field_name="vector",
|
46 |
index_name="vector_index",
|
47 |
+
index_type="FLAT",
|
48 |
+
metric_type="IP",
|
49 |
params={
|
50 |
+
"M": 16,
|
51 |
+
"efConstruction": 500,
|
52 |
},
|
53 |
)
|
54 |
|
|
|
56 |
collection_name=self.collection_name, index_params=index_params, sync=True
|
57 |
)
|
58 |
|
59 |
+
def create_scalar_index(self):
|
60 |
+
self.client.release_collection(collection_name=self.collection_name)
|
|
|
|
|
61 |
|
62 |
+
index_params = self.client.prepare_index_params()
|
63 |
+
index_params.add_index(
|
64 |
+
field_name="doc_id",
|
65 |
+
index_name="int32_index",
|
66 |
+
index_type="INVERTED",
|
67 |
+
)
|
68 |
|
69 |
+
self.client.create_index(
|
70 |
+
collection_name=self.collection_name, index_params=index_params, sync=True
|
71 |
+
)
|
|
|
|
|
72 |
|
73 |
+
def search(self, data, topk):
|
|
|
|
|
|
74 |
search_params = {"metric_type": "IP", "params": {}}
|
75 |
results = self.client.search(
|
76 |
self.collection_name,
|
77 |
data,
|
78 |
+
limit=int(50),
|
79 |
output_fields=["vector", "seq_id", "doc_id"],
|
80 |
search_params=search_params,
|
81 |
)
|
82 |
+
doc_ids = set()
|
83 |
+
for r_id in range(len(results)):
|
84 |
+
for r in range(len(results[r_id])):
|
85 |
+
doc_ids.add(results[r_id][r]["entity"]["doc_id"])
|
86 |
|
87 |
scores = []
|
88 |
|
|
|
94 |
limit=1000,
|
95 |
)
|
96 |
doc_vecs = np.vstack(
|
97 |
+
[doc_colbert_vecs[i]["vector"] for i in range(len(doc_colbert_vecs))]
|
98 |
)
|
99 |
score = np.dot(data, doc_vecs.T).max(1).sum()
|
100 |
+
return (score, doc_id)
|
101 |
|
102 |
with concurrent.futures.ThreadPoolExecutor(max_workers=300) as executor:
|
103 |
futures = {
|
|
|
111 |
scores.append((score, doc_id))
|
112 |
|
113 |
scores.sort(key=lambda x: x[0], reverse=True)
|
114 |
+
|
115 |
+
# π DETAILED SCORE LOGGING - Print page numbers with highest scores
|
116 |
+
print("\n" + "="*80)
|
117 |
+
print("π RETRIEVAL SCORES - PAGE NUMBERS WITH HIGHEST SCORES")
|
118 |
+
print("="*80)
|
119 |
+
print(f"π Collection: {self.collection_name}")
|
120 |
+
print(f"π Total documents found: {len(scores)}")
|
121 |
+
print(f"π― Requested top-k: {topk}")
|
122 |
+
print("-"*80)
|
123 |
+
|
124 |
+
# Display top 10 scores with detailed information
|
125 |
+
display_count = min(10, len(scores))
|
126 |
+
for i, (score, doc_id) in enumerate(scores[:display_count]):
|
127 |
+
page_num = doc_id + 1 # Convert doc_id to page number (0-based to 1-based)
|
128 |
+
relevance_level = self._get_relevance_level(score)
|
129 |
+
print(f"π Page {page_num:2d} (doc_id: {doc_id:2d}) | Score: {score:8.4f} | {relevance_level}")
|
130 |
+
|
131 |
+
if len(scores) > display_count:
|
132 |
+
print(f"... and {len(scores) - display_count} more results")
|
133 |
+
|
134 |
+
print("-"*80)
|
135 |
+
print(f"π HIGHEST SCORING PAGES:")
|
136 |
+
top_3 = scores[:3]
|
137 |
+
for i, (score, doc_id) in enumerate(top_3, 1):
|
138 |
+
page_num = doc_id + 1
|
139 |
+
print(f" {i}. Page {page_num} - Score: {score:.4f}")
|
140 |
+
|
141 |
+
print("="*80 + "\n")
|
142 |
+
|
143 |
+
if len(scores) >= topk:
|
144 |
+
return scores[:topk]
|
145 |
+
else:
|
146 |
+
return scores
|
147 |
+
|
148 |
+
def _get_relevance_level(self, score):
|
149 |
+
"""Get human-readable relevance level based on score"""
|
150 |
+
if score >= 0.90:
|
151 |
+
return "π’ EXCELLENT - Highly relevant"
|
152 |
+
elif score >= 0.80:
|
153 |
+
return "π‘ VERY GOOD - Very relevant"
|
154 |
+
elif score >= 0.70:
|
155 |
+
return "π GOOD - Relevant"
|
156 |
+
elif score >= 0.60:
|
157 |
+
return "π΅ MODERATE - Somewhat relevant"
|
158 |
+
elif score >= 0.50:
|
159 |
+
return "π£ BASIC - Minimally relevant"
|
160 |
+
else:
|
161 |
+
return "π΄ POOR - Not relevant"
|
162 |
|
163 |
def insert(self, data):
|
164 |
+
colbert_vecs = [vec for vec in data["colbert_vecs"]]
|
165 |
seq_length = len(colbert_vecs)
|
166 |
+
doc_ids = [data["doc_id"] for i in range(seq_length)]
|
167 |
seq_ids = list(range(seq_length))
|
168 |
docs = [""] * seq_length
|
169 |
docs[0] = data["filepath"]
|
|
|
181 |
],
|
182 |
)
|
183 |
|
184 |
+
|
185 |
+
def get_images_as_doc(self, images_with_vectors:list):
|
186 |
+
|
187 |
+
images_data = []
|
188 |
+
|
189 |
+
for i in range(len(images_with_vectors)):
|
190 |
+
data = {
|
191 |
+
"colbert_vecs": images_with_vectors[i]["colbert_vecs"],
|
192 |
+
"doc_id": i,
|
193 |
+
"filepath": images_with_vectors[i]["filepath"],
|
194 |
}
|
195 |
+
images_data.append(data)
|
196 |
+
|
197 |
+
return images_data
|
198 |
+
|
199 |
|
200 |
def insert_images_data(self, image_data):
|
201 |
data = self.get_images_as_doc(image_data)
|
202 |
+
|
203 |
+
for i in range(len(data)):
|
204 |
+
self.insert(data[i])
|
205 |
+
|
206 |
+
def drop_collection(self):
|
207 |
+
"""Drop the current collection from Milvus"""
|
208 |
+
try:
|
209 |
+
if self.client.has_collection(collection_name=self.collection_name):
|
210 |
+
self.client.drop_collection(collection_name=self.collection_name)
|
211 |
+
print(f"ποΈ Dropped Milvus collection: {self.collection_name}")
|
212 |
+
return True
|
213 |
+
else:
|
214 |
+
print(f"β οΈ Collection {self.collection_name} does not exist in Milvus")
|
215 |
+
return False
|
216 |
+
except Exception as e:
|
217 |
+
print(f"β Error dropping collection {self.collection_name}: {e}")
|
218 |
+
return False
|
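Note on the scoring above: search first pulls candidate doc_ids with a coarse vector search, then reranks each candidate page with a ColBERT-style late-interaction score. The line np.dot(data, doc_vecs.T).max(1).sum() takes, for every query vector, its best inner product over all of a page's stored vectors and sums those maxima. A minimal standalone sketch of that scoring step (illustrative only; maxsim_score and the toy vectors below are not part of the repository):

import numpy as np

def maxsim_score(query_vecs: np.ndarray, doc_vecs: np.ndarray) -> float:
    # query_vecs: (num_query_tokens, dim), doc_vecs: (num_doc_patches, dim)
    # For each query token keep its best inner product over all document
    # vectors, then sum across query tokens - the same computation as
    # np.dot(data, doc_vecs.T).max(1).sum() in MilvusManager.search.
    return float(np.dot(query_vecs, doc_vecs.T).max(axis=1).sum())

# Toy example with dim=4: two query tokens against three document vectors.
query = np.array([[1.0, 0.0, 0.0, 0.0],
                  [0.0, 1.0, 0.0, 0.0]])
doc = np.array([[0.9, 0.1, 0.0, 0.0],
                [0.2, 0.8, 0.0, 0.0],
                [0.0, 0.0, 1.0, 0.0]])
print(maxsim_score(query, doc))  # 0.9 + 0.8 = 1.7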
pdf_manager.py
CHANGED
@@ -4,7 +4,21 @@ import shutil
 
 class PdfManager:
     def __init__(self):
-        ...
+        # Use relative paths for Hugging Face Spaces compatibility
+        # Get the directory where the main application file is located
+        app_dir = os.path.dirname(os.path.abspath(__file__))
+
+        # Use /tmp for Hugging Face Spaces, fallback to relative path
+        if os.path.exists("/tmp") and os.access("/tmp", os.W_OK):
+            self.base_output_dir = "/tmp/pages"
+            print(f"Using /tmp directory for Hugging Face Spaces: {self.base_output_dir}")
+        else:
+            # Fallback to relative path from app directory
+            self.base_output_dir = os.path.join(app_dir, "pages")
+            print(f"Using relative path: {self.base_output_dir}")
+
+        # Ensure the base directory exists
+        os.makedirs(self.base_output_dir, exist_ok=True)
 
     def clear_and_recreate_dir(self, output_folder):
 
@@ -19,7 +33,8 @@ class PdfManager:
         #print("Clearing is unused for now for persistency")
 
     def save_images(self, id, pdf_path, max_pages, pages: list[int] = None) -> list[str]:
-        ...
+        # Use absolute path for Hugging Face Spaces compatibility
+        output_folder = os.path.join(self.base_output_dir, id)
         images = convert_from_path(pdf_path)
 
         print(f"Saving images from {pdf_path} to {output_folder}. Max pages: {max_pages}")
 
@@ -35,7 +50,7 @@ class PdfManager:
             if pages and i not in pages:
                 continue
 
-            full_save_path = f"
+            full_save_path = os.path.join(output_folder, f"page_{i + 1}.png")
 
             #print(f"Saving image to {full_save_path}")
 
@@ -43,4 +58,4 @@
 
             num_page_processed += 1
 
-        return [f"
+        return [os.path.join(output_folder, f"page_{i + 1}.png") for i in range(num_page_processed)]
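The updated PdfManager resolves all page-image paths from one writable base directory instead of a hard-coded relative folder. A small sketch of the same selection logic in isolation (illustrative only; resolve_pages_dir and "demo_doc" are made-up names, not part of the repository):

import os

def resolve_pages_dir(app_dir: str) -> str:
    # Mirrors the __init__ logic above: prefer /tmp on Hugging Face Spaces
    # (the reliably writable location), otherwise fall back to a "pages"
    # folder next to the application code.
    if os.path.exists("/tmp") and os.access("/tmp", os.W_OK):
        return "/tmp/pages"
    return os.path.join(app_dir, "pages")

base = resolve_pages_dir(os.path.dirname(os.path.abspath(__file__)))
# A document with id "demo_doc" would then have its page images written to
# paths like <base>/demo_doc/page_1.png, matching what save_images returns.
print(os.path.join(base, "demo_doc", "page_1.png"))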
rag.py
CHANGED
@@ -5,7 +5,7 @@ import re
 from typing import List
 from utils import encode_image
 from PIL import Image
-from
+from ollama import chat
 import torch
 import subprocess
 import psutil
 
@@ -64,30 +64,28 @@ class Rag:
 
         return response_text
 
-    def get_answer_from_gemini(self, query
-        ...
+    def get_answer_from_gemini(self, query, imagePaths):
+
+
+        print(f"Querying Gemini for query={query}, imagePaths={imagePaths}")
+
         try:
-            ...
-            pass
-
-            chat_session = model.start_chat()
-            response = chat_session.send_message([*images, query])
-            return response.text
+            client = genai.Client(api_key="AIzaSyCwRr9054tCuh2S8yGpwKFvOAxYMT4WNIs")
+
+            images = [Image.open(path) for path in imagePaths]
+
+            response = client.models.generate_content(
+                model="gemini-2.5-flash",
+                contents=[images, query],
+            )
+
+            print(response.text)
+            answer = response.text
+
+            return answer
+
         except Exception as e:
-            print(f"
+            print(f"An error occurred while querying Gemini: {e}")
             return f"Error: {str(e)}"
 
     #os.environ['OPENAI_API_KEY'] = "for the love of Jesus let this work"
 
@@ -100,13 +98,160 @@
         dotenv_file = dotenv.find_dotenv()
         dotenv.load_dotenv(dotenv_file)
 
-        #
-        ...
+        #ollama method below
+
+        torch.cuda.empty_cache() #release cuda so that ollama can use gpu!
+
+
+        os.environ['OLLAMA_FLASH_ATTENTION'] = os.environ['flashattn'] #int "1"
+        if os.environ['ollama'] == "minicpm-v":
+            os.environ['ollama'] = "minicpm-v:8b-2.6-q8_0" #set to quantized version
+        elif os.environ['ollama'] == "gemma3":
+            os.environ['ollama'] = "gemma3:12b" #set to upscaled version 12b when needed
+            # Add specific environment variables for Gemma3 to prevent raw token issues
+            os.environ['OLLAMA_KEEP_ALIVE'] = "5m"
+            os.environ['OLLAMA_ORIGINS'] = "*"
+
+
+        # Close model thread (colpali)
+        print(f"Querying OpenAI for query={query}, imagesPaths={imagesPaths}")
+
+        try:
+
+            # Enhanced prompt for more detailed responses with explicit page usage
+            enhanced_query = f"""
+            Please provide a comprehensive and detailed answer to the following query.
+            Use ALL available information from the provided document images to give a thorough response.
+
+            Query: {query}
+
+            CRITICAL INSTRUCTIONS:
+            - You have been provided with {len(imagesPaths)} document page(s)
+            - You MUST reference information from ALL {len(imagesPaths)} page(s) in your response
+            - Do not skip any pages - each page contains relevant information
+            - If you mention one page, you must also mention the others
+            - Ensure your response reflects the complete information from all pages
+
+            Instructions for detailed response:
+            1. Provide extensive background information and context
+            2. Include specific details, examples, and data points from ALL documents
+            3. Explain concepts thoroughly with step-by-step breakdowns
+            4. Provide comprehensive analysis rather than simple answers when requested
+            5. Explicitly reference each page and what information it contributes
+            6. Cross-reference information between pages when relevant
+            7. Ensure no page is left unmentioned in your analysis
+
+            SPECIAL INSTRUCTIONS FOR TABULAR DATA:
+            - If the query requests a table, list, or structured data, organize your response in a clear, structured format
+            - Use numbered lists, bullet points, or clear categories when appropriate
+            - Include specific data points or comparisons when available
+            - Structure information in a way that can be easily converted to a table format
+
+            IMPORTANT: Respond with natural, human-readable text only. Do not include any special tokens, codes, or technical identifiers in your response.
+
+            Make sure to acknowledge and use information from all {len(imagesPaths)} provided pages.
+            """
+
+            # Try with current model first
+            current_model = os.environ['ollama']
+
+            # Set different options based on the model
+            if "gemma3" in current_model.lower():
+                # Specific options for Gemma3 to prevent raw token issues
+                model_options = {
+                    "num_predict": 1024,  # Shorter responses for Gemma3
+                    "stop": ["<eos>", "<|endoftext|>", "</s>", "<|im_end|>"],  # More stop tokens
+                    "top_k": 20,  # Lower top_k for more focused generation
+                    "top_p": 0.8,  # Lower top_p for more deterministic output
+                    "repeat_penalty": 1.2,  # Higher repeat penalty
+                    "seed": 42,  # Consistent results
+                    "temperature": 0.7,  # Lower temperature for more focused responses
+                }
+            else:
+                # Default options for other models
+                model_options = {
+                    "num_predict": 2048,  # Limit response length
+                    "stop": ["<eos>", "<|endoftext|>", "</s>"],  # Stop at end tokens
+                    "top_k": 40,  # Reduce randomness
+                    "top_p": 0.9,  # Nucleus sampling
+                    "repeat_penalty": 1.1,  # Prevent repetition
+                    "seed": 42,  # Consistent results
+                }
+
+            response = chat(
+                model=current_model,
+                messages=[
+                    {
+                        'role': 'user',
+                        'content': enhanced_query,
+                        'images': imagesPaths,
+                        "temperature": float(os.environ['temperature']),  #test if temp makes a diff
+                    }
+                ],
+                options=model_options
+            )
+
+            answer = response.message.content
+
+            # Clean the response to handle raw token issues
+            cleaned_answer = self._clean_raw_token_response(answer)
+
+            # If the cleaned answer is still problematic, try fallback models
+            if cleaned_answer and "**Model Response Error**" in cleaned_answer:
+                print(f"Primary model {current_model} failed, trying fallback models...")
+
+                # List of fallback models to try
+                fallback_models = [
+                    "llama3.2-vision:latest",
+                    "llava:latest",
+                    "bakllava:latest",
+                    "llama3.2:latest"
+                ]
+
+                for fallback_model in fallback_models:
+                    try:
+                        print(f"Trying fallback model: {fallback_model}")
+                        response = chat(
+                            model=fallback_model,
+                            messages=[
+                                {
+                                    'role': 'user',
+                                    'content': enhanced_query,
+                                    'images': imagesPaths,
+                                    "temperature": float(os.environ['temperature']),
+                                }
+                            ],
+                            options={
+                                "num_predict": 2048,
+                                "stop": ["<eos>", "<|endoftext|>", "</s>"],
+                                "top_k": 40,
+                                "top_p": 0.9,
+                                "repeat_penalty": 1.1,
+                                "seed": 42,
+                            }
+                        )
+
+                        fallback_answer = response.message.content
+                        cleaned_fallback = self._clean_raw_token_response(fallback_answer)
+
+                        if cleaned_fallback and "**Model Response Error**" not in cleaned_fallback:
+                            print(f"Fallback model {fallback_model} succeeded")
+                            return cleaned_fallback
+
+                    except Exception as fallback_error:
+                        print(f"Fallback model {fallback_model} failed: {fallback_error}")
+                        continue
+
+                # If all fallbacks fail, return the original error
+                return cleaned_answer
+
+            print(f"Original response: {answer}")
+            print(f"Cleaned response: {cleaned_answer}")
+
+            return cleaned_answer
+
         except Exception as e:
-            print(f"
+            print(f"An error occurred while querying OpenAI: {e}")
             return None
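The ollama path above relies on self._clean_raw_token_response, whose body lives elsewhere in rag.py and is not part of this hunk. A rough sketch of what such a cleaner might look like, assuming its job is to strip leaked special tokens and to return the "**Model Response Error**" marker that the fallback loop checks for (hypothetical sketch only; not the actual implementation in this commit):

import re

def clean_raw_token_response(text):
    # Hypothetical helper mirroring what _clean_raw_token_response appears to do:
    # strip special/control tokens that some models (e.g. Gemma3) leak into their
    # output, and flag unusable responses with the "**Model Response Error**"
    # marker so the fallback-model loop can retry with another model.
    if not text:
        return "**Model Response Error**: empty response from model"
    cleaned = re.sub(r"<\|?/?(eos|endoftext|im_end|im_start|s)\|?>", "", text)
    cleaned = cleaned.strip()
    if not cleaned:
        return "**Model Response Error**: response contained only special tokens"
    return cleaned

print(clean_raw_token_response("The design load is 250 kN.<|endoftext|>"))
# -> "The design load is 250 kN."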