Kazel committed on
Commit d901124 · 2 Parent(s): 79696c1 2ee6344
Files changed (8)
  1. .huggingface-spaces +15 -0
  2. README.md +2 -3
  3. app.py +420 -213
  4. colpali_manager.py +3 -3
  5. middleware.py +25 -5
  6. milvus_manager.py +115 -115
  7. pdf_manager.py +19 -4
  8. rag.py +174 -29
.huggingface-spaces ADDED
@@ -0,0 +1,15 @@
+ # Hugging Face Spaces Configuration
+ # This file helps ensure proper deployment and configuration
+
+ # Environment variables for Hugging Face Spaces
+ SPACE_ID=${SPACE_ID}
+ HF_SPACE_ID=${HF_SPACE_ID}
+
+ # File path configuration
+ BASE_DIR=/tmp/pages
+ FALLBACK_DIR=pages
+
+ # Ensure proper permissions
+ chmod 755 /tmp
+ mkdir -p /tmp/pages
+ chmod 755 /tmp/pages
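Note: the app code added in this commit derives its storage location from exactly these settings: it treats the presence of SPACE_ID/HF_SPACE_ID plus a writable /tmp as "running on Spaces" and then prefers BASE_DIR over FALLBACK_DIR (see `_is_huggingface_spaces` and `_get_optimal_base_dir` in app.py below). A minimal sketch of that logic:

```python
import os

def running_in_hf_spaces() -> bool:
    # Spaces expose SPACE_ID / HF_SPACE_ID and provide a writable /tmp
    has_space_env = bool(os.getenv("SPACE_ID") or os.getenv("HF_SPACE_ID"))
    return has_space_env and os.path.isdir("/tmp") and os.access("/tmp", os.W_OK)

# BASE_DIR on Spaces, FALLBACK_DIR for local development
base_dir = "/tmp/pages" if running_in_hf_spaces() else "pages"
os.makedirs(base_dir, exist_ok=True)
```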
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: πŸ”
  colorFrom: blue
  colorTo: purple
  sdk: gradio
- sdk_version: "4.0.0"
+ sdk_version: 5.44.1
  app_file: app.py
  pinned: false
  ---
@@ -339,5 +339,4 @@ For support and questions:
 
  ---
 
- **Made by Collar** - Enhanced with Team Management & Chat History
-
+ **Made by Collar** - Enhanced with Team Management & Chat History
app.py CHANGED
@@ -17,6 +17,7 @@ import requests
17
  import base64
18
  from PIL import Image
19
  import io
 
20
 
21
  from middleware import Middleware
22
  from rag import Rag
@@ -28,7 +29,14 @@ from dotenv import load_dotenv, dotenv_values
28
  import dotenv
29
  import platform
30
  import time
31
- from pptxtopdf import convert
 
32
 
33
  # Import libraries for DOC and Excel export
34
  try:
@@ -378,31 +386,29 @@ class PDFSearchApp:
378
  self.db_manager = db_manager
379
  self.session_manager = session_manager
380
 
381
- def upload_and_convert(self, state, files, max_pages, session_id=None, folder_name=None):
382
- """Upload and convert files with team-based organization"""
383
 
384
  if files is None:
385
  return "No file uploaded"
386
 
387
  try:
388
- # Get user info from session if available
389
- user_info = None
390
- team = "default"
391
- if session_id:
392
- session = self.session_manager.get_session(session_id)
393
- if session:
394
- user_info = session['user_info']
395
- team = user_info['team']
396
-
397
  total_pages = 0
398
  uploaded_files = []
399
 
400
- # Create team-specific folder if folder_name is provided
401
  if folder_name:
402
  folder_name = folder_name.replace(" ", "_").replace("-", "_")
403
- collection_name = f"{team}_{folder_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
404
  else:
405
- collection_name = f"{team}_documents_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
 
406
 
407
  for file in files[:]:
408
  # Extract the last part of the path (file name)
@@ -412,13 +418,16 @@ class PDFSearchApp:
412
 
413
  # Convert PPT to PDF if needed
414
  if ext.lower() in [".ppt", ".pptx"]:
415
- output_file = os.path.splitext(file.name)[0] + '.pdf'
416
- output_directory = os.path.dirname(file.name)
417
- outfile = os.path.join(output_directory, output_file)
418
- convert(file.name, outfile)
419
- pdf_path = outfile
420
- name = os.path.basename(outfile)
421
- name, ext = os.path.splitext(name)
 
 
 
422
 
423
  # Create unique document ID
424
  doc_id = f"{collection_name}_{name.replace(' ', '_').replace('-', '_')}"
@@ -426,31 +435,93 @@ class PDFSearchApp:
426
  print(f"Uploading file: {doc_id}")
427
  middleware = Middleware(collection_name, create_collection=True)
428
 
429
- pages = middleware.index(pdf_path, id=doc_id, max_pages=max_pages)
 
430
  total_pages += len(pages) if pages else 0
431
  uploaded_files.append(doc_id)
432
-
433
- self.indexed_docs[doc_id] = True
434
 
435
- # Save collection info to database
436
- if user_info:
437
- self.db_manager.save_document_collection(
438
- collection_name,
439
- team,
440
- user_info['id'],
441
- len(uploaded_files)
442
- )
443
 
444
- return f"Uploaded {len(uploaded_files)} files with {total_pages} total pages to collection: {collection_name}"
445
 
446
  except Exception as e:
447
  return f"Error processing files: {str(e)}"
448
 
 
 
 
449
 
450
- def display_file_list(text):
451
  try:
452
  # Retrieve all entries in the specified directory
453
- directory_path = "pages"
 
 
454
  current_working_directory = os.getcwd()
455
  directory_path = os.path.join(current_working_directory, directory_path)
456
  entries = os.listdir(directory_path)
@@ -465,39 +536,134 @@ class PDFSearchApp:
465
  return str(e)
466
 
467
 
468
- def search_documents(self, state, query, num_results, session_id=None):
469
  print(f"Searching for query: {query}")
470
 
471
  if not query:
472
  print("Please enter a search query")
473
- return "Please enter a search query", "--", "Please enter a search query", [], None
474
 
475
  try:
476
- # Get user info from session if available
477
- user_info = None
478
- if session_id:
479
- session = self.session_manager.get_session(session_id)
480
- if session:
481
- user_info = session['user_info']
482
-
483
- middleware = Middleware("test", create_collection=False)
 
 
 
484
 
485
  # Enhanced multi-page retrieval with vision-guided chunking approach
486
  # Get more results than requested to allow for intelligent filtering
487
  # Request 3x the number of results for better selection
488
  search_results = middleware.search([query], topk=max(num_results * 3, 20))[0]
489
 
490
- # Debug: Log the number of results retrieved
491
- print(f"πŸ” Retrieved {len(search_results)} total results from search")
 
492
  if len(search_results) > 0:
493
- print(f"πŸ” Top result score: {search_results[0][0]:.3f}")
494
- print(f"πŸ” Bottom result score: {search_results[-1][0]:.3f}")
 
 
 
495
 
496
  if not search_results:
497
- return "No search results found", "--", "No search results found for your query", [], None
498
 
499
  # Implement intelligent multi-page selection based on research
500
- selected_results = self._select_relevant_pages(search_results, query, num_results)
 
 
 
501
 
502
  # Process selected results
503
  cited_pages = []
@@ -507,13 +673,22 @@ class PDFSearchApp:
507
 
508
  print(f"πŸ“„ Processing {len(selected_results)} selected results...")
509
 
510
- for i, (score, page_num, coll_num) in enumerate(selected_results):
511
- # Convert 0-based page number to 1-based for file naming
512
- display_page_num = page_num + 1
513
- img_path = f"pages/{coll_num}/page_{display_page_num}.png"
514
- path = f"pages/{coll_num}/page_{display_page_num}"
 
 
 
515
 
516
- if os.path.exists(img_path):
517
  img_paths.append(img_path)
518
  all_paths.append(path)
519
  page_scores.append(score)
@@ -521,16 +696,78 @@ class PDFSearchApp:
521
  print(f"βœ… Retrieved page {i+1}: {img_path} (Score: {score:.3f})")
522
  else:
523
  print(f"❌ Image file not found: {img_path}")
 
 
 
524
 
525
  print(f"πŸ“Š Final count: {len(img_paths)} valid pages out of {len(selected_results)} selected")
526
 
 
 
 
527
  if not img_paths:
528
- return "No valid image files found", "--", "Error: No valid image files found for the search results", [], None
529
 
530
  # Generate RAG response with multiple pages using enhanced approach
531
- rag_response, csv_filepath, doc_filepath, excel_filepath = self._generate_multi_page_response(query, img_paths, cited_pages, page_scores)
532
-
533
-
 
 
 
534
 
535
  # Prepare downloads
536
  csv_download = self._prepare_csv_download(csv_filepath)
@@ -556,81 +793,42 @@ class PDFSearchApp:
556
 
557
  except Exception as e:
558
  error_msg = f"Error during search: {str(e)}"
 
 
559
  return error_msg, "--", error_msg, [], None, None, None, None
560
-
561
- def _select_relevant_pages(self, search_results, query, num_results):
562
  """
563
- Intelligent page selection using vision-guided chunking principles
564
- Based on research from M3DocRAG and multi-modal retrieval models
565
  """
566
  if len(search_results) <= num_results:
567
  return search_results
568
 
569
- # Detect if query needs multiple pages
570
- multi_page_keywords = [
571
- 'compare', 'difference', 'similarities', 'both', 'multiple', 'various',
572
- 'different', 'types', 'kinds', 'categories', 'procedures', 'methods',
573
- 'approaches', 'techniques', 'safety', 'protocols', 'guidelines',
574
- 'overview', 'summary', 'comprehensive', 'complete', 'all', 'everything'
575
- ]
576
-
577
- query_lower = query.lower()
578
- needs_multiple_pages = any(keyword in query_lower for keyword in multi_page_keywords)
579
-
580
  # Sort by relevance score
581
  sorted_results = sorted(search_results, key=lambda x: x[0], reverse=True)
582
 
583
- # CRITICAL FIX: Ensure we return exactly the number of pages requested
584
- # This addresses the ColPali retrieval configuration issue mentioned in research
585
-
586
- # Strategy 1: Include highest scoring result from each collection (diversity)
587
- selected = []
588
- seen_collections = set()
589
-
590
- # First pass: get one page from each collection for diversity
591
- for score, page_num, coll_num in sorted_results:
592
- if coll_num not in seen_collections and len(selected) < min(num_results // 2, len(search_results)):
593
- selected.append((score, page_num, coll_num))
594
- seen_collections.add(coll_num)
595
-
596
- # Strategy 2: Fill remaining slots with highest scoring results
597
- for score, page_num, coll_num in sorted_results:
598
- if (score, page_num, coll_num) not in selected and len(selected) < num_results:
599
- selected.append((score, page_num, coll_num))
600
-
601
- # Strategy 3: If we still don't have enough, add more from any collection
602
- if len(selected) < num_results:
603
- for score, page_num, coll_num in sorted_results:
604
- if (score, page_num, coll_num) not in selected and len(selected) < num_results:
605
- selected.append((score, page_num, coll_num))
606
-
607
- # Strategy 4: If we have too many, trim to exact number requested
608
- if len(selected) > num_results:
609
- selected = selected[:num_results]
610
-
611
- # Strategy 5: If we have too few, add more from the sorted results
612
- if len(selected) < num_results and len(sorted_results) >= num_results:
613
- for score, page_num, coll_num in sorted_results:
614
- if (score, page_num, coll_num) not in selected and len(selected) < num_results:
615
- selected.append((score, page_num, coll_num))
616
-
617
- # Sort selected results by score for consistency
618
- selected.sort(key=lambda x: x[0], reverse=True)
619
-
620
- print(f"Requested {num_results} pages, selected {len(selected)} pages from {len(seen_collections)} collections")
621
 
622
- # Final verification: ensure we return exactly the requested number
623
- if len(selected) != num_results:
624
- print(f"⚠️ Warning: Requested {num_results} pages but selected {len(selected)} pages")
625
- if len(selected) < num_results and len(sorted_results) >= num_results:
626
- # Add more pages to reach the target
627
- for score, page_num, coll_num in sorted_results:
628
- if (score, page_num, coll_num) not in selected and len(selected) < num_results:
629
- selected.append((score, page_num, coll_num))
630
- print(f"Added more pages to reach target: {len(selected)} pages")
631
 
632
  return selected
633
 
 
 
 
634
  def _optimize_consecutive_pages(self, selected, all_results, target_count=None):
635
  """
636
  Optimize selection to include consecutive pages when beneficial
@@ -1167,7 +1365,7 @@ The system detected you requested tabular data, but the current response doesn't
1167
  cell_str = str(cell)
1168
  if ',' in cell_str or '"' in cell_str or '\n' in cell_str:
1169
  # Escape quotes and wrap in quotes
1170
- cell_str = f'"{cell_str.replace('"', '""')}"'
1171
  escaped_row.append(cell_str)
1172
  csv_lines.append(','.join(escaped_row))
1173
 
@@ -2798,76 +2996,113 @@ The system detected you requested tabular data, but the current response doesn't
2798
  # Fallback to simple response with enhanced prompt
2799
  return rag.get_answer_from_gemini(detailed_prompt, img_paths), None, None, None
2800
 
2801
- def authenticate_user(self, username, password):
2802
- """Authenticate user and create session"""
2803
- user_info = self.db_manager.authenticate_user(username, password)
2804
- if user_info:
2805
- session_id = self.session_manager.create_session(user_info)
2806
- return f"Welcome {user_info['username']} from {user_info['team']}!", session_id, user_info['team']
2807
- else:
2808
- return "Invalid username or password", None, None
2809
-
2810
- def logout_user(self, session_id):
2811
- """Logout user and remove session"""
2812
- if session_id:
2813
- self.session_manager.remove_session(session_id)
2814
- return "Logged out successfully", None, None
2815
 
 
 
 
2816
 
2817
- def get_team_collections(self, session_id):
2818
- """Get available collections for the user's team"""
2819
- if not session_id:
2820
- return "Please log in to view team collections"
 
 
 
 
2821
 
2822
- session = self.session_manager.get_session(session_id)
2823
- if not session:
2824
- return "Session expired. Please log in again."
2825
 
2826
- team = session['user_info']['team']
2827
- collections = self.db_manager.get_team_collections(team)
 
2828
 
2829
- if not collections:
2830
- return f"No collections found for {team}"
 
 
 
 
2831
 
2832
- return f"**{team} Collections:**\n" + "\n".join([f"- {coll}" for coll in collections])
 
 
 
2833
 
2834
  def create_ui():
2835
  app = PDFSearchApp()
2836
 
2837
  with gr.Blocks(theme=gr.themes.Ocean(), css="footer{display:none !important}") as demo:
2838
- # Session state management
2839
- session_state = gr.State(value=None)
2840
- user_info_state = gr.State(value=None)
2841
-
2842
  gr.Markdown("# Collar Multimodal RAG Demo - Streamlined")
2843
- gr.Markdown("Made by Collar - Document Upload and Query System")
2844
-
2845
- # Authentication Tab
2846
- with gr.Tab("πŸ” Authentication"):
2847
- with gr.Row():
2848
- with gr.Column(scale=1):
2849
- gr.Markdown("### Login")
2850
- username_input = gr.Textbox(label="Username", placeholder="Enter username")
2851
- password_input = gr.Textbox(label="Password", type="password", placeholder="Enter password")
2852
- login_btn = gr.Button("Login", variant="primary")
2853
- logout_btn = gr.Button("Logout")
2854
- auth_status = gr.Textbox(label="Authentication Status", interactive=False)
2855
- current_team = gr.Textbox(label="Current Team", interactive=False)
2856
-
2857
- with gr.Column(scale=1):
2858
- gr.Markdown("### Default Users")
2859
- gr.Markdown("""
2860
- **Team A:** admin_team_a / admin123_team_a
2861
- **Team B:** admin_team_b / admin123_team_b
2862
- """)
2863
-
2864
- # Document Management Tab
2865
- with gr.Tab("πŸ“ Document Management"):
2866
  with gr.Column():
2867
- gr.Markdown("### Upload Documents to Team Repository")
2868
  folder_name_input = gr.Textbox(
2869
- label="Folder/Collection Name (Optional)",
2870
- placeholder="Enter a name for this document collection"
2871
  )
2872
  max_pages_input = gr.Slider(
2873
  minimum=1,
@@ -2877,19 +3112,11 @@ def create_ui():
2877
  label="Max pages to extract and index per document"
2878
  )
2879
  file_input = gr.Files(
2880
- label="Upload PPTs/PDFs (Multiple files supported)",
2881
  file_count="multiple"
2882
  )
2883
- upload_btn = gr.Button("Upload to Repository", variant="primary")
2884
  upload_status = gr.Textbox(label="Upload Status", interactive=False)
2885
-
2886
- gr.Markdown("### Team Collections")
2887
- refresh_collections_btn = gr.Button("Refresh Collections")
2888
- team_collections_display = gr.Textbox(
2889
- label="Available Collections",
2890
- interactive=False,
2891
- lines=5
2892
- )
2893
 
2894
  # Enhanced Query Tab
2895
  with gr.Tab("πŸ” Advanced Query"):
@@ -2958,36 +3185,16 @@ def create_ui():
2958
 
2959
 
2960
  # Event handlers
2961
- # Authentication events
2962
- login_btn.click(
2963
- fn=app.authenticate_user,
2964
- inputs=[username_input, password_input],
2965
- outputs=[auth_status, session_state, current_team]
2966
- )
2967
-
2968
- logout_btn.click(
2969
- fn=app.logout_user,
2970
- inputs=[session_state],
2971
- outputs=[auth_status, session_state, current_team]
2972
- )
2973
-
2974
- # Document management events
2975
  upload_btn.click(
2976
  fn=app.upload_and_convert,
2977
- inputs=[session_state, file_input, max_pages_input, session_state, folder_name_input],
2978
  outputs=[upload_status]
2979
  )
2980
 
2981
- refresh_collections_btn.click(
2982
- fn=app.get_team_collections,
2983
- inputs=[session_state],
2984
- outputs=[team_collections_display]
2985
- )
2986
-
2987
  # Query events
2988
  search_btn.click(
2989
  fn=app.search_documents,
2990
- inputs=[session_state, query_input, num_results, session_state],
2991
  outputs=[path, images, llm_answer, cited_pages_display, csv_download, doc_download, excel_download]
2992
  )
2993
 
 
17
  import base64
18
  from PIL import Image
19
  import io
20
+ import traceback
21
 
22
  from middleware import Middleware
23
  from rag import Rag
 
29
  import dotenv
30
  import platform
31
  import time
32
+ # Only enable PPT/PPTX conversion on Windows where COM is available
33
+ PPT_CONVERT_AVAILABLE = False
34
+ if platform.system() == 'Windows':
35
+ try:
36
+ from pptxtopdf import convert
37
+ PPT_CONVERT_AVAILABLE = True
38
+ except Exception:
39
+ PPT_CONVERT_AVAILABLE = False
40
 
41
  # Import libraries for DOC and Excel export
42
  try:
 
386
  self.db_manager = db_manager
387
  self.session_manager = session_manager
388
 
389
+ def upload_and_convert(self, files, max_pages, folder_name=None):
390
+ """Upload and convert files without authentication or team scoping"""
391
 
392
  if files is None:
393
  return "No file uploaded"
394
 
395
  try:
 
 
 
396
  total_pages = 0
397
  uploaded_files = []
398
 
399
+ # Create simple collection name
400
  if folder_name:
401
  folder_name = folder_name.replace(" ", "_").replace("-", "_")
402
+ collection_name = f"{folder_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
403
  else:
404
+ collection_name = f"documents_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
405
+
406
+ # Store the collection name in indexed_docs BEFORE processing files
407
+ self.indexed_docs[collection_name] = True
408
+ print(f"πŸ“ Created collection: {collection_name}")
409
+
410
+ # Clear old collections to ensure only the latest upload is referenced
411
+ self._clear_old_collections(collection_name)
412
 
413
  for file in files[:]:
414
  # Extract the last part of the path (file name)
 
418
 
419
  # Convert PPT to PDF if needed
420
  if ext.lower() in [".ppt", ".pptx"]:
421
+ if PPT_CONVERT_AVAILABLE:
422
+ output_file = os.path.splitext(file.name)[0] + '.pdf'
423
+ output_directory = os.path.dirname(file.name)
424
+ outfile = os.path.join(output_directory, output_file)
425
+ convert(file.name, outfile)
426
+ pdf_path = outfile
427
+ name = os.path.basename(outfile)
428
+ name, ext = os.path.splitext(name)
429
+ else:
430
+ return "PPT/PPTX conversion is only supported on Windows. Please upload PDFs instead."
431
 
432
  # Create unique document ID
433
  doc_id = f"{collection_name}_{name.replace(' ', '_').replace('-', '_')}"
 
435
  print(f"Uploading file: {doc_id}")
436
  middleware = Middleware(collection_name, create_collection=True)
437
 
438
+ # Pass collection_name as id to ensure images are saved to the right directory
439
+ pages = middleware.index(pdf_path, id=collection_name, max_pages=max_pages)
440
  total_pages += len(pages) if pages else 0
441
  uploaded_files.append(doc_id)
 
 
442
 
443
+ # Get the current active collection after cleanup
444
+ current_collection = self.get_current_collection()
445
+ status_message = f"Uploaded {len(uploaded_files)} files with {total_pages} total pages to collection: {collection_name}"
446
+
447
+ if current_collection:
448
+ status_message += f"\nβœ… This is now your active collection for searches."
 
 
449
 
450
+ return status_message
451
 
452
  except Exception as e:
453
  return f"Error processing files: {str(e)}"
454
 
455
+ def _clear_old_collections(self, current_collection_name):
456
+ """Clear old collections to ensure only the latest upload is referenced"""
457
+ try:
458
+ # Get all collections except the current one
459
+ collections_to_remove = [name for name in self.indexed_docs.keys() if name != current_collection_name]
460
+
461
+ if collections_to_remove:
462
+ print(f"πŸ—‘οΈ Clearing {len(collections_to_remove)} old collections to maintain latest upload reference")
463
+
464
+ for old_collection in collections_to_remove:
465
+ # Remove from indexed_docs
466
+ del self.indexed_docs[old_collection]
467
+
468
+ # Try to drop the collection from Milvus
469
+ try:
470
+ middleware = Middleware(old_collection, create_collection=False)
471
+ if middleware.drop_collection():
472
+ print(f"πŸ—‘οΈ Successfully dropped Milvus collection '{old_collection}'")
473
+ else:
474
+ print(f"⚠️ Failed to drop Milvus collection '{old_collection}'")
475
+ except Exception as e:
476
+ print(f"⚠️ Warning: Could not clean up Milvus collection '{old_collection}': {e}")
477
+
478
+ print(f"βœ… Kept only the latest collection: {current_collection_name}")
479
+ else:
480
+ print(f"βœ… No old collections to clear. Current collection: {current_collection_name}")
481
+
482
+ except Exception as e:
483
+ print(f"⚠️ Warning: Error clearing old collections: {e}")
484
+ # Don't fail the upload if cleanup fails
485
+
486
+ def get_current_collection_status(self):
487
+ """Get a user-friendly status message about the current collection"""
488
+ current_collection = self.get_current_collection()
489
+ if current_collection:
490
+ return f"βœ… Currently active collection: {current_collection}"
491
+ else:
492
+ return "❌ No documents uploaded yet. Please upload a document to get started."
493
+
494
+ def get_current_collection(self):
495
+ """Get the name of the currently active collection (most recent upload)"""
496
+ if not self.indexed_docs:
497
+ return None
498
+
499
+ available_collections = list(self.indexed_docs.keys())
500
+ if not available_collections:
501
+ return None
502
+
503
+ # Sort by timestamp to get the most recent one
504
+ def extract_timestamp(collection_name):
505
+ try:
506
+ parts = collection_name.split('_')
507
+ if len(parts) >= 3:
508
+ date_part = parts[-2]
509
+ time_part = parts[-1]
510
+ timestamp = f"{date_part}_{time_part}"
511
+ return timestamp
512
+ return collection_name
513
+ except:
514
+ return collection_name
515
+
516
+ available_collections.sort(key=extract_timestamp, reverse=True)
517
+ return available_collections[0]
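Note: the timestamp suffix is what makes "most recent collection" well defined here: `%Y%m%d_%H%M%S` strings sort lexicographically in chronological order, so sorting by the last two underscore-separated parts picks the latest upload. A small illustration (collection names are made up):

```python
names = [
    "documents_20250101_120000",
    "reports_20250309_083015",
    "documents_20241231_235959",
]
names.sort(key=lambda n: "_".join(n.split("_")[-2:]), reverse=True)
print(names[0])  # reports_20250309_083015 -> the latest upload wins
```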
518
 
519
+ def display_file_list(self, text):
520
  try:
521
  # Retrieve all entries in the specified directory
522
+ # Use the same base directory logic as PdfManager
523
+ base_output_dir = self._ensure_base_directory()
524
+ directory_path = base_output_dir
525
  current_working_directory = os.getcwd()
526
  directory_path = os.path.join(current_working_directory, directory_path)
527
  entries = os.listdir(directory_path)
 
536
  return str(e)
537
 
538
 
539
+ def search_documents(self, query, num_results):
540
  print(f"Searching for query: {query}")
541
 
542
  if not query:
543
  print("Please enter a search query")
544
+ return "Please enter a search query", "--", "Please enter a search query", [], None, None, None, None
545
 
546
  try:
547
+ # First, check if there are any indexed documents
548
+ if not self.indexed_docs:
549
+ return "No documents have been uploaded yet. Please upload some documents first.", "--", "No documents available for search", [], None, None, None, None
550
+
551
+ # Clean up any invalid collections first
552
+ print("🧹 Cleaning up invalid collections...")
553
+ removed_count = self._cleanup_invalid_collections()
554
+ if removed_count > 0:
555
+ print(f"πŸ—‘οΈ Removed {removed_count} invalid collections")
556
+
557
+ # Check again after cleanup
558
+ if not self.indexed_docs:
559
+ return "No valid collections found after cleanup. Please re-upload your documents.", "--", "No valid collections available", [], None, None, None, None
560
+
561
+ # Get the most recent collection name from indexed docs (latest upload)
562
+ available_collections = list(self.indexed_docs.keys())
563
+ print(f"πŸ” Available collections after cleanup: {available_collections}")
564
+
565
+ if not available_collections:
566
+ return "No collections available for search. Please upload some documents first.", "--", "No collections available", [], None, None, None, None
567
+
568
+ # Sort collections by timestamp to get the most recent one
569
+ # Collections are named like "documents_20250101_120000" or "folder_20250101_120000"
570
+ def extract_timestamp(collection_name):
571
+ try:
572
+ # Extract the timestamp part after the last underscore
573
+ parts = collection_name.split('_')
574
+ if len(parts) >= 3:
575
+ # Get the last two parts which should be date and time
576
+ date_part = parts[-2]
577
+ time_part = parts[-1]
578
+ timestamp = f"{date_part}_{time_part}"
579
+ return timestamp
580
+ return collection_name
581
+ except:
582
+ return collection_name
583
+
584
+ # Sort by timestamp in descending order (most recent first)
585
+ available_collections.sort(key=extract_timestamp, reverse=True)
586
+ collection_name = available_collections[0]
587
+ print(f"πŸ” Available collections sorted by timestamp: {available_collections}")
588
+ print(f"πŸ” Searching in most recent collection: {collection_name}")
589
+
590
+ # Add collection info to the search results for user clarity
591
+ collection_info = f"πŸ” Searching in collection: {collection_name}"
592
+
593
+ middleware = Middleware(collection_name, create_collection=False)
594
 
595
  # Enhanced multi-page retrieval with vision-guided chunking approach
596
  # Get more results than requested to allow for intelligent filtering
597
  # Request 3x the number of results for better selection
598
  search_results = middleware.search([query], topk=max(num_results * 3, 20))[0]
599
 
600
+ # πŸ“Š COMPREHENSIVE SEARCH RESULTS LOGGING
601
+ print(f"\nπŸ” SEARCH RESULTS SUMMARY")
602
+ print(f"πŸ“„ Retrieved {len(search_results)} total results from search")
603
  if len(search_results) > 0:
604
+ print(f"πŸ† Top result score: {search_results[0][0]:.4f}")
605
+ print(f"πŸ“‰ Bottom result score: {search_results[-1][0]:.4f}")
606
+ print(f"πŸ“Š Score range: {search_results[-1][0]:.4f} - {search_results[0][0]:.4f}")
607
+
608
+ # Show top 5 results with page numbers
609
+ print(f"\nπŸ† TOP 5 HIGHEST SCORING PAGES:")
610
+ for i, (score, doc_id) in enumerate(search_results[:5], 1):
611
+ page_num = doc_id + 1 # Convert to 1-based page numbering
612
+ print(f" {i}. Page {page_num} (doc_id: {doc_id}) - Score: {score:.4f}")
613
+
614
+ # Calculate and display score statistics
615
+ scores = [result[0] for result in search_results]
616
+ avg_score = sum(scores) / len(scores)
617
+ print(f"\nπŸ“Š SCORE STATISTICS:")
618
+ print(f" Average Score: {avg_score:.4f}")
619
+ print(f" Score Variance: {sum((s - avg_score) ** 2 for s in scores) / len(scores):.4f}")
620
+
621
+ # Count pages by relevance level
622
+ excellent = sum(1 for s in scores if s >= 0.90)
623
+ very_good = sum(1 for s in scores if 0.80 <= s < 0.90)
624
+ good = sum(1 for s in scores if 0.70 <= s < 0.80)
625
+ moderate = sum(1 for s in scores if 0.60 <= s < 0.70)
626
+ basic = sum(1 for s in scores if 0.50 <= s < 0.60)
627
+ poor = sum(1 for s in scores if s < 0.50)
628
+
629
+ print(f"\nπŸ“ˆ RELEVANCE DISTRIBUTION:")
630
+ print(f" 🟒 Excellent (β‰₯0.90): {excellent} pages")
631
+ print(f" 🟑 Very Good (0.80-0.89): {very_good} pages")
632
+ print(f" 🟠 Good (0.70-0.79): {good} pages")
633
+ print(f" πŸ”΅ Moderate (0.60-0.69): {moderate} pages")
634
+ print(f" 🟣 Basic (0.50-0.59): {basic} pages")
635
+ print(f" πŸ”΄ Poor (<0.50): {poor} pages")
636
+ print("-" * 60)
637
 
638
  if not search_results:
639
+ return "No search results found", "--", "No search results found for your query", [], None, None, None, None
640
 
641
  # Implement intelligent multi-page selection based on research
642
+ selected_results = self._select_relevant_pages_new_format(search_results, query, num_results)
643
+
644
+ # πŸ“Š SELECTION LOGGING - Show which pages were selected
645
+ print(f"\n🎯 PAGE SELECTION RESULTS")
646
+ print(f"πŸ“„ Requested: {num_results} pages")
647
+ print(f"πŸ“„ Selected: {len(selected_results)} pages")
648
+ print(f"πŸ“„ Selection rate: {len(selected_results)/len(search_results)*100:.1f}% of available results")
649
+ print("-" * 60)
650
+
651
+ print(f"πŸ† SELECTED PAGES WITH SCORES:")
652
+ for i, (score, doc_id) in enumerate(selected_results, 1):
653
+ page_num = doc_id + 1
654
+ relevance_level = self._get_relevance_level(score)
655
+ print(f" {i}. Page {page_num:2d} (doc_id: {doc_id:2d}) | Score: {score:8.4f} | {relevance_level}")
656
+
657
+ # Calculate selection statistics
658
+ if selected_results:
659
+ selected_scores = [result[0] for result in selected_results]
660
+ avg_selected_score = sum(selected_scores) / len(selected_scores)
661
+ print(f"\nπŸ“Š SELECTION STATISTICS:")
662
+ print(f" Average selected score: {avg_selected_score:.4f}")
663
+ print(f" Highest selected score: {selected_scores[0]:.4f}")
664
+ print(f" Lowest selected score: {selected_scores[-1]:.4f}")
665
+ print(f" Score improvement over average: {avg_selected_score - avg_score:.4f}")
666
+ print("-" * 60)
667
 
668
  # Process selected results
669
  cited_pages = []
 
673
 
674
  print(f"πŸ“„ Processing {len(selected_results)} selected results...")
675
 
676
+ # Ensure base directory exists and get the correct path
677
+ base_output_dir = self._ensure_base_directory()
678
+ print(f"πŸ” Using base directory: {base_output_dir}")
679
+ print(f"πŸ” Collection name: {collection_name}")
680
+ print(f"πŸ” Environment: {'Hugging Face Spaces' if self._is_huggingface_spaces() else 'Local Development'}")
681
+
682
+ for i, (score, doc_id) in enumerate(selected_results):
683
+ # Use the index as page number since doc_id is just an identifier
684
+ # This ensures we look for page_1.png, page_2.png, etc.
685
+ display_page_num = i + 1
686
+ coll_num = collection_name # Use the current collection name
687
+
688
+ # Use debug function to get paths and check existence
689
+ img_path, path, file_exists = self._debug_file_paths(base_output_dir, coll_num, display_page_num)
690
 
691
+ if file_exists:
692
  img_paths.append(img_path)
693
  all_paths.append(path)
694
  page_scores.append(score)
 
696
  print(f"βœ… Retrieved page {i+1}: {img_path} (Score: {score:.3f})")
697
  else:
698
  print(f"❌ Image file not found: {img_path}")
699
+ # Try alternative paths with better fallback logic
700
+ alt_paths = [
701
+ # Primary path (should work in Hugging Face Spaces)
702
+ img_path,
703
+ # Relative paths from app directory
704
+ os.path.join(os.path.dirname(os.path.abspath(__file__)), "pages", coll_num, f"page_{display_page_num}.png"),
705
+ # Current working directory paths
706
+ f"pages/{coll_num}/page_{display_page_num}.png",
707
+ f"./pages/{coll_num}/page_{display_page_num}.png",
708
+ os.path.join(os.getcwd(), "pages", coll_num, f"page_{display_page_num}.png"),
709
+ # Alternative base directories
710
+ os.path.join("/tmp", "pages", coll_num, f"page_{display_page_num}.png"),
711
+ os.path.join("/home/user", "pages", coll_num, f"page_{display_page_num}.png")
712
+ ]
713
+
714
+ print(f"πŸ” Trying alternative paths for page {display_page_num}:")
715
+ for alt_path in alt_paths:
716
+ print(f" πŸ” Checking: {alt_path}")
717
+ if os.path.exists(alt_path):
718
+ print(f"βœ… Found alternative path: {alt_path}")
719
+ img_paths.append(alt_path)
720
+ all_paths.append(alt_path.replace(".png", ""))
721
+ page_scores.append(score)
722
+ cited_pages.append(f"Page {display_page_num} from {coll_num}")
723
+ break
724
+ else:
725
+ print(f"❌ No alternative path found for page {display_page_num}")
726
 
727
  print(f"πŸ“Š Final count: {len(img_paths)} valid pages out of {len(selected_results)} selected")
728
 
729
+ # πŸ“Š FINAL RESULTS SUMMARY
730
+ if img_paths:
731
+ print(f"\nπŸŽ‰ FINAL RETRIEVAL SUMMARY")
732
+ print(f"πŸ“„ Successfully retrieved: {len(img_paths)} pages")
733
+ print(f"πŸ“Š Final page scores:")
734
+ for i, (img_path, score) in enumerate(zip(img_paths, page_scores), 1):
735
+ # Extract page number from path
736
+ page_num = img_path.split('page_')[1].split('.png')[0] if 'page_' in img_path else f"Page {i}"
737
+ print(f" {i}. {page_num} - Score: {score:.4f}")
738
+
739
+ if page_scores:
740
+ final_avg_score = sum(page_scores) / len(page_scores)
741
+ print(f"\nπŸ“Š FINAL STATISTICS:")
742
+ print(f" Average final score: {final_avg_score:.4f}")
743
+ print(f" Highest final score: {max(page_scores):.4f}")
744
+ print(f" Lowest final score: {min(page_scores):.4f}")
745
+ print("=" * 60)
746
+
747
  if not img_paths:
748
+ return "No valid image files found", "--", "Error: No valid image files found for the search results", [], None, None, None, None
749
 
750
  # Generate RAG response with multiple pages using enhanced approach
751
+ try:
752
+ print("πŸ€– Generating RAG response...")
753
+ rag_response, csv_filepath, doc_filepath, excel_filepath = self._generate_multi_page_response(query, img_paths, cited_pages, page_scores)
754
+ print("βœ… RAG response generated successfully")
755
+ except Exception as e:
756
+ error_code = "RAG001"
757
+ error_msg = f"❌ **Error {error_code}**: Failed to generate RAG response"
758
+ print(f"{error_msg}: {str(e)}")
759
+ print(f"❌ Traceback: {traceback.format_exc()}")
760
+
761
+ # Return error response with proper format
762
+ return (
763
+ error_msg, # path
764
+ "--", # images
765
+ f"{error_msg}\n\n**Details**: {str(e)}\n\n**Error Code**: {error_code}", # llm_answer
766
+ cited_pages, # cited_pages_display
767
+ None, # csv_download
768
+ None, # doc_download
769
+ None # excel_download
770
+ )
771
 
772
  # Prepare downloads
773
  csv_download = self._prepare_csv_download(csv_filepath)
 
793
 
794
  except Exception as e:
795
  error_msg = f"Error during search: {str(e)}"
796
+ print(f"❌ Search error: {error_msg}")
797
+ # Return exactly 7 outputs to match Gradio expectations
798
  return error_msg, "--", error_msg, [], None, None, None, None
799
+
800
+ def _select_relevant_pages_new_format(self, search_results, query, num_results):
801
  """
802
+ Intelligent page selection for new Milvus format: (score, doc_id)
 
803
  """
804
  if len(search_results) <= num_results:
805
  return search_results
806
 
 
 
 
807
  # Sort by relevance score
808
  sorted_results = sorted(search_results, key=lambda x: x[0], reverse=True)
809
 
810
+ # Simple strategy: take top N results
811
+ selected = sorted_results[:num_results]
 
 
 
812
 
813
+ print(f"Requested {num_results} pages, selected {len(selected)} pages")
 
 
 
814
 
815
  return selected
816
 
817
+ def _get_relevance_level(self, score):
818
+ """Get human-readable relevance level based on score"""
819
+ if score >= 0.90:
820
+ return "🟒 EXCELLENT - Highly relevant"
821
+ elif score >= 0.80:
822
+ return "🟑 VERY GOOD - Very relevant"
823
+ elif score >= 0.70:
824
+ return "🟠 GOOD - Relevant"
825
+ elif score >= 0.60:
826
+ return "πŸ”΅ MODERATE - Somewhat relevant"
827
+ elif score >= 0.50:
828
+ return "🟣 BASIC - Minimally relevant"
829
+ else:
830
+ return "πŸ”΄ POOR - Not relevant"
831
+
832
  def _optimize_consecutive_pages(self, selected, all_results, target_count=None):
833
  """
834
  Optimize selection to include consecutive pages when beneficial
 
1365
  cell_str = str(cell)
1366
  if ',' in cell_str or '"' in cell_str or '\n' in cell_str:
1367
  # Escape quotes and wrap in quotes
1368
+ cell_str = '"' + cell_str.replace('"', '""') + '"'
1369
  escaped_row.append(cell_str)
1370
  csv_lines.append(','.join(escaped_row))
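Note: the rewritten line avoids nesting the same quote character inside an f-string expression, which is a syntax error on Python versions before 3.12. Since hand-rolled CSV quoting is easy to get subtly wrong, the standard library's `csv` module is a hedged alternative (not what this commit uses) that applies the same quote-doubling rules:

```python
import csv
import io

rows = [["id", "note"], [1, 'says "hi", twice\nacross lines']]
buf = io.StringIO()
csv.writer(buf, quoting=csv.QUOTE_MINIMAL).writerows(rows)
print(buf.getvalue())  # the second field comes out quoted, with "" escaping inner quotes
```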
1371
 
 
2996
  # Fallback to simple response with enhanced prompt
2997
  return rag.get_answer_from_gemini(detailed_prompt, img_paths), None, None, None
2998
 
2999
+ # Authentication and team collection methods removed for simplified app
3000
+
3001
+ def _is_huggingface_spaces(self):
3002
+ """Check if running in Hugging Face Spaces environment"""
3003
+ return (
3004
+ os.path.exists("/tmp") and
3005
+ os.access("/tmp", os.W_OK) and
3006
+ (os.getenv('SPACE_ID') or os.getenv('HF_SPACE_ID'))
3007
+ )
 
 
 
 
 
3008
 
3009
+ def _get_optimal_base_dir(self):
3010
+ """Get the optimal base directory based on environment"""
3011
+ if self._is_huggingface_spaces():
3012
+ base_dir = "/tmp/pages"
3013
+ print(f"πŸš€ Detected Hugging Face Spaces environment, using: {base_dir}")
3014
+ else:
3015
+ # Use relative path from app directory
3016
+ app_dir = os.path.dirname(os.path.abspath(__file__))
3017
+ base_dir = os.path.join(app_dir, "pages")
3018
+ print(f"πŸ’» Using local development path: {base_dir}")
3019
+
3020
+ # Ensure directory exists
3021
+ os.makedirs(base_dir, exist_ok=True)
3022
+ return base_dir
3023
+
3024
+ def _ensure_base_directory(self):
3025
+ """Ensure the base directory for storing pages exists"""
3026
+ base_output_dir = self._get_optimal_base_dir()
3027
+
3028
+ # Create the base directory if it doesn't exist
3029
+ if not os.path.exists(base_output_dir):
3030
+ try:
3031
+ os.makedirs(base_output_dir, exist_ok=True)
3032
+ print(f"βœ… Created base directory: {base_output_dir}")
3033
+ except Exception as e:
3034
+ print(f"❌ Failed to create base directory {base_output_dir}: {e}")
3035
+ # Fallback to current working directory
3036
+ base_output_dir = os.path.join(os.getcwd(), "pages")
3037
+ os.makedirs(base_output_dir, exist_ok=True)
3038
+ print(f"βœ… Using fallback directory: {base_output_dir}")
3039
+
3040
+ return base_output_dir
3041
 
3042
+ def _debug_file_paths(self, base_output_dir, coll_num, display_page_num):
3043
+ """Helper function to debug file path issues"""
3044
+ img_path = os.path.join(base_output_dir, coll_num, f"page_{display_page_num}.png")
3045
+ path = os.path.join(base_output_dir, coll_num, f"page_{display_page_num}")
3046
+
3047
+ # Check if directory exists
3048
+ dir_path = os.path.dirname(img_path)
3049
+ dir_exists = os.path.exists(dir_path)
3050
 
3051
+ # Check if file exists
3052
+ file_exists = os.path.exists(img_path)
 
3053
 
3054
+ # Get absolute paths for debugging
3055
+ abs_img_path = os.path.abspath(img_path)
3056
+ abs_dir_path = os.path.abspath(dir_path)
3057
 
3058
+ print(f"πŸ” Path Debug for {coll_num}/page_{display_page_num}:")
3059
+ print(f" Base dir: {base_output_dir}")
3060
+ print(f" Directory: {dir_path} (exists: {dir_exists})")
3061
+ print(f" File: {img_path} (exists: {file_exists})")
3062
+ print(f" Abs dir: {abs_dir_path}")
3063
+ print(f" Abs file: {abs_img_path}")
3064
 
3065
+ return img_path, path, file_exists
3066
+
3067
+ def _cleanup_invalid_collections(self):
3068
+ """Remove collections that no longer exist in Milvus from indexed_docs"""
3069
+ invalid_collections = []
3070
+
3071
+ for collection_name in list(self.indexed_docs.keys()):
3072
+ try:
3073
+ # Try to create a middleware instance to check if collection exists
3074
+ middleware = Middleware(collection_name, create_collection=False)
3075
+ print(f"οΏ½οΏ½ Collection {collection_name} is valid")
3076
+ except Exception as e:
3077
+ print(f"⚠️ Collection {collection_name} not accessible: {e}")
3078
+ invalid_collections.append(collection_name)
3079
+
3080
+ # Remove invalid collections
3081
+ for collection_name in invalid_collections:
3082
+ if collection_name in self.indexed_docs:
3083
+ del self.indexed_docs[collection_name]
3084
+ print(f"πŸ—‘οΈ Removed invalid collection: {collection_name}")
3085
+
3086
+ return len(invalid_collections)
3087
+
3088
+ def _check_collections_exist(self):
3089
+ # This method should be implemented to check if collections exist in Milvus
3090
+ pass
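Note: `_check_collections_exist` is left as a stub in this commit. One hedged way it could be filled in later (assuming a local Milvus Lite database file; the URI below is an assumption, not something this commit configures) is to ask Milvus directly:

```python
from pymilvus import MilvusClient

def _check_collections_exist(self):
    """Hypothetical implementation: report which indexed collections Milvus still knows about."""
    client = MilvusClient(uri="./milvus.db")  # assumed URI, not set anywhere in this commit
    return {name: client.has_collection(collection_name=name)
            for name in self.indexed_docs}
```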
3091
 
3092
  def create_ui():
3093
  app = PDFSearchApp()
3094
 
3095
  with gr.Blocks(theme=gr.themes.Ocean(), css="footer{display:none !important}") as demo:
 
 
 
 
3096
  gr.Markdown("# Collar Multimodal RAG Demo - Streamlined")
3097
+ gr.Markdown("Basic document upload and search (no authentication)")
3098
+
3099
+ # Document Upload
3100
+ with gr.Tab("πŸ“ Document Upload"):
 
 
 
3101
  with gr.Column():
3102
+ gr.Markdown("### Upload Documents")
3103
  folder_name_input = gr.Textbox(
3104
+ label="Collection Name (Optional)",
3105
+ placeholder="Optional name for this document collection"
3106
  )
3107
  max_pages_input = gr.Slider(
3108
  minimum=1,
 
3112
  label="Max pages to extract and index per document"
3113
  )
3114
  file_input = gr.Files(
3115
+ label="Upload PPTs/PDFs (Multiple files supported)",
3116
  file_count="multiple"
3117
  )
3118
+ upload_btn = gr.Button("Upload", variant="primary")
3119
  upload_status = gr.Textbox(label="Upload Status", interactive=False)
 
 
 
3120
 
3121
  # Enhanced Query Tab
3122
  with gr.Tab("πŸ” Advanced Query"):
 
3185
 
3186
 
3187
  # Event handlers
 
 
 
3188
  upload_btn.click(
3189
  fn=app.upload_and_convert,
3190
+ inputs=[file_input, max_pages_input, folder_name_input],
3191
  outputs=[upload_status]
3192
  )
3193
 
 
 
 
 
 
 
3194
  # Query events
3195
  search_btn.click(
3196
  fn=app.search_documents,
3197
+ inputs=[query_input, num_results],
3198
  outputs=[path, images, llm_answer, cited_pages_display, csv_download, doc_download, excel_download]
3199
  )
3200
 
colpali_manager.py CHANGED
@@ -25,7 +25,7 @@ import dotenv
  dotenv_file = dotenv.find_dotenv()
  dotenv.load_dotenv(dotenv_file)
 
- model_name = os.environ['colpali'] #"vidore/colSmol-256M"
+ model_name = 'vidore/colpali-v1.3' #"vidore/colSmol-256M"
  device = get_torch_device("cuda") #try using cpu instead of cuda?
 
  #switch to locally downloading models & loading locally rather than from hf
@@ -97,7 +97,7 @@ class ColpaliManager:
  return [Image.open(path) for path in paths]
 
  @spaces.GPU
- def process_images(self, image_paths:list[str], batch_size=int(os.environ['batchsize'])):
+ def process_images(self, image_paths:list[str], batch_size=5):
  model.to("cuda")
  print(f"Processing {len(image_paths)} image_paths")
 
@@ -161,7 +161,7 @@ class ColpaliManager:
 
  dataloader = DataLoader(
  dataset=ListDataset[str](texts),
- batch_size=int(os.environ['batchsize']), #OG is 5, try reducing batch size to maximise gpu use
+ batch_size=5, #OG is 5, try reducing batch size to maximise gpu use
  shuffle=False,
  collate_fn=lambda x: processor.process_queries(x),
  )
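Note: this hunk replaces the required `colpali` and `batchsize` environment variables with hard-coded values, which removes the KeyError on a missing .env but also removes configurability. A hedged middle ground (an assumption, not part of this commit) is to fall back to defaults:

```python
import os

# Hypothetical fallback-style configuration; the keys mirror the old .env variables
model_name = os.environ.get("colpali", "vidore/colpali-v1.3")
batch_size = int(os.environ.get("batchsize", "5"))
```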
middleware.py CHANGED
@@ -43,20 +43,40 @@ class Middleware:
43
  print("Indexing completed")
44
 
45
  return image_paths
46
-
 
 
 
47
 
48
 
49
  def search(self, search_queries: list[str], topk: int = 10):
50
- print(f"Searching for {len(search_queries)} queries with topk={topk}")
 
 
 
51
 
52
  final_res = []
53
 
54
- for query in search_queries:
55
- print(f"Searching for query: {query}")
 
 
56
  query_vec = colpali_manager.process_text([query])[0]
 
 
 
57
  search_res = self.milvus_manager.search(query_vec, topk=topk)
58
- print(f"Search result: {len(search_res)} results for query: {query}")
 
 
 
 
59
  final_res.append(search_res)
60
 
 
 
 
 
 
61
  return final_res
62
 
 
43
  print("Indexing completed")
44
 
45
  return image_paths
46
+
47
+ def drop_collection(self):
48
+ """Drop the current collection from Milvus"""
49
+ return self.milvus_manager.drop_collection()
50
 
51
 
52
  def search(self, search_queries: list[str], topk: int = 10):
53
+ print(f"\nπŸ” MIDDLEWARE SEARCH INITIATED")
54
+ print(f"πŸ“ Queries to process: {len(search_queries)}")
55
+ print(f"🎯 Top-k requested: {topk}")
56
+ print("-" * 60)
57
 
58
  final_res = []
59
 
60
+ for i, query in enumerate(search_queries, 1):
61
+ print(f"\nπŸ” Processing Query {i}/{len(search_queries)}: '{query}'")
62
+ print(f"πŸ“Š Converting query to vector representation...")
63
+
64
  query_vec = colpali_manager.process_text([query])[0]
65
+ print(f"βœ… Query vector generated (dimension: {len(query_vec)})")
66
+
67
+ print(f"πŸ” Executing vector search in Milvus...")
68
  search_res = self.milvus_manager.search(query_vec, topk=topk)
69
+
70
+ print(f"βœ… Search completed: {len(search_res)} results retrieved")
71
+ if search_res:
72
+ print(f"πŸ“Š Score range: {search_res[0][0]:.4f} (highest) to {search_res[-1][0]:.4f} (lowest)")
73
+
74
  final_res.append(search_res)
75
 
76
+ print(f"\nπŸŽ‰ MIDDLEWARE SEARCH COMPLETED")
77
+ print(f"πŸ“Š Total queries processed: {len(search_queries)}")
78
+ print(f"πŸ“„ Total results across all queries: {sum(len(res) for res in final_res)}")
79
+ print("=" * 60)
80
+
81
  return final_res
82
 
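Note: with the new result format, each hit returned through this middleware is a `(score, doc_id)` tuple. A minimal usage sketch mirroring how app.py drives it after this commit (the collection name is illustrative):

```python
from middleware import Middleware

middleware = Middleware("documents_20250101_120000", create_collection=False)
results = middleware.search(["safety procedures"], topk=20)[0]
for score, doc_id in results:
    print(f"page {doc_id + 1}: score {score:.4f}")
```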
milvus_manager.py CHANGED
@@ -1,49 +1,24 @@
1
  from pymilvus import MilvusClient, DataType
2
- try:
3
- from milvus import default_server # Milvus Lite
4
- except Exception:
5
- default_server = None
6
  import numpy as np
7
  import concurrent.futures
8
- from pymilvus import Collection
9
- import os
10
 
11
  class MilvusManager:
12
  def __init__(self, milvus_uri, collection_name, create_collection, dim=128):
13
-
14
- #import environ variables from .env
15
- import dotenv
16
- # Load the .env file
17
- dotenv_file = dotenv.find_dotenv()
18
- dotenv.load_dotenv(dotenv_file)
19
-
20
- # Start embedded Milvus Lite server and connect locally
21
- if default_server is not None:
22
- try:
23
- # Optionally set base dir here if desired, e.g. default_server.set_base_dir('volumes/milvus_lite')
24
- default_server.start()
25
- except Exception:
26
- pass
27
- local_uri = f"http://127.0.0.1:{default_server.listen_port}"
28
- self.client = MilvusClient(uri=local_uri)
29
- else:
30
- # Fallback to standard local server (assumes docker-compose or system service)
31
- self.client = MilvusClient(uri="http://127.0.0.1:19530")
32
  self.collection_name = collection_name
 
 
33
  self.dim = dim
34
 
35
- if self.client.has_collection(collection_name=self.collection_name):
36
- self.client.load_collection(collection_name=self.collection_name)
37
- print("Loaded existing collection.")
38
- elif create_collection:
39
  self.create_collection()
40
  self.create_index()
41
 
 
42
  def create_collection(self):
43
  if self.client.has_collection(collection_name=self.collection_name):
44
- print("Collection already exists.")
45
- return
46
-
47
  schema = self.client.create_schema(
48
  auto_id=True,
49
  enable_dynamic_fields=True,
@@ -61,16 +36,19 @@ class MilvusManager:
61
  )
62
 
63
  def create_index(self):
 
 
 
 
64
  index_params = self.client.prepare_index_params()
65
-
66
  index_params.add_index(
67
  field_name="vector",
68
  index_name="vector_index",
69
- index_type="HNSW", #use HNSW option if got more mem, if not use IVF for faster processing
70
- metric_type=os.environ["metrictype"], #"IP"
71
  params={
72
- "M": int(os.environ["mnum"]), #M:16 for HNSW, capital M
73
- "efConstruction": int(os.environ["efnum"]), #500 for HNSW
74
  },
75
  )
76
 
@@ -78,78 +56,33 @@ class MilvusManager:
78
  collection_name=self.collection_name, index_params=index_params, sync=True
79
  )
80
 
81
- def search(self, data, topk):
82
- # Retrieve all collection names from the Milvus client.
83
- collections = self.client.list_collections()
84
-
85
- # Set search parameters (here, using Inner Product metric).
86
- search_params = {"metric_type": os.environ["metrictype"], "params": {}} #default metric type is "IP"
87
-
88
- # Set to store unique (doc_id, collection_name) pairs across all collections.
89
- doc_collection_pairs = set()
90
-
91
- # Query each collection individually
92
- for collection in collections:
93
- self.client.load_collection(collection_name=collection)
94
- print("collection loaded:"+ collection)
95
- results = self.client.search(
96
- collection,
97
- data,
98
- limit=int(os.environ["topk"]), # Adjust limit per collection as needed. (default is 50)
99
- output_fields=["vector", "seq_id", "doc_id"],
100
- search_params=search_params,
101
- )
102
- # Accumulate document IDs along with their originating collection.
103
- for r_id in range(len(results)):
104
- for r in range(len(results[r_id])):
105
- doc_id = results[r_id][r]["entity"]["doc_id"]
106
- doc_collection_pairs.add((doc_id, collection))
107
 
108
- scores = []
 
 
 
 
 
109
 
110
- def rerank_single_doc(doc_id, data, client, collection_name):
111
- # Query for detailed document vectors in the given collection.
112
- doc_colbert_vecs = client.query(
113
- collection_name=collection_name,
114
- filter=f"doc_id in [{doc_id}, {doc_id + 1}]",
115
- output_fields=["seq_id", "vector", "doc"],
116
- limit=16380,
117
- )
118
- # Stack the vectors for dot product computation.
119
- doc_vecs = np.vstack(
120
- [doc_colbert_vecs[i]["vector"] for i in range(len(doc_colbert_vecs))]
121
- )
122
- # Compute a similarity score via dot product.
123
- score = np.dot(data, doc_vecs.T).max(1).sum()
124
- return (score, doc_id, collection_name)
125
 
126
- # Use a thread pool to rerank each document concurrently.
127
- with concurrent.futures.ThreadPoolExecutor(max_workers=300) as executor:
128
- futures = {
129
- executor.submit(rerank_single_doc, doc_id, data, self.client, collection): (doc_id, collection)
130
- for doc_id, collection in doc_collection_pairs
131
- }
132
- for future in concurrent.futures.as_completed(futures):
133
- score, doc_id, collection = future.result()
134
- scores.append((score, doc_id, collection))
135
- #doc_id is page number!
136
-
137
- # Sort the reranked results by score in descending order.
138
- scores.sort(key=lambda x: x[0], reverse=True)
139
- # Unload the collection after search to free memory.
140
- self.client.release_collection(collection_name=collection)
141
-
142
- return scores[:topk] if len(scores) >= topk else scores #topk is the number of scores to return back
143
- """
144
  search_params = {"metric_type": "IP", "params": {}}
145
  results = self.client.search(
146
  self.collection_name,
147
  data,
148
- limit=50,
149
  output_fields=["vector", "seq_id", "doc_id"],
150
  search_params=search_params,
151
  )
152
- doc_ids = {result["entity"]["doc_id"] for result in results[0]}
 
 
 
153
 
154
  scores = []
155
 
@@ -161,10 +94,10 @@ class MilvusManager:
161
  limit=1000,
162
  )
163
  doc_vecs = np.vstack(
164
- [doc["vector"] for doc in doc_colbert_vecs]
165
  )
166
  score = np.dot(data, doc_vecs.T).max(1).sum()
167
- return score, doc_id
168
 
169
  with concurrent.futures.ThreadPoolExecutor(max_workers=300) as executor:
170
  futures = {
@@ -178,13 +111,59 @@ class MilvusManager:
178
  scores.append((score, doc_id))
179
 
180
  scores.sort(key=lambda x: x[0], reverse=True)
181
- return scores[:topk]
182
- """
 
 
 
 
183
 
184
  def insert(self, data):
185
- colbert_vecs = data["colbert_vecs"]
186
  seq_length = len(colbert_vecs)
187
- doc_ids = [data["doc_id"]] * seq_length
188
  seq_ids = list(range(seq_length))
189
  docs = [""] * seq_length
190
  docs[0] = data["filepath"]
@@ -202,17 +181,38 @@ class MilvusManager:
202
  ],
203
  )
204
 
205
- def get_images_as_doc(self, images_with_vectors):
206
- return [
207
- {
208
- "colbert_vecs": image["colbert_vecs"],
209
- "doc_id": idx,
210
- "filepath": image["filepath"],
 
 
 
 
211
  }
212
- for idx, image in enumerate(images_with_vectors)
213
- ]
 
 
214
 
215
  def insert_images_data(self, image_data):
216
  data = self.get_images_as_doc(image_data)
217
- for item in data:
218
- self.insert(item)
 
 
 
1
  from pymilvus import MilvusClient, DataType
 
 
 
 
2
  import numpy as np
3
  import concurrent.futures
4
+
 
5
 
6
  class MilvusManager:
7
  def __init__(self, milvus_uri, collection_name, create_collection, dim=128):
8
+ self.client = MilvusClient(uri=milvus_uri)
 
 
 
9
  self.collection_name = collection_name
10
+ if self.client.has_collection(collection_name=self.collection_name):
11
+ self.client.load_collection(collection_name)
12
  self.dim = dim
13
 
14
+ if create_collection:
 
 
 
15
  self.create_collection()
16
  self.create_index()
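Note on the simplified constructor: MilvusClient accepts either a server URI or a local file path, and with recent pymilvus releases a file path runs Milvus Lite in-process, which is what stands in for the removed `default_server` bootstrap. A minimal, hedged usage sketch (URI and collection name are placeholders):

```python
from milvus_manager import MilvusManager

# A file path gives an embedded Milvus Lite database; an http:// URI targets a running server.
manager = MilvusManager("./milvus_demo.db", "documents_20250101_120000", create_collection=True)
```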
17
 
18
+
19
  def create_collection(self):
20
  if self.client.has_collection(collection_name=self.collection_name):
21
+ self.client.drop_collection(collection_name=self.collection_name)
 
 
22
  schema = self.client.create_schema(
23
  auto_id=True,
24
  enable_dynamic_fields=True,
 
36
  )
37
 
38
  def create_index(self):
39
+ self.client.release_collection(collection_name=self.collection_name)
40
+ self.client.drop_index(
41
+ collection_name=self.collection_name, index_name="vector"
42
+ )
43
  index_params = self.client.prepare_index_params()
 
44
  index_params.add_index(
45
  field_name="vector",
46
  index_name="vector_index",
47
+ index_type="FLAT",
48
+ metric_type="IP",
49
  params={
50
+ "M": 16,
51
+ "efConstruction": 500,
52
  },
53
  )
54
 
 
56
  collection_name=self.collection_name, index_params=index_params, sync=True
57
  )
58
 
59
+ def create_scalar_index(self):
60
+ self.client.release_collection(collection_name=self.collection_name)
 
 
 
61
 
62
+ index_params = self.client.prepare_index_params()
63
+ index_params.add_index(
64
+ field_name="doc_id",
65
+ index_name="int32_index",
66
+ index_type="INVERTED",
67
+ )
68
 
69
+ self.client.create_index(
70
+ collection_name=self.collection_name, index_params=index_params, sync=True
71
+ )
 
 
 
72
 
73
+ def search(self, data, topk):
 
 
 
74
  search_params = {"metric_type": "IP", "params": {}}
75
  results = self.client.search(
76
  self.collection_name,
77
  data,
78
+ limit=int(50),
79
  output_fields=["vector", "seq_id", "doc_id"],
80
  search_params=search_params,
81
  )
82
+ doc_ids = set()
83
+ for r_id in range(len(results)):
84
+ for r in range(len(results[r_id])):
85
+ doc_ids.add(results[r_id][r]["entity"]["doc_id"])
86
 
87
  scores = []
88
 
 
94
  limit=1000,
95
  )
96
  doc_vecs = np.vstack(
97
+ [doc_colbert_vecs[i]["vector"] for i in range(len(doc_colbert_vecs))]
98
  )
99
  score = np.dot(data, doc_vecs.T).max(1).sum()
100
+ return (score, doc_id)
101
 
102
  with concurrent.futures.ThreadPoolExecutor(max_workers=300) as executor:
103
  futures = {
 
111
  scores.append((score, doc_id))
112
 
113
  scores.sort(key=lambda x: x[0], reverse=True)
114
+
115
+ # πŸ“Š DETAILED SCORE LOGGING - Print page numbers with highest scores
116
+ print("\n" + "="*80)
117
+ print("πŸ“Š RETRIEVAL SCORES - PAGE NUMBERS WITH HIGHEST SCORES")
118
+ print("="*80)
119
+ print(f"πŸ” Collection: {self.collection_name}")
120
+ print(f"πŸ“„ Total documents found: {len(scores)}")
121
+ print(f"🎯 Requested top-k: {topk}")
122
+ print("-"*80)
123
+
124
+ # Display top 10 scores with detailed information
125
+ display_count = min(10, len(scores))
126
+ for i, (score, doc_id) in enumerate(scores[:display_count]):
127
+ page_num = doc_id + 1 # Convert doc_id to page number (0-based to 1-based)
128
+ relevance_level = self._get_relevance_level(score)
129
+ print(f"πŸ“„ Page {page_num:2d} (doc_id: {doc_id:2d}) | Score: {score:8.4f} | {relevance_level}")
130
+
131
+ if len(scores) > display_count:
132
+ print(f"... and {len(scores) - display_count} more results")
133
+
134
+ print("-"*80)
135
+ print(f"πŸ† HIGHEST SCORING PAGES:")
136
+ top_3 = scores[:3]
137
+ for i, (score, doc_id) in enumerate(top_3, 1):
138
+ page_num = doc_id + 1
139
+ print(f" {i}. Page {page_num} - Score: {score:.4f}")
140
+
141
+ print("="*80 + "\n")
142
+
143
+ if len(scores) >= topk:
144
+ return scores[:topk]
145
+ else:
146
+ return scores
147
+
148
+ def _get_relevance_level(self, score):
149
+ """Get human-readable relevance level based on score"""
150
+ if score >= 0.90:
151
+ return "🟒 EXCELLENT - Highly relevant"
152
+ elif score >= 0.80:
153
+ return "🟑 VERY GOOD - Very relevant"
154
+ elif score >= 0.70:
155
+ return "🟠 GOOD - Relevant"
156
+ elif score >= 0.60:
157
+ return "πŸ”΅ MODERATE - Somewhat relevant"
158
+ elif score >= 0.50:
159
+ return "🟣 BASIC - Minimally relevant"
160
+ else:
161
+ return "πŸ”΄ POOR - Not relevant"
162
 
163
  def insert(self, data):
164
+ colbert_vecs = [vec for vec in data["colbert_vecs"]]
165
  seq_length = len(colbert_vecs)
166
+ doc_ids = [data["doc_id"] for i in range(seq_length)]
167
  seq_ids = list(range(seq_length))
168
  docs = [""] * seq_length
169
  docs[0] = data["filepath"]
 
181
  ],
182
  )
183
 
184
+
185
+ def get_images_as_doc(self, images_with_vectors:list):
186
+
187
+ images_data = []
188
+
189
+ for i in range(len(images_with_vectors)):
190
+ data = {
191
+ "colbert_vecs": images_with_vectors[i]["colbert_vecs"],
192
+ "doc_id": i,
193
+ "filepath": images_with_vectors[i]["filepath"],
194
  }
195
+ images_data.append(data)
196
+
197
+ return images_data
198
+
199
 
200
  def insert_images_data(self, image_data):
201
  data = self.get_images_as_doc(image_data)
202
+
203
+ for i in range(len(data)):
204
+ self.insert(data[i])
205
+
206
+ def drop_collection(self):
207
+ """Drop the current collection from Milvus"""
208
+ try:
209
+ if self.client.has_collection(collection_name=self.collection_name):
210
+ self.client.drop_collection(collection_name=self.collection_name)
211
+ print(f"πŸ—‘οΈ Dropped Milvus collection: {self.collection_name}")
212
+ return True
213
+ else:
214
+ print(f"⚠️ Collection {self.collection_name} does not exist in Milvus")
215
+ return False
216
+ except Exception as e:
217
+ print(f"❌ Error dropping collection {self.collection_name}: {e}")
218
+ return False
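
Note (not part of the diff): the rerank step above is ColBERT-style late interaction, the same MaxSim scoring as the line score = np.dot(data, doc_vecs.T).max(1).sum(). A minimal standalone sketch with hypothetical toy arrays:

    import numpy as np

    # Toy illustration of MaxSim scoring (assumed shapes, not part of the commit):
    # every query-token vector is compared against every page-token vector,
    # the best match per query token is kept, and the sum is the page score.
    query_vecs = np.random.rand(12, 128)   # 12 query tokens, dim 128 (hypothetical)
    page_vecs = np.random.rand(300, 128)   # 300 patch vectors for one page (hypothetical)

    score = np.dot(query_vecs, page_vecs.T).max(1).sum()
    print(f"MaxSim score for this page: {score:.4f}")

Because each query token keeps only its best-matching page vector, pages are ranked by how well they cover every part of the query, which is what the per-doc rerank and the sorted scores list above implement.
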
pdf_manager.py CHANGED
@@ -4,7 +4,21 @@ import shutil
 
 class PdfManager:
     def __init__(self):
-        pass
+        # Use relative paths for Hugging Face Spaces compatibility
+        # Get the directory where the main application file is located
+        app_dir = os.path.dirname(os.path.abspath(__file__))
+
+        # Use /tmp for Hugging Face Spaces, fallback to relative path
+        if os.path.exists("/tmp") and os.access("/tmp", os.W_OK):
+            self.base_output_dir = "/tmp/pages"
+            print(f"βœ… Using /tmp directory for Hugging Face Spaces: {self.base_output_dir}")
+        else:
+            # Fallback to relative path from app directory
+            self.base_output_dir = os.path.join(app_dir, "pages")
+            print(f"βœ… Using relative path: {self.base_output_dir}")
+
+        # Ensure the base directory exists
+        os.makedirs(self.base_output_dir, exist_ok=True)
 
     def clear_and_recreate_dir(self, output_folder):
 
@@ -19,7 +33,8 @@ class PdfManager:
 
         #print("Clearing is unused for now for persistency")
 
     def save_images(self, id, pdf_path, max_pages, pages: list[int] = None) -> list[str]:
-        output_folder = f"pages/{id}" #remove last backslash to avoid error, test this
+        # Use absolute path for Hugging Face Spaces compatibility
+        output_folder = os.path.join(self.base_output_dir, id)
         images = convert_from_path(pdf_path)
 
         print(f"Saving images from {pdf_path} to {output_folder}. Max pages: {max_pages}")
@@ -35,7 +50,7 @@ class PdfManager:
             if pages and i not in pages:
                 continue
 
-            full_save_path = f"{output_folder}/page_{i + 1}.png"
+            full_save_path = os.path.join(output_folder, f"page_{i + 1}.png")
 
             #print(f"Saving image to {full_save_path}")
 
@@ -43,4 +58,4 @@ class PdfManager:
 
             num_page_processed += 1
 
-        return [f"{output_folder}/page_{i + 1}.png" for i in range(num_page_processed)]
+        return [os.path.join(output_folder, f"page_{i + 1}.png") for i in range(num_page_processed)]
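
Note (not part of the diff): a hypothetical usage sketch of the new PdfManager path handling. On Spaces the page images land under /tmp/pages/<id>/, otherwise under <app_dir>/pages/<id>/; the id and pdf_path values below are made up for illustration:

    from pdf_manager import PdfManager

    # Hypothetical call: save_images converts the PDF and writes one PNG per page
    # into the base output directory chosen in __init__ above.
    manager = PdfManager()
    image_paths = manager.save_images(
        id="team_docs_20240101_manual",      # hypothetical document id
        pdf_path="/tmp/uploads/manual.pdf",  # hypothetical input file
        max_pages=10,
    )
    print(image_paths)  # e.g. ['/tmp/pages/team_docs_20240101_manual/page_1.png', ...]
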
rag.py CHANGED
@@ -5,7 +5,7 @@ import re
5
  from typing import List
6
  from utils import encode_image
7
  from PIL import Image
8
- from google import genai
9
  import torch
10
  import subprocess
11
  import psutil
@@ -64,30 +64,28 @@ class Rag:
64
 
65
  return response_text
66
 
67
- def get_answer_from_gemini(self, query: str, image_paths: List[str]) -> str:
68
- print(f"Querying Gemini 2.5 Pro for query={query}, image_paths={image_paths}")
 
 
 
69
  try:
70
- # Use environment variable GEMINI_API_KEY
71
- api_key = os.environ.get('GEMINI_API_KEY')
72
- if not api_key:
73
- return "Error: GEMINI_API_KEY is not set."
74
-
75
- genai.configure(api_key=api_key)
76
- model = genai.GenerativeModel('gemini-2.5-pro')
77
-
78
- # Load images
79
- images = []
80
- for p in image_paths:
81
- try:
82
- images.append(Image.open(p))
83
- except Exception:
84
- pass
85
-
86
- chat_session = model.start_chat()
87
- response = chat_session.send_message([*images, query])
88
- return response.text
89
  except Exception as e:
90
- print(f"Gemini error: {e}")
91
  return f"Error: {str(e)}"
92
 
93
  #os.environ['OPENAI_API_KEY'] = "for the love of Jesus let this work"
@@ -100,13 +98,160 @@ class Rag:
100
  dotenv_file = dotenv.find_dotenv()
101
  dotenv.load_dotenv(dotenv_file)
102
 
103
- # This function formerly used Ollama. Replace with Gemini 2.5 Pro.
104
- print(f"Querying Gemini (replacement for Ollama) for query={query}, imagesPaths={imagesPaths}")
105
- try:
106
- enhanced_query = f"Use all {len(imagesPaths)} pages to answer comprehensively.\n\nQuery: {query}"
107
- return self.get_answer_from_gemini(enhanced_query, imagesPaths)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  except Exception as e:
109
- print(f"Gemini replacement error: {e}")
110
  return None
111
 
112
 
 
5
  from typing import List
6
  from utils import encode_image
7
  from PIL import Image
8
+ from ollama import chat
9
  import torch
10
  import subprocess
11
  import psutil
 
64
 
65
  return response_text
66
 
67
+ def get_answer_from_gemini(self, query, imagePaths):
68
+
69
+
70
+ print(f"Querying Gemini for query={query}, imagePaths={imagePaths}")
71
+
72
  try:
73
+ client = genai.Client(api_key="AIzaSyCwRr9054tCuh2S8yGpwKFvOAxYMT4WNIs")
74
+
75
+ images = [Image.open(path) for path in imagePaths]
76
+
77
+ response = client.models.generate_content(
78
+ model="gemini-2.5-flash",
79
+ contents=[images, query],
80
+ )
81
+
82
+ print(response.text)
83
+ answer = response.text
84
+
85
+ return answer
86
+
 
 
 
 
 
87
  except Exception as e:
88
+ print(f"An error occurred while querying Gemini: {e}")
89
  return f"Error: {str(e)}"
90
 
91
  #os.environ['OPENAI_API_KEY'] = "for the love of Jesus let this work"
 
98
  dotenv_file = dotenv.find_dotenv()
99
  dotenv.load_dotenv(dotenv_file)
100
 
101
+ #ollama method below
102
+
103
+ torch.cuda.empty_cache() #release cuda so that ollama can use gpu!
104
+
105
+
106
+ os.environ['OLLAMA_FLASH_ATTENTION'] = os.environ['flashattn'] #int "1"
107
+ if os.environ['ollama'] == "minicpm-v":
108
+ os.environ['ollama'] = "minicpm-v:8b-2.6-q8_0" #set to quantized version
109
+ elif os.environ['ollama'] == "gemma3":
110
+ os.environ['ollama'] = "gemma3:12b" #set to upscaled version 12b when needed
111
+ # Add specific environment variables for Gemma3 to prevent raw token issues
112
+ os.environ['OLLAMA_KEEP_ALIVE'] = "5m"
113
+ os.environ['OLLAMA_ORIGINS'] = "*"
114
+
115
+
116
+ # Close model thread (colpali)
117
+ print(f"Querying OpenAI for query={query}, imagesPaths={imagesPaths}")
118
+
119
+ try:
120
+
121
+ # Enhanced prompt for more detailed responses with explicit page usage
122
+ enhanced_query = f"""
123
+ Please provide a comprehensive and detailed answer to the following query.
124
+ Use ALL available information from the provided document images to give a thorough response.
125
+
126
+ Query: {query}
127
+
128
+ CRITICAL INSTRUCTIONS:
129
+ - You have been provided with {len(imagesPaths)} document page(s)
130
+ - You MUST reference information from ALL {len(imagesPaths)} page(s) in your response
131
+ - Do not skip any pages - each page contains relevant information
132
+ - If you mention one page, you must also mention the others
133
+ - Ensure your response reflects the complete information from all pages
134
+
135
+ Instructions for detailed response:
136
+ 1. Provide extensive background information and context
137
+ 2. Include specific details, examples, and data points from ALL documents
138
+ 3. Explain concepts thoroughly with step-by-step breakdowns
139
+ 4. Provide comprehensive analysis rather than simple answers when requested
140
+ 5. Explicitly reference each page and what information it contributes
141
+ 6. Cross-reference information between pages when relevant
142
+ 7. Ensure no page is left unmentioned in your analysis
143
+
144
+ SPECIAL INSTRUCTIONS FOR TABULAR DATA:
145
+ - If the query requests a table, list, or structured data, organize your response in a clear, structured format
146
+ - Use numbered lists, bullet points, or clear categories when appropriate
147
+ - Include specific data points or comparisons when available
148
+ - Structure information in a way that can be easily converted to a table format
149
+
150
+ IMPORTANT: Respond with natural, human-readable text only. Do not include any special tokens, codes, or technical identifiers in your response.
151
+
152
+ Make sure to acknowledge and use information from all {len(imagesPaths)} provided pages.
153
+ """
154
+
155
+ # Try with current model first
156
+ current_model = os.environ['ollama']
157
+
158
+ # Set different options based on the model
159
+ if "gemma3" in current_model.lower():
160
+ # Specific options for Gemma3 to prevent raw token issues
161
+ model_options = {
162
+ "num_predict": 1024, # Shorter responses for Gemma3
163
+ "stop": ["<eos>", "<|endoftext|>", "</s>", "<|im_end|>"], # More stop tokens
164
+ "top_k": 20, # Lower top_k for more focused generation
165
+ "top_p": 0.8, # Lower top_p for more deterministic output
166
+ "repeat_penalty": 1.2, # Higher repeat penalty
167
+ "seed": 42, # Consistent results
168
+ "temperature": 0.7, # Lower temperature for more focused responses
169
+ }
170
+ else:
171
+ # Default options for other models
172
+ model_options = {
173
+ "num_predict": 2048, # Limit response length
174
+ "stop": ["<eos>", "<|endoftext|>", "</s>"], # Stop at end tokens
175
+ "top_k": 40, # Reduce randomness
176
+ "top_p": 0.9, # Nucleus sampling
177
+ "repeat_penalty": 1.1, # Prevent repetition
178
+ "seed": 42, # Consistent results
179
+ }
180
+
181
+ response = chat(
182
+ model=current_model,
183
+ messages=[
184
+ {
185
+ 'role': 'user',
186
+ 'content': enhanced_query,
187
+ 'images': imagesPaths,
188
+ "temperature":float(os.environ['temperature']), #test if temp makes a diff
189
+ }
190
+ ],
191
+ options=model_options
192
+ )
193
+
194
+ answer = response.message.content
195
+
196
+ # Clean the response to handle raw token issues
197
+ cleaned_answer = self._clean_raw_token_response(answer)
198
+
199
+ # If the cleaned answer is still problematic, try fallback models
200
+ if cleaned_answer and "❌ **Model Response Error**" in cleaned_answer:
201
+ print(f"⚠️ Primary model {current_model} failed, trying fallback models...")
202
+
203
+ # List of fallback models to try
204
+ fallback_models = [
205
+ "llama3.2-vision:latest",
206
+ "llava:latest",
207
+ "bakllava:latest",
208
+ "llama3.2:latest"
209
+ ]
210
+
211
+ for fallback_model in fallback_models:
212
+ try:
213
+ print(f"πŸ”„ Trying fallback model: {fallback_model}")
214
+ response = chat(
215
+ model=fallback_model,
216
+ messages=[
217
+ {
218
+ 'role': 'user',
219
+ 'content': enhanced_query,
220
+ 'images': imagesPaths,
221
+ "temperature":float(os.environ['temperature']),
222
+ }
223
+ ],
224
+ options={
225
+ "num_predict": 2048,
226
+ "stop": ["<eos>", "<|endoftext|>", "</s>"],
227
+ "top_k": 40,
228
+ "top_p": 0.9,
229
+ "repeat_penalty": 1.1,
230
+ "seed": 42,
231
+ }
232
+ )
233
+
234
+ fallback_answer = response.message.content
235
+ cleaned_fallback = self._clean_raw_token_response(fallback_answer)
236
+
237
+ if cleaned_fallback and "❌ **Model Response Error**" not in cleaned_fallback:
238
+ print(f"βœ… Fallback model {fallback_model} succeeded")
239
+ return cleaned_fallback
240
+
241
+ except Exception as fallback_error:
242
+ print(f"❌ Fallback model {fallback_model} failed: {fallback_error}")
243
+ continue
244
+
245
+ # If all fallbacks fail, return the original error
246
+ return cleaned_answer
247
+
248
+ print(f"Original response: {answer}")
249
+ print(f"Cleaned response: {cleaned_answer}")
250
+
251
+ return cleaned_answer
252
+
253
  except Exception as e:
254
+ print(f"An error occurred while querying OpenAI: {e}")
255
  return None
256
 
257