MouadHsb committed
Commit 0c0f56d
1 Parent(s): 08ba4b8

changing library
app/__pycache__/main.cpython-310.pyc CHANGED
Binary files a/app/__pycache__/main.cpython-310.pyc and b/app/__pycache__/main.cpython-310.pyc differ
 
app/services/__pycache__/embedding_service.cpython-310.pyc CHANGED
Binary files a/app/services/__pycache__/embedding_service.cpython-310.pyc and b/app/services/__pycache__/embedding_service.cpython-310.pyc differ
 
app/services/embedding_service copy.py ADDED
@@ -0,0 +1,97 @@
+import logging
+import numpy as np
+from typing import List, Dict, Any, Tuple
+from sentence_transformers import SentenceTransformer
+import torch
+import os
+
+os.environ["PYTORCH_ENABLE_META_TENSORS"] = "0"
+
+logger = logging.getLogger(__name__)
+
+class EmbeddingService:
+    """Service for handling document embeddings using Sentence Transformers."""
+
+    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
+        """
+        Initialize the embedding system.
+
+        Args:
+            model_name: Name of the Sentence Transformers model to use. Default is "all-MiniLM-L6-v2" (80MB).
+        """
+        logger.info(f"Loading embedding model: {model_name}")
+
+        # Explicitly set device to CPU to avoid meta tensor issue
+        torch.set_grad_enabled(False)
+        # With 16GB of RAM, we can afford to use standard loading without memory optimization
+        # Force the model to load fully into memory without any meta tensors
+        torch.cuda.empty_cache() if torch.cuda.is_available() else None
+
+        #########################################################
+        torch.set_default_device("cpu")
+        #########################################################
+
+        self.model = SentenceTransformer(model_name, device="cpu")
+
+        # Ensure model is fully materialized, not using meta tensors
+        for param in self.model.parameters():
+            if hasattr(param, 'is_meta') and param.is_meta:
+                # Should not happen with environment variable set, but just in case
+                raise RuntimeError("Meta tensors still detected despite disabling them")
+
+        self.embedding_dim = self.model.get_sentence_embedding_dimension()
+        logger.info(f"Embedding dimension: {self.embedding_dim}")
+
+    def embed_documents(self, documents: List[Dict[str, Any]]) -> Tuple[np.ndarray, List[Dict[str, Any]]]:
+        """
+        Embed a list of documents.
+
+        Args:
+            documents: List of document dictionaries.
+
+        Returns:
+            Tuple of (embeddings array, documents).
+        """
+        texts = [doc["text"] for doc in documents]
+        logger.info(f"Embedding {len(texts)} documents...")
+        embeddings = self.model.encode(texts, show_progress_bar=True, convert_to_numpy=True)
+
+        return embeddings, documents
+
+    def embed_query(self, query: str) -> np.ndarray:
+        """
+        Embed a search query.
+
+        Args:
+            query: The search query.
+
+        Returns:
+            Query embedding array.
+        """
+        return self.model.encode([query], convert_to_numpy=True)
+
+    def get_model_info(self) -> Dict[str, Any]:
+        """
+        Get information about the embedding model.
+
+        Returns:
+            Dictionary with model information.
+        """
+        # Access the model attributes in a safer way
+        try:
+            model_name = self.model._model_config.get('name',
+                self.model._model_config.get('model_name_or_path', 'unknown'))
+        except:
+            model_name = str(self.model)  # Fallback to string representation
+
+        try:
+            max_seq_length = self.model.get_max_seq_length()
+        except:
+            max_seq_length = 512  # Default value if method not available
+
+        return {
+            "model_name": model_name,
+            "dimension": self.embedding_dim,
+            "max_seq_length": max_seq_length,
+            "normalize_embeddings": getattr(self.model, "normalize_embeddings", True)
+        }
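
Note: the kept copy above still routes everything through SentenceTransformer.encode(), which bundles tokenization, pooling, and normalization into a single call. For comparison with the rewrite in the next file, here is a minimal usage sketch of this old path (the sample documents are hypothetical; for all-MiniLM-L6-v2 the vectors are 384-dimensional, matching the log output below):

# Usage sketch of the old sentence-transformers path (hypothetical sample docs).
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
docs = [{"text": "Quantum machine learning survey"},
        {"text": "Hybrid quantum neural networks"}]
# encode() tokenizes, mean-pools, and (for this model) L2-normalizes internally
embeddings = model.encode([d["text"] for d in docs], convert_to_numpy=True)
print(embeddings.shape)  # (2, 384)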
app/services/embedding_service.py CHANGED
@@ -1,47 +1,62 @@
 import logging
 import numpy as np
-from typing import List, Dict, Any, Tuple
-from sentence_transformers import SentenceTransformer
 import torch
-import os
-
-os.environ["PYTORCH_ENABLE_META_TENSORS"] = "0"
+from typing import List, Dict, Any, Tuple
+from transformers import AutoModel, AutoTokenizer
 
 logger = logging.getLogger(__name__)
 
 class EmbeddingService:
-    """Service for handling document embeddings using Sentence Transformers."""
+    """Service for handling document embeddings using Hugging Face Transformers."""
 
-    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
+    def __init__(self, model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
         """
         Initialize the embedding system.
 
         Args:
-            model_name: Name of the Sentence Transformers model to use. Default is "all-MiniLM-L6-v2" (80MB).
+            model_name: Name of the model to use. Default is "sentence-transformers/all-MiniLM-L6-v2".
         """
         logger.info(f"Loading embedding model: {model_name}")
 
-        # Explicitly set device to CPU to avoid meta tensor issue
+        # Disable gradients for inference
        torch.set_grad_enabled(False)
-        # With 16GB of RAM, we can afford to use standard loading without memory optimization
-        # Force the model to load fully into memory without any meta tensors
-        torch.cuda.empty_cache() if torch.cuda.is_available() else None
-
-        #########################################################
-        torch.set_default_device("cpu")
-        #########################################################
 
-        self.model = SentenceTransformer(model_name, device="cpu")
-
-        # Ensure model is fully materialized, not using meta tensors
-        for param in self.model.parameters():
-            if hasattr(param, 'is_meta') and param.is_meta:
-                # Should not happen with environment variable set, but just in case
-                raise RuntimeError("Meta tensors still detected despite disabling them")
+        # Load tokenizer and model
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        self.model = AutoModel.from_pretrained(model_name)
+
+        # Move model to CPU and eval mode
+        self.model = self.model.to("cpu")
+        self.model.eval()
 
-        self.embedding_dim = self.model.get_sentence_embedding_dimension()
+        # Get embedding dimension from model config
+        self.embedding_dim = self.model.config.hidden_size
         logger.info(f"Embedding dimension: {self.embedding_dim}")
+
+    def _mean_pooling(self, model_output, attention_mask):
+        """
+        Perform mean pooling on token embeddings.
+
+        Args:
+            model_output: Output from the transformer model
+            attention_mask: Attention mask to avoid padding tokens
+
+        Returns:
+            Sentence embeddings
+        """
+        # First element of model_output contains token embeddings
+        token_embeddings = model_output[0]
+
+        # Expand attention mask from [batch_size, seq_length] to [batch_size, seq_length, hidden_size]
+        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+
+        # Sum token embeddings and divide by the expanded mask
+        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
+        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+
+        # Return mean-pooled embeddings
+        return sum_embeddings / sum_mask
 
     def embed_documents(self, documents: List[Dict[str, Any]]) -> Tuple[np.ndarray, List[Dict[str, Any]]]:
         """
         Embed a list of documents.
@@ -54,7 +69,38 @@ class EmbeddingService:
         """
         texts = [doc["text"] for doc in documents]
         logger.info(f"Embedding {len(texts)} documents...")
-        embeddings = self.model.encode(texts, show_progress_bar=True, convert_to_numpy=True)
+
+        # Process in batches to avoid OOM
+        batch_size = 32
+        all_embeddings = []
+
+        for i in range(0, len(texts), batch_size):
+            batch_texts = texts[i:i+batch_size]
+
+            # Tokenize batch
+            encoded_input = self.tokenizer(
+                batch_texts,
+                padding=True,
+                truncation=True,
+                max_length=512,
+                return_tensors='pt'
+            ).to("cpu")
+
+            # Compute token embeddings
+            with torch.no_grad():
+                model_output = self.model(**encoded_input)
+
+            # Apply mean pooling
+            batch_embeddings = self._mean_pooling(model_output, encoded_input['attention_mask'])
+
+            # Normalize embeddings
+            batch_embeddings = torch.nn.functional.normalize(batch_embeddings, p=2, dim=1)
+
+            # Convert to numpy and add to result
+            all_embeddings.append(batch_embeddings.cpu().numpy())
+
+        # Combine all batches
+        embeddings = np.vstack(all_embeddings)
 
         return embeddings, documents
 
@@ -68,7 +114,26 @@ class EmbeddingService:
         Returns:
             Query embedding array.
         """
-        return self.model.encode([query], convert_to_numpy=True)
+        # Tokenize query
+        encoded_input = self.tokenizer(
+            [query],
+            padding=True,
+            truncation=True,
+            max_length=512,
+            return_tensors='pt'
+        ).to("cpu")
+
+        # Compute token embeddings
+        with torch.no_grad():
+            model_output = self.model(**encoded_input)
+
+        # Apply mean pooling
+        embeddings = self._mean_pooling(model_output, encoded_input['attention_mask'])
+
+        # Normalize embeddings
+        embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
+
+        return embeddings.cpu().numpy()
 
     def get_model_info(self) -> Dict[str, Any]:
         """
@@ -77,21 +142,9 @@ class EmbeddingService:
         Returns:
             Dictionary with model information.
         """
-        # Access the model attributes in a safer way
-        try:
-            model_name = self.model._model_config.get('name',
-                self.model._model_config.get('model_name_or_path', 'unknown'))
-        except:
-            model_name = str(self.model)  # Fallback to string representation
-
-        try:
-            max_seq_length = self.model.get_max_seq_length()
-        except:
-            max_seq_length = 512  # Default value if method not available
-
         return {
-            "model_name": model_name,
+            "model_name": self.model.config.name_or_path,
             "dimension": self.embedding_dim,
-            "max_seq_length": max_seq_length,
-            "normalize_embeddings": getattr(self.model, "normalize_embeddings", True)
-        }
+            "max_seq_length": self.model.config.max_position_embeddings,
+            "normalize_embeddings": True  # We're always normalizing
+        }
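
The rewrite above reimplements what encode() previously did in one call: tokenize, run the transformer, mean-pool token embeddings over the attention mask, and L2-normalize. A rough parity sketch, assuming both libraries are installed (the sample sentence is hypothetical); the two paths should produce near-identical vectors for this model:

# Parity check sketch: manual transformers pipeline vs. SentenceTransformer.encode().
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoModel, AutoTokenizer

name = "sentence-transformers/all-MiniLM-L6-v2"
texts = ["latest advancements in quantum computing"]  # hypothetical sample

# Old path: one call handles tokenization, mean pooling, and normalization
old = SentenceTransformer(name, device="cpu").encode(texts, convert_to_numpy=True)

# New path: the steps from the diff, written out (mask broadcast instead of expand)
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModel.from_pretrained(name).eval()
enc = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
with torch.no_grad():
    token_embeddings = model(**enc)[0]
mask = enc["attention_mask"].unsqueeze(-1).float()
new = (token_embeddings * mask).sum(1) / mask.sum(1).clamp(min=1e-9)  # mean pooling
new = torch.nn.functional.normalize(new, p=2, dim=1).numpy()          # L2 normalize

print(np.allclose(old, new, atol=1e-5))  # expected: True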
logs/app.log CHANGED
@@ -2487,3 +2487,28 @@
 2025-05-13 15:26:46,505 - INFO - Application shut down
 2025-05-13 15:26:51,494 - INFO - Application started
 2025-05-13 15:26:56,341 - INFO - Application started
+2025-05-13 15:27:24,810 - INFO - ArXiv API service initialized
+2025-05-13 15:27:24,811 - INFO - Document processor initialized with chunk_size=500, chunk_overlap=50
+2025-05-13 15:27:24,811 - INFO - Loading embedding model: sentence-transformers/all-MiniLM-L6-v2
+2025-05-13 15:27:24,814 - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
+2025-05-13 15:27:27,035 - INFO - Embedding dimension: 384
+2025-05-13 15:27:27,036 - INFO - Vector database initialized with dimension 384
+2025-05-13 15:27:27,036 - INFO - LLM service initialized with 3 supported models
+2025-05-13 15:27:27,036 - INFO - RAG service initialized
+2025-05-13 15:27:27,039 - INFO - Searching ArXiv with query: latest advancements quantum computing AND (cat:cs.LG OR cat:quant-ph)
+2025-05-13 15:27:27,040 - INFO - Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=latest+advancements+quantum+computing+AND+%28cat%3Acs.LG+OR+cat%3Aquant-ph%29&id_list=&sortBy=relevance&sortOrder=descending&start=0&max_results=100
+2025-05-13 15:27:31,218 - INFO - Got first page: 100 of 144553 total results
+2025-05-13 15:27:31,219 - INFO - Found 10 papers
+2025-05-13 15:27:31,219 - INFO - Split paper 'A Survey on Quantum Machine Learning: Current Trends, Challenges, Opportunities, and the Road Ahead' into 4 chunks
+2025-05-13 15:27:31,219 - INFO - Split paper 'Quantum Machine Learning: A Hands-on Tutorial for Machine Learning Practitioners and Researchers' into 3 chunks
+2025-05-13 15:27:31,219 - INFO - Split paper 'Deep-Q Learning with Hybrid Quantum Neural Network on Solving Maze Problems' into 4 chunks
+2025-05-13 15:27:31,219 - INFO - Split paper 'Generalization Error Bound for Quantum Machine Learning in NISQ Era -- A Survey' into 5 chunks
+2025-05-13 15:27:31,219 - INFO - Split paper 'Quantum Supervised Learning' into 4 chunks
+2025-05-13 15:27:31,219 - INFO - Split paper 'What is my quantum computer good for? Quantum capability learning with physics-aware neural networks' into 4 chunks
+2025-05-13 15:27:31,219 - INFO - Split paper 'Quantum-Assisted Clustering Algorithms for NISQ-Era Devices' into 3 chunks
+2025-05-13 15:27:31,219 - INFO - Split paper 'Benchmarking MedMNIST dataset on real quantum hardware' into 5 chunks
+2025-05-13 15:27:31,219 - INFO - Split paper 'From Quantum Graph Computing to Quantum Graph Learning: A Survey' into 4 chunks
+2025-05-13 15:27:31,219 - INFO - Split paper 'Advances in Quantum Deep Learning: An Overview' into 3 chunks
+2025-05-13 15:27:31,220 - INFO - Embedding 39 documents...
+2025-05-13 15:27:31,612 - INFO - Added 39 documents to vector database. Total: 39
+2025-05-13 15:52:38,024 - INFO - Application started