nikhildsst committed on
Commit
1638245
·
verified ·
1 Parent(s): c5322fc

Upload 6 files

Browse files
embeddings/__pycache__/embedding_manager.cpython-310.pyc ADDED
Binary file (907 Bytes). View file
 
embeddings/embedding_manager.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # embeddings/embedding_manager.py
2
+ from sentence_transformers import SentenceTransformer
3
+ from config import Config
4
+
5
class EmbeddingManager:
    """Thin wrapper around a SentenceTransformer model.

    The model named by ``Config.EMBEDDING_MODEL`` is loaded once when the
    manager is constructed and reused for every call.
    """

    def __init__(self):
        self.model = SentenceTransformer(Config.EMBEDDING_MODEL)

    def get_embedding(self, text):
        """Return whatever ``self.model.encode`` produces for ``text``.

        Args:
            text: The input to embed (intended to be a single string).
        """
        return self.model.encode(text)

    def get_embeddings(self, texts):
        """Return whatever ``self.model.encode`` produces for a batch.

        Args:
            texts: The inputs to embed. Sized inputs (lists, tuples, a lone
                string) are forwarded unchanged; lazy iterables such as
                generators are materialised into a list first.
        """
        # encode() needs a sized batch; materialise lazy iterables
        # (generators, map objects) so they are embedded item by item.
        if not hasattr(texts, "__len__"):
            texts = list(texts)
        return self.model.encode(texts)
file_processor/__pycache__/processor.cpython-310.pyc ADDED
Binary file (1.84 kB). View file
 
file_processor/processor.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # file_processor/processor.py
2
+ import PyPDF2
3
+ import docx
4
+ import os
5
+ from typing import List, Dict
6
+
7
class FileProcessor:
    """Handles processing of uploaded files in various formats.

    Supported extensions are ``.pdf``, ``.docx`` and ``.txt``; the extension
    check in :meth:`process_file` is case-insensitive.
    """

    @staticmethod
    def process_pdf(file_path: str) -> str:
        """Extract text from a PDF file.

        Args:
            file_path: Path to the PDF on disk.

        Returns:
            The text of every page, each page followed by a newline.
        """
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # Keep extraction inside the `with` block so the file is still
            # open while pages are read. `or ""` keeps a page that yields no
            # text (None/empty) from raising on concatenation.
            return "".join(
                (page.extract_text() or "") + "\n" for page in pdf_reader.pages
            )

    @staticmethod
    def process_docx(file_path: str) -> str:
        """Extract text from a DOCX file.

        Args:
            file_path: Path to the DOCX on disk.

        Returns:
            The text of every paragraph, each followed by a newline.
        """
        doc = docx.Document(file_path)
        return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)

    @staticmethod
    def process_txt(file_path: str) -> str:
        """Read a text file decoded as UTF-8 and return its full contents.

        Args:
            file_path: Path to the text file on disk.
        """
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()

    def process_file(self, file_path: str) -> str:
        """Process a file based on its extension.

        Args:
            file_path: Path to a ``.pdf``, ``.docx`` or ``.txt`` file.

        Returns:
            The extracted text.

        Raises:
            ValueError: If the (lower-cased) extension is not supported.
        """
        _, ext = os.path.splitext(file_path)
        ext = ext.lower()

        # Looked up through `self` so subclasses can override a handler.
        handlers = {
            '.pdf': self.process_pdf,
            '.docx': self.process_docx,
            '.txt': self.process_txt,
        }
        handler = handlers.get(ext)
        if handler is None:
            raise ValueError(f"Unsupported file format: {ext}")
        return handler(file_path)
retrieval/__pycache__/vector_store.cpython-310.pyc ADDED
Binary file (1.27 kB). View file
 
retrieval/vector_store.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # retrieval/vector_store.py
2
+ import faiss
3
+ import numpy as np
4
+ from typing import List, Tuple
5
+
6
class VectorStore:
    """FAISS-backed store that maps embedding vectors back to their texts.

    ``self.texts[i]`` is the text whose vector was the i-th row added to
    ``self.index``; both are only ever appended to, together.
    """

    def __init__(self, dimension: int):
        self.dimension = dimension
        self.index = faiss.IndexFlatL2(dimension)
        self.texts = []

    def add_texts(self, texts: List[str], embeddings: np.ndarray):
        """Store ``texts`` alongside their ``embeddings``.

        Args:
            texts: The texts to store, one per embedding row.
            embeddings: Array of shape ``(len(texts), dimension)``; a single
                1-D vector is accepted for a single text.

        Raises:
            ValueError: If the number of rows differs from the number of
                texts, or the vector width differs from ``self.dimension``.
        """
        texts = list(texts)
        vectors = np.atleast_2d(
            np.ascontiguousarray(embeddings, dtype=np.float32)
        )
        if vectors.shape[0] != len(texts):
            raise ValueError(
                f"Got {len(texts)} texts but {vectors.shape[0]} embeddings"
            )
        if texts and vectors.shape[1] != self.dimension:
            raise ValueError(
                f"Expected embeddings of dimension {self.dimension}, "
                f"got {vectors.shape[1]}"
            )
        if not texts:
            return
        # Add to the index first: if that raises, self.texts is untouched and
        # positions in the index still line up with positions in the list.
        self.index.add(vectors)
        self.texts.extend(texts)

    def search(self, query_embedding: np.ndarray, k: int) -> List[Tuple[str, float]]:
        """Return up to ``k`` stored texts nearest to ``query_embedding``.

        Args:
            query_embedding: A single query vector (any shape that flattens
                to one row).
            k: Maximum number of neighbours to return.

        Returns:
            ``(text, distance)`` pairs in the order the index returned them.
            Fewer than ``k`` pairs are returned when the store holds fewer
            than ``k`` texts; an empty list when ``k <= 0`` or nothing is
            stored.
        """
        if k <= 0 or not self.texts:
            return []

        query = np.ascontiguousarray(
            query_embedding, dtype=np.float32
        ).reshape(1, -1)
        distances, indices = self.index.search(query, k)

        results = []
        for idx, distance in zip(indices[0], distances[0]):
            # Missing neighbours come back as label -1; without the lower
            # bound, -1 would index the *last* stored text.
            if 0 <= idx < len(self.texts):
                results.append((self.texts[int(idx)], float(distance)))
        return results