Spaces:
Running
Running
Upload 6 files
Browse files
embeddings/__pycache__/embedding_manager.cpython-310.pyc
ADDED
Binary file (907 Bytes). View file
|
|
embeddings/embedding_manager.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# embeddings/embedding_manager.py
|
2 |
+
from sentence_transformers import SentenceTransformer
|
3 |
+
from config import Config
|
4 |
+
|
5 |
+
class EmbeddingManager:
    """Thin wrapper around a SentenceTransformer model for encoding text."""

    def __init__(self):
        """Load the model identified by ``Config.EMBEDDING_MODEL``."""
        self.model = SentenceTransformer(Config.EMBEDDING_MODEL)

    def get_embedding(self, text):
        """Encode a single piece of text.

        Args:
            text: The text to embed.

        Returns:
            Whatever ``self.model.encode`` returns for ``text``
            (presumably a 1-D vector -- confirm against the model in use).
        """
        return self.model.encode(text)

    def get_embeddings(self, texts):
        """Encode several texts in one call.

        Args:
            texts: The collection of texts to embed.

        Returns:
            Whatever ``self.model.encode`` returns for ``texts``
            (presumably one row per input text -- confirm against the model).
        """
        return self.model.encode(texts)
|
file_processor/__pycache__/processor.cpython-310.pyc
ADDED
Binary file (1.84 kB). View file
|
|
file_processor/processor.py
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# file_processor/processor.py
|
2 |
+
import PyPDF2
|
3 |
+
import docx
|
4 |
+
import os
|
5 |
+
from typing import List, Dict
|
6 |
+
|
7 |
+
class FileProcessor:
    """Handles processing of uploaded files in various formats"""

    @staticmethod
    def process_pdf(file_path: str) -> str:
        """Extract text from a PDF file.

        Args:
            file_path: Path to the PDF file.

        Returns:
            The text of every page, each page followed by a newline.
        """
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # A page may yield no text; fall back to "" so the
            # concatenation cannot fail on a non-string value.
            # The join runs inside the ``with`` so the file is still open
            # while pages are read.
            return "".join(
                (page.extract_text() or "") + "\n"
                for page in pdf_reader.pages
            )

    @staticmethod
    def process_docx(file_path: str) -> str:
        """Extract text from a DOCX file.

        Args:
            file_path: Path to the DOCX file.

        Returns:
            The text of every paragraph, each followed by a newline.
        """
        doc = docx.Document(file_path)
        return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)

    @staticmethod
    def process_txt(file_path: str) -> str:
        """Return the full contents of a UTF-8 encoded text file.

        Args:
            file_path: Path to the text file.
        """
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()

    def process_file(self, file_path: str) -> str:
        """Extract text from a file, choosing the parser by its extension.

        The extension is compared case-insensitively.

        Args:
            file_path: Path to a ``.pdf``, ``.docx`` or ``.txt`` file.

        Returns:
            The extracted text.

        Raises:
            ValueError: If the extension is not one of the supported formats.
        """
        _, ext = os.path.splitext(file_path)
        ext = ext.lower()

        if ext == '.pdf':
            return self.process_pdf(file_path)
        if ext == '.docx':
            return self.process_docx(file_path)
        if ext == '.txt':
            return self.process_txt(file_path)
        raise ValueError(f"Unsupported file format: {ext}")
|
retrieval/__pycache__/vector_store.cpython-310.pyc
ADDED
Binary file (1.27 kB). View file
|
|
retrieval/vector_store.py
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# retrieval/vector_store.py
|
2 |
+
import faiss
|
3 |
+
import numpy as np
|
4 |
+
from typing import List, Tuple
|
5 |
+
|
6 |
+
class VectorStore:
    """FAISS L2 index paired with the texts its vectors were built from.

    ``self.texts[i]`` is the text for the i-th vector added to ``self.index``,
    so the two must always grow together.
    """

    def __init__(self, dimension: int):
        """Create an empty store for vectors of length ``dimension``."""
        self.dimension = dimension
        self.index = faiss.IndexFlatL2(dimension)
        self.texts: List[str] = []

    def add_texts(self, texts: List[str], embeddings: np.ndarray):
        """Add texts and their embeddings to the store.

        Args:
            texts: The texts to store.
            embeddings: One embedding per text, shaped
                ``(len(texts), dimension)``. A single 1-D vector is accepted
                for a single text.

        Raises:
            ValueError: If the embeddings do not have ``dimension`` columns
                or their row count differs from ``len(texts)``.
        """
        texts = list(texts)
        vectors = np.asarray(embeddings, dtype=np.float32)
        if not texts and vectors.size == 0:
            return
        if vectors.ndim == 1:
            vectors = vectors.reshape(1, -1)
        if vectors.ndim != 2 or vectors.shape[1] != self.dimension:
            raise ValueError(
                f"Expected embeddings of shape (n, {self.dimension}), "
                f"got {vectors.shape}"
            )
        if vectors.shape[0] != len(texts):
            raise ValueError(
                f"Got {len(texts)} texts but {vectors.shape[0]} embeddings"
            )
        # Add to the index first: if that fails, self.texts is untouched and
        # positions in the index still line up with positions in self.texts.
        self.index.add(np.ascontiguousarray(vectors))
        self.texts.extend(texts)

    def search(self, query_embedding: np.ndarray, k: int) -> List[Tuple[str, float]]:
        """Return up to ``k`` stored texts closest to ``query_embedding``.

        Args:
            query_embedding: The query vector; it is reshaped to one row.
            k: Maximum number of neighbours to request from the index.

        Returns:
            ``(text, distance)`` pairs in the order the index returns them.
            Fewer than ``k`` pairs are returned when the index holds fewer
            than ``k`` vectors.
        """
        query = np.ascontiguousarray(
            query_embedding, dtype=np.float32
        ).reshape(1, -1)
        distances, indices = self.index.search(query, k)

        results = []
        for idx, distance in zip(indices[0], distances[0]):
            # FAISS pads missing neighbours with label -1; without the lower
            # bound, -1 would index the last text and return a bogus match.
            if 0 <= idx < len(self.texts):
                results.append((self.texts[int(idx)], float(distance)))
        return results
|