MohamedFahim committed
Commit ee16852 · 1 Parent(s): 440b942

Add application file

Files changed (4)
  1. Dockerfile +44 -0
  2. interface.py +269 -0
  3. main_api.py +363 -0
  4. requirements.txt +14 -0
Dockerfile ADDED
@@ -0,0 +1,44 @@
+ # Use Python 3.12.3 as base image
+ FROM python:3.12.3-slim
+
+ # Set working directory
+ WORKDIR /app
+
+ # Set environment variables
+ ENV PYTHONDONTWRITEBYTECODE=1
+ ENV PYTHONUNBUFFERED=1
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     build-essential \
+     && apt-get clean \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Copy requirements and install Python dependencies
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy project files
+ COPY main_api.py .
+ COPY interface.py .
+ # Copy any other necessary files
+ COPY . .
+ # Note: Remove .env copy for HF Spaces - use HF Spaces secrets instead
+ # COPY .env .
+
+ # Expose port 7860 (required by Hugging Face Spaces)
+ EXPOSE 7860
+
+ # Create entry point script for HF Spaces
+ RUN echo '#!/bin/bash\n\
+ echo "Starting FastAPI server..."\n\
+ python main_api.py &\n\
+ echo "Waiting for FastAPI to start..."\n\
+ sleep 10\n\
+ echo "Starting Gradio interface..."\n\
+ python interface.py\n\
+ wait\n' > /app/entrypoint.sh && \
+     chmod +x /app/entrypoint.sh
+
+ # Run both services
+ CMD ["/app/entrypoint.sh"]
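The entrypoint starts the FastAPI backend on port 8000 and then the Gradio frontend on port 7860 inside the same container. A minimal smoke-test sketch for checking that both services came up, assuming the image is built and run locally with both ports published (the script and URLs below are illustrative, not part of the committed files):

# smoke_test.py - poll the two services started by entrypoint.sh (hypothetical helper)
import time
import requests

SERVICES = {
    "FastAPI backend": "http://localhost:8000/health",   # /health is defined in main_api.py
    "Gradio frontend": "http://localhost:7860/",
}

def wait_for(name, url, attempts=10, delay=3):
    """Poll a URL until it answers with HTTP 200 or the attempts run out."""
    for _ in range(attempts):
        try:
            if requests.get(url, timeout=5).status_code == 200:
                print(f"{name} is up at {url}")
                return True
        except requests.RequestException:
            pass
        time.sleep(delay)
    print(f"{name} did not respond at {url}")
    return False

if __name__ == "__main__":
    for name, url in SERVICES.items():
        wait_for(name, url)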
interface.py ADDED
@@ -0,0 +1,269 @@
+ import gradio as gr
+ import requests
+ import time
+ import os
+
+ # Use localhost for HF Spaces since both services run in the same container
+ API_BASE_URL = "http://localhost:8000"
+
+ def extract_links(url):
+     """Extract links from the given URL"""
+     endpoint = f"{API_BASE_URL}/extract_links"
+     payload = {"url": url}
+     try:
+         response = requests.post(endpoint, json=payload, timeout=30)
+         if response.status_code == 200:
+             return response.json()["unique_links"]
+         else:
+             raise Exception(f"Failed to extract links: {response.text}")
+     except requests.exceptions.RequestException as e:
+         raise Exception(f"Connection error: {str(e)}")
+
+ def extract_text(urls):
+     """Extract text from URLs"""
+     endpoint = f"{API_BASE_URL}/extract_text"
+     try:
+         response = requests.post(endpoint, json=urls, timeout=60)
+         if response.status_code == 200:
+             return response.json()["file_saved"]
+         else:
+             raise Exception(f"Failed to extract text: {response.text}")
+     except requests.exceptions.RequestException as e:
+         raise Exception(f"Connection error: {str(e)}")
+
+ def perform_rag(file_path, prompt):
+     """Perform RAG on the extracted text"""
+     endpoint = f"{API_BASE_URL}/rag"
+     payload = {"file_path": file_path, "prompt": prompt}
+     try:
+         response = requests.post(endpoint, json=payload, timeout=60)
+         if response.status_code == 200:
+             return response.json()
+         else:
+             raise Exception(f"Failed to perform RAG: {response.text}")
+     except requests.exceptions.RequestException as e:
+         raise Exception(f"Connection error: {str(e)}")
+
+ def check_api_health():
+     """Check if FastAPI is running"""
+     try:
+         response = requests.get(f"{API_BASE_URL}/", timeout=5)
+         return response.status_code == 200
+     except:
+         return False
+
+ def process_web_rag(url, prompt, data_source, progress=gr.Progress()):
+     """Main processing function with progress tracking"""
+     if not url or not prompt:
+         return "❌ Error: Please provide both URL and prompt", "", ""
+
+     # Check API health first
+     if not check_api_health():
+         return "❌ Error: FastAPI service is not available. Please wait a moment and try again.", "", ""
+
+     try:
+         progress(0.1, desc="Starting process...")
+
+         if data_source == "Multiple links (first 5)":
+             progress(0.2, desc="🔍 Extracting links from webpage...")
+             links = extract_links(url)
+             sample_links = links[:5]
+
+             progress(0.4, desc="📄 Extracting text from multiple pages...")
+             file_path = extract_text(sample_links)
+
+             status_msg = f"✅ Processed {len(sample_links)} pages from {len(links)} total links found"
+         else:
+             progress(0.3, desc="📄 Extracting text from homepage...")
+             file_path = extract_text([url])
+             status_msg = "✅ Processed homepage content"
+
+         progress(0.7, desc="🤖 Performing RAG analysis...")
+         result = perform_rag(file_path, prompt)
+
+         progress(1.0, desc="✅ Complete!")
+
+         # Format the response
+         response_text = f"**Query:** {result['user_query']}\n\n**Response:** {result['assistant_response']}"
+         sources_text = result['sources']
+
+         return status_msg, response_text, sources_text
+
+     except Exception as e:
+         return f"❌ Error: {str(e)}", "", ""
+
+ # Custom CSS for modern styling
+ custom_css = """
+ .gradio-container {
+     max-width: 900px !important;
+     margin: auto !important;
+ }
+
+ .header-text {
+     text-align: center;
+     background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+     -webkit-background-clip: text;
+     -webkit-text-fill-color: transparent;
+     font-size: 2.5em;
+     font-weight: bold;
+     margin-bottom: 0.5em;
+ }
+
+ .description-text {
+     text-align: center;
+     color: #666;
+     font-size: 1.1em;
+     margin-bottom: 2em;
+ }
+
+ .input-group {
+     background: #f8f9fa;
+     padding: 1.5em;
+     border-radius: 12px;
+     margin: 1em 0;
+     border: 1px solid #e9ecef;
+ }
+
+ .output-group {
+     background: #ffffff;
+     border-radius: 12px;
+     border: 1px solid #dee2e6;
+     margin: 1em 0;
+ }
+
+ .status-box {
+     padding: 1em;
+     border-radius: 8px;
+     margin: 0.5em 0;
+ }
+
+ .status-success {
+     background-color: #d4edda;
+     border-color: #c3e6cb;
+     color: #155724;
+ }
+
+ .status-error {
+     background-color: #f8d7da;
+     border-color: #f5c6cb;
+     color: #721c24;
+ }
+ """
+
+ # Create the Gradio interface
+ with gr.Blocks(css=custom_css, title="Web RAG System", theme=gr.themes.Soft()) as app:
+     # Header
+     gr.HTML("""
+     <div class="header-text">🌐 Web RAG System</div>
+     <div class="description-text">
+         Extract content from web pages and ask questions using AI-powered retrieval
+     </div>
+     """)
+
+     with gr.Row():
+         with gr.Column(scale=1):
+             # Input section
+             gr.HTML('<div style="font-size: 1.2em; font-weight: bold; margin-bottom: 1em;">📝 Input Configuration</div>')
+
+             url_input = gr.Textbox(
+                 label="🔗 Website URL",
+                 placeholder="https://example.com",
+                 info="Enter the URL you want to analyze"
+             )
+
+             prompt_input = gr.Textbox(
+                 label="❓ Your Question",
+                 placeholder="What is this website about?",
+                 lines=3,
+                 info="Ask any question about the content"
+             )
+
+             data_source = gr.Radio(
+                 choices=["Multiple links (first 5)", "Homepage only"],
+                 value="Multiple links (first 5)",
+                 label="📊 Data Source",
+                 info="Choose how much content to analyze"
+             )
+
+             process_btn = gr.Button(
+                 "🚀 Analyze Website",
+                 variant="primary",
+                 size="lg"
+             )
+
+     # Output section
+     gr.HTML('<div style="font-size: 1.2em; font-weight: bold; margin: 2em 0 1em 0;">📋 Results</div>')
+
+     status_output = gr.Textbox(
+         label="📊 Processing Status",
+         interactive=False,
+         show_label=True
+     )
+
+     with gr.Row():
+         with gr.Column(scale=2):
+             response_output = gr.Textbox(
+                 label="🤖 AI Response",
+                 lines=8,
+                 interactive=False,
+                 show_label=True
+             )
+
+         with gr.Column(scale=1):
+             sources_output = gr.Textbox(
+                 label="📚 Sources",
+                 lines=8,
+                 interactive=False,
+                 show_label=True
+             )
+
+     # Example section
+     gr.HTML("""
+     <div style="margin-top: 2em; padding: 1.5em; background: #f8f9fa; border-radius: 12px; border-left: 4px solid #667eea;">
+         <h3 style="margin-top: 0; color: #333;">💡 Example Usage</h3>
+         <p><strong>URL:</strong> https://openai.com</p>
+         <p><strong>Question:</strong> What are the main products and services offered?</p>
+         <p><strong>Data Source:</strong> Multiple links (first 5)</p>
+     </div>
+     """)
+
+     # Add a note about the system status
+     gr.HTML("""
+     <div style="margin-top: 1em; padding: 1em; background: #e3f2fd; border-radius: 8px; border-left: 4px solid #2196f3;">
+         <p style="margin: 0; color: #0d47a1;">
+             ℹ️ <strong>Note:</strong> If you encounter connection errors, please wait a moment for the system to initialize and try again.
+         </p>
+     </div>
+     """)
+
+     # Connect the function
+     process_btn.click(
+         fn=process_web_rag,
+         inputs=[url_input, prompt_input, data_source],
+         outputs=[status_output, response_output, sources_output],
+         show_progress=True
+     )
+
+     # Add keyboard shortcut
+     url_input.submit(
+         fn=process_web_rag,
+         inputs=[url_input, prompt_input, data_source],
+         outputs=[status_output, response_output, sources_output],
+         show_progress=True
+     )
+
+     prompt_input.submit(
+         fn=process_web_rag,
+         inputs=[url_input, prompt_input, data_source],
+         outputs=[status_output, response_output, sources_output],
+         show_progress=True
+     )
+
+ if __name__ == "__main__":
+     app.launch(
+         server_name="0.0.0.0",
+         server_port=7860,
+         share=False,
+         show_error=True,
+         quiet=False
+     )
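The Gradio app is a thin client over three REST calls: /extract_links, /extract_text, and /rag, chained in process_web_rag(). A sketch of driving the same pipeline directly against the FastAPI service, bypassing the UI; it assumes the backend from main_api.py is reachable at localhost:8000 with its Supabase, HuggingFace, and Groq secrets configured, and the target URL is only an example:

# Exercise the backend pipeline without the Gradio frontend (illustrative values).
import requests

API = "http://localhost:8000"
target_url = "https://example.com"                 # illustrative site to analyze
question = "What is this website about?"

# 1. Collect links from the page and keep the first 5 (mirrors "Multiple links (first 5)").
links = requests.post(f"{API}/extract_links", json={"url": target_url}, timeout=30).json()["unique_links"][:5]

# 2. Scrape those pages; the API responds with the storage filename the text was saved under.
file_saved = requests.post(f"{API}/extract_text", json=links, timeout=60).json()["file_saved"]

# 3. Ask the question against the stored text.
result = requests.post(f"{API}/rag", json={"file_path": file_saved, "prompt": question}, timeout=60).json()
print(result["assistant_response"])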
main_api.py ADDED
@@ -0,0 +1,363 @@
+ from fastapi import FastAPI, HTTPException
+ from pydantic import BaseModel
+ from typing import List
+ import requests
+ from bs4 import BeautifulSoup
+ import time
+ import os
+ import json
+ import random
+ import logging
+ import groq
+ import numpy as np
+ from sklearn.metrics.pairwise import cosine_similarity
+ import uvicorn
+ from supabase import create_client, Client
+ from urllib.parse import urljoin, urlparse
+
+
+ # Initialize FastAPI app
+ app = FastAPI(
+     title="Web RAG System API",
+     description="Extract content from web pages and perform RAG operations",
+     version="1.0.0"
+ )
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Initialize Supabase client with environment variables
+ try:
+     url = os.environ.get('SUPABASE_URL')
+     key = os.environ.get('SUPABASE_SERVICE_ROLE_KEY')
+
+     if not url or not key:
+         logger.warning("Supabase credentials not found in environment variables")
+         supabase = None
+     else:
+         supabase: Client = create_client(url, key)
+         logger.info("Supabase client initialized successfully")
+ except Exception as e:
+     logger.error(f"Failed to initialize Supabase client: {e}")
+     supabase = None
+
+ # User agents for web scraping
+ user_agents = [
+     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
+     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/102.0",
+     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Safari/605.1.15",
+     "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:102.0) Gecko/20100101 Firefox/102.0",
+     "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0",
+     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
+     "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
+     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/103.0.1264.49",
+     "Mozilla/5.0 (iPhone; CPU iPhone OS 15_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Mobile/15E148 Safari/604.1",
+     "Mozilla/5.0 (iPad; CPU OS 15_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Mobile/15E148 Safari/604.1",
+     "Mozilla/5.0 (Linux; Android 12; SM-G991B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Mobile Safari/537.36",
+     "Mozilla/5.0 (Linux; Android 11; Pixel 5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Mobile Safari/537.36",
+     "Mozilla/5.0 (Linux; Android 11; SM-A217F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Mobile Safari/537.36",
+     "Mozilla/5.0 (Linux; Android 10; SM-G975F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Mobile Safari/537.36"
+ ]
+
+ # Pydantic models
+ class RAGRequest(BaseModel):
+     file_path: str
+     prompt: str
+
+ class URL(BaseModel):
+     url: str
+
+ @app.get("/")
+ async def root():
+     """Health check endpoint"""
+     return {"message": "Web RAG System API is running", "status": "healthy"}
+
+ @app.get("/health")
+ async def health_check():
+     """Detailed health check"""
+     health_status = {
+         "api": "healthy",
+         "supabase": "connected" if supabase else "not configured",
+         "hf_token": "configured" if os.environ.get('hf_token') else "not configured",
+         "groq_token": "configured" if os.environ.get('groq_token') else "not configured"
+     }
+     return health_status
+
+ @app.post("/rag")
+ async def rag(request: RAGRequest):
+     """Perform RAG operations on extracted text"""
+     try:
+         # Check required environment variables
+         hf_token = os.environ.get('hf_token')
+         groq_token = os.environ.get('groq_token')
+
+         if not hf_token:
+             raise HTTPException(status_code=500, detail="HuggingFace token not configured")
+         if not groq_token:
+             raise HTTPException(status_code=500, detail="Groq token not configured")
+         if not supabase:
+             raise HTTPException(status_code=500, detail="Supabase not configured")
+
+         logger.info(f"Processing RAG request for file: {request.file_path}")
+
+         # HuggingFace Inference API for embeddings
+         API_URL = "https://router.huggingface.co/hf-inference/models/BAAI/bge-large-en-v1.5/pipeline/feature-extraction"
+         headers = {
+             "Authorization": hf_token,
+         }
+
+         def query(payload):
+             response = requests.post(API_URL, headers=headers, json=payload)
+             if response.status_code != 200:
+                 logger.error(f"HuggingFace API error: {response.status_code} - {response.text}")
+                 raise HTTPException(status_code=500, detail="Failed to get embeddings from HuggingFace")
+             return response.json()
+
+         # Create a Groq client
+         groq_client = groq.Client(api_key=groq_token)
+
+         def process_with_groq(query_text, context):
+             prompt = f"""
+             Context information:
+             {context}
+
+             Based on the context information above, please answer the following question:
+             {query_text}
+
+             Answer:
+             """
+
+             try:
+                 response = groq_client.chat.completions.create(
+                     messages=[{"role": "user", "content": prompt}],
+                     model="llama-3.3-70b-versatile",
+                     temperature=0.4,
+                     max_tokens=512
+                 )
+                 return response.choices[0].message.content
+             except Exception as e:
+                 logger.error(f"Groq API error: {e}")
+                 raise HTTPException(status_code=500, detail="Failed to process with Groq")
+
+         def get_file_from_supabase(bucket_name, file_path):
+             try:
+                 response = supabase.storage.from_(bucket_name).download(file_path)
+                 content = response.decode('utf-8')
+                 return content
+             except Exception as e:
+                 logger.error(f"Error downloading file from Supabase: {e}")
+                 raise HTTPException(
+                     status_code=404,
+                     detail=f"File not found in Supabase bucket: {file_path}"
+                 )
+
+         # Get file content from Supabase
+         bucket_name = "url-2-ans-bucket"
+         file_path = request.file_path
+
+         content = get_file_from_supabase(bucket_name, file_path)
+         logger.info(f"Successfully downloaded file from Supabase: {file_path}")
+
+         # Simple text chunking
+         chunk_size = 1000
+         overlap = 200
+         chunks = []
+
+         for i in range(0, len(content), chunk_size - overlap):
+             chunk = content[i:i + chunk_size]
+             if len(chunk) > 100:
+                 chunks.append({"text": chunk, "position": i})
+
+         logger.info(f"Created {len(chunks)} chunks from document")
+
+         # Get embeddings for all chunks
+         chunk_embeddings = []
+         for chunk in chunks:
+             embedding = query({"inputs": chunk["text"]})
+             chunk_embeddings.append(embedding)
+
+         # Get embedding for the query
+         query_embedding = query({"inputs": request.prompt})
+
+         # Calculate similarity between query and all chunks
+         similarities = []
+         for chunk_embedding in chunk_embeddings:
+             query_np = np.array(query_embedding)
+             chunk_np = np.array(chunk_embedding)
+
+             if len(query_np.shape) == 1:
+                 query_np = query_np.reshape(1, -1)
+             if len(chunk_np.shape) == 1:
+                 chunk_np = chunk_np.reshape(1, -1)
+
+             similarity = cosine_similarity(query_np, chunk_np)[0][0]
+             similarities.append(similarity)
+
+         # Get top 3 most similar chunks
+         top_k = 3
+         top_indices = np.argsort(similarities)[-top_k:][::-1]
+
+         relevant_chunks = [chunks[i]["text"] for i in top_indices]
+         context_text = "\n\n".join(relevant_chunks)
+
+         # Process with Groq
+         answer = process_with_groq(request.prompt, context_text)
+
+         # Prepare sources
+         sources = [{"text": chunks[i]["text"][:200] + "...", "position": chunks[i]["position"]}
+                    for i in top_indices]
+
+         return {
+             "sources": sources,
+             "user_query": request.prompt,
+             "assistant_response": answer,
+             "file_source": f"supabase://{bucket_name}/{file_path}"
+         }
+
+     except HTTPException:
+         raise
+     except Exception as e:
+         logger.exception("Error occurred in RAG process")
+         raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")
+
+ @app.post("/extract_links")
+ async def extract_links(url: URL):
+     """Extract unique links from a given URL"""
+     def extract_unique_links(url_string, max_retries=3, timeout=30):
+         for attempt in range(max_retries):
+             try:
+                 headers = {'User-Agent': random.choice(user_agents)}
+                 response = requests.get(url_string, headers=headers, timeout=timeout)
+                 response.raise_for_status()
+                 soup = BeautifulSoup(response.text, 'html.parser')
+
+                 base_url = urlparse(url_string)
+                 base_url = f"{base_url.scheme}://{base_url.netloc}"
+
+                 a_tags = soup.find_all('a', href=True)
+                 links = []
+                 for a in a_tags:
+                     href = a.get('href')
+                     full_url = urljoin(base_url, href)
+                     links.append(full_url)
+
+                 unique_links = list(dict.fromkeys(links))
+                 unique_links.insert(0, url_string)
+                 return unique_links
+
+             except requests.RequestException as e:
+                 logger.warning(f"Attempt {attempt + 1} failed: {e}")
+                 if attempt < max_retries - 1:
+                     wait_time = 5 * (attempt + 1)
+                     time.sleep(wait_time)
+                 else:
+                     logger.error(f"Failed to retrieve {url_string} after {max_retries} attempts.")
+                     raise HTTPException(status_code=500, detail=f"Failed to retrieve {url_string} after {max_retries} attempts.")
+         return []
+
+     try:
+         unique_links = extract_unique_links(url.url)
+         return {"unique_links": unique_links}
+     except Exception as e:
+         logger.exception("Error in extract_links")
+         raise HTTPException(status_code=500, detail=f"Failed to extract links: {str(e)}")
+
+ @app.post("/extract_text")
+ async def extract_text(urls: List[str]):
+     """Extract text content from multiple URLs"""
+     if not supabase:
+         raise HTTPException(status_code=500, detail="Supabase not configured")
+
+     output_file = "extracted_text.txt"
+
+     def upload_text_content(filename, content, bucket_name):
+         try:
+             file_content = content.encode('utf-8')
+
+             # Try to upload first
+             try:
+                 response = supabase.storage.from_(bucket_name).upload(
+                     path=filename,
+                     file=file_content,
+                     file_options={"content-type": "text/plain"}
+                 )
+                 logger.info(f"Text file uploaded successfully: {filename}")
+                 return response
+             except Exception as upload_error:
+                 # If upload fails (file exists), try to update
+                 try:
+                     response = supabase.storage.from_(bucket_name).update(
+                         path=filename,
+                         file=file_content,
+                         file_options={"content-type": "text/plain"}
+                     )
+                     logger.info(f"Text file updated successfully: {filename}")
+                     return response
+                 except Exception as update_error:
+                     logger.error(f"Error updating text content: {update_error}")
+                     raise HTTPException(status_code=500, detail="Failed to save file to storage")
+
+         except Exception as e:
+             logger.error(f"Error with file operations: {e}")
+             raise HTTPException(status_code=500, detail="Failed to save file to storage")
+
+     def text_data_extractor(links):
+         extracted_texts = []
+
+         for link in links:
+             parsed_url = urlparse(link)
+             if not parsed_url.scheme:
+                 logger.warning(f"Invalid URL: {link}")
+                 continue
+
+             retries = 3
+             while retries > 0:
+                 try:
+                     headers = {'User-Agent': random.choice(user_agents)}
+                     response = requests.get(link, headers=headers, timeout=30)
+                     response.raise_for_status()
+                     soup = BeautifulSoup(response.text, 'html.parser')
+                     text = soup.get_text()
+                     clean_text = ' '.join(text.split())
+                     extracted_texts.append({"url": link, "text": clean_text})
+                     break
+
+                 except requests.RequestException as e:
+                     retries -= 1
+                     logger.warning(f"Retry {3 - retries} for {link} failed: {e}")
+                     if retries > 0:
+                         wait_time = 5 * (3 - retries)
+                         time.sleep(wait_time)
+
+             if retries == 0:
+                 extracted_texts.append({
+                     "url": link,
+                     "text": "Failed to retrieve text after multiple attempts."
+                 })
+
+         return extracted_texts
+
+     try:
+         extracted_data = text_data_extractor(urls)
+         string_output = json.dumps(extracted_data, ensure_ascii=False, indent=2)
+
+         # Upload to Supabase
+         upload_text_content(output_file, string_output, "url-2-ans-bucket")
+
+         return {"extracted_data": extracted_data, "file_saved": output_file}
+
+     except Exception as e:
+         logger.exception("Error in extract_text")
+         raise HTTPException(status_code=500, detail=f"Failed to extract text: {str(e)}")
+
+ # Main execution
+ if __name__ == "__main__":
+     # Run the FastAPI app
+     uvicorn.run(
+         "main_api:app",
+         host="0.0.0.0",
+         port=8000,
+         reload=False,  # Disable reload for production
+         access_log=True
+     )
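The core of /rag is a simple retrieval loop: fixed-size chunks with overlap, one embedding per chunk plus one for the query, cosine similarity, and the top 3 chunks joined into the Groq prompt context. A self-contained sketch of that ranking logic, with random vectors standing in for the BAAI/bge-large-en-v1.5 embeddings so it runs without the HuggingFace API (the embed stand-in and toy text are illustrative only):

# Isolated sketch of the chunking + top-k retrieval used in /rag.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def chunk_text(content, chunk_size=1000, overlap=200, min_len=100):
    """Same chunking rule as /rag: slide by chunk_size - overlap, drop tiny tails."""
    chunks = []
    for i in range(0, len(content), chunk_size - overlap):
        piece = content[i:i + chunk_size]
        if len(piece) > min_len:
            chunks.append({"text": piece, "position": i})
    return chunks

def top_k_chunks(query_vec, chunk_vecs, chunks, k=3):
    """Rank chunks by cosine similarity to the query embedding, highest first."""
    sims = [cosine_similarity(query_vec.reshape(1, -1), v.reshape(1, -1))[0][0]
            for v in chunk_vecs]
    order = np.argsort(sims)[-k:][::-1]
    return [chunks[i]["text"] for i in order]

# Toy usage: random 1024-dim vectors stand in for the real feature-extraction calls.
rng = np.random.default_rng(0)
chunks = chunk_text("some long document text " * 200)
chunk_vecs = [rng.normal(size=1024) for _ in chunks]
query_vec = rng.normal(size=1024)
context = "\n\n".join(top_k_chunks(query_vec, chunk_vecs, chunks, k=3))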
requirements.txt ADDED
@@ -0,0 +1,14 @@
+ fastapi==0.111.0
+ uvicorn==0.30.1
+ pydantic==2.7.1
+ requests==2.32.2
+ beautifulsoup4==4.12.3
+ llama-index==0.10.55
+ python-dotenv==1.0.1
+ llama-index==0.10.55
+ streamlit==1.30.0
+ requests==2.32.2
+ groq==0.20.0
+ scikit-learn==1.6.1
+ gradio==5.33.0
+ supabase==2.15.2