felixmortas committed
Commit e9b8de1 · 1 Parent(s): e83334a

Improve URL search tool with RAG

Files changed (2)
  1. custom_tools.py +24 -0
  2. web_semantic_search_tool.py +111 -0
custom_tools.py CHANGED
@@ -1,4 +1,5 @@
 import utils
+from web_semantic_search_tool import WebSemanticSearchTool
 
 import os
 import requests
@@ -39,6 +40,7 @@ def web_search(query: str) -> str:
     results = tool.invoke(query)
     return results
 
+'''
 @tool
 def url_search(url: str) -> str:
     """
@@ -71,6 +73,28 @@ def url_search(url: str) -> str:
         return "Too many redirects while trying to access the URL."
     except RequestException as e:
         return f"Failed to access the URL. Error: {e}"
+'''
+
+# Create the tool instance for LangGraph
+web_search_tool_instance = WebSemanticSearchTool()
+
+@tool
+def url_search(question: str, url: str) -> str:
+    """
+    Access a specific URL provided by the web_search tool call.
+
+    Args:
+        question (str): The question you want to answer by accessing this URL.
+        url (str): The URL to access.
+
+    Returns:
+        str: The 3 chunks of the accessed page most similar to the question, or an error message.
+    """
+    try:
+        return web_search_tool_instance.search_semantic(question.strip(), url.strip())
+    except ValueError:
+        return "Incorrect format. Use: 'your_query, http://example.com'"
+
 
 @tool
 def wiki_search(query: str, lang_tag: str = 'en', date: str = None) -> str:
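
Net effect of this change: the old url_search returned a page's raw text (or an HTTP error message), while the reworked tool embeds the page in chunks and returns only the three most relevant to the caller's question. A minimal sketch of a direct call, assuming the standard LangChain @tool interface; the question and URL below are placeholders, not from the commit:

# Hypothetical usage; question and URL are placeholder values.
from custom_tools import url_search

# @tool wraps the function as a LangChain tool, so arguments are passed
# as a dict through .invoke() rather than positionally.
chunks = url_search.invoke({
    "question": "What benchmarks does the page report?",
    "url": "https://example.com/article",
})
print(chunks)  # up to 3 chunks separated by blank lines, or an error message
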
web_semantic_search_tool.py ADDED
@@ -0,0 +1,111 @@
+import requests
+from bs4 import BeautifulSoup
+import torch
+from sentence_transformers import SentenceTransformer
+import chromadb
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.tools import Tool
+import uuid
+from typing import List, Dict
+
+class WebSemanticSearchTool:
+    def __init__(self):
+        # Initialize the embedding model
+        self.embedding_model = SentenceTransformer('all-MiniLM-L12-v2')
+
+        # Initialize Chroma (in memory)
+        self.chroma_client = chromadb.Client()
+
+        # Text splitter for chunking
+        self.text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=500,
+            chunk_overlap=50,
+            length_function=len
+        )
+
+    def extract_content(self, url: str) -> str:
+        """Extract the text content of a web page"""
+        try:
+            response = requests.get(url, timeout=10)
+            response.raise_for_status()
+
+            soup = BeautifulSoup(response.content, 'html.parser')
+
+            # Remove scripts and styles
+            for script in soup(["script", "style"]):
+                script.decompose()
+
+            # Extract the main text
+            text = soup.get_text()
+
+            # Clean up the text
+            lines = (line.strip() for line in text.splitlines())
+            chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
+            text = ' '.join(chunk for chunk in chunks if chunk)
+
+            return text
+
+        except Exception as e:
+            return f"Error during extraction: {str(e)}"
+
+    def create_chunks(self, text: str) -> List[str]:
+        """Split the text into chunks"""
+        return self.text_splitter.split_text(text)
+
+    def search_semantic(self, query: str, url: str) -> str:
+        """Semantic search within a web page"""
+        # 1. Extract the content
+        content = self.extract_content(url)
+        if content.startswith("Error"):
+            return content
+
+        # 2. Create the chunks
+        chunks = self.create_chunks(content)
+        if not chunks:
+            return "No content found on the page"
+
+        # 3. Create a temporary Chroma collection
+        collection_name = f"temp_collection_{uuid.uuid4().hex[:8]}"
+        collection = self.chroma_client.create_collection(
+            name=collection_name,
+            embedding_function=None  # We handle the embeddings manually
+        )
+
+        try:
+            # 4. Generate embeddings for all chunks
+            chunk_embeddings = self.embedding_model.encode(chunks)
+
+            # 5. Add the chunks to the collection
+            collection.add(
+                documents=chunks,
+                embeddings=chunk_embeddings.tolist(),
+                ids=[f"chunk_{i}" for i in range(len(chunks))]
+            )
+
+            # 6. Generate the query embedding
+            query_embedding = self.embedding_model.encode([query])
+
+            # 7. Retrieve the 3 most similar chunks
+            results = collection.query(
+                query_embeddings=query_embedding.tolist(),
+                n_results=3
+            )
+
+            # 8. Format the results
+            if results['documents']:
+                top_chunks = results['documents'][0]
+                distances = results['distances'][0] if results['distances'] else []
+
+                formatted_results = []
+                for i, chunk in enumerate(top_chunks):
+                    similarity = 1 - distances[i] if distances else "N/A"  # computed but not included in the output
+                    formatted_results.append(chunk)
+
+                return "\n\n".join(formatted_results)
+            else:
+                return "No results found"
+
+        finally:
+            # Clean up the temporary collection
+            self.chroma_client.delete_collection(collection_name)
+
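
For reference, the new class can also be exercised outside the agent. A minimal sketch, assuming web_semantic_search_tool.py is on the import path; the query and URL are placeholders:

# Hypothetical standalone usage of the committed class.
from web_semantic_search_tool import WebSemanticSearchTool

searcher = WebSemanticSearchTool()  # loads all-MiniLM-L12-v2 once; reuse across calls
result = searcher.search_semantic(
    query="How is the dataset licensed?",
    url="https://example.com/dataset-card",
)
print(result)  # top-3 chunks by embedding similarity, or an error string

Each call builds and then deletes a throwaway Chroma collection, so repeated questions about the same URL re-fetch and re-embed the page; that keeps the tool stateless at the cost of redundant work.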