felixmortas committed
Commit e9b8de1 · 1 Parent(s): e83334a

Improve URL search tool with RAG

Files changed (2)
  1. custom_tools.py +24 -0
  2. web_semantic_search_tool.py +111 -0
custom_tools.py CHANGED
@@ -1,4 +1,5 @@
 import utils
+from web_semantic_search_tool import WebSemanticSearchTool
 
 import os
 import requests
@@ -39,6 +40,7 @@ def web_search(query: str) -> str:
     results = tool.invoke(query)
     return results
 
+'''
 @tool
 def url_search(url: str) -> str:
     """
@@ -71,6 +73,28 @@ def url_search(url: str) -> str:
         return "Too many redirects while trying to access the URL."
     except RequestException as e:
         return f"Failed to access the URL. Error: {e}"
+'''
+
+# Create the tool instance for LangGraph
+web_search_tool_instance = WebSemanticSearchTool()
+
+@tool
+def url_search(question: str, url: str) -> str:
+    """
+    Access a specific URL provided by the web_search tool call.
+
+    Args:
+        question (str): The question you want to answer by accessing this URL.
+        url (str): The URL to access.
+
+    Returns:
+        str: The 3 chunks of the accessed page most similar to the question, or an error message.
+    """
+    try:
+        return web_search_tool_instance.search_semantic(question.strip(), url.strip())
+    except ValueError:
+        return "Incorrect format. Use: 'your_query, http://example.com'"
+
 
 @tool
 def wiki_search(query: str, lang_tag: str = 'en', date: str = None) -> str:
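
Net effect of this change: the old url_search returned a page's raw text (or an HTTP error message), while the reworked tool embeds the page in chunks and returns only the three most relevant to the caller's question. A minimal sketch of a direct call, assuming the standard LangChain @tool interface; the question and URL below are placeholders, not from the commit:

# Hypothetical usage; question and URL are placeholder values.
from custom_tools import url_search

# @tool wraps the function as a LangChain tool, so arguments are passed
# as a dict through .invoke() rather than positionally.
chunks = url_search.invoke({
    "question": "What benchmarks does the page report?",
    "url": "https://example.com/article",
})
print(chunks)  # up to 3 chunks separated by blank lines, or an error message
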
web_semantic_search_tool.py ADDED
@@ -0,0 +1,111 @@
+import requests
+from bs4 import BeautifulSoup
+import torch
+from sentence_transformers import SentenceTransformer
+import chromadb
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.tools import Tool
+import uuid
+from typing import List, Dict
+
+class WebSemanticSearchTool:
+    def __init__(self):
+        # Initialize the embedding model
+        self.embedding_model = SentenceTransformer('all-MiniLM-L12-v2')
+
+        # Initialize Chroma (in memory)
+        self.chroma_client = chromadb.Client()
+
+        # Text splitter for chunking
+        self.text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=500,
+            chunk_overlap=50,
+            length_function=len
+        )
+
+    def extract_content(self, url: str) -> str:
+        """Extract the text content of a web page"""
+        try:
+            response = requests.get(url, timeout=10)
+            response.raise_for_status()
+
+            soup = BeautifulSoup(response.content, 'html.parser')
+
+            # Remove scripts and styles
+            for script in soup(["script", "style"]):
+                script.decompose()
+
+            # Extract the main text
+            text = soup.get_text()
+
+            # Clean up the text
+            lines = (line.strip() for line in text.splitlines())
+            chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
+            text = ' '.join(chunk for chunk in chunks if chunk)
+
+            return text
+
+        except Exception as e:
+            return f"Error during extraction: {str(e)}"
+
+    def create_chunks(self, text: str) -> List[str]:
+        """Split the text into chunks"""
+        return self.text_splitter.split_text(text)
+
+    def search_semantic(self, query: str, url: str) -> str:
+        """Semantic search within a web page"""
+        # 1. Extract the content
+        content = self.extract_content(url)
+        if content.startswith("Error"):
+            return content
+
+        # 2. Create the chunks
+        chunks = self.create_chunks(content)
+        if not chunks:
+            return "No content found on the page"
+
+        # 3. Create a temporary Chroma collection
+        collection_name = f"temp_collection_{uuid.uuid4().hex[:8]}"
+        collection = self.chroma_client.create_collection(
+            name=collection_name,
+            embedding_function=None  # We handle the embeddings manually
+        )
+
+        try:
+            # 4. Generate embeddings for all chunks
+            chunk_embeddings = self.embedding_model.encode(chunks)
+
+            # 5. Add the chunks to the collection
+            collection.add(
+                documents=chunks,
+                embeddings=chunk_embeddings.tolist(),
+                ids=[f"chunk_{i}" for i in range(len(chunks))]
+            )
+
+            # 6. Generate the query embedding
+            query_embedding = self.embedding_model.encode([query])
+
+            # 7. Retrieve the 3 most similar chunks
+            results = collection.query(
+                query_embeddings=query_embedding.tolist(),
+                n_results=3
+            )
+
+            # 8. Format the results
+            if results['documents']:
+                top_chunks = results['documents'][0]
+                distances = results['distances'][0] if results['distances'] else []
+
+                formatted_results = []
+                for i, chunk in enumerate(top_chunks):
+                    similarity = 1 - distances[i] if distances else "N/A"  # computed but not included in the output
+                    formatted_results.append(chunk)
+
+                return "\n\n".join(formatted_results)
+            else:
+                return "No results found"
+
+        finally:
+            # Clean up the temporary collection
+            self.chroma_client.delete_collection(collection_name)
+
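
For reference, the new class can also be exercised outside the agent. A minimal sketch, assuming web_semantic_search_tool.py is on the import path; the query and URL are placeholders:

# Hypothetical standalone usage of the committed class.
from web_semantic_search_tool import WebSemanticSearchTool

searcher = WebSemanticSearchTool()  # loads all-MiniLM-L12-v2 once; reuse across calls
result = searcher.search_semantic(
    query="How is the dataset licensed?",
    url="https://example.com/dataset-card",
)
print(result)  # top-3 chunks by embedding similarity, or an error string

Each call builds and then deletes a throwaway Chroma collection, so repeated questions about the same URL re-fetch and re-embed the page; that keeps the tool stateless at the cost of redundant work.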