|
import concurrent.futures
import re

import requests
from bs4 import BeautifulSoup
from googlesearch import search
|
|
|
|
|
class SearchClient:
    """Search the web through one vendor and scrape the text of each hit.

    Two entry points exist: :meth:`search` serves clients built with
    ``vendor="bing"`` and :meth:`search_google` serves clients built with
    ``vendor="google"``. Each returns the string ``"Invalid vendor"`` when
    the client was constructed for a different vendor.
    """

    # Endpoint queried by ``_bing_search``.
    BING_ENDPOINT = "https://api.bing.microsoft.com/v7.0/search"

    # Seconds to wait on any single HTTP request. ``requests`` applies no
    # timeout by default, so without this one stalled host would block a
    # worker thread (and therefore the whole crawl) indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self, vendor, engine_id=None, api_key=None):
        """Store the vendor and, for Bing, the endpoint and auth header.

        Args:
            vendor: Name of the search backend, e.g. ``"bing"`` or
                ``"google"``.
            engine_id: Accepted for interface compatibility; not used by
                this class.
            api_key: Subscription key sent to Bing in the
                ``Ocp-Apim-Subscription-Key`` header. Only read when
                ``vendor == "bing"``.
        """
        self.vendor = vendor

        # ``endpoint`` and ``headers`` are only defined for Bing clients.
        if vendor == "bing":
            self.endpoint = self.BING_ENDPOINT
            self.headers = {
                "Ocp-Apim-Subscription-Key": api_key,
            }

    @staticmethod
    def _extract_text_from_link(link, timeout=REQUEST_TIMEOUT):
        """Download ``link`` and return its visible text on one line.

        Args:
            link: URL of the page to fetch.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The page text with every run of whitespace collapsed to a single
            space, or ``None`` when the response status is not 200.

        Raises:
            requests.RequestException: On connection errors or timeouts; the
                caller in ``_fetch_text_from_links`` reports and skips these.
        """
        page = requests.get(link, timeout=timeout)
        if page.status_code != 200:
            return None

        soup = BeautifulSoup(page.content, "html.parser")
        return re.sub(r"\s+", " ", soup.get_text())

    def _fetch_text_from_links(self, links):
        """Fetch every link concurrently and collect the non-empty texts.

        Args:
            links: Iterable of URLs to download.

        Returns:
            A list of ``{"text": ..., "link": ...}`` dicts, one per link that
            yielded non-empty text. Entries appear in completion order, not
            input order. Links that fail or yield no text are omitted.
        """
        results = []
        with concurrent.futures.ThreadPoolExecutor() as executor:
            future_to_link = {
                executor.submit(self._extract_text_from_link, link): link
                for link in links
            }
            for future in concurrent.futures.as_completed(future_to_link):
                link = future_to_link[future]
                try:
                    cleaned_text = future.result()
                except Exception as e:
                    # Best-effort crawl: one bad page must not abort the rest.
                    print(f"Error fetching data from {link}: {e}")
                    continue
                if cleaned_text:
                    results.append({"text": cleaned_text, "link": link})
        return results

    def _google_search(self, query, n_crawl):
        """Collect result links from ``googlesearch`` and scrape their text.

        Args:
            query: Search string.
            n_crawl: Passed as ``stop`` to ``googlesearch.search``.

        Returns:
            The list produced by ``_fetch_text_from_links`` for the links.
        """
        # Inside a method the bare name ``search`` resolves to the
        # module-level ``googlesearch.search``, not to ``SearchClient.search``.
        search_results = search(query, stop=n_crawl, lang='en', country='IN')
        links = list(search_results)
        return self._fetch_text_from_links(links)

    def _bing_search(self, query, n_crawl):
        """Query the Bing endpoint and scrape the text of each result URL.

        Args:
            query: Search string sent as ``q``.
            n_crawl: Number of results requested, sent as ``count``.

        Returns:
            The list produced by ``_fetch_text_from_links`` for the result
            URLs; empty when the response carries no ``webPages.value``.
        """
        params = {
            "q": query,
            "count": n_crawl,
            "mkt": "en-US",
        }
        response = requests.get(
            self.endpoint,
            headers=self.headers,
            params=params,
            timeout=self.REQUEST_TIMEOUT,
        )
        search_results = response.json()

        links = [
            item["url"]
            for item in search_results.get("webPages", {}).get("value", [])
        ]
        return self._fetch_text_from_links(links)

    def search(self, query, n_crawl):
        """Run a Bing search; return ``"Invalid vendor"`` for other vendors.

        Args:
            query: Search string.
            n_crawl: Number of results to request and scrape.

        Returns:
            A list of ``{"text", "link"}`` dicts when ``vendor == "bing"``,
            otherwise the string ``"Invalid vendor"``.
        """
        if self.vendor == "bing":
            return self._bing_search(query, n_crawl)
        return "Invalid vendor"

    def search_google(self, query, n_crawl):
        """Run a Google search; return ``"Invalid vendor"`` for other vendors.

        Args:
            query: Search string.
            n_crawl: Number of results to request and scrape.

        Returns:
            A list of ``{"text", "link"}`` dicts when ``vendor == "google"``,
            otherwise the string ``"Invalid vendor"``.
        """
        if self.vendor == "google":
            return self._google_search(query, n_crawl)
        return "Invalid vendor"
|
|
|
|