File size: 2,120 Bytes
f28c8cc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
import requests
from bs4 import BeautifulSoup
import pandas as pd
# --- Step 1: collect the detail-page URL of every book in the catalogue ---
# Listing pages are requested as page-1.html, page-2.html, ... until a page
# cannot be fetched or contains no product entries.
base_url = "https://books.toscrape.com"
book_urls = []
page = 1

# A single Session reuses the underlying connection across listing pages.
with requests.Session() as session:
    while True:
        url = f"{base_url}/catalogue/page-{page}.html"
        try:
            # Without a timeout, requests waits indefinitely on a stalled server.
            response = session.get(url, timeout=10)
        except requests.RequestException as exc:
            # Treat a transport failure like a bad status: stop paginating and
            # keep the URLs gathered so far instead of crashing.
            print(f"Stopping pagination at {url}: {exc}")
            break

        # Any non-200 answer (such as a missing page) ends pagination.
        if response.status_code != 200:
            break

        soup = BeautifulSoup(response.content, 'html.parser')
        articles = soup.find_all('article', class_='product_pod')
        if not articles:
            break

        for article in articles:
            # The link to the book's detail page sits inside the <h3> heading.
            link = article.find('h3').find('a')['href']
            # Drop any "../" segments so the path can be appended to /catalogue/.
            book_url = f"{base_url}/catalogue/{link.replace('../', '')}"
            book_urls.append(book_url)

        page += 1
# --- Step 2: visit every book page and extract its fields ---
# Each successfully fetched page contributes one dict to book_data with the
# keys title, price, category, rating, availability and description.
book_data = []

# Maps the word used in the star-rating CSS class to its numeric value.
# Built once instead of on every loop iteration.
rating_map = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}

# A single Session reuses the underlying connection across book pages.
with requests.Session() as session:
    for url in book_urls:
        try:
            # Without a timeout, requests waits indefinitely on a stalled server.
            response = session.get(url, timeout=10)
        except requests.RequestException as exc:
            # One unreachable page must not discard the books already scraped.
            print(f"Skipping {url}: {exc}")
            continue

        if response.status_code != 200:
            print(f"Skipping {url}: HTTP {response.status_code}")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')

        # Every field falls back to a default when its element is missing, so
        # a page with unexpected markup cannot abort the whole run.
        title_element = soup.find('h1')
        title = title_element.text if title_element is not None else 'N/A'

        price_element = soup.find('p', class_='price_color')
        price = price_element.text if price_element is not None else 'N/A'

        # The category is read from the third breadcrumb entry.
        breadcrumb = soup.find('ul', class_='breadcrumb')
        crumbs = breadcrumb.find_all('li') if breadcrumb is not None else []
        category = crumbs[2].text.strip() if len(crumbs) > 2 else 'N/A'

        # The rating word is one of the element's CSS classes; look it up by
        # name rather than by position so a single-class tag cannot raise.
        rating_element = soup.find('p', class_='star-rating')
        rating_classes = rating_element.get('class', []) if rating_element is not None else []
        rating = next(
            (rating_map[name] for name in rating_classes if name in rating_map),
            0,
        )

        # Extract availability
        availability_element = soup.find('p', class_='instock availability')
        availability = availability_element.text.strip() if availability_element is not None else 'N/A'

        # Extract description (taken from the page's <meta name="description">)
        description_element = soup.find('meta', attrs={'name': 'description'})
        if description_element is not None and 'content' in description_element.attrs:
            description = description_element['content'].strip()
        else:
            description = 'N/A'

        book_info = {
            'title': title,
            'price': price,
            'category': category,
            'rating': rating,
            'availability': availability,
            'description': description,
        }
        book_data.append(book_info)
# --- Step 3: tabulate the scraped records and write them to disk ---
# Naming the columns explicitly keeps the CSV header (and column order) stable
# even when book_data is empty.
df = pd.DataFrame(
    book_data,
    columns=['title', 'price', 'category', 'rating', 'availability', 'description'],
)

# index=False omits the DataFrame's row index from the output file.
df.to_csv('books.csv', index=False)