File size: 2,120 Bytes
f28c8cc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Root of the site being scraped; every URL below is built from it.
base_url = "https://books.toscrape.com"
# Absolute URLs of every product page found while paging through the catalogue.
book_urls = []

# Upper bound (seconds) for each listing request. requests applies no timeout
# by default, so without this a single stalled connection would hang the
# script indefinitely. A timeout raises requests.exceptions.Timeout instead.
LISTING_TIMEOUT_SECONDS = 10

# Walk catalogue/page-1.html, page-2.html, ... until a page is missing or
# contains no products. One Session is shared so the underlying connection is
# reused instead of being re-established for every page.
page = 1
with requests.Session() as listing_session:
    while True:
        url = f"{base_url}/catalogue/page-{page}.html"
        response = listing_session.get(url, timeout=LISTING_TIMEOUT_SECONDS)

        # Any non-200 answer is treated as the end of the catalogue.
        if response.status_code != 200:
            break

        soup = BeautifulSoup(response.content, 'html.parser')
        articles = soup.find_all('article', class_='product_pod')

        # A page without product entries also ends the pagination.
        if not articles:
            break

        for article in articles:
            heading = article.find('h3')
            anchor = heading.find('a') if heading else None
            link = anchor.get('href') if anchor else None
            if not link:
                # Malformed entry (no <h3><a href=...>): skip it rather than
                # abort the whole crawl with an AttributeError/KeyError.
                continue
            # Drop any '../' segments so the link can be appended directly to
            # the catalogue directory.
            book_url = f"{base_url}/catalogue/{link.replace('../', '')}"
            book_urls.append(book_url)

        page += 1

# One dict per product page that was fetched successfully.
book_data = []

# Upper bound (seconds) for each product-page request; requests has no default
# timeout, so an unresponsive server would otherwise block the loop forever.
DETAIL_TIMEOUT_SECONDS = 10

# Maps the word used in the star-rating CSS class to its numeric value.
# Built once instead of on every loop iteration.
rating_map = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}

# A single Session reuses the connection across all product-page requests.
with requests.Session() as detail_session:
    for url in book_urls:
        response = detail_session.get(url, timeout=DETAIL_TIMEOUT_SECONDS)
        # Pages that do not answer 200 are skipped, not retried.
        if response.status_code != 200:
            continue

        soup = BeautifulSoup(response.content, 'html.parser')

        # Title, price and category fall back to 'N/A' when their element is
        # missing, matching how availability and description are handled,
        # instead of raising AttributeError/IndexError and losing all results.
        title_element = soup.find('h1')
        title = title_element.text if title_element else 'N/A'

        price_element = soup.find('p', class_='price_color')
        price = price_element.text if price_element else 'N/A'

        # The category is the third breadcrumb item.
        breadcrumb_element = soup.find('ul', class_='breadcrumb')
        crumbs = breadcrumb_element.find_all('li') if breadcrumb_element else []
        category = crumbs[2].text.strip() if len(crumbs) > 2 else 'N/A'

        # The rating word is one of the CSS classes next to 'star-rating';
        # look it up by value rather than by position so a different class
        # order or a missing second class yields 0 instead of an IndexError.
        rating_element = soup.find('p', class_='star-rating')
        rating_classes = rating_element.get('class', []) if rating_element else []
        rating = next(
            (rating_map[name] for name in rating_classes if name in rating_map),
            0,
        )

        # Extract availability. Matching on the single 'availability' class
        # avoids the exact-string match of 'instock availability', which
        # depends on the order of the class attribute.
        availability_element = soup.find('p', class_='availability')
        availability = availability_element.text.strip() if availability_element else 'N/A'

        # Extract description from the <meta name="description"> tag.
        description_element = soup.find('meta', attrs={'name': 'description'})
        description_content = description_element.get('content') if description_element else None
        description = description_content.strip() if description_content is not None else 'N/A'

        book_info = {
            'title': title,
            'price': price,
            'category': category,
            'rating': rating,
            'availability': availability,
            'description': description,
        }
        book_data.append(book_info)

# Column order of the exported CSV. Pinning it keeps the header row present
# even when no book was scraped: an empty list alone produces a frame with no
# columns, and therefore a CSV without a header.
csv_columns = ['title', 'price', 'category', 'rating', 'availability', 'description']

df = pd.DataFrame(book_data, columns=csv_columns)

# Write every record to books.csv in the working directory, without the
# DataFrame index column.
df.to_csv('books.csv', index=False)