B_2 CIE Web Scraping
B_2 CIE Web Scraping
Submitted By,
Jivani Dhairya (202203100110120),
Sanjana Kotadiya (202203100110175),
Tirthkumar Thummar (202203100110190),
Archie Koradia (202203100110197)
Guided By,
Ms. Jenisha Tailor
Web scraping tools are software applications designed to extract data from
websites automatically. They enable users to retrieve data from web pages
and save it in a usable format for analysis or other purposes.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
# Function to extract product details from individual product page
def extract_product_details(product_url):
    """Fetch an individual product page and read a few fields from it.

    Downloads ``product_url`` and, when the response status is 200, parses
    the HTML and looks up the display size, battery capacity and item
    status. Each field falls back to ``'N/A'`` when its element is not
    found on the page.

    Args:
        product_url: URL of the individual product page to fetch.

    NOTE(review): this excerpt shows neither a ``return`` statement nor a
    branch for non-200 responses; presumably the extracted values are
    returned to the caller further down -- confirm against the full script.
    NOTE(review): ``requests.get`` is called without a timeout, so a stalled
    server can block this call indefinitely.
    """
    response = requests.get(product_url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')

        # The selectors below are examples and will need to be updated
        # based on the actual page structure.
        # soup.find() returns None when nothing matches, so the chained
        # ``.text`` raises AttributeError; that is the "field missing" signal.

        # Example selector for display size.
        try:
            display_size = soup.find('li', {'class': 'd-item__attr-value'}).text.strip()
        except AttributeError:
            display_size = 'N/A'

        # Example selector for battery.
        # NOTE(review): this is the same selector as display size, so both
        # fields resolve to the first matching <li>; a more specific
        # selector is needed to tell them apart.
        try:
            battery_capacity = soup.find('li', {'class': 'd-item__attr-value'}).text.strip()
        except AttributeError:
            battery_capacity = 'N/A'

        # Example selector for status.
        try:
            status = soup.find('span', class_='d-item__cond').text.strip()
        except AttributeError:
            status = 'N/A'
# Define the eBay search URL (modify the search query to suit your needs).
# Example: search for iPhones.
url = "https://www.ebay.com/sch/i.html?_nkw=iphone&_sop=12"
# Step 4: Find all product listings (based on the structure of the page).
# This class might change; inspect the actual structure.
# NOTE(review): ``soup`` is created outside this excerpt -- presumably from
# the response for ``url``; confirm against the full script.
listings = soup.find_all('li', class_='s-item')
# Per-listing fields: each lookup falls back to 'N/A' when the element is
# missing (find() returns None, so the chained access raises).
# NOTE(review): ``item`` is defined outside this excerpt -- presumably the
# loop variable over the product listings; confirm against the full script.

# Shipping info.
try:
    shipping = item.find('span', class_='s-item__shipping').text.strip()
except AttributeError:
    shipping = 'N/A'

# Product condition.
try:
    condition = item.find('span', class_='s-item__condition').text.strip()
except AttributeError:
    condition = 'N/A'

# Product URL. Subscripting None raises TypeError; an anchor without an
# href attribute raises KeyError, which also means "no link available".
try:
    link = item.find('a', class_='s-item__link')['href']
except (AttributeError, TypeError, KeyError):
    link = 'N/A'
df = pd.DataFrame(product_data)
df.to_excel(desktop_path, index=False, engine='openpyxl')
print(f"Data saved to '{desktop_path}'")
else:
print("No product data found.")
else:
print(f"Failed to retrieve the page. Status Code: {response.status_code}")
# Mode: compute and print the mode of the selected columns.
# NOTE(review): selected_columns is defined outside this excerpt --
# presumably a pandas DataFrame subset, in which case .mode() yields the
# most frequent value(s) per column; confirm against the full script.
mode_values = selected_columns.mode()
print("\nMode:\n", mode_values)
Chapter 6: References
https://www.ebay.com/