# Scraper for clicky.pk
# Wednesday, 9 August 2023
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from bs4 import BeautifulSoup
from collections import defaultdict
from selenium.webdriver.chrome.options import Options
import time
import os
from operator import itemgetter
import sys
from urllib.parse import urlparse
# --- Selenium setup: headless Chrome driven by a chromedriver binary that
# --- is expected to sit next to the current working directory. ---
_chromedriver = os.path.join(os.getcwd(), 'chromedriver')
_options = Options()
_options.add_argument("--headless")
driver = webdriver.Chrome(service=Service(_chromedriver), options=_options)
def extract_price(original_price, sale_price):
    """Convert 'Rs 1,234'-style price strings into integers.

    Returns a ``(original_price, sale_price)`` pair; ``sale_price`` is
    ``None`` when the product is not discounted (empty or ``None`` input).
    """
    def _to_int(price_text):
        # Drop the currency prefix and thousands separators before parsing.
        return int(price_text.replace('Rs ', '').replace(',', '').strip())

    return _to_int(original_price), (_to_int(sale_price) if sale_price else None)
def format_url(url):
    """Normalize *url*: return it unchanged when it already carries a host,
    otherwise assume it is scheme-less and prefix it with ``https://``."""
    parsed = urlparse(url)
    return parsed.geturl() if parsed.netloc else 'https://' + parsed.geturl()
# Whitelist of clicky.pk category pages that scrape_products() accepts;
# any other URL makes it raise ValueError.  Extend this list to scrape
# additional categories.
ACCEPTABLE_URLS = [
    "https://www.clicky.pk/bags",
    "https://www.clicky.pk/coats-winter-wear-women",
    "https://www.clicky.pk/women-footwear",
    "https://www.clicky.pk/women-heels",
]
def scrape_products(url):
    """Scrape every paginated product listing under *url* on clicky.pk.

    Parameters
    ----------
    url : str
        A category URL; after normalization via ``format_url`` it must be
        one of ``ACCEPTABLE_URLS``.

    Returns
    -------
    dict
        Aggregate stats (total/on-sale counts, min/max/average price, a
        price histogram in ``price_density``) plus one formatted text
        entry per product in ``products``.

    Raises
    ------
    ValueError
        If the URL is not whitelisted.
    TimeoutException
        If a page fails to render after 3 wait attempts.
    """
    base_url = "https://www.clicky.pk"
    formatted_url = format_url(url)
    if formatted_url not in ACCEPTABLE_URLS:
        raise ValueError(f"Invalid URL: {url}. This URL is not in the list of acceptable URLs.")

    product_count = 0  # running index used in the per-product labels
    # BUGFIX: accumulate price totals across ALL pages.  The original
    # recomputed average_price from the current page's list each loop, so
    # the reported average was only the LAST page's average.
    price_sum = 0
    price_count = 0
    page_number = 1
    summary = {
        "total_products_found": 0,
        "total_products_on_sale": 0,
        "average_price": 0,
        "min_price": float('inf'),
        "max_price": 0,
        "price_density": defaultdict(int),
        "products": [],
    }
    try:
        while True:
            print(f"Scraping page {page_number}...")
            driver.get(f"{formatted_url}?sort=-createdAt&page={page_number}")
            print("Waiting for the page to load...")
            # The site is occasionally slow; retry the explicit wait a few
            # times before giving up and propagating TimeoutException.
            max_attempts = 3
            attempts = 0
            while attempts < max_attempts:
                try:
                    WebDriverWait(driver, 30).until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, 'div.product__details.list_details'))
                    )
                    print("Page loaded successfully.")
                    break
                except TimeoutException:
                    attempts += 1
                    print(f"Attempt {attempts} failed. Trying again...")
                    if attempts == max_attempts:
                        print("Max attempts reached. Raising exception...")
                        raise
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            products = soup.find_all('div', class_='product__details list_details')
            if not products:
                # An empty result page marks the end of the pagination.
                print("No more pages found. Ending the scraping process.")
                break
            for product in products:
                product_count += 1
                summary["total_products_found"] += 1
                title_tag = product.find('a', class_='name')
                original_price_tag = product.find('del')  # struck-through list price
                sale_price_tag = product.find('span', class_='discounted_price')
                product_url = base_url + title_tag['href'] if title_tag else None
                # Skip cards missing either a title or a price; they still
                # count toward total_products_found (as in the original).
                if not (title_tag and original_price_tag):
                    continue
                title = title_tag.text.strip()
                original_price, sale_price = extract_price(
                    original_price_tag.text.strip(),
                    sale_price_tag.text.strip() if sale_price_tag else None,
                )
                if sale_price is None:
                    effective_price = original_price
                    summary["products"].append(f"Product {product_count}:\nTitle: {title}\nPrice: {original_price} Rs\nURL: {product_url}\n")
                else:
                    effective_price = sale_price
                    summary["total_products_on_sale"] += 1
                    summary["products"].append(f"Product {product_count}:\nTitle: {title}\nOriginal Price: {original_price} Rs\nSale Price: {sale_price} Rs\nURL: {product_url}\n")
                summary["price_density"][effective_price] += 1
                # BUGFIX: update the running stats per product instead of
                # per page, so a page where no product parsed no longer
                # crashes on min()/max() of an empty list.
                price_sum += effective_price
                price_count += 1
                summary["min_price"] = min(summary["min_price"], effective_price)
                summary["max_price"] = max(summary["max_price"], effective_price)
            page_number += 1
    finally:
        # BUGFIX: always release the browser, even when a wait times out or
        # parsing raises mid-scrape (the original leaked it on any error).
        driver.quit()
    if price_count:
        summary["average_price"] = price_sum / price_count
    else:
        # Nothing priced: report 0 instead of the float('inf') sentinel.
        summary["min_price"] = 0
    return summary
if __name__ == "__main__":
    # CLI entry point: exactly one positional argument, the category URL.
    # BUGFIX: the original indexed sys.argv[1] unconditionally and crashed
    # with IndexError when run without an argument.
    if len(sys.argv) < 2:
        print("Usage: python scraper.py <category-url>")
        sys.exit(1)
    url = sys.argv[1]
    try:
        result = scrape_products(url)
        for key, value in result.items():
            print(f"{key}: {value}")
    except ValueError as e:
        # Whitelist violations are reported, not traced back.
        print(e)