# Scraper for clicky.pk
# Wednesday, 9 August 2023
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from bs4 import BeautifulSoup
from collections import defaultdict
from selenium.webdriver.chrome.options import Options
import time
import os
from operator import itemgetter
import sys
from urllib.parse import urlparse
# --- Selenium setup: headless Chrome driven by a chromedriver binary that
# --- is expected to sit next to the current working directory. ---
_chromedriver = os.path.join(os.getcwd(), 'chromedriver')
_options = Options()
_options.add_argument("--headless")
driver = webdriver.Chrome(service=Service(_chromedriver), options=_options)
def extract_price(original_price, sale_price):
    """Convert 'Rs 1,234'-style price strings into integers.

    Returns a ``(original_price, sale_price)`` pair; ``sale_price`` is
    ``None`` when the product is not discounted (empty or ``None`` input).
    """
    def _to_int(price_text):
        # Drop the currency prefix and thousands separators before parsing.
        return int(price_text.replace('Rs ', '').replace(',', '').strip())

    return _to_int(original_price), (_to_int(sale_price) if sale_price else None)
def format_url(url):
    """Normalize *url*: return it unchanged when it already carries a host,
    otherwise assume it is scheme-less and prefix it with ``https://``."""
    parsed = urlparse(url)
    return parsed.geturl() if parsed.netloc else 'https://' + parsed.geturl()
# Whitelist of clicky.pk category pages that scrape_products() accepts;
# any other URL makes it raise ValueError.  Extend this list to scrape
# additional categories.
ACCEPTABLE_URLS = [
    "https://www.clicky.pk/bags",
    "https://www.clicky.pk/coats-winter-wear-women",
    "https://www.clicky.pk/women-footwear",
    "https://www.clicky.pk/women-heels",
]
def scrape_products(url):
    """Scrape every paginated product listing under *url* on clicky.pk.

    Parameters
    ----------
    url : str
        A category URL; after normalization via ``format_url`` it must be
        one of ``ACCEPTABLE_URLS``.

    Returns
    -------
    dict
        Aggregate stats (total/on-sale counts, min/max/average price, a
        price histogram in ``price_density``) plus one formatted text
        entry per product in ``products``.

    Raises
    ------
    ValueError
        If the URL is not whitelisted.
    TimeoutException
        If a page fails to render after 3 wait attempts.
    """
    base_url = "https://www.clicky.pk"
    formatted_url = format_url(url)
    if formatted_url not in ACCEPTABLE_URLS:
        raise ValueError(f"Invalid URL: {url}. This URL is not in the list of acceptable URLs.")

    product_count = 0  # running index used in the per-product labels
    # BUGFIX: accumulate price totals across ALL pages.  The original
    # recomputed average_price from the current page's list each loop, so
    # the reported average was only the LAST page's average.
    price_sum = 0
    price_count = 0
    page_number = 1
    summary = {
        "total_products_found": 0,
        "total_products_on_sale": 0,
        "average_price": 0,
        "min_price": float('inf'),
        "max_price": 0,
        "price_density": defaultdict(int),
        "products": [],
    }
    try:
        while True:
            print(f"Scraping page {page_number}...")
            driver.get(f"{formatted_url}?sort=-createdAt&page={page_number}")
            print("Waiting for the page to load...")
            # The site is occasionally slow; retry the explicit wait a few
            # times before giving up and propagating TimeoutException.
            max_attempts = 3
            attempts = 0
            while attempts < max_attempts:
                try:
                    WebDriverWait(driver, 30).until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, 'div.product__details.list_details'))
                    )
                    print("Page loaded successfully.")
                    break
                except TimeoutException:
                    attempts += 1
                    print(f"Attempt {attempts} failed. Trying again...")
                    if attempts == max_attempts:
                        print("Max attempts reached. Raising exception...")
                        raise
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            products = soup.find_all('div', class_='product__details list_details')
            if not products:
                # An empty result page marks the end of the pagination.
                print("No more pages found. Ending the scraping process.")
                break
            for product in products:
                product_count += 1
                summary["total_products_found"] += 1
                title_tag = product.find('a', class_='name')
                original_price_tag = product.find('del')  # struck-through list price
                sale_price_tag = product.find('span', class_='discounted_price')
                product_url = base_url + title_tag['href'] if title_tag else None
                # Skip cards missing either a title or a price; they still
                # count toward total_products_found (as in the original).
                if not (title_tag and original_price_tag):
                    continue
                title = title_tag.text.strip()
                original_price, sale_price = extract_price(
                    original_price_tag.text.strip(),
                    sale_price_tag.text.strip() if sale_price_tag else None,
                )
                if sale_price is None:
                    effective_price = original_price
                    summary["products"].append(f"Product {product_count}:\nTitle: {title}\nPrice: {original_price} Rs\nURL: {product_url}\n")
                else:
                    effective_price = sale_price
                    summary["total_products_on_sale"] += 1
                    summary["products"].append(f"Product {product_count}:\nTitle: {title}\nOriginal Price: {original_price} Rs\nSale Price: {sale_price} Rs\nURL: {product_url}\n")
                summary["price_density"][effective_price] += 1
                # BUGFIX: update the running stats per product instead of
                # per page, so a page where no product parsed no longer
                # crashes on min()/max() of an empty list.
                price_sum += effective_price
                price_count += 1
                summary["min_price"] = min(summary["min_price"], effective_price)
                summary["max_price"] = max(summary["max_price"], effective_price)
            page_number += 1
    finally:
        # BUGFIX: always release the browser, even when a wait times out or
        # parsing raises mid-scrape (the original leaked it on any error).
        driver.quit()
    if price_count:
        summary["average_price"] = price_sum / price_count
    else:
        # Nothing priced: report 0 instead of the float('inf') sentinel.
        summary["min_price"] = 0
    return summary
if __name__ == "__main__":
    # CLI entry point: exactly one positional argument, the category URL.
    # BUGFIX: the original indexed sys.argv[1] unconditionally and crashed
    # with IndexError when run without an argument.
    if len(sys.argv) < 2:
        print("Usage: python scraper.py <category-url>")
        sys.exit(1)
    url = sys.argv[1]
    try:
        result = scrape_products(url)
        for key, value in result.items():
            print(f"{key}: {value}")
    except ValueError as e:
        # Whitelist violations are reported, not traced back.
        print(e)