Code Snippets Repository

Home

My scraper code for clicky.pk

Wednesday, 9 August 2023

from selenium import webdriver

from selenium.webdriver.chrome.service import Service

from selenium.webdriver.common.by import By

from selenium.webdriver.support.ui import WebDriverWait

from selenium.webdriver.support import expected_conditions as EC

from selenium.common.exceptions import TimeoutException, NoSuchElementException

from bs4 import BeautifulSoup

from collections import defaultdict

from selenium.webdriver.chrome.options import Options

import time

import os

from operator import itemgetter

import sys

from urllib.parse import urlparse


# --- Browser setup ---------------------------------------------------------
# The chromedriver binary is expected to live in the directory the script is
# launched from (the current working directory, not the script's own folder).
current_dir = os.getcwd()

chromedriver_path = os.path.join(current_dir, 'chromedriver')

service = Service(chromedriver_path)

chrome_options = Options()

# Run Chrome without a visible window.
chrome_options.add_argument("--headless")


# Module-level browser session shared by scrape_products(), which also
# quits it when scraping finishes.
driver = webdriver.Chrome(service=service, options=chrome_options)


def extract_price(original_price, sale_price):
    """Parse clicky.pk price strings such as ``'Rs 1,200'`` into integers.

    Parameters
    ----------
    original_price : str
        The listed (pre-discount) price, e.g. ``'Rs 1,200'``.
    sale_price : str | None
        The discounted price, or None/empty when the product is not on sale.

    Returns
    -------
    tuple[int, int | None]
        ``(original_price, sale_price)``; the second element is None when
        no sale price was supplied.

    Raises
    ------
    ValueError
        If a price string contains no digits at all.
    """
    import re

    def _to_int(text):
        # Keep only the digits, so variants like 'Rs 1,200', 'Rs. 1200'
        # or '1,200 Rs' all parse identically (the original implementation
        # only handled the exact 'Rs ' prefix and crashed on anything else).
        digits = re.sub(r'\D', '', text)
        if not digits:
            raise ValueError(f"No digits found in price string: {text!r}")
        return int(digits)

    return _to_int(original_price), (_to_int(sale_price) if sale_price else None)


def format_url(url):
    """Normalize *url* so it always carries a scheme.

    A URL that already parses with a network location is returned as-is;
    a bare host/path (e.g. ``'www.clicky.pk/bags'``) gets an ``https://``
    prefix.
    """
    parsed = urlparse(url)
    prefix = '' if parsed.netloc else 'https://'
    return prefix + parsed.geturl()


# Allow-list of clicky.pk category pages this scraper supports;
# scrape_products() raises ValueError for any URL not listed here.
ACCEPTABLE_URLS = [

    "https://www.clicky.pk/bags",

    "https://www.clicky.pk/coats-winter-wear-women",

    "https://www.clicky.pk/women-footwear",

    "https://www.clicky.pk/women-heels",

]


def scrape_products(url):
    """Scrape every product from a clicky.pk category listing.

    Walks the paginated listing at *url* (sorted newest-first), collecting
    each product's title, price(s) and URL, and returns aggregate stats.

    Parameters
    ----------
    url : str
        One of the category URLs in ACCEPTABLE_URLS (scheme optional;
        it is normalized via format_url()).

    Returns
    -------
    dict
        Keys: total_products_found, total_products_on_sale, average_price,
        min_price, max_price, price_density (price -> occurrence count),
        products (formatted text entry per product).

    Raises
    ------
    ValueError
        If *url* is not in ACCEPTABLE_URLS.
    TimeoutException
        If a page fails to render the product list after several attempts.

    NOTE: this quits the module-level ``driver`` when it finishes (or
    fails), so it can only be called once per process.
    """
    base_url = "https://www.clicky.pk"
    formatted_url = format_url(url)

    if formatted_url not in ACCEPTABLE_URLS:
        raise ValueError(f"Invalid URL: {url}. This URL is not in the list of acceptable URLs.")

    summary = {
        "total_products_found": 0,
        "total_products_on_sale": 0,
        "average_price": 0,
        "min_price": float('inf'),  # stays inf if no priced product is found
        "max_price": 0,
        "price_density": defaultdict(int),
        "products": [],
    }

    def _wait_for_listing():
        # Retry the explicit wait a few times: the listing renders
        # client-side and can be slow. Re-raises TimeoutException once
        # all attempts are exhausted.
        print("Waiting for the page to load...")
        max_attempts = 3
        attempts = 0
        while attempts < max_attempts:
            try:
                WebDriverWait(driver, 30).until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR, 'div.product__details.list_details')
                    )
                )
                print("Page loaded successfully.")
                return
            except TimeoutException:
                attempts += 1
                print(f"Attempt {attempts} failed. Trying again...")
                if attempts == max_attempts:
                    print("Max attempts reached. Raising exception...")
                    raise

    product_count = 0
    all_prices = []  # every recorded price across ALL pages, for the stats
    page_number = 1

    try:
        while True:
            print(f"Scraping page {page_number}...")
            driver.get(f"{formatted_url}?sort=-createdAt&page={page_number}")

            _wait_for_listing()

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            products = soup.find_all('div', class_='product__details list_details')

            if not products:
                print("No more pages found. Ending the scraping process.")
                break

            for product in products:
                product_count += 1
                summary["total_products_found"] += 1

                title_tag = product.find('a', class_='name')
                original_price_tag = product.find('del')
                sale_price_tag = product.find('span', class_='discounted_price')
                product_url = base_url + title_tag['href'] if title_tag else None

                # Skip listing entries without a usable title/price pair.
                if not (title_tag and original_price_tag):
                    continue

                title = title_tag.text.strip()
                original_price, sale_price = extract_price(
                    original_price_tag.text.strip(),
                    sale_price_tag.text.strip() if sale_price_tag else None,
                )

                if sale_price is None:
                    all_prices.append(original_price)
                    summary["price_density"][original_price] += 1
                    summary["products"].append(
                        f"Product {product_count}:\nTitle: {title}\nPrice: {original_price} Rs\nURL: {product_url}\n"
                    )
                else:
                    all_prices.append(sale_price)
                    summary["total_products_on_sale"] += 1
                    summary["price_density"][sale_price] += 1
                    summary["products"].append(
                        f"Product {product_count}:\nTitle: {title}\nOriginal Price: {original_price} Rs\nSale Price: {sale_price} Rs\nURL: {product_url}\n"
                    )

            page_number += 1
    finally:
        # BUG FIX: always release the browser; the original leaked the
        # Chrome process whenever a page failed to load.
        driver.quit()

    # BUG FIX: the original recomputed these inside the page loop from the
    # current page's prices only, so "average_price" reflected just the
    # LAST page, and min()/max() crashed on a page where no product had a
    # parseable price. Compute once over every collected price instead.
    if all_prices:
        summary["min_price"] = min(all_prices)
        summary["max_price"] = max(all_prices)
        summary["average_price"] = sum(all_prices) / len(all_prices)

    return summary


if __name__ == "__main__":
    # BUG FIX: the original dereferenced sys.argv[1] unconditionally and
    # died with an IndexError traceback when no URL was supplied.
    if len(sys.argv) < 2:
        print("Usage: python scraper.py <category-url>")
        sys.exit(1)

    url = sys.argv[1]
    try:
        # Print each summary field on its own line.
        result = scrape_products(url)
        for key, value in result.items():
            print(f"{key}: {value}")
    except ValueError as e:
        # Raised by scrape_products() for URLs outside ACCEPTABLE_URLS.
        print(e)