"""
Tokopedia Product Scraper using Selenium WebDriver

This script scrapes product data from Tokopedia search results.
It extracts product name, price, rating, and other relevant information
and saves it to a CSV file.

Author: Qwen Assistant
Date: September 2025
"""

import time
import re
import random
import logging
from urllib.parse import quote_plus
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException, WebDriverException
from webdriver_manager.chrome import ChromeDriverManager

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("tokopedia_scraper.log"),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

class TokopediaScraper:
    def __init__(self, delay_range=(2, 5), headless=True):
        """
        Initialize the Tokopedia scraper.
        
        Args:
            delay_range (tuple): Range of delay between requests in seconds
            headless (bool): Whether to run browser in headless mode
        """
        self.delay_range = delay_range
        self.base_url = "https://www.tokopedia.com"
        self.driver = None
        self.headless = headless
        self.setup_driver()
        
    def setup_driver(self):
        """Set up the Chrome WebDriver with appropriate options."""
        chrome_options = Options()
        
        if self.headless:
            # Chrome 109+ supports the newer "--headless=new" mode; older
            # Chrome builds may only accept the legacy "--headless" flag
            chrome_options.add_argument("--headless=new")
        
        # Add common options to avoid detection and improve performance
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
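        # A unique user-data-dir avoids "user data directory is already in
        # use" errors when several Chrome instances run at once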
        chrome_options.add_argument(f"--user-data-dir=/tmp/selenium_user_data_{int(time.time())}")
        
        try:
            try:
                # Selenium 4.6+ resolves a matching ChromeDriver automatically
                # via Selenium Manager
                self.driver = webdriver.Chrome(options=chrome_options)
            except (TypeError, WebDriverException):
                # Fallback for older Selenium installs: let webdriver-manager
                # download ChromeDriver and pass its path explicitly
                self.driver = webdriver.Chrome(
                    ChromeDriverManager().install(),
                    options=chrome_options
                )
            # Hide navigator.webdriver before any page script runs; unlike a
            # one-off execute_script call, this persists across navigations
            self.driver.execute_cdp_cmd(
                "Page.addScriptToEvaluateOnNewDocument",
                {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"}
            )
            logger.info("Chrome WebDriver initialized successfully")
        except Exception as e:
            logger.error(f"Failed to initialize Chrome WebDriver: {e}")
            raise
    
    def search_products(self, keyword, limit=20):
        """
        Search for products on Tokopedia.
        
        Args:
            keyword (str): Search keyword
            limit (int): Maximum number of products to scrape
            
        Returns:
            list: List of product dictionaries
        """
        try:
            search_url = f"{self.base_url}/search?st=product&q={quote_plus(keyword)}"
            logger.info(f"Searching for products with keyword: {keyword}")
            self.driver.get(search_url)
            
            # Wait for search results to load; Tokopedia's data-testid hooks
            # change over time, so update these selectors if this times out
            WebDriverWait(self.driver, 30).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "[data-testid='divProductWrapper']"))
            )
            
            products = []
            page = 1
            
            while len(products) < limit:
                logger.info(f"Scraping page {page}")
                
                # Randomized delay to throttle requests and give the page
                # time to render
                time.sleep(random.uniform(*self.delay_range))
                
                # Find product elements
                product_elements = self.driver.find_elements(By.CSS_SELECTOR, "[data-testid='divProductWrapper']")
                
                if not product_elements:
                    logger.warning("No product elements found on current page")
                    break
                
                # Extract product information
                for element in product_elements:
                    if len(products) >= limit:
                        break
                        
                    try:
                        product = self.extract_product_info(element)
                        if product:
                            products.append(product)
                            logger.info(f"Extracted product: {product['name'][:50]}...")
                    except Exception as e:
                        logger.warning(f"Failed to extract product info: {e}")
                        continue
                
                # Check if we've reached the limit
                if len(products) >= limit:
                    break
                
                # Try to go to next page
                if not self.go_to_next_page():
                    logger.info("No more pages available")
                    break
                    
                page += 1
            
            logger.info(f"Successfully scraped {len(products)} products")
            return products[:limit]
            
        except TimeoutException:
            logger.error("Timeout while searching for products")
            return []
        except Exception as e:
            logger.error(f"Error during product search: {e}")
            return []
    
    def extract_product_info(self, element):
        """
        Extract product information from a product element.
        
        Args:
            element: Selenium WebElement representing a product
            
        Returns:
            dict: Product information or None if extraction fails
        """
        try:
            product = {}
            
            # Extract product name
            try:
                name_element = element.find_element(By.CSS_SELECTOR, "[data-testid='spnSRPProdName']")
                product['name'] = name_element.text.strip()
            except NoSuchElementException:
                product['name'] = "N/A"
            
            # Extract product price
            try:
                price_element = element.find_element(By.CSS_SELECTOR, "[data-testid='spnSRPProdPrice']")
                product['price'] = price_element.text.strip()
            except NoSuchElementException:
                product['price'] = "N/A"
            
            # Extract product rating
            try:
                rating_element = element.find_element(By.CSS_SELECTOR, "[data-testid='icnSRPRating']")
                rating_value = rating_element.get_attribute("aria-label")
                if rating_value:
                    # Pull the numeric rating out of the aria-label text
                    rating_match = re.search(r"(\d+\.?\d*)", rating_value)
                    product['rating'] = rating_match.group(1) if rating_match else "N/A"
                else:
                    product['rating'] = "N/A"
            except NoSuchElementException:
                product['rating'] = "N/A"
            
            # Extract number of reviews
            try:
                review_element = element.find_element(By.CSS_SELECTOR, "[data-testid='spnSRPProdReviewCount']")
                product['reviews'] = review_element.text.strip()
            except NoSuchElementException:
                product['reviews'] = "0"
            
            # Extract number sold
            try:
                sold_element = element.find_element(By.CSS_SELECTOR, "[data-testid='spnSRPProdSoldCount']")
                product['sold'] = sold_element.text.strip()
            except NoSuchElementException:
                product['sold'] = "0"
            
            # Extract product URL
            try:
                link_element = element.find_element(By.TAG_NAME, "a")
                product['url'] = link_element.get_attribute("href")
            except NoSuchElementException:
                product['url'] = "N/A"
            
            # Extract shop name
            try:
                shop_element = element.find_element(By.CSS_SELECTOR, "[data-testid='lnkSRPShopName']")
                product['shop'] = shop_element.text.strip()
            except NoSuchElementException:
                product['shop'] = "N/A"
            
            return product
            
        except Exception as e:
            logger.error(f"Error extracting product info: {e}")
            return None
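
    @staticmethod
    def parse_price(price_text):
        """
        Optional helper: convert a raw Tokopedia price string such as
        "Rp1.250.000" into an integer number of rupiah.

        This is a convenience sketch and is not called by
        extract_product_info, which keeps the raw text; it assumes the
        usual "Rp" prefix with "." as the thousands separator.

        Args:
            price_text (str): Raw price text, e.g. "Rp1.250.000"

        Returns:
            int or None: Price in rupiah, or None if no digits are present
        """
        digits = re.sub(r"\D", "", price_text or "")
        return int(digits) if digits else None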
    
    def go_to_next_page(self):
        """
        Navigate to the next page of search results.
        
        Returns:
            bool: True if successful, False otherwise
        """
        try:
            # Find the next-page button; "Laman berikutnya" is Indonesian
            # for "next page". until() returns the element or raises
            # TimeoutException, so no extra None check is needed.
            next_button = WebDriverWait(self.driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "[aria-label='Laman berikutnya']"))
            )

            # Scroll the button into view before clicking
            self.driver.execute_script("arguments[0].scrollIntoView();", next_button)
            time.sleep(1)

            next_button.click()

            # The old button going stale signals that the new page replaced it
            WebDriverWait(self.driver, 20).until(
                EC.staleness_of(next_button)
            )

            # Additional randomized delay for content to render
            time.sleep(random.uniform(*self.delay_range))
            return True
                
        except TimeoutException:
            logger.info("Next page button not found or not clickable")
        except Exception as e:
            logger.error(f"Error navigating to next page: {e}")
            
        return False
    
    def save_to_csv(self, products, filename="tokopedia_products.csv"):
        """
        Save products to CSV file.
        
        Args:
            products (list): List of product dictionaries
            filename (str): Output filename
        """
        try:
            if not products:
                logger.warning("No products to save")
                return
            
            df = pd.DataFrame(products)
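            # Tip: pass encoding='utf-8-sig' below instead if the file will
            # be opened in Excel, which expects a BOM to detect UTF-8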
            df.to_csv(filename, index=False, encoding='utf-8')
            logger.info(f"Saved {len(products)} products to {filename}")
            
        except Exception as e:
            logger.error(f"Error saving to CSV: {e}")
    
    def close(self):
        """Close the WebDriver."""
        if self.driver:
            self.driver.quit()
            logger.info("WebDriver closed")
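
    # Optional addition (not part of the original interface): context-manager
    # support so callers can write "with TokopediaScraper() as scraper: ..."
    # and be sure the browser is closed even when an exception occurs.
    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()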

def main():
    """Main function to run the scraper."""
    # Configuration
    keyword = input("Enter search keyword: ").strip()
    if not keyword:
        logger.error("No keyword provided")
        return
    
    try:
        limit = int(input("Enter number of products to scrape (default 20): ") or "20")
    except ValueError:
        limit = 20
    
    # Initialize scraper
    scraper = TokopediaScraper(delay_range=(2, 5), headless=True)
    
    try:
        # Search for products
        products = scraper.search_products(keyword, limit)
        
        if products:
            # Save to CSV
            filename = f"tokopedia_{keyword.replace(' ', '_')}_{int(time.time())}.csv"
            scraper.save_to_csv(products, filename)
            
            # Display summary
            print(f"\nScraping completed!")
            print(f"Total products scraped: {len(products)}")
            print(f"Data saved to: {filename}")
            
            # Show first few products as sample
            print("\nSample products:")
            for i, product in enumerate(products[:3]):
                print(f"{i+1}. {product['name'][:50]}... - {product['price']} - Rating: {product['rating']}")
        else:
            print("No products found or scraping failed.")
            
    except KeyboardInterrupt:
        logger.info("Scraping interrupted by user")
    except Exception as e:
        logger.error(f"An error occurred during scraping: {e}")
    finally:
        # Clean up
        scraper.close()

if __name__ == "__main__":
    main()
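
# Example of non-interactive use (a sketch that bypasses the input() prompts
# in main() and relies on the context-manager support added above):
#
#     with TokopediaScraper(delay_range=(2, 5), headless=True) as scraper:
#         products = scraper.search_products("laptop gaming", limit=10)
#         scraper.save_to_csv(products, "laptop_gaming.csv")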