#!/usr/bin/env python3
"""
Tokopedia Review Scraper - Approach Similar to Others
"""

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import time
import pandas as pd

def scrape_tokopedia_reviews():
    """Scrape reviews from Tokopedia store/product page"""
    
    # Get URL from user
    url = input("Masukkan URL toko/produk Tokopedia: ").strip()
    
    if not url:
        print("URL tidak boleh kosong!")
        return
    
    # Setup Chrome options
    options = Options()
    options.add_argument("--start-maximized")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)
    # Add user agent to appear more like a real browser
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
    # Run headless; --start-maximized has no effect in headless mode, so set an
    # explicit window size instead (some content renders based on viewport size)
    options.add_argument("--headless")
    options.add_argument("--window-size=1920,1080")
    
    driver = None
    try:
        # Initialize the WebDriver; driver is pre-declared above so the finally
        # block can clean up safely even if Chrome fails to start
        print("Opening browser...")
        driver = webdriver.Chrome(options=options)
        # Execute script to hide webdriver property
        driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
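        # More persistent variant (a sketch; assumes Selenium 4+ with a
        # Chromium-based driver): the CDP hook below injects the same patch
        # before any page script runs, so it also survives later navigations.
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"},
        )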
        driver.get(url)
        
        # Wait for page to load
        print("Menunggu halaman dimuat...")
        time.sleep(5)
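        # A fixed sleep is fragile; if a stable anchor element is known, an
        # explicit wait is more reliable. The <article> tag below is an
        # assumption about the review markup, not a confirmed selector:
        # WebDriverWait(driver, 15).until(
        #     EC.presence_of_element_located((By.TAG_NAME, "article"))
        # )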
        
        data = []
        page_count = 0
        max_pages = 3  # Limit pages to avoid detection
        
        while page_count < max_pages:
            print(f"Mengekstrak data dari halaman {page_count + 1}...")
            
            # Get page source and parse with BeautifulSoup
            soup = BeautifulSoup(driver.page_source, "html.parser")
            
            # Try different selectors for reviews
            # Selector 1: obfuscated class observed at time of writing
            # (brittle: hashed class names change between site builds)
            containers = soup.find_all('article', attrs={'class': 'css-ccpe8t'})
            
            # If not found, try alternative selectors
            if not containers:
                containers = soup.find_all('article', class_=lambda x: x and 'review' in x.lower())
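            # Alternative fallback via CSS selectors (a sketch; the data-testid
            # hook used later is often the most stable attribute available):
            # if not containers:
            #     containers = [el.find_parent('article') or el
            #                   for el in soup.select("span[data-testid='lblItemUlasan']")]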
            
            print(f"Ditemukan {len(containers)} kontainer review")
            
            for container in containers:
                try:
                    # Try different ways to find review text
                    review_element = container.find('span', attrs={'data-testid': 'lblItemUlasan'})
                    
                    if not review_element:
                        # Alternative selectors
                        review_element = container.find('p', class_=lambda x: x and ('review' in x.lower() or 'ulasan' in x.lower()))
                    
                    if not review_element:
                        # Last resort: take the container's full visible text.
                        # This is noisy (it can include the reviewer name,
                        # rating, and date), so apply a minimum-length filter.
                        review_text = container.get_text(strip=True)
                        if review_text and len(review_text) > 10:
                            data.append((review_text,))
                    else:
                        review = review_element.text.strip()
                        if review:
                            data.append((review,))
                            
                except AttributeError:
                    continue
            
            page_count += 1
            print(f"Total review sementara: {len(data)}")
            
            # Try to go to next page
            if page_count < max_pages:
                try:
                    print("Mencoba navigasi ke halaman berikutnya...")
                    # Wait and click next button
                    next_button = WebDriverWait(driver, 10).until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, "button[aria-label^='Laman berikutnya']"))
                    )
                    driver.execute_script("arguments[0].click();", next_button)
                    time.sleep(3)  # Give the review list time to re-render
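                    # Sturdier alternative (sketch; assumes page change replaces
                    # the article nodes in the DOM rather than mutating them):
                    # first = driver.find_element(By.TAG_NAME, "article")  # grab before click
                    # WebDriverWait(driver, 10).until(EC.staleness_of(first))  # wait after click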
                except (TimeoutException, NoSuchElementException):
                    print("Tidak ditemukan tombol halaman berikutnya atau sudah di halaman terakhir")
                    break
        
        # Save data
        if data:
            print(f"\nBerhasil mengumpulkan {len(data)} review!")
            df = pd.DataFrame(data, columns=["Ulasan"])
            filename = "tokopedia_reviews.csv"
            df.to_csv(filename, index=False, encoding='utf-8')
            print(f"Data telah disimpan ke {filename}")
            
            # Show sample data
            print("\nContoh data yang diambil:")
            for i, review in enumerate(data[:5]):
                print(f"{i+1}. {review[0][:100]}...")
        else:
            print("Tidak ditemukan data review. Mungkin struktur halaman berbeda atau tidak ada review.")
            
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        # driver stays None if Chrome never started; avoid a NameError here
        if driver is not None:
            driver.quit()
            print("Browser closed.")

if __name__ == "__main__":
    scrape_tokopedia_reviews()