#!/usr/bin/env python3
"""
TripAdvisor Restaurant Scraper for Athens
Collects 500+ restaurants with complete information including:
- Restaurant details (name, description, cuisine type, hours)
- Photos (downloaded locally)
- Menu items and prices
- Reviews and ratings
"""

import requests
import json
import time
import os
import uuid
import re
from datetime import datetime
from urllib.parse import urljoin, urlparse, quote_plus
from typing import List, Dict, Optional
from dataclasses import dataclass
from sqlalchemy import create_engine, text
import urllib.request
from fake_useragent import UserAgent

# Selenium imports for dynamic content
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from bs4 import BeautifulSoup

@dataclass
class RestaurantData:
    """Scraped details for a single restaurant.

    ``photos`` and ``menu_items`` default to ``None`` in the signature and are
    replaced with a fresh empty list in ``__post_init__``. Every instance
    therefore owns its own lists, and a caller passing ``None`` explicitly
    still ends up with a list.
    """
    name: str
    description: str = ""
    cuisine_type: str = ""
    # Opening/closing times are kept as scraped text (e.g. "10:00 AM").
    opening_time: Optional[str] = None
    closing_time: Optional[str] = None
    phone: str = ""
    address: str = ""
    website: str = ""
    price_range: str = ""
    rating: float = 0.0
    review_count: int = 0
    # Annotated Optional because None is the declared default; normalised below.
    photos: Optional[List[str]] = None
    menu_items: Optional[List[Dict]] = None
    tripadvisor_url: str = ""

    def __post_init__(self):
        """Swap the ``None`` placeholders for per-instance empty lists."""
        if self.photos is None:
            self.photos = []
        if self.menu_items is None:
            self.menu_items = []

class TripAdvisorRestaurantScraper:
    """Scraper for TripAdvisor restaurant data"""
    
    def __init__(self):
        self.ua = UserAgent()
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': self.ua.random,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        })
        
        # Database connection
        self.DATABASE_URL = f"postgresql://postgres:{quote_plus('F@f@k0s!!')}@localhost:5432/bookbeach"
        self.engine = create_engine(self.DATABASE_URL)
        
        # Create directories for photos
        self.photo_dir = "restaurant_photos"
        os.makedirs(self.photo_dir, exist_ok=True)
        
        # Setup Selenium
        self.setup_selenium()
        
    def setup_selenium(self):
        """Setup Selenium WebDriver"""
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # Run in background
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--window-size=1920,1080")
        chrome_options.add_argument(f"--user-agent={self.ua.random}")
        
        try:
            self.driver = webdriver.Chrome(options=chrome_options)
            self.wait = WebDriverWait(self.driver, 10)
        except Exception as e:
            print(f"⚠️ Warning: Could not setup Selenium: {e}")
            print("📝 Will use requests-only mode (limited functionality)")
            self.driver = None
            self.wait = None
    
    def get_restaurant_urls(self, base_url: str, max_pages: int = 20) -> List[str]:
        """Extract restaurant URLs from search results"""
        restaurant_urls = []
        
        for page in range(0, max_pages * 30, 30):  # TripAdvisor uses 30-item pagination
            page_url = base_url.replace('-oa30-', f'-oa{page}-') if page > 0 else base_url
            print(f"🔍 Scraping page {page//30 + 1}: {page_url}")
            
            try:
                if self.driver:
                    self.driver.get(page_url)
                    time.sleep(3)
                    
                    # Wait for restaurant listings to load
                    try:
                        self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "[data-test-target='restaurants-list']")))
                    except TimeoutException:
                        print("⚠️ Timeout waiting for restaurant list")
                        continue
                    
                    # Find restaurant links
                    restaurant_links = self.driver.find_elements(By.CSS_SELECTOR, "a[href*='/Restaurant_Review']")
                    
                    for link in restaurant_links:
                        href = link.get_attribute('href')
                        if href and '/Restaurant_Review' in href:
                            full_url = urljoin('https://www.tripadvisor.com', href)
                            if full_url not in restaurant_urls:
                                restaurant_urls.append(full_url)
                else:
                    # Fallback to requests
                    response = self.session.get(page_url)
                    soup = BeautifulSoup(response.content, 'html.parser')
                    
                    # Find restaurant links
                    links = soup.find_all('a', href=True)
                    for link in links:
                        href = link['href']
                        if '/Restaurant_Review' in href:
                            full_url = urljoin('https://www.tripadvisor.com', href)
                            if full_url not in restaurant_urls:
                                restaurant_urls.append(full_url)
                
                print(f"📊 Found {len(restaurant_urls)} restaurants so far")
                time.sleep(2)  # Be polite to servers
                
            except Exception as e:
                print(f"❌ Error on page {page//30 + 1}: {e}")
                continue
        
        print(f"✅ Total restaurants found: {len(restaurant_urls)}")
        return restaurant_urls[:600]  # Get extra to ensure we have 500+ valid ones
    
    def extract_restaurant_data(self, url: str) -> Optional[RestaurantData]:
        """Extract detailed restaurant information"""
        try:
            print(f"🔍 Scraping: {url}")
            
            if self.driver:
                self.driver.get(url)
                time.sleep(3)
                soup = BeautifulSoup(self.driver.page_source, 'html.parser')
            else:
                response = self.session.get(url)
                soup = BeautifulSoup(response.content, 'html.parser')
            
            # Extract restaurant name
            name_elem = soup.find('h1', {'data-test-target': 'top-info-header'}) or \
                       soup.find('h1')
            name = name_elem.get_text(strip=True) if name_elem else "Unknown Restaurant"
            
            # Extract description
            description_elem = soup.find('div', {'data-test-target': 'restaurant-detail-info'}) or \
                              soup.find('div', class_=re.compile('.*description.*', re.I))
            description = description_elem.get_text(strip=True) if description_elem else ""
            
            # Extract cuisine type
            cuisine_elem = soup.find('span', string=re.compile('cuisines', re.I))
            cuisine_type = ""
            if cuisine_elem:
                cuisine_parent = cuisine_elem.find_parent()
                if cuisine_parent:
                    cuisine_type = cuisine_parent.get_text(strip=True).replace('Cuisines', '').strip()
            
            # Extract hours
            hours_elem = soup.find('div', string=re.compile('hours', re.I)) or \
                        soup.find('span', string=re.compile('open|closed', re.I))
            opening_time = closing_time = None
            if hours_elem:
                hours_text = hours_elem.get_text(strip=True)
                # Try to parse hours like "10:00 AM - 11:00 PM"
                hours_match = re.search(r'(\d{1,2}:\d{2}\s*[AP]M)\s*-\s*(\d{1,2}:\d{2}\s*[AP]M)', hours_text, re.I)
                if hours_match:
                    opening_time = hours_match.group(1)
                    closing_time = hours_match.group(2)
            
            # Extract contact info
            phone_elem = soup.find('span', string=re.compile('phone', re.I))
            phone = ""
            if phone_elem:
                phone_parent = phone_elem.find_parent()
                if phone_parent:
                    phone = phone_parent.get_text(strip=True).replace('Phone', '').strip()
            
            # Extract address
            address_elem = soup.find('span', {'data-test-target': 'restaurant-detail-info'}) or \
                          soup.find('div', class_=re.compile('.*address.*', re.I))
            address = address_elem.get_text(strip=True) if address_elem else ""
            
            # Extract price range
            price_elem = soup.find('span', string=re.compile('price', re.I))
            price_range = ""
            if price_elem:
                price_parent = price_elem.find_parent()
                if price_parent:
                    price_range = price_parent.get_text(strip=True)
            
            # Extract rating and reviews
            rating = 0.0
            review_count = 0
            rating_elem = soup.find('span', class_=re.compile('.*rating.*', re.I))
            if rating_elem:
                rating_text = rating_elem.get('aria-label', '') or rating_elem.get_text(strip=True)
                rating_match = re.search(r'(\d+\.?\d*)', rating_text)
                if rating_match:
                    rating = float(rating_match.group(1))
            
            review_elem = soup.find('span', string=re.compile('review', re.I))
            if review_elem:
                review_text = review_elem.get_text(strip=True)
                review_match = re.search(r'(\d+)', review_text)
                if review_match:
                    review_count = int(review_match.group(1))
            
            # Extract photo URLs
            photos = []
            photo_elements = soup.find_all('img', src=True)
            for img in photo_elements[:10]:  # Limit to 10 photos per restaurant
                src = img.get('src')
                if src and ('restaurant' in src.lower() or 'food' in src.lower() or 'tripadvisor' in src):
                    photos.append(src)
            
            # Extract menu items (basic implementation)
            menu_items = []
            menu_sections = soup.find_all('div', class_=re.compile('.*menu.*', re.I))
            for section in menu_sections[:5]:  # Limit sections
                items = section.find_all('div', string=re.compile('.*'))
                for item in items[:20]:  # Limit items per section
                    item_text = item.get_text(strip=True)
                    if len(item_text) > 5 and len(item_text) < 200:  # Reasonable item name length
                        # Try to extract price
                        price_match = re.search(r'[\$€£¥]?\s*(\d+[.,]\d{0,2})', item_text)
                        price = float(price_match.group(1).replace(',', '.')) if price_match else 0.0
                        
                        menu_items.append({
                            'name': item_text,
                            'price': price,
                            'description': '',
                            'category': 'General'
                        })
            
            restaurant = RestaurantData(
                name=name,
                description=description,
                cuisine_type=cuisine_type,
                opening_time=opening_time,
                closing_time=closing_time,
                phone=phone,
                address=address,
                price_range=price_range,
                rating=rating,
                review_count=review_count,
                photos=photos,
                menu_items=menu_items,
                tripadvisor_url=url
            )
            
            print(f"✅ Extracted: {name} ({len(photos)} photos, {len(menu_items)} menu items)")
            return restaurant
            
        except Exception as e:
            print(f"❌ Error extracting {url}: {e}")
            return None
    
    def download_photo(self, photo_url: str, restaurant_name: str, photo_index: int) -> Optional[str]:
        """Download and save restaurant photo"""
        try:
            # Clean restaurant name for filename
            safe_name = re.sub(r'[^\w\s-]', '', restaurant_name).strip()
            safe_name = re.sub(r'[-\s]+', '-', safe_name)
            
            # Get file extension
            parsed_url = urlparse(photo_url)
            ext = os.path.splitext(parsed_url.path)[1] or '.jpg'
            
            # Create filename
            filename = f"{safe_name}_{photo_index}{ext}"
            filepath = os.path.join(self.photo_dir, filename)
            
            # Download photo
            headers = {'User-Agent': self.ua.random}
            urllib.request.urlretrieve(photo_url, filepath)
            
            print(f"📷 Downloaded: {filename}")
            return filepath
            
        except Exception as e:
            print(f"❌ Error downloading photo {photo_url}: {e}")
            return None
    
    def save_to_database(self, restaurants: List[RestaurantData]):
        """Save restaurant data to database"""
        with self.engine.connect() as db:
            # Get company and currency info
            company_result = db.execute(text("SELECT company_id FROM companies WHERE company_name = 'bookbeach'")).fetchone()
            company_id = company_result[0] if company_result else "c35388d2-0028-4002-bcc4-db4d7ed2042e"
            
            # Get Athens beach place (if any)
            beach_result = db.execute(text("SELECT beach_place_id FROM beach_places WHERE city ILIKE '%athens%' LIMIT 1")).fetchone()
            beach_place_id = beach_result[0] if beach_result else None
            
            # Get EUR currency
            currency_result = db.execute(text("SELECT currency_id FROM currencies WHERE currency_code = 'EUR'")).fetchone()
            currency_id = currency_result[0] if currency_result else 1
            
            successful_saves = 0
            
            for restaurant in restaurants:
                try:
                    restaurant_id = str(uuid.uuid4())
                    
                    # Parse opening/closing times
                    opening_time = None
                    closing_time = None
                    if restaurant.opening_time:
                        try:
                            opening_time = datetime.strptime(restaurant.opening_time.replace(' ', ''), '%I:%M%p').time()
                        except:
                            pass
                    if restaurant.closing_time:
                        try:
                            closing_time = datetime.strptime(restaurant.closing_time.replace(' ', ''), '%I:%M%p').time()
                        except:
                            pass
                    
                    # Insert restaurant
                    db.execute(text("""
                        INSERT INTO restaurants (
                            restaurant_id, restaurant_name, company_id, beach_place_id,
                            description, cuisine_type, opening_time, closing_time, is_active
                        ) VALUES (
                            :restaurant_id, :restaurant_name, :company_id, :beach_place_id,
                            :description, :cuisine_type, :opening_time, :closing_time, :is_active
                        )
                    """), {
                        'restaurant_id': restaurant_id,
                        'restaurant_name': restaurant.name,
                        'company_id': company_id,
                        'beach_place_id': beach_place_id,
                        'description': restaurant.description,
                        'cuisine_type': restaurant.cuisine_type,
                        'opening_time': opening_time,
                        'closing_time': closing_time,
                        'is_active': True
                    })
                    
                    # Create default category
                    category_id = 1  # We'll create a simple incrementing ID
                    try:
                        db.execute(text("""
                            INSERT INTO restaurant_categories (
                                category_id, restaurant_id, category_name, description
                            ) VALUES (
                                :category_id, :restaurant_id, :category_name, :description
                            )
                        """), {
                            'category_id': category_id,
                            'restaurant_id': restaurant_id,
                            'category_name': restaurant.cuisine_type or 'General',
                            'description': f'{restaurant.cuisine_type} cuisine'
                        })
                    except:
                        # Category might already exist, continue
                        pass
                    
                    # Insert menu items
                    for item in restaurant.menu_items:
                        try:
                            item_id = str(uuid.uuid4())
                            db.execute(text("""
                                INSERT INTO restaurant_items (
                                    item_id, restaurant_id, category_id, item_name,
                                    description, price, currency_id, is_available
                                ) VALUES (
                                    :item_id, :restaurant_id, :category_id, :item_name,
                                    :description, :price, :currency_id, :is_available
                                )
                            """), {
                                'item_id': item_id,
                                'restaurant_id': restaurant_id,
                                'category_id': category_id,
                                'item_name': item.get('name', 'Unknown Item'),
                                'description': item.get('description', ''),
                                'price': item.get('price', 0.0),
                                'currency_id': currency_id,
                                'is_available': True
                            })
                        except Exception as e:
                            print(f"⚠️ Failed to save menu item: {e}")
                    
                    # Download photos
                    for i, photo_url in enumerate(restaurant.photos):
                        photo_path = self.download_photo(photo_url, restaurant.name, i)
                        if photo_path:
                            # Here you could save photo info to a photos table if needed
                            pass
                    
                    db.commit()
                    successful_saves += 1
                    print(f"✅ Saved: {restaurant.name}")
                    
                except Exception as e:
                    db.rollback()
                    print(f"❌ Failed to save {restaurant.name}: {e}")
            
            print(f"💾 Successfully saved {successful_saves}/{len(restaurants)} restaurants")
    
    def scrape_restaurants(self, base_url: str, target_count: int = 500) -> List[RestaurantData]:
        """Main scraping method"""
        print(f"🚀 Starting TripAdvisor restaurant scraper")
        print(f"🎯 Target: {target_count} restaurants")
        print(f"📍 Base URL: {base_url}")
        
        # Get restaurant URLs
        restaurant_urls = self.get_restaurant_urls(base_url, max_pages=25)
        
        if len(restaurant_urls) < target_count:
            print(f"⚠️ Warning: Only found {len(restaurant_urls)} URLs, less than target {target_count}")
        
        # Extract restaurant data
        restaurants = []
        for i, url in enumerate(restaurant_urls[:target_count + 100], 1):  # Get extra for safety
            print(f"📊 Progress: {i}/{min(len(restaurant_urls), target_count + 100)}")
            
            restaurant = self.extract_restaurant_data(url)
            if restaurant:
                restaurants.append(restaurant)
            
            # Save progress every 50 restaurants
            if len(restaurants) % 50 == 0:
                print(f"💾 Saving progress: {len(restaurants)} restaurants")
                self.save_to_database(restaurants[-50:])
            
            # Be polite to servers
            time.sleep(1)
            
            if len(restaurants) >= target_count:
                break
        
        print(f"✅ Scraping complete: {len(restaurants)} restaurants collected")
        return restaurants
    
    def close(self):
        """Clean up resources"""
        if self.driver:
            self.driver.quit()

def main():
    """Scrape Athens restaurants from TripAdvisor and store them.

    Builds a scraper, collects up to 500 restaurants from the Athens listing,
    hands the collected list to ``save_to_database`` and prints a summary.
    Any error is printed with its traceback, and the scraper is always closed
    afterwards.
    """
    scraper = TripAdvisorRestaurantScraper()

    # TripAdvisor Athens restaurants URL
    athens_listing = "https://www.tripadvisor.com/Restaurants-g189400-oa30-zfp10954-Athens_Attica.html"

    try:
        # Scrape restaurants
        collected = scraper.scrape_restaurants(athens_listing, target_count=500)

        # Save final batch
        if collected:
            scraper.save_to_database(collected)

        print(f"🎉 Scraping completed successfully!")
        print(f"📊 Total restaurants: {len(collected)}")
        print(f"📁 Photos saved to: {scraper.photo_dir}")

    except Exception as e:
        import traceback

        print(f"❌ Error in main execution: {e}")
        traceback.print_exc()

    finally:
        scraper.close()

# Run the scraper only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()