#!/usr/bin/env python3
"""
Simplified TripAdvisor Restaurant Scraper for Athens
Works with requests and beautifulsoup only - no Selenium required
Collects 500+ restaurants with available information
"""

import requests
import json
import time
import os
import uuid
import re
from datetime import datetime
from urllib.parse import urljoin, urlparse, quote_plus
from typing import List, Dict, Optional
from dataclasses import dataclass
from sqlalchemy import create_engine, text
import urllib.request
from bs4 import BeautifulSoup

@dataclass
class RestaurantData:
    """Everything the scraper collects about a single restaurant.

    Only ``name`` is required. The two list-valued fields default to ``None``
    in the signature and are replaced by fresh empty lists right after
    construction, so instances never share one mutable default.
    """
    # Identity and free-text details
    name: str
    description: str = ""
    cuisine_type: str = ""
    # Opening hours; left as None when unknown
    opening_time: Optional[str] = None
    closing_time: Optional[str] = None
    # Contact details
    phone: str = ""
    address: str = ""
    website: str = ""
    # Pricing and popularity
    price_range: str = ""
    rating: float = 0.0
    review_count: int = 0
    # List-valued fields, normalised in __post_init__
    photos: Optional[List[str]] = None
    menu_items: Optional[List[Dict]] = None
    # Page the record was scraped from
    tripadvisor_url: str = ""

    def __post_init__(self):
        # Swap a None placeholder for a new empty list on each list field.
        for list_field in ("photos", "menu_items"):
            if getattr(self, list_field) is None:
                setattr(self, list_field, [])

class SimpleTripAdvisorScraper:
    """Simplified TripAdvisor restaurant scraper"""
    
    def __init__(self):
        self.session = requests.Session()
        
        # Rotate user agents
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        ]
        
        self.session.headers.update({
            'User-Agent': self.user_agents[0],
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })
        
        # Database connection
        self.DATABASE_URL = f"postgresql://postgres:{quote_plus('F@f@k0s!!')}@localhost:5432/bookbeach"
        self.engine = create_engine(self.DATABASE_URL)
        
        # Create directories for photos
        self.photo_dir = "restaurant_photos"
        os.makedirs(self.photo_dir, exist_ok=True)
        
        self.request_count = 0
    
    def rotate_user_agent(self):
        """Rotate user agent to avoid detection"""
        self.request_count += 1
        if self.request_count % 10 == 0:
            ua = self.user_agents[self.request_count % len(self.user_agents)]
            self.session.headers['User-Agent'] = ua
    
    def get_page_safely(self, url: str, retries: int = 3) -> Optional[BeautifulSoup]:
        """Get page with retries and error handling"""
        for attempt in range(retries):
            try:
                self.rotate_user_agent()
                response = self.session.get(url, timeout=15)
                response.raise_for_status()
                
                soup = BeautifulSoup(response.content, 'html.parser')
                return soup
                
            except Exception as e:
                print(f"⚠️ Attempt {attempt + 1} failed for {url}: {e}")
                if attempt < retries - 1:
                    time.sleep(2 ** attempt)  # Exponential backoff
                
        return None
    
    def get_restaurant_urls_from_search(self, base_url: str, max_pages: int = 20) -> List[str]:
        """Extract restaurant URLs from search pages"""
        restaurant_urls = set()  # Use set to avoid duplicates
        
        print(f"🔍 Scraping restaurant URLs from TripAdvisor...")
        
        for page in range(0, max_pages * 30, 30):
            if page == 0:
                page_url = base_url
            else:
                # Modify URL for pagination
                page_url = base_url.replace('-oa30-', f'-oa{page}-')
            
            print(f"📄 Page {page//30 + 1}: Extracting URLs...")
            
            soup = self.get_page_safely(page_url)
            if not soup:
                print(f"❌ Failed to load page {page//30 + 1}")
                continue
            
            # Look for restaurant links with various selectors
            selectors = [
                'a[href*="/Restaurant_Review"]',
                'a[href*="-d"]',  # TripAdvisor restaurant URLs contain -d
                '.result-title a',
                '[data-test-target] a[href*="Restaurant"]'
            ]
            
            page_restaurants = 0
            for selector in selectors:
                links = soup.select(selector)
                for link in links:
                    href = link.get('href')
                    if href and isinstance(href, str) and ('/Restaurant_Review' in href or '-d' in href):
                        if href.startswith('/'):
                            full_url = 'https://www.tripadvisor.com' + href
                        else:
                            full_url = href
                        
                        # Clean URL (remove extra parameters)
                        if '?' in full_url:
                            full_url = full_url.split('?')[0]
                        
                        restaurant_urls.add(full_url)
                        page_restaurants += 1
            
            print(f"   Found {page_restaurants} restaurants on this page")
            print(f"   Total unique restaurants: {len(restaurant_urls)}")
            
            # Break if no new restaurants found
            if page_restaurants == 0:
                print("   No more restaurants found, stopping pagination")
                break
            
            # Be polite
            time.sleep(2)
        
        restaurant_list = list(restaurant_urls)
        print(f"✅ Total restaurant URLs collected: {len(restaurant_list)}")
        return restaurant_list
    
    def extract_restaurant_details(self, url: str) -> Optional[RestaurantData]:
        """Extract restaurant details from individual page"""
        soup = self.get_page_safely(url)
        if not soup:
            return None
        
        try:
            # Restaurant name - try multiple selectors
            name = "Unknown Restaurant"
            name_selectors = [
                'h1[data-test-target="top-info-header"]',
                'h1.ui_header',
                'h1',
                '.ui_header h1'
            ]
            
            for selector in name_selectors:
                name_elem = soup.select_one(selector)
                if name_elem:
                    name = name_elem.get_text(strip=True)
                    break
            
            # Description
            description = ""
            desc_selectors = [
                '[data-test-target="restaurant-detail-info"]',
                '.restaurants-detail-info',
                '.ui_columns .text'
            ]
            
            for selector in desc_selectors:
                desc_elem = soup.select_one(selector)
                if desc_elem:
                    description = desc_elem.get_text(strip=True)
                    break
            
            # Cuisine type
            cuisine_type = ""
            cuisine_keywords = ['cuisine', 'food', 'category']
            for keyword in cuisine_keywords:
                elements = soup.find_all(text=re.compile(keyword, re.I))
                for elem in elements:
                    parent = elem.parent
                    if parent:
                        text = parent.get_text(strip=True)
                        if len(text) < 100:  # Reasonable length
                            cuisine_type = text.replace(keyword, '').strip(' :,')
                            break
                if cuisine_type:
                    break
            
            # Address
            address = ""
            address_selectors = [
                '[data-test-target="restaurant-detail-info"] .ui_link',
                '.street-address',
                '.address'
            ]
            
            for selector in address_selectors:
                addr_elem = soup.select_one(selector)
                if addr_elem:
                    address = addr_elem.get_text(strip=True)
                    break
            
            # Rating
            rating = 0.0
            rating_elements = soup.find_all(['span', 'div'], class_=re.compile('ui_bubble_rating', re.I))
            for elem in rating_elements:
                if hasattr(elem, 'get'):
                    class_list = elem.get('class', [])
                    if isinstance(class_list, list):
                        class_name = ' '.join(class_list)
                    else:
                        class_name = str(class_list)
                    # TripAdvisor uses bubble_xx pattern where xx is rating*10
                    rating_match = re.search(r'bubble_(\d+)', class_name)
                    if rating_match:
                        rating = int(rating_match.group(1)) / 10.0
                        break
            
            # Review count
            review_count = 0
            review_elements = soup.find_all(text=re.compile(r'\d+.*review', re.I))
            for elem in review_elements:
                numbers = re.findall(r'\d+', elem)
                if numbers:
                    review_count = int(numbers[0])
                    break
            
            # Photos
            photos = []
            img_elements = soup.find_all('img', src=True)
            for img in img_elements:
                src = img.get('src')
                if src and any(keyword in src.lower() for keyword in ['restaurant', 'food', 'photo']):
                    if src.startswith('//'):
                        src = 'https:' + src
                    elif src.startswith('/'):
                        src = 'https://www.tripadvisor.com' + src
                    photos.append(src)
            
            # Remove duplicates and limit
            photos = list(dict.fromkeys(photos))[:8]
            
            # Basic menu items (this is limited without JavaScript)
            menu_items = []
            # Look for price patterns in text
            price_patterns = soup.find_all(text=re.compile(r'[\$€£¥]\s*\d+', re.I))
            for i, price_text in enumerate(price_patterns[:20]):
                price_match = re.search(r'([\$€£¥])\s*(\d+(?:[.,]\d{2})?)', price_text)
                if price_match:
                    currency_symbol = price_match.group(1)
                    price_value = float(price_match.group(2).replace(',', '.'))
                    
                    # Try to find associated item name
                    parent = price_text.parent if hasattr(price_text, 'parent') else None
                    item_name = f"Menu Item {i+1}"
                    if parent:
                        parent_text = parent.get_text(strip=True)
                        if len(parent_text) > 5 and len(parent_text) < 100:
                            item_name = parent_text
                    
                    menu_items.append({
                        'name': item_name,
                        'price': price_value,
                        'description': '',
                        'category': cuisine_type or 'General'
                    })
            
            restaurant = RestaurantData(
                name=name,
                description=description,
                cuisine_type=cuisine_type,
                address=address,
                rating=rating,
                review_count=review_count,
                photos=photos,
                menu_items=menu_items,
                tripadvisor_url=url
            )
            
            print(f"✅ {name} - {rating}⭐ ({review_count} reviews, {len(photos)} photos)")
            return restaurant
            
        except Exception as e:
            print(f"❌ Error parsing {url}: {e}")
            return None
    
    def download_photo(self, photo_url: str, restaurant_name: str, photo_index: int) -> Optional[str]:
        """Download restaurant photo"""
        try:
            # Clean filename
            safe_name = re.sub(r'[^\w\s-]', '', restaurant_name).strip()
            safe_name = re.sub(r'[-\s]+', '-', safe_name)[:50]  # Limit length
            
            # Get extension
            ext = '.jpg'
            if '.' in photo_url:
                potential_ext = '.' + photo_url.split('.')[-1].split('?')[0]
                if potential_ext.lower() in ['.jpg', '.jpeg', '.png', '.gif']:
                    ext = potential_ext
            
            filename = f"{safe_name}_{photo_index}{ext}"
            filepath = os.path.join(self.photo_dir, filename)
            
            # Skip if already exists
            if os.path.exists(filepath):
                return filepath
            
            # Download
            headers = {'User-Agent': self.session.headers['User-Agent']}
            
            response = requests.get(photo_url, headers=headers, timeout=10)
            response.raise_for_status()
            
            with open(filepath, 'wb') as f:
                f.write(response.content)
            
            print(f"📷 Downloaded: {filename}")
            return filepath
            
        except Exception as e:
            print(f"❌ Photo download failed {photo_url}: {e}")
            return None
    
    def save_to_database(self, restaurants: List[RestaurantData]):
        """Save restaurants to database"""
        with self.engine.connect() as db:
            # Get IDs
            company_result = db.execute(text("SELECT company_id FROM companies WHERE company_name = 'bookbeach'")).fetchone()
            company_id = company_result[0] if company_result else "c35388d2-0028-4002-bcc4-db4d7ed2042e"
            
            beach_result = db.execute(text("SELECT beach_place_id FROM beach_places WHERE city ILIKE '%athens%' LIMIT 1")).fetchone()
            beach_place_id = beach_result[0] if beach_result else None
            
            currency_result = db.execute(text("SELECT currency_id FROM currencies WHERE currency_code = 'EUR'")).fetchone()
            currency_id = currency_result[0] if currency_result else 1
            
            saved_count = 0
            
            for restaurant in restaurants:
                try:
                    restaurant_id = str(uuid.uuid4())
                    
                    # Check if already exists
                    existing = db.execute(text("SELECT restaurant_id FROM restaurants WHERE restaurant_name = :name"), 
                                        {'name': restaurant.name}).fetchone()
                    if existing:
                        print(f"⏭️ Skipped (exists): {restaurant.name}")
                        continue
                    
                    # Insert restaurant
                    db.execute(text("""
                        INSERT INTO restaurants (
                            restaurant_id, restaurant_name, company_id, beach_place_id,
                            description, cuisine_type, is_active, created_at
                        ) VALUES (
                            :restaurant_id, :restaurant_name, :company_id, :beach_place_id,
                            :description, :cuisine_type, :is_active, NOW()
                        )
                    """), {
                        'restaurant_id': restaurant_id,
                        'restaurant_name': restaurant.name,
                        'company_id': company_id,
                        'beach_place_id': beach_place_id,
                        'description': restaurant.description,
                        'cuisine_type': restaurant.cuisine_type,
                        'is_active': True
                    })
                    
                    # Create category
                    if restaurant.cuisine_type:
                        try:
                            category_id = int(time.time() * 1000) % 2147483647  # Generate unique ID
                            db.execute(text("""
                                INSERT INTO restaurant_categories (
                                    category_id, restaurant_id, category_name, description
                                ) VALUES (
                                    :category_id, :restaurant_id, :category_name, :description
                                )
                            """), {
                                'category_id': category_id,
                                'restaurant_id': restaurant_id,
                                'category_name': restaurant.cuisine_type[:100],
                                'description': f'{restaurant.cuisine_type} cuisine'
                            })
                        except:
                            category_id = 1  # Fallback
                    else:
                        category_id = 1
                    
                    # Insert menu items
                    for item in restaurant.menu_items:
                        try:
                            item_id = str(uuid.uuid4())
                            db.execute(text("""
                                INSERT INTO restaurant_items (
                                    item_id, restaurant_id, category_id, item_name,
                                    description, price, currency_id, is_available, created_at
                                ) VALUES (
                                    :item_id, :restaurant_id, :category_id, :item_name,
                                    :description, :price, :currency_id, :is_available, NOW()
                                )
                            """), {
                                'item_id': item_id,
                                'restaurant_id': restaurant_id,
                                'category_id': category_id,
                                'item_name': item.get('name', 'Menu Item')[:100],
                                'description': item.get('description', '')[:500],
                                'price': item.get('price', 0.0),
                                'currency_id': currency_id,
                                'is_available': True
                            })
                        except Exception as e:
                            print(f"⚠️ Menu item error: {e}")
                    
                    # Download photos
                    for i, photo_url in enumerate(restaurant.photos):
                        self.download_photo(photo_url, restaurant.name, i)
                    
                    db.commit()
                    saved_count += 1
                    print(f"💾 Saved: {restaurant.name} ({len(restaurant.menu_items)} items)")
                    
                except Exception as e:
                    db.rollback()
                    print(f"❌ Save failed for {restaurant.name}: {e}")
            
            print(f"✅ Database save complete: {saved_count}/{len(restaurants)} restaurants")
    
    def scrape_athens_restaurants(self, target_count: int = 500):
        """Main scraping function"""
        print(f"🚀 Starting Athens restaurant scraper")
        print(f"🎯 Target: {target_count} restaurants")
        
        base_url = "https://www.tripadvisor.com/Restaurants-g189400-oa30-zfp10954-Athens_Attica.html"
        
        # Step 1: Get restaurant URLs
        restaurant_urls = self.get_restaurant_urls_from_search(base_url, max_pages=25)
        
        if len(restaurant_urls) < target_count:
            print(f"⚠️ Only found {len(restaurant_urls)} URLs, continuing with available ones")
        
        # Step 2: Extract details
        restaurants = []
        urls_to_process = restaurant_urls[:target_count + 100]  # Extra for safety
        
        for i, url in enumerate(urls_to_process, 1):
            print(f"📊 Progress: {i}/{len(urls_to_process)} ({len(restaurants)} valid)")
            
            restaurant = self.extract_restaurant_details(url)
            if restaurant and restaurant.name != "Unknown Restaurant":
                restaurants.append(restaurant)
            
            # Save every 25 restaurants
            if len(restaurants) % 25 == 0 and len(restaurants) > 0:
                print(f"💾 Saving batch: {len(restaurants)} restaurants")
                self.save_to_database(restaurants[-25:])
            
            # Stop if we hit target
            if len(restaurants) >= target_count:
                break
            
            # Be polite
            time.sleep(1.5)
        
        # Save final batch
        remaining = len(restaurants) % 25
        if remaining > 0:
            self.save_to_database(restaurants[-remaining:])
        
        print(f"🎉 Scraping complete!")
        print(f"📊 Restaurants collected: {len(restaurants)}")
        print(f"📁 Photos saved to: {self.photo_dir}")
        
        return restaurants

def main():
    """Run the Athens scrape end to end and print a short summary.

    An exception raised while scraping or reporting is printed together with
    its traceback instead of propagating to the caller.
    """
    scraper = SimpleTripAdvisorScraper()

    try:
        restaurants = scraper.scrape_athens_restaurants(target_count=500)

        # Assemble the report first, then emit it line by line
        summary_lines = (
            f"\n✅ SCRAPING SUMMARY:",
            f"   🏪 Total restaurants: {len(restaurants)}",
            f"   📷 Photos directory: {scraper.photo_dir}",
            f"   💾 All data saved to database",
        )
        for line in summary_lines:
            print(line)

    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()

# Start the scrape only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()