#!/usr/bin/env python3
"""
Automated Favorites Scraper - Runs on Schedule or On-Demand
Scrapes favorites, runs analysis, checks availability
"""
import asyncio
import json
import subprocess
import time
import os
import sys
from pathlib import Path
from datetime import datetime
from favorites_scraper import scrape_favorites

# `schedule` is an optional third-party dependency that only the scheduler
# mode uses. Importing it defensively lets the 'now' and 'scrape-only'
# commands run on machines where it is not installed. HAS_SCHEDULE records
# whether the import succeeded so schedule_weekly_scraping() can report the
# missing module instead of failing with a NameError.
try:
    import schedule
    HAS_SCHEDULE = True
except ImportError:
    HAS_SCHEDULE = False

def log(message):
    """Print *message* to stdout, prefixed with the local time in brackets."""
    stamp = f"{datetime.now():%Y-%m-%d %H:%M:%S}"
    print(f"[{stamp}] {message}")

def update_progress(step, step_name, total_steps=8):
    """Record pipeline progress in a per-job JSON file.

    Does nothing unless the ``FARMMATCH_JOB_ID`` environment variable is set.
    The file ``/tmp/paradisomatch_progress_<job_id>.json`` is rewritten in
    full on every call.

    Args:
        step: Number of the step being reported. The status is ``'running'``
            while ``step < total_steps`` and ``'completed'`` (with a
            ``completed_at`` timestamp) once ``step >= total_steps``.
        step_name: Human-readable description stored as ``current_step``.
        total_steps: Number of steps in the pipeline.

    Reading or writing the file never raises: an unreadable or corrupt
    existing file is ignored, and a failed write is only logged.
    """
    job_id = os.environ.get('FARMMATCH_JOB_ID')
    if not job_id:
        return

    progress_file = f'/tmp/paradisomatch_progress_{job_id}.json'
    now = datetime.now().isoformat()

    # Carry over started_at from the previous write for this job, if any.
    try:
        with open(progress_file, 'r') as f:
            existing = json.load(f)
    except (OSError, ValueError):
        # Missing, unreadable or corrupt file: start from scratch.
        existing = None
    started_at = existing.get('started_at') if isinstance(existing, dict) else None

    progress_data = {
        'job_id': job_id,
        'status': 'running' if step < total_steps else 'completed',
        'progress': step,
        'current_step': step_name,
        'total_steps': total_steps,
        # The first write for a job stamps the start time regardless of the
        # step number; stamping only at step 0 would leave it unset for
        # callers that begin reporting at step 1.
        'started_at': started_at or now,
        'completed_at': now if step >= total_steps else None,
        'full_pipeline': True
    }

    # Leave unset fields out of the file instead of writing nulls.
    progress_data = {k: v for k, v in progress_data.items() if v is not None}

    try:
        with open(progress_file, 'w') as f:
            json.dump(progress_data, f)
    except Exception as e:
        # Progress reporting is best-effort and must not break the pipeline.
        log(f"⚠️  Could not update progress: {e}")

def _run_pipeline_script(script_dir, script_name, label, skipped_what,
                         success_message, timeout, env=None):
    """Run one helper script of the pipeline and log how it went.

    Args:
        script_dir: Directory holding the script; also the child's cwd.
        script_name: File name of the script inside ``script_dir``.
        label: Step name used in the "had issues" warning.
        skipped_what: Text completing the "not found - skipping ..." warning.
        success_message: Line logged when the script exits with status 0.
        timeout: Seconds before ``subprocess.TimeoutExpired`` is raised.
        env: Environment for the child; ``None`` inherits the parent's.

    Returns:
        True only when the script exists and exited with status 0.

    Raises:
        subprocess.TimeoutExpired: If the script runs longer than ``timeout``.
    """
    script = script_dir / script_name
    if not script.exists():
        log(f"⚠️  {script_name} not found - skipping {skipped_what}")
        return False

    result = subprocess.run(
        # Use the interpreter running this process so the child gets the same
        # site-packages (virtualenv, cron); fall back to a PATH lookup only
        # when the interpreter path is unknown.
        [sys.executable or 'python3', str(script)],
        cwd=str(script_dir),
        capture_output=True,
        text=True,
        timeout=timeout,
        env=env
    )
    if result.returncode == 0:
        log(success_message)
        return True

    # A failing step is reported but does not stop the pipeline.
    log(f"⚠️  {label} had issues: {result.stderr[:200]}")
    return False


def _count_properties(enriched_file):
    """Return ``(total, active, removed)`` from *enriched_file*, or None if it is missing.

    A property counts as removed when its ``status`` equals ``'Removed'``.
    """
    if not enriched_file.exists():
        return None
    with open(enriched_file, 'r', encoding='utf-8') as f:
        properties = json.load(f)
    total = len(properties)
    active = sum(1 for p in properties if p.get('status') != 'Removed')
    return total, active, total - active


def run_full_update_pipeline():
    """
    Complete update pipeline (optimized order):
    1. Scrape favorites
    2. Sync new properties to enriched_data.json (CRITICAL FIX)
    3. Check availability (filter out sold/removed properties FIRST)
    4. Extract breadcrumbs (location data)
    5. Extract GPS & KPIs from property pages
    6. Geocode active properties only
    7. Run GPT analysis (subjective criteria)
    8. Parse & combine all criteria

    Steps 2-8 each run a sibling script as a subprocess. A script that is
    missing or exits non-zero is logged and the pipeline continues; a script
    that exceeds its timeout aborts the pipeline.

    Returns:
        True when the pipeline ran to the end, False when the scraped CSV is
        missing, a step timed out, or any other exception was raised.
    """
    log("=" * 70)
    log("🚀 STARTING FULL UPDATE PIPELINE")
    log("=" * 70)

    # Resolve to an absolute path: the scripts are started with cwd=script_dir,
    # so a relative script path would be looked up relative to the wrong place.
    script_dir = Path(__file__).resolve().parent
    enriched_file = script_dir / "enriched_data.json"

    try:
        # Step 1: Scrape favorites
        update_progress(1, "Scraping favorites from Properstar...")
        log("📥 Step 1/8: Scraping favorites from Properstar...")
        asyncio.run(scrape_favorites())
        log("✅ Favorites scraped successfully")

        # Without the CSV there is nothing for the later steps to work on.
        csv_file = script_dir / "extracted_property_urls.csv"
        if not csv_file.exists():
            log("❌ No CSV file found - scraping may have failed")
            update_progress(1, "Failed: No properties found", total_steps=8)
            return False

        # Step 2: Sync new properties to enriched_data.json
        update_progress(2, "Syncing new properties to enriched_data.json...")
        log("\n🔄 Step 2/8: Syncing new properties to enriched_data.json...")
        log("   (Ensuring all scraped favorites are in enriched_data.json)")
        _run_pipeline_script(
            script_dir, "sync_csv_to_enriched.py",
            label="Sync",
            skipped_what="sync (NEW PROPERTIES MAY NOT BE ANALYZED!)",
            success_message="✅ Sync completed - new properties added to enriched_data.json",
            timeout=300,  # 5 min timeout
        )

        # Step 3: Check availability FIRST (before geocoding/analysis)
        update_progress(3, "Checking property availability...")
        log("\n🔍 Step 3/8: Checking property availability...")
        log("   (Filtering out sold/removed properties before geocoding)")
        availability_ok = _run_pipeline_script(
            script_dir, "check_availability.py",
            label="Availability check",
            skipped_what="availability check",
            success_message="✅ Availability check completed",
            timeout=3600,  # 1 hour timeout
        )
        if availability_ok:
            # Show how many properties were filtered out
            counts = _count_properties(enriched_file)
            if counts is not None:
                _, active, removed = counts
                log(f"   📊 {active} active properties, {removed} removed/sold (skipping these)")

        # Step 4: Extract breadcrumbs (location data)
        update_progress(4, "Extracting breadcrumbs...")
        log("\n🍞 Step 4/8: Extracting breadcrumbs (location data)...")
        _run_pipeline_script(
            script_dir, "extract_breadcrumbs.py",
            label="Breadcrumb extraction",
            skipped_what="breadcrumb extraction",
            success_message="✅ Breadcrumb extraction completed",
            timeout=1800,  # 30 min timeout
        )

        # Step 5: Extract GPS & KPIs
        update_progress(5, "Extracting GPS coordinates and KPIs...")
        log("\n🌍 Step 5/8: Extracting GPS coordinates and KPIs...")
        _run_pipeline_script(
            script_dir, "extract_gps_and_kpis.py",
            label="GPS & KPI extraction",
            skipped_what="GPS extraction",
            success_message="✅ GPS & KPI extraction completed",
            timeout=1800,  # 30 min timeout
        )

        # Step 6: Geocode active properties (needs breadcrumbs & GPS data)
        update_progress(6, "Geocoding active properties...")
        log("\n📍 Step 6/8: Geocoding active properties...")
        _run_pipeline_script(
            script_dir, "geocode_properties.py",
            label="Geocoding",
            skipped_what="geocoding",
            success_message="✅ Geocoding completed",
            timeout=1800,  # 30 min timeout
        )

        # Step 7: Run GPT analysis (subjective criteria)
        update_progress(7, "Running GPT analysis...")
        log("\n🤖 Step 7/8: Running GPT analysis (subjective criteria)...")
        _run_pipeline_script(
            script_dir, "analyze_from_urls_optimized.py",
            label="GPT analysis",
            skipped_what="GPT analysis",
            success_message="✅ GPT analysis completed (optimized prompt, ~30% token savings)",
            timeout=3600,  # 1 hour timeout
            env={**os.environ, 'USE_CACHE': 'y', 'USE_OPTIMIZED_PROMPT': 'y'},
        )

        # Step 8: Parse and combine all criteria
        # NOTE(review): reporting step 8 of 8 here already makes
        # update_progress() write status 'completed' before this step has
        # run - confirm whether progress readers rely on that.
        update_progress(8, "Parsing and combining criteria...")
        log("\n📊 Step 8/8: Parsing and combining all criteria...")
        _run_pipeline_script(
            script_dir, "parse_criteria.py",
            label="Criteria parsing",
            skipped_what="criteria parsing",
            success_message="✅ Criteria parsing completed",
            timeout=600,  # 10 min timeout
        )

        # Success summary
        update_progress(8, "Completed successfully!")
        log("\n" + "=" * 70)
        log("✅ FULL UPDATE PIPELINE COMPLETED SUCCESSFULLY")
        log("=" * 70)

        # Generate summary
        counts = _count_properties(enriched_file)
        if counts is not None:
            total, active, removed = counts
            log("\n📊 Current Status:")
            log(f"   Total properties: {total}")
            log(f"   Active: {active}")
            log(f"   Removed: {removed}")

        return True

    except subprocess.TimeoutExpired:
        log("❌ Pipeline timed out")
        return False
    except Exception as e:
        # Top-level boundary: report the failure instead of crashing the
        # scheduler loop or the CLI.
        log(f"❌ Pipeline error: {e}")
        return False

def run_scrape_only():
    """Scrape the favorites and stop; none of the later pipeline steps run.

    Returns:
        True when scraping finished, False when it raised an exception
        (the error is logged, not propagated).
    """
    rule = "=" * 70
    for line in (rule, "📥 SCRAPING FAVORITES ONLY", rule):
        log(line)

    try:
        asyncio.run(scrape_favorites())
        log("✅ Favorites scraping completed")
        succeeded = True
    except Exception as err:
        log(f"❌ Scraping error: {err}")
        succeeded = False
    return succeeded

def schedule_weekly_scraping(day_of_week="sunday", time_of_day="02:00"):
    """
    Schedule the full update pipeline to run once a week and block forever.

    Args:
        day_of_week: "monday", "tuesday", etc. (case-insensitive). An
            unrecognised value falls back to "sunday" with a warning.
        time_of_day: Time in 24-hour format (e.g., "02:00"), passed to the
            ``schedule`` library unchanged.

    Returns immediately (after logging install instructions) when the
    ``schedule`` module is not installed. Otherwise offers to run the
    pipeline right away, then polls for pending jobs once a minute until
    interrupted with Ctrl+C.
    """
    if not HAS_SCHEDULE:
        log("❌ Error: 'schedule' module not installed")
        log("   Install with: pip3 install schedule")
        return

    valid_days = (
        'monday', 'tuesday', 'wednesday', 'thursday',
        'friday', 'saturday', 'sunday',
    )
    day_name = day_of_week.strip().lower()
    if day_name not in valid_days:
        # Keep the historical fallback, but say so: otherwise the banner
        # below would announce a day that is not the one being scheduled.
        log(f"⚠️  Unknown day '{day_of_week}' - falling back to sunday")
        day_name = 'sunday'

    log("🤖 Automated Favorites Scraper - Starting Scheduler")
    log("=" * 70)
    log(f"Weekly scraping scheduled: Every {day_name.capitalize()} at {time_of_day}")
    log(f"Started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    log("\nPress Ctrl+C to stop")
    log("=" * 70)

    # Look up only the requested weekday instead of building a job object
    # for every day of the week.
    getattr(schedule.every(), day_name).at(time_of_day).do(run_full_update_pipeline)

    # Ask if user wants to run immediately
    try:
        run_now = input("\nRun full update pipeline now? [y/N]: ").strip().lower()
    except EOFError:
        # No stdin (cron, systemd, nohup): behave like the default answer "N".
        run_now = ''
    if run_now == 'y':
        run_full_update_pipeline()

    log("\n⏰ Scheduler running... (Ctrl+C to stop)")

    try:
        while True:
            schedule.run_pending()
            time.sleep(60)  # Check every minute
    except KeyboardInterrupt:
        log("\n🛑 Scheduler stopped")

def _print_usage():
    """Print the command-line help text."""
    print("Usage:")
    print("  python3 auto_scrape_favorites.py now          - Run full pipeline now")
    print("  python3 auto_scrape_favorites.py scrape-only  - Just scrape favorites")
    print("  python3 auto_scrape_favorites.py schedule [day] [time]")
    print("                                                 - Start weekly scheduler")
    print("\nExamples:")
    print("  python3 auto_scrape_favorites.py schedule sunday 02:00")
    print("  python3 auto_scrape_favorites.py schedule friday 20:00")


def main(argv=None):
    """Dispatch the command line and return a process exit code.

    Args:
        argv: Argument list including the program name; defaults to
            ``sys.argv``.

    Returns:
        0 on success, 1 when the pipeline or the scrape reported failure,
        2 when the command is not recognised (usage is printed). The
        explicit help spellings print usage and return 0.
    """
    args = sys.argv if argv is None else argv

    if len(args) <= 1:
        # Default: run full pipeline now
        return 0 if run_full_update_pipeline() else 1

    command = args[1].lower()

    if command == 'now':
        # Run full pipeline immediately
        return 0 if run_full_update_pipeline() else 1

    if command == 'scrape-only':
        # Just scrape, no analysis
        return 0 if run_scrape_only() else 1

    if command == 'schedule':
        # Start scheduler
        day = args[2] if len(args) > 2 else "sunday"
        time_str = args[3] if len(args) > 3 else "02:00"
        schedule_weekly_scraping(day, time_str)
        return 0

    _print_usage()
    return 0 if command in ('help', '-h', '--help') else 2


if __name__ == "__main__":
    # Propagate failure to the caller (cron, shell, job runner) through the
    # exit status instead of always exiting 0.
    sys.exit(main())
