#!/usr/bin/env python3
"""
Property Availability Checker
Checks if properties are still available/published and marks unavailable ones for removal
"""
import json
import requests
import time
from pathlib import Path
from datetime import datetime
from bs4 import BeautifulSoup
import pandas as pd

def check_property_availability(url, timeout=10):
    """
    Check if a property listing is still available.

    The page is fetched (following redirects) and classified, in order, by:
    HTTP status code, whether the request was redirected away to a
    search/index page, "unavailable" phrases in the page text, and finally
    the presence of a title element.

    Server errors (5xx), timeouts and connection errors are deliberately
    reported as *available* so that a transient failure never marks a
    listing as removed.

    Args:
        url: Listing URL to fetch.
        timeout: Request timeout in seconds, passed to requests.get.

    Returns:
        dict with:
        - available (bool): True if property is still active
        - status_code (int): HTTP status code (0 if no response was received)
        - reason (str): Human-readable explanation of the verdict
        - checked_at (str): ISO timestamp of check
    """
    from urllib.parse import urlsplit

    def make_result(available, status_code, reason):
        # Single place that defines the shape of the returned dict.
        return {
            'available': available,
            'status_code': status_code,
            'reason': reason,
            'checked_at': datetime.now().isoformat()
        }

    def location_key(address):
        # Comparable form of a URL that ignores scheme, a leading "www.",
        # a trailing slash and the fragment, so that pure canonicalisation
        # redirects (http -> https, adding "/") do not count as "moved".
        parts = urlsplit(address)
        host = parts.netloc.lower()
        if host.startswith('www.'):
            host = host[4:]
        return host, parts.path.rstrip('/'), parts.query

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True)
        status_code = response.status_code

        # Check if page exists
        if status_code == 404:
            return make_result(False, 404, 'Page not found (404)')

        # 410 Gone means permanently deleted
        if status_code == 410:
            return make_result(False, 410, 'Property permanently removed (410 Gone)')

        # 403 Forbidden might indicate removed content
        # NOTE(review): sites may also answer 403 to block automated clients;
        # confirm that treating it as "removed" is intended.
        if status_code == 403:
            return make_result(False, 403, 'Access forbidden (403) - likely removed')

        if status_code >= 500:
            # Assume available if server error
            return make_result(
                True, status_code, f'Server error ({status_code}) - assuming available'
            )

        # Check if redirected to homepage or search results (common for
        # removed listings). Only applies when a redirect actually happened
        # and the final location differs from the requested one; otherwise a
        # listing whose own URL ends in "/" or contains "search" would be
        # wrongly reported as removed.
        final_url = response.url
        landed_elsewhere = bool(response.history) and location_key(final_url) != location_key(url)
        if landed_elsewhere and ('search' in final_url or final_url.endswith('/')):
            return make_result(
                False, status_code, 'Redirected to search/homepage (property removed)'
            )

        # Parse HTML to check for removal indicators
        soup = BeautifulSoup(response.content, 'html.parser')

        # Check for common "not available" messages (English and Dutch).
        # Matching is a plain substring test on the lower-cased page text.
        page_text = soup.get_text().lower()
        unavailable_indicators = [
            'not available',
            'niet beschikbaar',
            'no longer available',
            'niet meer beschikbaar',
            'listing removed',
            'verwijderd',
            'sold',
            'verkocht',
            'under offer',
            'in optie'
        ]

        for indicator in unavailable_indicators:
            if indicator in page_text:
                return make_result(
                    False,
                    status_code,
                    f'Page contains unavailability indicator: "{indicator}"'
                )

        # A real listing page is expected to carry at least a heading or a
        # <title>; a page with neither is treated as removed.
        has_title = soup.find('h1') or soup.find('title')
        if not has_title:
            return make_result(False, status_code, 'Missing essential elements (no title)')

        # If we got here, assume property is still available
        return make_result(True, status_code, 'Property appears to be active')

    except requests.exceptions.Timeout:
        # Assume available if timeout
        return make_result(True, 0, 'Timeout - assuming available')
    except requests.exceptions.RequestException as e:
        # Assume available if connection error
        return make_result(
            True, 0, f'Connection error: {str(e)[:50]} - assuming available'
        )

def check_all_properties(skip_recently_checked=True, recent_threshold_hours=24):
    """
    Check availability of all properties in enriched_data.json

    Workflow:
    1. Load enriched_data.json and drop every property whose URL is listed in
       removed_properties.json (the file is rewritten immediately if anything
       was dropped).
    2. For each remaining property that is not already 'Removed' and was not
       checked recently, call check_property_availability and record the
       result on the property; unavailable ones get status 'Removed'.
    3. Save progress every 10 checks and once more at the end.
    4. If anything became unavailable, call unfavorite_removed_properties.
    5. Write a summary to availability_check_report.json.

    Args:
        skip_recently_checked: Skip properties checked within recent_threshold_hours
        recent_threshold_hours: Hours to consider a check "recent"

    Returns:
        None. Results are written to disk and printed.
    """
    enriched_file = Path("enriched_data.json")

    def save_properties(items):
        # One place for the on-disk format of enriched_data.json.
        with open(enriched_file, 'w', encoding='utf-8') as f:
            json.dump(items, f, indent=2, ensure_ascii=False)

    if not enriched_file.exists():
        print("❌ enriched_data.json not found!")
        return

    print("🔍 Property Availability Checker")
    print("=" * 70)
    print(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Skip recently checked: {skip_recently_checked} (within {recent_threshold_hours}h)")
    print("=" * 70)

    # Load properties
    with open(enriched_file, 'r', encoding='utf-8') as f:
        properties = json.load(f)

    # Load blacklist of manually removed properties
    removed_file = Path("removed_properties.json")
    manually_removed_urls = set()
    if removed_file.exists():
        try:
            with open(removed_file, 'r', encoding='utf-8') as f:
                manually_removed_urls = set(json.load(f))
            if manually_removed_urls:
                print(f"\n🚫 Loaded {len(manually_removed_urls)} manually removed URLs (will be filtered out)")
        except Exception as e:
            # Best effort: an unreadable blacklist must not stop the check.
            print(f"⚠️  Could not load removed_properties.json: {e}")

    # Filter out manually removed properties BEFORE processing
    original_count = len(properties)
    properties = [p for p in properties if p.get('url') not in manually_removed_urls]
    filtered_count = original_count - len(properties)

    if filtered_count > 0:
        print(f"🗑️  Filtered out {filtered_count} manually removed properties")
        # Save the filtered list immediately
        save_properties(properties)
        print("   ✅ Saved filtered data (removed blacklisted properties)")

    total = len(properties)
    checked = 0
    skipped = 0
    newly_unavailable = 0
    still_available = 0
    already_unavailable = 0

    print(f"\n📊 Total properties: {total}")

    for i, prop in enumerate(properties, 1):
        url = prop.get('url', '')
        current_status = prop.get('status', 'Active')
        last_checked = prop.get('availability_last_checked')

        # Skip if already marked as removed
        if current_status == 'Removed':
            already_unavailable += 1
            continue

        # Skip if recently checked
        if skip_recently_checked and last_checked:
            try:
                last_check_time = datetime.fromisoformat(last_checked)
                hours_since_check = (datetime.now() - last_check_time).total_seconds() / 3600
                if hours_since_check < recent_threshold_hours:
                    skipped += 1
                    if i % 20 == 0:
                        print(f"   [{i}/{total}] Skipped {skipped} recently checked properties...")
                    continue
            except (ValueError, TypeError):
                # Invalid or incompatible timestamp (malformed string, wrong
                # type, or timezone-aware value): check anyway. Narrowed from
                # a bare except so Ctrl-C is no longer swallowed here.
                pass

        location = prop.get('location', 'Unknown')
        print(f"\n[{i}/{total}] {location}")
        print(f"   URL: {url}")
        print(f"   Current status: {current_status}")
        print("   🔍 Checking availability...")

        # Check availability
        result = check_property_availability(url)
        checked += 1

        # Update property with check results
        prop['availability_last_checked'] = result['checked_at']
        prop['availability_status_code'] = result['status_code']
        prop['availability_reason'] = result['reason']

        if result['available']:
            still_available += 1
            print(f"   ✅ Available - {result['reason']}")
        else:
            newly_unavailable += 1
            prop['status'] = 'Removed'
            prop['removed_at'] = result['checked_at']
            prop['removal_reason'] = result['reason']
            print(f"   ❌ UNAVAILABLE - {result['reason']}")
            print("   📝 Marked as 'Removed'")

        # Save progress every 10 properties
        if checked % 10 == 0:
            save_properties(properties)
            print(f"\n   💾 Progress saved ({checked} checked)")

        # Rate limiting - be nice to the server
        if i < total:
            time.sleep(2)  # 2 seconds between requests

    # Final save
    print("\n" + "=" * 70)
    print("💾 Saving final results...")
    save_properties(properties)

    # Summary
    print("\n" + "=" * 70)
    print("📊 AVAILABILITY CHECK SUMMARY")
    print("=" * 70)
    print(f"Total properties: {total}")
    print(f"✅ Still available: {still_available}")
    print(f"❌ Newly unavailable: {newly_unavailable}")
    print(f"⚠️  Already marked unavailable: {already_unavailable}")
    print(f"⏭️  Skipped (recently checked): {skipped}")
    print(f"🔍 Actually checked: {checked}")

    if newly_unavailable > 0:
        print(f"\n🗑️  {newly_unavailable} properties marked as 'Removed'")
        print("   They will be filtered out in the map viewer when 'Show Removed' is unchecked")

        # Auto-unfavorite removed properties on Properstar
        unfavorite_removed_properties(properties)

    print("\n✨ Availability check complete!")

    # Generate summary report
    report = {
        'timestamp': datetime.now().isoformat(),
        'total_properties': total,
        'checked': checked,
        'skipped': skipped,
        'still_available': still_available,
        'newly_unavailable': newly_unavailable,
        'already_unavailable': already_unavailable,
        # max(1, checked) avoids division by zero when nothing was checked
        'availability_rate': f"{(still_available / max(1, checked)) * 100:.1f}%"
    }

    with open('availability_check_report.json', 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2)

    print("\n📄 Report saved to: availability_check_report.json")
    print(f"   Availability rate: {report['availability_rate']}")

def remove_404_properties(backup=True):
    """
    Remove properties that returned 404 (Page Not Found) based on Status_404 column
    in extracted_property_urls.csv. Removes from:
    - analysis_output.csv
    - extracted_property_urls.csv
    - enriched_data.json

    Rows are matched on the 'URL' column of the CSV files and on the 'url'
    key of each entry in enriched_data.json. Files that do not exist are
    skipped.

    Args:
        backup: Create backup before removing (default: True). Backups are
            byte-for-byte copies named <stem>_backup_<YYYYmmdd_HHMMSS>.<ext>.

    Returns:
        None. Progress is printed and the data files are rewritten in place.
    """
    import shutil

    print("🗑️  404 Property Removal Tool")
    print("=" * 70)

    breadcrumb_file = Path("extracted_property_urls.csv")
    analysis_file = Path("analysis_output.csv")
    enriched_file = Path("enriched_data.json")

    # Check if extracted_property_urls.csv exists
    if not breadcrumb_file.exists():
        print("❌ extracted_property_urls.csv not found!")
        return

    # Load breadcrumb data with 404 flags
    try:
        df_breadcrumbs = pd.read_csv(breadcrumb_file)
    except Exception as e:
        print(f"❌ Error reading extracted_property_urls.csv: {e}")
        return

    if 'Status_404' not in df_breadcrumbs.columns:
        print("ℹ️  No Status_404 column found in extracted_property_urls.csv")
        print("   Run: python3 extract_breadcrumbs.py to check for 404 pages")
        return

    if 'URL' not in df_breadcrumbs.columns:
        # Without a URL column there is nothing to match on; report instead
        # of failing with a KeyError.
        print("❌ No URL column found in extracted_property_urls.csv")
        return

    # Find 404 URLs. Missing URLs are dropped so that NaN never ends up in
    # the removal set (it would otherwise match rows with an empty URL).
    flagged = df_breadcrumbs.loc[df_breadcrumbs['Status_404'].eq(True), 'URL']
    page_404_urls = set(flagged.dropna().tolist())

    if not page_404_urls:
        print("✅ No 404 pages found")
        return

    print(f"Found {len(page_404_urls)} properties with 404 status:")
    for url in list(page_404_urls)[:5]:
        print(f"   - {url}")
    if len(page_404_urls) > 5:
        print(f"   ... and {len(page_404_urls) - 5} more")

    # Create backups if requested. The files are copied rather than parsed
    # and re-serialised, so the backup is an exact copy of what was on disk
    # (a pandas round-trip can change values, e.g. "007" becomes 7).
    if backup:
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        for source in (analysis_file, breadcrumb_file, enriched_file):
            if source.exists():
                backup_name = f"{source.stem}_backup_{timestamp}{source.suffix}"
                shutil.copy2(source, backup_name)
                print(f"✅ Backup created: {backup_name}")

    # 1. Remove from analysis_output.csv
    if analysis_file.exists():
        df_analysis = pd.read_csv(analysis_file)
        if 'URL' in df_analysis.columns:
            original_count = len(df_analysis)
            df_analysis = df_analysis[~df_analysis['URL'].isin(page_404_urls)]
            new_count = len(df_analysis)
            df_analysis.to_csv(analysis_file, index=False)
            print(f"\n📊 analysis_output.csv: {original_count} → {new_count} ({original_count - new_count} removed)")
        else:
            print("\n⚠️  analysis_output.csv has no URL column - skipped")

    # 2. Remove from extracted_property_urls.csv
    original_count = len(df_breadcrumbs)
    df_breadcrumbs = df_breadcrumbs[~df_breadcrumbs['URL'].isin(page_404_urls)]
    new_count = len(df_breadcrumbs)
    df_breadcrumbs.to_csv(breadcrumb_file, index=False)
    print(f"📊 extracted_property_urls.csv: {original_count} → {new_count} ({original_count - new_count} removed)")

    # 3. Remove from enriched_data.json
    if enriched_file.exists():
        with open(enriched_file, 'r', encoding='utf-8') as f:
            enriched_data = json.load(f)
        original_count = len(enriched_data)
        enriched_data = [p for p in enriched_data if p.get('url') not in page_404_urls]
        new_count = len(enriched_data)
        with open(enriched_file, 'w', encoding='utf-8') as f:
            json.dump(enriched_data, f, indent=2, ensure_ascii=False)
        print(f"📊 enriched_data.json: {original_count} → {new_count} ({original_count - new_count} removed)")

    print(f"\n✅ {len(page_404_urls)} 404 properties permanently removed from all data files")
    print("   These properties returned Page Not Found and no longer exist")

def remove_unavailable_properties(backup=True):
    """
    Delete every property whose status is 'Removed' from enriched_data.json.

    The user is asked for confirmation on stdin; only the answer 'y'
    (case-insensitive) triggers the rewrite of the file. When requested, a
    timestamped backup of the full list is written before the prompt.

    Args:
        backup: Create backup before removing (default: True)
    """
    data_path = Path("enriched_data.json")

    if not data_path.exists():
        print("❌ enriched_data.json not found!")
        return

    print("🗑️  Property Removal Tool")
    print("=" * 70)

    # Read the complete property list
    with open(data_path, 'r', encoding='utf-8') as source:
        everything = json.load(source)

    # Split off the entries that stay; the rest are the ones marked 'Removed'
    keepers = [entry for entry in everything if entry.get('status') != 'Removed']
    removed_count = len(everything) - len(keepers)

    if removed_count == 0:
        print("✅ No properties marked as 'Removed'")
        return

    print(f"Found {removed_count} properties marked as 'Removed'")

    # Keep a copy of the untouched list before anything is deleted
    if backup:
        stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        backup_file = f"enriched_data_backup_{stamp}.json"
        with open(backup_file, 'w', encoding='utf-8') as target:
            json.dump(everything, target, indent=2, ensure_ascii=False)
        print(f"✅ Backup created: {backup_file}")

    print(f"\n📊 Before: {len(everything)} properties")
    print(f"📊 After:  {len(keepers)} properties")
    print(f"🗑️  Removed: {removed_count} properties")

    # Ask before overwriting the data file
    answer = input(f"\n⚠️  Permanently remove {removed_count} properties? [y/N]: ").lower()

    if answer != 'y':
        print("\n❌ Removal cancelled")
        return

    with open(data_path, 'w', encoding='utf-8') as target:
        json.dump(keepers, target, indent=2, ensure_ascii=False)
    print(f"\n✅ {removed_count} properties permanently removed")
    print(f"✨ enriched_data.json now contains {len(keepers)} active properties")

def unfavorite_removed_properties(properties):
    """
    Automatically unfavorite properties that are marked as 'Removed' on Properstar

    For every property in `properties` with status 'Removed', the sibling
    script unfavorite_property.py is run as a subprocess with the property
    URL as its only argument; exit code 0 counts as success. Afterwards the
    properties that were successfully unfavorited are permanently deleted
    from enriched_data.json. Properties whose unfavorite failed or timed out
    stay in the file (still marked 'Removed') so they can be retried on a
    later run.

    Nothing is done when there are no removed properties, when auth.json is
    missing from the current directory, or when unfavorite_property.py is
    not found next to this file.

    Args:
        properties: List of property dictionaries

    Returns:
        None.
    """
    import subprocess
    import sys

    # Find removed properties
    removed = [p for p in properties if p.get('status') == 'Removed']

    if not removed:
        return

    print("\n" + "=" * 70)
    print("🗑️  AUTO-UNFAVORITING REMOVED PROPERTIES")
    print("=" * 70)
    print(f"Found {len(removed)} removed properties to unfavorite on Properstar")

    # Check if auth.json exists
    auth_file = Path("auth.json")
    if not auth_file.exists():
        print("⚠️  auth.json not found - skipping auto-unfavorite")
        print("   Login required: ./login.sh or double-click 'Login to Properstar.command'")
        return

    # Check if unfavorite script exists
    unfavorite_script = Path(__file__).parent / "unfavorite_property.py"
    if not unfavorite_script.exists():
        print("⚠️  unfavorite_property.py not found - skipping auto-unfavorite")
        return

    print("\n🔄 Unfavoriting removed properties...")

    # Run the helper with the interpreter executing this script so it sees
    # the same environment and installed packages.
    interpreter = sys.executable or 'python3'

    successes = 0
    failures = 0
    unfavorited_urls = set()  # URLs whose unfavorite call exited with 0

    for i, prop in enumerate(removed, 1):
        url = prop.get('url')
        location = prop.get('location', 'Unknown')

        print(f"\n[{i}/{len(removed)}] {location}")
        print(f"   URL: {url}")

        try:
            # Run unfavorite script synchronously
            result = subprocess.run(
                [interpreter, str(unfavorite_script), url],
                capture_output=True,
                text=True,
                timeout=60
            )
        except subprocess.TimeoutExpired:
            print("   ⏱️  Timeout - skipping")
            failures += 1
        except Exception as e:
            # Includes a missing URL (None is not a valid argument) and
            # failures to start the subprocess.
            print(f"   ❌ Error: {str(e)[:100]}")
            failures += 1
        else:
            if result.returncode == 0:
                print("   ✅ Unfavorited successfully")
                successes += 1
                unfavorited_urls.add(url)
            else:
                print("   ❌ Failed to unfavorite")
                print(f"   Error: {result.stderr[:100]}")
                failures += 1

        # Rate limiting - be nice to Properstar
        if i < len(removed):
            time.sleep(3)  # 3 seconds between unfavorites

    # Summary
    print("\n" + "=" * 70)
    print("📊 AUTO-UNFAVORITE SUMMARY")
    print("=" * 70)
    print(f"✅ Successfully unfavorited: {successes}")
    print(f"❌ Failed: {failures}")
    print(f"📊 Total processed: {len(removed)}")
    print("\n✨ Auto-unfavorite complete!")
    print("   Removed properties are now unfavorited on Properstar")

    # Permanently remove unfavorited properties from enriched_data.json
    if successes > 0:
        print("\n" + "=" * 70)
        print("🗑️  PERMANENTLY REMOVING FROM enriched_data.json")
        print("=" * 70)

        enriched_file = Path("enriched_data.json")
        if enriched_file.exists():
            # Read current data
            with open(enriched_file, 'r', encoding='utf-8') as f:
                all_properties = json.load(f)

            original_count = len(all_properties)

            # Drop only the removed properties that were actually
            # unfavorited; deleting every 'Removed' entry would lose the
            # record of listings that are still favorited on Properstar.
            active_properties = [
                p for p in all_properties
                if not (p.get('status') == 'Removed' and p.get('url') in unfavorited_urls)
            ]
            new_count = len(active_properties)

            # Save filtered data
            with open(enriched_file, 'w', encoding='utf-8') as f:
                json.dump(active_properties, f, indent=2, ensure_ascii=False)

            print("✅ Removed properties permanently deleted from enriched_data.json")
            print(f"   {original_count} → {new_count} properties ({original_count - new_count} removed)")

if __name__ == "__main__":
    import sys

    # Command-line entry point: the first argument (if any) selects the mode.
    separator = "=" * 70

    print("\n" + separator)
    print("🔍 FarmMatch Property Availability Checker")
    print(separator)

    mode = sys.argv[1] if len(sys.argv) > 1 else None

    if mode == '--remove-404':
        # Drop pages flagged as 404 from every data file
        remove_404_properties()
    elif mode == '--remove':
        # Permanently delete the properties marked as 'Removed'
        remove_unavailable_properties()
    elif mode == '--force':
        # Re-check everything, ignoring the time of the last check
        check_all_properties(skip_recently_checked=False)
    else:
        # Default (no or unrecognised argument): check availability and
        # skip properties checked within the last 24 hours
        check_all_properties(skip_recently_checked=True, recent_threshold_hours=24)

        # Usage hints shown only after a normal run
        for line in (
            "\n" + separator,
            "💡 NEXT STEPS:",
            separator,
            "1. Review removed properties in the map viewer",
            "2. To permanently delete them, run:",
            "   python3 check_availability.py --remove",
            "\n3. To remove 404 pages (from breadcrumb extraction):",
            "   python3 check_availability.py --remove-404",
            "\n4. To force check all properties (ignore recent checks):",
            "   python3 check_availability.py --force",
        ):
            print(line)
