#!/usr/bin/env python3
"""
Validate that all properties have breadcrumbs and coordinates.
Identifies missing data and suggests fixes.
"""
import pandas as pd
import sys

def _property_id(url):
    """Return the last non-empty path segment of *url* for display.

    Trailing slashes are stripped first so ``.../123/`` yields ``123``
    instead of an empty string.
    """
    return str(url).rstrip('/').split('/')[-1]


def _load_csv(path, required_columns, hint=None):
    """Read *path* with pandas, or print an error and return ``None``.

    Args:
        path: CSV file to read.
        required_columns: Column names that must be present in the file.
        hint: Optional command printed as a "Run: ..." suggestion when the
            file does not exist.

    Returns:
        The loaded DataFrame, or ``None`` when the file is missing, empty,
        or lacks one of the required columns.
    """
    try:
        df = pd.read_csv(path)
    except FileNotFoundError:
        print(f"❌ ERROR: {path} not found!")
        if hint:
            print(f"   Run: {hint}")
        return None
    except pd.errors.EmptyDataError:
        print(f"❌ ERROR: {path} is empty!")
        return None

    absent = [col for col in required_columns if col not in df.columns]
    if absent:
        print(f"❌ ERROR: {path} is missing column(s): {', '.join(absent)}")
        return None
    return df


def _print_sample(urls, limit, describe):
    """Print up to *limit* numbered entries, then an "... and N more" line.

    Args:
        urls: Ordered list of URLs to sample from.
        limit: Maximum number of entries to print.
        describe: Callable mapping a URL to the text shown after the number.
    """
    for i, url in enumerate(urls[:limit], 1):
        print(f"      {i}. {describe(url)}")
    if len(urls) > limit:
        print(f"      ... and {len(urls) - limit} more")


def validate(analysis_path: str = 'analysis_output.csv',
             breadcrumb_path: str = 'extracted_property_urls.csv') -> bool:
    """Report breadcrumb and coordinate coverage for all properties.

    Prints a three-part report (breadcrumb coverage, geocoding coverage,
    coordinates without breadcrumbs) followed by recommended actions.

    Args:
        analysis_path: CSV with ``URL``, ``Latitude`` and ``Longitude``
            columns (``LocationSource`` is shown when present).
        breadcrumb_path: CSV with a ``URL`` column; its ``Breadcrumb``
            column is read when listing properties that need re-geocoding.

    Returns:
        ``True`` when every analysed URL has a breadcrumb entry and no
        URL with a breadcrumb entry is missing coordinates. ``False``
        otherwise, including when an input file cannot be loaded.
    """
    analysis_df = _load_csv(analysis_path, ('URL', 'Latitude', 'Longitude'))
    if analysis_df is None:
        return False

    breadcrumb_df = _load_csv(
        breadcrumb_path, ('URL',), hint='python3 extract_breadcrumbs.py'
    )
    if breadcrumb_df is None:
        return False

    print("📊 Validation Report")
    print("=" * 70)

    # Blank URL cells load as NaN; they identify no property and would
    # break the string handling below, so leave them out of the URL sets.
    all_urls = set(analysis_df['URL'].dropna())
    breadcrumb_urls = set(breadcrumb_df['URL'].dropna())

    # Sorted so the printed samples are the same on every run.
    missing_breadcrumbs = sorted(all_urls - breadcrumb_urls, key=str)
    # Only count breadcrumbs that belong to an analysed property, so that
    # "Total" minus "With" always equals "Missing".
    with_breadcrumbs = all_urls & breadcrumb_urls

    print("\n1. BREADCRUMB COVERAGE:")
    print(f"   Total properties: {len(all_urls)}")
    print(f"   With breadcrumbs: {len(with_breadcrumbs)}")
    print(f"   Missing breadcrumbs: {len(missing_breadcrumbs)}")

    if missing_breadcrumbs:
        print("\n   ⚠️  Properties without breadcrumbs:")
        _print_sample(missing_breadcrumbs, 10, _property_id)

    # A property is geocoded only when BOTH coordinates are present.
    coords_present = (
        analysis_df['Latitude'].notna() & analysis_df['Longitude'].notna()
    )
    has_coords = analysis_df[coords_present]
    no_coords = analysis_df[~coords_present]

    print("\n2. GEOCODING COVERAGE:")
    print(f"   With coordinates: {len(has_coords)}")
    print(f"   Missing coordinates: {len(no_coords)}")

    # Properties that have a breadcrumb but still lack coordinates.
    no_coords_urls = set(no_coords['URL'].dropna())
    has_breadcrumb_no_coords = sorted(no_coords_urls & breadcrumb_urls, key=str)

    def describe_breadcrumb(url):
        # First matching row wins when a URL is listed more than once.
        matches = breadcrumb_df.loc[breadcrumb_df['URL'] == url, 'Breadcrumb']
        return f"{_property_id(url)}: {matches.iloc[0]}"

    if has_breadcrumb_no_coords:
        print(f"\n   ⚠️  Properties with breadcrumbs but NO coordinates: {len(has_breadcrumb_no_coords)}")
        print("      These should be re-geocoded!")
        _print_sample(has_breadcrumb_no_coords, 5, describe_breadcrumb)

    # Coordinates that exist without a breadcrumb need manual review.
    print("\n3. COORDINATE VALIDATION:")

    has_coords_urls = set(has_coords['URL'].dropna())
    has_coords_no_breadcrumb = sorted(has_coords_urls - breadcrumb_urls, key=str)

    def describe_coords(url):
        row = has_coords[has_coords['URL'] == url].iloc[0]
        lat, lon = row['Latitude'], row['Longitude']
        source = row.get('LocationSource', 'unknown')
        return f"{_property_id(url)}: {lat:.4f}, {lon:.4f} (source: {source})"

    if has_coords_no_breadcrumb:
        print(f"   ⚠️  Properties with coordinates but NO breadcrumbs: {len(has_coords_no_breadcrumb)}")
        print("      These may have old/wrong coordinates!")
        _print_sample(has_coords_no_breadcrumb, 5, describe_coords)

    print("\n" + "=" * 70)

    # Recommendations
    print("\n💡 RECOMMENDED ACTIONS:")

    if missing_breadcrumbs:
        print("   1. Re-run breadcrumb extraction:")
        print("      python3 extract_breadcrumbs.py")

    if has_breadcrumb_no_coords:
        print("   2. Re-run geocoding for properties with breadcrumbs:")
        print("      python3 geocode_with_breadcrumbs.py")

    if has_coords_no_breadcrumb:
        print("   3. Review properties with coordinates but no breadcrumbs")
        print("      These may have cached wrong coordinates")

    print("   4. After fixes, regenerate enriched data:")
    print("      python3 parse_criteria.py")

    print()

    # Success means nothing is missing a breadcrumb and nothing with a
    # breadcrumb is missing coordinates.
    return not missing_breadcrumbs and not has_breadcrumb_no_coords

if __name__ == '__main__':
    # Exit status 0 when validation passes, 1 when any issue is reported.
    sys.exit(0 if validate() else 1)
