#!/usr/bin/env python3
"""
Extract GPS coordinates and KPIs directly from property pages
Priority 1 improvements from IMPROVEMENTS_ROADMAP.md
"""
import asyncio
from playwright.async_api import async_playwright
import pandas as pd
import re
import json

# Signed decimal number as written in map URLs and inline scripts (49.2138787, -1.24, .5).
# Stricter than a bare character class so that "-" or "." alone can never reach float().
_COORD_NUM = r'-?(?:\d+(?:\.\d*)?|\.\d+)'

# Google Maps embed URLs: ?q=lat,lon  ?center=lat,lon  ?ll=lat,lon  @lat,lon
_GOOGLE_SRC_PATTERNS = tuple(
    re.compile(rf'{prefix}({_COORD_NUM}),({_COORD_NUM})')
    for prefix in (r'[?&]q=', r'[?&]center=', r'[?&]ll=', r'@')
)

# OpenStreetMap URLs: ?mlat=lat&mlon=lon, or the embed form &marker=lat,lon
# (the comma is frequently percent-encoded as %2C).
_OSM_SRC_PATTERNS = (
    re.compile(rf'[?&]mlat=({_COORD_NUM})&mlon=({_COORD_NUM})'),
    re.compile(rf'[?&]marker=({_COORD_NUM})(?:,|%2C)({_COORD_NUM})', re.IGNORECASE),
)

# Inline-script patterns, tried in this order for every <script> element:
# (compiled regex, label used in the log line, source tag returned to the caller)
_SCRIPT_PATTERNS = (
    # new google.maps.LatLng(49.2138787, -1.2426305)
    (
        re.compile(rf'LatLng\s*\(\s*({_COORD_NUM})\s*,\s*({_COORD_NUM})\s*\)'),
        'JavaScript LatLng',
        'js_latlng',
    ),
    # {lat: 49.2138787, lng: -1.2426305} style objects (Leaflet, Mapbox, ...)
    (
        re.compile(
            rf'["\']?(?:lat|latitude)["\']?\s*:\s*({_COORD_NUM})'
            rf'[,\s]+["\']?(?:lng|lon|longitude)["\']?\s*:\s*({_COORD_NUM})',
            re.IGNORECASE,
        ),
        'JavaScript object',
        'js_object',
    ),
    # L.marker([49.2138787, -1.2426305])
    (
        re.compile(rf'L\.marker\s*\(\s*\[\s*({_COORD_NUM})\s*,\s*({_COORD_NUM})\s*\]'),
        'Leaflet marker',
        'leaflet_marker',
    ),
)


def _valid_coords(lat_raw, lon_raw):
    """Return (lat, lon) as floats, or None if unparsable, out of range, or exactly (0, 0)."""
    try:
        lat, lon = float(lat_raw), float(lon_raw)
    except (TypeError, ValueError):
        return None
    # NaN fails both comparisons, so it is rejected here as well.
    if not (-90.0 <= lat <= 90.0 and -180.0 <= lon <= 180.0):
        return None
    # (0, 0) is treated as an uninitialised-map placeholder, not a real location.
    if lat == 0.0 and lon == 0.0:
        return None
    return lat, lon


def _first_valid_match(patterns, text):
    """Return the first valid (lat, lon) captured by any of the compiled patterns, else None."""
    for pattern in patterns:
        for match in pattern.finditer(text):
            coords = _valid_coords(*match.groups())
            if coords is not None:
                return coords
    return None


def _iter_json_ld_geo(node):
    """Yield raw (latitude, longitude) values from every 'geo' object nested in JSON-LD data."""
    if isinstance(node, dict):
        geo = node.get('geo')
        if isinstance(geo, dict) and 'latitude' in geo and 'longitude' in geo:
            yield geo['latitude'], geo['longitude']
        for value in node.values():
            yield from _iter_json_ld_geo(value)
    elif isinstance(node, list):
        for item in node:
            yield from _iter_json_ld_geo(item)


def _found(coords, label, source):
    """Log a successful extraction and build the (lat, lon, source) return tuple."""
    lat, lon = coords
    print(f"      ✅ GPS from {label}: {lat}, {lon}")
    return lat, lon, source


async def extract_gps_coordinates(page, url):
    """
    Extract GPS coordinates from embedded maps and JavaScript.

    Sources are tried in this order and the first valid hit wins:
      1. Google Maps iframe ``src``        -> 'google_maps_iframe'
      2. OpenStreetMap iframe ``src``      -> 'osm_iframe'
      3. Inline ``<script>`` contents      -> 'js_latlng' / 'js_object' / 'leaflet_marker'
      4. ``data-lat`` / ``data-lng`` style attributes on a div -> 'div_data_attrs'
      5. JSON-LD ``geo`` objects (any ld+json script, nested or in lists) -> 'json_ld'

    A candidate is only accepted when both values parse as numbers, lie within
    [-90, 90] / [-180, 180], and are not exactly (0, 0); rejected candidates are
    skipped and the search continues instead of aborting.

    Args:
        page: object exposing the async ``query_selector`` / ``query_selector_all``
            methods used below (the caller passes a Playwright page).
        url: not used by the extraction; kept so the call signature stays stable.

    Returns: (lat, lon, source) or (None, None, None)
    """
    try:
        # Method 1: Check for Google Maps iframe
        map_iframe = await page.query_selector(
            "iframe[src*='maps.google'], iframe[src*='google.com/maps']"
        )
        if map_iframe:
            src = await map_iframe.get_attribute('src')
            if src:
                coords = _first_valid_match(_GOOGLE_SRC_PATTERNS, src)
                if coords is not None:
                    return _found(coords, 'Google Maps iframe', 'google_maps_iframe')

        # Method 2: Check for OpenStreetMap iframe
        osm_iframe = await page.query_selector("iframe[src*='openstreetmap.org']")
        if osm_iframe:
            src = await osm_iframe.get_attribute('src')
            if src:
                coords = _first_valid_match(_OSM_SRC_PATTERNS, src)
                if coords is not None:
                    return _found(coords, 'OpenStreetMap iframe', 'osm_iframe')

        # Method 3: Search JavaScript for GPS coordinates
        for script in await page.query_selector_all("script"):
            try:
                text = await script.inner_text()
            except Exception:
                # Best effort: an unreadable script element must not stop the scan.
                # (Exception, not a bare except, so task cancellation still propagates.)
                continue
            if not text:
                continue
            for pattern, label, source in _SCRIPT_PATTERNS:
                coords = _first_valid_match((pattern,), text)
                if coords is not None:
                    return _found(coords, label, source)

        # Method 4: Check for data attributes on map div
        map_div = await page.query_selector("div[data-lat], div[data-latitude]")
        if map_div:
            lat_attr = (
                await map_div.get_attribute('data-lat')
                or await map_div.get_attribute('data-latitude')
            )
            lon_attr = (
                await map_div.get_attribute('data-lng')
                or await map_div.get_attribute('data-lon')
                or await map_div.get_attribute('data-longitude')
            )
            if lat_attr and lon_attr:
                coords = _valid_coords(lat_attr, lon_attr)
                if coords is not None:
                    return _found(coords, 'div data attributes', 'div_data_attrs')

        # Method 5: Check JSON-LD structured data (every ld+json script on the page)
        for ld_script in await page.query_selector_all("script[type='application/ld+json']"):
            try:
                data = json.loads(await ld_script.inner_text())
            except Exception:
                # Malformed or unreadable JSON-LD: skip this script, try the next one.
                continue
            for lat_raw, lon_raw in _iter_json_ld_geo(data):
                coords = _valid_coords(lat_raw, lon_raw)
                if coords is not None:
                    return _found(coords, 'JSON-LD', 'json_ld')

        return None, None, None

    except Exception as e:
        # Extraction is best-effort per page: report and let the caller move on.
        print(f"      ⚠️ Error extracting GPS: {str(e)[:80]}")
        return None, None, None


def _parse_localized_number(raw):
    """
    Parse a number written with European or English separators.

    Rules:
      * both ',' and '.' present -> the right-most one is the decimal mark
        ("1.250,50" and "1,250.50" both give 1250.5);
      * a single kind of separator followed only by 3-digit groups is a
        thousands separator ("250.000" -> 250000, "1,250,000" -> 1250000);
      * otherwise a single separator is the decimal mark ("1,5" -> 1.5).

    Leading/trailing separators (sentence punctuation, "250.000,-") are ignored.
    Returns a float, or None when the text cannot be interpreted.
    """
    text = raw.strip().strip('.,')
    if not text:
        return None

    comma_at, dot_at = text.rfind(','), text.rfind('.')
    if comma_at != -1 and dot_at != -1:
        if comma_at > dot_at:
            text = text.replace('.', '').replace(',', '.')
        else:
            text = text.replace(',', '')
    elif comma_at != -1 or dot_at != -1:
        sep = ',' if comma_at != -1 else '.'
        head, *groups = text.split(sep)
        is_grouped = (
            1 <= len(head) <= 3
            and not head.startswith('0')  # "0.250" is a decimal, not 250
            and all(len(group) == 3 for group in groups)
        )
        if is_grouped:
            text = head + ''.join(groups)
        elif len(groups) == 1:
            text = f"{head}.{groups[0]}"
        else:
            return None

    try:
        return float(text)
    except ValueError:
        return None


def _first_realistic_price(page_text, price_patterns, minimum=10000):
    """Return the first price above `minimum` found by the patterns (in order), else None."""
    for pattern in price_patterns:
        # Scan every occurrence: small amounts (fees, monthly costs) often
        # appear on the page before the asking price.
        for match in re.finditer(pattern, page_text):
            value = _parse_localized_number(match.group(1))
            if value is not None and value > minimum:
                return int(value)
    return None


async def extract_property_kpis(page, url):
    """
    Extract property KPIs from the visible page text.

    The text of ``<body>`` is searched with Dutch/English/French regex patterns.
    Only KPIs that were actually found are present in the result.

    Args:
        page: object exposing an async ``inner_text(selector)`` method
            (the caller passes a Playwright page).
        url: not used by the extraction; kept so the call signature stays stable.

    Returns: dict with any of
        'land_size' (float, m²; hectares are converted),
        'building_size' (float, m²), 'bedrooms' (int), 'bathrooms' (int),
        'price' (int, only amounts above 10000 are accepted).
        On an unexpected error the KPIs collected so far are returned.
    """
    kpis = {}

    try:
        page_text = await page.inner_text("body")

        # Extract land size: (pattern, factor converting the matched unit to m²).
        # The factor is explicit so unit handling does not depend on the regex text.
        land_patterns = [
            (r'(\d+[\d,\.]*)\s*(?:m2|m²|vierkante meter)', 1),
            (r'(?:perceel|grond|terrein)[:\s]+(\d+[\d,\.]*)\s*m', 1),
            (r'land[:\s]+(\d+[\d,\.]*)\s*(?:m2|m²)', 1),
            (r'(\d+[\d,\.]*)\s*(?:hectare|ha)(?!\w)', 10000),  # hectares
        ]

        for pattern, to_m2 in land_patterns:
            match = re.search(pattern, page_text, re.IGNORECASE)
            if not match:
                continue
            value = _parse_localized_number(match.group(1))
            if value is None:
                continue
            size = value * to_m2
            kpis['land_size'] = size
            print(f"      📏 Land size: {size} m²")
            break

        # Extract building size
        building_patterns = [
            r'woonoppervlakte[:\s]*(\d+[\d,\.]*)\s*m',
            r'living\s+area[:\s]*(\d+[\d,\.]*)\s*m',
            r'surface\s+habitable[:\s]*(\d+[\d,\.]*)\s*m',
        ]

        for pattern in building_patterns:
            match = re.search(pattern, page_text, re.IGNORECASE)
            if not match:
                continue
            value = _parse_localized_number(match.group(1))
            if value is None:
                continue
            kpis['building_size'] = value
            print(f"      🏠 Building size: {kpis['building_size']} m²")
            break

        # Extract bedrooms
        bedroom_patterns = [
            r'(\d+)\s*(?:slaapkamer|bedroom|chambre)',
            r'(?:slaapkamers?|bedrooms?|chambres?)[:\s]*(\d+)',
        ]

        for pattern in bedroom_patterns:
            match = re.search(pattern, page_text, re.IGNORECASE)
            if match:
                kpis['bedrooms'] = int(match.group(1))
                print(f"      🛏️  Bedrooms: {kpis['bedrooms']}")
                break

        # Extract bathrooms
        bathroom_patterns = [
            r'(\d+)\s*(?:badkamer|bathroom|salle de bain)',
            r'(?:badkamers?|bathrooms?|salles? de bains?)[:\s]*(\d+)',
        ]

        for pattern in bathroom_patterns:
            match = re.search(pattern, page_text, re.IGNORECASE)
            if match:
                kpis['bathrooms'] = int(match.group(1))
                print(f"      🚿 Bathrooms: {kpis['bathrooms']}")
                break

        # Extract price: only accept if it looks like a realistic price (> 10000)
        price_patterns = [
            r'€\s*([\d,\.]+)',
            r'EUR\s*([\d,\.]+)',
            r'([\d,\.]+)\s*(?:€|EUR|euro)',
        ]

        price = _first_realistic_price(page_text, price_patterns)
        if price is not None:
            kpis['price'] = price
            print(f"      💰 Price: €{price:,}")

        return kpis

    except Exception as e:
        # Best-effort per page: report and return whatever was collected.
        print(f"      ⚠️ Error extracting KPIs: {str(e)[:80]}")
        return kpis


async def main():
    """
    Main extraction loop.

    Reads 'analysis_output.csv', selects every row whose 'Latitude', 'land_size'
    or 'price' is missing, loads each of those property URLs in a headless
    Chromium page, and fills the gaps with the values returned by
    extract_gps_coordinates() and extract_property_kpis().

    Only empty cells are filled: values already present in the CSV are kept.
    The CSV is rewritten in place every 10 properties and once more when the
    loop ends, including when it ends because of an interruption or an error.
    """
    csv_path = 'analysis_output.csv'

    # Load properties
    df = pd.read_csv(csv_path)
    print(f"📊 Total properties: {len(df)}")

    # Find properties missing coordinates
    missing_coords = df[df['Latitude'].isna()]
    print(f"📊 Properties missing coordinates: {len(missing_coords)}")

    # Find properties missing KPIs. A column that does not exist yet counts as
    # missing for every row; the mask is built on df.index so it aligns row-for-row.
    all_missing = pd.Series(True, index=df.index)
    land_size_missing = df['land_size'].isna() if 'land_size' in df.columns else all_missing
    price_missing = df['price'].isna() if 'price' in df.columns else all_missing
    missing_kpis = df[land_size_missing | price_missing]
    print(f"📊 Properties missing KPIs: {len(missing_kpis)}")

    # Get all unique URLs to process. Rows without a URL are dropped: a NaN in
    # the set would make sorted() and the string handling below raise.
    urls_to_process = set(missing_coords['URL'].dropna().tolist())
    urls_to_process.update(missing_kpis['URL'].dropna().tolist())

    print(f"📊 Total properties to scrape: {len(urls_to_process)}")

    if not urls_to_process:
        print("✅ All properties have GPS and KPIs!")
        return

    # Initialize new columns if they don't exist
    for column in ('GPSSource', 'land_size', 'building_size', 'bedrooms', 'bathrooms', 'price'):
        if column not in df.columns:
            df[column] = None
    # An all-empty GPSSource column is read back from CSV as float64; make it an
    # object column so string source labels can be stored in it.
    df['GPSSource'] = df['GPSSource'].astype(object)

    gps_found = 0
    kpis_found = 0
    ordered_urls = sorted(urls_to_process)

    # Start scraping
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context()
            page = await context.new_page()

            for i, url in enumerate(ordered_urls, 1):
                # Last path segment, tolerating a trailing slash (used for logging only).
                prop_id = url.rstrip('/').split('/')[-1]
                print(f"\n[{i}/{len(ordered_urls)}] {prop_id}")

                try:
                    # Load page
                    await page.goto(url, wait_until="domcontentloaded", timeout=30000)
                    await page.wait_for_timeout(2000)

                    # Get index in dataframe (first row carrying this URL)
                    idx = df[df['URL'] == url].index[0]

                    # Extract GPS if missing
                    if pd.isna(df.at[idx, 'Latitude']):
                        lat, lon, source = await extract_gps_coordinates(page, url)
                        # Compare against None: 0.0 is a legitimate latitude/longitude.
                        # Only the (0, 0) pair is treated as "no location".
                        has_coords = lat is not None and lon is not None
                        if has_coords and not (lat == 0 and lon == 0):
                            df.at[idx, 'Latitude'] = lat
                            df.at[idx, 'Longitude'] = lon
                            df.at[idx, 'GPSSource'] = source
                            gps_found += 1

                    # Extract KPIs
                    kpis = await extract_property_kpis(page, url)
                    for key, value in kpis.items():
                        if value is None:
                            continue
                        # Fill gaps only: a heuristic scrape must not overwrite a
                        # value that is already present in the CSV.
                        if key in df.columns and pd.notna(df.at[idx, key]):
                            continue
                        df.at[idx, key] = value
                        kpis_found += 1

                    # Save progress every 10 properties
                    if i % 10 == 0:
                        df.to_csv(csv_path, index=False)
                        print(f"\n   💾 Progress saved")

                    # Rate limiting
                    await page.wait_for_timeout(500)

                except Exception as e:
                    # One failing page must not stop the whole run.
                    print(f"   ❌ Error: {str(e)[:100]}")
                    continue
        finally:
            # Final save happens here so results survive an interrupted run;
            # save before closing the browser in case closing itself fails.
            df.to_csv(csv_path, index=False)
            await browser.close()

    print("\n" + "="*70)
    print("✅ GPS AND KPI EXTRACTION COMPLETE")
    print("="*70)
    print(f"Properties processed: {len(urls_to_process)}")
    print(f"GPS coordinates found: {gps_found}")
    print(f"KPIs extracted: {kpis_found}")
    print(f"\nFinal statistics:")
    print(f"   Total properties: {len(df)}")
    print(f"   With coordinates: {df['Latitude'].notna().sum()} ({df['Latitude'].notna().sum()/len(df)*100:.1f}%)")
    print(f"   With land size: {df['land_size'].notna().sum()} ({df['land_size'].notna().sum()/len(df)*100:.1f}%)")
    print(f"   With price: {df['price'].notna().sum()} ({df['price'].notna().sum()/len(df)*100:.1f}%)")

    print("\n💡 Next steps:")
    print("   1. Run: python3 parse_criteria.py  # Update enriched_data.json")
    print("   2. Open map viewer to see improved locations and data")


# Script entry point: run the async extraction loop only when this file is
# executed directly, not when it is imported.
if __name__ == '__main__':
    asyncio.run(main())
