#!/usr/bin/env python3
"""
Property Details Enrichment
Scrapes detailed property information (land size, building size, bedrooms, etc.)
from individual property pages and adds to enriched_data.json
"""
import asyncio
import json
import os
import re
from pathlib import Path
from playwright.async_api import async_playwright
from datetime import datetime

def log(message):
    """Print ``message`` to stdout prefixed with the current local time.

    The prefix has the form ``[YYYY-MM-DD HH:MM:SS]``.
    """
    now = datetime.now()
    stamp = format(now, '%Y-%m-%d %H:%M:%S')
    print(f"[{stamp}] {message}")

def extract_number(text):
    """Extract the first number found in ``text`` as a float.

    Regular, non-breaking and narrow spaces are removed first so that
    space-grouped numbers such as ``"1 200"`` are read as one value.
    The number must start with a digit, so punctuation that precedes it
    (``"St. Louis, 3 bedrooms"``) is not mistaken for part of the number.

    Separator handling:
      * both ``.`` and ``,`` present: the one appearing last is the decimal
        mark, the other is a thousands separator (``1.234,56`` / ``1,234.56``);
      * only ``.``: thousands separator when every following group has
        exactly three digits (``1.200`` -> 1200), otherwise a decimal point
        (``120.5`` -> 120.5);
      * only ``,``: a single comma is a decimal comma (``120,5`` -> 120.5),
        several commas are thousands separators (``1,200,000``).

    Args:
        text: Free-form text that may contain a number. May be ``None``/empty.

    Returns:
        The parsed number as ``float``, or ``None`` when ``text`` is empty,
        contains no digits, or the digits cannot be parsed.
    """
    if not text:
        return None

    # Drop plain, non-breaking, thin and narrow no-break spaces (digit grouping).
    compact = re.sub(r'[ \u00a0\u2009\u202f]', '', text)

    # Anchor on a digit so stray '.'/',' before the number are never captured.
    match = re.search(r'\d[\d.,]*', compact)
    if not match:
        return None

    # Trailing punctuation belongs to the sentence, not the number.
    token = match.group().rstrip('.,')
    has_dot = '.' in token
    has_comma = ',' in token

    if has_dot and has_comma:
        if token.rfind(',') > token.rfind('.'):
            # European style: 1.234,56
            normalized = token.replace('.', '').replace(',', '.')
        else:
            # English style: 1,234.56
            normalized = token.replace(',', '')
    elif has_dot:
        groups = token.split('.')
        if all(len(group) == 3 for group in groups[1:]):
            # 1.200 / 1.200.000 -> thousands separators
            normalized = ''.join(groups)
        else:
            # 120.5 -> decimal point
            normalized = token
    elif has_comma:
        if token.count(',') == 1:
            normalized = token.replace(',', '.')
        else:
            normalized = token.replace(',', '')
    else:
        normalized = token

    try:
        return float(normalized)
    except ValueError:
        return None

# Lower-case substrings used to classify the text of a detail row.
_LAND_KEYWORDS = ('oppervlakte perceel', 'plot size', 'land size', 'terrain', 'grundstück')
_BUILDING_KEYWORDS = ('woonoppervlakte', 'living area', 'building size', 'wohnfläche', 'surface habitable')
_BEDROOM_KEYWORDS = ('slaapkamer', 'bedroom', 'schlafzimmer', 'chambre')
_BATHROOM_KEYWORDS = ('badkamer', 'bathroom', 'badezimmer', 'salle de bain')
# NOTE(review): 'type' is a very broad substring (it also matches e.g.
# "heating type"); kept as in the original matching rules - confirm intent.
_TYPE_KEYWORDS = ('type', 'property type', 'soort')
_YEAR_KEYWORDS = ('bouwjaar', 'year built', 'baujahr', 'année')


def _contains_any(haystack, keywords):
    """Return True when any of ``keywords`` occurs in ``haystack``."""
    return any(keyword in haystack for keyword in keywords)


def _apply_detail_row(details, text):
    """Update ``details`` in place from the visible text of one detail row.

    The first matching keyword group (in the order land, building, bedrooms,
    bathrooms, type, year) decides which field the row feeds. A field is only
    written when a usable value was parsed, so a later row without a number
    cannot wipe out a value found earlier.
    """
    lowered = text.lower()

    if _contains_any(lowered, _LAND_KEYWORDS):
        value = extract_number(text)
        if value is not None:
            details['land_size_m2'] = value

    elif _contains_any(lowered, _BUILDING_KEYWORDS):
        value = extract_number(text)
        if value is not None:
            details['building_size_m2'] = value

    elif _contains_any(lowered, _BEDROOM_KEYWORDS):
        value = extract_number(text)
        # Leave the field unknown (None) rather than recording 0 bedrooms
        # when the row carries no parsable number.
        if value is not None:
            details['bedrooms'] = int(value)

    elif _contains_any(lowered, _BATHROOM_KEYWORDS):
        value = extract_number(text)
        if value is not None:
            details['bathrooms'] = int(value)

    elif _contains_any(lowered, _TYPE_KEYWORDS):
        # Expect "Label: Value"; take the segment right after the first colon.
        if ':' in text:
            details['property_type'] = text.split(':')[1].strip()

    elif _contains_any(lowered, _YEAR_KEYWORDS):
        year = extract_number(text)
        # Sanity window to reject numbers that are clearly not a build year.
        if year and 1500 < year < 2030:
            details['year_built'] = int(year)


def _schema_number(value):
    """Convert a JSON-LD value to float, or None when it is not numeric.

    Accepts plain numbers, numeric strings, and dict values that carry the
    number under a ``"value"`` key (schema.org QuantitativeValue style).
    """
    if isinstance(value, dict):
        value = value.get('value')
    if value is None or isinstance(value, bool):
        return None
    if isinstance(value, (int, float)):
        return float(value)
    return extract_number(str(value))


def _apply_schema_data(details, data):
    """Fill gaps in ``details`` from decoded JSON-LD ``data``.

    ``data`` may be a single object or a top-level array of objects.
    Values already scraped from the page rows take precedence.
    """
    entries = data if isinstance(data, list) else [data]
    for entry in entries:
        if not isinstance(entry, dict):
            continue

        if 'floorSize' in entry and not details['building_size_m2']:
            details['building_size_m2'] = _schema_number(entry['floorSize'])

        if details['bedrooms'] is None:
            # Prefer the dedicated bedroom count; fall back to the room count
            # the way the original mapping did.
            for key in ('numberOfBedrooms', 'numberOfRooms'):
                if key in entry:
                    rooms = _schema_number(entry[key])
                    if rooms is not None:
                        details['bedrooms'] = int(rooms)
                        break


async def scrape_property_details(url):
    """Scrape detailed property information from a Properstar page.

    Opens ``url`` in headless Chromium (re-using the login state stored in
    ``auth.json`` when that file exists), classifies the text of detail rows
    by keyword, and then fills remaining gaps from ``application/ld+json``
    structured data.

    Args:
        url: Address of the individual property page.

    Returns:
        dict with the keys ``land_size_m2``, ``building_size_m2``,
        ``bedrooms``, ``bathrooms``, ``property_type`` and ``year_built``.
        A value is ``None`` when it could not be determined. Errors during
        navigation or extraction are logged and the partially filled dict is
        returned; failures while creating the browser context or page
        propagate to the caller.
    """
    details = {
        'land_size_m2': None,
        'building_size_m2': None,
        'bedrooms': None,
        'bathrooms': None,
        'property_type': None,
        'year_built': None
    }

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            # Use the stored session for logged-in access when available.
            if Path("auth.json").exists():
                context = await browser.new_context(storage_state="auth.json")
            else:
                context = await browser.new_context()

            page = await context.new_page()

            try:
                # Navigate to property page
                await page.goto(url, timeout=30000)
                await page.wait_for_load_state("domcontentloaded")

                # Wait a bit for dynamic content
                await asyncio.sleep(2)

                # Properstar uses various labels - look at every candidate row.
                detail_rows = await page.query_selector_all(
                    'div.property-detail, div.detail-row, tr, li.feature-item'
                )
                for row in detail_rows:
                    try:
                        _apply_detail_row(details, await row.inner_text())
                    except Exception:
                        # Best effort: a single unreadable row must not abort
                        # the page. ``Exception`` (not a bare except) lets
                        # task cancellation and Ctrl-C propagate.
                        continue

                # Also check schema.org structured data embedded in the page.
                schema_scripts = await page.query_selector_all('script[type="application/ld+json"]')
                for script in schema_scripts:
                    try:
                        data = json.loads(await script.inner_text())
                        _apply_schema_data(details, data)
                    except Exception:
                        # Malformed or unexpected JSON-LD: skip this script tag.
                        continue

            except Exception as e:
                log(f"⚠️  Error scraping {url}: {e}")

        finally:
            # Always release the browser, including when context/page
            # creation fails.
            await browser.close()

    return details

async def enrich_all_properties():
    """Enrich all active properties in enriched_data.json with detailed information.

    Loads ``enriched_data.json`` from the current directory, and for every
    property whose ``status`` is not ``'Removed'``, that has a ``url`` and
    whose ``land_size_m2`` is still ``None``/missing, scrapes the property
    page and merges the result into the record. The file is written back
    even when the run is interrupted, so finished work is not lost.

    Merging never replaces an existing value with ``None``: newly found
    values overwrite, unknown values only create the key if it is absent.

    Returns:
        ``False`` when ``enriched_data.json`` does not exist, ``True`` after
        a completed run.
    """
    detail_fields = ('land_size_m2', 'building_size_m2', 'bedrooms',
                     'bathrooms', 'property_type', 'year_built')

    log("=" * 70)
    log("🔍 STARTING PROPERTY DETAILS ENRICHMENT")
    log("=" * 70)

    enriched_file = Path("enriched_data.json")

    if not enriched_file.exists():
        log("❌ enriched_data.json not found")
        return False

    # Load existing data
    with open(enriched_file, 'r', encoding='utf-8') as f:
        properties = json.load(f)

    log(f"📊 Found {len(properties)} properties")

    # Filter active properties. The list holds the same dict objects as
    # ``properties``, so updating them below updates the data that is saved.
    active_properties = [p for p in properties if p.get('status') != 'Removed']
    total_active = len(active_properties)
    log(f"📊 {total_active} active properties to enrich")

    enriched_count = 0
    skipped_count = 0
    error_count = 0

    try:
        # Enumerate the active list so the "[i/total]" progress prefix can
        # never exceed its denominator.
        for i, prop in enumerate(active_properties, 1):
            url = prop.get('url')
            if not url:
                continue

            # Skip records that already have a land size.
            # NOTE(review): records whose scrape found no land size are
            # retried on every run - confirm that this is intended.
            if prop.get('land_size_m2') is not None:
                log(f"[{i}/{total_active}] ⏭️  Skipping {prop.get('location')} (already enriched)")
                skipped_count += 1
                continue

            log(f"[{i}/{total_active}] 🔍 Enriching {prop.get('location')}")
            log(f"   URL: {url}")

            try:
                details = await scrape_property_details(url)

                # Merge without destroying data from earlier runs.
                for key, value in details.items():
                    if value is not None:
                        prop[key] = value
                    else:
                        prop.setdefault(key, None)
                prop['details_last_updated'] = datetime.now().isoformat()

                # Log what we found
                found_fields = [k for k, v in details.items() if v is not None]
                if found_fields:
                    log(f"   ✅ Found: {', '.join(found_fields)}")
                    if details['land_size_m2']:
                        log(f"      Land: {details['land_size_m2']} m²")
                    if details['building_size_m2']:
                        log(f"      Building: {details['building_size_m2']} m²")
                    if details['bedrooms']:
                        log(f"      Bedrooms: {details['bedrooms']}")
                    enriched_count += 1
                else:
                    log("   ⚠️  No structured details found")

                # Rate limiting
                await asyncio.sleep(2)

            except Exception as e:
                log(f"   ❌ Error: {e}")
                error_count += 1
                # Make sure the detail keys exist to mark the attempt, but
                # keep any values stored by a previous run.
                for key in detail_fields:
                    prop.setdefault(key, None)
                prop['details_last_updated'] = datetime.now().isoformat()

    finally:
        # Save in ``finally`` so progress survives Ctrl-C / task cancellation
        # or an unexpected error part-way through the run.
        log("\n💾 Saving enriched data...")
        with open(enriched_file, 'w', encoding='utf-8') as f:
            json.dump(properties, f, indent=2, ensure_ascii=False)

    log("\n" + "=" * 70)
    log("✅ PROPERTY DETAILS ENRICHMENT COMPLETED")
    log("=" * 70)
    log("📊 Summary:")
    log(f"   Total active properties: {total_active}")
    log(f"   Enriched: {enriched_count}")
    log(f"   Skipped (already enriched): {skipped_count}")
    log(f"   Errors: {error_count}")

    return True

# Script entry point: run the enrichment coroutine when executed directly.
if __name__ == "__main__":
    asyncio.run(enrich_all_properties())
