#!/usr/bin/env python3
"""
Extract Property KPIs from Descriptions
Uses GPT to extract structured data (land size, building size, bedrooms, etc.)
from property descriptions already in enriched_data.json
"""
import json
import os
import re
from pathlib import Path
from datetime import datetime
from openai import OpenAI
from dotenv import load_dotenv

# Load API key.
# load_dotenv() runs at import time; presumably it pulls variables from a local
# .env file into the process environment (python-dotenv) — confirm against
# the deployment setup.
load_dotenv()
# Module-level OpenAI client used by extract_kpis_with_gpt(). os.getenv()
# yields None when OPENAI_API_KEY is not set, so that None is what gets passed
# as api_key in that case.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def log(message):
    """Print *message* to stdout behind a ``[YYYY-MM-DD HH:MM:SS]`` prefix.

    The timestamp is the current local time as reported by ``datetime.now()``.
    """
    stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    line = "[{}] {}".format(stamp, message)
    print(line)

# The only keys extract_kpis_with_gpt() ever returns; anything else the model
# emits is dropped so callers can merge the result into a record safely.
_KPI_FIELDS = ('land_size_m2', 'building_size_m2', 'bedrooms', 'bathrooms')


def _empty_kpis():
    """Return a KPI dict with every expected field set to None."""
    return dict.fromkeys(_KPI_FIELDS)


def _coerce_kpi_number(value):
    """Return *value* as an int/float, or None when it is not a usable number."""
    # bool is a subclass of int; true/false is never a valid size or count
    if isinstance(value, bool):
        return None
    if isinstance(value, (int, float)):
        return value
    if isinstance(value, str):
        # The model occasionally quotes numbers ("120"); accept those too
        try:
            number = float(value.strip())
        except ValueError:
            return None
        return int(number) if number.is_integer() else number
    return None


def _parse_kpi_response(raw_text):
    """Convert the model's raw reply into a dict holding exactly the KPI fields.

    Raises:
        ValueError: if the reply contains no parseable JSON object
            (json.JSONDecodeError is a ValueError subclass).
    """
    text = (raw_text or '').strip()

    # Parse from the first "{" to the last "}" so markdown fences or stray
    # prose around the object do not break json.loads
    start, end = text.find('{'), text.rfind('}')
    if start != -1 and end > start:
        text = text[start:end + 1]

    data = json.loads(text)
    if not isinstance(data, dict):
        raise ValueError(f"expected a JSON object, got {type(data).__name__}")

    return {field: _coerce_kpi_number(data.get(field)) for field in _KPI_FIELDS}


def extract_kpis_with_gpt(title, summary, analysis):
    """Use GPT to extract structured KPIs from property text.

    Args:
        title: Listing title; None is treated as an empty string.
        summary: Listing description; None is treated as an empty string.
        analysis: Free-text analysis; only its first 500 characters are sent.
            None is treated as an empty string.

    Returns:
        A dict with exactly the keys 'land_size_m2', 'building_size_m2',
        'bedrooms' and 'bathrooms'. Each value is a number or None. When the
        API call fails or the reply cannot be parsed, the error is logged and
        every value is None (this function does not raise for those cases).
    """
    # Records loaded from JSON may carry explicit nulls; normalise them here
    # because the slice below would otherwise raise before the try block.
    title = title or ''
    summary = summary or ''
    analysis = str(analysis or '')

    prompt = f"""Extract numerical property data from this real estate listing. Return ONLY valid JSON.

Title: {title}

Description: {summary}

Analysis: {analysis[:500]}

Extract these fields (use null if not found):
- land_size_m2: plot/land size in square meters
- building_size_m2: building/house size in square meters
- bedrooms: number of bedrooms
- bathrooms: number of bathrooms

Look for keywords like: m², hectare (10000 m²), oppervlakte, perceel, land, plot, terrain, slaapkamer, bedroom, badkamer, bathroom.

Return JSON format:
{{"land_size_m2": number|null, "building_size_m2": number|null, "bedrooms": number|null, "bathrooms": number|null}}"""

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a data extraction specialist. Extract property data and return ONLY valid JSON. No explanations."},
                {"role": "user", "content": prompt}
            ],
            temperature=0,
            max_tokens=150
        )

        # message.content can be None; _parse_kpi_response treats that as an
        # empty reply and reports it as a parse error
        return _parse_kpi_response(response.choices[0].message.content)

    except Exception as e:
        # Deliberate best-effort boundary: one failed listing must not abort
        # a whole batch run, so report it and hand back an all-null result.
        log(f"   ⚠️  GPT extraction error: {e}")
        return _empty_kpis()

def _mark_kpis_processed(prop):
    """Flag *prop* as handled so later runs skip it, and record when."""
    prop['kpis_extracted'] = True
    prop['kpis_extraction_date'] = datetime.now().isoformat()


def _format_found_kpis(kpis):
    """Return display strings for the KPI fields in *kpis* that hold a truthy value."""
    labels = (
        ('land_size_m2', 'Land', ' m²'),
        ('building_size_m2', 'Building', ' m²'),
        ('bedrooms', 'Bedrooms', ''),
        ('bathrooms', 'Bathrooms', ''),
    )
    return [f"{label}: {kpis[key]}{unit}" for key, label, unit in labels if kpis.get(key)]


def extract_all_kpis():
    """Extract KPIs for all active properties in enriched_data.json.

    Loads the JSON file from the current working directory, skips records
    whose status is 'Removed' and records that already have a land size or
    the 'kpis_extracted' flag, calls extract_kpis_with_gpt() for the rest,
    and writes the updated list back to the same file.

    Every processed record (including ones with no text or a failed
    extraction) is flagged 'kpis_extracted' so it is not retried on the
    next run.

    Returns:
        False if enriched_data.json does not exist, True after the file has
        been rewritten.
    """
    kpi_fields = ('land_size_m2', 'building_size_m2', 'bedrooms', 'bathrooms')

    log("=" * 70)
    log("🔍 STARTING PROPERTY KPI EXTRACTION")
    log("=" * 70)

    enriched_file = Path("enriched_data.json")

    if not enriched_file.exists():
        log("❌ enriched_data.json not found")
        return False

    # Load existing data
    with open(enriched_file, 'r', encoding='utf-8') as f:
        properties = json.load(f)

    log(f"📊 Found {len(properties)} total properties")

    # Filter active properties. The list holds the same dict objects as
    # `properties`, so updating them here updates what gets saved below.
    active_properties = [p for p in properties if p.get('status') != 'Removed']
    total_active = len(active_properties)
    log(f"📊 {total_active} active properties to process")

    extracted_count = 0
    skipped_count = 0
    no_kpi_count = 0
    error_count = 0

    for position, prop in enumerate(active_properties, 1):
        progress = f"[{position}/{total_active}]"
        location = prop.get('location')

        # Check if already has KPI data
        if prop.get('land_size_m2') is not None or prop.get('kpis_extracted'):
            log(f"{progress} ⏭️  Skipping {location} (already has KPIs)")
            skipped_count += 1
            continue

        log(f"{progress} 🔍 Extracting KPIs for {location}")

        # `or ''` (rather than a .get default) also covers keys that are
        # present in the JSON but set to null
        title = prop.get('title') or ''
        summary = prop.get('summary') or ''
        analysis = prop.get('analysis') or ''

        if not summary and not analysis:
            log("   ⚠️  No description text available")
            _mark_kpis_processed(prop)
            error_count += 1
            continue

        try:
            kpis = extract_kpis_with_gpt(title, summary, analysis)
            if not isinstance(kpis, dict):
                raise TypeError(f"unexpected KPI result type: {type(kpis).__name__}")
        except Exception as e:
            log(f"   ❌ Error: {e}")
            _mark_kpis_processed(prop)
            error_count += 1
            continue

        # Copy only the known KPI fields: an unexpected key in the model's
        # reply must not overwrite record fields such as 'status' or 'title'
        prop.update({field: kpis[field] for field in kpi_fields if field in kpis})
        _mark_kpis_processed(prop)

        # Log what we found
        found_fields = _format_found_kpis(kpis)
        if found_fields:
            log(f"   ✅ Found: {', '.join(found_fields)}")
            extracted_count += 1
        else:
            log("   ℹ️  No KPIs found in text")
            no_kpi_count += 1

    # Save updated data. Write to a sibling temp file first and swap it in,
    # so an interrupted write cannot truncate the only copy of the data.
    log("\n💾 Saving enriched data...")
    tmp_file = enriched_file.with_name(enriched_file.name + '.tmp')
    with open(tmp_file, 'w', encoding='utf-8') as f:
        json.dump(properties, f, indent=2, ensure_ascii=False)
    tmp_file.replace(enriched_file)

    log("\n" + "=" * 70)
    log("✅ PROPERTY KPI EXTRACTION COMPLETED")
    log("=" * 70)
    log("📊 Summary:")
    log(f"   Total active properties: {total_active}")
    log(f"   KPIs extracted: {extracted_count}")
    log(f"   Skipped (already had KPIs): {skipped_count}")
    log(f"   No KPIs found: {no_kpi_count}")
    log(f"   Errors / no description: {error_count}")

    return True

if __name__ == "__main__":
    # Propagate the outcome to the shell: extract_all_kpis() returns False
    # when it could not run (input file missing), which maps to exit status 1.
    raise SystemExit(0 if extract_all_kpis() else 1)
