#!/usr/bin/env python3
"""
Migrate existing CSV + JSON data into properties.json (single source of truth).

Layered merge:
  1. enriched_data.json as base (status, availability, criteria, analysis)
  2. extracted_property_urls.csv layered (thumbnail, breadcrumb, location)
  3. analysis_output.csv layered (GPT scores, KPIs, custom scores, coordinates)

Run once to bootstrap, then repeatable to absorb CSV updates.
"""
import csv
import json
from pathlib import Path

from store import load, persist, upsert, detect_source, STATUS_REMOVED

SCRIPT_DIR = Path(__file__).parent


def read_csv_rows(path):
    """Read CSV and return list of dicts."""
    if not path.exists():
        print(f"  Skip: {path.name} not found")
        return []
    with open(path, newline='', encoding='utf-8') as f:
        return list(csv.DictReader(f))


def parse_number(val):
    """Parse a string number, return float or None."""
    if val is None or val == '':
        return None
    try:
        cleaned = str(val).replace(',', '.').replace('\xa0', '').replace(' ', '')
        return float(cleaned)
    except (ValueError, TypeError):
        return None


def migrate():
    store = load()
    print(f"  Existing store: {len(store)} properties")

    # Layer 1: enriched_data.json (richest current source)
    enriched_path = SCRIPT_DIR / 'enriched_data.json'
    if enriched_path.exists():
        with open(enriched_path, encoding='utf-8') as f:
            enriched = json.load(f)
        for p in enriched:
            url = p.get('url')
            if not url:
                continue
            upsert(store, url, p)
        print(f"  Layer 1: {len(enriched)} from enriched_data.json")

    # Layer 2: extracted_property_urls.csv
    rows = read_csv_rows(SCRIPT_DIR / 'extracted_property_urls.csv')
    for row in rows:
        url = row.get('Property URL') or row.get('URL')
        if not url:
            continue
        fields = {}
        if row.get('Thumbnail'):
            fields['thumbnail'] = row['Thumbnail']
        if row.get('Breadcrumb'):
            fields['breadcrumb'] = row['Breadcrumb']
        if row.get('Locatie'):
            fields['location_raw'] = row['Locatie']
        if row.get('MapsLink'):
            fields['maps_link'] = row['MapsLink']
        if fields:
            upsert(store, url, fields)
    print(f"  Layer 2: {len(rows)} from extracted_property_urls.csv")

    # Layer 3: analysis_output.csv (GPT scores, KPIs, coordinates)
    rows = read_csv_rows(SCRIPT_DIR / 'analysis_output.csv')
    for row in rows:
        url = row.get('URL')
        if not url:
            continue
        fields = {}
        # GPT criteria scores
        for crit in ['market_garden', 'guest_accommodation', 'workshop',
                     'rental_units', 'location', 'local_market']:
            v = parse_number(row.get(crit))
            if v is not None:
                fields.setdefault('criteria', {})
                fields['criteria'][crit] = v
        # Risk
        if row.get('risk_profile'):
            fields['risk_profile'] = row['risk_profile']
        # Combined scores
        for key in ['overall_score_combined', 'custom_overall_score', 'Gewogen Score']:
            v = parse_number(row.get(key))
            if v is not None:
                mapped = {'overall_score_combined': 'overall_score',
                          'custom_overall_score': 'custom_score',
                          'Gewogen Score': 'gpt_score'}.get(key, key)
                fields[mapped] = v
        # KPIs
        for csv_key, store_key in [('land_size', 'land_size_m2'),
                                    ('building_size', 'building_size_m2'),
                                    ('bedrooms', 'bedrooms'),
                                    ('bathrooms', 'bathrooms'),
                                    ('price', 'price')]:
            v = parse_number(row.get(csv_key))
            if v is not None:
                fields[store_key] = v
        # Coordinates (only if present and not already set)
        lat = parse_number(row.get('Latitude'))
        lon = parse_number(row.get('Longitude'))
        if lat and lon:
            fields['lat'] = lat
            fields['lon'] = lon
        # GPS source
        if row.get('GPSSource'):
            fields['gps_source'] = row['GPSSource']
        # Hospital distance
        v = parse_number(row.get('hospital_distance_m'))
        if v is not None:
            fields['hospital_distance_m'] = v
        # Title/summary
        if row.get('Titel'):
            fields['title'] = row['Titel']
        if row.get('Samenvatting'):
            fields['summary'] = row['Samenvatting']
        if row.get('GPT Analyse'):
            fields['analysis'] = row['GPT Analyse']

        if fields:
            # Special handling: merge criteria dict instead of overwriting
            if 'criteria' in fields and url in store and 'criteria' in store[url]:
                existing = store[url]['criteria']
                if isinstance(existing, dict):
                    existing.update(fields.pop('criteria'))
            upsert(store, url, fields)
    print(f"  Layer 3: {len(rows)} from analysis_output.csv")

    # Detect source from URL
    for url, prop in store.items():
        if not prop.get('source'):
            prop['source'] = detect_source(url)

    # Summary
    removed = sum(1 for p in store.values() if p.get('status') == STATUS_REMOVED)
    active = len(store) - removed
    print(f"\n  Result: {len(store)} total | {active} active | {removed} removed")

    persist(store)


if __name__ == '__main__':
    print("MIGRATE — CSV+JSON → properties.json")
    print("=" * 50)
    migrate()
