#!/usr/bin/env python3
"""
GPT property analysis — store-based, structured input, JSON mode, single call.

Replaces analyze_from_urls.py. Instead of scraping pages and sending raw HTML to GPT,
this reads pre-computed data from the store and sends a compact property card.

Usage:
    python3 analyze_properties.py              # Analyze all unanalyzed active properties
    python3 analyze_properties.py --limit 5    # Test on 5 properties
    python3 analyze_properties.py --dry-run    # Show what would be analyzed
    python3 analyze_properties.py --force      # Re-analyze even if already done
    python3 analyze_properties.py --model gpt-4o-mini  # Choose model (default: gpt-4o-mini)
"""
import argparse
import json
import os
import time
from datetime import datetime

from dotenv import load_dotenv
from openai import OpenAI

from store import load, persist, upsert, is_active, get_score, short_url

# Load variables from a .env file into the process environment at import time;
# main() later reads OPENAI_API_KEY via os.getenv.
load_dotenv()

# System message sent with every chat-completion request in analyze_property().
# It describes the business concept being evaluated, defines the 1-5 scoring
# scale, and instructs the model to reply with JSON only (the request also sets
# response_format={"type": "json_object"}).
SYSTEM_PROMPT = """You are a real estate analyst evaluating properties for a farm retreat business combining:
- Guest accommodation (4-6 rooms, B&B / themed retreat weeks)
- Food experiences (farm-to-table dinners, cooking events for 8-12 guests)
- Workshop/studio (creative retreats, craft workshops, artist residencies)
- Market garden (permaculture, ingredients for kitchen, product line)

This is a design-led hospitality concept where architectural character, food infrastructure,
and workshop space matter MORE than raw bedroom count or land size.

Score each criterion 1-5 based on the property data provided. Be honest and critical.
1 = Unsuitable (fundamental problems)
2 = Limited (significant disadvantages)
3 = Moderate (mix of pros/cons)
4 = Well suited (minor adaptations)
5 = Excellent (ideal fit)

Respond ONLY with valid JSON."""

# User-message template for analyze_property(). It is rendered with
# str.format(property_card=...), so the literal braces of the JSON skeleton are
# doubled ({{ }}) and {property_card} is the only substitution field.
# The keys listed in the skeleton are the ones analyze_property() reads back
# from the model's reply (criteria scores, utility flags, risk and summary).
CRITERIA_PROMPT = """\
Evaluate this property:

{property_card}

Return JSON with exactly this structure:
{{
  "market_garden": {{"score": <1-5>, "reason": "<20 words max>"}},
  "guest_accommodation": {{"score": <1-5>, "reason": "<20 words max>"}},
  "workshop": {{"score": <1-5>, "reason": "<20 words max>"}},
  "food_experience": {{"score": <1-5>, "reason": "<20 words max>"}},
  "design_story": {{"score": <1-5>, "reason": "<20 words max>"}},
  "location": {{"score": <1-5>, "reason": "<20 words max>"}},
  "local_market": {{"score": <1-5>, "reason": "<20 words max>"}},
  "livability": {{"score": <1-5>, "reason": "<20 words max>"}},
  "has_electricity": <true|false|null>,
  "has_mains_water": <true|false|null>,
  "risk_profile": "<low|medium|high>",
  "risk_reason": "<20 words max>",
  "renovation_estimate": "<none|minor|moderate|major|rebuild>",
  "summary": "<2 sentences: strongest point + biggest concern>"
}}

Key scoring notes:
- HARD REQUIREMENTS: Must have electricity AND running water (mains or well with pump). No utilities = high risk.
- Market garden: land >= 3000m2 minimum for serious growing. 5000m2+ with rainfall 600-1300mm = score 3+. No land or < 3000m2 = score 1.
- Guest accommodation: 4-6 rooms is the sweet spot (not more). Score on quality not quantity. En-suite potential, separate guest flow, charm. Ruin = score 1.
- Workshop: existing outbuilding 30m2+ suitable for creative retreats, craft workshops, or artist studio. Multiple outbuildings = score 5. No outbuildings = score 2. This is THE key revenue enabler for themed weeks.
- Food experience: can this property host farm dinners for 8-12 guests? Score on: kitchen size (15m2+ ideal), communal dining space, table d'hôtes potential, proximity to local producers/markets. Large kitchen visible to guests = score 5. Kitchenette only = score 1.
- Design/story: architectural character that would appeal to design press (Remodelista, Monocle, i-escape). Stone farmhouse, historic mill, period features (beams, fireplace), photogenic setting. Score 5 = exceptional character + compelling origin story. Score 1 = generic modern box.
- Location: scenic setting + accessibility. Airport < 2h, near hiking/nature, tourist route proximity. Drives press and bookings.
- Local market & tourist short-stay potential: Score the area's viability for Airbnb/B&B guest bookings. Consider: proximity to tourist attractions (coast, mountains, wine routes, historic towns, hiking trails), seasonal demand (year-round vs summer-only), existing short-stay market (Airbnb density, gîte culture), accessibility from airports/TGV, weekend-trip potential from major cities. Score 5 = established tourist destination with year-round demand (coast, wine region, major hiking). Score 4 = strong seasonal tourism + cultural draws. Score 3 = moderate tourism, some attractions within 30min. Score 2 = rural with limited tourist pull, mainly passing traffic. Score 1 = no tourism infrastructure or attractions nearby.
- Livability: can you live here day-to-day? Electricity + water + heating (essential), internet viability (fiber/4G/Starlink), grocery/pharmacy within 20 min, habitable condition, winter accessibility. Score 5 = all utilities + shops nearby + habitable. Score 1 = ruin with no utilities.
- has_electricity/has_mains_water: true if mentioned, false if explicitly absent, null if unknown."""


def build_property_card(prop):
    """Build a compact text card from pre-computed store data.

    The card is the only property information placed in the GPT prompt, so
    every line must be labelled truthfully.

    Args:
        prop: Mapping holding one property's stored fields. Only keys that are
            present and truthy contribute a line. Numeric fields (price,
            sizes, rainfall, soil pH) are formatted with float format specs,
            so they are assumed to be numbers -- TODO confirm against the store.

    Returns:
        Newline-joined card text with the title (if usable) on the first line,
        or the literal 'No data available' when the property contributes
        nothing beyond the "utilities unknown" placeholder.
    """
    unknown_utilities = "Utilities: UNKNOWN (check listing)"
    lines = []

    # Basic info
    price = prop.get('price')
    if price:
        lines.append(f"Price: EUR {price:,.0f}")

    beds = prop.get('bedrooms') or prop.get('rooms')
    building = prop.get('building_size_m2') or prop.get('building_size')
    land = prop.get('land_size_m2') or prop.get('land_size')
    parts = []
    if beds:
        parts.append(f"Bedrooms: {beds}")
    if building:
        parts.append(f"Building: {building:.0f}m2")
    if land:
        parts.append(f"Land: {land:,.0f}m2")
    if parts:
        lines.append(' | '.join(parts))

    # DPE / year built
    dpe = prop.get('dpe')
    year = prop.get('year_built')
    if dpe or year:
        dpe_str = f"DPE: {dpe}" if dpe else ""
        year_str = f"Built: ~{year}" if year else ""
        lines.append(' | '.join(filter(None, [dpe_str, year_str])))

    # Renovation score
    reno = prop.get('renovation_score')
    if reno:
        lines.append(f"Renovation estimate: {reno}/5 (1=pristine, 5=rebuild)")

    # Keyword signals. `or []` also covers a key that is present but None,
    # which would otherwise break the membership tests below.
    signals = prop.get('keyword_signals') or []
    if signals:
        labels = [s.replace('has_', '').replace('_', ' ') for s in signals]
        lines.append(f"Features: {', '.join(labels)}")

    # Utilities (from keyword detection or explicit fields)
    utilities = []
    if prop.get('has_electricity') or 'has_electricity' in signals:
        utilities.append('electricity: YES')
    if prop.get('has_mains_water') or 'has_mains_water' in signals:
        utilities.append('mains water: YES')
    if utilities:
        lines.append(f"Utilities: {', '.join(utilities)}")
    else:
        lines.append(unknown_utilities)

    # Location
    loc_parts = []
    if prop.get('city'):
        loc_parts.append(prop['city'])
    if prop.get('county'):
        loc_parts.append(prop['county'])
    if prop.get('region'):
        loc_parts.append(prop['region'])
    if loc_parts:
        lines.append(f"Location: {', '.join(loc_parts)}")
    elif prop.get('location'):
        lines.append(f"Location: {prop['location']}")

    # Climate / soil / risk (from API enrichment)
    api_parts = []
    if prop.get('soil_quality_score'):
        api_parts.append(f"Soil quality: {prop['soil_quality_score']}/5")
    if prop.get('soil_ph'):
        api_parts.append(f"pH: {prop['soil_ph']:.1f}")
    if prop.get('risk_score'):
        api_parts.append(f"Natural risk: {prop['risk_score']}/5 (5=safe)")
    if prop.get('has_flood_risk'):
        api_parts.append("FLOOD ZONE")
    if api_parts:
        lines.append(' | '.join(api_parts))

    # Custom criteria data (rainfall, airport, etc.)
    rainfall = prop.get('annual_rainfall_mm')
    if rainfall:
        lines.append(f"Rainfall: {rainfall:.0f}mm/year")

    # Airport and hospital distances are reported separately. Previously the
    # hospital distance was used as a fallback and printed as "Airport".
    airport = prop.get('airport_distance_km')
    if (isinstance(airport, (int, float)) and not isinstance(airport, bool)
            and airport > 0):
        # Values above 1000 keep the earlier metres interpretation (divided by
        # 1000); anything else is taken as kilometres, as the key name says.
        # NOTE(review): confirm which unit the store actually holds.
        airport_km = airport / 1000 if airport > 1000 else airport
        lines.append(f"Airport: ~{airport_km:.0f}km")

    hospital_m = prop.get('hospital_distance_m')
    if (isinstance(hospital_m, (int, float)) and not isinstance(hospital_m, bool)
            and hospital_m > 0):
        lines.append(f"Hospital: ~{hospital_m / 1000:.1f}km")

    # Property type
    ptype = prop.get('property_type')
    if ptype:
        lines.append(f"Type: {ptype}")

    # Description (truncated)
    desc = prop.get('description') or prop.get('analysis') or ''
    if desc and len(desc) > 50:
        lines.append(f"\nDescription:\n{desc[:800]}")

    # Title goes first; titles containing 'just a moment' are left out.
    title = prop.get('title', '')
    if title and 'just a moment' not in title.lower():
        lines.insert(0, f"Title: {title}")

    # The utilities placeholder is always appended, so an otherwise empty
    # property must be detected explicitly; without this the fallback below
    # (and the caller's minimum-length check) could never trigger.
    if not lines or lines == [unknown_utilities]:
        return 'No data available'
    return '\n'.join(lines)


def prefilter_redflag(prop):
    """Cheap disqualification check run before any GPT tokens are spent.

    Rules are tried in a fixed order and the first match wins:
      1. building under 80 m2 (too small for guest rooms + workshop + dining)
      2. fewer than 3 bedrooms
      3. no latitude stored (location/amenities cannot be scored)
      4. price above 400000 combined with a building under 120 m2

    Returns the matching reason string, or None when nothing is flagged.
    """
    area = prop.get('building_size_m2') or prop.get('building_size')
    bedroom_count = prop.get('bedrooms')
    asking_price = prop.get('price')

    if area and area < 80:
        verdict = f'building_too_small_{area}m2'
    elif bedroom_count and bedroom_count < 3:
        verdict = f'only_{bedroom_count}_bedrooms'
    elif not prop.get('lat'):
        verdict = 'no_coordinates'
    elif asking_price and asking_price > 400000 and area and area < 120:
        verdict = 'expensive_and_small'
    else:
        verdict = None

    return verdict


def needs_analysis(prop, force=False):
    """Decide whether a property should be sent to GPT.

    Inactive properties are never analysed. With ``force`` every active
    property qualifies, bypassing all later checks (including the red-flag
    pre-filter). Otherwise a property is skipped when it already carries a
    ``gpt_analyzed_at`` stamp, when it holds four or more legacy criteria
    scores whose title does not contain 'just a moment' (such titles mark
    scores that came from a blocked page), or when ``prefilter_redflag``
    reports a reason.
    """
    if not is_active(prop):
        return False

    if not force:
        if prop.get('gpt_analyzed_at'):
            return False

        legacy_scores = prop.get('criteria', {})
        if legacy_scores and len(legacy_scores) >= 4:
            blocked_page = 'just a moment' in (prop.get('title') or '').lower()
            if not blocked_page:
                return False

        # Any truthy reason from the pre-filter rules the property out.
        return not prefilter_redflag(prop)

    return True


# Criterion keys read back from the model's JSON reply. 'rental_units' is not
# requested by CRITERIA_PROMPT; it is kept so a reply that still contains it is
# handled as before.
_CRITERIA_KEYS = (
    'market_garden', 'guest_accommodation', 'workshop',
    'food_experience', 'design_story',
    'rental_units', 'location', 'local_market', 'livability',
)

# Values the prompt allows for "risk_profile".
_RISK_LEVELS = ('low', 'medium', 'high')


def _extract_criteria(data):
    """Return {criterion: int score} for every key in *data* with a valid 1-5 score."""
    criteria = {}
    for key in _CRITERIA_KEYS:
        entry = data.get(key)
        if not isinstance(entry, dict):
            continue
        score = entry.get('score')
        # bool is a subclass of int; without this guard a `true` score would
        # be silently recorded as 1.
        if isinstance(score, bool) or not isinstance(score, (int, float)):
            continue
        if 1 <= score <= 5:
            criteria[key] = int(score)
    return criteria


def analyze_property(client, prop, model):
    """Send the property card to GPT and return the parsed analysis.

    Args:
        client: OpenAI client; only ``client.chat.completions.create`` is used.
        prop: Stored property mapping, rendered via ``build_property_card``.
        model: Model name passed to the API and recorded in the result.

    Returns:
        ``(result, tokens)`` on success, where ``result`` is a dict of fields
        to merge into the store ('criteria', 'risk_profile', 'risk_reason',
        'renovation_estimate', 'gpt_summary', 'gpt_analyzed_at', 'gpt_model'
        and, when the model returned booleans, 'has_electricity' /
        'has_mains_water') and ``tokens`` is a dict with 'input', 'output'
        and 'total' counts.
        ``(None, reason)`` on failure, where ``reason`` is a short string.
        API and parsing errors are reported this way rather than raised.
    """
    card = build_property_card(prop)

    if len(card.strip()) < 20:
        return None, 'insufficient data'

    prompt = CRITERIA_PROMPT.format(property_card=card)

    try:
        resp = client.chat.completions.create(
            model=model,
            response_format={"type": "json_object"},
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
            temperature=0.0,
            max_tokens=500,
        )

        # A missing or empty message body gets its own failure reason
        # instead of surfacing as an AttributeError labelled "API error".
        text = resp.choices[0].message.content
        if not text or not text.strip():
            return None, 'empty response from model'

        data = json.loads(text.strip())
        if not isinstance(data, dict):
            return None, 'JSON response is not an object'

        criteria = _extract_criteria(data)
        if len(criteria) < 4:
            return None, f'only {len(criteria)} criteria parsed'

        # Anything outside the prompt's low/medium/high falls back to the
        # existing default instead of being stored verbatim.
        risk = data.get('risk_profile')
        risk = risk.strip().lower() if isinstance(risk, str) else ''
        if risk not in _RISK_LEVELS:
            risk = 'medium'

        result = {
            'criteria': criteria,
            'risk_profile': risk,
            'risk_reason': data.get('risk_reason', ''),
            'renovation_estimate': data.get('renovation_estimate', ''),
            'gpt_summary': data.get('summary', ''),
            'gpt_analyzed_at': datetime.now().isoformat(),
            'gpt_model': model,
        }

        # Utility flags from GPT: null/unknown answers are not stored.
        for flag in ('has_electricity', 'has_mains_water'):
            val = data.get(flag)
            if isinstance(val, bool):
                result[flag] = val

        # Token usage. Absent usage data must not discard a good analysis,
        # so missing counters are reported as 0.
        usage = getattr(resp, 'usage', None)
        tokens = {
            'input': getattr(usage, 'prompt_tokens', 0) or 0,
            'output': getattr(usage, 'completion_tokens', 0) or 0,
            'total': getattr(usage, 'total_tokens', 0) or 0,
        }

        return result, tokens

    except json.JSONDecodeError as e:
        return None, f'JSON parse error: {str(e)[:60]}'
    except Exception as e:
        # Deliberate catch-all: one failed property must not stop the batch.
        return None, f'API error: {str(e)[:80]}'


def main():
    """CLI entry point: pick candidate properties, analyse them, save results.

    Flow: parse arguments, load the store, report how many properties were
    red-flagged by the pre-filter, then either list the candidates
    (``--dry-run``) or call ``analyze_property`` for each one and merge the
    result into the store with ``upsert``. The store is persisted every 10
    properties and again on exit if anything is unsaved, including after
    Ctrl-C or an unexpected exception.

    Returns None in all cases (the process exit code is not set here).
    """
    parser = argparse.ArgumentParser(description='GPT property analysis (store-based)')
    parser.add_argument('--limit', type=int, default=0, help='Max properties (0=all)')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be analyzed')
    parser.add_argument('--force', action='store_true', help='Re-analyze already done')
    parser.add_argument('--model', default='gpt-4o-mini', help='OpenAI model (default: gpt-4o-mini)')
    args = parser.parse_args()

    # A dry run never calls the API, so it must not require a key.
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key and not args.dry_run:
        print("Error: OPENAI_API_KEY not set. Create .env file or export it.")
        return

    store = load()

    # Count pre-filtered red flags (reporting only; needs_analysis applies the
    # same filter when selecting candidates).
    prefiltered = {}
    for url, p in store.items():
        if is_active(p) and not p.get('gpt_analyzed_at') and not (p.get('criteria') and len(p.get('criteria', {})) >= 4):
            reason = prefilter_redflag(p)
            if reason:
                prefiltered[url] = reason

    candidates = [url for url, p in store.items() if needs_analysis(p, args.force)]

    # Only a positive limit truncates; a negative value would otherwise slice
    # candidates off the END of the list.
    if args.limit > 0:
        candidates = candidates[:args.limit]

    print(f"GPT ANALYSIS — {len(candidates)} properties to analyze")
    print(f"  Model: {args.model}")
    print(f"  Store: {len(store)} total properties")
    if prefiltered:
        from collections import Counter
        # Group by the first word of each reason ('building', 'only', 'no', ...).
        reasons = Counter(r.split('_')[0] for r in prefiltered.values())
        print(f"  Pre-filtered: {len(prefiltered)} red-flagged ({', '.join(f'{v} {k}' for k, v in reasons.most_common())})")
    print()

    if args.dry_run:
        for url in candidates[:20]:
            p = store[url]
            card_len = len(build_property_card(p))
            print(f"  {short_url(url)} ({card_len} chars)")
        if len(candidates) > 20:
            print(f"  ... and {len(candidates) - 20} more")
        return

    if not candidates:
        print("Nothing to do.")
        return

    client = OpenAI(api_key=api_key)
    analyzed = 0
    failed = 0
    total_tokens = {'input': 0, 'output': 0, 'total': 0}
    unsaved = False  # True while the store holds results not yet persisted

    try:
        for i, url in enumerate(candidates):
            prop = store[url]
            print(f"  [{i+1}/{len(candidates)}] {short_url(url)}...", end=' ', flush=True)

            result, info = analyze_property(client, prop, args.model)

            if result:
                # presumably merges `result` into store[url] in place; the
                # return value is not used -- verify against store.upsert
                upsert(store, url, result)
                analyzed += 1
                unsaved = True

                scores = result['criteria']
                score_str = ' '.join(f"{k[:4]}={v}" for k, v in scores.items())
                print(f"OK ({score_str})")

                # On success `info` is the token-usage dict.
                if isinstance(info, dict):
                    for k in total_tokens:
                        total_tokens[k] += info.get(k, 0)
            else:
                # On failure `info` is the reason string.
                failed += 1
                print(f"FAIL ({info})")

            # Save every 10 properties
            if (i + 1) % 10 == 0:
                persist(store)
                unsaved = False
                print(f"  [saved progress — {total_tokens['total']:,} tokens used]")

            if i + 1 < len(candidates):
                time.sleep(0.5)  # rate limit buffer between requests
    except KeyboardInterrupt:
        print("\nInterrupted — saving progress")
    finally:
        # Never lose paid-for results: flush whatever the last checkpoint
        # missed, whether the loop finished, was interrupted, or raised.
        if unsaved:
            persist(store)

    print(f"\nAnalyzed {analyzed}/{len(candidates)} ({failed} failed)")
    print(f"  Tokens: {total_tokens['input']:,} input + {total_tokens['output']:,} output = {total_tokens['total']:,} total")

    # Rough cost estimate
    if total_tokens['total'] > 0:
        # gpt-4o-mini pricing: $0.15/1M input, $0.60/1M output
        cost_input = total_tokens['input'] * 0.15 / 1_000_000
        cost_output = total_tokens['output'] * 0.60 / 1_000_000
        estimate = f"  Estimated cost: ${cost_input + cost_output:.4f}"
        if not args.model.startswith('gpt-4o-mini'):
            # The rates above are hard-coded for gpt-4o-mini; say so rather
            # than present them as the cost of another model.
            estimate += f" (at gpt-4o-mini rates; {args.model} pricing may differ)"
        print(estimate)


# Run the CLI only when the file is executed as a script, not when imported.
if __name__ == '__main__':
    main()