#!/usr/bin/env python3
"""
GPT Analysis with Structured Outputs
Uses OpenAI's structured outputs feature for guaranteed valid JSON responses
Combines with pre-extracted property facts for higher quality and lower cost
"""

import os
import json
import time
import requests
import pandas as pd
from pathlib import Path
from openai import OpenAI
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from typing import Literal, Optional
from extract_property_facts import PropertyFactsExtractor

# Load API key
# load_dotenv() runs at import time and before the client is built, so that
# os.getenv() below can see values it adds to the environment
# (presumably from a local .env file - confirm against the deployment setup).
load_dotenv()
# Module-level client shared by every request made from this file.
# os.getenv() returns None when OPENAI_API_KEY is not set.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# === STRUCTURED OUTPUT SCHEMA ===

def _score_field(description: str):
    """Build the integer score field shared by all criteria (bounded 1..5)."""
    return Field(ge=1, le=5, description=description)


def _text_field(description: str, limit: int = 200):
    """Build a free-text field whose length is capped at *limit* characters."""
    return Field(max_length=limit, description=description)


class PropertyCriteria(BaseModel):
    """Structured schema for property evaluation criteria"""

    # NOTE(review): the class docstring above is deliberately left untouched;
    # pydantic may expose it as part of the generated schema - confirm before
    # rewording it.
    #
    # Layout: six criteria, each declared as an integer score immediately
    # followed by its reasoning text, then the risk assessment and a summary.

    # --- Regenerative market garden ---
    market_garden: int = _score_field(
        "Regenerative market garden potential: soil quality, sun exposure, water access, size (1=poor, 5=excellent)"
    )
    market_garden_reasoning: str = _text_field(
        "Brief explanation for market garden score"
    )

    # --- Guest accommodation / B&B ---
    guest_accommodation: int = _score_field(
        "Guest accommodation/B&B potential: tranquility, setting, accessibility, existing facilities (1=poor, 5=excellent)"
    )
    guest_accommodation_reasoning: str = _text_field(
        "Brief explanation for guest accommodation score"
    )

    # --- Workshop / food processing ---
    workshop: int = _score_field(
        "Workshop/food processing potential: existing buildings, space, facilities (1=poor, 5=excellent)"
    )
    workshop_reasoning: str = _text_field(
        "Brief explanation for workshop score"
    )

    # --- Independent rental units ---
    rental_units: int = _score_field(
        "Independent rental units potential: separate buildings, conversion possibilities (1=poor, 5=excellent)"
    )
    rental_units_reasoning: str = _text_field(
        "Brief explanation for rental units score"
    )

    # --- Location ---
    location: int = _score_field(
        "Location quality: distance to coast/cities/airports, accessibility, area appeal (1=poor, 5=excellent)"
    )
    location_reasoning: str = _text_field(
        "Brief explanation for location score"
    )

    # --- Local market access ---
    local_market: int = _score_field(
        "Local market access: farmers markets, farm shops, direct sales opportunities (1=poor, 5=excellent)"
    )
    local_market_reasoning: str = _text_field(
        "Brief explanation for local market score"
    )

    # --- Risk: one of three fixed labels, no numeric bounds ---
    risk_profile: Literal["Laag", "Gemiddeld", "Hoog"] = Field(
        description="Overall investment risk: Laag=low renovation/clear usage, Gemiddeld=moderate work needed, Hoog=major renovation or unclear viability"
    )
    risk_reasoning: str = _text_field(
        "Brief explanation for risk assessment"
    )

    # --- Summary: the only text field with a 300-character cap ---
    overall_assessment: str = _text_field(
        "Concise overall assessment highlighting key strengths and weaknesses",
        limit=300,
    )


# === OPTIMIZED PROMPT ===

# User-message prompt, filled in with str.format() by
# analyze_property_structured(). It has exactly two placeholders:
#   {property_facts}       - the text returned by the extractor's to_prompt_text()
#   {custom_criteria_data} - the objective-data section, or a fallback note
# Because the text goes through str.format(), any literal "{" or "}" added
# here must be doubled ("{{" / "}}").
STRUCTURED_PROMPT_TEMPLATE = """You are an expert in evaluating rural properties for regenerative agriculture and sustainable lifestyle projects.

Analyze this property for the following use cases:
1. **Regenerative market garden** - Growing vegetables/herbs using sustainable methods
2. **Guest accommodation** - Short-stay rentals, B&B, retreat hosting
3. **Workshop/food processing** - Value-added production (jams, preserves, etc.)
4. **Rental income** - Long-term or short-term rental units
5. **Location appeal** - Proximity to attractions, accessibility
6. **Local market access** - Ability to sell products locally

{property_facts}

{custom_criteria_data}

Evaluate each criterion on a 1-5 scale:
- 1 = Very poor / Not viable
- 2 = Poor / Significant limitations
- 3 = Acceptable / Some limitations
- 4 = Good / Minor limitations
- 5 = Excellent / Ideal conditions

Assess risk based on:
- **Laag** (Low): Property is move-in ready or requires minimal work, clear value proposition
- **Gemiddeld** (Medium): Some renovation needed, moderate uncertainty
- **Hoog** (High): Major renovation required, significant uncertainty, or questionable viability

Provide practical, realistic assessments based on the facts provided."""


# === COST TRACKING ===
# Path of the JSON file holding the cumulative cost history.
COST_TRACKER_FILE = "gpt_cost_tracker_structured.json"


def load_cost_tracker():
    """Return the saved cost tracker, or a fresh one when nothing is saved.

    Returns:
        The JSON content of COST_TRACKER_FILE when that file exists;
        otherwise a new tracker with zeroed totals and an empty run list.
    """
    if not os.path.exists(COST_TRACKER_FILE):
        # Nothing on disk yet: start from an empty history.
        return dict(total_cost=0.0, total_requests=0, runs=[])

    with open(COST_TRACKER_FILE, 'r') as tracker_file:
        return json.loads(tracker_file.read())

def save_cost_tracker(tracker):
    """Save cost tracking data to COST_TRACKER_FILE as indented JSON.

    The tracker is serialized in memory first and written to a sibling
    temporary file, which is then renamed over the target. The previously
    saved history is therefore never truncated by a serialization error or
    an interrupted write.

    Args:
        tracker: JSON-serializable tracker dict.

    Raises:
        TypeError, ValueError: If ``tracker`` cannot be serialized to JSON.
        OSError: If the file cannot be written or replaced.
    """
    # Serialize before touching the file system: opening the target with
    # 'w' first would wipe the existing history even if dumping then failed.
    payload = json.dumps(tracker, indent=2)

    tmp_path = f"{COST_TRACKER_FILE}.tmp"
    with open(tmp_path, 'w') as f:
        f.write(payload)

    # Rename over the target so readers see either the old or the new file.
    os.replace(tmp_path, COST_TRACKER_FILE)

def estimate_cost(input_tokens, output_tokens, model="gpt-4o-mini"):
    """Return the estimated cost of one request.

    Rates in the table are expressed per one million tokens, separately for
    input and output. A model name that is not in the table is costed at the
    gpt-4o-mini rates.
    """
    rates_per_million = {
        "gpt-4o-mini": {"input": 0.15, "output": 0.60},
        "gpt-4o": {"input": 2.50, "output": 10.00},
    }

    # Unknown models fall back to the gpt-4o-mini entry.
    if model not in rates_per_million:
        model = "gpt-4o-mini"
    model_rates = rates_per_million[model]

    million = 1_000_000
    prompt_part = (input_tokens / million) * model_rates["input"]
    completion_part = (output_tokens / million) * model_rates["output"]
    return prompt_part + completion_part


# === MAIN ANALYSIS FUNCTION ===

# Points awarded per 1-5 score (matching the old scoring system).
_SCORE_POINTS = {1: -2, 2: -1, 3: 1, 4: 2, 5: 3}

# Relative weight of each scored criterion in the weighted score.
_CRITERIA_WEIGHTS = {
    "market_garden": 2.0,
    "guest_accommodation": 2.5,
    "workshop": 2.0,
    "rental_units": 1.5,
    "location": 3.0,
    "local_market": 1.5,
}

# Multiplier applied to the weighted average for each risk label.
_RISK_FACTORS = {"Laag": 1.0, "Gemiddeld": 0.9, "Hoog": 0.7}


def _format_custom_criteria(custom_data: Optional[dict]) -> str:
    """Render the objective-data prompt section, or "" when there is none."""
    if not custom_data:
        return ""

    try:
        # A missing or null score counts as 0, i.e. "no data".
        custom_score = float(custom_data.get('custom_score') or 0)
    except (TypeError, ValueError):
        # Non-numeric score: treat as unavailable instead of crashing.
        return ""

    if not (custom_score > 0):
        return ""

    return f"""
OBJECTIVE DATA (Climate & Location):
- Custom criteria score: {custom_score:.1f}/5.0
- Growing season length: {custom_data.get('growing_season_days', 'N/A')} days
- Annual sunshine: {custom_data.get('annual_sunshine_hours', 'N/A')} hours
- Frost-free period: {custom_data.get('frost_free_days', 'N/A')} days
- Distance to coast: {custom_data.get('distance_to_coast_km', 'N/A')} km
- Airport distance: {custom_data.get('nearest_airport_km', 'N/A')} km

Consider this objective data in your assessment.
"""


def _compute_weighted_score(criteria) -> float:
    """Return the risk-adjusted weighted average of the six criterion scores."""
    total_points = 0.0
    total_weight = 0.0
    for criterion, weight in _CRITERIA_WEIGHTS.items():
        total_points += _SCORE_POINTS[int(getattr(criteria, criterion))] * weight
        total_weight += weight

    # Unknown risk labels are treated like the medium label.
    risk_factor = _RISK_FACTORS.get(criteria.risk_profile, 0.9)
    return round((total_points / total_weight) * risk_factor, 2)


def analyze_property_structured(url: str, html_content: str, custom_data: Optional[dict] = None) -> dict:
    """
    Analyze property using structured outputs

    Extracts facts from the page HTML, builds the prompt from
    STRUCTURED_PROMPT_TEMPLATE, asks the model for a PropertyCriteria
    response and derives a risk-adjusted weighted score from it. Progress,
    cost and duration are printed along the way.

    Args:
        url: Property URL
        html_content: Raw HTML from property page
        custom_data: Optional custom criteria data. It is only included in
            the prompt when its 'custom_score' is a number greater than 0.

    Returns:
        Dictionary with keys 'url', 'structured_facts', 'criteria',
        'reasoning', 'risk_profile', 'overall_assessment', 'weighted_score',
        'tokens' (input/output counts and estimated cost) and
        'duration_seconds'.

    Raises:
        RuntimeError: If the response carries no parsed PropertyCriteria
            (for example when the model refuses to answer).
        Exception: Whatever the extractor or the API client raises is
            propagated unchanged.
    """

    # Extract structured facts
    print("   📄 Extracting structured facts...")
    extractor = PropertyFactsExtractor(html_content, url)
    facts = extractor.extract_all()

    # Convert to GPT-friendly text
    property_facts = extractor.to_prompt_text()
    print(f"   📊 Extracted {len(property_facts)} characters of structured data")

    # Assemble prompt, with a fallback note when no objective data exists
    custom_text = _format_custom_criteria(custom_data)
    final_prompt = STRUCTURED_PROMPT_TEMPLATE.format(
        property_facts=property_facts,
        custom_criteria_data=custom_text or "[No objective climate data available]"
    )

    # One name for the model so the request and the cost estimate agree
    model = "gpt-4o-mini"

    # Call GPT with structured outputs
    print("   🤖 Calling GPT-4o-mini with structured outputs...")
    # perf_counter is monotonic, so the duration cannot be skewed by
    # wall-clock adjustments during the request.
    start_time = time.perf_counter()

    completion = client.beta.chat.completions.parse(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "You are an expert property evaluator for regenerative agriculture projects. Provide honest, practical assessments."
            },
            {
                "role": "user",
                "content": final_prompt
            }
        ],
        response_format=PropertyCriteria,
        temperature=0.3
    )

    duration = time.perf_counter() - start_time

    # Extract structured response; `parsed` is None when no schema-conforming
    # answer was produced, which would otherwise fail later with an opaque
    # AttributeError.
    message = completion.choices[0].message
    criteria = message.parsed
    if criteria is None:
        refusal = getattr(message, "refusal", None)
        raise RuntimeError(
            f"Model returned no structured output for {url}: "
            f"{refusal or 'no refusal reason given'}"
        )

    # Track costs
    input_tokens = completion.usage.prompt_tokens
    output_tokens = completion.usage.completion_tokens
    cost = estimate_cost(input_tokens, output_tokens, model=model)

    print(f"   💰 Cost: ${cost:.6f} ({input_tokens} in + {output_tokens} out tokens)")
    print(f"   ⏱️  Duration: {duration:.2f}s")

    # Calculate weighted score (matching old system)
    weighted_score = _compute_weighted_score(criteria)

    # Return complete analysis
    return {
        'url': url,
        'structured_facts': facts,
        'criteria': {
            'market_garden': criteria.market_garden,
            'guest_accommodation': criteria.guest_accommodation,
            'workshop': criteria.workshop,
            'rental_units': criteria.rental_units,
            'location': criteria.location,
            'local_market': criteria.local_market
        },
        'reasoning': {
            'market_garden': criteria.market_garden_reasoning,
            'guest_accommodation': criteria.guest_accommodation_reasoning,
            'workshop': criteria.workshop_reasoning,
            'rental_units': criteria.rental_units_reasoning,
            'location': criteria.location_reasoning,
            'local_market': criteria.local_market_reasoning,
            'risk': criteria.risk_reasoning
        },
        'risk_profile': criteria.risk_profile,
        'overall_assessment': criteria.overall_assessment,
        'weighted_score': weighted_score,
        'tokens': {
            'input': input_tokens,
            'output': output_tokens,
            'cost': cost
        },
        'duration_seconds': duration
    }


# === BATCH PROCESSING ===

def _write_results_csv(output_file, existing_df, results):
    """Write previously saved rows followed by this run's rows to output_file."""
    combined_df = pd.DataFrame(results)
    if existing_df is not None and not existing_df.empty:
        # Keep earlier analyses: the output file doubles as the skip list
        # for later runs, so it must never shrink.
        combined_df = pd.concat([existing_df, combined_df], ignore_index=True)
    combined_df.to_csv(output_file, index=False, encoding='utf-8')


def analyze_all_properties():
    """Analyze all properties from CSV using structured outputs

    Reads the URLs from extracted_property_urls.csv, skips those already
    present in analysis_output_structured.csv, fetches and analyzes the
    rest, and saves the combined results after every property. Custom
    criteria are taken from enriched_data.json when that file can be read.
    A cost summary is printed at the end and appended to the cost tracker.

    Errors on a single property are printed and that property is skipped;
    the batch carries on with the next URL.
    """

    input_file = "extracted_property_urls.csv"
    output_file = "analysis_output_structured.csv"
    enriched_file = "enriched_data.json"

    # Load existing enriched data to get custom criteria (best effort)
    property_data_by_url = {}
    if os.path.exists(enriched_file):
        try:
            with open(enriched_file, 'r') as f:
                enriched_data = json.load(f)
            property_data_by_url = {prop['url']: prop for prop in enriched_data}
            print(f"📊 Loaded {len(property_data_by_url)} properties from enriched_data.json")
        except Exception as e:
            print(f"⚠️  Could not load enriched_data.json: {e}")

    # Load properties to analyze
    df = pd.read_csv(input_file)

    # Load existing analyses to skip; the rows are kept so they can be
    # written back together with the new results.
    existing_df = None
    analyzed_urls = set()
    if os.path.exists(output_file):
        existing_df = pd.read_csv(output_file)
        analyzed_urls = set(existing_df['URL'].tolist())
        print(f"📚 {len(analyzed_urls)} properties already analyzed")

    # Headers for requests
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15"
    }

    # Cost tracking
    cost_tracker = load_cost_tracker()
    run_data = {
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
        "properties_analyzed": 0,
        "total_cost": 0.0,
        "tokens_input": 0,
        "tokens_output": 0
    }

    results = []
    total_duration = 0.0

    # Count only distinct input URLs that are not analyzed yet; subtracting
    # set sizes is wrong when the output file holds URLs absent from the input.
    unique_urls = df['URL'].drop_duplicates()
    total_to_analyze = int((~unique_urls.isin(analyzed_urls)).sum())

    print(f"\n🚀 Starting structured analysis of {total_to_analyze} properties\n")

    for idx, row in enumerate(df.itertuples(), 1):
        url = row.URL

        # Skip if already analyzed (earlier run, or a duplicate row in this one)
        if url in analyzed_urls:
            print(f"⏭️  [{idx}/{len(df)}] Already analyzed: {url}")
            continue

        try:
            print(f"\n🔎 [{idx}/{len(df)}] Analyzing: {url}")

            # Fetch property page
            response = requests.get(url, headers=headers, timeout=15)
            if response.status_code != 200:
                print(f"   ❌ HTTP {response.status_code} - skipping")
                continue

            # Get custom data if available
            custom_data = property_data_by_url.get(url, {})

            # Analyze with structured outputs
            analysis = analyze_property_structured(url, response.text, custom_data)

            # Track costs
            run_data["properties_analyzed"] += 1
            run_data["total_cost"] += analysis['tokens']['cost']
            run_data["tokens_input"] += analysis['tokens']['input']
            run_data["tokens_output"] += analysis['tokens']['output']
            # The duration lives on the analysis, not on the CSV row below.
            total_duration += analysis.get('duration_seconds', 0)

            # Format for CSV
            facts = analysis['structured_facts']
            results.append({
                "URL": url,
                "Titel": facts.get('title', ''),
                # `or {}` also covers a location that is present but null
                "Locatie": (facts.get('location') or {}).get('full', ''),
                "Prijs": facts.get('price'),
                "market_garden": analysis['criteria']['market_garden'],
                "guest_accommodation": analysis['criteria']['guest_accommodation'],
                "workshop": analysis['criteria']['workshop'],
                "rental_units": analysis['criteria']['rental_units'],
                "location": analysis['criteria']['location'],
                "local_market": analysis['criteria']['local_market'],
                "risk_profile": analysis['risk_profile'],
                "overall_assessment": analysis['overall_assessment'],
                "Gewogen Score": analysis['weighted_score']
            })
            # Prevents a duplicate input row from being analyzed (and billed) twice
            analyzed_urls.add(url)

            # Save progress
            _write_results_csv(output_file, existing_df, results)
            print(f"   💾 Progress saved ({len(results)} properties)")

            # Rate limiting
            time.sleep(1)

        except Exception as e:
            # Per-property boundary: report and move on to the next URL.
            print(f"   ❌ Error: {e}")
            continue

    # Final save
    if results:
        _write_results_csv(output_file, existing_df, results)
        print(f"\n✅ Analysis complete! {len(results)} properties analyzed")

    # Cost report
    run_data["duration_seconds"] = total_duration
    cost_tracker["runs"].append(run_data)
    cost_tracker["total_cost"] += run_data["total_cost"]
    cost_tracker["total_requests"] += run_data["properties_analyzed"]
    save_cost_tracker(cost_tracker)

    print("\n" + "=" * 70)
    print("💰 COST REPORT")
    print("=" * 70)
    print(f"Properties analyzed: {run_data['properties_analyzed']}")
    print(f"Total cost: ${run_data['total_cost']:.6f}")
    print(f"Average per property: ${run_data['total_cost']/max(1, run_data['properties_analyzed']):.6f}")
    print(f"Input tokens: {run_data['tokens_input']:,}")
    print(f"Output tokens: {run_data['tokens_output']:,}")
    print(f"\nAll-time total: ${cost_tracker['total_cost']:.4f}")
    print("=" * 70)


if __name__ == "__main__":
    import sys

    # The first command-line argument selects the mode; only the exact word
    # "test" triggers the single-property run.
    if sys.argv[1:2] != ["test"]:
        # Production mode: analyze all properties
        analyze_all_properties()
    else:
        # Test mode: analyze a single property
        test_url = "https://www.properstar.com/property-for-sale/france/lot-et-garonne/7837046"
        print(f"Testing with URL: {test_url}\n")

        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15"
        }
        response = requests.get(test_url, headers=headers, timeout=15)

        if response.status_code != 200:
            print(f"Failed to fetch: HTTP {response.status_code}")
        else:
            result = analyze_property_structured(test_url, response.text)
            print("\n" + "=" * 70)
            print("ANALYSIS RESULT")
            print("=" * 70)
            print(json.dumps(result, indent=2, ensure_ascii=False))