#!/usr/bin/env python3
"""
Batch API GPT Analysis - 50% Cost Reduction
Uses OpenAI's Batch API for overnight processing with structured outputs
Perfect for weekly full updates
"""

import os
import json
import time
import requests
import pandas as pd
from pathlib import Path
from openai import OpenAI
from dotenv import load_dotenv
from datetime import datetime
from extract_property_facts import PropertyFactsExtractor

# Load API key
# load_dotenv() runs at import time; the OpenAI client below is then built
# from the OPENAI_API_KEY environment variable and shared by every step.
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# === CONFIGURATION ===
# JSONL request file: written by create_batch_input(), uploaded by submit_batch().
BATCH_INPUT_FILE = "batch_analysis_input.jsonl"
# Raw JSONL output downloaded by retrieve_batch_results().
BATCH_OUTPUT_FILE = "batch_analysis_output.jsonl"
# NOTE(review): not referenced by any function visible in this file.
BATCH_RESULTS_FILE = "batch_analysis_results.json"
# Batch bookkeeping (batch id, status, file ids): written by submit_batch(),
# refreshed by check_batch_status(), read by retrieve_batch_results().
BATCH_STATUS_FILE = "batch_status.json"


# === PROMPT TEMPLATE ===
# User-message template, filled in with str.format() by create_batch_input().
# Placeholders: {property_facts} and {custom_criteria_data}.
# The doubled braces ({{ }}) are str.format() escapes that render as the literal
# braces of the JSON example shown to the model.
# The risk labels (Laag / Gemiddeld / Hoog) are Dutch and must stay in sync with
# the risk-factor table used when scoring in retrieve_batch_results().
BATCH_PROMPT_TEMPLATE = """You are an expert in evaluating rural properties for regenerative agriculture and sustainable lifestyle projects.

Analyze this property for the following use cases:
1. **Regenerative market garden** - Growing vegetables/herbs using sustainable methods
2. **Guest accommodation** - Short-stay rentals, B&B, retreat hosting
3. **Workshop/food processing** - Value-added production (jams, preserves, etc.)
4. **Rental income** - Long-term or short-term rental units
5. **Location appeal** - Proximity to attractions, accessibility
6. **Local market access** - Ability to sell products locally

{property_facts}

{custom_criteria_data}

Evaluate each criterion on a 1-5 scale (1=poor, 5=excellent) and provide reasoning.

Return a valid JSON response with this structure:
{{
  "market_garden": <1-5>,
  "market_garden_reasoning": "<brief explanation>",
  "guest_accommodation": <1-5>,
  "guest_accommodation_reasoning": "<brief explanation>",
  "workshop": <1-5>,
  "workshop_reasoning": "<brief explanation>",
  "rental_units": <1-5>,
  "rental_units_reasoning": "<brief explanation>",
  "location": <1-5>,
  "location_reasoning": "<brief explanation>",
  "local_market": <1-5>,
  "local_market_reasoning": "<brief explanation>",
  "risk_profile": "<Laag|Gemiddeld|Hoog>",
  "risk_reasoning": "<brief explanation>",
  "overall_assessment": "<concise summary>"
}}

Risk levels:
- Laag: Move-in ready, minimal work needed
- Gemiddeld: Some renovation needed
- Hoog: Major renovation or uncertain viability"""


# === STEP 1: CREATE BATCH INPUT ===

def _format_custom_criteria(custom_data):
    """Render the objective climate/location data as a prompt section.

    Returns an empty string when there is no usable positive ``custom_score``
    (missing, ``None``, non-numeric, NaN, or <= 0), so a malformed enrichment
    record never causes the whole property to be dropped.
    """
    try:
        score = float(custom_data.get('custom_score'))
    except (TypeError, ValueError):
        return ""
    # "not score > 0" (rather than "score <= 0") also rejects NaN.
    if not score > 0:
        return ""
    return f"""
OBJECTIVE DATA (Climate & Location):
- Custom score: {score:.1f}/5.0
- Growing season: {custom_data.get('growing_season_days', 'N/A')} days
- Annual sunshine: {custom_data.get('annual_sunshine_hours', 'N/A')} hours
- Distance to coast: {custom_data.get('distance_to_coast_km', 'N/A')} km
"""


def create_batch_input():
    """
    Create JSONL file with batch requests for all properties

    Reads property URLs from ``extracted_property_urls.csv``, skips URLs that
    already have a positive 'Gewogen Score' in ``analysis_output.csv``, fetches
    each remaining page, extracts its facts and writes one chat-completion
    request per property to ``BATCH_INPUT_FILE``. A ``custom_id -> metadata``
    map is written to ``batch_metadata.json`` so results can be matched back
    to their property later.

    Returns:
        int: number of requests written (0 when the input CSV is missing or
        has no ``URL`` column).
    """
    print("📝 Creating batch input file...")
    print("=" * 70)

    input_csv = "extracted_property_urls.csv"
    enriched_file = "enriched_data.json"

    if not os.path.exists(input_csv):
        print(f"❌ Input CSV not found: {input_csv}")
        return 0

    # Load existing analyses to skip
    analyzed_urls = set()
    if os.path.exists("analysis_output.csv"):
        existing_df = pd.read_csv("analysis_output.csv")
        # Only skip properties with gpt_score > 0; both columns are needed
        # to decide that, otherwise nothing is skipped.
        if {'Gewogen Score', 'URL'}.issubset(existing_df.columns):
            analyzed_urls = set(existing_df[existing_df['Gewogen Score'] > 0]['URL'].tolist())
            print(f"ℹ️  {len(analyzed_urls)} properties already analyzed (will skip)")

    # Load custom criteria data
    property_data_by_url = {}
    if os.path.exists(enriched_file):
        with open(enriched_file, 'r', encoding='utf-8') as f:
            enriched_data = json.load(f)
        # Records without a 'url' key cannot be matched to a property; ignore them.
        property_data_by_url = {
            prop['url']: prop
            for prop in enriched_data
            if isinstance(prop, dict) and 'url' in prop
        }
        print(f"📊 Loaded custom data for {len(property_data_by_url)} properties")

    # Load properties
    df = pd.read_csv(input_csv)
    print(f"📥 Found {len(df)} total properties in CSV")

    if 'URL' not in df.columns:
        print(f"❌ Column 'URL' not found in {input_csv}")
        return 0

    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15"
    }

    batch_requests = []
    property_metadata = {}

    for idx, row in enumerate(df.itertuples(), 1):
        url = row.URL

        # Skip if already analyzed
        if url in analyzed_urls:
            continue

        try:
            print(f"📄 [{idx}/{len(df)}] Fetching: {url}")

            # Fetch property page
            response = requests.get(url, headers=headers, timeout=15)
            if response.status_code != 200:
                print(f"   ⚠️  HTTP {response.status_code} - skipping")
                continue

            # Extract structured facts
            extractor = PropertyFactsExtractor(response.text, url)
            facts = extractor.extract_all()
            property_facts_text = extractor.to_prompt_text()

            # Get custom criteria
            custom_data = property_data_by_url.get(url) or {}
            custom_text = _format_custom_criteria(custom_data)

            # Create batch request
            final_prompt = BATCH_PROMPT_TEMPLATE.format(
                property_facts=property_facts_text,
                custom_criteria_data=custom_text if custom_text else "[No objective climate data]"
            )

            custom_id = f"property_{idx}"
            batch_request = {
                "custom_id": custom_id,
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": "gpt-4o-mini",
                    "messages": [
                        {
                            "role": "system",
                            "content": "You are an expert property evaluator. Provide honest, practical assessments in valid JSON format."
                        },
                        {
                            "role": "user",
                            "content": final_prompt
                        }
                    ],
                    "temperature": 0.3,
                    "response_format": {"type": "json_object"}
                }
            }

            # Metadata for later matching; tolerate a missing/None location.
            location = facts.get('location')
            metadata = {
                'url': url,
                'title': facts.get('title', ''),
                'location': location.get('full', '') if isinstance(location, dict) else '',
                'price': facts.get('price'),
                'custom_score': custom_data.get('custom_score', 0)
            }

            # Register the request and its metadata together, only once both
            # were built successfully, so every request in the batch can be
            # matched back to a property.
            batch_requests.append(batch_request)
            property_metadata[custom_id] = metadata

            print(f"   ✅ Added to batch ({len(batch_requests)} total)")

        except Exception as e:
            # Deliberate best-effort: one bad page must not abort the whole run.
            print(f"   ❌ Error: {e}")
        finally:
            # Rate limiting for fetching (also after failed attempts)
            time.sleep(0.5)

    # Save batch input file (JSONL format)
    print(f"\n💾 Saving batch input file...")
    with open(BATCH_INPUT_FILE, 'w', encoding='utf-8') as f:
        for request in batch_requests:
            f.write(json.dumps(request) + '\n')

    # Save metadata for later
    with open('batch_metadata.json', 'w', encoding='utf-8') as f:
        json.dump(property_metadata, f, indent=2)

    print(f"✅ Created batch input with {len(batch_requests)} requests")
    print(f"📄 File: {BATCH_INPUT_FILE}")
    print(f"💰 Estimated cost: ${len(batch_requests) * 0.001:.4f} (50% savings vs real-time API)")

    return len(batch_requests)


# === STEP 2: SUBMIT BATCH ===

def submit_batch():
    """
    Upload the prepared JSONL file and start a batch job.

    Prints an error and returns None when BATCH_INPUT_FILE does not exist.
    Otherwise uploads the file, creates a batch against the chat-completions
    endpoint with a 24h completion window, writes a status snapshot to
    BATCH_STATUS_FILE and returns the new batch id.
    """
    print("\n🚀 Submitting batch to OpenAI...")
    print("=" * 70)

    input_present = os.path.exists(BATCH_INPUT_FILE)
    if not input_present:
        print("❌ Batch input file not found! Run 'create' first.")
        return None

    # Upload file
    print("📤 Uploading batch file...")
    with open(BATCH_INPUT_FILE, 'rb') as jsonl_handle:
        uploaded = client.files.create(file=jsonl_handle, purpose="batch")

    print(f"✅ File uploaded: {uploaded.id}")

    # Create batch
    print("🎯 Creating batch job...")
    job_metadata = {
        "description": "Paradisomatch property analysis",
        "created_at": datetime.now().isoformat(),
    }
    job = client.batches.create(
        input_file_id=uploaded.id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata=job_metadata,
    )

    # Save batch status
    if job.request_counts:
        counts_snapshot = job.request_counts.__dict__
    else:
        counts_snapshot = {}

    snapshot = {
        'batch_id': job.id,
        'status': job.status,
        'created_at': job.created_at,
        'input_file_id': job.input_file_id,
        'request_counts': counts_snapshot,
    }

    with open(BATCH_STATUS_FILE, 'w') as status_handle:
        json.dump(snapshot, status_handle, indent=2)

    for line in (
        f"✅ Batch submitted successfully!",
        f"   Batch ID: {job.id}",
        f"   Status: {job.status}",
        f"   Completion window: 24 hours",
        f"\n💡 Check status with: python3 batch_gpt_analysis.py status",
    ):
        print(line)

    return job.id


# === STEP 3: CHECK STATUS ===

def check_batch_status():
    """
    Fetch the current state of the submitted batch and report it.

    Reads the batch id from BATCH_STATUS_FILE, retrieves the batch from the
    API, writes the refreshed status (plus output/error file ids when present)
    back to BATCH_STATUS_FILE and prints a progress summary.

    Returns the retrieved batch object, or None when no status file exists.
    """
    status_file_present = os.path.exists(BATCH_STATUS_FILE)
    if not status_file_present:
        print("❌ No batch status file found. Submit a batch first.")
        return None

    with open(BATCH_STATUS_FILE, 'r') as status_handle:
        stored = json.load(status_handle)

    batch_id = stored['batch_id']

    print(f"📊 Checking batch status...")
    print("=" * 70)

    # Fetch current status
    batch = client.batches.retrieve(batch_id)

    # Update status file
    stored['status'] = batch.status
    if batch.request_counts:
        stored['request_counts'] = batch.request_counts.__dict__
    else:
        stored['request_counts'] = {}
    stored['last_checked'] = datetime.now().isoformat()

    if batch.output_file_id:
        stored['output_file_id'] = batch.output_file_id
    if batch.error_file_id:
        stored['error_file_id'] = batch.error_file_id

    with open(BATCH_STATUS_FILE, 'w') as status_handle:
        json.dump(stored, status_handle, indent=2)

    # Display status
    created_label = datetime.fromtimestamp(batch.created_at).strftime('%Y-%m-%d %H:%M:%S')
    print(f"Batch ID: {batch_id}")
    print(f"Status: {batch.status}")
    print(f"Created: {created_label}")

    if batch.request_counts:
        tally = batch.request_counts.__dict__
        total = tally.get('total', 0)
        print(f"\nProgress:")
        print(f"  Total: {total}")
        print(f"  Completed: {tally.get('completed', 0)}")
        print(f"  Failed: {tally.get('failed', 0)}")

        if total > 0:
            pct = (tally.get('completed', 0) / tally['total']) * 100
            print(f"  Progress: {pct:.1f}%")

    state = batch.status
    if state == 'completed':
        print(f"\n✅ Batch completed!")
        print(f"   Output file: {batch.output_file_id}")
        print(f"\n💡 Retrieve results with: python3 batch_gpt_analysis.py retrieve")
    elif state == 'failed':
        print(f"\n❌ Batch failed!")
        if batch.error_file_id:
            print(f"   Error file: {batch.error_file_id}")
    elif state in ('validating', 'in_progress', 'finalizing'):
        print(f"\n⏳ Batch is processing... check back later")
    elif state in ('cancelling', 'cancelled'):
        print(f"\n⚠️  Batch cancelled")

    return batch


# === STEP 4: RETRIEVE RESULTS ===

def _coerce_score(value, default=3):
    """Normalize a model-supplied criterion score to an int in 1..5.

    The model is asked for integers 1-5 but may return floats ("4.0"),
    numeric strings, out-of-range numbers or null. Numeric values are rounded
    and clamped to 1..5; anything non-numeric falls back to ``default``.
    """
    try:
        score = int(round(float(value)))
    except (TypeError, ValueError, OverflowError):
        return default
    return max(1, min(5, score))


def _weighted_score(analysis):
    """Compute the risk-adjusted weighted score for one parsed analysis dict."""
    score_weights = {1: -2, 2: -1, 3: 1, 4: 2, 5: 3}
    criteria_weights = {
        "market_garden": 2.0,
        "guest_accommodation": 2.5,
        "workshop": 2.0,
        "rental_units": 1.5,
        "location": 3.0,
        "local_market": 1.5
    }

    total_score = 0.0
    total_weight = 0.0
    for criterion, weight in criteria_weights.items():
        # A missing criterion counts as a neutral 3, as before.
        score_value = _coerce_score(analysis.get(criterion, 3))
        total_score += score_weights[score_value] * weight
        total_weight += weight

    # Apply risk factor (unknown or non-string labels are treated as 'Gemiddeld').
    # NOTE(review): multiplying a negative average by a factor < 1 moves it
    # towards zero, so higher risk improves a negative score - confirm intended.
    risk_factors = {"Laag": 1.0, "Gemiddeld": 0.9, "Hoog": 0.7}
    risk_label = analysis.get('risk_profile', 'Gemiddeld')
    risk_factor = risk_factors.get(risk_label, 0.9) if isinstance(risk_label, str) else 0.9

    return round((total_score / total_weight) * risk_factor, 2) if total_weight > 0 else 0


def retrieve_batch_results():
    """
    Download and process batch results

    Requires BATCH_STATUS_FILE to record status 'completed' and an
    ``output_file_id`` (both are written by the status check). Downloads the
    output JSONL to BATCH_OUTPUT_FILE, matches every result to its property via
    ``batch_metadata.json``, computes the weighted score and writes
    ``analysis_output_batch.csv``.

    A failed request, an unparseable line or a malformed model reply is
    reported and skipped; it never aborts processing of the remaining results.

    Returns:
        list[dict] | None: the processed rows (possibly empty), or None when
        the prerequisites (status file, completed status, output file id,
        metadata file) are not met.
    """
    if not os.path.exists(BATCH_STATUS_FILE):
        print("❌ No batch status file found.")
        return None

    with open(BATCH_STATUS_FILE, 'r', encoding='utf-8') as f:
        batch_info = json.load(f)

    if batch_info.get('status') != 'completed':
        print(f"⚠️  Batch not completed yet. Status: {batch_info.get('status')}")
        return None

    output_file_id = batch_info.get('output_file_id')
    if not output_file_id:
        print("❌ No output file ID found.")
        return None

    print("📥 Downloading batch results...")
    print("=" * 70)

    # Download output file
    file_response = client.files.content(output_file_id)
    output_content = file_response.text

    # Save raw output (model text may contain non-ASCII characters)
    with open(BATCH_OUTPUT_FILE, 'w', encoding='utf-8') as f:
        f.write(output_content)

    print(f"✅ Downloaded results to {BATCH_OUTPUT_FILE}")

    # Load metadata; without it results cannot be matched to properties.
    metadata_file = 'batch_metadata.json'
    if not os.path.exists(metadata_file):
        print(f"❌ {metadata_file} not found. Run 'create' first.")
        return None
    with open(metadata_file, 'r', encoding='utf-8') as f:
        property_metadata = json.load(f)

    # Parse results
    print("\n📊 Processing results...")
    results = []
    for line in output_content.splitlines():
        if not line.strip():
            continue
        try:
            results.append(json.loads(line))
        except json.JSONDecodeError as e:
            print(f"⚠️  Skipping unparseable output line: {e}")

    # Process each result
    processed_results = []
    total_cost = 0.0
    failed_count = 0

    for result in results:
        if not isinstance(result, dict):
            failed_count += 1
            continue

        custom_id = result.get('custom_id')
        metadata = property_metadata.get(custom_id, {})

        # 'response' can be null when the request itself errored.
        response = result.get('response') or {}
        if response.get('status_code') != 200:
            failed_count += 1
            reason = result.get('error') or f"HTTP {response.get('status_code')}"
            print(f"⚠️  Request {custom_id} failed: {reason}")
            continue

        # Parse JSON response
        try:
            response_body = response['body']
            message = response_body['choices'][0]['message']['content']
            analysis = json.loads(message)
        except (KeyError, IndexError, TypeError, json.JSONDecodeError) as e:
            failed_count += 1
            print(f"⚠️  Failed to parse response for {custom_id}: {e}")
            continue

        if not isinstance(analysis, dict):
            failed_count += 1
            print(f"⚠️  Failed to parse response for {custom_id}: expected a JSON object")
            continue

        # Calculate weighted score
        weighted_score = _weighted_score(analysis)

        # Track cost
        usage = response_body.get('usage') or {}
        input_tokens = usage.get('prompt_tokens', 0)
        output_tokens = usage.get('completion_tokens', 0)
        # Batch API pricing (50% discount)
        cost = ((input_tokens / 1_000_000) * 0.075) + ((output_tokens / 1_000_000) * 0.30)
        total_cost += cost

        processed_results.append({
            'URL': metadata.get('url'),
            'Titel': metadata.get('title'),
            'Locatie': metadata.get('location'),
            'Prijs': metadata.get('price'),
            'market_garden': analysis.get('market_garden'),
            'guest_accommodation': analysis.get('guest_accommodation'),
            'workshop': analysis.get('workshop'),
            'rental_units': analysis.get('rental_units'),
            'location': analysis.get('location'),
            'local_market': analysis.get('local_market'),
            'risk_profile': analysis.get('risk_profile'),
            'overall_assessment': analysis.get('overall_assessment'),
            'Gewogen Score': weighted_score,
            'GPT Analyse': json.dumps(analysis, ensure_ascii=False)
        })

    if failed_count:
        print(f"⚠️  {failed_count} result(s) could not be processed")

    # Save to CSV
    if processed_results:
        df = pd.DataFrame(processed_results)
        df.to_csv('analysis_output_batch.csv', index=False, encoding='utf-8')

        print(f"\n✅ Processed {len(processed_results)} properties")
        print(f"💾 Saved to: analysis_output_batch.csv")
        print(f"\n💰 Cost Report:")
        print(f"   Total cost: ${total_cost:.4f}")
        print(f"   Average per property: ${total_cost/len(processed_results):.6f}")
        print(f"   Savings vs real-time API: ~50%")

    return processed_results


# === MAIN CLI ===

if __name__ == "__main__":
    import sys

    # Command-line entry point: the first argument selects one workflow step.
    cli_args = sys.argv[1:]

    if not cli_args:
        usage_lines = (
            "Usage:",
            "  python3 batch_gpt_analysis.py create   - Create batch input file",
            "  python3 batch_gpt_analysis.py submit   - Submit batch to OpenAI",
            "  python3 batch_gpt_analysis.py status   - Check batch status",
            "  python3 batch_gpt_analysis.py retrieve - Download and process results",
            "\nTypical workflow:",
            "  1. create  → prepare requests",
            "  2. submit  → send to OpenAI (24h processing)",
            "  3. status  → check progress",
            "  4. retrieve → get results (after completion)",
        )
        for usage_line in usage_lines:
            print(usage_line)
        sys.exit(0)

    # Commands are matched case-insensitively.
    command = cli_args[0].lower()

    if command == 'status':
        check_batch_status()

    elif command == 'create':
        count = create_batch_input()
        if count > 0:
            print(f"\n✅ Ready to submit {count} properties for analysis")
            print(f"💡 Next: python3 batch_gpt_analysis.py submit")

    elif command == 'submit':
        batch_id = submit_batch()
        if batch_id:
            print(f"\n✅ Batch submitted: {batch_id}")

    elif command == 'retrieve':
        results = retrieve_batch_results()
        if results:
            print(f"\n✅ All done! Merge with main analysis:")
            print(f"   python3 parse_criteria.py")

    else:
        print(f"❌ Unknown command: {command}")
        print("Run without arguments to see usage.")
