# analyze_from_urls_optimized.py
# Smart Cost-Cutting GPT Analysis with Caching & Change Detection

import os
import time
import requests
import pandas as pd
import json
import hashlib
from bs4 import BeautifulSoup
from openai import OpenAI
from dotenv import load_dotenv
import re
from pathlib import Path

# Load environment configuration via python-dotenv first, then build the
# shared OpenAI client from the OPENAI_API_KEY environment variable.
load_dotenv()
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# === COST TRACKING ===
COST_TRACKER_FILE = "gpt_cost_tracker.json"

def load_cost_tracker():
    """Load the persistent cost tracker, falling back to zeroed defaults.

    Keys missing from the stored file are backfilled with their default, so
    callers can increment any counter without hitting a ``KeyError``. A
    missing tracker file yields a fresh tracker; an unreadable or corrupt one
    does too, after printing a warning, instead of aborting the run.

    Returns:
        dict: Tracker with at least the keys ``total_cost``,
        ``total_requests``, ``cache_hits``, ``cache_misses``,
        ``tokens_saved`` and ``runs``.
    """
    tracker = {
        "total_cost": 0.0,
        "total_requests": 0,
        "cache_hits": 0,
        "cache_misses": 0,
        "tokens_saved": 0,
        "runs": [],
    }
    try:
        with open(COST_TRACKER_FILE, 'r', encoding="utf-8") as f:
            stored = json.load(f)
    except FileNotFoundError:
        # First run: nothing stored yet.
        return tracker
    except (OSError, ValueError) as e:
        # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
        print(f"⚠️  Could not load {COST_TRACKER_FILE}: {e}")
        return tracker

    if isinstance(stored, dict):
        # Stored values win; defaults only fill the gaps.
        tracker.update(stored)
    else:
        print(f"⚠️  Ignoring {COST_TRACKER_FILE}: expected a JSON object")
    return tracker

def save_cost_tracker(tracker):
    """Persist the cost tracker to ``COST_TRACKER_FILE`` as indented JSON.

    The data is written to a sibling ``.tmp`` file first and then moved into
    place with ``os.replace``, so an interrupted or failed write cannot leave
    a truncated tracker file behind.

    Args:
        tracker: JSON-serialisable tracker dict.
    """
    tmp_path = f"{COST_TRACKER_FILE}.tmp"
    with open(tmp_path, 'w', encoding="utf-8") as f:
        json.dump(tracker, f, indent=2)
    # Only swap the file in once it has been written completely.
    os.replace(tmp_path, COST_TRACKER_FILE)

def estimate_cost(input_tokens, output_tokens, model="gpt-4o-mini"):
    """Return the estimated dollar cost of one request.

    Rates in the table are per one million tokens (dated October 2025 by the
    original author). A model that is not in the table is priced at the
    ``gpt-4o-mini`` rates.
    """
    per_million_rates = {
        "gpt-3.5-turbo": {"input": 0.50, "output": 1.50},
        "gpt-4o-mini": {"input": 0.15, "output": 0.60},
        "gpt-4o": {"input": 2.50, "output": 10.00},
        "o3-mini": {"input": 1.10, "output": 4.40},
        "o1-mini": {"input": 1.10, "output": 4.40},
    }

    if model in per_million_rates:
        model_rates = per_million_rates[model]
    else:
        model_rates = per_million_rates["gpt-4o-mini"]

    prompt_millions = input_tokens / 1_000_000
    completion_millions = output_tokens / 1_000_000
    return prompt_millions * model_rates["input"] + completion_millions * model_rates["output"]

# === INTELLIGENT CACHING ===
# Directory holding one JSON file per cached analysis (named "<cache_key>.json").
# The path is relative, so it resolves against the current working directory.
CACHE_DIR = Path(".gpt_cache")
CACHE_DIR.mkdir(exist_ok=True)  # runs at import time; no error if it already exists

def get_content_hash(url, text, prompt):
    """Return a SHA-256 hex digest identifying one (url, text, prompt) input.

    The whole ``text`` is hashed, so an edit anywhere in the listing yields a
    new cache key; hashing only a prefix would keep serving a stale analysis
    when later parts of the listing change. Each field is length-prefixed
    before hashing so that two different triples can never produce the same
    byte stream.

    Note: keys differ from those of the earlier prefix-based scheme, so
    entries cached under that scheme are no longer matched.

    Args:
        url: Listing URL.
        text: Full listing text sent for analysis.
        prompt: Prompt (template) the analysis is based on.

    Returns:
        str: 64-character lowercase hexadecimal digest.
    """
    digest = hashlib.sha256()
    for field in (url, text, prompt):
        data = str(field).encode("utf-8")
        # "<byte length>:<bytes>" keeps field boundaries unambiguous.
        digest.update(f"{len(data)}:".encode("ascii"))
        digest.update(data)
    return digest.hexdigest()

def get_cached_analysis(cache_key):
    """Return the cached analysis stored under ``cache_key``, or ``None``.

    A missing cache file is a normal cache miss. An unreadable or corrupt
    file is also treated as a miss (with a warning), so the caller can fall
    back to a fresh analysis instead of failing on the same broken entry
    every run.

    Args:
        cache_key: Key the entry was stored under; the file is
            ``CACHE_DIR/<cache_key>.json``.

    Returns:
        The decoded JSON content of the cache file, or ``None`` on a miss.
    """
    cache_file = CACHE_DIR / f"{cache_key}.json"
    try:
        with open(cache_file, 'r', encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return None
    except (OSError, ValueError) as e:
        # json.JSONDecodeError is a ValueError subclass.
        print(f"   ⚠️  Ignoring unreadable cache entry {cache_file.name}: {e}")
        return None

def save_to_cache(cache_key, analysis_data):
    """Store ``analysis_data`` as indented JSON in ``CACHE_DIR/<cache_key>.json``.

    The entry is written to a ``.tmp`` sibling first and then moved into
    place, so an interrupted write never leaves a truncated cache file that
    later fails to parse.

    Args:
        cache_key: Key used to name the cache file.
        analysis_data: JSON-serialisable data to store.
    """
    cache_file = CACHE_DIR / f"{cache_key}.json"
    tmp_file = CACHE_DIR / f"{cache_key}.json.tmp"
    with open(tmp_file, 'w', encoding="utf-8") as f:
        json.dump(analysis_data, f, indent=2)
    # Swap in the finished file only after a complete write.
    tmp_file.replace(cache_file)

# === OPTIMIZED PROMPT (30% SHORTER) ===
# Compact Dutch prompt, selected when USE_OPTIMIZED_PROMPT is enabled.
# The {advertentietekst} placeholder is filled with the scraped listing text
# through str.replace (not str.format), so other braces would be left as-is.
# This template has no {locatie_context} or {custom_criteria_data} placeholder.
# The reply format it asks for ("<n>. <criterion>: <score> - <reason>" lines
# and a closing "Risico:" line) is what the score extraction in the main loop
# parses.
OPTIMIZED_PROMPT = """Analyseer dit vastgoed voor regeneratieve landbouw. Score 1-5 (1=slecht, 5=excellent):

{advertentietekst}

1. Market garden: bodem, zon, water, oppervlakte
2. Gastenverblijf: rust, omgeving, bereikbaarheid
3. Werkplaats: gebouwen, voorzieningen
4. Verhuureenheden: huurpotentieel
5. Ligging: afstand kust/stad/vliegveld
6. Lokale markt: afzetmogelijkheden

Format:
1. Market garden: [score] - [reden]
...
Risico: Laag/Gemiddeld/Hoog"""

# Read the unified English prompt template (the variant that integrates
# custom data) into memory once, decoded as UTF-8.
base_prompt = Path("prompt_english.txt").read_text(encoding="utf-8")

# Import custom data formatter
from format_custom_data_for_gpt import format_custom_data_for_prompt

# Data loading
input_file = "extracted_property_urls.csv"
output_file = "analysis_output.csv"
enriched_file = "enriched_data.json"

# Load existing analyses from enriched_data.json (source of truth).
# A property only counts as "analyzed" when it has a numeric gpt_score > 0.
# Entries are validated one by one: a single malformed entry (missing url,
# null or non-numeric gpt_score) is skipped or treated as "not analyzed"
# instead of discarding the whole file and re-analysing every property.
analyzed_urls = set()
property_data_by_url = {}  # For quick lookup of custom criteria
if os.path.exists(enriched_file):
    try:
        with open(enriched_file, 'r', encoding="utf-8") as f:
            enriched_data = json.load(f)
    except (OSError, ValueError) as e:
        # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
        print(f"⚠️  Could not load enriched_data.json: {e}")
    else:
        if isinstance(enriched_data, list):
            for prop in enriched_data:
                prop_url = prop.get('url') if isinstance(prop, dict) else None
                if not isinstance(prop_url, str):
                    continue  # cannot be matched against the input URLs
                # URL-indexed lookup for custom criteria (later entries win)
                property_data_by_url[prop_url] = prop
                try:
                    prop_score = float(prop.get('gpt_score') or 0)
                except (TypeError, ValueError):
                    prop_score = 0.0
                # Only skip properties that have actual GPT scores (not 0.0)
                if prop_score > 0:
                    analyzed_urls.add(prop_url)
            print(f"📊 Loaded enriched_data.json: {len(enriched_data)} total, {len(analyzed_urls)} with GPT scores")
        else:
            print(f"⚠️  Could not load enriched_data.json: expected a list of properties, got {type(enriched_data).__name__}")

# Load existing analysis_output.csv for appending new results
if os.path.exists(output_file):
    try:
        existing_df = pd.read_csv(output_file)
    except pd.errors.EmptyDataError:
        # A zero-byte file (e.g. from an interrupted save) holds no rows.
        existing_df = pd.DataFrame()
else:
    existing_df = pd.DataFrame()

df = pd.read_csv(input_file)
results = []

# HTTP headers sent with every scrape request (desktop Safari user agent)
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) "
        "Version/14.0.3 Safari/605.1.15"
    ),
}

# Score weights: maps the 1-5 score (as a string) to its signed contribution,
# i.e. "1" -> -2, "2" -> -1, "3" -> 1, "4" -> 2, "5" -> 3
score_weights = dict(zip("12345", (-2, -1, 1, 2, 3)))

# Criterion weights (supports both English and Dutch).
# A reply line is matched by substring against these keys in insertion order
# and the first hit wins, so more specific keys must come before shorter ones.
criteria_weights = {
    # English keywords (from English prompt)
    "regenerative market garden": 2.0,
    "market garden": 2.0,
    "guest accommodation": 2.5,
    "bed & breakfast": 2.5,
    "workshop": 2.0,
    "food processing": 2.0,
    "independent rental units": 1.5,
    "rental units": 1.5,
    "location relative to": 3.0,
    "location": 3.0,
    "distance to local market": 1.5,
    "local market": 1.5,
    # Dutch keywords (for backward compatibility)
    "regeneratieve market garden": 2.0,
    "gastenverblijf": 2.5,
    "werkplaats": 2.0,
    "zelfstandige verhuureenheden": 1.5,
    "locatie": 3.0,
    "afstand tot lokale markt": 1.5,
    # Short Dutch labels used by the optimized prompt ("4. Verhuureenheden",
    # "5. Ligging", "6. Lokale markt"); without them those criteria match no
    # key and are left out of the weighted score. Weights mirror the longer
    # Dutch equivalents above. Kept last so earlier matches are unchanged.
    "verhuureenheden": 1.5,
    "ligging": 3.0,
    "lokale markt": 1.5,
}

# === COST TRACKING INITIALIZATION ===
# All-time tracker loaded from disk, plus the per-run counters that are
# appended to it when the run finishes.
cost_tracker = load_cost_tracker()
run_start = time.time()
run_data = dict(
    timestamp=time.strftime("%Y-%m-%d %H:%M:%S"),
    properties_analyzed=0,
    cache_hits=0,
    cache_misses=0,
    total_cost=0.0,
    tokens_input=0,
    tokens_output=0,
)

# === PROGRESS TRACKING ===
job_id = os.getenv('FARMMATCH_JOB_ID')
progress_file = f'/tmp/paradisomatch_progress_{job_id}.json' if job_id else None
# Captured once, so every progress update reports the same start time
# instead of the time of the latest update.
_progress_started_at = time.strftime("%Y-%m-%dT%H:%M:%S")

def update_progress(progress, status='running', current_step=''):
    """Write the current progress to the job's progress file for UI tracking.

    Does nothing when no job id is configured (``progress_file`` is ``None``).
    The file is written to a ``.tmp`` sibling and moved into place with
    ``os.replace`` so a concurrent reader never sees partial JSON. Failures
    are reported as a warning and never interrupt the analysis.

    Args:
        progress: Progress value to report (the caller passes a 0-100 percentage).
        status: Status string stored in the file.
        current_step: Human-readable description of the current step.
    """
    if not progress_file:
        return
    payload = {
        'job_id': job_id,
        'status': status,
        'progress': progress,
        'current_step': current_step,
        'total_steps': 1,
        'started_at': _progress_started_at
    }
    tmp_file = f"{progress_file}.tmp"
    try:
        with open(tmp_file, 'w', encoding="utf-8") as f:
            json.dump(payload, f)
        os.replace(tmp_file, progress_file)
    except Exception as e:
        # Best effort: progress reporting must not break the run.
        print(f"Warning: Could not update progress file: {e}")

# User configuration
# Support non-interactive mode via environment variables
import sys
if sys.stdin.isatty():
    # Answers are stripped so stray whitespace does not flip the choice.
    USE_OPTIMIZED_PROMPT = input("Use optimized prompt? (saves ~30% tokens) [y/N]: ").strip().lower() == 'y'
    USE_CACHE = input("Use intelligent caching? (skip unchanged properties) [Y/n]: ").strip().lower() != 'n'
else:
    # Non-interactive mode: use smart defaults
    USE_OPTIMIZED_PROMPT = os.getenv('USE_OPTIMIZED_PROMPT', 'n').lower() == 'y'
    USE_CACHE = os.getenv('USE_CACHE', 'y').lower() == 'y'

# Track how many we need to analyze (excluding already analyzed).
# Computed before the banner so "Need analysis" reports the real to-do count:
# analyzed_urls may contain URLs that are not in the input file, and nothing
# is skipped at all when caching is off.
properties_to_analyze = [url for url in df['URL'] if url not in analyzed_urls] if USE_CACHE else df['URL'].tolist()
total_to_analyze = len(properties_to_analyze)
analyzed_count = 0

print(f"\n🚀 Starting analysis with:")
print(f"   - Prompt: {'Optimized (short)' if USE_OPTIMIZED_PROMPT else 'Full English with custom data'}")
print(f"   - Intelligent caching: {'✓' if USE_CACHE else '✗'}")
print(f"   - Custom criteria integration: ✓")
print(f"   - SHORT-STAY focus: ✓")
print(f"   - LIVABILITY emphasis: ✓")
print(f"   - Total properties: {len(df)}")
print(f"   - With GPT scores: {len(analyzed_urls)}")
print(f"   - Need analysis: {total_to_analyze}")
print(f"   - Cache directory: {CACHE_DIR.absolute()}\n")

update_progress(0, 'running', f'Starting analysis of {len(df)} properties...')

print(f"📊 Will analyze {total_to_analyze} properties (skipping {len(df) - total_to_analyze} already analyzed)\n")

# Matches the score that opens the text following a criterion's colon.
# Leading whitespace, markdown markers and "[" are skipped, so "4", "**4**",
# "** 4", "[4]", "4/5" and "4." are all read as 4; multi-digit numbers such as
# "10" are rejected.
_SCORE_AFTER_COLON_RE = re.compile(r"[\s*_\[]*([1-5])(?!\d)")


def _extract_listing_text(soup, title_text):
    """Return the listing text to analyse, taken from the parsed page.

    Uses the first three ``listing-section-content`` divs (each capped at 500
    characters; sections of 20 characters or fewer are skipped) plus the
    ``areas`` div. When that yields fewer than 50 characters, the meta
    description is used instead, with ``title_text`` as the last resort.
    """
    section_texts = []
    for section in soup.find_all('div', class_='listing-section-content')[:3]:
        text = section.get_text(strip=True)
        if text and len(text) > 20:  # Avoid empty or tiny sections
            section_texts.append(text[:500])  # Limit each section

    # The areas div carries the structured property data
    areas = soup.find('div', class_='areas')
    if areas:
        section_texts.append(f"Property details: {areas.get_text(strip=True)}")

    full_text = "\n\n".join(section_texts) if section_texts else title_text

    # Fallback to meta description if we got (almost) nothing
    if not full_text or len(full_text) < 50:
        desc_tag = soup.find("meta", {"name": "description"})
        # .get() avoids a KeyError on a <meta> tag without a content attribute
        full_text = (desc_tag.get("content") if desc_tag else None) or title_text
    return full_text


def _score_reply(reply):
    """Return the risk-adjusted weighted score parsed from a GPT reply.

    Every line containing a colon is checked: the text before the first colon
    is matched (case-insensitively, by substring, first hit wins) against
    ``criteria_weights``; the score is read from the start of the text after
    the colon and mapped through ``score_weights``. The weighted average is
    multiplied by the risk factor and rounded to two decimals. Returns 0 when
    no criterion line was found.
    """
    total_score = 0.0
    total_weight = 0.0

    for line in reply.split("\n"):
        label, colon, remainder = line.partition(":")
        if not colon:
            continue
        criterium = label.strip().lower()
        score_match = _SCORE_AFTER_COLON_RE.match(remainder)
        # A criterion line without a recognisable score still counts with
        # value 0 and its full weight, as before.
        score_value = score_weights.get(score_match.group(1), 0) if score_match else 0

        for key, weight in criteria_weights.items():
            if key in criterium:
                total_score += score_value * weight
                total_weight += weight
                break

    # Risk profile
    # NOTE(review): only the Dutch labels are recognised here; if the English
    # prompt asks for an English risk line, no risk factor is applied - confirm.
    risk_factor = 1.0
    risk_match = re.search(r"[Rr]isico(profiel)?:\s*(Laag|Gemiddeld|Hoog)", reply)
    if risk_match:
        risk_factor = {"laag": 1.0, "gemiddeld": 0.9, "hoog": 0.7}[risk_match.group(2).lower()]

    return round((total_score / total_weight) * risk_factor, 2) if total_weight > 0 else 0


# Rows carried over from earlier runs; converted once instead of on every save.
existing_records = existing_df.to_dict("records")

for idx, row in enumerate(df.itertuples(), 1):
    url = row.URL

    # Skip if already has GPT score (gpt_score > 0)
    if url in analyzed_urls and USE_CACHE:
        print(f"⏭️  Already has GPT score, skipping: {url}")
        continue

    # Reset per-property state so a failure below can never report the
    # previous property's title/text in this property's error row.
    title_text = ""
    full_text = ""

    try:
        analyzed_count += 1
        progress_pct = int((analyzed_count / max(total_to_analyze, 1)) * 100)
        update_progress(progress_pct, 'running', f'Analyzing property {analyzed_count}/{total_to_analyze}...')
        print(f"🔎 Analysis {analyzed_count}/{total_to_analyze}: {url}")
        response = requests.get(url, headers=headers, timeout=10)
        # Do not spend GPT tokens on 4xx/5xx error pages; the failure is
        # recorded as an error row by the handler below.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Extract semantic data from page structure
        title_elem = soup.find('h1')
        title_text = title_elem.get_text(strip=True) if title_elem else ""
        full_text = _extract_listing_text(soup, title_text)

        print(f"   📄 Extracted {len(full_text)} characters from semantic HTML")

        # Extra context from location fields. `row` is a namedtuple, so the
        # columns are read as attributes (`in` / `[]` with a column name would
        # test the values and fail on the string index).
        location_parts = []
        for field in ["Plaats", "Provincie", "Adres"]:
            value = getattr(row, field, None)
            if pd.notna(value):
                location_parts.append(str(value))

        location_context = "\n".join(location_parts)

        # Get custom criteria data for this property if available
        property_data = property_data_by_url.get(url, {})
        custom_data_text = format_custom_data_for_prompt(property_data)
        custom_score = property_data.get('custom_score') or 0  # tolerate null
        has_custom_data = bool(custom_data_text and custom_score > 0)

        # Choose prompt: optimized (short) or full English with custom data
        if USE_OPTIMIZED_PROMPT:
            prompt_template = OPTIMIZED_PROMPT
        else:
            prompt_template = base_prompt  # Full English prompt with custom data integration

        # Assemble final prompt
        final_prompt = prompt_template.replace("{advertentietekst}", full_text)
        final_prompt = final_prompt.replace("{locatie_context}", location_context if location_context else "No additional location information available")

        # Insert custom criteria data if available (only for full prompt)
        if not USE_OPTIMIZED_PROMPT:
            if has_custom_data:
                final_prompt = final_prompt.replace("{custom_criteria_data}", custom_data_text)
                print(f"   🌍 Using custom criteria data (score: {custom_score:.1f}/5.0)")
            else:
                # Replace placeholder with "no data" message
                no_data_msg = "[NO OBJECTIVE DATA AVAILABLE]\nObjective climate and location data has not yet been collected for this property.\nBase your analysis on the description and general knowledge of the region."
                final_prompt = final_prompt.replace("{custom_criteria_data}", no_data_msg)

        # === CHECK CACHE ===
        # NOTE(review): the key covers the prompt template, not the assembled
        # prompt, so changed location/custom data does not invalidate an entry.
        cache_key = get_content_hash(url, full_text, prompt_template)
        cached = get_cached_analysis(cache_key) if USE_CACHE else None

        if cached:
            print(f"   💾 Cache hit! Reusing previous analysis")
            reply = cached['analysis']
            cost_tracker["cache_hits"] += 1
            run_data["cache_hits"] += 1

            # Estimate tokens we saved (rough heuristic: 4 characters per token)
            est_input = len(final_prompt) // 4
            est_output = len(reply) // 4
            cost_tracker["tokens_saved"] += est_input + est_output
        else:
            # === GPT REQUEST ===
            print(f"   🤖 Calling GPT-4o-mini...")
            gpt_response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[{"role": "user", "content": final_prompt}],
                temperature=0.3
            )

            reply = gpt_response.choices[0].message.content.strip()

            # Track costs
            input_tokens = gpt_response.usage.prompt_tokens
            output_tokens = gpt_response.usage.completion_tokens
            cost = estimate_cost(input_tokens, output_tokens, model="gpt-4o-mini")

            cost_tracker["total_cost"] += cost
            cost_tracker["total_requests"] += 1
            cost_tracker["cache_misses"] += 1

            run_data["cache_misses"] += 1
            run_data["total_cost"] += cost
            run_data["tokens_input"] += input_tokens
            run_data["tokens_output"] += output_tokens

            print(f"   💰 Cost: ${cost:.6f} ({input_tokens} in + {output_tokens} out tokens)")

            # Save to cache
            if USE_CACHE:
                save_to_cache(cache_key, {
                    'url': url,
                    'analysis': reply,
                    'timestamp': time.time()
                })

        # === SCORE EXTRACTION ===
        weighted_score = _score_reply(reply)

        results.append({
            "URL": url,
            "Titel": title_text,
            "Samenvatting": full_text[:200],
            "GPT Analyse": reply,
            "Gewogen Score": weighted_score
        })

        run_data["properties_analyzed"] += 1

        # Intermediate save
        pd.DataFrame(results + existing_records).to_csv(output_file, index=False, encoding="utf-8")

        # Only sleep if we made an API call
        if not cached:
            time.sleep(1)  # Reduced from 2s to 1s

    except Exception as e:
        # Per-property boundary: record the failure and continue with the rest.
        print(f"❌ Error at {url}: {e}")
        results.append({
            "URL": url,
            "Titel": title_text,
            "Samenvatting": full_text[:200],
            "GPT Analyse": f"Error: {e}",
            "Gewogen Score": 0
        })

# === FINAL SAVE ===
if results:
    # Rows from earlier runs are dropped when this run produced a newer row
    # for the same URL, so retried properties do not pile up duplicate or
    # stale (e.g. old error) rows in the output file.
    fresh_urls = {entry["URL"] for entry in results}
    carried_over = [
        record for record in existing_df.to_dict("records")
        if record.get("URL") not in fresh_urls
    ]
    final_df = pd.DataFrame(results + carried_over)
    final_df.to_csv(output_file, index=False, encoding="utf-8")
    print(f"\n✅ Analysis complete! {len(results)} new properties analyzed.")

# === COST REPORT ===
run_data["duration_seconds"] = round(time.time() - run_start, 2)
cost_tracker["runs"].append(run_data)
save_cost_tracker(cost_tracker)

# Mark progress as complete
update_progress(100, 'completed', f'Analysis complete! {run_data["properties_analyzed"]} properties analyzed.')

# Exact hit rate with an explicit zero guard (a fudge term in the denominator
# would report e.g. 99.9% for a perfect rate at small counts).
total_lookups = cost_tracker['cache_hits'] + cost_tracker['cache_misses']
hit_rate = 100 * cost_tracker['cache_hits'] / total_lookups if total_lookups else 0.0

print("\n" + "="*60)
print("💰 COST REPORT")
print("="*60)
print(f"This run:")
print(f"  Properties analyzed: {run_data['properties_analyzed']}")
print(f"  Cache hits: {run_data['cache_hits']}")
print(f"  Cache misses (API calls): {run_data['cache_misses']}")
print(f"  Total cost: ${run_data['total_cost']:.6f}")
print(f"  Input tokens: {run_data['tokens_input']:,}")
print(f"  Output tokens: {run_data['tokens_output']:,}")
print(f"  Duration: {run_data['duration_seconds']}s")
print()
print(f"All-time totals:")
print(f"  Total spent: ${cost_tracker['total_cost']:.4f}")
print(f"  Total requests: {cost_tracker['total_requests']}")
print(f"  Cache efficiency: {cost_tracker['cache_hits']}/{total_lookups} ({hit_rate:.1f}%)")
print(f"  Tokens saved by cache: {cost_tracker['tokens_saved']:,}")
if USE_OPTIMIZED_PROMPT:
    print(f"  Est. savings from optimized prompt: ~30%")
print("="*60)
print(f"\n📊 Cost tracker saved to: {COST_TRACKER_FILE}")