#!/usr/bin/env python3
"""
Re-extract scores from existing GPT analyses in CSV.
This fixes the issue where analyses exist but "Gewogen Score" is 0.
"""

import pandas as pd
import re

# Textual score token -> numeric value, for the five-point scale ("5" down to "1").
score_weights = {str(points): points for points in range(5, 0, -1)}

# Criterion keyword (English, lower-case) -> weight in the weighted average.
# Keywords that share a weight are listed together, longer phrase first.
criteria_weights = {
    keyword: weight
    for keywords, weight in (
        (("regenerative market garden", "market garden"), 2.0),
        (("guest accommodation", "bed & breakfast"), 2.5),
        (("workshop", "food processing"), 2.0),
        (("independent rental units", "rental units"), 1.5),
        (("location relative to", "location"), 3.0),
        (("distance to local market", "local market"), 1.5),
    )
    for keyword in keywords
}

# Load the previously exported analyses that are going to be re-scored.
print("📊 Loading CSV...")
df = pd.read_csv(
    'analysis_output_OLD_WITHOUT_SCORES.csv',
    encoding='utf-8',
)
print(f"✓ Loaded {df.shape[0]} entries")

# Leading score token after the colon: optional markdown emphasis ("**", "_"),
# then a whole number. The look-aheads reject a partial match inside a longer
# number ("45") or a decimal ("4.5"), while still accepting "4", "**4**",
# "4/5" and "4.".
_SCORE_TOKEN_RE = re.compile(r"[\s*_]*(\d+)(?!\d)(?![.,]\d)")

# Risk line in English ("Risk ...") or Dutch ("Risico ..."), in any casing,
# tolerating markdown emphasis between the colon and the level.
_RISK_LEVEL_RE = re.compile(
    r"ris(?:k|ico).*?:[\s*_]*(low|moderate|high|laag|gemiddeld|hoog)",
    re.IGNORECASE,
)

# Multiplier applied to the weighted average for each recognised risk level.
_RISK_FACTORS = {
    "low": 1.0,
    "laag": 1.0,
    "moderate": 0.9,
    "gemiddeld": 0.9,
    "high": 0.7,
    "hoog": 0.7,
}


def _criterion_weight(label):
    """Return the weight of the first criterion keyword found in *label*, or None."""
    for keyword, weight in criteria_weights.items():
        if keyword in label:
            return weight
    return None


def _line_score(remainder):
    """Return the score that starts *remainder* (the text after the colon), or 0."""
    token = _SCORE_TOKEN_RE.match(remainder)
    return score_weights.get(token.group(1), 0) if token else 0


def _risk_factor(text):
    """Return the risk multiplier named in *text*; 1.0 when no risk level is found."""
    found = _RISK_LEVEL_RE.search(text)
    return _RISK_FACTORS[found.group(1).lower()] if found else 1.0


def _weighted_score(text):
    """Compute the risk-adjusted weighted average score of one analysis text.

    Every "criterion: score" line whose criterion contains a known keyword and
    whose score is a valid entry of ``score_weights`` contributes
    ``score * weight``. The weighted average is multiplied by the risk factor
    and rounded to two decimals. Returns 0 when no line could be scored.
    """
    total_score = 0.0
    total_weight = 0.0

    for line in text.split("\n"):
        label, colon, remainder = line.partition(":")
        if not colon:
            continue

        weight = _criterion_weight(label.strip().lower())
        if weight is None:
            continue

        score = _line_score(remainder)
        if score == 0:
            # No valid score on this line (e.g. a heading or a descriptive
            # sentence). Counting its weight would drag the average down as if
            # the criterion had been scored 0, which is not a valid score.
            continue

        total_score += score * weight
        total_weight += weight

    if total_weight == 0:
        return 0
    return round((total_score / total_weight) * _risk_factor(text), 2)


# When every stored score is 0 the column is read as an integer dtype; writing
# a float into it through df.at is an incompatible-dtype assignment, so make
# the column float up front.
if pd.api.types.is_integer_dtype(df['Gewogen Score']):
    df['Gewogen Score'] = df['Gewogen Score'].astype(float)

fixed_count = 0
for idx, row in df.iterrows():
    analysis = row['GPT Analyse']
    # Skip rows that have no finished analysis to extract scores from.
    if pd.isna(analysis) or analysis == '' or 'Pending' in str(analysis):
        continue

    # str() so that a non-string cell cannot break the text parsing.
    weighted_score = _weighted_score(str(analysis))

    # Update score if different
    if weighted_score != row['Gewogen Score']:
        df.at[idx, 'Gewogen Score'] = weighted_score
        fixed_count += 1

# Persist the re-scored table, then print a short summary of the result.
print(f"\n✅ Fixed {fixed_count} scores")
print("📊 Saving to analysis_output.csv...")
df.to_csv('analysis_output.csv', index=False, encoding='utf-8')

# Show statistics
scores = df['Gewogen Score']
non_zero = scores[scores > 0]
total_entries = len(df)
print("\n📈 FINAL STATISTICS:")
print(f"Total entries: {total_entries}")
if total_entries:
    print(f"With scores > 0: {len(non_zero)} ({100*len(non_zero)/total_entries:.1f}%)")
    print(f"Score range: {scores.min():.2f} - {scores.max():.2f}")
else:
    # An empty table has no percentage or range to report; avoid dividing by zero.
    print("With scores > 0: 0 (0.0%)")
if len(non_zero) > 0:
    print(f"Average score (non-zero): {non_zero.mean():.2f}")