#!/usr/bin/env python3
"""
Fix scores in CSV by re-extracting from analyses that have mixed markdown/non-markdown formats.
This handles both **3** and 3 formats.
"""

import pandas as pd
import re

# Lookup from the textual score token found in an analysis ("5" .. "1")
# to its numeric value; any other token is treated as "no score" by callers.
score_weights = {str(points): points for points in range(5, 0, -1)}

# Weight per criterion, keyed by lowercase English keyword.
# Each group lists the long form of a criterion first and its shorter alias
# second. Insertion order is significant: the scoring loop uses the first key
# that occurs as a substring of the criterion label.
criteria_weights = {
    keyword: weight
    for weight, keywords in (
        (2.0, ("regenerative market garden", "market garden")),
        (2.5, ("guest accommodation", "bed & breakfast")),
        (2.0, ("workshop", "food processing")),
        (1.5, ("independent rental units", "rental units")),
        (3.0, ("location relative to", "location")),
        (1.5, ("distance to local market", "local market")),
    )
    for keyword in keywords
}

# Load the analysis results that will be re-scored in place.
print("📊 Loading analysis_output.csv...")
df = pd.read_csv('analysis_output.csv', encoding='utf-8')
print(f"✓ Loaded {len(df)} entries")

# Baseline for the report: rows whose stored weighted score is exactly 0.
before_zero = int((df['Gewogen Score'] == 0).sum())
# A header-only CSV has no rows; report 0.0% rather than dividing by zero.
before_zero_pct = 100 * before_zero / len(df) if len(df) else 0.0
print(f"Before: {before_zero} entries with score = 0 ({before_zero_pct:.1f}%)")

# Re-extract the per-criterion scores from every usable analysis text and
# recompute the weighted score:
#     sum(score * weight) / sum(weight) * risk_factor, rounded to 2 decimals
# (0 when no criterion could be scored).

# read_csv infers an integer column when every stored score is a whole number
# (e.g. all zeros). Writing a fractional score into such a column is an
# incompatible-dtype assignment, so widen the column to float up front.
if pd.api.types.is_integer_dtype(df['Gewogen Score']):
    df['Gewogen Score'] = df['Gewogen Score'].astype(float)

# Risk level following a "Risk...:" label. The character class after the colon
# skips any mix of whitespace and asterisks, so "Risk: **High**",
# "**Risk:** High" and "Risk: High" are all recognised.
risk_pattern = re.compile(
    r"[Rr]isk.*?:[\s*]*(Low|Moderate|High|Laag|Gemiddeld|Hoog)"
)
# Multiplier applied to the weighted average per risk level (English / Dutch).
risk_factors = {
    "low": 1.0,
    "laag": 1.0,
    "moderate": 0.9,
    "gemiddeld": 0.9,
    "high": 0.7,
    "hoog": 0.7,
}

fixed_count = 0
# Iterate over snapshots of the two columns so the frame is never mutated
# while it is being iterated.
analyses = df['GPT Analyse'].tolist()
old_scores = df['Gewogen Score'].tolist()
for idx, analysis, old_score in zip(df.index, analyses, old_scores):
    if pd.isna(analysis) or analysis == '':
        continue
    # Normalise once so non-string cells cannot break the text operations.
    text = str(analysis)
    if 'Pending' in text or 'Error' in text:
        continue

    # Extract scores from analysis text, one "criterion: score ..." line at a time
    total_score = 0.0
    total_weight = 0.0

    for line in text.split("\n"):
        if ":" not in line:
            continue
        parts = line.split(":")
        criterium = parts[0].strip().lower()
        # Drop markdown emphasis that closes right after the colon
        # ("**Label:** 4") before taking the first token, then strip the
        # emphasis wrapped around the token itself ("**4**").
        tokens = parts[1].strip().lstrip('*').split()
        score_token = tokens[0].strip('*') if tokens else ""
        score_value = score_weights.get(score_token, 0)
        if score_value <= 0:  # Only count lines that carry a valid score
            continue

        # First matching keyword wins, so longer aliases listed earlier in
        # criteria_weights take precedence over their shorter forms.
        for key, weight in criteria_weights.items():
            if key in criterium:
                total_score += score_value * weight
                total_weight += weight
                break

    # Risk profile: defaults to 1.0 (no penalty) when no level is found
    risk_factor = 1.0
    risk_match = risk_pattern.search(text)
    if risk_match:
        risk_factor = risk_factors[risk_match.group(1).lower()]

    weighted_score = round((total_score / total_weight) * risk_factor, 2) if total_weight > 0 else 0

    # Update score only when the recomputed value differs from the stored one
    if weighted_score != old_score:
        df.at[idx, 'Gewogen Score'] = weighted_score
        fixed_count += 1

print(f"\n✅ Fixed {fixed_count} scores")

# Preserve the on-disk CSV (still unmodified at this point) before overwriting it.
import shutil
csv_path = 'analysis_output.csv'
backup_path = 'analysis_output_BEFORE_SCORE_FIX.csv'
shutil.copy(csv_path, backup_path)
print("✓ Backed up original to analysis_output_BEFORE_SCORE_FIX.csv")

# Write the updated frame back over the original file, without the index column.
df.to_csv(csv_path, index=False, encoding='utf-8')
print("✓ Saved fixed CSV")

# Show statistics for the updated score column.
scores = df['Gewogen Score']
non_zero = scores[scores > 0]
zero = scores[scores == 0]

# Guard the percentages so an empty frame prints 0.0% instead of raising
# ZeroDivisionError.
total_entries = len(df)
non_zero_pct = 100 * len(non_zero) / total_entries if total_entries else 0.0
zero_pct = 100 * len(zero) / total_entries if total_entries else 0.0

print("\n📈 FINAL STATISTICS:")
print(f"Total entries: {total_entries}")
print(f"With scores > 0: {len(non_zero)} ({non_zero_pct:.1f}%)")
print(f"With scores = 0: {len(zero)} ({zero_pct:.1f}%)")
# Range and mean are only meaningful when at least one positive score exists.
if len(non_zero) > 0:
    print(f"Score range: {non_zero.min():.2f} - {non_zero.max():.2f}")
    print(f"Average score (non-zero): {non_zero.mean():.2f}")