"""
Parse individual criteria scores from GPT analysis
UPDATED: Now includes KPI validation and combines GPT + Custom scores
"""
import pandas as pd
import re
import json
from validate_scores import validate_criteria_scores

def extract_criteria_scores(gpt_analysis):
    """Extract individual scores from GPT analysis text"""
    if pd.isna(gpt_analysis):
        return {}

    scores = {}
    criteria_map = {
        # Dutch keywords
        'regeneratieve market garden': 'market_garden',
        'gastenverblijf': 'guest_accommodation',
        'werkplaats': 'workshop',
        'zelfstandige verhuureenheden': 'rental_units',
        'ligging': 'location',
        'afstand tot lokale markt': 'local_market',
        # English keywords (CRITICAL FIX for English analyses)
        'regenerative market garden': 'market_garden',
        'market garden': 'market_garden',
        'guest accommodation': 'guest_accommodation',
        'bed & breakfast': 'guest_accommodation',
        'workshop': 'workshop',
        'food processing': 'workshop',
        'independent rental units': 'rental_units',
        'rental units': 'rental_units',
        'location relative to': 'location',
        'location': 'location',
        'distance to local market': 'local_market',
        'local market': 'local_market'
    }

    # Parse each line looking for pattern: "criterion: score"
    lines = gpt_analysis.split('\n')
    for line in lines:
        if ':' in line:
            # Look for patterns like "1. Regenerative market garden: **4** –" OR "1. Regeneratieve market garden: 4 –"
            # Updated regex to handle markdown formatting (**score**)
            match = re.search(r'(\d+)\.\s*([^:]+):\s*\**(\d+)\**', line)
            if match:
                criterion_text = match.group(2).strip().lower()
                score = int(match.group(3))

                # Map to our standard names
                for key, value in criteria_map.items():
                    if key in criterion_text:
                        scores[value] = score
                        break

    return scores

def extract_risk_profile(gpt_analysis):
    """Extract risk profile from GPT analysis"""
    if pd.isna(gpt_analysis):
        return None

    # Look for "Risicoprofiel: Laag/Gemiddeld/Hoog"
    match = re.search(r'[Rr]isicoprofiel:\s*(Laag|Gemiddeld|Hoog)', gpt_analysis)
    if match:
        return match.group(1)
    return None

def get_risk_factor(risk_profile):
    """Convert risk profile to multiplier"""
    if not risk_profile:
        return 0.9  # Default to Gemiddeld

    risk_profile = risk_profile.lower()
    if 'laag' in risk_profile:
        return 1.0
    elif 'gemiddeld' in risk_profile:
        return 0.9
    elif 'hoog' in risk_profile:
        return 0.7
    return 0.9

def calculate_combined_score(gpt_score, custom_score, risk_factor, weights=(0.6, 0.4)):
    """
    Calculate combined overall score from GPT and Custom criteria

    Args:
        gpt_score: Weighted GPT score (0-5)
        custom_score: Custom criteria overall score (0-5)
        risk_factor: Risk multiplier (0.7-1.0)
        weights: Tuple of (gpt_weight, custom_weight) - defaults to (0.6, 0.4)

    Returns:
        float: Combined overall score
    """
    gpt_weight, custom_weight = weights

    # If one score is missing, use only the available one
    if gpt_score == 0 and custom_score > 0:
        return custom_score * risk_factor
    elif custom_score == 0 and gpt_score > 0:
        return gpt_score * risk_factor
    elif gpt_score == 0 and custom_score == 0:
        return 0

    # Combine both scores
    combined = (gpt_score * gpt_weight + custom_score * custom_weight) * risk_factor
    return round(combined, 2)

def enrich_analysis_data(input_csv="analysis_output.csv", output_json="enriched_data.json"):
    """Add individual criteria scores to the dataset"""

    print(f"📊 Reading {input_csv}...")
    df = pd.read_csv(input_csv)

    # Normalize URL column name if needed
    if 'URL' not in df.columns and 'Property URL' in df.columns:
        df = df.rename(columns={'Property URL': 'URL'})

    # Remove duplicate URLs to keep the dataset consistent before enrichment
    before = len(df)
    if 'URL' in df.columns:
        df = df.drop_duplicates(subset=['URL']).reset_index(drop=True)
    removed = before - len(df)
    if removed > 0:
        print(f"🔧 Removed {removed} duplicate rows based on URL")

    # Load existing enriched_data.json to preserve geocoding and other data
    existing_data = {}
    try:
        with open(output_json, 'r', encoding='utf-8') as f:
            existing_properties = json.load(f)
            existing_data = {p['url']: p for p in existing_properties}
            print(f"✓ Loaded {len(existing_data)} existing properties from {output_json}")
    except FileNotFoundError:
        print(f"ℹ️ No existing {output_json} found, creating from scratch")

    # Remove existing Prijs/Locatie columns if they exist (to avoid duplicates)
    for col in ['Prijs', 'Locatie', 'Prijs_x', 'Prijs_y', 'Locatie_x', 'Locatie_y']:
        if col in df.columns:
            df.drop(col, axis=1, inplace=True)

    # Merge price data from extracted_property_urls.csv
    try:
        print(f"💰 Loading price data from extracted_property_urls.csv...")
        price_df = pd.read_csv('extracted_property_urls.csv')

        # Normalize URL column
        if 'URL' not in price_df.columns and 'Property URL' in price_df.columns:
            price_df = price_df.rename(columns={'Property URL': 'URL'})

        if 'URL' not in price_df.columns:
            raise KeyError("No URL column found in extracted_property_urls.csv")

        # Avoid duplicate suffix collisions (e.g., Thumbnail_x)
        for col in ['Thumbnail']:
            if col in df.columns:
                df.drop(columns=[col], inplace=True)

        merge_cols = ['URL']
        for col in ['Prijs', 'Locatie', 'Thumbnail', 'Status', 'Titel', 'Samenvatting']:
            if col in price_df.columns:
                merge_cols.append(col)

        df = df.merge(price_df[merge_cols], on='URL', how='left')
        print(f"✓ Price data merged using columns: {merge_cols}")
    except Exception as e:
        print(f"⚠️ Could not load price data: {e}")
        if 'Prijs' not in df.columns:
            df['Prijs'] = None
        if 'Locatie' not in df.columns:
            df['Locatie'] = None

    # Add new columns for each criterion
    criteria_columns = ['market_garden', 'guest_accommodation', 'workshop',
                       'rental_units', 'location', 'local_market', 'risk_profile',
                       'overall_score_combined']

    # Recalculate Gewogen Score when GPT Analyse exists but score is 0 (fallback)
    criteria_weights = {
        "regeneratieve market garden": 2.5,
        "gastenverblijf": 3.0,
        "werkplaats": 2.0,
        "zelfstandige verhuureenheden": 2.5,
        "locatie": 2.0,
        "afstand tot lokale markt": 1.0,
        "regenerative market garden": 2.5,
        "guest accommodation": 3.0,
        "workshop": 2.0,
        "independent rental units": 2.5,
        "location": 2.0,
        "distance to local market": 1.0,
    }
    score_weights = {"1": -2, "2": -1, "3": 1, "4": 2, "5": 3}

    def recalc_weighted_score(text):
        total_score = 0.0
        total_weight = 0.0
        for line in str(text).split("\n"):
            if ":" in line:
                parts = line.split(":")
                criterium = parts[0].strip().lower()
                score_part = parts[1].strip().split(" ")[0]
                score_val = score_weights.get(score_part, 0)
                for key, val in criteria_weights.items():
                    if key in criterium:
                        total_score += score_val * val
                        total_weight += val
                        break
        if total_weight == 0:
            return 0.0
        risk_factor = 1.0
        match = pd.Series([text]).str.extract(r"[Rr]isicoprofiel:\\s*(Laag|Gemiddeld|Hoog)").iloc[0,0]
        if isinstance(match, str):
            m = match.lower()
            if m == "laag":
                risk_factor = 1.0
            elif m == "gemiddeld":
                risk_factor = 0.9
            elif m == "hoog":
                risk_factor = 0.7
        return round((total_score / total_weight) * risk_factor, 2)

    if 'Gewogen Score' in df.columns:
        missing_scores = (df['Gewogen Score'].fillna(0) == 0) & df['GPT Analyse'].notna()
        if missing_scores.any():
            print(f"🔧 Recalculating Gewogen Score for {missing_scores.sum()} rows with GPT text but zero score")
            df.loc[missing_scores, 'Gewogen Score'] = df.loc[missing_scores, 'GPT Analyse'].apply(recalc_weighted_score)

    # Deterministic fallback for stubborn zeros
    def deterministic_fallback(row):
        # start neutral
        scores = {
            'market_garden': 3,
            'guest_accommodation': 3,
            'workshop': 3,
            'rental_units': 3,
            'location': 3,
            'local_market': 3
        }
        land = float(row['land_size']) if pd.notna(row.get('land_size')) else None
        bld = float(row['building_size']) if pd.notna(row.get('building_size')) else None
        beds = int(row['bedrooms']) if pd.notna(row.get('bedrooms')) else None
        baths = int(row['bathrooms']) if pd.notna(row.get('bathrooms')) else None
        # Market garden
        if land is not None:
            if land >= 1500: scores['market_garden'] += 1
            elif land < 500: scores['market_garden'] -= 1
        # Guest accommodation
        if beds is not None:
            if beds >= 3: scores['guest_accommodation'] += 1
            elif beds <= 1: scores['guest_accommodation'] -= 1
        # Workshop
        if bld is not None:
            if bld >= 80: scores['workshop'] += 1
            elif bld < 40: scores['workshop'] -= 1
        # Rental units
        if beds is not None and bld is not None:
            if beds >= 2 and bld >= 100: scores['rental_units'] += 1
            elif beds <= 1 or bld < 60: scores['rental_units'] -= 1
        # Location
        if pd.notna(row.get('Latitude')) and pd.notna(row.get('Longitude')):
            scores['location'] += 1
        else:
            scores['location'] -= 1
        # Local market
        if pd.notna(row.get('hospital_distance_m')):
            dist = float(row['hospital_distance_m'])
            if dist <= 15000: scores['local_market'] += 1
            elif dist > 40000: scores['local_market'] -= 1

        # clamp 1-5
        for k in scores:
            scores[k] = max(1, min(5, scores[k]))

        # risk factor objective
        risk = row.get('risk_profile_objective')
        if isinstance(risk, str):
            rlower = risk.lower()
            risk_factor = 1.0 if 'laag' in rlower else 0.9 if 'gemiddeld' in rlower else 0.7
        else:
            risk_factor = 0.9

        # recompute weighted score using same weights
        weighted_sum = (
            scores['market_garden'] * criteria_weights['regeneratieve market garden'] +
            scores['guest_accommodation'] * criteria_weights['gastenverblijf'] +
            scores['workshop'] * criteria_weights['werkplaats'] +
            scores['rental_units'] * criteria_weights['zelfstandige verhuureenheden'] +
            scores['location'] * criteria_weights['locatie'] +
            scores['local_market'] * criteria_weights['afstand tot lokale markt']
        )
        total_w = sum(criteria_weights.values())
        weighted_score = round((weighted_sum / total_w) * risk_factor, 2)
        return scores, weighted_score

    for col in criteria_columns:
        if col not in df.columns:
            df[col] = None

    # Ensure custom_overall_score column exists (fallback to 0 if absent)
    if 'custom_overall_score' not in df.columns:
        df['custom_overall_score'] = 0.0

    properties = []
    validation_count = 0
    fallback_applied = 0

    for idx, row in df.iterrows():
        print(f"Processing {idx+1}/{len(df)}: {row['URL']}")
        fallback_used = False

        # Extract criteria scores from GPT analysis
        scores = extract_criteria_scores(row.get('GPT Analyse', ''))
        risk = extract_risk_profile(row.get('GPT Analyse', ''))
        if (not risk or pd.isna(risk)) and pd.notna(row.get('risk_profile_objective')):
            risk = row.get('risk_profile_objective')

        # Get KPIs for validation and filtering
        kpis = {
            'land_size_m2': float(row['land_size']) if pd.notna(row.get('land_size')) else None,
            'building_size_m2': float(row['building_size']) if pd.notna(row.get('building_size')) else None,
            'bedrooms': int(row['bedrooms']) if pd.notna(row.get('bedrooms')) else None,
            'bathrooms': int(row['bathrooms']) if pd.notna(row.get('bathrooms')) else None
        }

        # Validate scores against KPIs
        validated_scores, overrides = validate_criteria_scores(scores, kpis)

        if overrides:
            validation_count += len(overrides)
            print(f"  ⚠️ Applied {len(overrides)} validation overrides")

        # Calculate combined overall score
        gpt_score = float(row.get('Gewogen Score', 0)) if pd.notna(row.get('Gewogen Score')) else 0
        custom_score = float(row.get('custom_overall_score', 0)) if pd.notna(row.get('custom_overall_score')) else 0
        risk_factor = get_risk_factor(risk)

        # If gpt_score is still 0, use deterministic fallback
        if gpt_score == 0:
            fb_scores, fb_weighted = deterministic_fallback(row)
            gpt_score = fb_weighted
            df.at[idx, 'Gewogen Score'] = gpt_score
            for k,v in fb_scores.items():
                scores[k] = v
            if risk is None:
                risk = row.get('risk_profile_objective') if pd.notna(row.get('risk_profile_objective')) else 'Gemiddeld'
            fallback_applied += 1
            fallback_used = True

        overall_score = calculate_combined_score(gpt_score, custom_score, risk_factor)

        # Update dataframe with validated scores and combined totals
        for criterion, score in validated_scores.items():
            df.at[idx, criterion] = score
        df.at[idx, 'risk_profile'] = risk
        df.at[idx, 'overall_score_combined'] = overall_score

        # Get existing property data (if any) to preserve geocoding
        existing_prop = existing_data.get(row['URL'], {})

        # Create enriched property object, merging with existing data
        prop = {
            'url': row['URL'],
            'title': row.get('Titel', 'Untitled')[:100] if pd.notna(row.get('Titel')) else 'Untitled',
            'summary': row.get('Samenvatting', '')[:200] if pd.notna(row.get('Samenvatting')) else '',
            'overall_score': overall_score,  # NEW: Combined GPT + Custom score
            'gpt_score': gpt_score,  # Store individual scores for transparency
            'custom_score': custom_score,
            'location': row.get('Locatie', existing_prop.get('location', 'Unknown')) if pd.notna(row.get('Locatie')) else existing_prop.get('location', 'Unknown'),
            # CRITICAL FIX: analysis_output.csv is the SINGLE SOURCE OF TRUTH
            # NEVER use old coordinates from enriched_data.json - always use analysis_output.csv
            # This prevents wrong cached coordinates from persisting
            'lat': float(row['Latitude']) if pd.notna(row.get('Latitude')) else None,
            'lon': float(row['Longitude']) if pd.notna(row.get('Longitude')) else None,
            'location_source': row.get('LocationSource') if pd.notna(row.get('LocationSource')) else existing_prop.get('location_source'),
            'gps_source': row.get('GPSSource') if pd.notna(row.get('GPSSource')) else existing_prop.get('gps_source'),
            'analysis': row.get('GPT Analyse', '') if pd.notna(row.get('GPT Analyse')) else '',
            'criteria': validated_scores,  # UPDATED: Now uses validated scores
            'risk_profile': risk,
            'status': row.get('Status', 'Active') if pd.notna(row.get('Status')) else 'Active',
            # Try new 'price' column first, then fallback to old 'Prijs' column
            'price': (float(row['price']) if pd.notna(row.get('price')) and 10000 <= row.get('price') <= 10000000
                     else (float(row['Prijs']) if pd.notna(row.get('Prijs')) and 10000 <= row.get('Prijs') <= 10000000
                          else None))
        }

        # Preserve all geocoding-related fields from existing data
        for key in ['country', 'country_code', 'state', 'region', 'province', 'district',
                    'county', 'municipality', 'city', 'town', 'village', 'hamlet', 'suburb',
                    'postcode', 'display_name', 'osm_type', 'osm_id', 'place_id', 'locality',
                    'admin_level_1', 'admin_level_2', 'availability_last_checked',
                    'availability_status_code', 'availability_reason']:
            if key in existing_prop:
                prop[key] = existing_prop[key]

        # Add validation overrides for transparency
        if overrides:
            prop['validation_overrides'] = overrides
        if fallback_used:
            prop['validation_overrides'] = prop.get('validation_overrides', [])
            prop['validation_overrides'].append('deterministic_fallback_applied')

        # Add KPIs to property object
        prop.update(kpis)

        properties.append(prop)

    # Save updated CSV
    df.to_csv(input_csv, index=False, encoding='utf-8')
    print(f"💾 Updated {input_csv} with criteria scores")

    # Save JSON for web interface
    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(properties, f, ensure_ascii=False, indent=2)

    print(f"✅ Saved {len(properties)} enriched properties to {output_json}")

    if validation_count > 0:
        print(f"🔧 Applied {validation_count} KPI validation overrides across all properties")
    if fallback_applied > 0:
        print(f"🔧 Applied deterministic fallback scoring to {fallback_applied} properties")

    # Show stats
    print("\n📊 Criteria Coverage:")
    for criterion in ['market_garden', 'guest_accommodation', 'workshop',
                     'rental_units', 'location', 'local_market']:
        count = df[criterion].notna().sum()
        avg = df[criterion].mean() if count > 0 else 0
        print(f"  {criterion}: {count}/{len(df)} properties (avg: {avg:.2f})")

    # Show score composition stats
    print("\n📊 Score Composition:")
    gpt_count = df['Gewogen Score'].notna().sum()
    custom_count = df['custom_overall_score'].notna().sum()
    combined_count = df['overall_score_combined'].notna().sum()
    combined_avg = df['overall_score_combined'].mean() if combined_count > 0 else 0
    print(f"  GPT scores: {gpt_count}/{len(df)} properties")
    print(f"  Custom scores: {custom_count}/{len(df)} properties")
    print(f"  Combined (60% GPT + 40% Custom) * Risk Factor: {combined_count}/{len(df)} properties (avg: {combined_avg:.2f})")

    return properties

if __name__ == "__main__":
    enrich_analysis_data()