#!/usr/bin/env python3
"""
Data Integrity Checker
Validates system consistency and reports discrepancies
Run this regularly to catch issues early
"""

import json
import math
from collections import Counter
from dataclasses import dataclass
from datetime import datetime
from typing import Dict, List, Optional

import pandas as pd


@dataclass
class IntegrityIssue:
    """A single finding recorded by an integrity check.

    Attributes:
        severity: One of 'error', 'warning' or 'info'.
        category: Short grouping label for the finding (e.g. 'data_sync').
        message: Human-readable description of the finding.
        details: Optional extra context such as counts or sample URLs;
            None when the check attached nothing.
        auto_fixable: True when the check flagged the finding as auto-fixable.
    """

    severity: str  # 'error', 'warning', 'info'
    category: str
    message: str
    # Optional because the default is None, not an empty dict.
    details: Optional[Dict] = None
    auto_fixable: bool = False


class DataIntegrityChecker:
    def __init__(self):
        self.issues: List[IntegrityIssue] = []

        # Load data sources
        try:
            with open('enriched_data.json') as f:
                self.enriched_data = json.load(f)
        except Exception as e:
            self.issues.append(IntegrityIssue(
                severity='error',
                category='data_loading',
                message=f'Could not load enriched_data.json: {e}'
            ))
            self.enriched_data = []

        try:
            self.csv_data = pd.read_csv('extracted_property_urls.csv')
        except Exception as e:
            self.issues.append(IntegrityIssue(
                severity='warning',
                category='data_loading',
                message=f'Could not load extracted_property_urls.csv: {e}'
            ))
            self.csv_data = pd.DataFrame()

    def run_all_checks(self):
        """Run all integrity checks"""
        print("="*60)
        print("Paradisomatch Data Integrity Check")
        print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print("="*60)
        print()

        self.check_pending_count_consistency()
        self.check_score_composition()
        self.check_geographic_data()
        self.check_url_consistency()
        self.check_status_values()
        self.check_score_ranges()
        self.check_overall_score_calculation()

        self.print_report()
        return len([i for i in self.issues if i.severity == 'error']) == 0

    def check_pending_count_consistency(self):
        """Verify pending analysis count is calculated consistently"""
        # Method 1: gpt_score = 0 (CORRECT per BUSINESS_RULES.md)
        pending_gpt = [p for p in self.enriched_data
                       if p.get('status') != 'Removed'
                       and p.get('gpt_score', 0) == 0]

        # Method 2: overall_score = 0 (INCORRECT - old way)
        pending_overall = [p for p in self.enriched_data
                           if p.get('status') != 'Removed'
                           and p.get('overall_score', 0) == 0]

        # Method 3: Not in CSV
        csv_urls = set(self.csv_data['URL'].tolist()) if not self.csv_data.empty else set()
        enriched_urls = {p['url'] for p in self.enriched_data}

        print(f"✓ Pending analysis count check:")
        print(f"  - By gpt_score=0 (correct): {len(pending_gpt)}")
        print(f"  - By overall_score=0 (old): {len(pending_overall)}")
        print(f"  - URLs in CSV: {len(csv_urls)}")
        print()

        if len(pending_gpt) != len(pending_overall):
            diff = abs(len(pending_gpt) - len(pending_overall))
            self.issues.append(IntegrityIssue(
                severity='warning',
                category='count_mismatch',
                message=f'Pending count differs by {diff} depending on method',
                details={
                    'gpt_method': len(pending_gpt),
                    'overall_method': len(pending_overall),
                    'explanation': 'Properties with custom_score but no gpt_score'
                }
            ))

        if len(csv_urls) < len(pending_gpt):
            self.issues.append(IntegrityIssue(
                severity='error',
                category='data_sync',
                message=f'CSV has {len(csv_urls)} URLs but {len(pending_gpt)} properties need analysis',
                details={'missing_urls': len(pending_gpt) - len(csv_urls)}
            ))

    def check_score_composition(self) -> None:
        """Verify overall_score only exists if component scores exist.

        A property whose overall_score is positive while both gpt_score and
        custom_score are zero is invalid. All such properties are reported as
        one auto-fixable error carrying the first five URLs.
        """
        print("✓ Score composition check:")

        invalid = []
        for prop in self.enriched_data:
            # A missing key and an explicit JSON null both mean "no score";
            # `or 0` keeps a null from raising TypeError in the comparison.
            overall = prop.get('overall_score') or 0
            gpt = prop.get('gpt_score') or 0
            custom = prop.get('custom_score') or 0

            # Rule: overall_score > 0 requires at least one component > 0
            if overall > 0 and gpt == 0 and custom == 0:
                # .get(): a record without 'url' is still reported (as None)
                # rather than crashing the check.
                invalid.append(prop.get('url'))

        if invalid:
            self.issues.append(IntegrityIssue(
                severity='error',
                category='score_integrity',
                message=f'{len(invalid)} properties have overall_score but no component scores',
                details={'urls': invalid[:5]},  # Show first 5
                auto_fixable=True
            ))
            print(f"  ✗ Found {len(invalid)} properties with invalid score composition")
        else:
            print(f"  ✓ All {len(self.enriched_data)} properties have valid score composition")
        print()

    def check_geographic_data(self) -> None:
        """Verify active properties have coordinates.

        A coordinate counts as missing when it is absent, null, an empty
        string or NaN. A value of 0 is a real coordinate (equator / prime
        meridian) and is accepted. Active properties lacking 'lat' or 'lon'
        are reported as one auto-fixable error carrying the first five URLs.
        """
        print("✓ Geographic data check:")

        def has_coordinate(value) -> bool:
            """Return True unless the value is absent, blank or NaN."""
            if value is None or value == '':
                return False
            return not (isinstance(value, float) and math.isnan(value))

        active_count = 0
        active_without_coords = []
        for prop in self.enriched_data:
            if prop.get('status') != 'Active':
                continue
            # Counted in the same pass so the success message needs no rescan.
            active_count += 1
            if not (has_coordinate(prop.get('lat'))
                    and has_coordinate(prop.get('lon'))):
                # .get(): a record without 'url' is still reported (as None).
                active_without_coords.append(prop.get('url'))

        if active_without_coords:
            self.issues.append(IntegrityIssue(
                severity='error',
                category='missing_data',
                message=f'{len(active_without_coords)} active properties missing GPS coordinates',
                details={'urls': active_without_coords[:5]},
                auto_fixable=True
            ))
            print(f"  ✗ {len(active_without_coords)} active properties without coordinates")
        else:
            print(f"  ✓ All {active_count} active properties have coordinates")
        print()

    def check_url_consistency(self) -> None:
        """Verify URL uniqueness and consistency.

        Records without a 'url' key are skipped. Any URL that occurs more than
        once is reported in a single error whose details list each duplicated
        URL once, in order of first appearance.
        """
        print("✓ URL consistency check:")

        urls = [p['url'] for p in self.enriched_data if 'url' in p]
        # One linear tally instead of calling list.count() per element
        # (quadratic). Counter preserves first-seen order, so the report is
        # deterministic, unlike list(set(...)).
        duplicates = [url for url, seen in Counter(urls).items() if seen > 1]

        if duplicates:
            self.issues.append(IntegrityIssue(
                severity='error',
                category='data_integrity',
                message=f'{len(duplicates)} duplicate URLs found',
                details={'duplicate_urls': duplicates}
            ))
            print(f"  ✗ Found {len(duplicates)} duplicate URLs")
        else:
            print(f"  ✓ All {len(urls)} URLs are unique")
        print()

    def check_status_values(self) -> None:
        """Verify status field only contains valid values.

        The accepted values are 'Active' and 'Removed'. Every property with
        any other status (including a missing one) is counted, and the first
        five are attached to a single error.
        """
        print("✓ Status values check:")

        valid_statuses = {'Active', 'Removed'}
        invalid_status = []

        for prop in self.enriched_data:
            status = prop.get('status')
            # The isinstance guard keeps an unhashable status (list/dict from
            # corrupt JSON) from raising TypeError in the set lookup.
            if not isinstance(status, str) or status not in valid_statuses:
                invalid_status.append({
                    # .get(): a record without 'url' is still reported.
                    'url': prop.get('url'),
                    'status': status
                })

        if invalid_status:
            self.issues.append(IntegrityIssue(
                severity='error',
                category='data_validation',
                message=f'{len(invalid_status)} properties with invalid status',
                details={'invalid': invalid_status[:5]}
            ))
            print(f"  ✗ {len(invalid_status)} properties with invalid status")
        else:
            print("  ✓ All properties have valid status values")
        print()

    def check_score_ranges(self) -> None:
        """Verify all scores are in valid range (0-5).

        Checks gpt_score, custom_score and overall_score on every property.
        A missing or null score is skipped. A non-numeric or NaN value, or a
        number outside 0-5, is counted; the first five offenders are attached
        to a single error.
        """
        print("✓ Score range check:")

        out_of_range = []
        for prop in self.enriched_data:
            for score_field in ('gpt_score', 'custom_score', 'overall_score'):
                score = prop.get(score_field)
                if score is None:
                    # Missing key or JSON null: no score recorded, so there
                    # is nothing to range-check.
                    continue
                is_number = isinstance(score, (int, float))
                # `not (0 <= score <= 5)` rather than `< 0 or > 5` so that NaN,
                # which fails every comparison, is flagged too.
                if not is_number or not (0 <= score <= 5):
                    out_of_range.append({
                        # .get(): a record without 'url' is still reported.
                        'url': prop.get('url'),
                        'field': score_field,
                        'value': score
                    })

        if out_of_range:
            self.issues.append(IntegrityIssue(
                severity='error',
                category='data_validation',
                message=f'{len(out_of_range)} scores outside valid range (0-5)',
                details={'invalid': out_of_range[:5]}
            ))
            print(f"  ✗ {len(out_of_range)} scores out of range")
        else:
            print("  ✓ All scores within valid range (0-5)")
        print()

    def check_overall_score_calculation(self, *, gpt_weight: float = 0.6,
                                        custom_weight: float = 0.4,
                                        tolerance: float = 0.01) -> None:
        """Verify overall_score is calculated correctly.

        The expected value is ``gpt_score * gpt_weight + custom_score *
        custom_weight``. Properties whose overall_score differs from it by
        more than ``tolerance`` are reported in one auto-fixable warning
        carrying the first five offenders.

        Args:
            gpt_weight: Weight applied to gpt_score.
            custom_weight: Weight applied to custom_score.
            tolerance: Largest accepted absolute difference; allows small
                rounding differences.
        """
        print("✓ Overall score calculation check:")

        incorrect = []
        for prop in self.enriched_data:
            # A missing key and an explicit JSON null both mean "no score";
            # `or 0` keeps a null from raising TypeError in the arithmetic.
            gpt = prop.get('gpt_score') or 0
            custom = prop.get('custom_score') or 0
            overall = prop.get('overall_score') or 0

            expected = (gpt * gpt_weight) + (custom * custom_weight)
            if abs(overall - expected) > tolerance:
                incorrect.append({
                    # .get(): a record without 'url' is still reported.
                    'url': prop.get('url'),
                    'gpt_score': gpt,
                    'custom_score': custom,
                    'overall_score': overall,
                    'expected': round(expected, 2)
                })

        if incorrect:
            self.issues.append(IntegrityIssue(
                severity='warning',
                category='calculation_error',
                message=f'{len(incorrect)} properties with incorrect overall_score calculation',
                details={'incorrect': incorrect[:5]},
                auto_fixable=True
            ))
            print(f"  ⚠ {len(incorrect)} properties with miscalculated scores")
        else:
            print("  ✓ All overall_scores calculated correctly")
        print()

    def print_report(self):
        """Print summary report"""
        errors = [i for i in self.issues if i.severity == 'error']
        warnings = [i for i in self.issues if i.severity == 'warning']
        info = [i for i in self.issues if i.severity == 'info']

        print()
        print("="*60)
        print("INTEGRITY CHECK RESULTS")
        print("="*60)
        print(f"Total properties checked: {len(self.enriched_data)}")
        print(f"Errors: {len(errors)}")
        print(f"Warnings: {len(warnings)}")
        print(f"Info: {len(info)}")
        print()

        if errors:
            print("🔴 ERRORS (must fix):")
            for issue in errors:
                print(f"  • [{issue.category}] {issue.message}")
                if issue.auto_fixable:
                    print("    (Auto-fixable)")
            print()

        if warnings:
            print("⚠️  WARNINGS (should fix):")
            for issue in warnings:
                print(f"  • [{issue.category}] {issue.message}")
            print()

        if info:
            print("ℹ️  INFO:")
            for issue in info:
                print(f"  • [{issue.category}] {issue.message}")
            print()

        if not errors and not warnings:
            print("✅ No issues found - data integrity is good!")
        else:
            print("⚠️  Issues detected - see details above")

        print("="*60)

    def export_report(self, filename='integrity_report.json'):
        """Export detailed report to JSON"""
        report = {
            'timestamp': datetime.now().isoformat(),
            'total_properties': len(self.enriched_data),
            'issues': [{
                'severity': i.severity,
                'category': i.category,
                'message': i.message,
                'details': i.details,
                'auto_fixable': i.auto_fixable
            } for i in self.issues]
        }

        with open(filename, 'w') as f:
            json.dump(report, f, indent=2)

        print(f"\n📄 Detailed report saved to: {filename}")


def main():
    """Run all checks, export the JSON report, and exit with a status code.

    The process exits with 0 when no error-severity issue was recorded and
    with 1 otherwise; warnings alone do not cause a non-zero exit.
    """
    import sys

    checker = DataIntegrityChecker()
    passed = checker.run_all_checks()
    checker.export_report()

    exit_code = 0 if passed else 1
    sys.exit(exit_code)


# Script entry point: run the checks only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
