#!/usr/bin/env python3
"""
Data Validator for Paradisomatch

Validates data quality and catches issues early:
- Required fields present
- Coordinates in valid range (Europe)
- Scores in valid range (0-5)
- Status is valid enum
- No duplicate URLs
"""

from typing import List, Dict, Tuple
from db_manager import DatabaseManager

# Rectangular latitude/longitude bounds (decimal degrees) used by the
# "coordinates in Europe" check.
EUROPE_BBOX = dict(
    lat_min=35.0,
    lat_max=72.0,
    lon_min=-10.0,
    lon_max=40.0,
)

# Status values accepted by the status check.
VALID_STATUSES = [
    'Active',
    'Removed',
    'Pending',
    'Sold',
]

# Risk profile values accepted by the risk-profile check. The strings are
# Dutch for low / medium / high; None is listed as an allowed value too.
VALID_RISK_PROFILES = [
    'Laag',
    'Gemiddeld',
    'Hoog',
    None,
]


class DataValidator:
    """Validate Paradisomatch data quality.

    Every ``validate_*`` method receives a list of property dicts, records
    each problem it finds through :meth:`add_issue`, and returns how many
    problems it recorded. :meth:`validate_all` loads the properties from the
    database and runs every check; :meth:`print_issues` and
    :meth:`get_report` present the collected results.
    """

    # (dict key, severity, message) for each field checked by
    # validate_required_fields, in reporting order.
    _REQUIRED_FIELDS = (
        ('url', 'ERROR', 'Missing URL'),
        ('title', 'WARNING', 'Missing title'),
        ('status', 'WARNING', 'Missing status'),
    )

    # (dict key, label used in messages) for each score checked by
    # validate_scores, in reporting order.
    _SCORE_FIELDS = (
        ('gpt_score', 'GPT'),
        ('custom_score', 'Custom'),
        ('final_score', 'Final'),
    )

    # Severity levels in the order they are printed.
    _SEVERITIES = ('ERROR', 'WARNING', 'INFO')

    def __init__(self, db_path='paradisomatch.db'):
        """Open the database at *db_path* and start with no recorded issues."""
        self.db = DatabaseManager(db_path)
        self.issues = []

    def add_issue(self, severity: str, category: str, message: str, property_url: str = None):
        """Record one validation issue.

        Args:
            severity: 'ERROR', 'WARNING' or 'INFO'.
            category: Short machine-readable issue type, e.g. 'missing_field'.
            message: Human-readable description.
            property_url: URL of the affected property, if known.
        """
        self.issues.append({
            'severity': severity,
            'category': category,
            'message': message,
            'url': property_url,
        })

    def _count_severity(self, severity: str) -> int:
        """Return how many recorded issues have the given severity."""
        return sum(1 for issue in self.issues if issue['severity'] == severity)

    def validate_required_fields(self, properties: List[Dict]) -> int:
        """Report properties whose url, title or status is missing or empty.

        A missing URL is an ERROR; a missing title or status is a WARNING.

        Returns:
            Number of issues recorded.
        """
        issues = 0

        for prop in properties:
            url = prop.get('url', 'UNKNOWN')

            for field, severity, message in self._REQUIRED_FIELDS:
                if not prop.get(field):
                    self.add_issue(severity, 'missing_field', message, url)
                    issues += 1

        return issues

    def validate_coordinates(self, properties: List[Dict]) -> int:
        """Report coordinates that are invalid or outside the Europe bounding box.

        Only properties that have both 'lat' and 'lon' set are checked.
        Values that cannot be compared with numbers are reported as an
        'invalid_coordinate' error instead of raising.

        Returns:
            Number of issues recorded.
        """
        issues = 0

        for prop in properties:
            lat = prop.get('lat')
            lon = prop.get('lon')
            if lat is None or lon is None:
                continue

            url = prop.get('url', 'UNKNOWN')

            try:
                lat_ok = -90 <= lat <= 90
                lon_ok = -180 <= lon <= 180
                in_europe = (
                    EUROPE_BBOX['lat_min'] <= lat <= EUROPE_BBOX['lat_max']
                    and EUROPE_BBOX['lon_min'] <= lon <= EUROPE_BBOX['lon_max']
                )
            except TypeError:
                # Non-numeric data (e.g. a string) is itself a data-quality
                # problem; report it rather than aborting the whole run.
                self.add_issue('ERROR', 'invalid_coordinate',
                               f'Non-numeric coordinates: ({lat!r}, {lon!r})', url)
                issues += 1
                continue

            if not lat_ok:
                self.add_issue('ERROR', 'invalid_coordinate',
                               f'Latitude out of range: {lat}', url)
                issues += 1

            if not lon_ok:
                self.add_issue('ERROR', 'invalid_coordinate',
                               f'Longitude out of range: {lon}', url)
                issues += 1

            if not in_europe:
                self.add_issue('ERROR', 'outside_europe',
                               f'Coordinates outside Europe: ({lat:.6f}, {lon:.6f})', url)
                issues += 1

        return issues

    def validate_scores(self, properties: List[Dict]) -> int:
        """Report GPT, custom and final scores outside the 0-5 range.

        Scores that are None are skipped. Values that cannot be compared
        with numbers are reported as an 'invalid_score' error instead of
        raising.

        Returns:
            Number of issues recorded.
        """
        issues = 0

        for prop in properties:
            url = prop.get('url', 'UNKNOWN')

            for field, label in self._SCORE_FIELDS:
                value = prop.get(field)
                if value is None:
                    continue

                try:
                    in_range = 0 <= value <= 5
                except TypeError:
                    # Same reasoning as for coordinates: bad types are
                    # reported, not allowed to crash the validator.
                    self.add_issue('ERROR', 'invalid_score',
                                   f'{label} score is not a number: {value!r}', url)
                    issues += 1
                    continue

                if not in_range:
                    self.add_issue('ERROR', 'invalid_score',
                                   f'{label} score out of range (0-5): {value}', url)
                    issues += 1

        return issues

    def validate_status(self, properties: List[Dict]) -> int:
        """Report non-empty status values that are not in VALID_STATUSES.

        Returns:
            Number of issues recorded.
        """
        issues = 0

        for prop in properties:
            status = prop.get('status')

            # Empty/missing statuses are handled by validate_required_fields.
            if status and status not in VALID_STATUSES:
                self.add_issue('WARNING', 'invalid_status',
                               f'Invalid status: {status}', prop.get('url', 'UNKNOWN'))
                issues += 1

        return issues

    def validate_risk_profile(self, properties: List[Dict]) -> int:
        """Report non-empty risk profiles that are not in VALID_RISK_PROFILES.

        Returns:
            Number of issues recorded.
        """
        issues = 0

        for prop in properties:
            risk = prop.get('risk_profile')

            if risk and risk not in VALID_RISK_PROFILES:
                self.add_issue('WARNING', 'invalid_risk',
                               f'Invalid risk profile: {risk}', prop.get('url', 'UNKNOWN'))
                issues += 1

        return issues

    def validate_duplicates(self, properties: List[Dict]) -> int:
        """Report every repeated occurrence of a URL.

        The first occurrence of a URL is accepted; each later occurrence is
        recorded as one 'duplicate_url' error. Properties without a URL are
        ignored here.

        Returns:
            Number of issues recorded.
        """
        issues = 0
        seen_urls = set()

        for prop in properties:
            url = prop.get('url')
            if not url:
                continue

            if url in seen_urls:
                self.add_issue('ERROR', 'duplicate_url', 'Duplicate URL found', url)
                issues += 1
            else:
                seen_urls.add(url)

        return issues

    def validate_location_names(self, properties: List[Dict]) -> int:
        """Report properties that have coordinates but no location name.

        A location that is empty or equal to 'Unknown' counts as missing.

        Returns:
            Number of issues recorded.
        """
        issues = 0

        for prop in properties:
            lat = prop.get('lat')
            lon = prop.get('lon')

            # Compare against None rather than relying on truthiness: 0.0 is
            # a legitimate coordinate (longitude 0 lies inside the Europe
            # bounding box), and validate_coordinates uses the same test.
            if lat is None or lon is None:
                continue

            location = prop.get('location')
            if not location or location == 'Unknown':
                self.add_issue('INFO', 'missing_location_name',
                               'Property has coordinates but no location name',
                               prop.get('url', 'UNKNOWN'))
                issues += 1

        return issues

    def validate_all(self) -> Tuple[int, int, int]:
        """Load all properties, run every check and print a per-check summary.

        Any previously recorded issues are discarded first.

        Returns:
            Tuple ``(errors, warnings, infos)`` with the number of recorded
            issues per severity.
        """
        self.issues = []

        print("="*70)
        print("DATA VALIDATION")
        print("="*70)

        # NOTE(review): status=None presumably disables status filtering so
        # that every property is returned - confirm against DatabaseManager.
        properties = self.db.get_all_properties(status=None)
        print(f"\n📊 Validating {len(properties)} properties...\n")

        validations = [
            ("Required fields", self.validate_required_fields),
            ("Coordinates", self.validate_coordinates),
            ("Scores", self.validate_scores),
            ("Status values", self.validate_status),
            ("Risk profiles", self.validate_risk_profile),
            ("Duplicate URLs", self.validate_duplicates),
            ("Location names", self.validate_location_names),
        ]

        for name, validation_func in validations:
            count = validation_func(properties)
            if count > 0:
                print(f"  ⚠️  {name}: {count} issues")
            else:
                print(f"  ✅ {name}: OK")

        return (
            self._count_severity('ERROR'),
            self._count_severity('WARNING'),
            self._count_severity('INFO'),
        )

    def print_issues(self, limit=20):
        """Print the recorded issues grouped by severity.

        Args:
            limit: Maximum number of issues to print per severity level.
                ``None`` prints every issue.
        """
        if not self.issues:
            print("\n✅ No validation issues found!")
            return

        print(f"\n{'='*70}")
        print("VALIDATION ISSUES")
        print("="*70)

        for severity in self._SEVERITIES:
            severity_issues = [i for i in self.issues if i['severity'] == severity]
            if not severity_issues:
                continue

            print(f"\n{severity} ({len(severity_issues)}):")
            print("-"*70)

            # limit=None means "no limit"; it must never reach a numeric
            # comparison, which would raise TypeError.
            shown = severity_issues if limit is None else severity_issues[:limit]

            for i, issue in enumerate(shown, 1):
                url_short = issue['url'][-20:] if issue['url'] else 'N/A'
                print(f"{i}. [{issue['category']}] {issue['message']}")
                print(f"   URL: ...{url_short}")

            hidden = len(severity_issues) - len(shown)
            if hidden > 0:
                print(f"   ... and {hidden} more")

    def get_report(self) -> Dict:
        """Summarize the recorded issues.

        Returns:
            Dict with 'total_issues', 'errors', 'warnings', 'infos' and
            'passed' (True when there are no ERROR-level issues).
        """
        errors = self._count_severity('ERROR')

        return {
            'total_issues': len(self.issues),
            'errors': errors,
            'warnings': self._count_severity('WARNING'),
            'infos': self._count_severity('INFO'),
            'passed': errors == 0,
        }


def main():
    """Run the validator from the command line and print the results.

    Command-line flags:
        --fix-auto: accepted, but no fixing logic exists in this module; a
            notice is printed and validation runs as usual.
        --show-all: print every issue instead of the first 20 per severity.

    Returns:
        Process exit status: 0 when no ERROR-level issues were found,
        1 otherwise.
    """
    import argparse
    import sys

    parser = argparse.ArgumentParser(description='Paradisomatch Data Validator')
    parser.add_argument('--fix-auto', action='store_true',
                       help='Automatically fix common issues')
    parser.add_argument('--show-all', action='store_true',
                       help='Show all issues (not just first 20)')

    args = parser.parse_args()

    if args.fix_auto:
        # Tell the user instead of silently ignoring the flag.
        print("WARNING: --fix-auto is not implemented; running validation only.",
              file=sys.stderr)

    validator = DataValidator()
    errors, warnings, infos = validator.validate_all()

    # print_issues compares its limit against issue counts, so "no limit" is
    # expressed as a very large integer rather than None.
    limit = sys.maxsize if args.show_all else 20
    validator.print_issues(limit=limit)

    # Summary
    print("\n" + "="*70)
    print("SUMMARY")
    print("="*70)
    print(f"❌ Errors:   {errors}")
    print(f"⚠️  Warnings: {warnings}")
    print(f"ℹ️  Info:     {infos}")

    # Only ERROR-level issues fail the run; warnings and infos do not.
    if errors:
        print("\n❌ DATA VALIDATION FAILED")
        return 1

    print("\n✅ DATA VALIDATION PASSED")
    return 0


if __name__ == '__main__':
    # Raise SystemExit directly: the bare exit() helper is added by the site
    # module for interactive use and is not guaranteed to exist in scripts.
    raise SystemExit(main())
