#!/usr/bin/env python3
"""
UDISE+ Data Processor
Process downloaded student data and generate useful reports
"""

import os
import pandas as pd
from datetime import datetime
import sys

# Import helper functions (will be available when script runs)
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

DEFAULT_DOWNLOAD_DIR = os.path.expanduser("~/UDISE_Data")

def load_student_data(filename=None):
    """Load student data from Excel/CSV"""
    if filename is None:
        # Find latest file
        files = [f for f in os.listdir(DEFAULT_DOWNLOAD_DIR) 
                if f.endswith(('.xlsx', '.csv')) and 'student' in f.lower()]
        if not files:
            print("❌ No student data files found!")
            return None
        # Get most recent
        filename = max([os.path.join(DEFAULT_DOWNLOAD_DIR, f) for f in files], 
                      key=os.path.getmtime)
    
    print(f"📂 Loading: {filename}")
    
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)
    else:
        df = pd.read_excel(filename)
    
    print(f"✓ Loaded {len(df)} records")
    return df


def clean_student_data(df):
    """Clean and standardize student data"""
    print("🧹 Cleaning data...")
    
    # Make column names consistent
    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
    
    # Remove completely empty rows
    df = df.dropna(how='all')
    
    # Clean phone numbers (if exists)
    if 'mobile_no' in df.columns:
        df['mobile_no'] = df['mobile_no'].astype(str).str.replace(r'\D', '', regex=True)
    
    # Standardize gender
    if 'gender' in df.columns:
        df['gender'] = df['gender'].str.upper()
        gender_map = {'M': 'Male', 'F': 'Female', 'MALE': 'Male', 'FEMALE': 'Female'}
        df['gender'] = df['gender'].map(gender_map).fillna(df['gender'])
    
    print(f"✓ Cleaned. {len(df)} records remaining")
    return df


def generate_summary_report(df):
    """Generate summary statistics"""
    print("\n" + "="*60)
    print("STUDENT DATA SUMMARY")
    print("="*60)
    
    summary = {}
    
    # Total students
    summary['Total Students'] = len(df)
    
    # Gender distribution
    if 'gender' in df.columns:
        gender_counts = df['gender'].value_counts().to_dict()
        summary['Male'] = gender_counts.get('Male', 0)
        summary['Female'] = gender_counts.get('Female', 0)
    
    # Class-wise distribution
    if 'class' in df.columns:
        class_counts = df['class'].value_counts().sort_index().to_dict()
        summary['Class Distribution'] = class_counts
    
    # Caste/Category distribution
    if 'caste' in df.columns or 'category' in df.columns:
        col = 'caste' if 'caste' in df.columns else 'category'
        cat_counts = df[col].value_counts().to_dict()
        summary['Category Distribution'] = cat_counts
    
    # Print summary
    for key, value in summary.items():
        if isinstance(value, dict):
            print(f"\n{key}:")
            for k, v in value.items():
                print(f"  {k}: {v}")
        else:
            print(f"{key}: {value}")
    
    return summary


def find_issues(df):
    """Find data quality issues"""
    print("\n" + "="*60)
    print("DATA QUALITY ISSUES")
    print("="*60)
    
    issues = []
    
    # Missing names
    missing_name = df[df.get('name', '').isna()]
    if len(missing_name) > 0:
        issues.append(f"  ❌ {len(missing_name)} records missing name")
    
    # Missing admission numbers
    missing_adm = df[df.get('admission_no', '').isna()]
    if len(missing_adm) > 0:
        issues.append(f"  ❌ {len(missing_adm)} records missing admission number")
    
    # Duplicate admission numbers
    if 'admission_no' in df.columns:
        duplicates = df[df.duplicated('admission_no', keep=False)]
        if len(duplicates) > 0:
            issues.append(f"  ⚠️  {len(duplicates)} records have duplicate admission numbers")
            print("\nDuplicate Admission Numbers:")
            print(duplicates[['admission_no', 'name']].to_string())
    
    # Invalid DOB (if exists)
    if 'dob' in df.columns:
        future_dobs = df[pd.to_datetime(df['dob'], errors='coerce') > pd.Timestamp.now()]
        if len(future_dobs) > 0:
            issues.append(f"  ⚠️  {len(future_dobs)} records have future birth dates")
    
    if issues:
        for issue in issues:
            print(issue)
    else:
        print("✓ No major issues found!")
    
    return issues


def export_formatted_excel(df, output_path=None):
    """Export cleaned data to formatted Excel"""
    if output_path is None:
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        output_path = os.path.join(DEFAULT_DOWNLOAD_DIR, 
                                   f'Student_Data_Cleaned_{timestamp}.xlsx')
    
    print(f"\n💾 Saving to: {output_path}")
    
    with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
        # Main data
        df.to_excel(writer, sheet_name='All_Students', index=False)
        
        # Class-wise sheets
        if 'class' in df.columns:
            for class_val in sorted(df['class'].unique()):
                class_df = df[df['class'] == class_val]
                safe_name = str(class_val).replace('/', '_')
                class_df.to_excel(writer, sheet_name=f'Class_{safe_name}', index=False)
        
        # Gender-wise sheets
        if 'gender' in df.columns:
            for gender in df['gender'].dropna().unique():
                gender_df = df[df['gender'] == gender]
                gender_df.to_excel(writer, sheet_name=f'Gender_{gender}', index=False)
    
    print("✓ Excel file saved successfully")
    return output_path


def main():
    print("="*60)
    print("UDISE+ DATA PROCESSOR")
    print("="*60)
    print()
    
    # Check if data exists
    if not os.path.exists(DEFAULT_DOWNLOAD_DIR):
        print(f"❌ Directory not found: {DEFAULT_DOWNLOAD_DIR}")
        print("Please run the main downloader first!")
        return
    
    # Load data
    df = load_student_data()
    if df is None or df.empty:
        print("❌ No data to process")
        return
    
    # Clean data
    df = clean_student_data(df)
    
    # Generate reports
    summary = generate_summary_report(df)
    issues = find_issues(df)
    
    # Save cleaned data
    output_file = export_formatted_excel(df)
    
    print("\n" + "="*60)
    print("PROCESS COMPLETE!")
    print("="*60)
    print(f"Output: {output_file}")
    print("\nNext steps:")
    print("  1. Open the Excel file")
    print("  2. Review 'Issues_Found' sheet for any problems")
    print("  3. Make additional edits if needed")
    print("  4. Generate reports/printouts")
    

if __name__ == "__main__":
    main()
