# The code for the following logic should be implemented here.
# * It should read all the result files created (under 'preprocess_job/results/') by each job unit.
# * Every tool will differ, but logic to combine all the results into a single file is needed.
#     * This file should be saved under 'postprocess_job/aggregated_result.json'.
import json
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
from pprint import pprint
from enum import Enum
from typing import Dict, Any
import core.set_pythonpath  # This automatically configures PYTHONPATH
import utils
import shutil
from validators import InputManager
from typing import List, Dict, Any, Union
from pathlib import Path
from run_phbr import run_prediction
from validators import MHCClass



def _json_default(o):
    """Serialize NumPy/pandas types to standard JSON-compatible Python types."""
    if isinstance(o, np.generic):
        return o.item()
    if isinstance(o, np.ndarray):
        return o.tolist()
    if isinstance(o, pd.Series):
        return o.tolist()
    # Fallback for other objects that expose .tolist()
    tolist = getattr(o, "tolist", None)
    if callable(tolist):
        return tolist()
    return str(o)


def reformat_phbr_output_to_dict(df: pd.DataFrame) -> Dict[str, Any]:
    """
    Build the output dictionary for a PHBR result table.

    Columns are renamed to prefixed names:
    * "seq #"          -> "core.sequence_number"
    * "mutant peptide" -> "core.mutant_peptide"
    * "ref_peptide"    -> "core.reference_peptide" (only if present)
    * "PHBR-I"         -> "phbr.PHBR-I" (only if present)
    * "PHBR-II"        -> "phbr.PHBR-II" (only if present)

    Returns a dict with an empty "warnings" list and a single entry in
    "results" holding the column names, the row data, the unique peptide
    values and the min/max of the numeric columns.

    Raises KeyError if the mutant peptide or sequence number column is absent.
    """
    # rename() ignores keys that are not present, so optional columns
    # can share the same mapping as the mandatory ones.
    table = df.rename(columns={
        'seq #': 'core.sequence_number',
        'mutant peptide': 'core.mutant_peptide',
        'PHBR-I': 'phbr.PHBR-I',
        'PHBR-II': 'phbr.PHBR-II',
        'ref_peptide': 'core.reference_peptide',
    })
    present = table.columns

    def _min_max(column_name):
        # Smallest and largest value found in one column.
        column = table[column_name]
        return {"min": column.min(), "max": column.max()}

    # Distinct peptides: the mutant peptide is mandatory, the reference is optional.
    unique_vals = {
        "core.mutant_peptide": table['core.mutant_peptide'].unique().tolist()
    }
    if 'core.reference_peptide' in present:
        unique_vals["core.reference_peptide"] = table['core.reference_peptide'].unique().tolist()

    # Value ranges: the sequence number is mandatory, the PHBR scores are optional.
    field_ranges = {"core.sequence_number": _min_max('core.sequence_number')}
    for score_column in ('phbr.PHBR-I', 'phbr.PHBR-II'):
        if score_column in present:
            field_ranges[score_column] = _min_max(score_column)

    # The table is tagged through the "type" key (no "result_type" key is emitted).
    peptide_table = {
        "table_columns": present.tolist(),
        "table_data": table.values.tolist(),
        "unique_vals": unique_vals,
        "field_ranges": field_ranges,
        "type": "peptide_table",
    }
    return {"warnings": [], "results": [peptide_table]}


def combine_phbr_json_result_files_to_df(mhci_output: Union[str, Path], mhcii_output: Union[str, Path]) -> pd.DataFrame:
    """
    Merge the MHC-I and MHC-II PHBR result tables into a single dataframe.

    Rows are matched on 'seq #' and 'mutant peptide' (plus 'ref_peptide' when
    both tables carry that column) with an outer join, so a peptide present in
    only one table is kept with missing values for the other table's columns.

    Args:
        mhci_output: Path to the MHC-I result file. A name ending in '.json'
            is read with phbr_json_result_to_df; anything else is read as TSV.
        mhcii_output: Path to the MHC-II result file, same rules as above.

    Returns:
        The merged dataframe. A 'PHBR' column is renamed to 'PHBR-I' (MHC-I
        table) or 'PHBR-II' (MHC-II table); any other non-key column that
        exists in both tables gets a '-II' suffix on the MHC-II side.
    """
    def _load_table(path):
        # Path objects are accepted as well, hence the str() before endswith().
        if str(path).endswith('.json'):
            return phbr_json_result_to_df(path)
        return pd.read_csv(path, sep='\t')

    mhci_df = _load_table(mhci_output)
    mhcii_df = _load_table(mhcii_output)

    # Rename PHBR columns to distinguish between MHCI and MHCII (only if they exist)
    if 'PHBR' in mhci_df.columns:
        mhci_df = mhci_df.rename(columns={'PHBR': 'PHBR-I'})
    if 'PHBR' in mhcii_df.columns:
        mhcii_df = mhcii_df.rename(columns={'PHBR': 'PHBR-II'})

    merge_on_columns = ['seq #', 'mutant peptide']
    # Only use 'ref_peptide' as a key when BOTH tables have it; merging on a
    # column that is missing from one side raises KeyError.
    if 'ref_peptide' in mhci_df.columns and 'ref_peptide' in mhcii_df.columns:
        merge_on_columns.append('ref_peptide')

    combined_df = pd.merge(
        mhci_df,
        mhcii_df,
        on=merge_on_columns,
        how='outer',
        suffixes=('', '-II')  # Only add suffix to MHCII columns that might conflict
    )

    print('combined_df: \n', combined_df)

    return combined_df

def phbr_json_result_to_df(phbr_result_json_file: str) -> pd.DataFrame:
    """
    Load a PHBR JSON result file and return its first result table as a dataframe.

    Only the first entry of the top-level "results" list is used: its
    "table_data" supplies the rows and its "table_columns" the header.
    """
    with open(phbr_result_json_file, 'r') as handle:
        payload = json.load(handle)

    first_table = payload['results'][0]
    rows = first_table['table_data']
    header = first_table['table_columns']
    return pd.DataFrame(rows, columns=header)

def _load_job_description(job_desc_file):
    """Parse the job description given as a path or an open file object ([] when None)."""
    if job_desc_file is None:
        return []
    if hasattr(job_desc_file, 'read'):
        # Already an open file object.
        return json.load(job_desc_file)
    with open(job_desc_file, 'r', encoding='utf-8') as f:
        return json.load(f)


def _append_mhc_peptide_tables(output_dict, jd_content, script_name, table_type, label):
    """Append the peptide tables of every aggregate job running `script_name` to output_dict.

    Returns True when at least one matching aggregate job is listed in jd_content.
    """
    aggregate_jobs = [
        job for job in jd_content
        if job.get('job_type') == 'aggregate' and script_name in (job.get('shell_cmd') or '')
    ]
    for job_data in aggregate_jobs:
        result_path = job_data['expected_outputs'][0]
        if not os.path.exists(result_path):
            print(f"{label} output file not found: {result_path}")
            continue
        # Use a separate name for the handle so the path stays available for the log line.
        with open(result_path, 'r', encoding='utf-8') as result_file:
            mhc_result = json.load(result_file)
        for result in mhc_result['results']:
            if result.get('result_type') == 'peptide_table':
                tagged = result.copy()  # Shallow copy to avoid modifying the loaded entry
                tagged['type'] = table_type  # Keep both result_type and type for consistency
                output_dict['results'].append(tagged)
        print(f"Included {label} binding results from: {result_path}")
    return bool(aggregate_jobs)


def run(**kwargs):
    '''
    Format the PHBR result(s) found in a directory and write them as one JSON file.

    Keyword arguments read:
        job_desc_file: path to (or open file object of) the job description
            JSON. Its entries are only used when include_mhci_mhcii_result
            is set.
        include_mhci_mhcii_result: when true, the peptide tables of the
            MHC-I / MHC-II aggregate jobs listed in the job description are
            appended to the output (Default=False).
        postprocess_result_dir: directory scanned for '*.json' PHBR result
            files. One file is used as is; with several files, the first two
            in sorted name order are merged as MHC-I and MHC-II.
        output_prefix: output path without extension
            (Default='formatted_phbr_result').
        output_format: extension appended to output_prefix (Default=json).
            The content written is always JSON.

    Raises:
        ValueError: postprocess_result_dir was not given.
        FileNotFoundError: postprocess_result_dir contains no '.json' file.
    '''
    job_desc_file = kwargs.get('job_desc_file')
    include_mhci_mhcii_result = kwargs.get('include_mhci_mhcii_result', False)
    postprocess_result_dir = kwargs.get('postprocess_result_dir')
    # `or` also covers keys that are present but set to None.
    output_prefix = kwargs.get('output_prefix') or 'formatted_phbr_result'
    output_format = kwargs.get('output_format') or 'json'
    output_file_name = f'{output_prefix}.{output_format}'

    if postprocess_result_dir is None:
        raise ValueError("postprocess_result_dir is required")

    # Read the job description file
    jd_content = _load_job_description(job_desc_file)

    print('postprocess_result_dir: ', postprocess_result_dir)
    # os.listdir returns entries in arbitrary order; sort so that the
    # MHC-I / MHC-II assignment below is deterministic.
    json_files = sorted(f for f in os.listdir(postprocess_result_dir) if f.endswith('.json'))

    print('json_files: ', json_files)

    if not json_files:
        raise FileNotFoundError(f"No JSON result files found in: {postprocess_result_dir}")

    result_dir = Path(postprocess_result_dir)
    if len(json_files) == 1:
        phbr_output_df = phbr_json_result_to_df(result_dir / json_files[0])
    else:
        # Several job results: merge the first two into a single table.
        # NOTE(review): the first file (sorted by name) is treated as MHC-I and
        # the second as MHC-II - confirm this matches the result file naming.
        print(json_files)

        phbr_mhci_output = result_dir / json_files[0]
        phbr_mhcii_output = result_dir / json_files[1]

        print('phbr_mhci_output: ', phbr_mhci_output)
        print('phbr_mhcii_output: ', phbr_mhcii_output)

        # Combine the results
        phbr_output_df = combine_phbr_json_result_files_to_df(phbr_mhci_output, phbr_mhcii_output)

    # Need to add other keys to the output dict to correctly format the output
    # to match other tools' output format.
    output_dict = reformat_phbr_output_to_dict(phbr_output_df)

    if include_mhci_mhcii_result:
        # Find and include MHC-I / MHC-II results if available
        has_mhci_jobs = _append_mhc_peptide_tables(
            output_dict, jd_content, 'tcell_mhci.py', 'tc1_peptide_table', 'MHC-I')
        has_mhcii_jobs = _append_mhc_peptide_tables(
            output_dict, jd_content, 'tcell_mhcii.py', 'tc2_peptide_table', 'MHC-II')

        # Report which aggregate jobs were listed in the job description
        if has_mhci_jobs and has_mhcii_jobs:
            print("Included both MHC-I and MHC-II binding results")
        elif has_mhci_jobs:
            print("Included MHC-I binding results only")
        elif has_mhcii_jobs:
            print("Included MHC-II binding results only")
        else:
            print("No MHC aggregate jobs found - only PHBR results will be included")

    # The output path is built from output_prefix only; it is not joined with
    # postprocess_result_dir. UTF-8 is explicit because ensure_ascii=False may
    # emit non-ASCII characters.
    with open(output_file_name, 'w', encoding='utf-8') as f:
        json.dump(output_dict, f, indent=2, default=_json_default, ensure_ascii=False)
    print(f"Final post-processed result saved to: {output_file_name}")
