import pandas as pd
from pathlib import Path


DATA_DIR = Path(__file__).parent.parent / "data"
MHC_ALLELES_FILE = 'mhc_alleles.tsv'


def remove_tool_group_col():
    '''
    Strip the 'Tool Group' column from mhc_alleles.tsv.

    Rows whose 'Tool Group' value is 'pvc' are discarded first; the remaining
    rows are then written back to the same file without that column.
    '''
    tsv_path = DATA_DIR / MHC_ALLELES_FILE
    alleles = pd.read_csv(tsv_path, skipinitialspace=True, sep='\t')

    # Rows tagged 'pvc' go away entirely; every other row only loses the column.
    is_pvc = alleles['Tool Group'] == 'pvc'
    trimmed = alleles.drop(index=alleles[is_pvc].index).drop(columns='Tool Group')

    trimmed.to_csv(tsv_path, sep='\t', index=False)

def populate_tool_group():
    '''
    Fill in missing 'Tool Group' values in mhc_alleles.tsv from 'Parent'.

    For every row whose 'Tool Group' is missing or empty, the lower-cased
    'Parent' text is inspected:

    * contains 'mhc class ii' -> 'Tool Group' becomes 'mhcii'
    * otherwise contains 'mhc class i' -> 'Tool Group' becomes 'mhci'

    Rows that already have a 'Tool Group', rows with no 'Parent' value, and
    rows whose 'Parent' matches neither label are left untouched. The
    resulting table is printed and written back to the same file.
    '''
    mhc_df = pd.read_csv(DATA_DIR / MHC_ALLELES_FILE, skipinitialspace=True, sep='\t')

    # An all-empty column is read as float64; cast to object so that string
    # labels can be written into it without a dtype clash.
    mhc_df['Tool Group'] = mhc_df['Tool Group'].astype(object)

    for idx in mhc_df.index:
        current = mhc_df.at[idx, 'Tool Group']
        if not (pd.isna(current) or current == ''):
            continue  # already populated

        parent = mhc_df.at[idx, 'Parent']
        if not isinstance(parent, str):
            # Empty 'Parent' cells are read as NaN (a float); there is
            # nothing to infer a tool group from.
            continue

        parent = parent.lower()
        # 'mhc class i' is a substring of 'mhc class ii', so the longer label
        # has to be tested first.
        if 'mhc class ii' in parent:
            mhc_df.at[idx, 'Tool Group'] = 'mhcii'
        elif 'mhc class i' in parent:
            mhc_df.at[idx, 'Tool Group'] = 'mhci'

    print(mhc_df)
    mhc_df.to_csv(DATA_DIR / MHC_ALLELES_FILE, sep='\t', index=False)


def remove_duplicates():
    '''
    Rewrite mhc_alleles.tsv so that each distinct row appears only once.

    Of several identical rows, the first occurrence is the one kept.
    '''
    tsv_path = DATA_DIR / MHC_ALLELES_FILE
    alleles = pd.read_csv(tsv_path, skipinitialspace=True, sep='\t')

    # Mask out every repeat of an earlier row, then overwrite the file.
    repeated = alleles.duplicated()
    unique_rows = alleles[~repeated]
    unique_rows.to_csv(tsv_path, sep='\t', index=False)

if __name__ == '__main__':
    # One-off maintenance steps for mhc_alleles.tsv. Only the uncommented
    # call runs; swap the comment markers to run a different step.
    # remove_duplicates()
    # populate_tool_group()
    remove_tool_group_col()