import sys
import re
import pandas as pd
from pathlib import Path

def add_netmhciipan_4_1_alleles():
    """Add NetMHCIIpan alleles missing from tools-mapping.tsv and rewrite it.

    The ``netmhciipan`` rows of ``tools-mapping.tsv`` are compared with the
    ``netmhciipan`` rows of ``Tools_MRO_mapping.xlsx``.  Every spreadsheet
    ``term`` that does not appear in the ``Tool Label`` column is looked up
    (as a substring) in the ``Label`` column of the ``mro_molecules.tsv`` rows
    whose ``Parent`` is "MHC class II protein complex".  Each hit becomes a
    new row that is placed in front of the existing mapping.  Finally every
    ``netmhciipan`` row gets ``Tool Version`` set to ``'4.1'`` and
    ``tools-mapping.tsv`` is overwritten in place.

    All input files are read from the parent of this script's directory.
    Intermediate results are printed to stdout.
    """
    data_dir = Path(__file__).parent.parent
    tm_file = data_dir / 'tools-mapping.tsv'
    orig_tm_file = data_dir / 'Tools_MRO_mapping.xlsx'
    mol_file = data_dir / 'mro_molecules.tsv'

    tm_df = pd.read_csv(tm_file, sep='\t', index_col=False)
    netmhciipan_tm_df = tm_df[tm_df['Tool'] == 'netmhciipan']
    # A set keeps the membership test below O(1) per spreadsheet row.
    known_labels = set(netmhciipan_tm_df['Tool Label'].to_list())
    print(netmhciipan_tm_df.head())
    print(len(netmhciipan_tm_df))  # 5616 when this was originally run

    original_tm_df = pd.read_excel(orig_tm_file, engine='openpyxl', index_col=False)
    netmhciipan_orig_tm_df = original_tm_df[original_tm_df['tool'] == 'netmhciipan']
    print(netmhciipan_orig_tm_df.head())
    print(len(netmhciipan_orig_tm_df))  # 5622 when this was originally run

    # Spreadsheet terms that the current mapping does not know about
    # (order and duplicates are preserved).
    unknown_alleles = [
        term for term in netmhciipan_orig_tm_df['term']
        if term not in known_labels
    ]
    print(unknown_alleles)

    # Expecting 6 alleles
    print(len(unknown_alleles))

    # Author's note: two alleles do not exist on the NetMHCIIpan website (DTU):
    # 'DPA1*01:08/DPB1*128:01' and 'DPA1*02:02/DPB1*123:01'.

    mol_df = pd.read_csv(mol_file, sep='\t', index_col=False)
    mhcii_mol_df = mol_df[mol_df['Parent'] == "MHC class II protein complex"]
    print(mhcii_mol_df.head())

    new_row_data = []
    for unknown_allele in unknown_alleles:
        for row in mhcii_mol_df.itertuples():
            label = row.Label
            # Empty Label cells are read as NaN (a float); skip them instead of
            # failing on the substring test.
            if not isinstance(label, str):
                continue
            if unknown_allele in label:
                # NOTE(review): the six values assume tools-mapping.tsv has
                # exactly six columns in this order -- confirm against the file.
                new_row_data.append([
                    'mhcii',
                    'netmhciipan',
                    '4.1',
                    unknown_allele,
                    # First column of mro_molecules.tsv (the MRO ID per the
                    # original author); itertuples exposes it positionally as
                    # ``_1`` when the header is not a valid identifier.
                    row._1,
                    '11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30'
                ])

    if new_row_data:
        # The rows used to be prepended one at a time, which leaves them in
        # reverse discovery order at the top of the table; reversing once and
        # concatenating once yields the same layout.
        new_rows_df = pd.DataFrame(new_row_data[::-1], columns=list(tm_df.columns))
        tm_df = pd.concat([new_rows_df, tm_df], ignore_index=True)

    tm_df.loc[tm_df['Tool'] == 'netmhciipan', 'Tool Version'] = '4.1'

    # The mapping file is overwritten in place.
    tm_df.to_csv(tm_file, sep='\t', index=False)


def comparison():
    """Print a size check of the two NetMHCIIpan mapping sources.

    Loads ``tools-mapping.tsv`` and ``Tools_MRO_mapping.xlsx`` from the parent
    of this script's directory, filters each to its ``netmhciipan`` rows and
    prints the first rows and the row count of each.  Nothing is written.
    """
    data_dir = Path(__file__).parent.parent
    tm_file = data_dir / 'tools-mapping.tsv'
    orig_tm_file = data_dir / 'Tools_MRO_mapping.xlsx'

    tm_df = pd.read_csv(tm_file, sep='\t', index_col=False)
    netmhciipan_tm_df = tm_df[tm_df['Tool'] == 'netmhciipan']
    print(netmhciipan_tm_df.head())
    print(len(netmhciipan_tm_df))  # 5616 when this was originally run

    original_tm_df = pd.read_excel(orig_tm_file, engine='openpyxl', index_col=False)
    netmhciipan_orig_tm_df = original_tm_df[original_tm_df['tool'] == 'netmhciipan']
    print(netmhciipan_orig_tm_df.head())
    print(len(netmhciipan_orig_tm_df))  # 5622 when this was originally run


def compare_allele_list():
    """Add alleles from netmhciipan-4.1/allele.list missing in tools-mapping.tsv.

    Both sides are normalised before comparison: every character that is not
    an ASCII letter, digit or whitespace is removed, and allele.list entries
    additionally lose a leading ``HLA-`` and have ``H-2`` rewritten to ``H2``.
    Entries still unmatched are searched for (as a substring) in the ``Label``
    column of the ``mro_molecules.tsv`` rows whose ``Parent`` is
    "MHC class II protein complex"; for the search, names starting with ``H2``
    get the dash back (``H2`` -> ``H2-``).  Each hit becomes a new row placed
    in front of the existing mapping.  Finally every ``netmhciipan`` row gets
    ``Tool Version`` set to ``'4.1'`` and ``tools-mapping.tsv`` is overwritten
    in place.

    All input files are read from the parent of this script's directory.
    Intermediate results are printed to stdout.
    """
    data_dir = Path(__file__).parent.parent
    tm_file = data_dir / 'tools-mapping.tsv'
    allelelist_file = data_dir / 'netmhciipan-4.1' / 'allele.list'
    mol_file = data_dir / 'mro_molecules.tsv'

    tm_df = pd.read_csv(tm_file, sep='\t', index_col=False)
    netmhciipan_tm_df = tm_df[tm_df['Tool'] == 'netmhciipan']
    # Punctuation-free labels, kept in a set so each lookup below is O(1).
    known_labels = {
        re.sub(r'[^a-zA-Z0-9\s]', '', tool_label)
        for tool_label in netmhciipan_tm_df['Tool Label'].to_list()
    }
    print(netmhciipan_tm_df.head())
    print(len(netmhciipan_tm_df))  # 5616 when this was originally run

    with open(allelelist_file, 'r') as f:
        content = [line.strip() for line in f]

    # Author's note: 6 more alleles than the mapping - probably mostly H2 alleles
    print(len(content))

    unknown_alleles = []
    for each_allele in content:
        if each_allele.startswith('HLA-'):
            each_allele = each_allele.replace('HLA-', '')
        if each_allele.startswith('H-2'):
            each_allele = each_allele.replace('H-2', 'H2')

        clean_allele = re.sub(r'[^a-zA-Z0-9\s]', '', each_allele)

        if clean_allele not in known_labels:
            unknown_alleles.append(clean_allele)

    # Observed by the author: ['H2IAk', 'H2IAq', 'H2IAs', 'H2IAu', 'H2IEd', 'H2IEk']
    print(unknown_alleles)
    print(len(unknown_alleles))

    mol_df = pd.read_csv(mol_file, sep='\t', index_col=False)
    mhcii_mol_df = mol_df[mol_df['Parent'] == "MHC class II protein complex"]
    print(mhcii_mol_df.head())

    new_row_data = []
    for unknown_allele in unknown_alleles:
        if unknown_allele.startswith('H2'):
            # Restore the dash that the normalisation above removed so the
            # name can match the molecule labels.
            unknown_allele = unknown_allele.replace('H2', 'H2-')
            print(unknown_allele)
        for row in mhcii_mol_df.itertuples():
            label = row.Label
            # Empty Label cells are read as NaN (a float); skip them instead of
            # failing on the substring test.
            if not isinstance(label, str):
                continue
            if unknown_allele in label:
                # NOTE(review): the six values assume tools-mapping.tsv has
                # exactly six columns in this order -- confirm against the file.
                new_row_data.append([
                    'mhcii',
                    'netmhciipan',
                    '4.1',
                    unknown_allele,
                    # First column of mro_molecules.tsv (the MRO ID per the
                    # original author); itertuples exposes it positionally as
                    # ``_1`` when the header is not a valid identifier.
                    row._1,
                    '11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30'
                ])

    if new_row_data:
        # The rows used to be prepended one at a time, which leaves them in
        # reverse discovery order at the top of the table; reversing once and
        # concatenating once yields the same layout.
        new_rows_df = pd.DataFrame(new_row_data[::-1], columns=list(tm_df.columns))
        tm_df = pd.concat([new_rows_df, tm_df], ignore_index=True)

    tm_df.loc[tm_df['Tool'] == 'netmhciipan', 'Tool Version'] = '4.1'

    # The mapping file is overwritten in place.
    tm_df.to_csv(tm_file, sep='\t', index=False)

def comparison2():
    """Print a size check of tools-mapping.tsv against allele.list.

    Loads ``tools-mapping.tsv`` from the parent of this script's directory,
    prints the first rows and the count of its ``netmhciipan`` rows, then
    prints the number of lines in ``netmhciipan-4.1/allele.list``.  Nothing is
    written.
    """
    data_dir = Path(__file__).parent.parent
    tm_file = data_dir / 'tools-mapping.tsv'
    allelelist_file = data_dir / 'netmhciipan-4.1' / 'allele.list'

    tm_df = pd.read_csv(tm_file, sep='\t', index_col=False)
    netmhciipan_tm_df = tm_df[tm_df['Tool'] == 'netmhciipan']
    print(netmhciipan_tm_df.head())
    print(len(netmhciipan_tm_df))  # 5616 when this was originally run

    with open(allelelist_file, 'r') as f:
        content = [line.strip() for line in f]

    # Author's note: 6 more alleles than the mapping - probably mostly H2 alleles
    print(len(content))


def compare_allelelist():
    """Report allelelist.txt entries missing from tools-mapping.tsv, then exit.

    Takes the first tab-separated field of every line in
    ``netmhciipan-4.1/allelelist.txt``, normalises it (leading ``HLA-``
    dropped, ``H-2`` rewritten to ``H2``, every character that is not an ASCII
    letter, digit or whitespace removed) and compares it with the equally
    stripped ``Tool Label`` values of the ``netmhciipan`` rows in
    ``tools-mapping.tsv``.  The unmatched names and their count are printed.

    This function does not return: it ends with ``sys.exit()``, which raises
    ``SystemExit``.  The code that used to follow that call (building new
    mapping rows and rewriting tools-mapping.tsv) could never run and has been
    removed; nothing is written to disk.
    """
    data_dir = Path(__file__).parent.parent
    tm_file = data_dir / 'tools-mapping.tsv'
    allelelist_file = data_dir / 'netmhciipan-4.1' / 'allelelist.txt'

    tm_df = pd.read_csv(tm_file, sep='\t', index_col=False)
    netmhciipan_tm_df = tm_df[tm_df['Tool'] == 'netmhciipan']
    # Punctuation-free labels, kept in a set so each lookup below is O(1).
    known_labels = {
        re.sub(r'[^a-zA-Z0-9\s]', '', tool_label)
        for tool_label in netmhciipan_tm_df['Tool Label'].to_list()
    }
    print(netmhciipan_tm_df.head())
    print(len(netmhciipan_tm_df))  # 5616 when this was originally run

    with open(allelelist_file, 'r') as f:
        # Only the first tab-separated column of each line is compared.
        content = [line.strip().split('\t')[0] for line in f]
    print(content[:10])
    print(len(content))

    unknown_alleles = []
    for each_allele in content:
        if each_allele.startswith('HLA-'):
            each_allele = each_allele.replace('HLA-', '')
        if each_allele.startswith('H-2'):
            each_allele = each_allele.replace('H-2', 'H2')

        clean_allele = re.sub(r'[^a-zA-Z0-9\s]', '', each_allele)

        if clean_allele not in known_labels:
            unknown_alleles.append(clean_allele)

    print(unknown_alleles)
    print(len(unknown_alleles))

    # Deliberate hard stop kept from the original: this comparison is
    # report-only and terminates the script.
    sys.exit()


def compare_iedb_dat():
    """Report convert_IEDB.dat entries missing from tools-mapping.tsv.

    Takes the first space-separated field of every line in
    ``netmhciipan-4.1/convert_IEDB.dat``, drops a leading ``HLA-`` and rewrites
    ``H-2`` to ``H2``.  For the comparison only, every character that is not an
    ASCII letter, digit or whitespace is removed and the result is checked
    against the equally stripped ``Tool Label`` values of the ``netmhciipan``
    rows in ``tools-mapping.tsv``.  Unmatched entries are reported with their
    punctuation intact.  Nothing is written.
    """
    data_dir = Path(__file__).parent.parent
    tm_file = data_dir / 'tools-mapping.tsv'
    iedb_dat_file = data_dir / 'netmhciipan-4.1' / 'convert_IEDB.dat'

    tm_df = pd.read_csv(tm_file, sep='\t', index_col=False)
    netmhciipan_tm_df = tm_df[tm_df['Tool'] == 'netmhciipan']
    # Punctuation-free labels, kept in a set so each lookup below is O(1).
    known_labels = {
        re.sub(r'[^a-zA-Z0-9\s]', '', tool_label)
        for tool_label in netmhciipan_tm_df['Tool Label'].to_list()
    }
    print(netmhciipan_tm_df.head())
    print(len(netmhciipan_tm_df))  # 5616 when this was originally run

    with open(iedb_dat_file, 'r') as f:
        # Only the first space-separated column of each line is compared.
        content = [line.strip().split(' ')[0] for line in f]
    print(content[:10])
    print(len(content))

    unknown_alleles = []
    for each_allele in content:
        if each_allele.startswith('HLA-'):
            each_allele = each_allele.replace('HLA-', '')
        if each_allele.startswith('H-2'):
            each_allele = each_allele.replace('H-2', 'H2')

        clean_allele = re.sub(r'[^a-zA-Z0-9\s]', '', each_allele)

        if clean_allele not in known_labels:
            # Keep the un-stripped name so the report stays readable.
            unknown_alleles.append(each_allele)

    # Observed by the author:
    # ['BoLA-DRA-DRB30101', 'BoLA-DRA-DRB31101', 'BoLA-DRA-DRB31501']
    # However, these 3 alleles don't exist.
    print(unknown_alleles)
    print(len(unknown_alleles))

def organize_tools_mapping():
    """Rewrite tools-mapping.tsv with its rows ordered by the 'Tool Label' column.

    The file is read from the parent of this script's directory and is
    overwritten in place.
    """
    mapping_path = str(Path(__file__).parent.parent) + '/tools-mapping.tsv'

    mapping = pd.read_csv(mapping_path, sep='\t', index_col=False)
    # Source and destination are the same file, so sorting happens in place on disk.
    mapping.sort_values(by='Tool Label').to_csv(mapping_path, sep='\t', index=False)

    
if __name__ == '__main__':
    # Run log: each step below was executed once, in order, and then commented
    # out.  Only the final step is still active.  (The notes were previously
    # bare string literals, which are evaluated and discarded at runtime; they
    # are now ordinary comments.)

    # Retrieve the base file
    # create_master_file()

    # Compare: tools-mapping.tsv / Tools_MRO_mapping.xlsx
    # Result: Added 6 alleles (found MRO ID from mro_molecules.tsv) to the tools-mapping.tsv
    # add_netmhciipan_4_1_alleles()
    # comparison()

    # Compare: tools-mapping.tsv / allele.list
    # Result: Added 6 alleles (H2 alleles + MRO ID found in mro_molecules.tsv) to the tools-mapping.tsv
    # compare_allele_list()
    # comparison2()

    # Compare: tools-mapping.tsv / allelelist.txt
    # Result: Matches exactly -- Nothing to add
    # compare_allelelist()

    # Compare: tools-mapping.tsv / convert_IEDB.dat
    # Result: Found these new alleles ['BoLA-DRA-DRB30101', 'BoLA-DRA-DRB31101', 'BoLA-DRA-DRB31501'].
    #         However, they don't exist in DTU/IEDB Tools/MHCII standalone.
    #         Thus, nothing to do as there are NO new alleles to be added.
    # compare_iedb_dat()

    # Reorganize tools-mapping by sorting the Tool Label column.
    organize_tools_mapping()