import re
import urllib.request
# import subprocess
# import mhcflurry
import pandas as pd
from pathlib import Path
from tqdm import tqdm
# from difflib import SequenceMatcher
# from allele_validator import AlleleValidator
from allele_info import MHCNPAlleleData, NetCTLpanAlleleData


# ===============================================================================================
# Global Vars.
# ===============================================================================================
# Absolute path of the 'data' directory located next to this source file.
DATA_DIR = str(Path(__file__).resolve().parents[0]) + "/data"
# ORIGINAL_MAPPING_FILE = 'Tools_MRO_mapping.xlsx'
# File names below are joined with DATA_DIR at the point of use.
ORIGINAL_MAPPING_FILE = 'Tools_MRO_Mapping_VFYD.xlsx'
MRO_MOLECULES_FILE = 'mro_molecules.tsv'
TOOLS_MAPPING_FILE = 'tools-mapping.tsv'
TOOLS_MAPPING_EXCEL = 'tools-mapping.xlsx'
METHOD_FILE = 'method-table.xlsx'
ALLELE_LENGTH_FILE = 'allele-lengths.xlsx'
MOLECULE_OUTFILE = 'processed_molecule_export.tsv'
# Remote plain-text list of NetMHCpan-4.1 allele names (one name per line is assumed by the readers).
DTU_NETMHCPAN_ALLELES_URL = 'https://services.healthtech.dtu.dk/services/NetMHCpan-4.1/MHC_allele_names.txt'
# Local allele list files shipped under DATA_DIR.
NETMHCPAN_ALLELE_PATH = DATA_DIR + '/netmhcpan-4.1/allelenames'
NETMHCIIPAN_ALLELE_PATH = DATA_DIR + '/netmhciipan-4.1/allele.list'
MISSING_31_ALLELES_FILE = 'tools_mro5_31_23.xlsx'
SPECIAL_CHARS_PATTERN = r'[^\w]'    # match anything that's not alphanumeric or underscore
FINAL_MOL_DATA_HEADER = ['IEDB Label', 'Synonyms', 'MRO ID', 'In Taxon']

# The Tools_MRO_mapping file doesn't specify which method belongs to which tool,
# so MHCI_METHODS and MHCII_METHODS are defined separately here.
MHCI_METHODS = [
    'ann', 
    'ann-3.4', 
    'comblib_sidney2008', 
    'consensus', 
    'netmhccons', 
    'netmhcpan', 
    'netmhcstabpan', 
    'pickpocket', 
    'recommended', 
    'smm', 
    'smmpmbec', 
    'netmhcpan-4.1'
    ]
MHCII_METHODS = [
    'comblib', 
    'consensus', 
    'netmhciipan', 
    'nn_align', 
    'nn_align-2.3', 
    'recommended', 
    'smm_align', 
    'tepitope'
    ]

# ===============================================================================================
# HELPER FUNCTIONS
# ===============================================================================================
def clean_label(label):
    '''
    Normalise an allele label so that differently punctuated spellings compare equal.

    Underscores are first rewritten as hyphens; every character matched by
    SPECIAL_CHARS_PATTERN is then dropped and the remainder is lower-cased.
    '''
    with_hyphens = re.sub('_', '-', label)
    stripped = re.sub(SPECIAL_CHARS_PATTERN, '', with_hyphens)
    return stripped.lower()



# ===============================================================================================
# TOOLS MAPPING CODE
# ===============================================================================================
def remove_empty_mro_id() :
    '''===============================================================================================
        \n\tDescription :
          This function will drop every row of 'tools-mapping' whose 'MRO ID' is null and
          write the remaining rows back to the same file.

        Parameters :\n
          \t- None

        Return Value :\n
          \t- TSV file (tools-mapping.tsv)\n
    ==============================================================================================='''
    mapping_path = '{}/{}'.format(DATA_DIR, TOOLS_MAPPING_FILE)
    mapping_df = pd.read_csv(mapping_path, skipinitialspace=True, sep='\t')

    # Keep only the rows that carry an MRO ID
    has_mro_id = mapping_df['MRO ID'].notnull()
    rows_with_mro_id = mapping_df[has_mro_id]

    rows_with_mro_id.to_csv(mapping_path, sep='\t', index=False)
    

def remove_duplicate_mro_id() :
    '''===============================================================================================
        \n\tDescription :
          This function will keep only the first row of 'tools-mapping' for every distinct
          (Tool, Tool Version, MRO ID) combination and discard later rows sharing that
          combination. Only the columns 'Tool Group', 'Tool', 'Tool Version', 'Tool Label',
          'IEDB Label', 'MRO ID' and 'Lengths' are written back; 'Tool Version' and 'MRO ID'
          are stored as strings (a missing MRO ID therefore becomes 'nan').

        Parameters :\n
          \t- None

        Return Value :\n
          \t- TSV file (tools-mapping.tsv)\n
    ==============================================================================================='''
    mapping_path = '{}/{}'.format(DATA_DIR, TOOLS_MAPPING_FILE)
    source_df = pd.read_csv(mapping_path, skipinitialspace=True, sep='\t')
    header = list(source_df.columns)

    def _cell(record, column_name) :
        # Positional lookup of a column value inside a plain row tuple
        return record[header.index(column_name)]

    first_seen = {}
    for record in tqdm(source_df.itertuples(name=None, index=False)) :
        mro_id = str(_cell(record, 'MRO ID'))
        tool = _cell(record, 'Tool')
        tool_version = str(_cell(record, 'Tool Version'))

        dedup_key = (tool, tool_version, mro_id)
        if dedup_key in first_seen :
            # A row with the same tool / version / MRO ID was already kept
            continue

        first_seen[dedup_key] = {
            'Tool Group': _cell(record, 'Tool Group'),
            'Tool': _cell(record, 'Tool'),
            'Tool Version': str(_cell(record, 'Tool Version')),
            'Tool Label': _cell(record, 'Tool Label'),
            'IEDB Label': _cell(record, 'IEDB Label'),
            'MRO ID': str(mro_id), # missing IDs are kept as the string 'nan'
            'Lengths': _cell(record, 'Lengths'),
        }

    kept_rows = list(first_seen.values())
    deduplicated_df = pd.DataFrame.from_records(kept_rows)

    print("Finished combining Tool Labels that map to the same MRO ID.")

    # Overwrite the mapping file with the de-duplicated rows
    deduplicated_df.to_csv(mapping_path, sep='\t', index=False)

def add_missing_netmhcpan_closest_alleles_as_synonyms() :
    '''===============================================================================================
        \n\tDescription :
          This function will read the 'unique-missing-alleles' sheet of the missing-alleles
          workbook, group the listed alleles by their 'MRO ID', and append each group to the
          'Synonyms' column of the row in 'mhc_alleles.tsv' that has the same 'MRO ID'.
          Sheet rows without an MRO ID are skipped (and reported on stdout).

        Parameters :\n
          \t- None

        Return Value :\n
          \t- TSV file (mhc_alleles.tsv)\n
    ==============================================================================================='''
    missing_alleles_df = pd.read_excel(r'{}/{}'.format(DATA_DIR, MISSING_31_ALLELES_FILE), 
                                        sheet_name='unique-missing-alleles', engine='openpyxl')
    header = list(missing_alleles_df.columns)

    mol_path = '{}/{}'.format(DATA_DIR, 'mhc_alleles.tsv')
    mol_df = pd.read_csv(mol_path, skipinitialspace=True, sep='\t')
    mol_header = list(mol_df.columns)

    # key: mro id, value: list of unique-missing-alleles sharing that id
    missing_alleles_dict = {}
    for row in tqdm(missing_alleles_df.itertuples(name=None, index=False)) :
        mro_id = row[header.index('MRO ID')]
        missing_allele = row[header.index('unique-missing-alleles')]

        if str(mro_id) == 'nan' :
            print("Skipping allele %s (%s)..." %(missing_allele, mro_id))
            continue

        missing_alleles_dict.setdefault(mro_id, []).append(missing_allele)

    # Use the MRO ID to find where to add missing alleles as synonyms.
    # itertuples() puts the row index at position 0, hence the '+ 1' offsets.
    for mol_row in tqdm(mol_df.itertuples(name=None)) :
        mro_id = mol_row[mol_header.index('MRO ID') + 1]

        # Direct lookup instead of scanning every dictionary entry per row
        alleles_to_add = missing_alleles_dict.get(mro_id)
        if alleles_to_add is None :
            continue

        synonyms = []
        current_synonyms = mol_row[mol_header.index('Synonyms') + 1]
        if str(current_synonyms) != 'nan' :
            synonyms = current_synonyms.split('|')

        mol_df.loc[mol_row[0], 'Synonyms'] = '|'.join(synonyms + alleles_to_add)

    # Save to file
    mol_df.to_csv(mol_path, sep='\t', index=False)


def add_missing_netmhcpan_output_alleles() :
    '''===============================================================================================
        \n\tDescription :
          This function will add 18 missing netmhcpan output alleles as synonyms. Although they are
          missing from the original source files, such as mro_molecules and Tools-Mapping file, they
          closely resemble existing alleles in those files. Thus, it is safe enough to add these
          missing 18 alleles as synonyms.

          For every tool label listed below, the MRO ID is taken from the netmhcpan 4.1 rows of
          'tools-mapping'; the paired output allele is then appended to the 'Synonyms' of the
          'mhc_alleles.tsv' row carrying that MRO ID. If several labels resolve to the same
          MRO ID, only the first one (in list order) is used.

        Parameters :\n
          \t- None

        Return Value :\n
          \t- TSV file (mhc_alleles.tsv)\n
    ==============================================================================================='''
    tools_mapping_df = pd.read_csv('{}/{}'.format(DATA_DIR, TOOLS_MAPPING_FILE), skipinitialspace=True, sep='\t')
    tm_header = list(tools_mapping_df.columns)

    mol_path = '{}/{}'.format(DATA_DIR, 'mhc_alleles.tsv')
    mol_df = pd.read_csv(mol_path, skipinitialspace=True, sep='\t')
    mol_header = list(mol_df.columns)

    # 18 missing alleles (netmhcpan output format)
    missing_output_alleles = [
        'HLA-A*01:159',
        'HLA-A*01:281',
        'HLA-A*02:437',
        'HLA-A*02:581',
        'HLA-A*02:728',
        'HLA-A*02:795',
        'HLA-A*03:260',
        'HLA-A*24:329',
        'HLA-A*24:378',
        'HLA-B*18:106',
        'HLA-B*27:185',
        'HLA-B*41:56',
        'HLA-C*04:338',
        'HLA-C*07:226',
        'HLA-C*07:432',
        'HLA-C*07:713',
        'HLA-C*12:139',
        'HLA-C*15:20'
    ]

    # tools label (same order as the list above)
    # NOTE(review): the last pair maps label 'HLA-C15:27' to output 'HLA-C*15:20';
    # the numbers differ -- confirm this pairing is intended.
    corr_input_alleles = [
        'HLA-A01:159',
        'HLA-A01:281',
        'HLA-A02:437',
        'HLA-A02:581',
        'HLA-A02:728',
        'HLA-A02:795',
        'HLA-A03:260',
        'HLA-A24:329',
        'HLA-A24:378',
        'HLA-B18:106',
        'HLA-B27:185',
        'HLA-B41:56',
        'HLA-C04:338',
        'HLA-C07:226',
        'HLA-C07:432',
        'HLA-C07:713',
        'HLA-C12:139',
        'HLA-C15:27'
    ]

    label_and_output_allele_dict = dict(zip(corr_input_alleles, missing_output_alleles))

    # Find MRO ID for each tools label from the tools-mapping file
    # (if a label occurs on several netmhcpan 4.1 rows, the last row wins).
    label_and_mroid_dict = {}
    for tm_row in tqdm(tools_mapping_df.itertuples(name=None)) :
        mro_id = tm_row[tm_header.index('MRO ID') + 1]
        tool_name = tm_row[tm_header.index('Tool') + 1]
        tool_version = tm_row[tm_header.index('Tool Version') + 1]
        tool_label = tm_row[tm_header.index('Tool Label') + 1]

        if tool_name != 'netmhcpan' or str(tool_version) != '4.1':
            continue

        if tool_label in label_and_output_allele_dict:
            label_and_mroid_dict[tool_label] = mro_id

    # Reverse lookup: MRO ID -> output allele. Built in label order so the first
    # label wins for a shared MRO ID; missing (NaN) IDs can never match a row.
    mroid_and_output_allele_dict = {}
    for label in corr_input_alleles :
        if label not in label_and_mroid_dict :
            continue
        mro_id = label_and_mroid_dict[label]
        if pd.isna(mro_id) :
            continue
        mroid_and_output_allele_dict.setdefault(mro_id, label_and_output_allele_dict[label])

    # Use the MRO ID to find where to add missing alleles as synonyms
    for mol_row in tqdm(mol_df.itertuples(name=None)) :
        mro_id = mol_row[mol_header.index('MRO ID') + 1]

        missing_output_allele = mroid_and_output_allele_dict.get(mro_id)
        if missing_output_allele is None :
            continue

        synonyms = []
        current_synonyms = mol_row[mol_header.index('Synonyms') + 1]
        if str(current_synonyms) != 'nan' :
            synonyms = current_synonyms.split('|')

        # NOTE(review): an earlier revision special-cased the value 'HLA-C15:20'
        # (no '*') here. No entry of missing_output_alleles has that spelling, so
        # the branch could never run and was dropped. If the un-starred synonym is
        # wanted as well, add it to the lists above.
        synonyms.append(missing_output_allele)

        mol_df.loc[mol_row[0], 'Synonyms'] = '|'.join(synonyms)

    # Save to file
    mol_df.to_csv(mol_path, sep='\t', index=False)



def add_tool_label_as_synonym() :
    '''===============================================================================================
        \n\tDescription :
          This function will map 'MRO ID' from 'mro_molecules' to 'tools-mapping', and will grab the
          corresponding 'Tool Label' values to add as synonyms. A 'Tool Label' that is already
          listed as a synonym, or that equals the molecule's 'IEDB Label', is not added.
          New synonyms are appended in the order in which they first appear in 'tools-mapping',
          so the output is the same on every run.

        Parameters :\n
          \t- None

        Return Value :\n
          \t- TSV file (mhc_alleles.tsv)\n
    ==============================================================================================='''
    mol_df = pd.read_csv('{}/{}'.format(DATA_DIR, 'mro_molecules.tsv'), skipinitialspace=True, sep='\t')
    tools_mapping_df = pd.read_csv('{}/{}'.format(DATA_DIR, TOOLS_MAPPING_FILE), skipinitialspace=True, sep='\t')
    mol_header = list(mol_df.columns)

    # Group the tool labels by MRO ID once, instead of filtering the whole
    # tools-mapping table for every molecule row. Rows lacking an MRO ID or a
    # Tool Label cannot contribute a synonym and are skipped.
    labels_by_mro_id = {}
    for tm_mro_id, tm_tool_label in zip(tools_mapping_df['MRO ID'], tools_mapping_df['Tool Label']) :
        if pd.isna(tm_mro_id) or pd.isna(tm_tool_label) :
            continue
        labels_by_mro_id.setdefault(tm_mro_id, []).append(tm_tool_label)

    # itertuples() puts the row index at position 0, hence the '+ 1' offsets.
    for mol_row in tqdm(mol_df.itertuples(name=None)) :
        mro_id = mol_row[mol_header.index('MRO ID') + 1]
        iedb_label = mol_row[mol_header.index('IEDB Label') + 1]
        synonyms = []

        current_synonyms = mol_row[mol_header.index('Synonyms') + 1]
        if str(current_synonyms) != 'nan' :
            synonyms = current_synonyms.split('|')

        for potential_synonym in labels_by_mro_id.get(mro_id, ()) :
            if (potential_synonym not in synonyms) and (potential_synonym != iedb_label):
                synonyms.append(potential_synonym)

        mol_df.loc[mol_row[0], 'Synonyms'] = '|'.join(synonyms)

    # Save to file
    mol_df.to_csv('{}/{}'.format(DATA_DIR, 'mhc_alleles.tsv'), sep='\t', index=False)


def populate_immunogenicity_alleles():
    '''===============================================================================================
        \n\tDescription :
          This function will add one 'immunogenicity' (version 3.0, group mhci, length 9) row to
          'tools-mapping' for every allele listed below whose cleaned label (see clean_label)
          equals the cleaned 'Tool Label' of an existing row. The 'IEDB Label' and 'MRO ID' are
          copied from the first such existing row. Alleles without a match are not added.

        Parameters :\n
          \t- None

        Return Value :\n
          \t- TSV file (tools-mapping.tsv)\n
    ==============================================================================================='''
    mapping_path = '{}/{}'.format(DATA_DIR, TOOLS_MAPPING_FILE)
    tools_mapping_df = pd.read_csv(mapping_path, skipinitialspace=True, sep='\t')
    tools_mapping_header = list(tools_mapping_df.columns)

    # Only the keys (allele names) are used by this function; the values are kept
    # as they were -- presumably position lists, confirm with the tool owner.
    immunogenicity_alleles = {
        "H-2-Db":"2,5,9",
        "H-2-Dd":"2,3,5",
        "H-2-Kb":"2,3,9",
        "H-2-Kd":"2,5,9",
        "H-2-Kk":"2,8,9",
        "H-2-Ld":"2,5,9",
        "HLA-A0101":"2,3,9",
        "HLA-A0201":"1,2,9",
        "HLA-A0202":"1,2,9",
        "HLA-A0203":"1,2,9",
        "HLA-A0206":"1,2,9",
        "HLA-A0211":"1,2,9",
        "HLA-A0301":"1,2,9",
        "HLA-A1101":"1,2,9",
        "HLA-A2301":"2,7,9",
        "HLA-A2402":"2,7,9",
        "HLA-A2601":"1,2,9",
        "HLA-A2902":"2,7,9",
        "HLA-A3001":"1,3,9",
        "HLA-A3002":"2,7,9",
        "HLA-A3101":"1,2,9",
        "HLA-A3201":"1,2,9",
        "HLA-A3301":"1,2,9",
        "HLA-A6801":"1,2,9",
        "HLA-A6802":"1,2,9",
        "HLA-A6901":"1,2,9",
        "HLA-B0702":"1,2,9",
        "HLA-B0801":"2,5,9",
        "HLA-B1501":"1,2,9",
        "HLA-B1502":"1,2,9",
        "HLA-B1801":"1,2,9",
        "HLA-B2705":"2,3,9",
        "HLA-B3501":"1,2,9",
        "HLA-B3901":"1,2,9",
        "HLA-B4001":"1,2,9",
        "HLA-B4002":"1,2,9",
        "HLA-B4402":"2,3,9",
        "HLA-B4403":"2,3,9",
        "HLA-B4501":"1,2,9",
        "HLA-B4601":"1,2,9",
        "HLA-B5101":"1,2,9",
        "HLA-B5301":"1,2,9",
        "HLA-B5401":"1,2,9",
        "HLA-B5701":"1,2,9",
        "HLA-B5801":"1,2,9"
    }

    # One pass over the table: remember the first row for every cleaned tool label.
    # itertuples() puts the row index at position 0, hence the '+ 1' offsets.
    tool_label_pos = tools_mapping_header.index('Tool Label') + 1
    first_row_by_clean_label = {}
    for row in tqdm(tools_mapping_df.itertuples(name=None)) :
        row_tool_label = row[tool_label_pos]
        if not isinstance(row_tool_label, str) :
            # Missing tool labels (NaN) cannot be cleaned or matched
            continue
        first_row_by_clean_label.setdefault(clean_label(row_tool_label), row)

    immuno_rows = []
    for immuno_allele in immunogenicity_alleles :
        matched_row = first_row_by_clean_label.get(clean_label(immuno_allele))
        if matched_row is None :
            continue

        immuno_rows.append({
            'Tool Group': 'mhci',
            'Tool': 'immunogenicity',
            'Tool Version': '3.0',
            'Tool Label': immuno_allele,
            'IEDB Label': matched_row[tools_mapping_header.index('IEDB Label') + 1],
            'MRO ID': matched_row[tools_mapping_header.index('MRO ID') + 1],
            'Lengths': '9'
        })

    # DataFrame.append() no longer exists in pandas 2.x; add all new rows at once.
    if immuno_rows :
        tools_mapping_df = pd.concat([tools_mapping_df, pd.DataFrame(immuno_rows)], ignore_index=True)

    # Write final df to output file
    tools_mapping_df.to_csv(mapping_path, sep='\t', index=False)

    
def populate_length():
    '''===============================================================================================
        \n\tDescription :
          This function will add 'Lengths' column to 'tools-mapping.tsv' file. It will reference the
          lengths from 'allele-lengths.xlsx'.

          For 'mhci' rows whose tool is 'consensus', 'smm' or 'smmpmbec', the allele is looked up
          in the sheet of the same name; if the allele is not listed there, the tool's entry in
          the 'all methods' sheet is used instead. All other 'mhci' rows use the 'all methods'
          sheet directly. Every non-'mhci' row gets the lengths 11 through 30.

        Parameters :\n
          \t- None

        Return Value :\n
          \t- TSV file (tools-mapping.tsv)\n
    ==============================================================================================='''
    mapping_path = '{}/{}'.format(DATA_DIR, TOOLS_MAPPING_FILE)
    tools_mapping_df = pd.read_csv(mapping_path, skipinitialspace=True, sep='\t')
    tools_mapping_header = list(tools_mapping_df.columns)
    length_col = []

    # Read the workbook once; a list of sheet names yields a dict of DataFrames.
    allele_specific_methods = ('consensus', 'smm', 'smmpmbec')
    sheets = pd.read_excel(r'{}/allele-lengths.xlsx'.format(DATA_DIR),
                           sheet_name=['all methods'] + list(allele_specific_methods),
                           engine='openpyxl')
    general_lengths_df = sheets['all methods']

    # mhcii - all alleles have length from 11-30.
    mhcii_lengths = ', '.join(str(_) for _ in range(11, 31))

    # itertuples() puts the row index at position 0, hence the '+ 1' offsets.
    for row in tqdm(tools_mapping_df.itertuples(name=None)) :
        mhc_class = row[tools_mapping_header.index('Tool Group') + 1]
        method = row[tools_mapping_header.index('Tool') + 1]
        allele = row[tools_mapping_header.index('Tool Label') + 1]

        if mhc_class != 'mhci' :
            length_col.append(mhcii_lengths)
            continue

        lengths_df = general_lengths_df
        allele_idx = None

        # Allele-specific sheets are searched by allele name first.
        if method in allele_specific_methods :
            specific_df = sheets[method]
            allele_matches = specific_df.index[specific_df['allele'] == allele]
            if len(allele_matches) > 0 :
                lengths_df = specific_df
                allele_idx = allele_matches[0]

        # General lengths: either the method has no allele-specific sheet or the
        # allele was not listed in it. The index found here belongs to the
        # 'all methods' sheet, so the lengths must be read from that same sheet.
        if allele_idx is None :
            allele_idx = general_lengths_df.index[general_lengths_df['method'] == method][0]

        # Get actual lengths
        lengths = str(lengths_df['lengths'][allele_idx]).strip("{}'")

        # Fill lengths col
        length_col.append(lengths)

    # Update to the dataframe
    tools_mapping_df.insert(len(tools_mapping_header), 'Lengths', length_col)

    # Save to file
    tools_mapping_df.to_csv(mapping_path, sep='\t', index=False)


def check_DTU_alleles_existence() :
    '''===============================================================================================
        \n\tDescription :
          This function will download the NetMHCpan allele names from DTU_NETMHCPAN_ALLELES_URL
          and print every downloaded allele whose cleaned label (see clean_label) does not
          match the cleaned 'Tool Label' of any 'netmhcpan' row in 'tools-mapping'.

        Parameters :\n
          \t- None

        Return Value :\n
          \t- None (findings are printed)\n

        Raises :\n
          \t- RuntimeError if the allele list cannot be fetched from the URL\n
    ==============================================================================================='''
    tools_mapping_df = pd.read_csv('{}/{}'.format(DATA_DIR, TOOLS_MAPPING_FILE), skipinitialspace=True, sep='\t')

    # Take only the netmhcpan-related alleles
    tools_mapping_df = tools_mapping_df[tools_mapping_df['Tool'] == 'netmhcpan']
    tools_mapping_alleles = tools_mapping_df['Tool Label'].tolist()

    # Read NetMHCpan allele names from the URL
    try:
        with urllib.request.urlopen(DTU_NETMHCPAN_ALLELES_URL) as f:
            netmhcpan_alleles_from_dtu = f.read().decode('utf-8').split('\n')
    except urllib.error.URLError as e:
        # Raising a plain string is itself a TypeError in Python 3; raise a real
        # exception and keep the original failure attached as its cause.
        raise RuntimeError("Provided URL(%s) is invalid." %(DTU_NETMHCPAN_ALLELES_URL)) from e

    # Drop empty lines, then normalise both sides the same way
    netmhcpan_alleles_from_dtu = [clean_label(_) for _ in netmhcpan_alleles_from_dtu if _]
    known_alleles = {clean_label(_) for _ in tools_mapping_alleles}

    for each_allele in netmhcpan_alleles_from_dtu :
        if each_allele not in known_alleles :
            print("Following allele wasn't found : %s" %(each_allele))


def create_netmhcpan_source_columns() :
    '''===============================================================================================
        \n\tDescription :
          This function will add two new columns which indicate whether a 'netmhcpan' allele
          exists in different sources. 'From DTU Website' compares the 'Tool Label' against
          DTU's netMHCpan-4.1 list (downloaded from DTU_NETMHCPAN_ALLELES_URL).
          'From NetMHCpan-4.1-Exec' compares it against the first name on each line of the
          local allele file at NETMHCPAN_ALLELE_PATH. Values are the strings '1' / '0';
          rows of other tools get '0' in both columns.

        NOTE :
            DTU website contains 3584 allele names.
            Currently, it maps 3581. 3 unmapped alleles can be ignored for now.
            ['H-2-Dq', 'H-2-Kq', 'H-2-Lq']
            
        Parameters :\n
          \t- None

        Return Value :\n
          \t- TSV file (tools-mapping.tsv)\n

        Raises :\n
          \t- RuntimeError if the allele list cannot be fetched from the URL\n
    ==============================================================================================='''
    mapping_path = '{}/{}'.format(DATA_DIR, TOOLS_MAPPING_FILE)
    tools_mapping_df = pd.read_csv(mapping_path, skipinitialspace=True, sep='\t')
    netmhcpan_41_alleles = pd.read_csv(NETMHCPAN_ALLELE_PATH, header=None, sep='\t')
    netmhcpan_exec_alleles = netmhcpan_41_alleles.iloc[:, 0].tolist() # Left alleles are the valid ones
    tools_mapping_header = list(tools_mapping_df.columns)
    exists_in_dtu = []
    exists_in_netmhcpan_exec = []

    # Grab all valid allele names (left column); a set gives constant-time lookups.
    valid_netmhcpan_alleles = {each_alleles_row.split(' ')[0] for each_alleles_row in netmhcpan_exec_alleles}

    # Read NetMHCpan allele names from the URL
    try:
        with urllib.request.urlopen(DTU_NETMHCPAN_ALLELES_URL) as f:
            dtu_lines = f.read().decode('utf-8').split('\n')
    except urllib.error.URLError as e:
        # Raising a plain string is itself a TypeError in Python 3; raise a real
        # exception and keep the original failure attached as its cause.
        raise RuntimeError("Provided URL(%s) is invalid." %(DTU_NETMHCPAN_ALLELES_URL)) from e

    netmhcpan_alleles_from_dtu = {_ for _ in dtu_lines if _}

    # Map out DTU's netMHCpan-4.1 allele names.
    # itertuples() puts the row index at position 0, hence the '+ 1' offsets.
    for row in tqdm(tools_mapping_df.itertuples(name=None)) :
        row_tool = row[tools_mapping_header.index('Tool') + 1]
        row_tool_label = row[tools_mapping_header.index('Tool Label') + 1]

        if row_tool != 'netmhcpan' :
            exists_in_dtu.append('0')
            exists_in_netmhcpan_exec.append('0')
            continue

        # check for DTU list
        exists_in_dtu.append('1' if row_tool_label in netmhcpan_alleles_from_dtu else '0')

        # check for netmhcpan-4.1 exec. list
        exists_in_netmhcpan_exec.append('1' if row_tool_label in valid_netmhcpan_alleles else '0')

    # Update to the dataframe. Both columns are inserted at the same position, so
    # 'From NetMHCpan-4.1-Exec' ends up to the left of 'From DTU Website'.
    tools_mapping_df.insert(len(tools_mapping_header), 'From DTU Website', exists_in_dtu)
    tools_mapping_df.insert(len(tools_mapping_header), 'From NetMHCpan-4.1-Exec', exists_in_netmhcpan_exec)

    tools_mapping_df.to_csv(mapping_path, sep='\t', index=False)

    print("Done")

def take_out_removable_mro_id() :
    '''===============================================================================================
        \n\tDescription :
          This function will drop every row of 'tools-mapping' whose MRO ID is the literal
          text 'remove'. Rows with a blank MRO ID are kept, because those alleles could still
          be mapped in the future.

        Parameters :\n
          \t- None

        Return Value :\n
          \t- TSV file (tools-mapping.tsv)\n
    ==============================================================================================='''
    mapping_path = '{}/{}'.format(DATA_DIR, TOOLS_MAPPING_FILE)
    mapping_df = pd.read_csv(mapping_path, skipinitialspace=True, sep='\t')

    # True for every row that is NOT flagged for removal (blank IDs included)
    not_flagged = mapping_df['MRO ID'].ne('remove')
    remaining_df = mapping_df[not_flagged]

    remaining_df.to_csv(mapping_path, sep='\t', index=False)


def remove_duplicate_rows() :
    '''===============================================================================================
        \n\tDescription :
          This function will keep only the first row of 'tools-mapping' for every distinct
          (Tool, Tool Label, IEDB Label, MRO ID) combination and report each later repeat on
          stdout. Rows that share an MRO ID or IEDB Label but differ in Tool Label are all
          kept, so that every unique Tool Label is captured.

          Only the columns 'Tool Group', 'Tool', 'Tool Version', 'Tool Label', 'IEDB Label'
          and 'MRO ID' are written back; 'MRO ID' is stored as a string.

        Parameters :\n
          \t- None

        Return Value :\n
          \t- TSV file (tools-mapping.tsv)\n
    ==============================================================================================='''
    mapping_path = '{}/{}'.format(DATA_DIR, TOOLS_MAPPING_FILE)
    source_df = pd.read_csv(mapping_path, skipinitialspace=True, sep='\t')
    header = list(source_df.columns)

    def _cell(record, column_name) :
        # Positional lookup of a column value inside a plain row tuple
        return record[header.index(column_name)]

    # key: (tool, tool label, iedb label, mro id) -- value: the retained row
    unique_rows = {}
    for record in tqdm(source_df.itertuples(name=None, index=False)) :
        mro_id = str(_cell(record, 'MRO ID'))
        tool_name = _cell(record, 'Tool')
        tool_label = _cell(record, 'Tool Label')
        iedb_label = _cell(record, 'IEDB Label')

        row_key = (tool_name, tool_label, iedb_label, mro_id)
        if row_key in unique_rows :
            print("Duplicate found... %s" %(tool_label))
            continue

        # NOTE: Some alleles have same iedb_label/mro_id/tool_name, but different tool_label
        unique_rows[row_key] = {
            'Tool Group': _cell(record, 'Tool Group'),
            'Tool': _cell(record, 'Tool'),
            'Tool Version': _cell(record, 'Tool Version'),
            'Tool Label': _cell(record, 'Tool Label'),
            'IEDB Label': _cell(record, 'IEDB Label'),
            'MRO ID': str(mro_id) # missing IDs are kept as the string 'nan'
        }

    retained = list(unique_rows.values())
    deduplicated_df = pd.DataFrame.from_records(retained)

    print("Finished combining Tool Labels that map to the same MRO ID.")

    # Overwrite the mapping file with the de-duplicated rows
    deduplicated_df.to_csv(mapping_path, sep='\t', index=False)


def update_netctlpan_allele_names() :
    '''===============================================================================================
        \n\tDescription :
          This function will look up every 'netctlpan' row of 'tools-mapping' in the original
          mapping workbook (ORIGINAL_MAPPING_FILE) by its cleaned 'Tool Label' (see clean_label).
          On a match, the row's 'IEDB Label' is set to the workbook's 'MRO Name' (with the text
          'protein complex' removed) and its 'MRO ID' to the workbook's 'MRO ID'.
          If several workbook rows share a cleaned label, the last one is used.

        Parameters :\n
          \t- None

        Return Value :\n
          \t- TSV file (tools-mapping.tsv)\n
    ==============================================================================================='''
    mapping_path = '{}/{}'.format(DATA_DIR, TOOLS_MAPPING_FILE)
    tools_mapping_df = pd.read_csv(mapping_path, skipinitialspace=True, sep='\t')
    original_tm_df = pd.read_excel('{}/{}'.format(DATA_DIR, ORIGINAL_MAPPING_FILE), engine='openpyxl')
    header = list(tools_mapping_df.columns)

    source_mro_ids = original_tm_df['MRO ID'].tolist()
    source_tool_labels = original_tm_df['Tool Label'].tolist()
    source_iedb_labels = [
        mro_name.replace('protein complex', '').strip()
        for mro_name in original_tm_df['MRO Name'].tolist()
    ]

    # cleaned tool label -> (IEDB label, MRO ID); later workbook rows overwrite earlier ones
    reference_by_clean_label = {
        clean_label(source_label): (source_iedb, source_mro_id)
        for source_label, source_iedb, source_mro_id
        in zip(source_tool_labels, source_iedb_labels, source_mro_ids)
    }

    netctl_rows = tools_mapping_df[tools_mapping_df['Tool'] == 'netctlpan']
    # Plain tuples: position 0 is the row index, columns follow shifted by one
    for record in tqdm(netctl_rows.itertuples(name=None)):
        row_index = record[0]
        lookup_key = clean_label(record[header.index('Tool Label') + 1])

        if lookup_key not in reference_by_clean_label :
            continue

        iedb_label, mro_id = reference_by_clean_label[lookup_key]
        tools_mapping_df.at[row_index, 'IEDB Label'] = iedb_label
        tools_mapping_df.at[row_index, 'MRO ID'] = mro_id

    # Overwrite the mapping file with the updated rows
    tools_mapping_df.to_csv(mapping_path, sep='\t', index=False)


def add_unmapped_netmhcpan_alleles() :
    '''===============================================================================================
        \n\tDescription :
          As of 06/18/22, Randi was able to map 92 alleles to MRO IDs. This function will copy
          the 'IEDB Label' and 'mro id' of those alleles from 'Additional_netMHCpanRV.xlsx'
          into 'tools-mapping'.

          For each workbook allele with an MRO ID, the FIRST row of 'tools-mapping' with the
          same 'Tool Label' is updated (the row's 'Tool' is not checked). If a tool label is
          listed several times in the workbook, its first listing is used.
          This function does not remove any rows; see remove_empty_mro_id for that.

        Parameters :\n
          \t- None

        Return Value :\n
          \t- TSV file (tools-mapping.tsv)\n
    ==============================================================================================='''
    mapping_path = '{}/{}'.format(DATA_DIR, TOOLS_MAPPING_FILE)
    tools_mapping_df = pd.read_csv(mapping_path, skipinitialspace=True, sep='\t')
    additional_netmhcpan_df = pd.read_excel('{}/{}'.format(DATA_DIR, 'Additional_netMHCpanRV.xlsx'), sheet_name='Sheet1', engine='openpyxl')
    netmhcpan_df_header = list(additional_netmhcpan_df.columns)
    # Remove all 'Unnamed' columns -- excel sheet contains many empty columns
    netmhcpan_df_header = [column_name for column_name in netmhcpan_df_header if not column_name.startswith('Unnamed')]

    # Filter to only include those with MRO ID's mapped (92 alleles)
    filtered_netmhcpan_list = additional_netmhcpan_df[additional_netmhcpan_df['mro id'].notnull()]
    filtered_netmhcpan_df = pd.DataFrame(filtered_netmhcpan_list, columns=netmhcpan_df_header)

    # Create dictionary -- tool_label (key) and IEDB_label/MRO_ID (value); first listing wins
    additional_netmhcpan_alleles_dict = {}
    additional_rows = zip(filtered_netmhcpan_df['Tool Label'].tolist(),
                          filtered_netmhcpan_df['IEDB Label'].tolist(),
                          filtered_netmhcpan_df['mro id'].tolist())
    for additional_tool_label, additional_iedb_label, additional_mro_id in additional_rows :
        additional_netmhcpan_alleles_dict.setdefault(additional_tool_label,
                                                     (additional_iedb_label, additional_mro_id))

    # Index of the first tools-mapping row for every tool label, built in one pass
    # so the table is not rescanned for each additional allele.
    first_index_by_tool_label = {}
    for row_index, row_tool_label in zip(tools_mapping_df.index, tools_mapping_df['Tool Label']) :
        if pd.isna(row_tool_label) :
            continue
        first_index_by_tool_label.setdefault(row_tool_label, row_index)

    # Find the additional alleles in the tools-mapping, and update IEDB Labels and MRO IDs.
    for netmhcpan_tool_label, netmhcpan_data in tqdm(additional_netmhcpan_alleles_dict.items()) :
        row_index = first_index_by_tool_label.get(netmhcpan_tool_label)
        if row_index is None :
            continue
        tools_mapping_df.at[row_index, 'IEDB Label'] = netmhcpan_data[0]
        tools_mapping_df.at[row_index, 'MRO ID'] = netmhcpan_data[1]

    # Write final df to output file
    tools_mapping_df.to_csv(mapping_path, sep='\t', index=False)

def tool_label_indicator():
    '''
    This will iterate over tools-mapping and determine if the 'Tool_Label' is same as the input (1st column) for 
    'netMHCpan-4.1/allelenames' or output (2nd column).

    Two columns are added to tools-mapping and the file is rewritten in place:
        netmhcpan_input_format  - True/False for rows whose 'Tool' is 'netmhcpan', '-' otherwise
        netmhcpan_output_format - True/False for rows whose 'Tool' is 'netmhcpan', '-' otherwise

    Blank lines in the allelenames file are ignored; any other line must hold exactly two
    whitespace-separated fields, otherwise a ValueError is raised.

    Result (as recorded by the original author):
    10933 datapoints, where netmhcpan_input_format == False.
        1 - Gogo* (gorilla)
        10379 - HLA* (human)
        405 - Mamu* (macaque)
        104 - Patr* (chimpanzee)
        44 - SLA* (pig)
    '''
    tools_mapping_df = pd.read_csv('{}/{}'.format(DATA_DIR, TOOLS_MAPPING_FILE), skipinitialspace=True, sep='\t')

    # input label (1st column) -> output label (2nd column).
    # When an input label is listed more than once, the last line wins.
    netmhcpan_labels_dict = {}
    with open(NETMHCPAN_ALLELE_PATH, 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Tolerate blank/trailing lines instead of failing on the unpacking below.
                continue
            netmhcpan_tool_label, netmhcpan_iedb_label = fields
            netmhcpan_labels_dict[netmhcpan_tool_label] = netmhcpan_iedb_label

    # Build the lookup sets once; rebuilding lists for every row made the scan quadratic.
    # The output set is derived from the dictionary so duplicated inputs keep "last line wins".
    input_labels = set(netmhcpan_labels_dict)
    output_labels = set(netmhcpan_labels_dict.values())

    netmhcpan_input_format = []
    netmhcpan_output_format = []
    for tm_tool, tm_tool_label in zip(tools_mapping_df['Tool'], tools_mapping_df['Tool Label']):
        if tm_tool != 'netmhcpan':
            # Rows of other tools are not compared against the allelenames file.
            netmhcpan_input_format.append('-')
            netmhcpan_output_format.append('-')
            continue

        netmhcpan_input_format.append(tm_tool_label in input_labels)
        netmhcpan_output_format.append(tm_tool_label in output_labels)

    tools_mapping_df['netmhcpan_input_format'] = netmhcpan_input_format
    tools_mapping_df['netmhcpan_output_format'] = netmhcpan_output_format

    # Write final df to output file    
    tools_mapping_df.to_csv('{}/{}'.format(DATA_DIR, TOOLS_MAPPING_FILE), sep='\t', index=False)


# def update_netmhcpan_allele_names() :
#     '''===============================================================================================
#         \n\tDescription :
#             This function will take DTU's NetMHCpan-4.1 allele names, and map that to 'tools-mapping.tsv'.

#             NOTE: Not all alleles from DTU's list will map to 'tools-mapping.tsv'.

#             Parameters :\n
#             \t- None

#             Return Value :\n
#             \t- TSV file (tools-mapping.tsv)\n
#     ==============================================================================================='''
#     mol_df = pd.read_csv('{}/{}'.format(DATA_DIR, MRO_MOLECULES_FILE), skipinitialspace=True, sep='\t')
#     tools_mapping_df = pd.read_csv('{}/{}'.format(DATA_DIR, TOOLS_MAPPING_FILE), skipinitialspace=True, sep='\t')
#     dtu_netmhcpan_alleles = []
#     netmhcpan_allele_mapper_dict = {}

#     # Filter MRO data with alleles only relevant to NetMHCpan and NetMHCpan 4.1
#     filtered_tools_mapping_df = tools_mapping_df[(tools_mapping_df['Tool']=='netmhcpan')]
    
#     # Create another dictionary that holds Tools Label (key) and IEDB Label (value) of the
#     # entire Tools Mapping.
#     tm_tool_label = filtered_tools_mapping_df['Tool Label'].tolist()
#     tm_mroid = filtered_tools_mapping_df['MRO ID'].tolist()
#     cleaned_tm_tool_label = [clean_label(each_allele) for each_allele in tm_tool_label]
#     tm_iedb_label = filtered_tools_mapping_df['IEDB Label'].tolist()
#     tm_dict = dict(zip(cleaned_tm_tool_label, tm_iedb_label))

    
#     with open(NETMHCPAN_ALLELE_PATH, 'r') as f:
#         for line in f.readlines():
#             input_allele, output_allele = line.strip().split(' ')
#             # [(tool_label, iedb_label)]
#             dtu_netmhcpan_alleles.append((input_allele, output_allele))

#     # Dictionary with key representing allele names without special chars.
#     # and value with actual NetMHCpan allele names.
#     # ex.) {bola100901: BoLA-1*009:01}
#     for netmhcpan_allele_set in dtu_netmhcpan_alleles :
#         clean_netmhcpan_allele = clean_label(netmhcpan_allele_set[0])

#         # Need to consider for duplicates with different IEDB Label (2nd col) in allelenames file.
#         if clean_netmhcpan_allele in netmhcpan_allele_mapper_dict :
#             netmhcpan_allele_mapper_dict[clean_netmhcpan_allele].append(netmhcpan_allele_set) 
#         else :    
#             netmhcpan_allele_mapper_dict[clean_netmhcpan_allele] = [netmhcpan_allele_set]
    
#     organized_netmhcpan_allele_info = {}
#     for k, v in netmhcpan_allele_mapper_dict.items():
#         actual_iedb_label = ''
#         secondary_tool_label = ''
#         secondary_iedb_label = ''
#         prevailing_index = 0

#         if k in tm_dict :
#             actual_iedb_label = tm_dict[k]

#         # max length 'v' can have is 2
#         if 1 < len(v) :
#             # compare output label of netmhcpan to see which is closer to actual IEDB label
#             spec_char_bias_score_0 = 1
#             spec_char_bias_score_1 = 1
#             if ':' in v[0][1] : 
#                 spec_char_bias_score_0 += 0.1
#             if ':' in v[1][1] : 
#                 spec_char_bias_score_1 += 0.1
#             similarity_score_0 = SequenceMatcher(None, v[0][1], actual_iedb_label).ratio() * spec_char_bias_score_0
#             similarity_score_1 = SequenceMatcher(None, v[1][1], actual_iedb_label).ratio() * spec_char_bias_score_1
#             # print("Calculating score for {} and {} against {}.".format(v[0][1], v[1][1], actual_iedb_label))
            
#             if similarity_score_0 < similarity_score_1 :
#                 secondary_iedb_label = v[0][1]
#                 secondary_tool_label = v[0][0]

#             prevailing_index = 1

#         # Save labels that resembles more of the actual IEDB label
#         if k not in organized_netmhcpan_allele_info :
#             organized_netmhcpan_allele_info[k] = {
#                 'tool_label': v[prevailing_index][0],
#                 'iedb_label': v[prevailing_index][1],
#                 'iedb_syn': [secondary_tool_label, secondary_iedb_label],
#                 'mro_id': ''
#             }
        
#     # for k, v in organized_netmhcpan_allele_info.items() :
#     #     print(k, '-->', v)

#     # print(len(organized_netmhcpan_allele_info))
    
    
#     # df = pd.read_csv(NETMHCPAN_ALLELE_PATH)
#     # df.to_csv('{}/{}'.format(DATA_DIR, 'allelenames.csv', index=None))

#     # exit()

    
#     # NOTE: 
#     #   Some of these alleles are completely new, so they don't have MRO ID.
#     #   However, those that do have MRO ID, we need to pull that information from the mro_molecules sheet.
#     mol_iedb_label = mol_df['IEDB Label'].tolist()
#     cleaned_mol_iedb_label = [clean_label(each_allele) for each_allele in mol_iedb_label]
#     mol_mro_id = mol_df['MRO ID'].tolist()
#     mol_iedb_mroid_dict = dict(zip(cleaned_mol_iedb_label, mol_mro_id))

#     # Iterate mro_molecule file in order to assign MRO_ID to each alleles.
#     for k in organized_netmhcpan_allele_info.keys() :
#         if k in mol_iedb_mroid_dict :
#             mro_id = mol_iedb_mroid_dict[k]
#             organized_netmhcpan_allele_info[k]['mro_id'] = mro_id

    
#     # Following applies to those alleles with MRO ID.
#     # Also, corrects 'IEDB Label' and updates 'IEDB Synonyms'.
#     # --> With current data, if IEDB_Label is not correct when compared to mro_molecule file,
#     # v['iedb_label'] will equal to the IEDB_Label in the mro_molecule file. And the replaced allele
#     # will be added to the v['iedb_syn'] as synonym.
#     for k, v in organized_netmhcpan_allele_info.items():
#         if v['mro_id']:
#             matched_row = mol_df.loc[mol_df['MRO ID'] == v['mro_id']]
#             mol_iedb_label = matched_row['IEDB Label'].to_string(index=False).strip()

#             if v['iedb_label'] != mol_iedb_label :
#                 if v['iedb_label'] not in v['iedb_syn']:
#                     v['iedb_syn'].append(v['iedb_label'])
#                     v['iedb_syn'] = list(set(filter(None, v['iedb_syn']))) 
#                 v['iedb_label'] = mol_iedb_label

#     '''
#     At this point, the 'organized_netmhcpan_allele_info' dictionary is finalized.
#     1. Need to use this to populate the tools-mapping.
#     2. Add synonyms to molecule file.
#     '''
#     # for k, v in organized_netmhcpan_allele_info.items() :
#     #     print(k, v)
#     # exit()
    

#     # Iterate over tools-mapping file and whatever allele from the 'organized_netmhcpan_allele_info' dictionary
#     # is not found in tools-mapping should be added.
#     # NOTE: Adding new 11122 alleles as of 08/15/22
#     for k, v in organized_netmhcpan_allele_info.items() :
#         netmhcpan_tool_label = v['tool_label']
#         netmhcpan_mroid = v['mro_id']

#         if netmhcpan_mroid not in tm_mroid :
#             row = {
#                 'Tool Group': 'mhci',
#                 'Tool': 'netmhcpan',
#                 'Tool Version': '4.1',
#                 'Tool Label': netmhcpan_tool_label,
#                 'IEDB Label': v['iedb_label'],
#                 'MRO ID': v['mro_id']
#             }
#             tools_mapping_df = tools_mapping_df.append(row, ignore_index = True)
#         else :
#             tools_mapping_df.loc[(tools_mapping_df['MRO ID'] == netmhcpan_mroid) & (tools_mapping_df['Tool'] == 'netmhcpan'), 'Tool Label'] = netmhcpan_tool_label
#             tools_mapping_df.loc[(tools_mapping_df['MRO ID'] == netmhcpan_mroid) & (tools_mapping_df['Tool'] == 'netmhcpan'), 'IEDB Label'] = v['iedb_label']
#             tools_mapping_df.loc[(tools_mapping_df['MRO ID'] == netmhcpan_mroid) & (tools_mapping_df['Tool'] == 'netmhcpan'), 'MRO ID'] = v['mro_id']

#     # Write final df to output file    
#     tools_mapping_df.to_csv('{}/{}'.format(DATA_DIR, TOOLS_MAPPING_FILE), sep='\t', index=False)
    
#     # Iterate over mro_molecules to add any additional synonyms from the 'organized_netmhcpan_allele_info' dictionary.
#     mol_mroid_list = mol_df['MRO ID'].tolist()
#     for k, v in organized_netmhcpan_allele_info.items() :
#         netmhcpan_mroid = v['mro_id']
#         if netmhcpan_mroid in mol_mroid_list :
#             matched_row = mol_df.loc[mol_df['MRO ID'] == netmhcpan_mroid]
#             matched_row_syn_list = matched_row['Synonyms'].to_string(index=False).replace(',', '|').split('|')
#             matched_row_syn_list = [_.strip() for _ in matched_row_syn_list]
            
#             for fresh_netmhcpan_syn in v['iedb_syn'] :
#                 if fresh_netmhcpan_syn and fresh_netmhcpan_syn not in matched_row_syn_list :
#                     matched_row_syn_list.append(fresh_netmhcpan_syn)
#                     if 'NaN' in matched_row_syn_list :
#                         matched_row_syn_list.remove('NaN')

#                     # Add these synonym list as string to mro_mol file.
#                     mol_df.loc[mol_df['MRO ID'] == netmhcpan_mroid, 'Synonyms'] = '|'.join(matched_row_syn_list)

#     mol_df.to_csv('{}/{}'.format(DATA_DIR, 'mro_molecules_updated2.tsv'), sep='\t', index=False)
    

def update_netmhciipan_allele_names() :
    '''===============================================================================================
        \n\tDescription :
          This function will take DTU's NetMHCIIpan-4.1 allele names ('allele.list'), and append one
          'netmhciipan' 4.1 row per allele to 'tools-mapping.tsv'.

          Each allele name is normalised with clean_label(); names whose cleaned form starts with
          'drb' get an 'hla' prefix. The normalised name is then matched against the cleaned
          'IEDB Label' values of 'mro_molecules.tsv' to obtain the IEDB Label and MRO ID.
          
          NOTE: Not all alleles from DTU's list will map to an IEDB Label. Those rows are still added,
          with empty 'IEDB Label' and 'MRO ID' ('DRB5_0108N' is hardcoded as the only exception).

        Parameters :\n
          \t- None

        Return Value :\n
          \t- TSV file (tools-mapping.tsv)\n
    ==============================================================================================='''
    mol_df = pd.read_csv('{}/{}'.format(DATA_DIR, MRO_MOLECULES_FILE), skipinitialspace=True, sep='\t')
    tools_mapping_df = pd.read_csv('{}/{}'.format(DATA_DIR, TOOLS_MAPPING_FILE), skipinitialspace=True, sep='\t')

    # Convert the tools label to iedb label. To do so, only have to append 'HLA' prefix to DRB* alleles.
    # All other alleles in 'allele.list' are already in IEDB Label format.
    # Mapping: cleaned IEDB Label -> Tools Label (a repeated cleaned name keeps the last tool label).
    netmhciipan_tool_names = {}
    with open(NETMHCIIPAN_ALLELE_PATH, 'r') as netmhciipan_file :
        for line in netmhciipan_file :
            netmhciipan_allele = line.strip()
            cleaned_iedb_name = clean_label(netmhciipan_allele)

            # Converting DRB* alleles to IEDB label by appending 'HLA' prefix
            if cleaned_iedb_name.startswith('drb') :
                cleaned_iedb_name = 'hla' + cleaned_iedb_name

            netmhciipan_tool_names[cleaned_iedb_name] = netmhciipan_allele

    # NOTE:
    #   stripped_iedb_name <- dtu_iedb_label <- dtu_tool_name
    #   v
    #   cleaned molecule IEDB label -> iedb_name -> mro_id
    # Lookup built from the molecules file: cleaned IEDB Label -> original IEDB Label and MRO ID.
    # Later rows overwrite earlier ones that share the same cleaned label.
    mol_allele_meta_data = {}
    for mol_iedb_label, mol_mro_id in zip(mol_df['IEDB Label'].tolist(), mol_df['MRO ID'].tolist()) :
        mol_allele_meta_data[clean_label(mol_iedb_label)] = {
            'iedb_label': mol_iedb_label,
            'mro_id': str(mol_mro_id), # There are some with 'nan' values
        }

    new_rows = []
    for stripped_iedb_name, netmhciipan_tool_label in tqdm(netmhciipan_tool_names.items()) :
        if stripped_iedb_name in mol_allele_meta_data :
            iedb_label = mol_allele_meta_data[stripped_iedb_name]['iedb_label']
            mro_id = mol_allele_meta_data[stripped_iedb_name]['mro_id']
        elif stripped_iedb_name == 'hladrb50108n' :
            # NOTE: Hardcoding in adding information about 'DRB5_0108N'
            iedb_label = 'HLA-DRB5*01:08'
            mro_id = 'MRO:0034509'
        else :
            # Allele unknown to the molecules file: keep the tool label, leave the mapping empty.
            iedb_label = ''
            mro_id = ''

        new_rows.append({
            'Tool Group': 'mhcii',
            'Tool': 'netmhciipan',
            'Tool Version': '4.1',
            'Tool Label': netmhciipan_tool_label,
            'IEDB Label': iedb_label,
            'MRO ID': mro_id
        })

    # DataFrame.append() no longer exists in pandas 2.x; add all rows with a single concat,
    # as the other functions in this file already do.
    if new_rows :
        tools_mapping_df = pd.concat([tools_mapping_df, pd.DataFrame(new_rows)], ignore_index = True)

    # Write final df to output file    
    tools_mapping_df.to_csv('{}/{}'.format(DATA_DIR, TOOLS_MAPPING_FILE), sep='\t', index=False)        


def assign_tool_versions() :
    '''===============================================================================================
        \n\tDescription :
          Rewrites 'tools-mapping.tsv' so that every row carries a bare tool name in 'Tool' and a
          version string in 'Tool Version'.
          - A tool whose name embeds a version is split on the hyphen
            (ex. Tool: ann-3.4 --> Tool: ann, Tool Version: 3.4).
          - 'ann', 'netmhcpan', 'nn_align' and 'netctlpan' receive the fixed versions
            4.0, 4.0, 2.2 and 1.1 respectively.
          - Any other tool takes the 'version' of the first row in 'method-table.xlsx' whose 'method'
            equals the tool and whose 'default_version' is truthy ('MHCI' sheet for the 'mhci' tool
            group, 'MHCII' sheet otherwise). The version stays empty when no such row exists.
          Afterwards the rows for ann 3.4, netmhcpan 4.0 and nn_align 2.2 are dropped, and only the
          six columns 'Tool Group', 'Tool', 'Tool Version', 'Tool Label', 'IEDB Label', 'MRO ID'
          are written back.

        Parameters :\n
          \t- None

        Return Value :\n
          \t- TSV file (tools-mapping.tsv)\n
    ==============================================================================================='''
    method_file_path = '{}/{}'.format(DATA_DIR, METHOD_FILE)
    class_i_methods = pd.read_excel(method_file_path, sheet_name='MHCI', engine='openpyxl')
    class_ii_methods = pd.read_excel(method_file_path, sheet_name='MHCII', engine='openpyxl')
    mapping_df = pd.read_csv('{}/{}'.format(DATA_DIR, TOOLS_MAPPING_FILE), skipinitialspace=True, sep='\t')

    # Column positions are taken from the MHCI sheet and applied to rows of both sheets.
    method_columns = list(class_i_methods.columns)
    mapping_columns = list(mapping_df.columns)

    # Tools that also exist under a name with the version attached (plus netctlpan) get a
    # hardcoded version when they appear without one.
    fixed_versions = {
        'ann': '4.0',
        'netmhcpan': '4.0',
        'nn_align': '2.2',
        'netctlpan': '1.1',
    }

    def default_version_of(name, sheet) :
        # First default entry for 'name' in the given method sheet, or '' when there is none.
        position = method_columns.index
        return next(
            (
                entry[position('version')]
                for entry in sheet.itertuples(name=None, index=False)
                if name == entry[position('method')] and entry[position('default_version')]
            ),
            '',
        )

    names = []
    versions = []
    for record in tqdm(mapping_df.itertuples(name=None, index=False)) :
        group = record[mapping_columns.index('Tool Group')]
        tool = record[mapping_columns.index('Tool')]

        if '-' in tool :
            # Version is part of the tool name (ann-3.4, nn_align-2.3, netmhcpan-4.1).
            name, version = tool.split('-')
        elif tool in fixed_versions :
            name, version = tool, fixed_versions[tool]
        else :
            # Give the default version to the tools without version number specified.
            sheet = class_i_methods if group == 'mhci' else class_ii_methods
            name, version = tool, default_version_of(tool, sheet)

        names.append(name)
        versions.append(str(version))

    # Rebuild the table with the split tool name / version columns.
    versioned_df = pd.DataFrame({
        'Tool Group': mapping_df['Tool Group'].tolist(),
        'Tool': names,
        'Tool Version': versions,
        'Tool Label': mapping_df['Tool Label'].tolist(),
        'IEDB Label': mapping_df['IEDB Label'].tolist(),
        'MRO ID': mapping_df['MRO ID'].tolist(),
    })

    # Filter out old versions of ANN v3.4, NetMHCpan v4.0, and NN_ALIGN v2.2
    tool_column = versioned_df['Tool']
    version_column = versioned_df['Tool Version']
    superseded = (
        ((tool_column == 'ann') & (version_column == '3.4'))
        | ((tool_column == 'netmhcpan') & (version_column == '4.0'))
        | ((tool_column == 'nn_align') & (version_column == '2.2'))
    )
    versioned_df = versioned_df[~superseded]

    # Write final df to output file    
    versioned_df.to_csv('{}/{}'.format(DATA_DIR, TOOLS_MAPPING_FILE), sep='\t', index=False)



# def populate_mhcflurry_alleles() :
#     '''===============================================================================================
#         \n\tDescription :
#           This function will take all alleles supported by MHCflurry and add them to
#           'tools-mapping.tsv'. Alleles that already exist in the table reuse the Tool Label,
#           IEDB Label and MRO ID of the matching row; all others are added with an empty MRO ID.

#         Parameters :\n
#           \t- None

#         Return Value :\n
#           \t- TSV file (tools-mapping.tsv)\n
#     ==============================================================================================='''
#     tools_mapping_df = pd.read_csv('{}/{}'.format(DATA_DIR, TOOLS_MAPPING_FILE), skipinitialspace=True, sep='\t')

#     # Then download mhcflurry datasets and trained models:
#     subprocess.run(["mhcflurry-downloads", "fetch"])
    
#     # Load predictor & retrieve alleles
#     predictor = mhcflurry.Class1PresentationPredictor.load()
#     mhcflurry_alleles = predictor.supported_alleles
#     print('Total number of MHCflurry alleles: %s' %(len(mhcflurry_alleles)))

#     # For each allele, I must check if the allele already exists in the table.
#     # If so, I must retrieve it's MRO ID information.
#     tool_labels = tools_mapping_df['Tool Label'].tolist()
#     iedb_labels = tools_mapping_df['IEDB Label'].tolist()

#     mhcflurry_entries = []
#     for mhcflurry_allele in mhcflurry_alleles :
#         if (mhcflurry_allele in iedb_labels) or (mhcflurry_allele in tool_labels) :
#             # Even if multiple row indices are returned, they all have the same mro_id
#             matched_row = tools_mapping_df.index[tools_mapping_df['IEDB Label'] == mhcflurry_allele].tolist()[0]
#             tool_label = tools_mapping_df['Tool Label'].iloc[matched_row]
#             iedb_label = tools_mapping_df['IEDB Label'].iloc[matched_row]
#             mro_id = tools_mapping_df['MRO ID'].iloc[matched_row]

#             entry = {
#                         'Tool Group': 'mhci',
#                         'Tool': 'mhcflurry',
#                         'Tool Version': '',
#                         'Tool Label': tool_label,
#                         'IEDB Label': iedb_label,
#                         'MRO ID': mro_id
#                     }

#         else :
#             entry = {
#                         'Tool Group': 'mhci',
#                         'Tool': 'mhcflurry',
#                         'Tool Version': '',
#                         'Tool Label': '',
#                         'IEDB Label': mhcflurry_allele,
#                         'MRO ID': ''
#                     }

#         mhcflurry_entries.append(entry)

#     # Turn entries into df
#     mhcflurry_df = pd.DataFrame(mhcflurry_entries)

#     # Append mhcflurry entries to tools_mapping
#     tools_mapping_df = pd.concat([tools_mapping_df, mhcflurry_df])

#     # Write final df to output file    
#     tools_mapping_df.to_csv('{}/{}'.format(DATA_DIR, TOOLS_MAPPING_FILE), sep='\t', index=False)


def populate_alleles_from_allele_info() :
    '''===============================================================================================
        \n\tDescription :
          'Tools_MRO_mapping.xlsx' is missing few alleles from MHCI and other classes, such as 
          MHCII and MHC-NP.\n
          This function will pull alleles from 'allele-info' package and fill missing alleles from 
          each allele data classes to 'tools-mapping.tsv'.

          - MHC-NP: every 'mhcnp' allele is added. Its MRO ID is copied from the first existing row
            with the same 'Tool Label', and its IEDB Label is looked up in 'mro_molecules.tsv'
            through that MRO ID. An IndexError is raised when either lookup finds nothing.
          - NetCTLpan: every allele that has no 'mhci' row for the NetCTLpan method yet is added
            with the tool label as IEDB Label and an empty MRO ID.

        Parameters :\n
          \t- None

        Return Value :\n
          \t- TSV file (tools-mapping.tsv)\n
    ==============================================================================================='''
    tools_mapping_df = pd.read_csv('{}/{}'.format(DATA_DIR, TOOLS_MAPPING_FILE), skipinitialspace=True, sep='\t')
    mol_df = pd.read_csv('{}/{}'.format(DATA_DIR, MRO_MOLECULES_FILE), skipinitialspace=True, sep='\t')

    # Define inner function for populating MHC-NP
    def populate_mhcnp() :
        nonlocal tools_mapping_df
        mhcad = MHCNPAlleleData()
        mhc_method = 'mhcnp'

        # Total 8 MHCNP alleles
        relevant_alleles = mhcad.get_allele_names(method_name=mhc_method)

        new_rows = []
        for allele in relevant_alleles :
            # Get 'MRO ID' from Tools Mapping, then use that to find 'IEDB Label'
            # from mro_molecules.tsv.
            idx = tools_mapping_df.index[tools_mapping_df["Tool Label"] == allele].tolist()
            if not idx :
                raise IndexError("MHC-NP allele '%s' has no matching 'Tool Label' in the Tools Mapping data." %(allele))
            mro_id = tools_mapping_df.loc[idx[0]]["MRO ID"]

            mol_idx = mol_df.index[mol_df["MRO ID"] == mro_id].tolist()
            if not mol_idx :
                raise IndexError("MRO ID '%s' of MHC-NP allele '%s' was not found in the molecules file." %(mro_id, allele))
            iedb_label = mol_df.loc[mol_idx[0]]["IEDB Label"]

            new_rows.append({
                'Tool Group': 'mhci',
                'Tool': mhc_method,
                'Tool Version': '',
                'Tool Label': allele,
                'IEDB Label': iedb_label,
                'MRO ID': mro_id
                })

        # Append all collected rows at once; concatenating inside the loop copies the whole
        # table for every allele. New rows land at the end, so the first-match lookups above
        # see the same rows as they would with per-allele appends.
        if new_rows :
            tools_mapping_df = pd.concat([tools_mapping_df, pd.DataFrame(new_rows)], ignore_index = True)

        num_alleles_added = len(new_rows)
        print("\nTotal of %s alleles from MHC-NP have been added to the MRO data." %(num_alleles_added))
        
    # Define inner function for populating NetCTLpan
    # NOTE: At this point only tools label will be populated.
    def populate_netctlpan() :
        nonlocal tools_mapping_df
        mhcad = NetCTLpanAlleleData()
        mhc_method = mhcad.get_method_names()[0]
        species = mhcad.get_species_list()

        # Tool labels already present for this method. Kept up to date while rows are collected
        # so an allele listed twice is still added only once.
        already_mapped = set(
            tools_mapping_df.loc[
                (tools_mapping_df['Tool Group']=='mhci') & 
                (tools_mapping_df['Tool']==mhc_method),
                'Tool Label'
            ]
        )

        new_rows = []
        for each_species in species :
            relevant_alleles = mhcad.get_allele_names_for_species(species=each_species)

            print("\n(NetCTLpan) %s - %s:" %(mhc_method, each_species))
            for allele in tqdm(relevant_alleles) :
                if allele in already_mapped :
                    continue

                new_rows.append({
                    'Tool Group': 'mhci',
                    'Tool': mhc_method,
                    'Tool Version': '',
                    'Tool Label': allele,
                    'IEDB Label': allele,   # placeholder: only the tool label is known here
                    'MRO ID': ''
                })
                already_mapped.add(allele)

        if new_rows :
            tools_mapping_df = pd.concat([tools_mapping_df, pd.DataFrame(new_rows)], ignore_index = True)

        num_alleles_added = len(new_rows)
        print("\nTotal of %s alleles from NetCTLpan have been added to the Tools Mapping data." %(num_alleles_added))

    # Populate missing alleles from each class
    populate_mhcnp()
    populate_netctlpan()

    # Write final df to output file    
    tools_mapping_df.to_csv('{}/{}'.format(DATA_DIR, TOOLS_MAPPING_FILE), sep='\t', index=False)



def create_tools_mapping_from_mro():
    '''===============================================================================================
        \n\tDescription :
          This function will work off from the initial mapping spreadsheet (ORIGINAL_MAPPING_FILE).
          For every row it looks the 'MRO ID' up in 'mro_molecules.tsv' to obtain the IEDB Label,
          assigns the Tool Group (mhci/mhcii) through assign_mhc_class(), and leaves
          'Tool Version' empty. \n
          Rows whose 'MRO ID' is the literal 'remove' are skipped, and rows with the obsolete
          Tool Label 'HLA-C*15:20' are dropped right before creating the file.

        Parameters :\n
          \t- None

        Return Value :\n
          \t- TSV file (tools-mapping.tsv)\n

        Raises :\n
          \t- ValueError when an MRO ID matches no row, or more than one row, of the molecules file.\n
    ==============================================================================================='''
    mro_df = pd.read_excel('{}/{}'.format(DATA_DIR, ORIGINAL_MAPPING_FILE), engine='openpyxl')
    mol_df = pd.read_csv('{}/{}'.format(DATA_DIR, MRO_MOLECULES_FILE), skipinitialspace=True, sep='\t')

    # Resolve the column positions once instead of once per row.
    mro_header = list(mro_df.columns)
    mro_id_pos = mro_header.index('MRO ID')
    tool_pos = mro_header.index('Tool')
    tool_label_pos = mro_header.index('Tool Label')
    mol_mro_ids = mol_df['MRO ID'].values

    output_columns = ['Tool Group', 'Tool', 'Tool Version', 'Tool Label', 'IEDB Label', 'MRO ID']
    output_rows = []

    for row in tqdm(mro_df.itertuples(name=None, index=False)) :
        mroid = row[mro_id_pos]

        # Rows flagged with 'remove' in the spreadsheet are left out of the mapping.
        if mroid == 'remove':
            continue

        # Should only have 1 row returning because mol_df should contain unique MRO ID
        matched_labels = mol_df.loc[mol_mro_ids == mroid, 'IEDB Label'].tolist()

        if not matched_labels:
            raise ValueError("MRO ID '{}' was not found in the molecules file.".format(mroid))
        if len(matched_labels) > 1:
            raise ValueError("More than 1 row with MRO ID '{}' was found in the molecules file.".format(mroid))

        # extract IEDB Label from molecules file
        iedblabel = matched_labels[0]
        tool = row[tool_pos]

        output_rows.append({
            # Assign MHC class type to alleles
            'Tool Group': assign_mhc_class(tool, iedblabel),
            'Tool': tool,
            'Tool Version': '',
            'Tool Label': row[tool_label_pos],
            'IEDB Label': iedblabel,
            'MRO ID': mroid,
        })

    # Passing the column list keeps the header even when no row survived.
    mro_df = pd.DataFrame(output_rows, columns=output_columns)

    # Remove rows with 'HLA-C*15:20' as Tool Label
    # 'HLA-C*15:20' is obsolete. Identical to 'HLA-C*15:27'.
    # (https://www.ebi.ac.uk/ipd/imgt/hla/alleles/allele/?accession=HLA02838)
    mro_df = mro_df[mro_df['Tool Label'] != 'HLA-C*15:20']

    # Write Dataframe to CSV format
    mro_df.to_csv('{}/{}'.format(DATA_DIR, TOOLS_MAPPING_FILE), sep='\t', index=False)


def assign_mhc_class(given_method, allele) :
    '''
    Return the tool group ('mhci' or 'mhcii') for a prediction method.

    'recommended' and 'consensus' are listed under both classes, so for these two methods the
    class is decided by the allele name: names containing 'HLA-DP', 'HLA-DQ', 'HLA-DR' or 'H2-I'
    are 'mhcii', everything else is 'mhci'.
    Any other method is classified through MHCI_METHODS / MHCII_METHODS; an empty string is
    returned when the method appears in neither list.
    '''
    if given_method in ('recommended', 'consensus') :
        # handle left-over unclassified alleles
        class_ii_markers = ('HLA-DP', 'HLA-DQ', 'HLA-DR', 'H2-I')
        is_class_ii = any(marker in allele for marker in class_ii_markers)
        return 'mhcii' if is_class_ii else 'mhci'

    # MHCII is tested first so that it takes precedence for a method found in both lists.
    if given_method in MHCII_METHODS :
        return 'mhcii'
    if given_method in MHCI_METHODS :
        return 'mhci'
    return ''

def add_edge_case_alleles():
    """Register 'SLA-104:01' as a synonym of 'SLA-1*04:01' in mhc_alleles.tsv.

    Reads ``DATA_DIR/mhc_alleles.tsv``, updates the 'Synonyms' cell of the
    first row whose 'IEDB Label' equals 'SLA-1*04:01', and writes the table
    back to the same path. The file is rewritten even when no row matches.

    The synonym is appended only when it is not already listed, so running
    the function more than once does not duplicate it. An empty 'Synonyms'
    cell is supported: the edge-case allele then becomes the only synonym.

    Raises:
        KeyError: If the 'IEDB Label' column is missing, or if the
            'Synonyms' column is missing while a matching row exists.
    """
    target_iedb_label = 'SLA-1*04:01'
    edge_case_allele = 'SLA-104:01'
    alleles_path = '{}/{}'.format(DATA_DIR, 'mhc_alleles.tsv')
    mhc_alleles_df = pd.read_csv(alleles_path, skipinitialspace=True, sep='\t')

    matching_rows = mhc_alleles_df.index[mhc_alleles_df['IEDB Label'] == target_iedb_label]
    if not matching_rows.empty:
        # Only the first matching row is edited.
        row_label = matching_rows[0]
        current_synonyms = mhc_alleles_df.at[row_label, 'Synonyms']

        # pandas reads an empty cell back as NaN (a float), which has no .split().
        if pd.isna(current_synonyms):
            synonyms = []
        else:
            synonyms = str(current_synonyms).split('|')

        if edge_case_allele not in synonyms:
            synonyms.append(edge_case_allele)
            # A column consisting only of empty cells is parsed as float64;
            # cast to object so a string can be stored in it.
            mhc_alleles_df['Synonyms'] = mhc_alleles_df['Synonyms'].astype(object)
            mhc_alleles_df.at[row_label, 'Synonyms'] = '|'.join(synonyms)

    # Save to file
    mhc_alleles_df.to_csv(alleles_path, sep='\t', index=False)


if __name__ == '__main__':
    # Ordered build pipeline. Each entry is (banner, step): the banner is printed
    # right before the step is called; None means the step runs unannounced.
    # Steps run top to bottom; the notes below record the ordering constraints
    # the original author called out.
    pipeline = (
        # Tools_MRO_mapping.xlsx --> assign Tool Group (MHCI/MHCII) + remove alleles
        # belonging to 'arb' and 'recommended'.
        ('Running: create_tools_mapping_from_mro()...\n', create_tools_mapping_from_mro),

        # Pull any missing alleles from the MHC-NP and NetCTLpan allele data info.
        # NOTE:
        # For NetCTLpan alleles, only the tool labels are populated at this point.
        # 'update_netctlpan_allele_names' has to run later to populate the IEDB
        # labels and MRO IDs.
        ('Running: populate_alleles_from_allele_info()...\n', populate_alleles_from_allele_info),

        # Add additional netmhcpan-4.1 alleles from DTU (allelenames file).
        # Disabled step: update_netmhcpan_allele_names()
        #   (banner was 'Running: update_netmhcpan_allele_names()...\n')
        ('Running: add_unmapped_netmhcpan_alleles()...\n', add_unmapped_netmhcpan_alleles),

        # Add additional netmhciipan-4.1 alleles from DTU (alleles.list).
        # Disabled step: update_netmhciipan_allele_names()
        #   (banner was 'Running: update_netmhciipan_allele_names()...\n')

        # This function fills in the IEDB Label and MRO ID for NetCTLpan's alleles.
        ('Running: update_netctlpan_allele_names()...\n', update_netctlpan_allele_names),

        (None, remove_duplicate_rows),

        # Remove the rows where MRO ID equals 'remove'.
        ('Running: take_out_removable_mro_id()...\n', take_out_removable_mro_id),

        # NOTE: Importing MHCFlurry and Tensorflow caused a lot of issues with deployment.
        # Thus, it is disabled. Because MHCFlurry alleles were already pulled out during
        # the first run of this program, a separate text file containing the MHCFlurry
        # alleles is maintained instead.
        # Disabled step: populate_mhcflurry_alleles()

        # Removed methods 'ann-3.4', 'netmhcpan-4.0', 'nn_align-2.2' + assigned the rest
        # default version numbers.
        ('Running: assign_tool_versions()...\n', assign_tool_versions),

        # Add the available-length column.
        (None, populate_length),

        # Add immunogenicity alleles.
        # NOTE: Currently, immunogenicity information is not in the method table or the
        # allele-lengths table. That is why this step comes after assign_tool_versions()
        # and populate_length().
        (None, populate_immunogenicity_alleles),

        # Creates mhc_alleles.tsv
        (None, add_tool_label_as_synonym),

        # Add SLA-104:01 as a synonym of SLA-1*04:01
        (None, add_edge_case_alleles),

        (None, add_missing_netmhcpan_output_alleles),

        # 31 alleles - however, 1 is an invalid term and another one hasn't been seen.
        (None, add_missing_netmhcpan_closest_alleles_as_synonyms),

        (None, remove_duplicate_mro_id),

        (None, remove_empty_mro_id),
    )

    for banner, step in pipeline:
        if banner is not None:
            print(banner)
        step()

    print("Initiating tools mapping finished.")