import sys
import re
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
# Make the directory one level above this script's directory importable.
PROJECT_DIR = str(Path(__file__).resolve().parents[1])
sys.path.insert(1, PROJECT_DIR)

# Locations of the allele data files.
# The script path is resolved first (as for PROJECT_DIR) so that the data
# directory does not depend on the current working directory: for a relative
# __file__ such as 'script.py', Path(__file__).parent is '.', and the parent of
# '.' is still '.', which would make DATA_DIR point at './data'.
PARENT_DIR = Path(__file__).resolve().parent
DATA_DIR = PARENT_DIR.parent / "data"
TOOLS_MAPPING_FILE = DATA_DIR / "tools-mapping.tsv"
MRO_MOLECULES_FILE = DATA_DIR / "mro_molecules.tsv"

def clean_mhcii_alleles(allele):
    """Return *allele* reduced to ASCII letters, digits and whitespace.

    When the name begins with 'HLA-', every 'HLA-' occurrence in it is dropped
    first. Afterwards all characters other than a-z, A-Z, 0-9 and whitespace
    are removed (so '*', ':', '-', '/', '_' etc. disappear).
    """
    hla_tag = 'HLA-'
    name = allele.replace(hla_tag, '') if allele.startswith(hla_tag) else allele
    return re.sub(r'[^a-zA-Z0-9\s]', '', name)


def add_allelelist_data():
    """Report how many 'allele.list' names are absent from tools-mapping.tsv.

    Reads 'netmhciipan-4.1/allele.list' from the data directory, normalizes
    each line with ``clean_mhcii_alleles`` and compares it against the
    normalized 'Tool Label' values of the netmhciipan 4.1 rows in
    tools-mapping.tsv.

    Prints three lines: the number of allele.list names with no match, the
    number of names read from allele.list, and the number of netmhciipan 4.1
    rows in tools-mapping.tsv. This function only reports; it writes no file.
    """
    # allele.list is identical to allelelist.txt
    # convert_IEDB.dat = allele.list + 3 additional unmappable alleles ('BoLA-DRA-DRB31101', 'BoLA-DRA-DRB30101', 'BoLA-DRA-DRB31501')
    ALLELELIST_DATA = DATA_DIR / "netmhciipan-4.1" / "allele.list"

    with open(ALLELELIST_DATA, 'r') as f:
        # One allele per line; every line is kept (no filtering) so the
        # printed total reflects the file's line count.
        cleaned_alleles = [clean_mhcii_alleles(line.strip()) for line in f]

    tools_mapping_df = pd.read_csv(TOOLS_MAPPING_FILE, skipinitialspace=True, sep='\t')

    # Keep only the netmhciipan 4.1 rows. The version is compared as text so
    # the filter works whether pandas parsed 'Tool Version' as floats (4.1)
    # or as strings ('4.1').
    is_netmhciipan_4_1 = (
        (tools_mapping_df['Tool'] == 'netmhciipan')
        & (tools_mapping_df['Tool Version'].astype(str) == '4.1')
    )
    cleaned_tool_labels = [
        clean_mhcii_alleles(label)
        for label in tools_mapping_df.loc[is_netmhciipan_4_1, 'Tool Label']
    ]

    # Count the allele.list names whose cleaned form is not among the cleaned
    # tool labels. A set gives constant-time membership tests.
    known_labels = set(cleaned_tool_labels)
    counter = sum(1 for cleaned_allele in cleaned_alleles if cleaned_allele not in known_labels)

    print(f'Total # of alleles that does not exist in the tools-mapping: {counter}')
    print(len(cleaned_alleles))
    print(len(cleaned_tool_labels))


def add_4_1_alleles_name_list_data():
    """Append the mappable '4.1-alleles_name.list.txt' alleles to tools-mapping.tsv.

    Steps:
      1. Parse 'netmhciipan-4.1/4.1-alleles_name.list.txt' (first line is
         treated as a header and skipped) and print how many allele names it
         contains.
      2. For each allele in the hard-coded ``exclusive_allelels`` list, look up
         the first "MHC class II protein complex" row of the MRO molecules
         file whose cleaned 'IEDB Label' equals the cleaned allele name.
         Alleles without a match are skipped.
      3. Append one netmhciipan 4.1 row per matched allele to tools-mapping.tsv
         and rewrite that file in place.

    Alleles whose 'Tool Label' is already present for netmhciipan 4.1 are not
    appended again, so running the script repeatedly does not duplicate rows.
    """
    DTU_WEB_ALLELE_FILE = DATA_DIR / "netmhciipan-4.1" / "4.1-alleles_name.list.txt"

    dtu_web_alleles_list = []
    with open(DTU_WEB_ALLELE_FILE, 'r') as f:
        f.readline()  # skip the header line

        # Cells are tab-separated, and a cell may itself hold several names
        # separated by runs of two or more whitespace characters.
        for line in f:
            for cell in line.strip().split('\t'):
                dtu_web_alleles_list.extend(
                    item for item in re.split(r'\s{2,}', cell.strip()) if item.strip()
                )

    print(f'Total alleles from \'4.1-alleles_name.list.txt\' file: {len(dtu_web_alleles_list)}')

    # Based on the 'analysis.ipynb', we know there are 276 unique alleles that
    # have not been mapped yet.
    # Alleles that are exclusive to '4.1-alleles_name.list.txt' (276 alleles):
    exclusive_allelels = ['DQB1*06:39', 'DPB1*16:01', 'DPB1*41:01', 'DPB1*71:01', 'DPB1*51:01', 'DQA1*01:05', 'DPB1*63:01', 'DPB1*31:01', 'DQB1*02:05', 'DQB1*04:03', 'DPB1*97:01', 'DPB1*131:01', 'DPB1*114:01', 'DPB1*81:01', 'DQB1*06:41', 'DPB1*01:01', 'DQA1*01:01', 'DPB1*133:01', 'DPB1*121:01', 'DPB1*45:01', 'DPB1*46:01', 'DPB1*90:01', 'DQB1*06:44', 'DPB1*129:01', 'DQA1*01:03', 'DQB1*03:12', 'DPB1*49:01', 'DQB1*03:11', 'DPB1*54:01', 'DQB1*05:06', 'DPB1*92:01', 'DPB1*18:01', 'DQB1*06:01', 'DPB1*09:01', 'DPB1*117:01', 'DPB1*44:01', 'DPB1*26:01', 'DQA1*05:05', 'DPB1*103:01', 'DPB1*08:01', 'DQA1*01:09', 'DQB1*06:22', 'DQB1*06:28', 'DPB1*65:01', 'DQA1*04:04', 'DQB1*06:21', 'DQB1*06:25', 'DQB1*05:11', 'DQB1*03:26', 'DQB1*02:04', 'DPB1*03:01', 'DQA1*06:01', 'DQA1*05:01', 'DQA1*01:02', 'DPB1*34:01', 'DPB1*50:01', 'DQB1*03:07', 'DPB1*94:01', 'DQB1*03:04', 'DQB1*04:02', 'DPB1*24:01', 'DQB1*06:18', 'DQA1*01:08', 'DQB1*03:17', 'DPB1*125:01', 'DQB1*03:31', 'DPB1*111:01', 'DQB1*05:09', 'DPB1*126:01', 'DPB1*78:01', 'DQB1*02:03', 'DPB1*134:01', 'DPB1*122:01', 'DQA1*05:07', 'DQB1*06:32', 'DPB1*89:01', 'DPB1*04:02', 'DQA1*05:09', 'DQA1*05:06', 'DQB1*03:37', 'DPB1*58:01', 'DPB1*70:01', 'DPB1*110:01', 'DPB1*86:01', 'DPB1*10:01', 'DQB1*03:03', 'DPA1*01:04', 'DPB1*53:01', 'DQB1*05:14', 'DQB1*03:29', 'DPB1*59:01', 'DQB1*03:20', 'DQB1*06:17', 'DQA1*03:01', 'DQB1*03:22', 'DPB1*76:01', 'DQB1*05:12', 'DPB1*02:02', 'DPB1*25:01', 'DPB1*119:01', 'DQB1*03:15', 'DQB1*03:32', 'DPB1*21:01', 'DQB1*03:38', 'DPB1*98:01', 'DQB1*05:05', 'DQB1*02:01', 'DQB1*03:02', 'DQA1*02:01', 'DPB1*116:01', 'DPB1*107:01', 'DPA1*04:01', 'DPB1*132:01', 'DPB1*87:01', 'DPB1*29:01', 'DPA1*02:02', 'DQB1*05:02', 'DPB1*66:01', 'DPB1*15:01', 'DQA1*05:03', 'DQB1*06:42', 'DPB1*19:01', 'DPB1*127:01', 'DQB1*06:19', 'DPB1*79:01', 'DQA1*03:02', 'DPB1*128:01', 'DPB1*14:01', 'DPB1*04:01', 'DPB1*88:01', 'DQB1*03:18', 'DPB1*69:01', 'DPB1*95:01', 'DPB1*101:01', 'DQB1*06:36', 'DPB1*17:01', 'DQB1*06:11', 'DPB1*22:01', 'DPB1*13:01',
        'DPB1*73:01', 'DQB1*06:14', 'DQB1*06:37', 'DQB1*06:27', 'DQA1*05:08', 'DQB1*02:06', 'DQB1*06:33', 'DQB1*06:08', 'DPA1*01:10', 'DPB1*108:01', 'DQB1*06:02', 'DQA1*01:06', 'DPB1*106:01', 'DPA1*02:03', 'DPB1*39:01', 'DQA1*05:11', 'DQB1*03:10', 'DPA1*01:05', 'DQB1*06:03', 'DPB1*62:01', 'DQB1*03:33', 'DPB1*60:01', 'DQB1*05:13', 'DPB1*83:01', 'DPB1*40:01', 'DQB1*06:07', 'DPB1*48:01', 'DPB1*113:01', 'DQA1*05:04', 'DPB1*123:01', 'DQA1*03:03', 'DPB1*47:01', 'DQB1*03:35', 'DPB1*75:01', 'DPB1*77:01', 'DQB1*06:34', 'DPA1*02:01', 'DQB1*06:35', 'DQB1*03:25', 'DPB1*20:01', 'DPB1*68:01', 'DQB1*06:40', 'DPB1*104:01', 'DQA1*04:01', 'DQB1*03:30', 'DPB1*06:01', 'DPA1*01:03', 'DQB1*03:28', 'DPB1*118:01', 'DQB1*05:07', 'DQB1*04:06', 'DPA1*03:03', 'DQB1*03:09', 'DPB1*28:01', 'DPB1*93:01', 'DPB1*100:01', 'DQB1*06:24', 'DQB1*03:05', 'DQB1*05:10', 'DQB1*06:16', 'DPB1*02:01', 'DQB1*03:16', 'DPB1*85:01', 'DPA1*03:01', 'DQB1*03:19', 'DQB1*05:08', 'DPA1*01:09', 'DPA1*03:02', 'DPB1*72:01', 'DPB1*33:01', 'DPB1*32:01', 'DQB1*03:21', 'DPB1*109:01', 'DQB1*03:24', 'DPB1*30:01', 'DPB1*35:01', 'DQB1*04:05', 'DQA1*06:02', 'DPB1*55:01', 'DPB1*82:01', 'DQB1*03:27', 'DQB1*03:13', 'DPB1*96:01', 'DPB1*27:01', 'DPA1*02:04', 'DPB1*115:01', 'DQB1*04:07', 'DQA1*05:10', 'DPB1*05:01', 'DQB1*03:08', 'DQB1*02:02', 'DPB1*124:01', 'DQB1*06:15', 'DPB1*37:01', 'DPB1*23:01', 'DQB1*06:30', 'DQB1*04:01', 'DPB1*67:01', 'DQB1*03:01', 'DPB1*52:01', 'DQB1*06:31', 'DQB1*04:08', 'DPB1*91:01', 'DPB1*112:01', 'DQB1*06:43', 'DPB1*11:01', 'DQB1*05:01', 'DQB1*03:34', 'DQB1*03:36', 'DQB1*06:09', 'DQB1*06:10', 'DPB1*84:01', 'DPB1*102:01', 'DPA1*01:08', 'DQB1*04:04', 'DPA1*01:07', 'DPB1*36:01', 'DQB1*03:23', 'DQB1*05:03', 'DPB1*74:01', 'DQB1*03:06', 'DQB1*06:38', 'DQA1*01:07', 'DQB1*06:23', 'DPB1*56:01', 'DQA1*04:02', 'DPB1*80:01', 'DPA1*01:06', 'DPB1*130:01', 'DPB1*38:01', 'DQB1*06:12', 'DQB1*03:14', 'DPB1*105:01', 'DQA1*01:04', 'DPB1*99:01', 'DQB1*06:29', 'DQB1*06:04']
    # cleaned name -> original name, in list order
    exclusive_allelels_dict = {clean_mhcii_alleles(_): _ for _ in exclusive_allelels}

    # According to 'analysis.ipynb', we know that 273 alleles can be mapped
    # using the molecules file. The remaining 3 alleles are unmappable completely.
    mol_df = pd.read_csv(MRO_MOLECULES_FILE, skipinitialspace=True, sep='\t')
    parent = "MHC class II protein complex"
    mhcii_mol_df = mol_df[mol_df['Parent'] == parent].reset_index(drop=True)

    # cleaned IEDB label -> MRO ID. setdefault keeps the FIRST row for a
    # label, so duplicates resolve to the earliest matching molecule.
    mroid_by_cleaned_label = {}
    for iedb_label, mro_id in zip(mhcii_mol_df['IEDB Label'], mhcii_mol_df['MRO ID']):
        mroid_by_cleaned_label.setdefault(clean_mhcii_alleles(iedb_label), mro_id)

    # These are the alleles that need to be added to the tools-mapping
    # (the ones that have a mapped MRO ID).
    mapped_alleles = [
        (original_label, mroid_by_cleaned_label[cleaned_allele_label])
        for cleaned_allele_label, original_label in exclusive_allelels_dict.items()
        if cleaned_allele_label in mroid_by_cleaned_label
    ]

    df = pd.read_csv(TOOLS_MAPPING_FILE, skipinitialspace=True, sep='\t')

    # Tool labels already recorded for netmhciipan 4.1; these are skipped so a
    # re-run does not append duplicates. The version is compared as text so it
    # works whether pandas parsed the column as floats or as strings.
    is_netmhciipan_4_1 = (
        (df['Tool'] == 'netmhciipan')
        & (df['Tool Version'].astype(str) == '4.1')
    )
    already_mapped = set(df.loc[is_netmhciipan_4_1, 'Tool Label'])

    length = '11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30'
    # NOTE(review): the values are positional and must line up with the columns
    # of tools-mapping.tsv; pandas raises ValueError if the file does not have
    # exactly six columns -- confirm against the file's header.
    new_rows = [
        ['mhcii', 'netmhciipan', '4.1', allele, mroid, length]
        for allele, mroid in mapped_alleles
        if allele not in already_mapped
    ]

    # Append all new rows with a single concat instead of one concat per row.
    if new_rows:
        df = pd.concat([df, pd.DataFrame(new_rows, columns=df.columns)], ignore_index=True)

    df.to_csv(TOOLS_MAPPING_FILE, sep='\t', index=False)



if __name__ == "__main__":
    # allele.list, allelelist.txt and convert_IEDB.dat were checked with the
    # call below; no additional alleles needed to be added, so it stays
    # disabled.
    # add_allelelist_data()

    # Append the alleles that have a mapped MRO ID (273 per the original note).
    add_4_1_alleles_name_list_data()