import logging
import math
import os

import pandas as pd
from pepmatch import Matcher
proteome_taxon_id_map = {
    'human': '9606',
    'mouse': '10090',
    'cow': '9913',
    'dog': '9612',
    'horse': '9796',
    'pig': '9823',
    'rabbit': '9986',
    'rat': '10116',
}

def format_result_table(result_table):

    # put NA for mathces number for thoes that didn't find any matches
    for row in result_table:
        for i in range(len(row)):
            if str(row[i]).lower() == "nan":
                row[i] = "-"
            elif str(row[i]).lower() == "<na>":
                row[i] = None
            elif type(row[i]) is list:
                row[i] = str(row[i])
            elif type(row[i]) is float:
                row[i] = int(row[i])

    return result_table

# Function to check if string exists in DataFrame column
def string_in_pd_column(df, column_name, search_string):
    # Check if any value in the specified column contains the search string
    result = df[column_name].str.contains(search_string, case=False, na=False)
    # If any match found, return True; otherwise, return False
    return any(result)

# for pepmatch calculation
def run_pepmatch(peptide_list, proteome='human', mismatch=1, best_match=False, pepmatch_proteomes_path='', include_unmatched_peptides=False):
    # get results
    '''
    >>> from pepmatch import Preprocessor, Matcher
    >>> Preprocessor('proteomes/9606.fasta', 3, 'pickle').preprocess()
    ...
    >>> Matcher(['DDEDSKQNIFHFLYR', 'ADPGPHLMGGGGRAK', 'KAVELGVKLLHAFHT', 'QLQNLGINPANIGLS', 'HEVWFFGLQYVDSKG'], 'proteomes/9606.fasta', 1, 3).match()
        [
        ('DDEDSKQNIFHFLYR', 'DDEDSKQNIFHFLYR', 9606, 'Homo sapiens', 'SPDYE2B', 'A6NHP3.2', 'Speedy protein E2B', 0, [], 260, 275, 3, None), 
        ('DDEDSKQNIFHFLYR', 'DDEDSKQNIFHFLYR', 9606, 'Homo sapiens', 'SPDYE6', 'P0CI01.2', 'Speedy protein E6', 0, [], 260, 275, 3, None), 
        ('DDEDSKQNIFHFLYR', 'DDEDSKQNIFHFLYR', 9606, 'Homo sapiens', 'SPDYE2', 'Q495Y8.2', 'Speedy protein E2', 0, [], 260, 275, 1, None), ('DDEDSKQNIFHFLYR', 'DDEDSKQNIFHFLYG', 9606, 'Homo sapiens', 'SPDYE1', 'A0A494C1S0.1', 'Speedy protein E1', 1, [15], 259, 274, 4, None), ('DDEDSKQNIFHFLYR', 'DDEDSKQNIFHFLYG', 9606, 'Homo sapiens', 'SPDYE5', 'A6NIY4.3', 'Speedy protein E5', 1, [15], 260, 275, 3, None), ('DDEDSKQNIFHFLYR', 'DDEDSKQNIFHFLYG', 9606, 'Homo sapiens', 'SPDYE1', 'Q8NFV5.3', 'Speedy protein E1', 1, [15], 219, 234, 2, None), ('DDEDSKQNIFHFLYR', 'DDEDSKQNIFHFLYG', 9606, 'Homo sapiens', 'SPDYE21', 'A0A494C086.1', 'Speedy/RINGO cell cycle regulator family member E21', 1, [15], 260, 275, 4, None), ('DDEDSKQNIFHFLYR', 'DDEDSKQNIFHFLYG', 9606, 'Homo sapiens', 'SPDYE21', 'A0A494C1R5.1', 'Speedy/RINGO cell cycle regulator family member E21', 1, [15], 220, 235, 4, None), ('DDEDSKQNIFHFLYR', 'DDEDSKQNIFHFLYG', 9606, 'Homo sapiens', 'SPDYE5', 'A0A096LPA1.1', 'Speedy protein E5', 1, [15], 220, 235, 4, None), ('ADPGPHLMGGGGRAK', 'ADPGPHLMGGGGGAK', 9606, 'Homo sapiens', 'SIPA1L3', 'O60292.3', 'Signal-induced proliferation-associated 1-like protein 3', 1, [13], 243, 258, 1, None), ('KAVELGVKLLHAFHT', 'KAVELGVKLLPAFHT', 9606, 'Homo sapiens', 'MAN1A1', 'P33908.3', 'Mannosyl-oligosaccharide 1,2-alpha-mannosidase IA', 1, [11], 303, 318, 1, None), ('QLQNLGINPANIGLS', 'QLQNLGINPANIGFS', 9606, 'Homo sapiens', 'CLTC', 'Q00610.5', 'Clathrin heavy chain 1', 1, [14], 13, 28, 1, None), ('QLQNLGINPANIGLS', 'QLQNLGINPANIGFS', 9606, 'Homo sapiens', 'CLTC', 'J3KSQ2.1', 'Clathrin heavy chain 1 (Fragment)', 1, [14], 13, 28, 1, None), ('QLQNLGINPANIGLS', 'QLQNLGINPANIGFS', 9606, 'Homo sapiens', 'CLTCL1', 'P53675.2', 'Clathrin heavy chain 2', 1, [14], 13, 28, 1, None), ('QLQNLGINPANIGLS', 'QLQNLGINPANIGFS', 9606, 'Homo sapiens', 'CLTC', 'A0A087WVQ6.1', 'Clathrin heavy chain', 1, [14], 13, 28, 1, None), ('QLQNLGINPANIGLS', 'QLQNLGINPANIGFS', 9606, 'Homo sapiens', 'CLTC', 'J3KS13.1', 
'Clathrin heavy chain 1', 1, [14], 13, 28, 1, None), ('QLQNLGINPANIGLS', 'QLQNLGINPANIGFS', 9606, 'Homo sapiens', 'CLTC', 'J3QL20.1', 'Clathrin heavy chain 1', 1, [14], 13, 28, 1, None), ('QLQNLGINPANIGLS', 'QLQNLGINPANIGFS', 9606, 'Homo sapiens', 'CLTCL1', 'A0A087WX41.1', 'Clathrin heavy chain 2', 1, [14], 13, 28, 1, None), ('QLQNLGINPANIGLS', 'QLQNLGINPANIGFS', 9606, 'Homo sapiens', 'CLTCL1', 'F5H5N6.1', 'Clathrin heavy chain 2 (Fragment)', 1, [14], 13, 28, 1, None), ('HEVWFFGLQYVDSKG', 'REVWFFGLQYVDSKG', 9606, 'Homo sapiens', 'RDX', 'A0A2R8Y7M3.1', 'Radixin', 1, [1], 39, 54, 1, None), ('HEVWFFGLQYVDSKG', 'REVWFFGLQYVDSKG', 9606, 'Homo sapiens', 'RDX', 'P35241.1', 'Radixin', 1, [1], 39, 54, 1, None), ('HEVWFFGLQYVDSKG', 'REVWFFGLQYVDSKG', 9606, 'Homo sapiens', 'RDX', 'A0A2R8Y5S7.1', 'Radixin', 1, [1], 39, 54, 1, None), ('HEVWFFGLQYVDSKG', 'REVWFFGLQYVDSKG', 9606, 'Homo sapiens', 'RDX', 'E9PNV3.1', 'Radixin (Fragment)', 1, [1], 28, 43, 1, None), ('HEVWFFGLQYVDSKG', 'REVWFFGLQYVDSKG', 9606, 'Homo sapiens', 'RDX', 'E9PQ82.1', 'Radixin (Fragment)', 1, [1], 28, 43, 1, None)]
    '''
    PEPMATCH_PROTEOMES_PATH = os.environ.get('PEPMATCH_PROTEOMES_PATH', '/app/pepmatch-proteomes')
    if not pepmatch_proteomes_path:
        pepmatch_proteomes_path = PEPMATCH_PROTEOMES_PATH
    pepmatch_proteomes_path = os.path.expanduser(pepmatch_proteomes_path)

    taxon_id = proteome_taxon_id_map.get(proteome.lower(), None)
    if not taxon_id:
        raise ValueError('invalid proteome: %s' % proteome)
    logging.info('taxon_id: %s' % taxon_id)
    logging.info('pepmatch_proteomes_path: %s' % pepmatch_proteomes_path)
    if os.path.exists(pepmatch_proteomes_path):
        logging.info('pepmatch proteomes preprocessed file path was found.')
        # exact matching
        if mismatch == 0:
            split = 5
        #Mismatching Example
        elif mismatch > 0 and mismatch <=5:
            split = 3
        else:
            raise ValueError('invalid mismatch number: %s' % mismatch)
        logging.info('best_match for pepmatch: %s' % best_match)
        result_df = Matcher(peptide_list, taxon_id, mismatch, split, pepmatch_proteomes_path, best_match=best_match, output_format='dataframe').match()
    else:
        if pepmatch_proteomes_path == '/app/pepmatch-proteomes':
            raise ValueError("Please provide the path to the proteomes directory by setting the environment variable PEPMATCH_PROTEOMES_PATH or specifying it via the command line with the --proteomes-path parameter.")
        else:
            raise ValueError("The proteomes path '%s' doesn't. Please double check." % pepmatch_proteomes_path)

    # remove unmatched peptides
    result_df.dropna(subset=['Matched Sequence'], inplace=True)

    # add unmatched peptides
    if include_unmatched_peptides:
        unmatched_peptides = [{'Query Sequence': pep} for pep in peptide_list if not string_in_pd_column(result_df, 'Query Sequence', pep)]
        result_df = result_df._append(unmatched_peptides, ignore_index=True)

    # old_table_columns = ["peptide", "matched_sequence", "taxon_id", "species", "gene", "protein_id", "protein_name",  "mismatches", "mutated_positions", "start",  "end", "protein_existence_level"]
    # new_table_columns = ["query_sequence", "matched_sequence", "protein_id", "protein_name", "species", "taxon_id", "gene", "mismatches", "mutated_positions", "index_start", "index_end", "protein_existence_level"]
    table_data = result_df.values.tolist()
    table_columns = result_df.columns.values.tolist()
    table_columns = [h.lower().replace(" ", "_").replace("query_sequence", "peptide")  for h in table_columns]
    logging.info('result_df: %s' % result_df)
    # add NA and change float to int
    table_data = format_result_table(table_data)

    final_results = {
        "type": "peptide_table",
        "table_columns": table_columns,
        "table_data": table_data,
        'warnings': [],
       }
    # exact match results don't have "mutated_positions" column
    # no need from version 0.8.3 as it returns 12 columns for both exact match and missmatch
    #if mismatch == 0:
    #    final_results['columns'].pop(8)
    if not table_data:
        final_results['warnings'].append('No matches found with the given input peptides and selected parameters.')
    return final_results
