#! /usr/bin/python
import os
import subprocess
import tempfile
from shutil import copyfileobj
from urllib.request import urlopen

import pandas as pd
import yaml

from validations import mutgen_validate

# Docker image used to run the peptide generator; can be overridden through the
# MUTGEN_DOCKER_IMG environment variable.
MUTGEN_DOCKER_IMG = os.environ.get("MUTGEN_DOCKER_IMG", "harbor.lji.org/iedb-public/mutgen:v0.6-beta")

# Column names kept by tsv_to_dict(); every other column of a result TSV is
# dropped.  Names are matched AFTER tsv_to_dict() has renamed the raw headers
# (e.g. "<N>mer_ref_peptides" -> "reference_peptide", "feature_id" ->
# "transcript_id", "SerialNumber" -> "peptide_pair_id").
SELECTED_COLUMNS = [
    "variant_id", "variant_type", "chr", "position", "ref_seq", "alt_seq",
    "effect", "gene_name", "gene_id", "protein_ref", "protein_alt",
    "protein_position", "peptides", "peptide_warning", "hgvs_ref_allele",
    "hgvs_tum_allele", "reference_peptide", "mutant_peptide",
    "mutation_position_in_peptide",
]
SELECTED_COLUMNS += ["transcript_id", "peptide_pair_id"]  # new
# Columns labelled "verbosity_level=1" by the original author.  "variant_id"
# is repeated here; the list is only used for membership tests in this module,
# so the repeat is harmless and is kept so the list value stays unchanged.
SELECTED_COLUMNS += [
    "variant_id", "impact", "transcript_biotype", "hgvs_dna", "hgvs_protein",
    "cdna_position", "cds_position", "strand", "warning",
]  # verbosity_level=1


def write_yaml_to_tmp(data):
    """Serialize ``data`` as YAML into a new temporary file and return its path.

    The file is created with ``delete=False``, so it stays on disk after this
    function returns; removing it is left to the caller.
    """
    handle = tempfile.NamedTemporaryFile(mode="w", delete=False)
    with handle:
        yaml.dump(data, handle)
    return handle.name

def tsv_to_dict(tsv_file):
    """Load a tab-separated result file and return it as a "split" dict.

    Headers are normalised first (``<N>mer_ref_peptides`` -> ``reference_peptide``,
    ``<N>mer_mut_peptides`` -> ``mutant_peptide``, ``feature_id`` ->
    ``transcript_id``, ``SerialNumber`` -> ``peptide_pair_id``), then only the
    columns listed in ``SELECTED_COLUMNS`` are kept and missing values are
    replaced by ``'-'``.

    Returns:
        The ``DataFrame.to_dict(orient='split')`` form of the table, i.e. a
        dict with the keys ``index``, ``columns`` and ``data``.
    """
    frame = pd.read_csv(tsv_file, sep='\t', low_memory=False)

    # (pattern, replacement, extra keyword arguments for Index.str.replace)
    header_renames = (
        (r'\d+mer_ref_peptides', 'reference_peptide', {'regex': True}),
        (r'\d+mer_mut_peptides', 'mutant_peptide', {'regex': True}),
        ('feature_id', 'transcript_id', {}),
        ('SerialNumber', 'peptide_pair_id', {}),
    )
    headers = frame.columns
    for pattern, replacement, options in header_renames:
        headers = headers.str.replace(pattern, replacement, **options)
    frame.columns = headers

    # keep only the whitelisted columns, in their original file order
    keep_mask = frame.columns.isin(SELECTED_COLUMNS)
    selected = frame.loc[:, keep_mask]

    # NA/NaN/NaT cells are rendered as '-'
    return selected.fillna('-').to_dict(orient='split')

def save_file_from_URI(file_uri, output_file=None, target_dir=None):
    """Retrieve the file at ``file_uri``, save it locally and return its path.

    Args:
        file_uri: URI to fetch; anything ``urllib.request.urlopen`` accepts.
        output_file: Destination.  ``None`` (default) creates a uniquely named
            temporary file that is NOT deleted automatically.  A ``str`` or
            ``os.PathLike`` is treated as a file name and opened for binary
            writing (joined onto ``target_dir`` when that is given).  Any
            other object is assumed to be an already-open binary file object;
            it is written to and closed, as in earlier versions.
        target_dir: Directory for the output.  When ``output_file`` is
            ``None`` it defaults to the system temp directory.

    Returns:
        The absolute path of the saved file, or ``output_file.name`` when a
        file object was passed in.

    Raises:
        Whatever ``urlopen`` or the file operations raise (``URLError`` is an
        ``OSError``).  If the download fails, a temporary file created by this
        function is removed before the exception propagates.
    """
    temp_path = None   # set only when we created a temp file ourselves
    saved_path = None  # set when the destination path is known up front

    if output_file is None:
        if target_dir is None:
            target_dir = tempfile.gettempdir()
        fdst = tempfile.NamedTemporaryFile(dir=target_dir, delete=False)
        temp_path = saved_path = fdst.name
    elif isinstance(output_file, (str, os.PathLike)):
        saved_path = os.fspath(output_file)
        if target_dir is not None:
            # os.path.join keeps an absolute output_file unchanged
            saved_path = os.path.join(target_dir, saved_path)
        saved_path = os.path.abspath(saved_path)
        fdst = open(saved_path, "wb")
    else:
        fdst = output_file

    try:
        # fdst is entered first so it is closed even if urlopen() itself fails
        with fdst, urlopen(file_uri) as fsrc:
            copyfileobj(fsrc, fdst)
    except BaseException:
        if temp_path is not None:
            # best effort: do not leave an empty/partial temp file behind
            try:
                os.remove(temp_path)
            except OSError:
                pass
        raise

    return saved_path if saved_path is not None else fdst.name

def upper_key(d):
    """Return a new dict with every key of ``d`` upper-cased; values are unchanged.

    If two keys differ only by case, the one iterated last wins.
    """
    return {key.upper(): value for key, value in d.items()}


def _remove_quietly(path):
    """Best-effort removal of a temporary file created by this module."""
    try:
        os.remove(path)
    except OSError:
        # cleanup only; a file that is already gone must not mask the real result
        pass


def _row_sort_key(row):
    """Sort key for a table row that tolerates columns mixing numbers and strings.

    tsv_to_dict() replaces missing values with '-', so an otherwise numeric
    column can contain strings; comparing such rows directly raises TypeError.
    Rows whose cells are directly comparable sort exactly as with plain
    ``sorted(rows)``; where types are mixed, non-strings sort before strings.
    """
    return [(isinstance(value, str), value) for value in row]


def _result_type_sort_key(result):
    """Sort key ordering tables by the first letter of their result type.

    Types starting with 'u', 'p', 'v', 's' come in that order (e.g. unique
    peptide, peptide, variant); other types sort by their plain name.
    """
    type_sort_order = 'upvs'
    result_type = result['result_type']
    if result_type and result_type[0] in type_sort_order:
        return str(type_sort_order.index(result_type[0])) + result_type[1:]
    return result_type


def _run_mutgen_container(config_file, input_vcf_dir, output_dir, annotate):
    """Run the mutgen Docker image and return everything it wrote to stdout.

    The command is passed as an argument list (no shell), so host paths that
    contain spaces or shell metacharacters are handed to docker verbatim.
    """
    config_file_dir, config_file_name = os.path.split(config_file)
    cmd = [
        "docker", "run",
        "-v", f"{config_file_dir}:/workdir",
        "-v", f"{input_vcf_dir}:/opt/pepgen/input",
        "-v", f"{output_dir}:/opt/pepgen/output",
        "--platform=linux/amd64",
        "--entrypoint", "",
        MUTGEN_DOCKER_IMG,
        "python", "src/run_PepGen.py",
        "-s", "configs/system_config.docker.yaml",
        "-r", f"/workdir/{config_file_name}",
    ]
    # If 'annotate' is true, then pass the '-a' option to the tool
    if annotate:
        cmd.append("-a")
    # Only stdout is captured (as before); stderr stays attached to the parent.
    completed = subprocess.run(cmd, stdout=subprocess.PIPE, text=True, check=False)
    return completed.stdout


def _collect_result_tables(tsv_dir, tsv_dir_name):
    """Read every result file in ``tsv_dir``; return ``(tables, warnings)``.

    Each table is a dict with ``result_type``, ``table_columns`` and
    ``table_data`` (rows sorted).  Tables without rows are skipped and
    reported through one warning each.
    """
    tables = []
    table_warnings = []
    # sorted() only makes the processing order deterministic
    for tsv_file_name in sorted(os.listdir(tsv_dir)):
        tsv_path = os.path.join(tsv_dir, tsv_file_name)
        if not os.path.isfile(tsv_path):
            continue
        table = tsv_to_dict(tsv_path)

        # result type = file name stripped of the extension, the words 'all' /
        # 'output' and the run-specific directory name
        stem = tsv_file_name.replace('.tsv', '').replace('all', '').replace('output', '').replace(tsv_dir_name, '')
        result_type = '_'.join(filter(None, stem.split('_')))
        # rename result_type 'snp' to 'variant'
        if result_type == 'snp':
            result_type = 'variant'

        table_data = sorted(table['data'], key=_row_sort_key)
        if not table_data:
            table_warnings.append(f"Warning: {result_type} table has no table_data and is skipped.")
            continue
        tables.append({
            'result_type': result_type,
            'table_columns': table['columns'],
            'table_data': table_data,
        })
    return tables, table_warnings


def generate_mut(**kwargs):
    """Generate mutated peptides from a VCF file by running the mutgen container.

    Keyword arguments handled here (all remaining ones are upper-cased and
    written to the YAML run configuration handed to the tool):

        peptide_length: used to derive the default mutation position
            (``peptide_length // 2``) and the name of the result directory.
        peptide_mutation_position: if truthy, used as given; otherwise the
            value is built from ``peptide_mutation_position1`` /
            ``peptide_mutation_position2`` (sorted descending, comma-joined).
        vcf_download_uri: URI of the VCF to download, or
        input_vcf: path to a local VCF file (used when no URI is given).
        annotate: truthy (and not the string 'false') adds ``-a`` to the tool call.
        output_dir: host directory receiving the results; created if missing.

    Example input::

        {
            "peptide_length": 20,
            "reference_genome": "GRCh38",
            "frameshift_overlap": 9,
            "maximum_peptide_length": 22,
            "min_len_near_start_stop": 14,
            "peptide_mutation_position1": 6,
            "peptide_mutation_position2": 15,
            "vcf_download_uri": "https://.../download_vcf/<id>",
            "output_dir": "/path/to/results",
        }

    Returns:
        ``{"errors": [...]}`` when validation fails or no input was given,
        ``{"errors": [...], "warnings": [...]}`` when the tool could not be
        run or reported an ERROR line, otherwise
        ``{"results": [...], "errors": [], "warnings": [...]}`` where each
        result has ``result_type``, ``table_columns`` and ``table_data``.

    Raises:
        KeyError: if ``output_dir`` (or, after validation, ``peptide_length``)
            is absent.  NOTE(review): presumably ``mutgen_validate`` guarantees
            the latter - confirm.
    """
    # fixed values: paths/prefix as seen from inside the container
    kwargs["OUTPUT_DIR"] = "/opt/pepgen/output"
    kwargs["OUTPUT_PREFIX"] = "mutgen"

    # get PEPTIDE_MUTATION_POSITION from 1 and 2 if peptide_mutation_position is not given
    if not kwargs.get('peptide_mutation_position', None):
        if "peptide_mutation_position1" in kwargs:
            first_position = kwargs["peptide_mutation_position1"]
        else:
            # Only derive the default when it is needed; a missing
            # peptide_length must not crash here before validation has run.
            peptide_length = kwargs.get('peptide_length')
            first_position = peptide_length // 2 if peptide_length is not None else None
        positions = [first_position, kwargs.get("peptide_mutation_position2", None)]
        kwargs["PEPTIDE_MUTATION_POSITION"] = ','.join(map(str, sorted(filter(None, positions), reverse=True)))

    temp_files = []  # files created here that must not outlive the call
    try:
        # download vcf and update INPUT_VCF path for container
        if "vcf_download_uri" in kwargs:
            kwargs["input_vcf"] = save_file_from_URI(kwargs.pop("vcf_download_uri"))
            temp_files.append(kwargs["input_vcf"])
        elif "input_vcf" in kwargs:
            kwargs["input_vcf"] = os.path.abspath(kwargs.pop("input_vcf"))
        else:
            return {"errors": ["Either 'vcf_download_uri' or 'input_vcf' must be provided."]}

        # run validation
        kwargs, validation_errors = mutgen_validate(kwargs)
        if validation_errors:
            return {"errors": validation_errors}

        input_vcf = kwargs.pop("input_vcf")
        input_vcf_dir, input_vcf_name = os.path.split(input_vcf)
        kwargs["INPUT_VCF"] = f"/opt/pepgen/input/{input_vcf_name}"

        # get annotate = True or False (the string 'false' counts as False)
        annotate = kwargs.pop("annotate", False)
        if isinstance(annotate, str) and annotate.lower() == 'false':
            annotate = False
        else:
            annotate = bool(annotate)

        # get output_dir
        output_dir = os.path.abspath(kwargs.pop("output_dir"))
        os.makedirs(output_dir, exist_ok=True)

        kwargs = upper_key(kwargs)
        positions_tag = str(kwargs["PEPTIDE_MUTATION_POSITION"]).replace(",", "_")
        tsv_dir_name = f'{kwargs["OUTPUT_PREFIX"]}_{kwargs["PEPTIDE_LENGTH"]}mer_pos_{positions_tag}'
        tsv_dir = os.path.join(output_dir, tsv_dir_name)

        config_file = write_yaml_to_tmp(kwargs)
        temp_files.append(config_file)

        try:
            result = _run_mutgen_container(config_file, input_vcf_dir, output_dir, annotate)
        except OSError as exc:
            # e.g. the docker executable is not installed / not on PATH
            return {"errors": [f"ERROR: unable to run docker: {exc}"], "warnings": []}
    finally:
        for temp_path in temp_files:
            _remove_quietly(temp_path)

    # classify the tool's stdout lines
    errors = []
    warnings = []
    for line in result.splitlines():
        if 'ERROR' in line:
            errors.append(line)
        elif 'WARNING' in line:
            warnings.append(line)
    if errors:
        return {"errors": errors, "warnings": warnings}

    if not os.path.isdir(tsv_dir):
        errors.append(f"ERROR: the tool did not create the expected result directory '{tsv_dir_name}'.")
        return {"errors": errors, "warnings": warnings}

    # read tsv result files into table dicts
    # TODO: read errors/warnings from result files as well?
    results, table_warnings = _collect_result_tables(tsv_dir, tsv_dir_name)
    warnings.extend(table_warnings)
    # return the info about possible reasons for no generated peptides
    if table_warnings:
        warnings.append("Possible reasons for no generated peptides include a mismatch between the selected 'reference genome' and the genome used in the VCF file as well as variants that are in non-coding regions.")

    # Type order: 'unique peptide', 'peptide', 'variant' (renamed from 'snp')
    results.sort(key=_result_type_sort_key)
    return {"results": results, "errors": errors, "warnings": warnings}
    
if __name__ == '__main__':
    # Placeholder: put ad-hoc test code here when running this module as a script.
    pass

