from netmhc_4_0_executable import predict_from_peptide_file, allowed_allele_names, allowed_binding_lengths
from sample_peptides import file_paths_by_binding_length
import datetime
import pickle

def generate_score_distribution_pickle_file():
    """Build a thinned score distribution per allele/length and pickle it.

    For every binding length in ``allowed_binding_lengths`` and every allele
    in ``allowed_allele_names``, the peptide file registered for that length
    is handed to ``predict_from_peptide_file``. The returned scores are
    sorted in place and then thinned: positions below 1000 keep every 10th
    score, positions from 1000 onward keep every 100th.

    The thinned samples are collected in a dict keyed by
    ``('ann', allele_name, binding_length)`` and dumped with ``pickle`` to
    ``ann_percentile_distribution_<today's ISO date>.p``. The file is opened
    by its bare name, i.e. relative to the current working directory.
    """
    # netMHC predictions are identified by the method name 'ann'.
    predictor_label = 'ann'
    distribution = {}

    print("Running executable for an allele-length combination...")
    # netMHC produces ic50 scores
    for length in allowed_binding_lengths:
        # Resolved once per length, before any allele is processed.
        peptides_path = file_paths_by_binding_length[length]

        for allele in allowed_allele_names:
            print("{} - {}".format(allele, length))
            ranked = predict_from_peptide_file(allele, length, peptides_path)
            ranked.sort()
            # Stride of 10 for positions below 1000, stride of 100 afterwards.
            thinned = [
                value
                for position, value in enumerate(ranked)
                if position % (10 if position < 1000 else 100) == 0
            ]
            distribution[(predictor_label, allele, length)] = thinned

    print("Creating pickled file...")
    today = datetime.date.today()
    output_name = 'ann_percentile_distribution_{iso8601_date}.p'.format(
        iso8601_date=today.isoformat()
    )
    with open(output_name, 'wb') as sink:
        pickle.dump(distribution, sink)

# Generate the pickle file only when this module is executed as a script,
# not when it is imported.
if __name__ == '__main__':
    generate_score_distribution_pickle_file()
