import os
import pickle
import datetime

# Name of the prediction method; main() uses it in the score-dictionary keys
# and in the name of the pickle file it writes.
method = 'nn_align'
# Version string kept alongside the method name (not read by the code
# visible in this file).
version = '2.3'

# Directory that main() scans for output files.
output_path = './output'
# Single file inside output_path that main() previews before the full pass.
FILE_NAME = 'DRB1_0701_9'


def _read_allele_rows(path, allele):
    """Return the whitespace-stripped lines of *path* that start with *allele*."""
    with open(path, 'r') as r_file:
        stripped = [line.strip() for line in r_file]
    return [row for row in stripped if row.startswith(allele)]


def main():
    """Collect per-allele score samples from ``output_path`` and pickle them.

    Steps:

    1. Preview up to the first ten rows of ``FILE_NAME`` that start with the
       hard-coded allele ``DRB1_0701`` (index, field count, raw row).
    2. For every entry in ``output_path`` whose name has the form
       ``<allele>_<length>`` with ``9 <= length <= 30``, parse the sixth
       whitespace-separated field of each matching row as a float score and
       store the sorted scores under the key ``(method, allele, length)``.
    3. Reduce every score list with ``get_scores_sample`` and dump the
       resulting dictionary to
       ``<method>_percentile_distribution_<ISO date>.p`` in the current
       working directory.

    Raises:
        ValueError: if a file name does not end in ``_<integer>``, if a score
            field is not a valid float, or (from ``get_scores_sample``) if a
            score list has an unexpected length.
        IndexError: if a processed file has fewer than two matching rows or a
            row has fewer than six fields.

    All ``print`` calls take a single argument so the output is identical
    whether ``print`` is a statement (Python 2) or a function (Python 3).
    """
    allele = 'DRB1_0701'
    rows = _read_allele_rows(os.path.join(output_path, FILE_NAME), allele)
    # Slicing keeps the preview from failing when fewer than ten rows match.
    for i, row in enumerate(rows[:10]):
        print(i)
        print(len(row.split()))
        print(row)

    print(os.listdir(output_path))

    scores_by_method_allele_binding_length = {}

    for file_name in os.listdir(output_path):
        # Everything before the last underscore is the allele name, the
        # remainder is the length.
        allele, _, length_text = file_name.rpartition('_')
        length = int(length_text)
        print('allele=%s' % allele)
        print('length=%s' % length)
        # Only lengths 9..30 are processed; skip before reading the file.
        if not 9 <= length <= 30:
            continue

        rows = _read_allele_rows(os.path.join(output_path, file_name), allele)
        # Field counts of the first two rows, printed as a format sanity check.
        print('%s: %s, %s' % (
            file_name, len(rows[0].split()), len(rows[1].split())))

        ic50s = sorted(float(row.split()[5]) for row in rows)
        print('avg: %s' % (sum(ic50s) / len(ic50s)))
        print('min: %s' % min(ic50s))
        print(ic50s[0])
        print('max: %s' % max(ic50s))
        print(ic50s[-1])
        print('num: %s' % len(ic50s))

        key = (method, allele, length)
        print('key: %s' % (key,))
        # Append to any scores already stored under this key.
        scores_by_method_allele_binding_length.setdefault(key, []).extend(ic50s)

    # Iterate over a snapshot of the keys because the values are replaced.
    for key in list(scores_by_method_allele_binding_length):
        scores_by_method_allele_binding_length[key] = get_scores_sample(
            scores_by_method_allele_binding_length[key])

    date_string = datetime.date.today().isoformat()
    pickle_file_name = '{method}_percentile_distribution_{iso8601_date}.p'.format(
        method=method, iso8601_date=date_string)
    with open(pickle_file_name, 'wb') as pickle_file:
        pickle.dump(scores_by_method_allele_binding_length, pickle_file)


def get_scores_sample(scores):
    """Return a thinned, ascending sample of *scores*.

    The scores are sorted in ascending order and then sampled by position:

    * positions 0-99: every score,
    * positions 100-999: every 10th score,
    * positions 1000 and above: every 100th score.

    Args:
        scores: a sized collection of mutually comparable values. It is not
            modified; a sorted copy is sampled instead.

    Returns:
        A new list holding the sampled scores in ascending order.

    Raises:
        ValueError: if ``len(scores)`` is neither 9999 nor 10000. The message
            only mentions 10000, but 9999 is accepted as well.
    """
    count = len(scores)
    if count > 10000 or count < 9999:
        raise ValueError('len of scores!=10000, but =%s' % count)
    # sorted() leaves the caller's collection untouched.
    ordered = sorted(scores)
    # The three slices select exactly the positions listed in the docstring.
    return ordered[:100] + ordered[100:1000:10] + ordered[1000::100]

# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()

