import pandas as pd
from pathlib import Path

def read_mro_molecules():
    '''Load ``mro_molecules.tsv`` into a DataFrame.

    The file is looked up in the ``data`` directory that sits next to the
    directory containing this module (i.e. ``<module dir>/../data``).

    Returns:
        pandas.DataFrame parsed from the tab-separated file.

    Raises:
        FileNotFoundError: if the TSV file does not exist at that location.
    '''
    # Build the path with pathlib instead of string concatenation so the
    # separators are always correct for the host platform.
    data_dir = Path(__file__).resolve().parents[1] / "data"
    return pd.read_csv(data_dir / "mro_molecules.tsv", sep='\t')

def read_tools_mapping():
    '''Load ``tools-mapping.tsv`` into a DataFrame.

    The file is looked up in the ``data`` directory that sits next to the
    directory containing this module (i.e. ``<module dir>/../data``).

    Returns:
        pandas.DataFrame parsed from the tab-separated file.

    Raises:
        FileNotFoundError: if the TSV file does not exist at that location.
    '''
    # Build the path with pathlib instead of string concatenation so the
    # separators are always correct for the host platform.
    data_dir = Path(__file__).resolve().parents[1] / "data"
    return pd.read_csv(data_dir / "tools-mapping.tsv", sep='\t')

def read_tools_mro_mapping():
    '''Load ``Tools_MRO_mapping.xlsx`` into a DataFrame.

    The workbook is looked up in the ``data`` directory that sits next to the
    directory containing this module (i.e. ``<module dir>/../data``) and is
    parsed with the ``openpyxl`` engine.

    Returns:
        pandas.DataFrame read from the workbook (pandas' default sheet).

    Raises:
        FileNotFoundError: if the workbook does not exist at that location.
    '''
    # Build the path with pathlib instead of string concatenation so the
    # separators are always correct for the host platform.
    data_dir = Path(__file__).resolve().parents[1] / "data"
    return pd.read_excel(data_dir / "Tools_MRO_mapping.xlsx", engine='openpyxl')

def find_n_alleles_in_mro_molecules(df):
    '''Return the rows of ``df`` whose 'IEDB Label' ends with 'N'.

    Rows with a missing 'IEDB Label' are treated as non-matching.

    Args:
        df: DataFrame that has an 'IEDB Label' column.

    Returns:
        An independent copy of the matching rows, with every column and the
        original index labels preserved.
    '''
    # na=False makes missing labels count as "does not end with N".
    ends_with_n = df['IEDB Label'].str.endswith('N', na=False)
    return df.loc[ends_with_n].copy()

def find_n_alleles_in_tools_mapping(df, mro_molecules_n_alleles_df):
    '''Select the N-allele rows of the tools-mapping table.

    A row of ``df`` is selected when either

    * its 'Tool Label' ends with 'N', or
    * its 'MRO ID' equals the 'MRO ID' of some row in
      ``mro_molecules_n_alleles_df``.

    Missing 'MRO ID' values in ``mro_molecules_n_alleles_df`` are ignored for
    the second rule, so rows of ``df`` that simply lack an MRO ID are not
    pulled in by accident.

    Args:
        df: tools-mapping DataFrame with 'Tool Label' and 'MRO ID' columns.
        mro_molecules_n_alleles_df: DataFrame with an 'MRO ID' column holding
            the IDs of the N-alleles found in mro_molecules.

    Returns:
        A new DataFrame with all columns of the selected rows, de-duplicated
        on the ('Tool Label', 'MRO ID') pair (first occurrence kept, original
        index labels preserved).
    '''
    # na=False makes missing labels count as "does not end with N".
    label_ends_with_n = df['Tool Label'].str.endswith('N', na=False)

    # Series.isin treats NaN as equal to NaN; drop missing IDs first so a
    # blank ID on one side cannot match blank IDs on the other.
    n_allele_ids = mro_molecules_n_alleles_df['MRO ID'].dropna()
    shares_n_allele_id = df['MRO ID'].isin(n_allele_ids)

    selected = df[label_ends_with_n | shares_n_allele_id]
    return selected.drop_duplicates(subset=['Tool Label', 'MRO ID'])

def find_n_alleles_in_tools_mro_mapping(df, mro_molecules_n_alleles_df):
    '''Select the N-allele rows of the Tools_MRO_mapping table.

    A row of ``df`` is selected when either

    * its 'term' ends with 'N', or
    * its 'MRO ID' equals the 'MRO ID' of some row in
      ``mro_molecules_n_alleles_df``.

    Missing 'MRO ID' values in ``mro_molecules_n_alleles_df`` are ignored for
    the second rule, so rows of ``df`` that simply lack an MRO ID are not
    pulled in by accident.

    Args:
        df: Tools_MRO_mapping DataFrame with 'term' and 'MRO ID' columns.
        mro_molecules_n_alleles_df: DataFrame with an 'MRO ID' column holding
            the IDs of the N-alleles found in mro_molecules.

    Returns:
        A new DataFrame with all columns of the selected rows, de-duplicated
        on the ('term', 'MRO ID') pair (first occurrence kept, original index
        labels preserved).
    '''
    # na=False makes missing terms count as "does not end with N".
    term_ends_with_n = df['term'].str.endswith('N', na=False)

    # Series.isin treats NaN as equal to NaN; drop missing IDs first so a
    # blank ID on one side cannot match blank IDs on the other.
    n_allele_ids = mro_molecules_n_alleles_df['MRO ID'].dropna()
    shares_n_allele_id = df['MRO ID'].isin(n_allele_ids)

    selected = df[term_ends_with_n | shares_n_allele_id]
    return selected.drop_duplicates(subset=['term', 'MRO ID'])

def main():
    '''Report N-alleles found in the three mapping files.

    For each of mro_molecules.tsv, tools-mapping.tsv and
    Tools_MRO_mapping.xlsx the head of the table and the matching N-allele
    rows are printed. The matches from mro_molecules.tsv and
    Tools_MRO_mapping.xlsx are also written as TSV files to the current
    working directory; the tools-mapping.tsv matches are printed only.
    '''

    def preview(heading, frame):
        # Heading followed by the first rows, as a quick sanity check.
        print(heading)
        print(frame.head())

    def dump(heading, frame):
        # Heading followed by every row and column, without the index.
        print(heading)
        print(frame.to_string(index=False))

    # --- mro_molecules.tsv (matched on the 'IEDB Label' column) ---
    molecules = read_mro_molecules()
    preview("\nFirst few rows of mro_molecules.tsv:", molecules)

    molecule_hits = find_n_alleles_in_mro_molecules(molecules)
    dump("\nN-alleles found in mro_molecules.tsv:", molecule_hits)
    molecule_hits.to_csv('n_alleles_from_mro_molecules.tsv', sep='\t', index=False)

    # --- tools-mapping.tsv ---
    tools_mapping = read_tools_mapping()
    preview("\nFirst few rows of tools-mapping.tsv:", tools_mapping)

    tools_mapping_hits = find_n_alleles_in_tools_mapping(tools_mapping, molecule_hits)
    dump("\nN-alleles found in tools-mapping.tsv:", tools_mapping_hits)
    # No TSV is written for this table (the export is currently disabled):
    # tools_mapping_hits.to_csv('n_alleles_from_tools_mapping.tsv', sep='\t', index=False)

    # --- Tools_MRO_mapping.xlsx ---
    tools_mro_mapping = read_tools_mro_mapping()
    preview("\nFirst few rows of Tools_MRO_mapping.xlsx:", tools_mro_mapping)

    tools_mro_hits = find_n_alleles_in_tools_mro_mapping(tools_mro_mapping, molecule_hits)
    dump("\nN-alleles found in Tools_MRO_mapping.xlsx:", tools_mro_hits)
    tools_mro_hits.to_csv('n_alleles_from_tools_mro_mapping.tsv', sep='\t', index=False)

# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()