Source code for PyMEGABASE

# Copyright (c) 2020-2023 The Center for Theoretical Biological Physics (CTBP) - Rice University
# This file is from the Open-MiChroM project, released under the MIT License. 

R"""  
The :class:`~.PyMEGABASE` classes perform subcompartment and compartment annotations based on 1D chromatin enrichment profiles.
"""

import os, glob, requests, shutil, pyBigWig, urllib, gzip
import numpy as np
from tqdm import tqdm
from joblib import Parallel, delayed
from urllib.parse import urlparse
try:
    from pydca.plmdca import plmdca
except ImportError as e:
    print('Could not find pydca, it will download the package now')
    os.system('pip install -q pydca --no-deps')
    from pydca.plmdca import plmdca
    pass  

[docs]class PyMEGABASE: R""" The :class:`~.PyMEGABASE` class performs genomic annotations . The :class:`~.PyMEGABASE` sets the environment to generate prediction of genomic annotations. Args: cell_line (str, required): Name of target cell type assembly (str, required): Reference assembly of target cell line ('hg19','GRCh38','mm10') organism (str, required): Target cell type organism (str, required): signal_type (str, required): Input files signal type ('signal p-value', 'fold change over control', ...) ref_cell_line_path (str, optional): Folder/Path to place reference/training data ('tmp_meta') cell_line_path (str, optional): Folder/Path to place target cell data types_path (str, optional): Folder/Path where the genomic annotations are located histones (bool, required): Whether to use Histone Modification data from the ENCODE databank for prediction tf (bool, required): Whether to use Transcription Factor data from the ENCODE databank for prediction small_rna (bool, required): Whether to use Small RNA-Seq data from the ENCODE databank for prediction total_rna (bool, required): Whether to use Total RNA-seq data from the ENCODE databank for prediction n_states (int, optional): Number of states for the D-nodes extra_filter (str, optional): Add filter to the fetching data url to download cell type data res (int, optional): Resolution for genomic annotations calling in kilobasepairs (5, 50, 100) chromosome_sizes (list, optional): Chromosome sizes based on the reference genome assembly - required for non-human assemblies file_format (str, optional): File format for the input data """ def __init__(self, cell_line='GM12878', assembly='hg19',organism='human',signal_type='signal p-value',file_format='bigWig', ref_cell_line_path='tmp_meta',cell_line_path=None,types_path=None, histones=True,tf=False,atac=False,small_rna=False,total_rna=False,n_states=19, extra_filter='',res=50,chromosome_sizes=None,AB=False): self.printHeader() pt = os.path.dirname(os.path.realpath(__file__)) self.path_to_share = os.path.join(pt,'share/') self.cell_line=cell_line self.assembly=assembly self.signal_type=signal_type if cell_line_path==None: self.cell_line_path=cell_line+'_'+assembly else: self.cell_line_path=cell_line_path self.ref_cell_line='GM12878' self.ref_assembly='hg19' self.ref_cell_line_path=ref_cell_line_path if types_path!=None: self.types_path=types_path else: self.types_path=self.path_to_share+'types' self.hist=histones self.tf=tf self.atac=atac self.small_rna=small_rna self.total_rna=total_rna self.n_states=n_states self.extra_filter=extra_filter self.res=res self.organism=organism.lower() if file_format.lower()=='bigwig':self.file_format='bigWig' elif file_format.lower()=='bed': self.file_format='bed+narrowPeak' #Define translation dictinaries between aminoacids, intensity of Chip-seq signal/RNASeq count and states of the model self.RES_TO_INT = { 'A': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'I': 8, 'K': 9, 'L': 10, 'M': 11, 'N': 12, 'P': 13, 'Q': 14, 'R': 15, 'S': 16, 'T': 17, 'V': 18, 'W':19, 'Y':20, '-':21, '.':21, '~':21,} self.INT_TO_RES = {self.RES_TO_INT[k]:k for k in self.RES_TO_INT.keys()} self.TYPE_TO_INT = {'A1':0,'A2':1,'B1':2,'B2':3,'B3':4,'B4':5,'NA':6} self.INT_TO_TYPE = {self.TYPE_TO_INT[k]:k for k in self.TYPE_TO_INT.keys()} # Define assembly of the target cell if assembly=='GRCh38': self.chrm_size = np.array([4980,4844,3966,3805,3631,3417,3187,2903,2768,2676,2702,2666,2288,2141,2040,1807,1666,1608,1173,1289,935,1017,3121])*50/self.res self.chrom_l={'chr1':248956422,'chr2':242193529,'chr3':198295559,'chr4':190214555,'chr5':181538259,'chr6':170805979,'chr7':159345973,'chrX':156040895,'chr8':145138636,'chr9':138394717,'chr11':135086622,'chr10':133797422,'chr12':133275309,'chr13':114364328,'chr14':107043718,'chr15':101991189,'chr16':90338345,'chr17':83257441,'chr18':80373285,'chr20':64444167,'chr19':58617616,'chrY': 57227415,'chr22':50818468,'chr21':46709983} elif assembly=='hg19': self.chrm_size = np.array([4990,4865,3964,3828,3620,3424,3184,2931,2826,2712,2703,2679,2307,2148,2052,1810,1626,1564,1184,1262,964,1028,3105])*50/self.res self.chrom_l={'chr1':249250621,'chr2':243199373,'chr3':198022430,'chr4':191154276,'chr5':180915260,'chr6':171115067,'chr7':159138663,'chrX':155270560,'chr8':146364022,'chr9':141213431,'chr10':135534747,'chr11':135006516,'chr12':133851895,'chr13':115169878,'chr14':107349540,'chr15':102531392,'chr16':90354753,'chr17':81195210,'chr18':78077248,'chr20':63025520,'chrY':59373566,'chr19':59128983,'chr22':51304566,'chr21':48129895} else: # If not GRCh38 or hg19 the chromosome sizes are required if chromosome_sizes == None: raise ValueError("Need to specify chromosome sizes for assembly: {}".format(assembly)) self.chrm_size = np.array(chromosome_sizes)/(self.res*1000) self.chrom_l = np.array(chromosome_sizes) self.chrm_size=np.round(self.chrm_size+0.1).astype(int) self.ref_chrm_size = np.array([4990,4865,3964,3828,3620,3424,3184,2931,2826,2712,2703,2679,2307,2148,2052,1810,1626,1564,1184,1262,964,1028,1028])*50/self.res self.ref_chrm_size=np.round(self.ref_chrm_size+0.1).astype(int) #Retrieves the available experiments on GM12878-hg19 to assess the download of experiments on the target cell #Prepare url to request information url='https://www.encodeproject.org/metadata/?type=Experiment&' if self.hist==True: url=url+'assay_title=Histone+ChIP-seq' if self.tf==True: url=url+'&assay_title=TF+ChIP-seq' if self.atac==True: url=url+'&assay_title=ATAC-seq' if self.small_rna==True: url=url+'&assay_title=small+RNA-seq' if self.total_rna==True: url=url+'&assay_title=total+RNA-seq' self.url_ref=url+'&biosample_ontology.term_name='+self.ref_cell_line+'&files.file_type='+self.file_format #Request information r = requests.get(self.url_ref) #Decode information requested content=str(r.content) experiments=[] for k in content.split('\\n')[:-1]: l=k.split('\\t') if l[5]==self.ref_assembly and l[4]==self.signal_type and l[7]=='Histone ChIP-seq': experiments.append(l[22]) elif l[5]==self.ref_assembly and l[4]==self.signal_type and l[7]=='ATAC-seq': experiments.append(l[7]) elif l[5]==self.ref_assembly and l[4]==self.signal_type and l[7]=='TF ChIP-seq': experiments.append(l[22]) elif l[5]==self.ref_assembly and l[4]=='plus strand signal of all reads' and l[7]=='small RNA-seq': experiments.append('plus-small-RNA-seq') elif l[5]==self.ref_assembly and l[4]=='plus strand signal of all reads' and l[7]=='total RNA-seq': experiments.append('plus-total-RNA-seq') elif l[5]==self.ref_assembly and l[4]=='minus strand signal of all reads' and l[7]=='small RNA-seq': experiments.append('minus-small-RNA-seq') elif l[5]==self.ref_assembly and l[4]=='minus strand signal of all reads' and l[7]=='total RNA-seq': experiments.append('minus-total-RNA-seq') #Extract set of experiments found on GM12878-hg19 self.experiments_unique=np.unique(experiments) self.es_unique=[] for e in self.experiments_unique: self.es_unique.append(e.split('-human')[0]) #Summary of input print('Selected cell line to predict: '+self.cell_line) print('Selected assembly: '+self.assembly) print('Selected signal type: '+self.signal_type) print('Selected organism: '+self.organism)
[docs] def process_replica_bw(self,line,cell_line_path,chrm_size): R""" Preprocess function for each replica formated in bigwig file Args: line (list, required): Information about the replica: name, ENCODE id and replica id cell_line_path (str, required): Path to target cell type data chrm_size (list, required): Chromosome sizes based on the assembly """ #Extract experiment id text=line.split()[0] #Extract experiment type exp=line.split()[1] #Extract number id of experiment count=line.split()[2] #Extract accession number (ENCODE) sr_number=line.split()[3] #Experiment directory exp_path=cell_line_path+'/'+exp+'_'+str(count) #Generate bookeeping files if 'human' in exp.split('-'): ext='human' else: ext=self.organism if exp.split('-'+ext)[0] in self.es_unique: try: os.mkdir(exp_path) except: print('Directory ',exp_path,' already exist') with open(exp_path+'/exp_name.txt', 'w') as f: f.write(text+' '+exp+'\n') with open(exp_path+'/exp_accession.txt', 'w') as f: f.write(sr_number+' '+exp+'\n') #Load data from server try: bw = pyBigWig.open("https://www.encodeproject.org/files/"+text+"/@@download/"+text+".bigWig") #Process replica for numbered chromosomes for chr in range(1,len(chrm_size)): signal = bw.stats("chr"+str(chr), type="mean", nBins=chrm_size[chr-1]) #Process signal and binning signal=np.array(signal) per=np.percentile(signal[signal!=None],95) per_min=np.percentile(signal[signal!=None],5) signal[signal==None]=per_min signal[signal<per_min]=per_min signal[signal>per]=per signal=signal-per_min signal=signal*self.n_states/(per-per_min) signal=np.round(signal.astype(float)).astype(int) #Save data for each chromosome with open(exp_path+'/chr'+str(chr)+'.track', 'w') as f: f.write("#chromosome file number of beads\n"+str(chrm_size[chr-1])) f.write("#\n") f.write("#bead, signal, discrete signal\n") for i in range(len(signal)): f.write(str(i)+" "+str(signal[i])+" "+str(signal[i].astype(int))+"\n") #Process seperatly chromosome X chr='X' signal = bw.stats("chr"+chr, type="mean", nBins=chrm_size[-1]) #Process signal and binning signal=np.array(signal) per=np.percentile(signal[signal!=None],95) per_min=np.percentile(signal[signal!=None],5) signal[signal==None]=per_min signal[signal<per_min]=per_min signal[signal>per]=per signal=signal-per_min signal=signal*self.n_states/(per-per_min) signal=np.round(signal.astype(float)).astype(int) #Save data with open(exp_path+'/chr'+chr+'.track', 'w') as f: f.write("#chromosome file number of beads\n"+str(chrm_size[-1])) f.write("#\n") f.write("#bead, signal, discrete signal\n") for i in range(len(signal)): f.write(str(i)+" "+str(signal[i])+" "+str(signal[i].astype(int))+"\n") return exp except: print('This experiment was incomplete:',text,'\nit will not be used.')
[docs] def process_replica_bed(self,line,cell_line_path,chrm_size): R""" Preprocess function for each replica formated in bed files Args: line (lsit, required): Information about the replica: name, ENCODE id and replica id cell_line_path (str, required): Path to target cell type data chrm_size (list, required): Chromosome sizes based on the assembly """ #Extract experiment id text=line.split()[0] #Extract experiment type exp=line.split()[1] #Extract number id of experiment count=line.split()[2] #Extract accession number (ENCODE) sr_number=line.split()[3] #Experiment directory exp_path=cell_line_path+'/'+exp+'_'+str(count) #Generate bookeeping files if 'human' in exp.split('-'): ext='human' else: ext=self.organism if exp.split('-'+ext)[0] in self.es_unique: try: os.mkdir(exp_path) except: print('Directory ',exp_path,' already exist') with open(exp_path+'/exp_name.txt', 'w') as f: f.write(text+' '+exp+'\n') with open(exp_path+'/exp_accession.txt', 'w') as f: f.write(sr_number+' '+exp+'\n') #Load data from server try: #Extract data from bed files exp_url="https://www.encodeproject.org/files/"+text+"/@@download/"+text+".bed.gz" response = urllib.request.urlopen(exp_url) gunzip_response = gzip.GzipFile(fileobj=response) content = gunzip_response.read() data=np.array([i.split('\t') for i in content.decode().split('\n')[:-1]]) #Process replica for numbered chromosomes for chr in range(1,len(chrm_size)): chrm_data=data[data[:,0]=='chr'+str(chr)][:,[1,2,6]].astype(float) signal=np.zeros(chrm_size[chr-1]) ndx_small=np.floor(chrm_data[:,1]/(self.res*1000)) == np.floor(chrm_data[:,0]/(self.res*1000)) # Aggregate peak intensity for ll in chrm_data[ndx_small]: ndx=int(ll[0]/(self.res*1000)) if ndx<len(signal): signal[ndx]+=ll[2] for ll in chrm_data[~ndx_small]: ndx1=int(ll[0]/(self.res*1000)) ndx2=int(ll[1]/(self.res*1000)) if ndx1<len(signal) and ndx2<len(signal): p=(ndx2-ll[0]/(self.res*1000))/((ll[1]-ll[0])/(self.res*1000)) signal[ndx1]+=ll[2]*p signal[ndx2]+=ll[2]*(1-p) per=np.percentile(signal[signal!=None],95) per_min=np.percentile(signal[signal!=None],5) signal[signal==None]=per_min signal[signal<per_min]=per_min signal[signal>per]=per signal=signal-per_min signal=signal*self.n_states/(per-per_min) signal=np.round(signal.astype(float)).astype(int) #Save data with open(exp_path+'/chr'+str(chr)+'.track', 'w') as f: f.write("#chromosome file number of beads\n"+str(chrm_size[chr-1])) f.write("#\n") f.write("#bead, signal, discrete signal\n") for i in range(len(signal)): f.write(str(i)+" "+str(signal[i])+" "+str(signal[i].astype(int))+"\n") #Process seperatly chromosome X chr='X' chrm_data=data[data[:,0]=='chr'+chr][:,[1,2,6]].astype(float) signal=np.zeros(chrm_size[-1]) ndx_small=np.floor(chrm_data[:,1]/(self.res*1000)) == np.floor(chrm_data[:,0]/(self.res*1000)) # Aggregate peak intensity for ll in chrm_data[ndx_small]: ndx=int(ll[0]/(self.res*1000)) if ndx<len(signal): signal[ndx]+=ll[2] for ll in chrm_data[~ndx_small]: ndx1=int(ll[0]/(self.res*1000)) ndx2=int(ll[1]/(self.res*1000)) if ndx1<len(signal) and ndx2<len(signal): p=(ndx2-ll[0]/(self.res*1000))/((ll[1]-ll[0])/(self.res*1000)) signal[ndx1]+=ll[2]*p signal[ndx2]+=ll[2]*(1-p) per=np.percentile(signal[signal!=None],95) per_min=np.percentile(signal[signal!=None],5) signal[signal==None]=per_min signal[signal<per_min]=per_min signal[signal>per]=per signal=signal-per_min signal=signal*self.n_states/(per-per_min) signal=np.round(signal.astype(float)).astype(int) #Save data with open(exp_path+'/chr'+chr+'.track', 'w') as f: f.write("#chromosome file number of beads\n"+str(chrm_size[-1])) f.write("#\n") f.write("#bead, signal, discrete signal\n") for i in range(len(signal)): f.write(str(i)+" "+str(signal[i])+" "+str(signal[i].astype(int))+"\n") return exp except: print('This experiment was incomplete:',text,'\nit will not be used.')
[docs] def download_and_process_cell_line_data(self,nproc=10,all_exp=True): R""" Download and preprocess target cell data for the D-nodes Args: nproc (int, required): Number of processors dedicated to download and process data all_exp (bool, optional): Download and process all replicas for each experiment. Set as 'False' to download only 1 replica per experiment """ #Create directory for target cell data try: os.mkdir(self.cell_line_path) except: print('Directory ',self.cell_line_path,' already exist') print('Deleting path and creating it anew') shutil.rmtree(self.cell_line_path) os.mkdir(self.cell_line_path) #Prepare url to fetch target cell data url='https://www.encodeproject.org/metadata/?type=Experiment&'+self.extra_filter if self.hist==True: url=url+'assay_title=Histone+ChIP-seq' if self.tf==True: url=url+'&assay_title=TF+ChIP-seq' if self.atac==True: url=url+'&assay_title=ATAC-seq' if self.small_rna==True: url=url+'&assay_title=small+RNA-seq' if self.total_rna==True: url=url+'&assay_title=total+RNA-seq' self.url_cell_line=url+'&biosample_ontology.term_name='+self.cell_line+'&files.file_type='+self.file_format # Request data from ENCODE server r = requests.get(self.url_cell_line) content=str(r.content) experiments=[] with open(self.cell_line_path+'/meta.txt', 'w') as f: for k in content.split('\\n')[:-1]: l=k.split('\\t') if l[5]==self.assembly and l[4]==self.signal_type and l[7]=='Histone ChIP-seq': f.write(l[0]+' '+l[22]+' '+l[5]+' '+l[4]+' '+l[6]+'\n') elif l[5]==self.assembly and l[4]==self.signal_type and l[7]=='ATAC-seq': f.write(l[0]+' '+l[7]+' '+l[5]+' '+l[4]+' '+l[6]+'\n') elif l[5]==self.assembly and l[4]==self.signal_type and l[7]=='TF ChIP-seq': f.write(l[0]+' '+l[22]+' '+l[5]+' '+l[4]+' '+l[6]+'\n') elif l[5]==self.assembly and l[4]=='plus strand signal of all reads' and l[7]=='small RNA-seq': f.write(l[0]+' '+'plus-small-RNA-seq'+' '+l[5]+' '+l[4]+' '+l[6]+'\n') elif l[5]==self.assembly and l[4]=='plus strand signal of all reads' and l[7]=='total RNA-seq': f.write(l[0]+' '+'plus-total-RNA-seq'+' '+l[5]+' '+l[4]+' '+l[6]+'\n') elif l[5]==self.assembly and l[4]=='minus strand signal of all reads' and l[7]=='small RNA-seq': f.write(l[0]+' '+'minus-small-RNA-seq'+' '+l[5]+' '+l[4]+' '+l[6]+'\n') elif l[5]==self.assembly and l[4]=='minus strand signal of all reads' and l[7]=='total RNA-seq': f.write(l[0]+' '+'minus-total-RNA-seq'+' '+l[5]+' '+l[4]+' '+l[6]+'\n') # Record experiments, number of replicas and their accession numbers count=0 self.exp_found={} exp_name='' list_names=[] with open(self.cell_line_path+'/meta.txt') as fp: Lines = fp.readlines() for line in Lines: count += 1 text=line.split()[0] exp=line.split()[1] sr_number=line.split()[-1] #Register if experiment is new if exp!=exp_name: try: count=self.exp_found[exp]+1 except: count=1 exp_name=exp self.exp_found[exp]=count if all_exp==True: list_names.append(text+' '+exp+' '+str(count)+' '+sr_number) else: if count==1: list_names.append(text+' '+exp+' '+str(count)+' '+sr_number) #Process signal for each replica in parallel print('Number of replicas:', len(list_names)) if self.file_format=='bigWig': self.successful_exp = Parallel(n_jobs=nproc)(delayed(self.process_replica_bw)(list_names[i],self.cell_line_path,self.chrm_size) for i in tqdm(range(len(list_names)), desc="Process replicas",bar_format='{l_bar}{bar:40}{r_bar}{bar:-10b}')) else: self.successful_exp = Parallel(n_jobs=nproc)(delayed(self.process_replica_bed)(list_names[i],self.cell_line_path,self.chrm_size) for i in tqdm(range(len(list_names)), desc="Process replicas",bar_format='{l_bar}{bar:40}{r_bar}{bar:-10b}')) #Record the replicas/experiments that were successfully processed self.successful_exp= [i for i in self.successful_exp if i] self.successful_unique_exp=np.unique(self.successful_exp) self.su_unique=[] for e in self.successful_unique_exp: self.su_unique.append(e.split('-'+self.organism)[0]) print('Experiments found in ENCODE for the selected cell line:') self.unique=[] #Save the set of unique experiments found in the target cell with open(self.cell_line_path+'/unique_exp.txt', 'w') as f: for e in self.experiments_unique: if e.split('-human')[0] in self.su_unique: f.write(e.split('-human')[0]+'\n') print(e.split('-human')[0]) self.unique.append(e) if len(self.unique) > 4: print('Predictions would use: ',len(self.unique),' experiments') else: print('This sample only has ',len(self.unique),' experiments. We do not recommend prediction on samples with less than 5 different experiments.')
[docs] def download_and_process_ref_data(self,nproc,all_exp=True): R""" Download and preprocess reference data for the D-nodes Args: nproc (int, required): Number of processors dedicated to download and process data all_exp (bool, optional): Download and process all replicas for each experiment. Set as 'False' to download only 1 replica per experiment """ #Create directory for target cell data try: os.mkdir(self.ref_cell_line_path) except: print('Directory ',self.ref_cell_line_path,' already exist') print('Deleting path and creating it anew') shutil.rmtree(self.ref_cell_line_path) os.mkdir(self.ref_cell_line_path) #Prepare url to fetch target cell data url='https://www.encodeproject.org/metadata/?type=Experiment&' if self.hist==True: url=url+'assay_title=Histone+ChIP-seq' if self.tf==True: url=url+'&assay_title=TF+ChIP-seq' if self.atac==True: url=url+'&assay_title=ATAC-seq' if self.small_rna==True: url=url+'&assay_title=small+RNA-seq' if self.total_rna==True: url=url+'&assay_title=total+RNA-seq' self.url_ref=url+'&biosample_ontology.term_name='+self.ref_cell_line+'&files.file_type='+self.file_format # Record experiments, number of replicas and their accession numbers r = requests.get(self.url_ref) content=str(r.content) experiments=[] with open(self.ref_cell_line_path+'/meta.txt', 'w') as f: for k in content.split('\\n')[:-1]: l=k.split('\\t') if l[5]==self.ref_assembly and l[4]==self.signal_type and l[7]=='Histone ChIP-seq': f.write(l[0]+' '+l[22]+' '+l[5]+' '+l[4]+' '+l[6]+'\n') elif l[5]==self.ref_assembly and l[4]==self.signal_type and l[7]=='ATAC-seq': f.write(l[0]+' '+l[7]+' '+l[5]+' '+l[4]+' '+l[6]+'\n') elif l[5]==self.ref_assembly and l[4]==self.signal_type and l[7]=='TF ChIP-seq': f.write(l[0]+' '+l[22]+' '+l[5]+' '+l[4]+' '+l[6]+'\n') elif l[5]==self.ref_assembly and l[4]=='plus strand signal of all reads' and l[7]=='small RNA-seq': f.write(l[0]+' '+'plus-small-RNA-seq'+' '+l[5]+' '+l[4]+' '+l[6]+'\n') elif l[5]==self.ref_assembly and l[4]=='plus strand signal of all reads' and l[7]=='total RNA-seq': f.write(l[0]+' '+'plus-total-RNA-seq'+' '+l[5]+' '+l[4]+' '+l[6]+'\n') elif l[5]==self.ref_assembly and l[4]=='minus strand signal of all reads' and l[7]=='small RNA-seq': f.write(l[0]+' '+'minus-small-RNA-seq'+' '+l[5]+' '+l[4]+' '+l[6]+'\n') elif l[5]==self.ref_assembly and l[4]=='minus strand signal of all reads' and l[7]=='total RNA-seq': f.write(l[0]+' '+'minus-total-RNA-seq'+' '+l[5]+' '+l[4]+' '+l[6]+'\n') # Record experiments, number of replicas and their accession numbers ref_chrm_size = self.ref_chrm_size count=0 exp_found={} exp_name='' list_names=[] with open(self.ref_cell_line_path+'/meta.txt') as fp: Lines = fp.readlines() for line in Lines: count += 1 text=line.split()[0] exp=line.split()[1] sr_number=line.split()[-1] #Register if experiment is new if (exp.split('-human')[0] in self.su_unique) or (text.split('-human')[0] in self.su_unique): if exp!=exp_name: try: count=exp_found[exp]+1 except: count=1 exp_name=exp exp_found[exp]=count if all_exp==True: list_names.append(text+' '+exp+' '+str(count)+' '+sr_number) else: if count==1: list_names.append(text+' '+exp+' '+str(count)+' '+sr_number) print('Number of replicas:', len(list_names)) #Process signal for each replica in parallel if self.file_format=='bigWig': results = Parallel(n_jobs=nproc)(delayed(self.process_replica_bw)(list_names[i],self.ref_cell_line_path,ref_chrm_size) for i in tqdm(range(len(list_names)), desc="Process replicas",bar_format='{l_bar}{bar:40}{r_bar}{bar:-10b}')) else: results = Parallel(n_jobs=nproc)(delayed(self.process_replica_bed)(list_names[i],self.ref_cell_line_path,ref_chrm_size) for i in tqdm(range(len(list_names)), desc="Process replicas",bar_format='{l_bar}{bar:40}{r_bar}{bar:-10b}')) print('Prediction will use:') with open(self.ref_cell_line_path+'/unique_exp.txt', 'w') as f: for e in self.unique: if e in exp_found.keys(): f.write(e.split('-'+self.organism)[0]+'\n') print(e.split('-'+self.organism)[0])
[docs] def custom_bw_track(self,experiment,bw_file): R""" Function to introduce custom bigwig tracks Args: experiment (str, required): Name of the experiment bw_file (str, required): Path to the custom track """ #Format the name of the experiment if not self.organism in experiment: experiment=experiment+'-'+self.organism if not experiment.split('-'+self.organism)[0] in self.es_unique: print('This experiment is not found in the training set, then cannot be used.') return 0 if experiment in self.exp_found.keys(): print('This target has replicas already') print('The new track will be addded as a different replica of the same target') #Experiment directory count=self.exp_found[experiment]+1 else: print('This target has no replicas') print('The new track will be added a the first replica of the target') #Experiment directory count=1 exp_path=self.cell_line_path+'/'+experiment+'_'+str(count) print(exp_path) # Create directory for new replica try: os.mkdir(exp_path) except: print('Directory ',exp_path,' already exist') with open(exp_path+'/exp_name.txt', 'w') as f: f.write(experiment+' '+experiment+'\n') #Load data from track try: bw = pyBigWig.open(bw_file) for chr in range(1,len(self.chrm_size)): signal = bw.stats("chr"+str(chr), type="mean", nBins=self.chrm_size[chr-1]) #Process signal and binning signal=np.array(signal) per=np.percentile(signal[signal!=None],95) per_min=np.percentile(signal[signal!=None],5) signal[signal==None]=per_min signal[signal<per_min]=per_min signal[signal>per]=per signal=signal-per_min signal=signal*self.n_states/(per-per_min) signal=np.round(signal.astype(float)).astype(int) #Save data with open(exp_path+'/chr'+str(chr)+'.track', 'w') as f: f.write("#chromosome file number of beads\n"+str(self.chrm_size[chr-1])) f.write("#\n") f.write("#bead, signal, discrete signal\n") for i in range(len(signal)): f.write(str(i)+" "+str(signal[i])+" "+str(signal[i].astype(int))+"\n") chr='X' signal = bw.stats("chr"+chr, type="mean", nBins=self.chrm_size[-1]) #Process signal and binning signal=np.array(signal) per=np.percentile(signal[signal!=None],95) per_min=np.percentile(signal[signal!=None],5) signal[signal==None]=per_min signal[signal<per_min]=per_min signal[signal>per]=per signal=signal-per_min signal=signal*self.n_states/(per-per_min) signal=np.round(signal.astype(float)).astype(int) #Save data with open(exp_path+'/chr'+chr+'.track', 'w') as f: f.write("#chromosome file number of beads\n"+str(self.chrm_size[-1])) f.write("#\n") f.write("#bead, signal, discrete signal\n") for i in range(len(signal)): f.write(str(i)+" "+str(signal[i])+" "+str(signal[i].astype(int))+"\n") # Register new replica in the existing set of experiments if experiment in self.exp_found.keys(): self.exp_found[experiment]=self.exp_found[experiment]+1 else: self.exp_found[experiment]=1 self.successful_unique_exp=np.append(self.successful_unique_exp,experiment) self.su_unique.append(experiment.split('-'+self.organism)[0]) with open(self.cell_line_path+'/unique_exp.txt', 'a') as f: f.write(experiment.split('-'+self.organism)[0]+'\n') self.unique.append(experiment) return experiment except: print('This experiment was incomplete:',experiment,'\nit will not be used.')
[docs] def custom_bed_track(self,experiment,bed_file): R""" Function to introduce custom bed tracks Args: experiment (str, required): Name of the experiment bed_file (str, required): Path to the custom track """ #Format the name of the experiment if not self.organism in experiment: experiment=experiment+'-'+self.organism if not experiment.split('-'+self.organism)[0] in self.es_unique: print('This experiment is not found in the training set, then cannot be used.') return 0 if experiment in self.exp_found.keys(): print('This target has replicas already') print('The new track will be addded as a different replica of the same target') #Experiment directory count=self.exp_found[experiment]+1 else: print('This target has no replicas') print('The new track will be added a the first replica of the target') #Experiment directory count=1 exp_path=self.cell_line_path+'/'+experiment+'_'+str(count) print(exp_path) # Create directory for new replica try: os.mkdir(exp_path) except: print('Directory ',exp_path,' already exist') with open(exp_path+'/exp_name.txt', 'w') as f: f.write(experiment+' '+experiment+'\n') def get_records(bed_file): try: response = urllib.request.urlopen(bed_file) gunzip_response = gzip.GzipFile(fileobj=response) content = gunzip_response.read() data=np.array([i.split('\t') for i in content.decode().split('\n')[:-1]]) except: try: gunzip_response = gzip.GzipFile(bed_file) content = gunzip_response.read() data=np.array([i.split('\t') for i in content.decode().split('\n')[:-1]]) except: data=np.loadtxt(bed_file,dtype=str) return data #Load data from track try: data=get_records(bed_file) get_records(bed_file) for chr in range(1,len(self.chrm_size)): chrm_data=data[data[:,0]=='chr'+str(chr)][:,[1,2,6]].astype(float) signal=np.zeros(self.chrm_size[chr-1]) ndx_small=np.floor(chrm_data[:,1]/(self.res*1000)) == np.floor(chrm_data[:,0]/(self.res*1000)) # Aggregate peak intensity for ll in chrm_data[ndx_small]: ndx=int(ll[0]/(self.res*1000)) if ndx<len(signal): signal[ndx]+=ll[2] for ll in chrm_data[~ndx_small]: ndx1=int(ll[0]/(self.res*1000)) ndx2=int(ll[1]/(self.res*1000)) if ndx1<len(signal) and ndx2<len(signal): p=(ndx2-ll[0]/(self.res*1000))/((ll[1]-ll[0])/(self.res*1000)) signal[ndx1]+=ll[2]*p signal[ndx2]+=ll[2]*(1-p) #Process signal and binning signal=np.array(signal) per=np.percentile(signal[signal!=None],95) per_min=np.percentile(signal[signal!=None],5) signal[signal==None]=per_min signal[signal<per_min]=per_min signal[signal>per]=per signal=signal-per_min signal=signal*self.n_states/(per-per_min) signal=np.round(signal.astype(float)).astype(int) #Save data with open(exp_path+'/chr'+str(chr)+'.track', 'w') as f: f.write("#chromosome file number of beads\n"+str(self.chrm_size[chr-1])) f.write("#\n") f.write("#bead, signal, discrete signal\n") for i in range(len(signal)): f.write(str(i)+" "+str(signal[i])+" "+str(signal[i].astype(int))+"\n") chr='X' chrm_data=data[data[:,0]=='chr'+chr][:,[1,2,6]].astype(float) signal=np.zeros(self.chrm_size[-1]) ndx_small=np.floor(chrm_data[:,1]/(self.res*1000)) == np.floor(chrm_data[:,0]/(self.res*1000)) # Aggregate peak intensity for ll in chrm_data[ndx_small]: ndx=int(ll[0]/(self.res*1000)) if ndx<len(signal): signal[ndx]+=ll[2] for ll in chrm_data[~ndx_small]: ndx1=int(ll[0]/(self.res*1000)) ndx2=int(ll[1]/(self.res*1000)) if ndx1<len(signal) and ndx2<len(signal): p=(ndx2-ll[0]/(self.res*1000))/((ll[1]-ll[0])/(self.res*1000)) signal[ndx1]+=ll[2]*p signal[ndx2]+=ll[2]*(1-p) #Process signal and binning signal=np.array(signal) per=np.percentile(signal[signal!=None],95) per_min=np.percentile(signal[signal!=None],5) signal[signal==None]=per_min signal[signal<per_min]=per_min signal[signal>per]=per signal=signal-per_min signal=signal*self.n_states/(per-per_min) signal=np.round(signal.astype(float)).astype(int) #Save data with open(exp_path+'/chr'+chr+'.track', 'w') as f: f.write("#chromosome file number of beads\n"+str(self.chrm_size[-1])) f.write("#\n") f.write("#bead, signal, discrete signal\n") for i in range(len(signal)): f.write(str(i)+" "+str(signal[i])+" "+str(signal[i].astype(int))+"\n") if experiment in self.exp_found.keys(): self.exp_found[experiment]=self.exp_found[experiment]+1 else: self.exp_found[experiment]=1 self.successful_unique_exp=np.append(self.successful_unique_exp,experiment) self.su_unique.append(experiment.split('-'+self.organism)[0]) with open(self.cell_line_path+'/unique_exp.txt', 'a') as f: f.write(experiment.split('-'+self.organism)[0]+'\n') self.unique.append(experiment) return experiment except: print('This experiment was incomplete:',experiment,'\nit will not be used.')
[docs] def build_state_vector(self,int_types,all_averages): R""" Builds the set of state vectors used on the training process Args: int_types (list, required): Genomic annotations all_averages (list, required): D-node data """ #Aggregate tracks by with data from other loci l-2, l-1, l, l+1, l+2 #l+1 shift1=np.copy(all_averages) shift1[:,:-1]=all_averages[:,1:] shift1[:,-1]=np.zeros(len(shift1[:,-1])) #l+2 shift2=np.copy(all_averages) shift2[:,:-1]=shift1[:,1:] shift2[:,-1]=np.zeros(len(shift1[:,-1])) #l-1 shift_1=np.copy(all_averages) shift_1[:,1:]=all_averages[:,:-1] shift_1[:,0]=np.zeros(len(shift_1[:,-1])) #l-2 shift_2=np.copy(all_averages) shift_2[:,1:]=shift_1[:,:-1] shift_2[:,0]=np.zeros(len(shift1[:,-1])) #Stack shifted tracks and subtypes labels all_averages=np.vstack((int_types,shift_2,shift_1,all_averages,shift1,shift2)) #To train, we exclude the centromers and B4 subcompartments ndx=(all_averages[0,:]!=6) * (all_averages[0,:]!=5) all_averages=all_averages[:,ndx] all_averages=all_averages+1 return all_averages
[docs] def get_tmatrix(self,chrms,silent=False): R""" Extract the training data Args: chrms (list, optional): Set of chromosomes from the reference data used as the training set silent (bool, optional): Silence outputs """ #Load types from Rao et al 2014 paper types=[] for chr in chrms: if self.res==50: types.append(np.loadtxt(self.types_path+'/chr'+str(chr)+'_beads.txt.original',delimiter=' ',dtype=str)[:,1]) elif self.res==100: types.append(np.loadtxt(self.types_path+'/chr'+str(chr)+'_beads.txt.original',delimiter=' ',dtype=str)[::int(self.res/50),1]) else: tmp=list(np.loadtxt(self.types_path+'/chr'+str(chr)+'_beads.txt.original',delimiter=' ',dtype=str)) if len(tmp) < self.ref_chrm_size[chr-1]: diff=self.ref_chrm_size[chr-1] - len(tmp) for i in range( diff ): tmp.append('NA') types.append(tmp) types=np.concatenate(types) int_types=np.array(list(map(self.TYPE_TO_INT.get, types))) #Check which experiments are available to train unique=np.loadtxt(self.cell_line_path+'/unique_exp.txt',dtype=str) if unique.shape==(): unique=[unique] if silent==False:print('To train the following experiments are used:') #Load each track and average over replicas all_averages=[] for u in unique: reps=[] if silent==False:print(u) for i in glob.glob(self.ref_cell_line_path+'/'+str(u)+'*'): tmp=[] try: for chr in chrms: _tmp=np.loadtxt(i+'/chr'+str(chr)+'.track',skiprows=3)[:,2] tmp.append(_tmp) tmp=np.concatenate(tmp) reps.append(tmp) except: if silent==False:print(i,' failed with at least one chromosome') reps=np.array(reps) ave_reps=np.round(np.mean(reps,axis=0)) all_averages.append(ave_reps) #Build state vectors of the Potts model for training all_averages=np.array(all_averages) all_averages=self.build_state_vector(int_types,all_averages) return all_averages
[docs] def filter_exp(self): R""" Performs assestment on experiment signal-to-noise ration based on mean and std of the signal compared to the GM12878 equivalent using chromosomes 1 and 2 """ a=[] for i in range(1,3): a.append(self.test_set(chr=i,silent=True)) a=np.concatenate(a,axis=1) locus=2 good_exp=0 gexp=[] unique=np.loadtxt(self.cell_line_path+'/unique_exp.txt',dtype=str) #Compare mean and std of signal of target cell and GM12878 for exper in range(len(unique)): i=exper+len(unique)*locus if (np.abs(np.mean(a[i])-np.mean(self.tmatrix[i+1]))<1) and (np.std(a[i])-np.std(self.tmatrix[i+1])<2): good_exp=good_exp+1 gexp.append(unique[exper]+'\n') else: print('Not using '+unique[exper],' to predict') #Records the experiments whose signal-to-noise ration is similar to GM12878 with open(self.cell_line_path+'/unique_exp_filtered.txt','w') as f: for i in gexp: f.write(i) print('Number of suitable experiments for prediction:',good_exp) if good_exp>0: os.system('mv '+self.cell_line_path+'/unique_exp.txt '+self.cell_line_path+'/unique_exp_bu.txt') os.system('mv '+self.cell_line_path+'/unique_exp_filtered.txt '+self.cell_line_path+'/unique_exp.txt') else: print('There are no experiment suitable for the prediction')
[docs] def training_set_up(self,chrms=None,filter=True): R""" Formats data to allow the training Args: chrms (list, optional): Set of chromosomes from the reference data to use as the training set filter (bool, optional): Filter experiments based on the baseline """ if chrms==None: # We are training in odd chromosomes data if self.cell_line=='GM12878' and self.assembly=='hg19': chrms=[1,3,5,7,9,11,13,15,17,19,21] else: chrms=[i for i in range(1,23)] if filter==True: all_averages=self.get_tmatrix(chrms,silent=True) self.tmatrix=np.copy(all_averages) self.filter_exp() all_averages=self.get_tmatrix(chrms,silent=False) self.tmatrix=np.copy(all_averages) # Translate Potts states to sequences sequences=np.array(list(map(self.INT_TO_RES.get, all_averages.flatten()))).reshape(all_averages.shape) #Generate sequence file with open(self.cell_line_path+"/sequences.fa",'w',encoding = 'utf-8') as f: for i in range(len(sequences.T)): f.write('>'+str(i).zfill(4)+'\n') f.write(''.join(sequences[:,i])+'\n')
[docs] def training(self,nproc=10,lambda_h=100,lambda_J=100): R""" Performs the training of the Potts model based on the reference data Args: nproc (int, required): Number of processors used to train lambda_h (bool, optional): Value for the intensity of the regularization value for the h energy term lambda_J (float, optional): Value for the intensity of the regularization value for the J energy term """ # Compute DCA scores using Pseudolikelihood maximization algorithm plmdca_inst = plmdca.PlmDCA( self.cell_line_path+"/sequences.fa", 'protein', seqid = 0.99, lambda_h = lambda_h, lambda_J = lambda_J, num_threads = nproc, max_iterations = 1000) print('Training started') # Train an get coupling and fields as lists self.fields_and_couplings = plmdca_inst.get_fields_and_couplings_from_backend() self.plmdca_inst=plmdca_inst fields_and_couplings = self.fields_and_couplings couplings = plmdca_inst.get_couplings_no_gap_state(fields_and_couplings) #Reshape couplings and fields to a working format # J should be shaped (len(exp)*5+1,len(exp)*5+1,20,20) # h should be shaped (len(exp)*5+1,20) L = plmdca_inst._get_num_and_len_of_seqs()[1] q = 21 self.L=L self.q=q qm1 = q - 1 J=np.zeros((L,L,qm1,qm1)) fields_all = fields_and_couplings[:L * q] h = list() for i in range(L): for j in range(i + 1, L): start_indx = int(((L * (L - 1)/2) - (L - i) * ((L-i)-1)/2 + j - i - 1) * qm1 * qm1) end_indx = start_indx + qm1 * qm1 couplings_ij = couplings[start_indx:end_indx] couplings_ij = np.reshape(couplings_ij, (qm1,qm1)) J[i,j]=couplings_ij J[j,i]=couplings_ij h.append(fields_all[i * q:(i+1)*q]) h=np.array(h) print('Training finished') print('J and H produced') self.h=h self.J=J h_and_J={} h_and_J['h']=h h_and_J['J']=J h_and_J['J_flat']=couplings #Save fields and couplings with open(self.cell_line_path+'/h_and_J.npy', 'wb') as f: np.save(f, h_and_J)
[docs] def test_set(self,chr=1,silent=False): R""" Predicts and outputs the genomic annotations for chromosome X Args: chr (int, required): Chromosome to extract input data fro the D-nodes silent (bool, optional): Avoid printing information Returns: array (size of chromosome,5*number of unique experiments) D-node input data """ if silent==False:print('Test set for chromosome: ',chr) if chr!='X': types=["A1" for i in range(self.chrm_size[chr-1])] else: types=["A1" for i in range(self.chrm_size[-1])] int_types=np.array(list(map(self.TYPE_TO_INT.get, types))) unique=np.loadtxt(self.cell_line_path+'/unique_exp.txt',dtype=str) if unique.shape==(): unique=[unique] #Load each track and average over all_averages=[] for u in unique: reps=[] for i in glob.glob(self.cell_line_path+'/'+str(u)+'*'): tmp=[] try: tmp=np.loadtxt(i+'/chr'+str(chr)+'.track',skiprows=3)[:,2] reps.append(tmp) except: if silent==False:print(i,' failed with at least one chromosome') reps=np.array(reps) ave_reps=np.round(np.mean(reps,axis=0)) all_averages.append(ave_reps) all_averages=np.array(all_averages) chr_averages=self.build_state_vector(int_types,all_averages)-1 return chr_averages[1:]+1
[docs] def prediction_single_chrom(self,chr=1,h_and_J_file=None,energies=False,probabilities=False): R""" Predicts and outputs the genomic annotations for chromosome X Args: chr (int, optional): Chromosome to predict h_and_J_file (str, optional): Model energy term file path Returns: array (size of chromosome) Predicted annotations """ print('Predicting subcompartments for chromosome: ',chr) #Load fields and couplings from file or memory if h_and_J_file!=None: with open(h_and_J_file, 'rb') as f: h_and_J = np.load(f, allow_pickle=True) h_and_J = h_and_J.item() self.h=h_and_J['h'] self.J=h_and_J['J'] types=["A1" for i in range(self.chrm_size[chr-1])] int_types=np.array(list(map(self.TYPE_TO_INT.get, types))) unique=np.loadtxt(self.cell_line_path+'/unique_exp.txt',dtype=str) if unique.shape==(): unique=[unique] #Load each track and average over all_averages=[] for u in unique: reps=[] for i in glob.glob(self.cell_line_path+'/'+str(u)+'*'): tmp=[] try: tmp=np.loadtxt(i+'/chr'+str(chr)+'.track',skiprows=3)[:,2] reps.append(tmp) except: print(i,' failed with at least one chromosome') reps=np.array(reps) ave_reps=np.round(np.mean(reps,axis=0)) all_averages.append(ave_reps) all_averages=np.array(all_averages) self.chr_averages=self.build_state_vector(int_types,all_averages)-1 #Prediction predict_type=np.zeros(self.chr_averages.shape[1]) self.L=len(self.h) enes=[] probs=[] for loci in range(self.chr_averages.shape[1]): energy_val=[] energy=0 #Check energy for all possible 5 states (A1,A2,B1,B2,B3) for state in range(5): tmp_energy=-self.h[0,state] for j in range(1,self.L): s2=int(self.chr_averages[j,loci]) tmp_energy=tmp_energy-self.J[0,j,state,s2] energy_val.append(energy+tmp_energy) energy_val=np.array(energy_val) enes.append(energy_val) probs.append(np.exp(-energy_val)/np.sum(np.exp(-energy_val))) #Select the state with the lowest energy predict_type[loci]=np.where(energy_val==np.min(energy_val))[0][0] enes=np.array(enes) probs=np.array(probs) #Add gaps from UCSC database try: gaps=np.loadtxt(self.path_to_share+'/gaps/'+self.assembly+'_gaps.txt',dtype=str) chr_gaps_ndx=np.where((gaps[:,0]=='chr'+str(chr)))[0] for gp in chr_gaps_ndx: init_loci=np.round(gaps[gp,1].astype(float)/(self.res*1000)).astype(int) end_loci=np.round(gaps[gp,2].astype(float)/(self.res*1000)).astype(int) predict_type[init_loci:end_loci]=6 enes[init_loci:end_loci]=0 probs[init_loci:end_loci]=0 except: print('Gaps not found, not included in predictions') if energies==True: if probabilities==True: return predict_type, enes, probs else: return predict_type, enes else: if probabilities==True: return predict_type, probs else: return predict_type
[docs] def prediction_X(self,chr='X',h_and_J_file=None,energies=False,probabilities=False): R""" Predicts and outputs the genomic annotations for chromosome X Args: chr (int, optional): Chromosome to predict h_and_J_file (str, optional): Model energy term file path Returns: array (size of chromosome) Predicted annotations """ print('Predicting subcompartments for chromosome: ',chr) if h_and_J_file!=None: with open(h_and_J_file, 'rb') as f: h_and_J = np.load(f, allow_pickle=True) h_and_J = h_and_J.item() self.h=h_and_J['h'] self.J=h_and_J['J'] types=["A1" for i in range(self.chrm_size[-1])] int_types=np.array(list(map(self.TYPE_TO_INT.get, types))) unique=np.loadtxt(self.cell_line_path+'/unique_exp.txt',dtype=str) if unique.shape==(): unique=[unique] #Load each track and average over all_averages=[] for u in unique: reps=[] for i in glob.glob(self.cell_line_path+'/'+str(u)+'*'): tmp=[] try: tmp=np.loadtxt(i+'/chrX.track',skiprows=3)[:,2] reps.append(tmp) except: print(i,' failed with at least one chromosome') reps=np.array(reps) ave_reps=np.round(np.mean(reps,axis=0)) all_averages.append(ave_reps) all_averages=np.array(all_averages) self.chr_averages=self.build_state_vector(int_types,all_averages)-1 #Prediction predict_type=np.zeros(self.chr_averages.shape[1]) self.L=len(self.h) enes=[] probs=[] for loci in range(self.chr_averages.shape[1]): energy_val=[] energy=0 #Check energy for all possible 5 states (A1,A2,B1,B2,B3) for state in range(5): tmp_energy=-self.h[0,state] for j in range(1,self.L): s2=int(self.chr_averages[j,loci]) tmp_energy=tmp_energy-self.J[0,j,state,s2] energy_val.append(energy+tmp_energy) energy_val=np.array(energy_val) enes.append(energy_val) probs.append(np.exp(-energy_val)/np.sum(np.exp(-energy_val))) #Select the state with the lowest energy predict_type[loci]=np.where(energy_val==np.min(energy_val))[0][0] enes=np.array(enes) probs=np.array(probs) #Add gaps from UCSC database try: print('Resolution:', self.res) gaps=np.loadtxt(self.path_to_share+'/gaps/'+self.assembly+'_gaps.txt',dtype=str) chr_gaps_ndx=np.where((gaps[:,0]=='chr'+str(chr)))[0] for gp in chr_gaps_ndx: init_loci=np.round(gaps[gp,1].astype(float)/(self.res*1000)).astype(int) end_loci=np.round(gaps[gp,2].astype(float)/(self.res*1000)).astype(int) predict_type[init_loci:end_loci]=6 enes[init_loci:end_loci]=0 probs[init_loci:end_loci]=0 except: print('Gaps not found, not included in predictions') if energies==True: if probabilities==True: return predict_type, enes, probs else: return predict_type, enes else: if probabilities==True: return predict_type, probs else: return predict_type
[docs] def write_bed(self,out_file='predictions', compartments=True,subcompartments=True): R""" Formats and saves predictions on BED format Args: out_file (str, optional): Folder/Path to save the prediction results save_subcompartments (bool, optional): Whether generate files with subcompartment annotations save_compartments (bool, optional): Whether generate files with compartment annotations Returns: predictions_subcompartments (dict), predictions_compartments (dict) Predicted subcompartment annotations and compartment annotations on dictionaries organized by chromosomes """ def get_color(s_id): return info[s_id][1] def get_num(s_id): return str(info[s_id][0]) def get_bed_file_line(chromosome, position, c_id): if self.chrom_l['chr'+str(chromosome)]>position*resolution: return "chr" + str(chromosome) + "\t" + str((position - 1) * resolution) + "\t" + str( position * resolution) + "\t" + c_id + "\t" + get_num(c_id) + "\t.\t" + str( (position - 1) * resolution) + "\t" + str( position * resolution) + "\t " + get_color(c_id) else: if self.chrom_l['chr'+str(chromosome)]>(position-1)*resolution: return "chr" + str(chromosome) + "\t" + str((position - 1) * resolution) + "\t" + str( self.chrom_l['chr'+str(chromosome)]) + "\t" + c_id + "\t" + get_num(c_id) + "\t.\t" + str( (position - 1) * resolution) + "\t" + str( self.chrom_l['chr'+str(chromosome)]) + "\t " + get_color(c_id) else: return '' def save_bed(type): folder=self.cell_line_path+'/predictions' if type=='c': ext='_compartments' else: ext='_subcompartments' all_data = {} for chrom_index in range(1,len(self.chrm_size)): try: filename = folder + '/chr' + str(chrom_index) + ext + '.txt' data = [] with open(filename) as file: for line in file: items = line.split() if len(items) == 2: data.append((int(items[0]), items[1])) else: print(line) all_data[chrom_index] = data except: print('Didnt found chrom:',chrom_index) try: filename = folder + '/chrX' + ext + '.txt' data = [] with open(filename) as file: for line in file: items = line.split() if len(items) == 2: data.append((int(items[0]), items[1])) else: print(line) all_data['X'] = data except: print('Didnt found chrom X:',chrom_index) with open(out_file+ext+'.bed', 'w') as f: header='# Experiments used for this prediction: ' for exp in exps: header=header+exp+' ' f.write(header+'\n') for chrom_index in range(1,len(self.chrm_size)): try: for (pos, sID) in all_data[chrom_index]: f.write(get_bed_file_line(chrom_index, pos, sID) + '\n') except: pass try: for (pos, sID) in all_data['X']: f.write(get_bed_file_line('X', pos, sID) + '\n') except: pass unique=np.loadtxt(self.cell_line_path+'/unique_exp.txt',dtype=str) for u in unique: os.system('cat '+self.cell_line_path+'/'+u+'*/exp_accession.txt | awk \'{print $1}\' | sort | uniq >> '+self.cell_line_path+'/exps_used.dat') exps=np.loadtxt(self.cell_line_path+'/exps_used.dat',dtype=str) resolution=self.res*1000 info = { "NA": (0, "255,255,255"), "A1": (1, "245,47,47"), "A2": (2, "145,47,47"), "B1": (3, "47,187,224"), "B2": (4, "47,47,224"), "B3": (5, "47,47,139"), "B4": (6, "75,0,130"), "A": (1, "245,47,47"), "B": (2, "47,187,224"), } if compartments==True: save_bed('c') if subcompartments==True: save_bed('s')
[docs] def prediction_all_chrm(self,path=None,save_subcompartments=True,save_compartments=True,energies=False,probabilities=False): R""" Predicts and outputs the genomic annotations for all the chromosomes Args: path (str, optional): Folder/Path to save the prediction results save_subcompartments (bool, optional): Whether generate files with subcompartment annotations for each chromosomes save_compartments (bool, optional): Whether generate files with compartment annotations for each chromosomes Returns: predictions_subcompartments (dict), predictions_compartments (dict) Predicted subcompartment annotations and compartment annotations on dictionaries organized by chromosomes """ if path==None: path=self.cell_line_path+'/predictions' print('Saving prediction in:',path) #Define translation dictionaries between states and subcompartments TYPE_TO_INT = {'A1':0,'A2':1,'B1':2,'B2':3,'B3':4,'B4':5,'NA':6} INT_TO_TYPE = {TYPE_TO_INT[k]:k for k in TYPE_TO_INT.keys()} INT_TO_TYPE_AB = {0:'A', 1:'A', 2:'B', 3:'B', 4:'B', 5:'B', 6:'NA'} os.system('mkdir '+path) #Predict and save data for chromosomes predictions_subcompartments={} predictions_compartments={} energies_chr={} probabilities_chr={} #Generate desired outputs (energies, probabilities and/or annoations) for chr in range(1,len(self.chrm_size)): if energies==True: if probabilities==True: pred, enes, probs=self.prediction_single_chrom(chr,h_and_J_file=self.cell_line_path+'/h_and_J.npy',energies=True,probabilities=True) energies_chr[chr]=enes probabilities_chr[chr]=probs else: pred, enes=self.prediction_single_chrom(chr,h_and_J_file=self.cell_line_path+'/h_and_J.npy',energies=True,probabilities=False) energies_chr[chr]=enes else: if probabilities==True: pred, probs=self.prediction_single_chrom(chr,h_and_J_file=self.cell_line_path+'/h_and_J.npy',energies=False,probabilities=True) probabilities_chr[chr]=probs else: pred=self.prediction_single_chrom(chr,h_and_J_file=self.cell_line_path+'/h_and_J.npy',energies=False,probabilities=False) types_pyME_sub=np.array(list(map(INT_TO_TYPE.get, pred))) types_pyME_AB=np.array(list(map(INT_TO_TYPE_AB.get, pred))) #Save data if save_subcompartments==True: with open(path+'/chr'+str(chr)+'_subcompartments.txt','w',encoding = 'utf-8') as f: for i in range(len(types_pyME_sub)): f.write("{} {}\n".format(i+1,types_pyME_sub[i])) if save_compartments==True: with open(path+'/chr'+str(chr)+'_compartments.txt','w',encoding = 'utf-8') as f: for i in range(len(types_pyME_AB)): f.write("{} {}\n".format(i+1,types_pyME_AB[i])) predictions_subcompartments[chr]=types_pyME_sub predictions_compartments[chr]=types_pyME_AB #Chromosome X chr='X' pred=self.prediction_X(h_and_J_file=self.cell_line_path+'/h_and_J.npy') types_pyME_sub=np.array(list(map(INT_TO_TYPE.get, pred))) types_pyME_AB=np.array(list(map(INT_TO_TYPE_AB.get, pred))) #Save data if save_subcompartments==True: with open(path+'/chr'+str(chr)+'_subcompartments.txt','w',encoding = 'utf-8') as f: for i in range(len(types_pyME_sub)): f.write("{} {}\n".format(i+1,types_pyME_sub[i])) if save_compartments==True: with open(path+'/chr'+str(chr)+'_compartments.txt','w',encoding = 'utf-8') as f: for i in range(len(types_pyME_AB)): f.write("{} {}\n".format(i+1,types_pyME_AB[i])) #Write output bed file self.write_bed(out_file=path+'/predictions', compartments=save_compartments,subcompartments=save_subcompartments) #Return desired outputs if energies==True: if probabilities==True: return predictions_subcompartments, predictions_compartments, energies_chr, probabilities_chr else: return predictions_subcompartments, predictions_compartments, energies_chr else: if probabilities==True: return predictions_subcompartments, predictions_compartments, probabilities_chr else: return predictions_subcompartments, predictions_compartments
[docs] def printHeader(self): print('{:^96s}'.format("****************************************************************************************")) print('{:^96s}'.format("**** *** *** *** *** *** *** *** PyMEGABASE-1.0.0 *** *** *** *** *** *** *** ****")) print('{:^96s}'.format("**** *** *** *** *** *** *** *** PyMEGABASE-1.0.0 *** *** *** *** *** *** *** ****")) print('{:^96s}'.format("**** *** *** *** *** *** *** *** PyMEGABASE-1.0.0 *** *** *** *** *** *** *** ****")) print('{:^96s}'.format("****************************************************************************************")) print('') print('{:^96s}'.format("The PyMEGABASE class performs the prediction of genomic annotations")) print('{:^96s}'.format("based on 1D data tracks of Chip-Seq and RNA-Seq. The input data is ")) print('{:^96s}'.format("obtained from ENCODE data base.")) print('{:^96s}'.format("PyMEGABASE description is described in: TBD")) print('') print('{:^96s}'.format("This package is the product of contributions from a number of people, including:")) print('{:^96s}'.format("Esteban Dodero-Rojas, Antonio Oliveira, Vinícius Contessoto,")) print('{:^96s}'.format("Ryan Cheng, and, Jose Onuchic")) print('{:^96s}'.format("Rice University")) print('') print('{:^96s}'.format("****************************************************************************************"))
[docs]class PyMEGABASE_legacy: R""" The :class:`~.PyMEGABASE` class performs genomic annotations . The :class:`~.PyMEGABASE` sets the environment to generate prediction of genomic annotations. Args: cell_line (str, required): Name of target cell type assembly (str, required): Reference assembly of target cell line ('hg19','GRCh38','mm10') organism (str, required): Target cell type organism (str, required): signal_type (str, required): Input files signal type ('signal p-value', 'fold change over control', ...) ref_cell_line_path (str, optional): Folder/Path to place reference/training data ('tmp_meta') cell_line_path (str, optional): Folder/Path to place target cell data types_path (str, optional): Folder/Path where the genomic annotations are located histones (bool, required): Whether to use Histone Modification data from the ENCODE databank for prediction tf (bool, required): Whether to use Transcription Factor data from the ENCODE databank for prediction small_rna (bool, required): Whether to use Small RNA-Seq data from the ENCODE databank for prediction total_rna (bool, required): Whether to use Total RNA-seq data from the ENCODE databank for prediction n_states (int, optional): Number of states for the D-nodes extra_filter (str, optional): Add filter to the fetching data url to download cell type data res (int, optional): Resolution for genomic annotations calling in kilobasepairs (5, 50, 100) chromosome_sizes (list, optional): Chromosome sizes based on the reference genome assembly - required for non-human assemblies """ def __init__(self, cell_line='GM12878', assembly='hg19',organism='human',signal_type='signal p-value', ref_cell_line_path='tmp_meta',cell_line_path=None,types_path='PyMEGABASE/types', histones=True,tf=False,atac=False,small_rna=False,total_rna=False,n_states=19, extra_filter='',res=50,chromosome_sizes=None,AB=False): self.printHeader() self.cell_line=cell_line self.assembly=assembly self.signal_type=signal_type if cell_line_path==None: self.cell_line_path=cell_line+'_'+assembly else: self.cell_line_path=cell_line_path self.ref_cell_line='GM12878' self.ref_assembly='hg19' self.ref_cell_line_path=ref_cell_line_path self.types_path=types_path self.hist=histones self.tf=tf self.atac=atac self.small_rna=small_rna self.total_rna=total_rna self.n_states=n_states self.extra_filter=extra_filter self.res=res self.organism=organism.lower() #Define tranlation dictinaries between aminoacids, intensity of Chip-seq signal and self.RES_TO_INT = { 'A': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'I': 8, 'K': 9, 'L': 10, 'M': 11, 'N': 12, 'P': 13, 'Q': 14, 'R': 15, 'S': 16, 'T': 17, 'V': 18, 'W':19, 'Y':20, '-':21, '.':21, '~':21, } self.INT_TO_RES = {self.RES_TO_INT[k]:k for k in self.RES_TO_INT.keys()} self.TYPE_TO_INT = {'A1':0,'A2':1,'B1':2,'B2':3,'B3':4,'B4':5,'NA':6} self.INT_TO_TYPE = {self.TYPE_TO_INT[k]:k for k in self.TYPE_TO_INT.keys()} if assembly=='GRCh38': self.chrm_size = np.array([4980,4844,3966,3805,3631,3417,3187,2903,2768,2676,2702,2666,2288,2141,2040,1807,1666,1608,1173,1289,935,1017,3121])*50/self.res elif assembly=='hg19': self.chrm_size = np.array([4990,4865,3964,3828,3620,3424,3184,2931,2826,2712,2703,2679,2307,2148,2052,1810,1626,1564,1184,1262,964,1028,3105])*50/self.res else: if chromosome_sizes == None: raise ValueError("Need to specify chromosome sizes for assembly: {}".format(assembly)) self.chrm_size = np.array(chromosome_sizes)/(self.res*1000) self.chrm_size=np.round(self.chrm_size+0.1).astype(int) self.ref_chrm_size = np.array([4990,4865,3964,3828,3620,3424,3184,2931,2826,2712,2703,2679,2307,2148,2052,1810,1626,1564,1184,1262,964,1028,1028])*50/self.res self.ref_chrm_size=np.round(self.ref_chrm_size+0.1).astype(int) url='https://www.encodeproject.org/metadata/?type=Experiment&' if self.hist==True: url=url+'assay_title=Histone+ChIP-seq' if self.tf==True: url=url+'&assay_title=TF+ChIP-seq' if self.atac==True: url=url+'&assay_title=ATAC-seq' if self.small_rna==True: url=url+'&assay_title=small+RNA-seq' if self.total_rna==True: url=url+'&assay_title=total+RNA-seq' self.url_ref=url+'&biosample_ontology.term_name='+self.ref_cell_line+'&files.file_type=bigWig' r = requests.get(self.url_ref) content=str(r.content) experiments=[] for k in content.split('\\n')[:-1]: l=k.split('\\t') if l[5]==self.ref_assembly and l[4]==self.signal_type and l[7]=='Histone ChIP-seq': experiments.append(l[22]) elif l[5]==self.ref_assembly and l[4]==self.signal_type and l[7]=='ATAC-seq': experiments.append(l[7]) elif l[5]==self.ref_assembly and l[4]==self.signal_type and l[7]=='TF ChIP-seq': experiments.append(l[22]) elif l[5]==self.ref_assembly and l[4]=='plus strand signal of all reads' and l[7]=='small RNA-seq': experiments.append('plus-small-RNA-seq') elif l[5]==self.ref_assembly and l[4]=='plus strand signal of all reads' and l[7]=='total RNA-seq': experiments.append('plus-total-RNA-seq') elif l[5]==self.ref_assembly and l[4]=='minus strand signal of all reads' and l[7]=='small RNA-seq': experiments.append('minus-small-RNA-seq') elif l[5]==self.ref_assembly and l[4]=='minus strand signal of all reads' and l[7]=='total RNA-seq': experiments.append('minus-total-RNA-seq') self.experiments_unique=np.unique(experiments) self.es_unique=[] for e in self.experiments_unique: self.es_unique.append(e.split('-human')[0]) print('Selected cell line to predict: '+self.cell_line) print('Selected assembly: '+self.assembly) print('Selected signal type: '+self.signal_type) print('Selected organism: '+self.organism)
[docs] def process_replica(self,line,cell_line_path,chrm_size): R""" Preprocess function for each replica Args: line (lsit, required): Information about the replica: name, ENCODE id and replica id cell_line_path (str, required): Path to target cell type data chrm_size (list, required): Chromosome sizes based on the assembly """ text=line.split()[0] exp=line.split()[1] count=line.split()[2] #Experiment directory exp_path=cell_line_path+'/'+exp+'_'+str(count) if 'human' in exp.split('-'): ext='human' else: ext=self.organism if exp.split('-'+ext)[0] in self.es_unique: try: os.mkdir(exp_path) except: print('Directory ',exp_path,' already exist') with open(exp_path+'/exp_name.txt', 'w') as f: f.write(text+' '+exp+'\n') #Load data from server try: bw = pyBigWig.open("https://www.encodeproject.org/files/"+text+"/@@download/"+text+".bigWig") for chr in range(1,len(chrm_size)): signal = bw.stats("chr"+str(chr), type="mean", nBins=chrm_size[chr-1]) #Process signal and binning signal=np.array(signal) per=np.percentile(signal[signal!=None],95) per_min=np.percentile(signal[signal!=None],5) signal[signal==None]=per_min signal[signal<per_min]=per_min signal[signal>per]=per signal=signal-per_min signal=signal*self.n_states/(per-per_min) signal=np.round(signal.astype(float)).astype(int) #Save data with open(exp_path+'/chr'+str(chr)+'.track', 'w') as f: f.write("#chromosome file number of beads\n"+str(chrm_size[chr-1])) f.write("#\n") f.write("#bead, signal, discrete signal\n") for i in range(len(signal)): f.write(str(i)+" "+str(signal[i])+" "+str(signal[i].astype(int))+"\n") chr='X' signal = bw.stats("chr"+chr, type="mean", nBins=chrm_size[-1]) #Process signal and binning signal=np.array(signal) per=np.percentile(signal[signal!=None],95) per_min=np.percentile(signal[signal!=None],5) signal[signal==None]=per_min signal[signal<per_min]=per_min signal[signal>per]=per signal=signal-per_min signal=signal*self.n_states/(per-per_min) signal=np.round(signal.astype(float)).astype(int) #Save data with open(exp_path+'/chr'+chr+'.track', 'w') as f: f.write("#chromosome file number of beads\n"+str(chrm_size[-1])) f.write("#\n") f.write("#bead, signal, discrete signal\n") for i in range(len(signal)): f.write(str(i)+" "+str(signal[i])+" "+str(signal[i].astype(int))+"\n") return exp except: print('This experiment was incomplete:',text,'\nit will not be used.')
[docs] def download_and_process_cell_line_data(self,nproc=10,all_exp=True): R""" Download and preprocess target cell data for the D-nodes Args: nproc (int, required): Number of processors dedicated to download and process data all_exp (bool, optional): Download and process all replicas for each experiment """ try: os.mkdir(self.cell_line_path) except: print('Directory ',self.cell_line_path,' already exist') print('Deleting path and creating it anew') shutil.rmtree(self.cell_line_path) os.mkdir(self.cell_line_path) url='https://www.encodeproject.org/metadata/?type=Experiment&'+self.extra_filter if self.hist==True: url=url+'assay_title=Histone+ChIP-seq' if self.tf==True: url=url+'&assay_title=TF+ChIP-seq' if self.atac==True: url=url+'&assay_title=ATAC-seq' if self.small_rna==True: url=url+'&assay_title=small+RNA-seq' if self.total_rna==True: url=url+'&assay_title=total+RNA-seq' self.url_cell_line=url+'&biosample_ontology.term_name='+self.cell_line+'&files.file_type=bigWig' r = requests.get(self.url_cell_line) content=str(r.content) experiments=[] with open(self.cell_line_path+'/meta.txt', 'w') as f: for k in content.split('\\n')[:-1]: l=k.split('\\t') if l[5]==self.assembly and l[4]==self.signal_type and l[7]=='Histone ChIP-seq': f.write(l[0]+' '+l[22]+' '+l[5]+' '+l[4]+'\n') elif l[5]==self.assembly and l[4]==self.signal_type and l[7]=='ATAC-seq': f.write(l[0]+' '+l[7]+' '+l[5]+' '+l[4]+'\n') elif l[5]==self.assembly and l[4]==self.signal_type and l[7]=='TF ChIP-seq': f.write(l[0]+' '+l[22]+' '+l[5]+' '+l[4]+'\n') elif l[5]==self.assembly and l[4]=='plus strand signal of all reads' and l[7]=='small RNA-seq': f.write(l[0]+' '+'plus-small-RNA-seq'+' '+l[5]+' '+l[4]+'\n') elif l[5]==self.assembly and l[4]=='plus strand signal of all reads' and l[7]=='total RNA-seq': f.write(l[0]+' '+'plus-total-RNA-seq'+' '+l[5]+' '+l[4]+'\n') elif l[5]==self.assembly and l[4]=='minus strand signal of all reads' and l[7]=='small RNA-seq': f.write(l[0]+' '+'minus-small-RNA-seq'+' '+l[5]+' '+l[4]+'\n') elif l[5]==self.assembly and l[4]=='minus strand signal of all reads' and l[7]=='total RNA-seq': f.write(l[0]+' '+'minus-total-RNA-seq'+' '+l[5]+' '+l[4]+'\n') count=0 self.exp_found={} exp_name='' list_names=[] with open(self.cell_line_path+'/meta.txt') as fp: Lines = fp.readlines() for line in Lines: count += 1 text=line.split()[0] exp=line.split()[1] #Register if experiment is new if exp!=exp_name: try: count=self.exp_found[exp]+1 except: count=1 exp_name=exp self.exp_found[exp]=count if all_exp==True: list_names.append(text+' '+exp+' '+str(count)) else: if count==1: list_names.append(text+' '+exp+' '+str(count)) print('Number of replicas:', len(list_names)) self.successful_exp = Parallel(n_jobs=nproc)(delayed(self.process_replica)(list_names[i],self.cell_line_path,self.chrm_size) for i in tqdm(range(len(list_names)), desc="Process replicas",bar_format='{l_bar}{bar:40}{r_bar}{bar:-10b}')) self.successful_exp= [i for i in self.successful_exp if i] self.successful_unique_exp=np.unique(self.successful_exp) self.su_unique=[] for e in self.successful_unique_exp: self.su_unique.append(e.split('-'+self.organism)[0]) print('Experiments found in ENCODE for the selected cell line:') self.unique=[] with open(self.cell_line_path+'/unique_exp.txt', 'w') as f: for e in self.experiments_unique: if e.split('-human')[0] in self.su_unique: f.write(e.split('-human')[0]+'\n') print(e.split('-human')[0]) self.unique.append(e) if len(self.unique) > 4: print('Predictions would use: ',len(self.unique),' experiments') else: print('This sample only has ',len(self.unique),' experiments. We do not recommend prediction on samples with less than 5 different experiments.')
[docs] def download_and_process_ref_data(self,nproc,all_exp=True): R""" Download and preprocess reference data for the D-nodes Args: nproc (int, required): Number of processors dedicated to download and process data all_exp (bool, optional): Download and process all replicas for each experiment """ try: os.mkdir(self.ref_cell_line_path) except: print('Directory ',self.ref_cell_line_path,' already exist') print('Deleting path and creating it anew') shutil.rmtree(self.ref_cell_line_path) os.mkdir(self.ref_cell_line_path) url='https://www.encodeproject.org/metadata/?type=Experiment&' if self.hist==True: url=url+'assay_title=Histone+ChIP-seq' if self.tf==True: url=url+'&assay_title=TF+ChIP-seq' if self.atac==True: url=url+'&assay_title=ATAC-seq' if self.small_rna==True: url=url+'&assay_title=small+RNA-seq' if self.total_rna==True: url=url+'&assay_title=total+RNA-seq' self.url_ref=url+'&biosample_ontology.term_name='+self.ref_cell_line+'&files.file_type=bigWig' r = requests.get(self.url_ref) content=str(r.content) experiments=[] with open(self.ref_cell_line_path+'/meta.txt', 'w') as f: for k in content.split('\\n')[:-1]: l=k.split('\\t') if l[5]==self.ref_assembly and l[4]==self.signal_type and l[7]=='Histone ChIP-seq': f.write(l[0]+' '+l[22]+' '+l[5]+' '+l[4]+'\n') elif l[5]==self.ref_assembly and l[4]==self.signal_type and l[7]=='ATAC-seq': f.write(l[0]+' '+l[7]+' '+l[5]+' '+l[4]+'\n') elif l[5]==self.ref_assembly and l[4]==self.signal_type and l[7]=='TF ChIP-seq': f.write(l[0]+' '+l[22]+' '+l[5]+' '+l[4]+'\n') elif l[5]==self.ref_assembly and l[4]=='plus strand signal of all reads' and l[7]=='small RNA-seq': f.write(l[0]+' '+'plus-small-RNA-seq'+' '+l[5]+' '+l[4]+'\n') elif l[5]==self.ref_assembly and l[4]=='plus strand signal of all reads' and l[7]=='total RNA-seq': f.write(l[0]+' '+'plus-total-RNA-seq'+' '+l[5]+' '+l[4]+'\n') elif l[5]==self.ref_assembly and l[4]=='minus strand signal of all reads' and l[7]=='small RNA-seq': f.write(l[0]+' '+'minus-small-RNA-seq'+' '+l[5]+' '+l[4]+'\n') elif l[5]==self.ref_assembly and l[4]=='minus strand signal of all reads' and l[7]=='total RNA-seq': f.write(l[0]+' '+'minus-total-RNA-seq'+' '+l[5]+' '+l[4]+'\n') ref_chrm_size = self.ref_chrm_size count=0 exp_found={} exp_name='' list_names=[] with open(self.ref_cell_line_path+'/meta.txt') as fp: Lines = fp.readlines() for line in Lines: count += 1 text=line.split()[0] exp=line.split()[1] #Register if experiment is new if (exp.split('-human')[0] in self.su_unique) or (text.split('-human')[0] in self.su_unique): if exp!=exp_name: try: count=exp_found[exp]+1 except: count=1 exp_name=exp exp_found[exp]=count if all_exp==True: list_names.append(text+' '+exp+' '+str(count)) else: if count==1: list_names.append(text+' '+exp+' '+str(count)) print('Number of replicas:', len(list_names)) results = Parallel(n_jobs=nproc)(delayed(self.process_replica)(list_names[i],self.ref_cell_line_path,ref_chrm_size) for i in tqdm(range(len(list_names)), desc="Process replicas",bar_format='{l_bar}{bar:40}{r_bar}{bar:-10b}')) print('Prediction will use:') with open(self.ref_cell_line_path+'/unique_exp.txt', 'w') as f: for e in self.unique: if e in exp_found.keys(): f.write(e.split('-'+self.organism)[0]+'\n') print(e.split('-'+self.organism)[0])
[docs] def extra_track(self,experiment,bw_file): R""" Function to introduce custom tracks Args: experiment (str, required): Name of the experiment bw_file (str, required): Path to the custom track """ if not self.organism in experiment: experiment=experiment+'-'+self.organism if not experiment.split('-'+self.organism)[0] in self.es_unique: print('This experiment is not found in the training set, then cannot be used.') return 0 if experiment in self.exp_found.keys(): print('This target has replicas already') print('The new track will be addded as a different replica of the same target') #Experiment directory count=self.exp_found[experiment]+1 else: print('This target has no replicas') print('The new track will be added a the first replica of the target') #Experiment directory count=1 exp_path=self.cell_line_path+'/'+experiment+'_'+str(count) print(exp_path) try: os.mkdir(exp_path) except: print('Directory ',exp_path,' already exist') with open(exp_path+'/exp_name.txt', 'w') as f: f.write(experiment+' '+experiment+'\n') #Load data from track try: bw = pyBigWig.open(bw_file) for chr in range(1,len(self.chrm_size)): signal = bw.stats("chr"+str(chr), type="mean", nBins=self.chrm_size[chr-1]) #Process signal and binning signal=np.array(signal) per=np.percentile(signal[signal!=None],95) per_min=np.percentile(signal[signal!=None],5) signal[signal==None]=per_min signal[signal<per_min]=per_min signal[signal>per]=per signal=signal-per_min signal=signal*self.n_states/(per-per_min) signal=np.round(signal.astype(float)).astype(int) #Save data with open(exp_path+'/chr'+str(chr)+'.track', 'w') as f: f.write("#chromosome file number of beads\n"+str(self.chrm_size[chr-1])) f.write("#\n") f.write("#bead, signal, discrete signal\n") for i in range(len(signal)): f.write(str(i)+" "+str(signal[i])+" "+str(signal[i].astype(int))+"\n") chr='X' signal = bw.stats("chr"+chr, type="mean", nBins=self.chrm_size[-1]) #Process signal and binning signal=np.array(signal) per=np.percentile(signal[signal!=None],95) per_min=np.percentile(signal[signal!=None],5) signal[signal==None]=per_min signal[signal<per_min]=per_min signal[signal>per]=per signal=signal-per_min signal=signal*self.n_states/(per-per_min) signal=np.round(signal.astype(float)).astype(int) #Save data with open(exp_path+'/chr'+chr+'.track', 'w') as f: f.write("#chromosome file number of beads\n"+str(self.chrm_size[-1])) f.write("#\n") f.write("#bead, signal, discrete signal\n") for i in range(len(signal)): f.write(str(i)+" "+str(signal[i])+" "+str(signal[i].astype(int))+"\n") if experiment in self.exp_found.keys(): self.exp_found[experiment]=self.exp_found[experiment]+1 else: self.exp_found[experiment]=1 self.successful_unique_exp=np.append(self.successful_unique_exp,experiment) self.su_unique.append(experiment.split('-'+self.organism)[0]) with open(self.cell_line_path+'/unique_exp.txt', 'a') as f: f.write(experiment.split('-'+self.organism)[0]+'\n') self.unique.append(experiment) return experiment except: print('This experiment was incomplete:',experiment,'\nit will not be used.')
[docs] def build_state_vector(self,int_types,all_averages): R""" Builds the set of state vectors used on the training process Args: int_types (list, required): Genomic annotations all_averages (list, required): D-node data """ #Aggregate tracks by with data from other loci l-2, l-1, l, l+1, l+2 #l+1 shift1=np.copy(all_averages) shift1[:,:-1]=all_averages[:,1:] shift1[:,-1]=np.zeros(len(shift1[:,-1])) #l+2 shift2=np.copy(all_averages) shift2[:,:-1]=shift1[:,1:] shift2[:,-1]=np.zeros(len(shift1[:,-1])) #l-1 shift_1=np.copy(all_averages) shift_1[:,1:]=all_averages[:,:-1] shift_1[:,0]=np.zeros(len(shift_1[:,-1])) #l-2 shift_2=np.copy(all_averages) shift_2[:,1:]=shift_1[:,:-1] shift_2[:,0]=np.zeros(len(shift1[:,-1])) #Stack shifted tracks and subtypes labels all_averages=np.vstack((int_types,shift_2,shift_1,all_averages,shift1,shift2)) #To train, we exclude the centromers and B4 subcompartments ndx=(all_averages[0,:]!=6) * (all_averages[0,:]!=5) all_averages=all_averages[:,ndx] all_averages=all_averages+1 return all_averages
[docs] def get_tmatrix(self,chrms,silent=False): R""" Extract the training data Args: chrms (list, optional): Set of chromosomes from the reference data used as the training set silent (bool, optional): Silence outputs """ #Load types from Rao et al 2014 paper types=[] for chr in chrms: if self.res==50: types.append(np.loadtxt(self.types_path+'/chr'+str(chr)+'_beads.txt.original',delimiter=' ',dtype=str)[:,1]) elif self.res==100: types.append(np.loadtxt(self.types_path+'/chr'+str(chr)+'_beads.txt.original',delimiter=' ',dtype=str)[::int(self.res/50),1]) else: tmp=list(np.loadtxt(self.types_path+'/chr'+str(chr)+'_beads.txt.original',delimiter=' ',dtype=str)) if len(tmp) < self.ref_chrm_size[chr-1]: diff=self.ref_chrm_size[chr-1] - len(tmp) for i in range( diff ): tmp.append('NA') types.append(tmp) types=np.concatenate(types) int_types=np.array(list(map(self.TYPE_TO_INT.get, types))) #Check which experiments are available to train unique=np.loadtxt(self.cell_line_path+'/unique_exp.txt',dtype=str) if unique.shape==(): unique=[unique] if silent==False:print('To train the following experiments are used:') #Load each track and average over all_averages=[] for u in unique: reps=[] if silent==False:print(u) for i in glob.glob(self.ref_cell_line_path+'/'+str(u)+'*'): tmp=[] try: for chr in chrms: _tmp=np.loadtxt(i+'/chr'+str(chr)+'.track',skiprows=3)[:,2] tmp.append(_tmp) tmp=np.concatenate(tmp) reps.append(tmp) except: if silent==False:print(i,' failed with at least one chromosome') reps=np.array(reps) ave_reps=np.round(np.mean(reps,axis=0)) all_averages.append(ave_reps) all_averages=np.array(all_averages) all_averages=self.build_state_vector(int_types,all_averages) return all_averages
[docs] def filter_exp(self): R""" Performs baseline assestment on experiment baselines """ a=[] for i in range(1,3): a.append(self.test_set(chr=i,silent=True)) a=np.concatenate(a,axis=1) locus=2 good_exp=0 gexp=[] unique=np.loadtxt(self.cell_line_path+'/unique_exp.txt',dtype=str) for exper in range(len(unique)): i=exper+len(unique)*locus if (np.abs(np.mean(a[i])-np.mean(self.tmatrix[i+1]))<1) and (np.std(a[i])-np.std(self.tmatrix[i+1])<2): good_exp=good_exp+1 gexp.append(unique[exper]+'\n') else: print('Not using '+unique[exper],' to predict') #gexp=sample(gexp,8) with open(self.cell_line_path+'/unique_exp_filtered.txt','w') as f: for i in gexp: f.write(i) print('Number of suitable experiments for prediction:',good_exp) if good_exp>0: os.system('mv '+self.cell_line_path+'/unique_exp.txt '+self.cell_line_path+'/unique_exp_bu.txt') os.system('mv '+self.cell_line_path+'/unique_exp_filtered.txt '+self.cell_line_path+'/unique_exp.txt') else: print('There are no experiment suitable for the prediction')
[docs] def training_set_up(self,chrms=None,filter=True): R""" Formats data to allow the training Args: chrms (list, optional): Set of chromosomes from the reference data to use as the training set filter (bool, optional): Filter experiments based on the baseline """ if chrms==None: # We are training in odd chromosomes data if self.cell_line=='GM12878' and self.assembly=='hg19': chrms=[1,3,5,7,9,11,13,15,17,19,21] else: chrms=[i for i in range(1,23)] if filter==True: all_averages=self.get_tmatrix(chrms,silent=True) self.tmatrix=np.copy(all_averages) self.filter_exp() all_averages=self.get_tmatrix(chrms,silent=False) self.tmatrix=np.copy(all_averages) # Translate Potts states to sequences sequences=np.array(list(map(self.INT_TO_RES.get, all_averages.flatten()))).reshape(all_averages.shape) #Generate sequence file with open(self.cell_line_path+"/sequences.fa",'w',encoding = 'utf-8') as f: for i in range(len(sequences.T)): f.write('>'+str(i).zfill(4)+'\n') f.write(''.join(sequences[:,i])+'\n')
[docs] def training(self,nproc=10,lambda_h=100,lambda_J=100): R""" Performs the training of the Potts model based on the reference data Args: nproc (int, required): Number of processors used to train lambda_h (bool, optional): Value for the intensity of the regularization value for the h energy term lambda_J (float, optional): Value for the intensity of the regularization value for the J energy term Returns: array (size of chromosome,5*number of unique experiments) D-node input data """ # Compute DCA scores using Pseudolikelihood maximization algorithm plmdca_inst = plmdca.PlmDCA( self.cell_line_path+"/sequences.fa", 'protein', seqid = 0.99, lambda_h = lambda_h, lambda_J = lambda_J, num_threads = nproc, max_iterations = 1000) print('Training started') # Train an get coupling and fields as lists self.fields_and_couplings = plmdca_inst.get_fields_and_couplings_from_backend() self.plmdca_inst=plmdca_inst fields_and_couplings = self.fields_and_couplings couplings = plmdca_inst.get_couplings_no_gap_state(fields_and_couplings) fields = plmdca_inst.get_fields_no_gap_state(fields_and_couplings) #Reshape couplings and fields to a working format # J should be shaped (56,56,20,20) # h should be shaped (56,20) L = plmdca_inst._get_num_and_len_of_seqs()[1] q = 21 self.L=L self.q=q qm1 = q - 1 J=np.zeros((L,L,qm1,qm1)) fields_all = fields_and_couplings[:L * q] h = list() for i in range(L): for j in range(i + 1, L): start_indx = int(((L * (L - 1)/2) - (L - i) * ((L-i)-1)/2 + j - i - 1) * qm1 * qm1) end_indx = start_indx + qm1 * qm1 couplings_ij = couplings[start_indx:end_indx] couplings_ij = np.reshape(couplings_ij, (qm1,qm1)) J[i,j]=couplings_ij J[j,i]=couplings_ij h.append(fields_all[i * q:(i+1)*q]) h=np.array(h) print('Training finished') print('J and H produced') self.h=h self.J=J h_and_J={} h_and_J['h']=h h_and_J['J']=J h_and_J['J_flat']=couplings #Save fields and couplings with open(self.cell_line_path+'/h_and_J.npy', 'wb') as f: np.save(f, h_and_J)
[docs] def test_set(self,chr=1,silent=False): R""" Predicts and outputs the genomic annotations for chromosome X Args: chr (int, required): Chromosome to extract input data fro the D-nodes silent (bool, optional): Avoid printing information Returns: array (size of chromosome,5*number of unique experiments) D-node input data """ if silent==False:print('Test set for chromosome: ',chr) if chr!='X': types=["A1" for i in range(self.chrm_size[chr-1])] else: types=["A1" for i in range(self.chrm_size[-1])] int_types=np.array(list(map(self.TYPE_TO_INT.get, types))) unique=np.loadtxt(self.cell_line_path+'/unique_exp.txt',dtype=str) if unique.shape==(): unique=[unique] #Load each track and average over all_averages=[] for u in unique: reps=[] for i in glob.glob(self.cell_line_path+'/'+str(u)+'*'): tmp=[] try: tmp=np.loadtxt(i+'/chr'+str(chr)+'.track',skiprows=3)[:,2] reps.append(tmp) except: if silent==False:print(i,' failed with at least one chromosome') reps=np.array(reps) ave_reps=np.round(np.mean(reps,axis=0)) all_averages.append(ave_reps) all_averages=np.array(all_averages) chr_averages=self.build_state_vector(int_types,all_averages)-1 return chr_averages[1:]+1
[docs] def prediction_single_chrom(self,chr=1,h_and_J_file=None): R""" Predicts and outputs the genomic annotations for chromosome X Args: chr (int, optional): Chromosome to predict h_and_J_file (str, optional): Model energy term file path Returns: array (size of chromosome) Predicted annotations """ print('Predicting subcompartments for chromosome: ',chr) if h_and_J_file!=None: with open(h_and_J_file, 'rb') as f: h_and_J = np.load(f, allow_pickle=True) h_and_J = h_and_J.item() self.h=h_and_J['h'] self.J=h_and_J['J'] types=["A1" for i in range(self.chrm_size[chr-1])] int_types=np.array(list(map(self.TYPE_TO_INT.get, types))) unique=np.loadtxt(self.cell_line_path+'/unique_exp.txt',dtype=str) if unique.shape==(): unique=[unique] #Load each track and average over all_averages=[] for u in unique: reps=[] for i in glob.glob(self.cell_line_path+'/'+str(u)+'*'): tmp=[] try: tmp=np.loadtxt(i+'/chr'+str(chr)+'.track',skiprows=3)[:,2] reps.append(tmp) except: print(i,' failed with at least one chromosome') reps=np.array(reps) ave_reps=np.round(np.mean(reps,axis=0)) all_averages.append(ave_reps) all_averages=np.array(all_averages) self.chr_averages=self.build_state_vector(int_types,all_averages)-1 #Prediction predict_type=np.zeros(self.chr_averages.shape[1]) fails=0;r=0; self.L=len(self.h) for loci in range(self.chr_averages.shape[1]): energy_val=[] energy=0 #Check energy for all possible 5 states (A1,A2,B1,B2,B3) for state in range(5): tmp_energy=-self.h[0,state] for j in range(1,self.L): s2=int(self.chr_averages[j,loci]) tmp_energy=tmp_energy-self.J[0,j,state,s2] energy_val.append(energy+tmp_energy) energy_val=np.array(energy_val) #Select the state with the lowest energy predict_type[loci]=np.where(energy_val==np.min(energy_val))[0][0] #Add gaps from UCSC database try: gaps=np.loadtxt('PyMEGABASE/gaps/'+self.assembly+'_gaps.txt',dtype=str) chr_gaps_ndx=np.where((gaps[:,0]=='chr'+str(chr)))[0] for gp in chr_gaps_ndx: init_loci=np.round(gaps[gp,1].astype(float)/(self.res*1000)).astype(int) end_loci=np.round(gaps[gp,2].astype(float)/(self.res*1000)).astype(int) predict_type[init_loci:end_loci]=6 except: print('Gaps not found, not included in predictions') return predict_type
[docs] def prediction_X(self,chr='X',h_and_J_file=None): R""" Predicts and outputs the genomic annotations for chromosome X Args: chr (int, optional): Chromosome to predict h_and_J_file (str, optional): Model energy term file path Returns: array (size of chromosome) Predicted annotations """ print('Predicting subcompartments for chromosome: ',chr) if h_and_J_file!=None: with open(h_and_J_file, 'rb') as f: h_and_J = np.load(f, allow_pickle=True) h_and_J = h_and_J.item() self.h=h_and_J['h'] self.J=h_and_J['J'] types=["A1" for i in range(self.chrm_size[-1])] int_types=np.array(list(map(self.TYPE_TO_INT.get, types))) unique=np.loadtxt(self.cell_line_path+'/unique_exp.txt',dtype=str) if unique.shape==(): unique=[unique] #Load each track and average over all_averages=[] for u in unique: reps=[] for i in glob.glob(self.cell_line_path+'/'+str(u)+'*'): tmp=[] try: tmp=np.loadtxt(i+'/chr'+str(chr)+'.track',skiprows=3)[:,2] reps.append(tmp) except: print(i,' failed with at least one chromosome') reps=np.array(reps) ave_reps=np.round(np.mean(reps,axis=0)) all_averages.append(ave_reps) all_averages=np.array(all_averages) self.chr_averages=self.build_state_vector(int_types,all_averages)-1 #Prediction predict_type=np.zeros(self.chr_averages.shape[1]) fails=0;r=0 self.L=len(self.h) for loci in range(self.chr_averages.shape[1]): energy_val=[] energy=0 #Check energy for all possible 5 states (A1,A2,B1,B2,B3) for state in range(5): tmp_energy=-self.h[0,state] for j in range(1,self.L): s2=int(self.chr_averages[j,loci]) tmp_energy=tmp_energy-self.J[0,j,state,s2] energy_val.append(energy+tmp_energy) energy_val=np.array(energy_val) #Select the state with the lowest energy predict_type[loci]=np.where(energy_val==np.min(energy_val))[0][0] #Add gaps from UCSC database try: print('Resolution:', self.res) gaps=np.loadtxt('PyMEGABASE/gaps/'+self.assembly+'_gaps.txt',dtype=str) chr_gaps_ndx=np.where((gaps[:,0]=='chr'+str(chr)))[0] for gp in chr_gaps_ndx: init_loci=np.round(gaps[gp,1].astype(float)/self.res*1000).astype(int) end_loci=np.round(gaps[gp,2].astype(float)/self.res*1000).astype(int) predict_type[init_loci:end_loci]=6 except: print('Gaps not found, not included in predictions') return predict_type
[docs] def write_bed(self,out_file='predictions', compartments=True,subcompartments=True): R""" Formats and saves predictions on BED format Args: out_file (str, optional): Folder/Path to save the prediction results save_subcompartments (bool, optional): Whether generate files with subcompartment annotations save_compartments (bool, optional): Whether generate files with compartment annotations Returns: predictions_subcompartments (dict), predictions_compartments (dict) Predicted subcompartment annotations and compartment annotations on dictionaries organized by chromosomes """ def get_color(s_id): return info[s_id][1] def get_num(s_id): return str(info[s_id][0]) def get_bed_file_line(chromosome, position, c_id): return "chr" + str(chromosome) + "\t" + str((position - 1) * resolution) + "\t" + str( position * resolution) + "\t" + c_id + "\t" + get_num(c_id) + "\t.\t" + str( (position - 1) * resolution) + "\t" + str( position * resolution) + "\t " + get_color(c_id) def save_bed(type): folder=self.cell_line_path+'/predictions' if type=='c': ext='_compartments' else: ext='_subcompartments' all_data = {} for chrom_index in range(1,len(self.chrm_size)): try: filename = folder + '/chr' + str(chrom_index) + ext + '.txt' data = [] with open(filename) as file: for line in file: items = line.split() if len(items) == 2: data.append((int(items[0]), items[1])) else: print(line) all_data[chrom_index] = data except: print('Didnt found chrom:',chrom_index) with open(out_file+ext+'.bed', 'w') as f: header='# Experiments used for this prediction: ' for exp in exps: header=header+exp+' ' f.write(header+'\n') for chrom_index in range(1,len(self.chrm_size)): try: for (pos, sID) in all_data[chrom_index]: f.write(get_bed_file_line(chrom_index, pos, sID) + '\n') except: pass unique=np.loadtxt(self.cell_line_path+'/unique_exp.txt',dtype=str) for u in unique: os.system('cat '+self.cell_line_path+'/'+u+'*/exp_name.txt | awk \'{print $1}\' >> '+self.cell_line_path+'/exps_used.dat') exps=np.loadtxt(self.cell_line_path+'/exps_used.dat',dtype=str) resolution=self.res*1000 info = { "NA": (0, "255,255,255"), "A1": (2, "245,47,47"), "A2": (1, "145,47,47"), "B1": (-1, "47,187,224"), "B2": (-2, "47,47,224"), "B3": (-3, "47,47,139"), "B4": (-4, "75,0,130"), "A": (1, "245,47,47"), "B": (-1, "47,187,224"), } if compartments==True: save_bed('c') if subcompartments==True: save_bed('s')
[docs] def prediction_all_chrm(self,path=None,save_subcompartments=True,save_compartments=True): R""" Predicts and outputs the genomic annotations for all the chromosomes Args: path (str, optional): Folder/Path to save the prediction results save_subcompartments (bool, optional): Whether generate files with subcompartment annotations for each chromosomes save_compartments (bool, optional): Whether generate files with compartment annotations for each chromosomes Returns: predictions_subcompartments (dict), predictions_compartments (dict) Predicted subcompartment annotations and compartment annotations on dictionaries organized by chromosomes """ if path==None: path=self.cell_line_path+'/predictions' print('Saving prediction in:',path) #Define translation dictionaries between states and subcompartments TYPE_TO_INT = {'A1':0,'A2':1,'B1':2,'B2':3,'B3':4,'B4':5,'NA':6} INT_TO_TYPE = {TYPE_TO_INT[k]:k for k in TYPE_TO_INT.keys()} INT_TO_TYPE_AB = {0:'A', 1:'A', 2:'B', 3:'B', 4:'B', 5:'B', 6:'NA'} os.system('mkdir '+path) #Predict and save data for chromosomes predictions_subcompartments={} predictions_compartments={} for chr in range(1,len(self.chrm_size)): pred=self.prediction_single_chrom(chr,h_and_J_file=self.cell_line_path+'/h_and_J.npy') types_pyME_sub=np.array(list(map(INT_TO_TYPE.get, pred))) types_pyME_AB=np.array(list(map(INT_TO_TYPE_AB.get, pred))) #Save data if save_subcompartments==True: with open(path+'/chr'+str(chr)+'_subcompartments.txt','w',encoding = 'utf-8') as f: for i in range(len(types_pyME_sub)): f.write("{} {}\n".format(i+1,types_pyME_sub[i])) if save_compartments==True: with open(path+'/chr'+str(chr)+'_compartments.txt','w',encoding = 'utf-8') as f: for i in range(len(types_pyME_AB)): f.write("{} {}\n".format(i+1,types_pyME_AB[i])) predictions_subcompartments[chr]=types_pyME_sub predictions_compartments[chr]=types_pyME_AB #Chromosome X chr='X' pred=self.prediction_X(h_and_J_file=self.cell_line_path+'/h_and_J.npy') types_pyME_sub=np.array(list(map(INT_TO_TYPE.get, pred))) types_pyME_AB=np.array(list(map(INT_TO_TYPE_AB.get, pred))) #Save data if save_subcompartments==True: with open(path+'/chr'+str(chr)+'_subcompartments.txt','w',encoding = 'utf-8') as f: for i in range(len(types_pyME_sub)): f.write("{} {}\n".format(i+1,types_pyME_sub[i])) if save_compartments==True: with open(path+'/chr'+str(chr)+'_compartments.txt','w',encoding = 'utf-8') as f: for i in range(len(types_pyME_AB)): f.write("{} {}\n".format(i+1,types_pyME_AB[i])) self.write_bed(out_file=path+'/predictions', compartments=save_compartments,subcompartments=save_subcompartments) return predictions_subcompartments, predictions_compartments
[docs] def printHeader(self): print('{:^96s}'.format("****************************************************************************************")) print('{:^96s}'.format("**** *** *** *** *** *** *** *** PyMEGABASE-1.0.0 *** *** *** *** *** *** *** ****")) print('{:^96s}'.format("**** *** *** *** *** *** *** *** PyMEGABASE-1.0.0 *** *** *** *** *** *** *** ****")) print('{:^96s}'.format("**** *** *** *** *** *** *** *** PyMEGABASE-1.0.0 *** *** *** *** *** *** *** ****")) print('{:^96s}'.format("****************************************************************************************")) print('') print('{:^96s}'.format("The PyMEGABASE class performs the prediction of genomic annotations")) print('{:^96s}'.format("based on 1D data tracks of Chip-Seq and RNA-Seq. The input data is ")) print('{:^96s}'.format("obtained from ENCODE data base.")) print('{:^96s}'.format("PyMEGABASE description is described in: TBD")) print('') print('{:^96s}'.format("This package is the product of contributions from a number of people, including:")) print('{:^96s}'.format("Esteban Dodero-Rojas, Antonio Oliveira, Vinícius Contessoto,")) print('{:^96s}'.format("Ryan Cheng, and, Jose Onuchic")) print('{:^96s}'.format("Rice University")) print('') print('{:^96s}'.format("****************************************************************************************"))