Source code for decneo.analysisPipeline

''' Module holding class that implements the analysis pipeline 
'''

from .commonFunctions import *

[docs]class Analysis(): '''Class of analysis and visualization functions for DECNEO Parameters: workingDir: str, Default '' Directory to retrieve and save files and results to otherCaseDir: str, Default '' Directory holding comparison (other species) data genesOfInterest: list, Default None Particular genes to analyze, e.g. receptors knownRegulators: list, Default None Known marker genes nCPUs: int, Default 1 Number of CPUs to use for multiprocessing, recommended 10-20 panels: list, Default None Particular measurements to include in the analysis nBootstrap: int, Default 100 Number of bootstrap experiments to perform majorMetric: str, Default 'correlation' Metric name (e.g. 'correlation', 'cosine', 'euclidean', 'spearman') methodForDEG: str, Default 'ttest' Possible options: {'ttest', 'mannwhitneyu'} perEachOtherCase: boolean, Default False Whether to perform comparisons of bootstrap experiments with other bootstrap experiments or with a single case metricsFile: str, 'metricsFile.h5' Name of file where gene expression distance data is saved for specified metric seed: int, None Used to set randomness deterministic PCNpath: str, Default 'data/' Path to PCN file ''' def __init__(self, workingDir = '', otherCaseDir = '', genesOfInterest = None, knownRegulators = None, nCPUs = 1, panels = None, nBootstrap = 100, majorMetric = 'correlation', perEachOtherCase = False, metricsFile = 'metricsFile.h5', seed = None, PCNpath = 'data/', minBatches = 5, pseudoBatches = 10, dendrogramMetric = 'euclidean', dendrogramLinkageMethod = 'ward', methodForDEG = 'ttest'): '''Function called automatically and sets up working directory, files, and input information''' if seed is None: np.random.seed() else: np.random.seed(seed) self.workingDir = workingDir if not os.path.exists(self.workingDir): os.makedirs(self.workingDir) ## Locking mechanism is not implemented #if os.path.isfile('_locked') or os.path.isfile('_completed'): # self.locked = True #else: # with open('_locked', 'w'): # os.utime('_locked') #os.remove('_locked') self.otherCaseDir = otherCaseDir if not os.path.exists(self.otherCaseDir): os.makedirs(self.otherCaseDir) self.PCNpath = PCNpath self.genesOfInterest = genesOfInterest self.knownRegulators = knownRegulators self.nCPUs = nCPUs self.panels = panels self.nBootstrap = nBootstrap self.majorMetric = majorMetric self.dendrogramMetric = dendrogramMetric self.dendrogramLinkageMethod = dendrogramLinkageMethod self.methodForDEG = methodForDEG self.minBatches = minBatches self.pseudoBatches = pseudoBatches self.perEachOtherCase = perEachOtherCase self.metricsFile = metricsFile self.byBatchesDir = self.workingDir + 'byBatches/' self.bootstrapDir = self.workingDir + 'bootstrap/' self.dataSaveName = self.workingDir + 'data.h5' self.dendroDataName = self.workingDir + 'bootstrap_experiments_dendro_data.h5' self.bootstrapExperiments = list(range(0, nBootstrap)) return standardPanels = [ 'fraction', 'binomial', 'top50', 'markers', ] deprecatedPanels = [ 'PubMedHits', 'gAbove50_PanglaoMouse', 'gAbove50_PanglaoHuman', 'GOpositive', 'GOnegative', 'markerstop50', 'expression', 'closeness', 'age', 'rate', ] combinationPanels = [ 'combo3avgs', 'combo4avgs', #'combo3avgs-peak', # Use this syntax to include panels with genes of peak ] combinationPanelsDict = { 'combo2avgs': ['fraction', 'binomial'], 'combo3avgs': ['fraction', 'top50', 'binomial'], 'combo4avgs': ['fraction', 'top50', 'binomial', 'markers'], 'combo5-1avgs': ['fraction', 'top50', 'binomial', 'markers', 'PubMedHits'], 'combo5-2avgs': ['fraction', 'top50', 'binomial', 'markers', 'gAbove50_PanglaoMouse'], 'combo5-3avgs': ['fraction', 'top50', 'binomial', 'markers', 'gAbove50_PanglaoHuman'], 'combo6-1avgs': ['fraction', 'top50', 'binomial', 'markers', 'PubMedHits', 'gAbove50_PanglaoMouse'], 'combo6-2avgs': ['fraction', 'top50', 'binomial', 'markers', 'PubMedHits', 'gAbove50_PanglaoHuman'], 'combo6-3avgs': ['fraction', 'top50', 'binomial', 'markers', 'gAbove50_PanglaoMouse', 'gAbove50_PanglaoHuman'], 'combo7avgs': ['fraction', 'top50', 'binomial', 'markers', 'PubMedHits', 'gAbove50_PanglaoMouse', 'gAbove50_PanglaoHuman'], }
[docs] def prepareDEG(self, dfa, dfb, pvalueLimit = 0.001): '''Save gene expression data of cell type of interest. Create rank dataframe (df_ranks) with genes ranked by differential expression Parameters: dfa: pandas.Dataframe Dataframe containing expression data for cell type of interest Has genes as rows and (batches, cells) as columns dfb: pandas.Dataframe Dataframe containing expression data for cells of type other than cell type of interest Has genes as rows and (batches, cells) as columns pvalueLimit: float, Default 0.001 Maximum possible p-value to include Returns: None Usage: prepareDEG(dfa, dfb) ''' dfb = dfb.reindex(dfa.index).fillna(0.) nOriginalBatches = len(np.unique(dfa.columns.get_level_values('batch').values)) if nOriginalBatches < self.minBatches: dfa.columns = pd.MultiIndex.from_arrays([np.random.permutation(np.hstack([np.array([str(i)]*len(v), dtype=str) for i, v in enumerate(np.array_split(dfa.columns.get_level_values('batch').values, self.pseudoBatches))])), dfa.columns.get_level_values('cell')], names=['batch', 'cell']) dfb.columns = pd.MultiIndex.from_arrays([np.random.permutation(np.hstack([np.array([str(i)]*len(v), dtype=str) for i, v in enumerate(np.array_split(dfb.columns.get_level_values('batch').values, self.pseudoBatches))])), dfb.columns.get_level_values('cell')], names=['batch', 'cell']) print('Original bathces:', nOriginalBatches, '\tGenerated %s pseudobatches:' % self.pseudoBatches, dfa.shape, dfb.shape) print('Saving expression data', flush=True) dfa.to_hdf(self.dataSaveName, key='df', mode='a', complevel=4, complib='zlib') genes = [] batches = [] for batch in np.unique(dfa.columns.get_level_values('batch').values): try: df_temp_a = dfa.xs(batch, level='batch', axis=1, drop_level=False) df_temp_b = dfb.xs(batch, level='batch', axis=1, drop_level=False) if self.methodForDEG == 'ttest': ttest = scipy.stats.ttest_ind(df_temp_a.values, df_temp_b.values, axis=1) df_test = pd.DataFrame(index=dfa.index, columns=['statistic', 'pvalue']) df_test['statistic'] = ttest[0] df_test['pvalue'] = ttest[1] df_test = df_test.sort_values('statistic', ascending=False).dropna() elif self.methodForDEG == 'mannwhitneyu': df_temp_a = df_temp_a.apply(np.array, axis=1) df_temp_b = df_temp_b.apply(np.array, axis=1) df_temp = pd.concat([df_temp_a, df_temp_b], axis=1, sort=False) df_temp = df_temp.loc[(df_temp[0].apply(np.sum) + df_temp[1].apply(np.sum)) > 0] df_test = df_temp.apply(lambda v: pd.Series(np.array(scipy.stats.mannwhitneyu(v[0], v[1]))), axis=1) df_test.columns = ['statistic', 'pvalue'] df_test = df_test.sort_values('pvalue', ascending=True).dropna() genes.append(df_test.loc[df_test['pvalue'] <= pvalueLimit]['statistic'].index.values) batches.append(batch) except Exception as exception: print(exception) ugenes = [] for i, v in enumerate(genes): ugenes.extend(v) ugenes = np.unique(ugenes) print('\nBatches:', len(genes), 'Unique genes:', len(ugenes)) df = pd.DataFrame(index=range(len(ugenes)), columns=batches) for i, v in enumerate(genes): df.iloc[:len(v), i] = v df_ranks = pd.DataFrame(index=ugenes, columns=batches) for i, v in enumerate(genes): df_ranks.iloc[:, i] = pd.Index(df.iloc[:, i]).reindex(df_ranks.index)[1] print('Saving ranks data', flush=True) df_ranks.to_hdf(self.dataSaveName, key='df_ranks', mode='a', complevel=4, complib='zlib') print(df_ranks) return
[docs] def preparePerBatchCase(self, **kwargs): ''' Process gene expression data to generate per-batch distance measure and save to file. No plots are generated Parameters: Any parameters that function 'analyzeCase' can accept Returns: None Usage: an = Analysis() an.preparePerBatchCase() ''' self.analyzeCase(pd.read_hdf(self.dataSaveName, key='df'), suffix='all', saveDir=self.byBatchesDir, toggleCalculateMajorMetric=True, toggleExportFigureData=True, toggleCalculateMeasures=True, toggleGroupBatches=False, toggleAdjustText=False, noPlot=True, **kwargs) return
def _forPrepareBootstrapExperiments(self, args): ''' Function used internally in multiprocessing Parameters: saveSubDir: str Subdirectory for each bootstrap experiment df_ranks: pandas.DataFrame Genes ranked by differential expression. This is a legacy parameter. If None function will use rank dataframe from working directory df_measure: pandas.DataFrame Gene expression per-batch distance df_fraction: pandas.DataFrame Fraction of cells expressing each gene for each batch df_median_expr: pandas.DataFrame Median of non-zero values of each gene expression for each batch se_count: pandas.DataFrame Per batch counts Returns: None Usage: For internal use only ''' saveSubDir, df_ranks, df_measure, df_fraction, df_median_expr, se_count = args np.random.seed() try: print(saveSubDir, end='\t', flush=True) if not os.path.exists(os.path.join(self.bootstrapDir, saveSubDir)): os.makedirs(os.path.join(self.bootstrapDir, saveSubDir)) if saveSubDir == 'All': batches = df_measure.columns else: batches = np.random.choice(df_measure.columns, size=len(df_measure.columns), replace=True) np.savetxt(os.path.join(self.bootstrapDir, saveSubDir, 'batches.txt'), batches, fmt='%s') df_measure_temp = pd.Series(data=np.nanmedian(df_measure[batches].values.copy(), axis=1, overwrite_input=True), index=df_measure.index) df_measure_temp = df_measure_temp.unstack(0) df_measure_temp.to_hdf(os.path.join(self.bootstrapDir, saveSubDir, self.metricsFile), key=self.majorMetric, mode='a', complevel=4, complib='zlib') df_fraction_temp = df_fraction[batches] df_fraction_temp.columns = df_fraction_temp.columns + '_' + np.array(range(len(df_fraction_temp.columns))).astype(str) df_fraction_temp = df_fraction_temp.mean(axis=1) df_fraction_temp.to_hdf(os.path.join(self.bootstrapDir, saveSubDir, 'perGeneStats.h5'), key='df_fraction', mode='a', complevel=4, complib='zlib') df_median_expr_temp = df_median_expr[batches] df_median_expr_temp.columns = df_median_expr_temp.columns + '_' + np.array(range(len(df_median_expr_temp.columns))).astype(str) df_median_expr_temp = df_median_expr_temp.mean(axis=1) df_median_expr_temp.to_hdf(os.path.join(self.bootstrapDir, saveSubDir, 'perGeneStats.h5'), key='df_expression', mode='a', complevel=4, complib='zlib') se_count_temp = se_count[batches] se_count_temp.index = se_count_temp.index + '_' + np.array(range(len(se_count_temp.index))).astype(str) se_count_temp = se_count_temp.sort_values(ascending=False) se_count_temp.to_hdf(os.path.join(self.bootstrapDir, saveSubDir, 'perGeneStats.h5'), key='se_count', mode='a', complevel=4, complib='zlib') np.savetxt(os.path.join(self.bootstrapDir, saveSubDir, 'size.txt'), [df_fraction_temp.shape[0], se_count_temp.sum()], fmt='%i') if not df_ranks is None: batches = np.loadtxt(os.path.join(self.bootstrapDir, saveSubDir, 'batches.txt'), dtype=str, delimiter='\t') df_ranks_temp = df_ranks[df_ranks.columns.intersection(batches)] df_ranks_temp.columns = df_ranks_temp.columns + '_' + np.array(range(len(df_ranks_temp.columns))).astype(str) df_ranks_temp = df_ranks_temp.median(axis=1).sort_values() df_ranks_temp.to_hdf(os.path.join(self.bootstrapDir, saveSubDir, 'perGeneStats.h5'), key='df_ranks', mode='a', complevel=4, complib='zlib') except Exception as exception: print(exception) return
[docs] def prepareBootstrapExperiments(self, allDataToo = True, df_ranks = None, parallel = False): '''Prepare bootstrap experiments data and calculating gene statistics for each experiment Parameters: allDataToo: boolean, Default True Whether to prepare experiment for all data as well df_ranks: pd.DataFrame, Default None Genes ranked by differential expression If None function will use rank dataframe from working directory Returns: None Usage: an = Analysis() an.prepareBootstrapExperiments() ''' try: if df_ranks is None: df_ranks=pd.read_hdf(self.dataSaveName, key='df_ranks') print('Reading precalculated data from %s' % self.byBatchesDir, flush=True) df_measure = pd.read_hdf(os.path.join(self.byBatchesDir, self.metricsFile), key=self.majorMetric) print('df_measure', '%1.1fMb'%(sys.getsizeof(df_measure) / 1024**2), flush=True) df_fraction = pd.read_hdf(os.path.join(self.byBatchesDir, 'perGeneStats.h5'), key='df_fraction') df_median_expr = pd.read_hdf(os.path.join(self.byBatchesDir, 'perGeneStats.h5'), key='df_expression') se_count = pd.read_hdf(os.path.join(self.byBatchesDir, 'perGeneStats.h5'), key='se_count') saveSubDirs = ['Experiment %s' % (id + 1) for id in self.bootstrapExperiments] if allDataToo: saveSubDirs = ['All'] + saveSubDirs if parallel: pool = multiprocessing.Pool(processes=self.nCPUs) pool.map(self._forPrepareBootstrapExperiments, [(saveSubDir, df_ranks, df_measure, df_fraction, df_median_expr, se_count) for saveSubDir in saveSubDirs]) pool.close() pool.join() else: for saveSubDir in saveSubDirs: self._forPrepareBootstrapExperiments((saveSubDir, df_ranks, df_measure, df_fraction, df_median_expr, se_count)) #self._forPrepareBootstrapExperiments((saveSubDir, df_ranks, None, None, None, None)) print(flush=True) except Exception as exception: print(exception) return
[docs] def compareTwoCases(self, saveDir1, saveDir2, name1 = 'N1', name2='N2', saveName = 'saveName'): '''Compare gene measurements between two cases for each bootstrap experiment Parameters: saveDir1: str Directory storing gene measurement data for case 1 saveDir2: str Directory storing gene measurement data for case 2 name1: str, Default 'N1' Phrase to append to keys of the resulting dataframe for case 1 name2: str, Default 'N2' Phrase to append to keys of the resulting dataframe for case 2 saveName: str, Default 'saveName' Name of file to save result dataframe to Returns: None Usage: an = Analysis() an.compareTwoCases(saveDir1, saveDir2, name1, name2, saveName) ''' majorMetric = self.majorMetric try: df1 = pd.read_hdf(os.path.join(saveDir1, 'per-gene-measures-%s.h5' % majorMetric), key='df') df2 = pd.read_hdf(os.path.join(saveDir2, 'per-gene-measures-%s.h5' % majorMetric), key='df') except Exception as exception: print('ERROR reading h5 in compareTwoCases:', exception, '\nTrying reading XLSX format') try: df1 = pd.read_excel(os.path.join(saveDir1, 'per-gene-measures-%s.xlsx' % majorMetric), header=0, index_col=[0,1]).fillna('') df2 = pd.read_excel(os.path.join(saveDir2, 'per-gene-measures-%s.xlsx' % majorMetric), header=0, index_col=[0,1]).fillna('') except Exception as exception: print('ERROR reading XLSX:', exception) return n23_1 = len(np.intersect1d(np.unique(df1.index.get_level_values('gene').values), gEC22)) n23_2 = len(np.intersect1d(np.unique(df2.index.get_level_values('gene').values), gEC22)) commonIndex = df1.index.intersection(df2.index) df1 = df1.loc[commonIndex] df2 = df2.loc[commonIndex] df_T50 = pd.concat([df1['T50'].str.replace(' ','').str.split(','), df2['T50'].str.replace(' ','').str.split(',')], keys=[name1, name2], axis=1, sort=False) df_EC23T50 = pd.concat([df1['EC23T50'].str.replace(' ','').str.split(','), df2['EC23T50'].str.replace(' ','').str.split(',')], keys=[name1, name2], axis=1, sort=False) df_AUC = pd.concat([df1['AUC'].astype(float), df2['AUC'].astype(float)], keys=[name1, name2], axis=1, sort=False) df_EC23T50_count = df_EC23T50.applymap(len) df_EC23T50_common = df_EC23T50.apply(lambda s: np.intersect1d(s[0], s[1]), axis=1) df_EC23T50_common_count = df_EC23T50.apply(lambda s: len(np.intersect1d(s[0], s[1])), axis=1) df_T50_common = df_T50.apply(lambda s: np.intersect1d(s[0], s[1]), axis=1) df_T50_common_count = df_T50.apply(lambda s: len(np.intersect1d(s[0], s[1])), axis=1) df_AUC_avg = df_AUC.apply(np.mean, axis=1) df_res = pd.concat([df_EC23T50_common.apply(cleanListString), df_EC23T50_common_count, df_T50_common.apply(cleanListString), df_T50_common_count, df1['AUC'].astype(float), df2['AUC'].astype(float), df1['EC23T50'].str.split(',').apply(len), df2['EC23T50'].str.split(',').apply(len), df1['EC23T50'], df2['EC23T50']], keys=[('Inter-measures', 'EC23T50_common'), ('Inter-measures', 'EC23T50_common_count'), ('Inter-measures', 'T50_common'), ('Inter-measures', 'T50_common_count'), ('Intra-measures', 'AUC ' + name1), ('Intra-measures', 'AUC ' + name2), ('Intra-measures', 'EC23 count ' + name1 + ' %s' % n23_1), ('Intra-measures', 'EC23 count ' + name2 + ' %s' % n23_2), ('Intra-measures', 'EC23 ' + name1 + ' %s' % n23_1), ('Intra-measures', 'EC23 ' + name2 + ' %s' % n23_2)], axis=1, sort=False) df_res.columns = pd.MultiIndex.from_tuples(df_res.columns) df_res = df_res.sort_index() df_res.to_excel('%s.xlsx' % saveName, merge_cells=False) if False: df_res_f = df_res.copy() df_res_f = df_res_f.loc[(df_res_f[('Intra-measures', 'EC23 count ' + name1 + ' %s' % n23_1)] >= 5) & (df_res_f[('Intra-measures', 'EC23 count ' + name2 + ' %s' % n23_2)] >= 5) & (df_res_f[('Intra-measures', 'AUC ' + name1)] >= 0.5) & (df_res_f[('Intra-measures', 'AUC ' + name2)] >= 0.5)] df_res_f.to_excel('%s_filtered.xlsx' % saveName, merge_cells=False) return
[docs] def runPairOfExperiments(self, args): '''Analyze the case, compare it with comparison case, find the conserved genes between the cases, analyze case again Parameters: saveDir: str Directory with all bootstrap experiments saveSubDir: str Subdirectory for a bootstrap experiment otherCaseDir: str Directory holding comparison data Returns: None Usage: For internal use only ''' saveDir, saveSubDir, otherCaseDir = args print(saveDir, saveSubDir, flush=True) try: comparisonName = os.path.join(saveDir, saveSubDir, 'comparison') self.analyzeCase(None, toggleAdjustText=False, noPlot=True, suffix=saveSubDir, saveDir=os.path.join(saveDir, saveSubDir), printStages=False, toggleCalculateMajorMetric=False, toggleExportFigureData=True, toggleCalculateMeasures=True) self.compareTwoCases(os.path.join(saveDir, saveSubDir, ''), otherCaseDir, name1='name1', name2='name2', saveName=comparisonName) additionalData = externalPanelsData.copy() additionalData.update({'conservedGenes': pd.read_excel(comparisonName + '.xlsx', index_col=1, header=0)['Inter-measures.T50_common_count']}) self.analyzeCase(None, toggleAdjustText=False, dpi=300, suffix=saveSubDir, saveDir=os.path.join(saveDir, saveSubDir), printStages=False, toggleCalculateMajorMetric=False, toggleExportFigureData=True, toggleCalculateMeasures=True, externalPanelsData=additionalData) except Exception as exception: print(exception) return
[docs] def analyzeBootstrapExperiments(self): '''Analyze all bootstrap experiments Parameters: None Returns: None Usage: an = Analysis() an.analyzeBootstrapExperiments() ''' saveSubDirs = ['All'] + ['Experiment %s' % (id + 1) for id in self.bootstrapExperiments] pool = multiprocessing.Pool(processes=self.nCPUs) pool.map(self.runPairOfExperiments, [(self.bootstrapDir, saveSubDir, os.path.join(self.otherCaseDir, 'bootstrap/', saveSubDir, '') if self.perEachOtherCase else self.otherCaseDir) for saveSubDir in saveSubDirs]) pool.close() pool.join() dfs = [] for id in self.bootstrapExperiments: saveSubDir = 'Experiment %s' % (id + 1) filePath = os.path.join(self.bootstrapDir, saveSubDir, 'dendrogram-heatmap-%s-data.h5' % self.majorMetric) try: df_temp = pd.read_hdf(filePath, key='df_C') df_temp.index.name = 'gene' df_temp = df_temp.reset_index() df_temp = pd.concat([df_temp], keys=[saveSubDir], axis=0, sort=False) df_temp = pd.concat([df_temp], keys=['species'], axis=0, sort=False) df_temp.index.names = ['species', 'experiment', 'order'] dfs.append(df_temp) except Exception as exception: print(exception) pass try: dfs = pd.concat(dfs, axis=0, sort=False) print(dfs) dfs.to_hdf(self.dendroDataName, key='df', mode='a', complevel=4, complib='zlib') except Exception as exception: print(exception) pass return
[docs] def analyzeCombinationVariant(self, variant): ''' Analyze a combination of measures (same as in panels) Parameters: variant: str Name of combination variant (e.g. 'Avg combo4avgs', 'Avg combo3avgs') Returns: pandas.DataFrame Analysis result Usage: an = Analysis() an.analyzeCombinationVariant(variant) ''' def getPeaksLists(df_temp): '''Get list of peaks and experiments Parameters: df_temp: pandas.DataFrame Dataframe containing dendrogram data for specific variant Returns: list: All genes in peak for all experiments list: Lists of genes in peak for each experiment list Experiments Usage: getPeaksList(df_temp) ''' peaksListsMerged = [] peaksLists = [] experiments = [] for experiment in np.unique(df_temp.index.get_level_values('experiment')): se = df_temp.xs(experiment, level='experiment', axis=0).xs('species', level='species', axis=0) genesInHalfPeak = getGenesOfPeak(se) peaksListsMerged.extend(genesInHalfPeak) peaksLists.append(genesInHalfPeak) experiments.append(experiment) return peaksListsMerged, peaksLists, experiments print('Variant:', variant) df = pd.read_hdf(self.dendroDataName, key='df').fillna(0).set_index('gene', append=True).droplevel('order')[variant] variabilitySavePath = self.workingDir + '/variability_' + variant.replace(' ', '-') + '.xlsx' get_mean_std_cov_ofDataframe(df.unstack('gene').fillna(0.).T).to_excel(variabilitySavePath) writer = pd.ExcelWriter(self.workingDir + '%s bootstrap_in-peak_genes_SD.xlsx' % variant) try: listsMerged, lists, experiments = getPeaksLists(df.copy()) listsSizes = [len(item) for item in lists] umL = np.unique(listsMerged, return_counts=True) se = pd.Series(index=umL[0], data=umL[1]/len(experiments)).sort_values(ascending=False) filePath = os.path.join(self.bootstrapDir + '/All/', 'dendrogram-heatmap-%s-data.xlsx' % self.majorMetric) peakGenesAll = getGenesOfPeak(pd.read_excel(filePath, index_col=0, header=0, sheet_name='Cluster index')[variant]) df_res = pd.concat([se, se], axis=1, sort=False) df_res.iloc[:, 1] = np.where(np.isin(df_res.index.values, peakGenesAll), 1, np.nan) df_res.columns = ['Bootstrap', 'In all'] try: dfv = pd.read_excel(variabilitySavePath, header=0, index_col=0).reindex(df_res.index) df_res = pd.concat([df_res, dfv], axis=1, sort=False) except Exception as exception: print('ERROR reading variability file:', exception) df_res.to_excel(writer, 'Sheet1') df_exp = pd.DataFrame(index=range(1000), columns=experiments) for i, col in enumerate(df_exp.columns): df_exp.iloc[:len(lists[i]), i] = lists[i] df_exp.to_excel(writer, 'Bootstrap lists', index=False) except Exception as exception: print('ERROR:', exception) writer.save() df_res['Bootstrap'].to_excel(self.workingDir + variant + '_variant.xlsx') return df_res
def _forScramble(self, args): '''Function used internally in multiprocessing Parameters: workingDir: str Working directory to retrieve and save file and results to measures: list Measures (e.g: [Markers', 'Binomial -log(pvalue)', 'Top50 overlap']) df: pandas.DataFrame Ordered genes data N: int Size of a chunk maxDistance: int Maximum distance away considered to be in peak halfWindowSize: int Moving average half-window size j: int Identifier of a chunk Returns: None Usage: For internal use only ''' workingDir, measures, df, N, maxDistance, halfWindowSize, j, getMax = args np.random.seed() allGenes = df.index.values.copy() listsNonmerged = [] for i in range(N): np.random.shuffle(allGenes) data = np.zeros(len(allGenes)) for measure in measures: data += movingAverageCentered(normSum1(df[measure].loc[allGenes]), halfWindowSize, looped=False) data = movingAverageCentered(data, halfWindowSize, looped=False) listsNonmerged.append(getGenesOfPeak(pd.Series(index=allGenes, data=data), maxDistance=maxDistance)) se = pd.Series(listsNonmerged) if getMax: dfs = se.apply(pd.Series).reset_index(drop=True).replace(np.nan, 'RemoveNaN') df = pd.DataFrame(index=range(len(dfs)), columns=np.unique(dfs.values.flatten()), data=False, dtype=np.bool_) try: df = df.drop('RemoveNaN', axis=1) except: pass df[:] = (df.columns.values==dfs.values[..., None]).any(axis=1) return df.mean(axis=0).max() else: se.to_pickle(workingDir + '%s' % j) return
[docs] def scramble(self, measures, subDir = '', case='All', N = 10**4, M = 20, getMax = False, maxSuff = ''): '''Run control analysis for the dendrogram order Parameters: measures: list Measures (e.g: [Markers', 'Binomial -log(pvalue)', 'Top50 overlap']) subDir: str, Default '' Subdirectory to save dataframe to N: int Chunk size M: int Number of chunks Returns: None Usage: an = Analysis() an.scramble (measures) ''' df = pd.read_excel(os.path.join(self.bootstrapDir + '%s/dendrogram-heatmap-%s-data.xlsx' % (case, self.majorMetric)), index_col=0, header=0, sheet_name='Cluster index') workingDir = self.workingDir + 'random/' if subDir != '': workingDir = os.path.join(workingDir, subDir) if not os.path.exists(workingDir): os.makedirs(workingDir) # Run the randomization and prepare results dataframe if True: print('\nCalculating chunks', flush=True) pool = multiprocessing.Pool(processes=self.nCPUs) result = pool.map(self._forScramble, [(workingDir, measures, df.copy(), N, 25, 10, j, getMax) for j in range(M)]) pool.close() pool.join() if getMax: pd.Series(result).to_hdf(workingDir + 'max%s%s.h5' % (N, maxSuff), key='df', mode='a', complevel=4, complib='zlib') return print('\nCombining chunks', flush=True) dfs = [] for j in range(M): dfs.append(pd.read_pickle(workingDir + '%s' % j).apply(pd.Series)) print(j, end=' ', flush=True) dfs = pd.concat(dfs, axis=0, sort=False).reset_index(drop=True).replace(np.nan, 'RemoveNaN') print('\nAligning genes', flush=True) df = pd.DataFrame(index=range(len(dfs)), columns=np.unique(dfs.values.flatten()), data=False, dtype=np.bool_) try: df = df.drop('RemoveNaN', axis=1) except: pass df[:] = (df.columns.values==dfs.values[..., None]).any(axis=1) print(df) print('\nRecording', flush=True) df.to_hdf(workingDir + 'combined_%s_aligned.h5' % M, key='df', mode='a', complevel=4, complib='zlib') for j in range(M): os.remove(workingDir + '%s' % j) # Save and plot counts distribution if True: df = pd.read_hdf(workingDir + 'combined_%s_aligned.h5' % M, key='df') se = df.sum(axis=0).sort_values(ascending=False)/df.shape[0] se.to_excel(workingDir + 'se_distribution.xlsx') se.hist(bins=250) plt.savefig(workingDir + 'se_distribution.png', dpi=300) plt.clf() # Check for variation of i-th quantile if False: df = pd.read_hdf(workingDir + 'combined_%s_aligned.h5' % M, key='df') q = 99.999 res = dict() for i in range(1, 100): print(i, end=' ', flush=True) size = i*2*10**3 res.update({size: np.percentile((df.sample(n=size, axis=0, replace=True).sum(axis=0).sort_values(ascending=False)/size).values, q)}) se_per = pd.Series(res).sort_index() se_per.to_excel(workingDir + 'se_percentile_variation_%s.xlsx' % q) se_per.plot() plt.savefig(workingDir + 'se_percentile_variation_%s.png' % q, dpi=300) plt.clf() #os.remove(workingDir + 'combined_%s_aligned.h5' % M) return
[docs] def analyzeCase(self, df_expr, toggleCalculateMajorMetric = True, exprCutoff = 0.05, toggleExportFigureData = True, toggleCalculateMeasures = True, suffix = '', saveDir = '', toggleGroupBatches = True, dpi = 300, toggleAdjustText = True, markersLabelsRepelForce = 1.5, figureSize=(8, 22), toggleAdjustFigureHeight=True, noPlot = False, halfWindowSize = 10, printStages = True, externalPanelsData = None, toggleIncludeHeatmap = True, addDeprecatedPanels = False, includeClusterNumber = True, togglePublicationFigure = False): '''Analyze, calculate, and generate plots for individual experiment Parameters: df_expr: pandas.Dataframe Gene expression data toggleCalculateMajorMatric: boolean, Default True Whether to calculate cdist of major metric. This is a legacy parameter exprCutoff: float, Default 0.05 Cutoff for percent expression in a batch of input data toggleExportFigureData: boolean, Default True Whether to export figure data toggleCalculateMeasures: boolean, Default True Whether to calculate measures suffix: str, Default '' Name of experiment saveDir: str, Default '' Exerything is exported to this directory, should be unique for each dataset toggleGroupBatches: boolean, Default True Whether to group batches or save per-batch distance measure dpi: int or 'figure', Default 300 Resolution in dots per inch, if 'float' use figures dpi value toggleAdjustText: boolean, Default True Whether to use (external) module to minimize text overlap in figure figure_size: tuple, Default (8, 20) Width, height in inches toggleAdjustFigureHeight: boolean, Default True Whether to adjust figure height noPlot: boolean, Default False Whether to generate plot halfWindowSize: int, Default 10 Moving average half-window size printStages: boolean, Default True Whether to print stage status to output externalPanelsData: dict, Default None Dictionary containing additional panels data toggleIncludeHeatmap: boolean, Default True Whether to include heatmap in figure addDeprecatedPanels: boolean, Default False Whether to include deprecated panels Returns: None Usage: self.analyzeCase(df_expr) ''' stimulators, inhibitors = self.knownRegulators, [] #if togglePublicationFigure: # toggleExportFigureData = True def calculateMajorMetricAndGeneStats(df_expr, saveDir, groupBatches, selGenes, exprCutoff): '''Calculate cdist of metric (e.g. correlation) Calculate fraction of cells expressing each gene, and median of non-zero gene expression (per batch) Parameters: df_expr: pandas.DataFrame Gene expression of one species, one cluster (subset of clusters) saveDir: str Exerything is exported to this directory, should be unique for each dataset groupBatches: boolean Whether to take median across batches selGenes: list List of receptors, or transcription factors exprCutoff: float Cutoff for percent expression of input data Returns: None Usage: calculateMajorMetricAndGeneStats(df_expr, saveDir, groupBatches, selGenes, exprCutoff) ''' print('Received expression data of shape:', df_expr.shape, flush=True) np.savetxt(os.path.join(saveDir, 'size.txt'), np.array(df_expr.shape), fmt='%i') # For each batch calculate gene expression distance metric print('Calculating distance metric', flush=True) df_measure = get_df_distance(df_expr, metric=self.majorMetric, genes=selGenes, analyzeBy='batch', minSize=10, groupBatches=groupBatches, cutoff=exprCutoff, nCPUs=self.nCPUs) print('Recording major metric (shape: %s, %s) to h5' % df_measure.shape, flush=True) df_measure.to_hdf(os.path.join(saveDir, self.metricsFile), key=self.majorMetric, mode='a', complevel=4, complib='zlib') # For each batch calculate fraction of cells expressing each gene df_fraction = df_expr.replace(0, np.nan).replace(0., np.nan).groupby(axis=1, level='batch').agg('count') /\ df_expr.fillna(0.).groupby(axis=1, level='batch').agg('count') print('Recording fractions (shape: %s, %s) to h5' % df_fraction.shape, flush=True) df_fraction.to_hdf(os.path.join(saveDir, 'perGeneStats.h5'), key='df_fraction', mode='a', complevel=4, complib='zlib') # For each batch calculate median of non-zero values of each gene expression df_median_expr = df_expr.replace(0, np.nan).replace(0., np.nan).groupby(axis=1, level='batch').agg(np.nanmedian) print('Recording median expression (shape: %s, %s) to h5' % df_fraction.shape, flush=True) df_median_expr.to_hdf(os.path.join(saveDir, 'perGeneStats.h5'), key='df_expression', mode='a', complevel=4, complib='zlib') # For each batch calculate median of non-zero values of each gene expression se_count = df_expr.fillna(0.).groupby(axis=1, level='batch').agg('count').iloc[0] print('Recording per batch counts to h5', flush=True) se_count.to_hdf(os.path.join(saveDir, 'perGeneStats.h5'), key='se_count', mode='a', complevel=4, complib='zlib') return def makeCombinationPlot(df, n_clusters = 10, adjustText = toggleAdjustText): '''Builds and plots dendrogram, heatmap, and bargraphs. Parameters: df: pandas.DataFrame All calculated measurement metric: str, Default 'euclidean' Name of metric used to build dendrogram and identify clusters in it Metric has to be of type "Euclidean" to use linkage method "Ward" With any other metric (e.g. correlation distance) use linkage method "average" etc. Metric 'euclidean_missing' used commonly-non-missing points only linkageMethod: str, Default 'ward' Linkage algorithm to use n_clusters: int, Default 10 Specific number of clusters to find adjustText: str, Default toggleAdjustText Whether to use module to fix text overlap in figure Returns: dict: Figure data for export Usage: makeCombinationPlot(df) ''' nonlocal figureSize, self, togglePublicationFigure, markersLabelsRepelForce, includeClusterNumber metric = self.dendrogramMetric linkageMethod = self.dendrogramLinkageMethod if metric == 'euclidean_missing': metric = metric_euclidean_missing if self.panels is None: self.panels = self.standardPanels if addDeprecatedPanels: self.panels += deprecatedPanels self.panels += self.combinationPanels def addDendro(fig, dataGenes, M, coords, metric = metric, linkageMethod = 'ward', linewidth = 0.25, adjustText = adjustText, fontsize = 6): genesSubset = list(stimulators) + list(inhibitors) ax = fig.add_axes(coords, frame_on=False) Z = hierarchy.linkage(np.nan_to_num(M, nan=max(M)), method=linkageMethod, optimal_ordering=True) origLineWidth = matplotlib.rcParams['lines.linewidth'] matplotlib.rcParams['lines.linewidth'] = linewidth cmap = cm.gist_ncar(np.linspace(0, 0.5, n_clusters + 1)) hierarchy.set_link_color_palette([matplotlib.colors.rgb2hex(rgb[:3]) for rgb in cmap]) D = hierarchy.dendrogram(Z, ax=ax, color_threshold = (Z[-n_clusters,2] + Z[-n_clusters+1,2]) / 2, above_threshold_color='k', orientation='top') hierarchy.set_link_color_palette(None) matplotlib.rcParams['lines.linewidth'] = origLineWidth reindexed = pd.Index(dataGenes[D['leaves']]).reindex(pd.Index(genesSubset).intersection(dataGenes)) genes = reindexed[0][reindexed[1] > -1].values locations = reindexed[1][reindexed[1] > -1] if True: tickLabelsColors = np.array(['navy']*len(dataGenes), dtype=np.dtype('U20')) xtickslabels = np.array(['']*len(dataGenes), dtype=np.dtype('U20')) for gene, location in zip(genes, locations): xtickslabels[location] = gene tickLabelsColors[location] = 'green' if (gene in stimulators) else 'red' ax.set_xticklabels(xtickslabels, fontsize=4) ax.tick_params(axis='y', labelsize=4, width=0.25, length=1) ax.set_yticklabels([]) ax.set_yticks([]) for xtick, color in zip(ax.get_xticklabels(), tickLabelsColors): xtick.set_color(color) texts = [] origPos = [] for xpos, xtext, color in zip(ax.get_xticks(), xtickslabels, tickLabelsColors): if xtext != '': texts.append(ax.text(xpos, -2., xtext, fontsize=fontsize, rotation=90, va='top', ha='center', color=color)) origPos.append(xpos) ticks_x = [] ticks_y = [] vdistance = -0.01 * ax.get_ylim()[1] for tick in ax.get_xticks(): ticks_x.extend([tick, tick, None]) ticks_y.extend([0, vdistance, None]) ax.plot(ticks_x, ticks_y, color='k', lw=0.4, clip_on=False) ax.set_xticklabels([]) if adjustText: adjustTexts1D(texts, fig, ax) #adjust_text(texts, va='top', ha='center', autoalign='x', lim=400, only_move={'text':'x'}, force_text=(markersLabelsRepelForce, 0.5)) v = 0.05 * ax.get_ylim()[1] for text, opos in zip(texts, origPos): text._y = -v ax.plot([text._x, opos], [text._y, 0.], color=text._color, lw=0.5, clip_on=False) if True: clusters = scipy.cluster.hierarchy.fcluster(Z, t=n_clusters, criterion='maxclust')[D['leaves']] - 1 clusterBoundaries = (np.where(clusters - np.roll(clusters, 1) != 0)[0]/ len(D['leaves'])) * ax.get_xlim()[1] clusterBoundaries = np.append(clusterBoundaries, ax.get_xlim()[1]) clusterCenters = clusterBoundaries[:-1] + ((clusterBoundaries - np.roll(clusterBoundaries, 1))/2.)[1:] vposition = (Z[-n_clusters,2] + Z[-n_clusters+1,2]) / 5 if includeClusterNumber: for cluster, position in zip(np.unique(clusters), clusterCenters): ltext = ax.text(position, vposition, '#%s' % cluster, fontsize=fontsize, color='white', va='center', ha='center') ltext.set_path_effects([path_effects.Stroke(linewidth=1., foreground='k'), path_effects.Normal()]) return {'order': D['leaves'], 'M': squareform(M)[:, D['leaves']][D['leaves'], :], 'genes': genes, 'allGenes': dataGenes[D['leaves']], 'locations': locations, 'tickLabelsColors': tickLabelsColors, 'xtickslabels': xtickslabels, 'clusters': clusters, 'clusterBoundaries': clusterBoundaries / 10., 'clusterCenters': clusterCenters / 10.} def addHeatmap(fig, dataArgs, coords, adjustText = adjustText, fontsize = 6): plottingMajorMetricOfSelected = False if plottingMajorMetricOfSelected: M = dataArgs['majorMetricOfSelected'] else: M = dataArgs['M'] order = dataArgs['order'] genes = dataArgs['genes'] locations = dataArgs['locations'] tickLabelsColors = dataArgs['tickLabelsColors'] tickslabels = dataArgs['xtickslabels'] clusters = dataArgs['clusters'] clusterBoundaries = dataArgs['clusterBoundaries'] clusterCenters = dataArgs['clusterCenters'] ax = fig.add_axes(coords, frame_on=True) masked_M = np.ma.array(M, mask=np.isnan(M)) if plottingMajorMetricOfSelected: cmap = copy.copy(plt.cm.bwr) cmap.set_bad('grey') vmin, vmax = -1, 1 else: cmap = copy.copy(plt.cm.bwr) # Greens_r cmap.set_bad('red') vmin, vmax = None, None im = ax.imshow(masked_M, cmap=cmap, aspect='auto', vmin=vmin, vmax=vmax, interpolation='None', extent=(-0.5, M.shape[0] - 0.5, M.shape[1] - 0.5, -0.5)) xlim = ax.get_xlim() ylim = ax.get_ylim() # Selected x tick labels if True: ax.set_xticks(range(len(tickslabels))) ax.set_xticklabels(tickslabels, fontsize=4) for xtick, color in zip(ax.get_xticklabels(), tickLabelsColors): xtick.set_color(color) texts = [] origPos = [] for xpos, xtext, color in zip(ax.get_xticks(), tickslabels, tickLabelsColors): if xtext != '': texts.append(ax.text(xpos, 1.01*ax.get_ylim()[0], xtext, fontsize=fontsize, rotation=90, va='top', ha='center', color=color)) origPos.append(xpos) ax.set_xticklabels([]) ax.set_xticks([]) if adjustText: adjustTexts1D(texts, fig, ax) #adjust_text(texts, va='top', ha='center', autoalign='x', lim=400, only_move={'text':'x'}, force_text=(1.05*markersLabelsRepelForce, 0.5)) v = ax.get_ylim()[0] for text, opos in zip(texts, origPos): text._y = 1.01 * v ax.plot([text._x, opos], [text._y, v], color=text._color, lw=0.5, clip_on=False) # Selected y tick labels if True: ax.set_yticks(range(len(tickslabels))) ax.set_yticklabels(tickslabels, fontsize=4) for ytick, color in zip(ax.get_yticklabels(), tickLabelsColors): ytick.set_color(color) texts = [] origPos = [] for ypos, xtext, color in zip(ax.get_yticks(), tickslabels, tickLabelsColors): if xtext != '': texts.append(ax.text(-0.01*ax.get_xlim()[1], ypos, xtext, fontsize=fontsize, va='center', ha='right', color=color)) origPos.append(ypos) ax.set_yticklabels([]) ax.set_yticks([]) if adjustText: adjustTexts1D([ele for ele in reversed(texts)], fig, ax) #adjust_text(texts, va='center', ha='right', autoalign='y', lim=400, only_move={'text':'y'}, force_text=(1.05*markersLabelsRepelForce, 0.5)) v = -0.01 * ax.get_xlim()[1] for text, opos in zip(texts, origPos): text._x = v ax.plot([0., text._x], [opos, text._y], color=text._color, lw=0.5, clip_on=False) # Clusters outline boxes if not plottingMajorMetricOfSelected: for cluster, position in zip(np.unique(clusters), clusterCenters): ltext = ax.text(position, position, '#%s' % cluster, fontsize=fontsize, color='white', va='center', ha='center') ltext.set_path_effects([path_effects.Stroke(linewidth=1., foreground='k'), path_effects.Normal()]) clusterBoundaries -= 0.5 for i in range(len(np.unique(clusters))): ax.plot([clusterBoundaries[i], clusterBoundaries[i+1], clusterBoundaries[i+1], clusterBoundaries[i], clusterBoundaries[i]], [clusterBoundaries[i], clusterBoundaries[i], clusterBoundaries[i+1], clusterBoundaries[i+1], clusterBoundaries[i]], '--', lw=0.75, color='k', clip_on=False) ax.set_xlim(xlim) ax.set_ylim(ylim) # Colorbar if True: ax = fig.add_axes([0.85, 0.05, 0.025, 0.1], frame_on=False) ax.set_xticks([]) ax.set_xticklabels([]) ax.set_yticks([]) ax.set_yticklabels([]) ltext = '' # 'Eucl. dist. of gene expr.\n %s dist.' % self.majorMetric clb = fig.colorbar(im, ax=ax, fraction=0.4, label=ltext) clb.ax.tick_params(labelsize=fontsize) return def addBar(fig, dataArgs, panel, coords, halfWindowSize = halfWindowSize, noAverage = False): nonlocal panelsData, panelsDataNames, externalPanelsData, togglePublicationFigure M = dataArgs['M'] order = dataArgs['order'] genes = dataArgs['genes'] locations = dataArgs['locations'] tickLabelsColors = dataArgs['tickLabelsColors'] tickslabels = dataArgs['xtickslabels'] clusters = dataArgs['clusters'] clusterBoundaries = dataArgs['clusterBoundaries'] clusterCenters = dataArgs['clusterCenters'] allGenes = dataArgs['allGenes'] showBoundaries = True # Draw randomized bar panels if False: showBoundaries = False np.random.seed(0) rorder = np.arange(len(allGenes)) np.random.shuffle(rorder) allGenes = allGenes[rorder] tickLabelsColors = tickLabelsColors[rorder] ax = fig.add_axes(coords, frame_on=True) ax.set_xlim([min(clusterBoundaries), max(clusterBoundaries)]) if panel == 'fraction': if togglePublicationFigure: ylabel='Expression\nfraction' else: ylabel='Fraction' try: data = pd.read_hdf(os.path.join(saveDir, 'perGeneStats.h5'), key='df_fraction').reindex(allGenes) if type(data) is pd.Series: data = data.values else: data = data.mean(axis=1).values except: data = np.zeros(len(allGenes)) elif panel == 'expression': ylabel='Expression' try: data = pd.read_hdf(os.path.join(saveDir, 'perGeneStats.h5'), key='df_expression').reindex(allGenes) if type(data) is pd.Series: data = data.values else: data = data.mean(axis=1).values except: data = np.zeros(len(allGenes)) elif panel == 'closeness': ylabel='Closeness' try: data = -np.append([0], np.diagonal(M, offset=1)) data -= min(data) except: data = np.zeros(len(allGenes)) elif panel == 'rate': ylabel='Rate' try: data = externalPanelsData['Evolutionary rate'].reindex(allGenes).values except: data = np.zeros(len(allGenes)) elif panel == 'age': ylabel='Age' try: data = externalPanelsData['Evolutionary age'].reindex(allGenes).values except: data = np.zeros(len(allGenes)) elif panel == 'markerstop50': ylabel='Markers\nTop50\noverlap' try: data = externalPanelsData['conservedMarkers'] data = data.loc[~data.index.duplicated(keep='first')].reindex(allGenes).values except: data = np.zeros(len(allGenes)) elif panel == 'PubMedHits': ylabel='PubMedHits' try: data = pd.Series(externalPanelsData['pubMed angiogenesis hits']).reindex(allGenes).values #data = np.log(data) except: data = np.zeros(len(allGenes)) elif panel == 'gAbove50_PanglaoMouse': ylabel='gAbove50\nPanglaoMouse' try: data = np.zeros(len(allGenes)) data[np.where(np.isin(allGenes, externalPanelsData['gAbove50_PanglaoMouse']))[0]] = 1. except: data = np.zeros(len(allGenes)) elif panel == 'GOpositive': ylabel='GO positive' try: data = np.zeros(len(allGenes)) data[np.where(np.isin(allGenes, externalPanelsData['GOpositive']))[0]] = 1. except: data = np.zeros(len(allGenes)) elif panel == 'GOnegative': ylabel='GO negative' try: data = np.zeros(len(allGenes)) data[np.where(np.isin(allGenes, externalPanelsData['GOnegative']))[0]] = 1. except: data = np.zeros(len(allGenes)) elif panel == 'gAbove50_PanglaoHuman': ylabel='gAbove50\nPanglaoHuman' try: data = np.zeros(len(allGenes)) data[np.where(np.isin(allGenes, externalPanelsData['gAbove50_PanglaoHuman']))[0]] = 1. except: data = np.zeros(len(allGenes)) elif panel == 'markers': if togglePublicationFigure: ylabel='Known angiogenesis\nreceptors' else: ylabel='Markers' try: data = np.zeros(len(allGenes)) data[locations] = 1. try: data = data[rorder] except: pass except: data = np.zeros(len(allGenes)) elif panel == 'top50': if togglePublicationFigure: ylabel='Evolutionary\nconservation' else: ylabel='Top50\noverlap' try: data = externalPanelsData['conservedGenes'] data = data.loc[~data.index.duplicated(keep='first')].reindex(allGenes).values except: data = np.zeros(len(allGenes)) elif panel == 'binomial': if togglePublicationFigure: ylabel='Network\nenrichment' else: ylabel='Binomial\n-log(pvalue)' try: data = pd.read_hdf(os.path.join(saveDir, 'perGeneStats.h5'), key='df_ranks') if not type(data) is pd.Series: data = data.median(axis=1) data = data[data > 0].sort_values()[:1000] diffExpressedGenes = data.index data = -np.log(binomialEnrichmentProbability('PCN.txt', enriched_genes=diffExpressedGenes, target_genes=selGenes, PCNpath=self.PCNpath)['Binomial_Prob'].reindex(allGenes).values) except: data = np.zeros(len(allGenes)) try: data = externalPanelsData['diffExpressedGenes'] if not type(data) is pd.Series: data = data.median(axis=1) data = data[data > 0].sort_values()[:1000] diffExpressedGenes = data.index data = -np.log(binomialEnrichmentProbability('PCN.txt', enriched_genes=diffExpressedGenes, target_genes=selGenes, PCNpath=self.PCNpath)['Binomial_Prob'].reindex(allGenes).values) except: pass elif panel == 'variability_3mean': ylabel='variability\n3 mean' try: data = externalPanelsData['variability_3']['mean'].reindex(allGenes).values except: data = np.zeros(len(allGenes)) elif panel == 'variability_3std': ylabel='variability\n3 std' try: data = externalPanelsData['variability_3']['std'].reindex(allGenes).values except: data = np.zeros(len(allGenes)) elif panel == 'variability_3cov': ylabel='variability\n3 cov' try: data = externalPanelsData['variability_3']['cov'].reindex(allGenes).values except: data = np.zeros(len(allGenes)) elif panel[-len('-peak'):] == '-peak': ylabel = panel try: targetPanel = 'Avg ' + panel[:-len('-peak')] data = pd.Series(index=allGenes, data=np.nan_to_num(panelsData[panelsDataNames[targetPanel]])) data /= data.max() peaks = getGenesOfPeak(data) data[:] = 0. data[peaks] = 1. noAverage = True except: data = np.zeros(len(allGenes)) elif panel[:len('combo')] == 'combo': ylabel = panel if ylabel == 'combo3avgs': if togglePublicationFigure: ylabel = 'Combination\nof measures' # 'Combination of 3' elif ylabel == 'combo4avgs': if togglePublicationFigure: ylabel = 'Combination of 4' try: def norm1(s): w = np.nansum(panelsData[panelsDataNames[s]]) if w != w or w == 0.: w = 1. return np.nan_to_num(panelsData[panelsDataNames[s]]) / w data = np.zeros(len(allGenes)) for subPanel in self.combinationPanelsDict[panel]: if panel[-len('avgs'):] == 'avgs': data += movingAverageCentered(norm1(subPanel), halfWindowSize) else: data += norm1(subPanel) except: data = np.zeros(len(allGenes)) else: ylabel = panel try: data = externalPanelsData[panel] data = data.loc[~data.index.duplicated(keep='first')].reindex(allGenes).values except: data = np.zeros(len(allGenes)) ax.bar(range(len(clusters)), data, width=ax.get_xlim()[1]/len(clusters), color=tickLabelsColors) data_avg = movingAverageCentered(np.nan_to_num(data), halfWindowSize) if not noAverage: ax.plot(range(len(clusters)), data_avg, linewidth=1.0, color='coral', alpha=1.0) if not togglePublicationFigure: ax.text(0.999, 0.95, 'window = %s' % (2*halfWindowSize + 1), color='coral', ha='right', va='top', transform=ax.transAxes, fontsize=3) nandata = np.isnan(data) if np.sum(nandata) > 0: wh = np.where(nandata) ax.bar(wh[0], [np.nanmax(data)]*wh[0].shape[0], width=ax.get_xlim()[1]/len(clusters), color='lightgrey', alpha=0.3) ax.set_xticks([]) ax.set_xticklabels([]) yticks = np.round(ax.get_ylim(), 3) ax.set_yticks(yticks) ax.set_yticklabels(yticks) ax.tick_params(axis='y', labelsize=4, width=0.75, length=3) ax.yaxis.tick_right() if showBoundaries: ylim = ax.get_ylim() for i in range(1, len(np.unique(clusters))): ax.plot([clusterBoundaries[i] - 0.5]*2, [ylim[0], ylim[1]], '--', lw=0.5, color='k', clip_on=False) ax.text(-0.01, 0.5, ylabel, fontsize=6.5, rotation=0, va='center', ha='right', transform=ax.transAxes) if togglePublicationFigure: if panel == 'combo3avgs': peaks = getGenesOfPeak(pd.Series(data/data.max())) bw = ax.get_xlim()[1]/len(clusters) ax.axvspan(min(peaks) - 0.5*bw, max(peaks) + 0.5*bw, facecolor='crimson', linewidth=0., zorder=-np.inf) return ylabel, data, data_avg mmin, mmax = np.nanmin(np.nanmin(df.values)), np.nanmax(np.nanmax(df.values)) if self.majorMetric in ['correlation', 'spearman', 'cosine']: missingFillValue = 1.0 elif self.majorMetric == 'euclidean': missingFillValue = mmax else: missingFillValue = mmax if printStages: print('\tCalculating %s metric of %s . . .' % (metric, self.majorMetric), end='\t', flush=True) df = df.fillna(missingFillValue) if True: M = pdist(df.values.T, metric=metric) else: M = df.loc[df.columns].values M = (M + M.T)/2. np.fill_diagonal(M, 0.) M = squareform(M) if printStages: print('Done', flush=True) nPanels = len(self.panels) dendroHeight = 0.10 panelHeight = 0.030 # 0.022 delta = 0.01 vsum = nPanels*(panelHeight + delta) + dendroHeight + 0.8 if toggleAdjustFigureHeight: figureSize = (figureSize[0], figureSize[0] * (9./8.) * vsum) factor = 1 / vsum topBorder = 1. - 0.03*factor bottomBorder = 0.05*factor dendroHeight *= factor panelHeight *= factor delta *= factor heatmapHeight = (topBorder - bottomBorder - dendroHeight) - nPanels * (panelHeight + delta) - 0.05*factor if not toggleIncludeHeatmap: rescaleFactor = 1. - heatmapHeight figureSize = figureSize[0], figureSize[1] * rescaleFactor heatmapHeight = 0. topBorder = 1. - (1. - topBorder) / rescaleFactor dendroHeight /= rescaleFactor panelHeight /= rescaleFactor delta /= rescaleFactor bottomBorder /= rescaleFactor fig = plt.figure(figsize=figureSize) dataArgs = addDendro(fig, df.columns, M, [0.1, topBorder-dendroHeight, 0.75, dendroHeight], metric=metric, linkageMethod=linkageMethod) if nPanels > 0: if printStages: print('\tPlotting bar panels . . .', end='\t', flush=True) panelsData = dict() panelsDataNames = dict() for ipanel, panel in enumerate(self.panels): coords = [0.1, 0.015*factor + bottomBorder + heatmapHeight + (len(self.panels) - ipanel - 1)*(panelHeight + delta), 0.75, panelHeight] if panel[-1] == '~': fig.add_artist(lines.Line2D([0, 0.875], [coords[1] - 0.5 * delta]*2, linewidth=0.75)) panel = panel[:-1] panelName, data, data_avg = addBar(fig, dataArgs, panel, coords) wname = panelName.replace('\n', ' ') panelsData.update({wname: data}) panelsDataNames.update({panel: wname}) panelsData.update({'Avg ' + wname: data_avg}) panelsDataNames.update({'Avg ' + panel: 'Avg ' + wname}) dataArgs.update({'panelsData': panelsData}) if printStages: print('Done', flush=True) if not noPlot: if toggleIncludeHeatmap: if printStages: print('\tPlotting heatmap . . .', end='\t', flush=True) dataArgs.update({'majorMetricOfSelected': 1. - df.loc[dataArgs['allGenes']][dataArgs['allGenes']].values}) addHeatmap(fig, dataArgs, [0.1, bottomBorder, 0.75, heatmapHeight]) if printStages: print('Done', flush=True) number = np.loadtxt(os.path.join(saveDir, 'size.txt'), dtype=int) np.savetxt(os.path.join(saveDir, 'figureInfo.txt'), np.array([['Cells', number[1]], ['Genes', number[0]], ['SelGenes', df.shape[1]]]), fmt='%s') if not togglePublicationFigure: fig.suptitle('Ordering by single cell co-expression\nData: %s\n(%s cells, %s x %s genes)' % (suffix, number[1], number[0], df.shape[1]), fontsize=8, backgroundcolor='white') if printStages: print('\tSaving image . . .', end='\t', flush=True) fig.savefig(os.path.join(saveDir, '%s dendrogram-heatmap-%s.png' % (suffix, self.majorMetric)), dpi=dpi) if printStages: print('Done', flush=True) plt.close(fig) return dataArgs def exportFigureData(dataArgs, saveXLSX = True, saveHDF = True): '''Function to assist in exporting figure data to excel and/or hdf file Parameters: dataArgs: dict Figure data for export saveXLSX: boolean, Default True Whether to export data as an excel file saveHDF: boolean, Default True Whether to export data as an hdf file Returns: None Usage: exportFigureData(dataArgs) ''' M = dataArgs['M'] allGenes = dataArgs['allGenes'] clusters = dataArgs['clusters'] selGenes = dataArgs['genes'] selGenesLocations = dataArgs['locations'] df_C = pd.DataFrame(index=allGenes) df_C['cluster'] = clusters try: for panel, panelData in dataArgs['panelsData'].items(): df_C[panel] = panelData except: pass df_C['stimulator'] = np.where(np.isin(allGenes, stimulators), True, np.nan) df_C['inhibitor'] = np.where(np.isin(allGenes, inhibitors), True, np.nan) df_M = pd.DataFrame(data=M, index=allGenes, columns=allGenes) if saveXLSX: writer = pd.ExcelWriter(os.path.join(saveDir, 'dendrogram-heatmap-%s-data.xlsx' % self.majorMetric)) df_C.to_excel(writer, 'Cluster index') df_M.to_excel(writer, 'Expression distance measure') writer.save() if saveHDF: df_C.to_hdf(os.path.join(saveDir, 'dendrogram-heatmap-%s-data.h5' % self.majorMetric), key='df_C', mode='a', complevel=4, complib='zlib') df_M.to_hdf(os.path.join(saveDir, 'dendrogram-heatmap-%s-data.h5' % self.majorMetric), key='df_M', mode='a', complevel=4, complib='zlib') return if not os.path.exists(saveDir): os.makedirs(saveDir) selGenes = np.unique(list(self.genesOfInterest)) if toggleCalculateMajorMetric and (not df_expr is None): calculateMajorMetricAndGeneStats(df_expr, saveDir, toggleGroupBatches, selGenes, exprCutoff) # Load and prepare df_measure if True: try: df_measure = pd.read_hdf(os.path.join(saveDir, self.metricsFile), key=self.majorMetric) if not toggleGroupBatches: df_measure = pd.Series(data=np.nanmedian(df_measure.values, axis=1), index=df_measure.index).unstack(0) except Exception as exception: print(exception, flush=True) return df_measure = df_measure[df_measure.columns.intersection(np.unique(selGenes))] #print(df_measure.shape, flush=True) for gene in df_measure.columns: df_measure.loc[gene, gene] = np.nan df_measure = df_measure[df_measure.columns[df_measure.count(axis=0) > 0]] # Make dendrogram with heatmap, label clusters, export all(!) figure data and clusters if True: dataArgs = makeCombinationPlot(df_measure) if toggleExportFigureData: if printStages: print('\tExporting data . . .', end='\t', flush=True) exportFigureData(dataArgs) if printStages: print('Done', flush=True) # For each Approach [1-4] calculate (1) AUC of EC23, (2) T50, and (3) EC23 of T50 if toggleCalculateMeasures: if printStages: print('\tCalculating measures . . .', end='\t', flush=True) df_Em = pd.read_hdf(os.path.join(saveDir, 'dendrogram-heatmap-%s-data.h5' % self.majorMetric), key='df_M') orderedGenes = df_Em.columns data_Cm = squareform(pdist(df_measure[orderedGenes].fillna(0.).values.T, metric='correlation')) df_Cm = pd.DataFrame(index=orderedGenes, columns=orderedGenes, data=data_Cm) df_m = df_measure.loc[orderedGenes, orderedGenes] def getForGene(approach, gene, orderedGenes, selGenes23, top = 50): if approach == 'm': scores = df_m.loc[gene] elif approach =='Em': scores = df_Em.loc[gene] elif approach =='Cm': scores = df_Cm.loc[gene] elif approach =='Dm': scores = np.abs(np.arange(len(orderedGenes)) - np.where(orderedGenes==gene)[0]) se = pd.Series(index=orderedGenes, data=scores).dropna().sort_values(ascending=True) try: AUC = roc_auc_score(~np.isin(se.index.values, selGenes23), se.values) if len(se) >= 10 else np.nan except: AUC = np.nan T50 = se.index.values[:top] subT50 = np.intersect1d(T50, selGenes23) return approach, gene, AUC, len(subT50), cleanListString(T50), cleanListString(subT50) list_gEC23 = np.unique(list(stimulators) + list(inhibitors)) temps = [] for approach in ['m', 'Cm', 'Em', 'Dm']: temps.extend([getForGene(approach, gene, orderedGenes, list_gEC23) for gene in orderedGenes]) temps = np.array(temps).T df = pd.DataFrame(index=pd.MultiIndex.from_arrays([temps[0], temps[1]], names=['approach', 'gene']), data=temps[2:].T, columns=['AUC', 'numEC23T50', 'T50', 'EC23T50']) #print(df) df.to_excel(os.path.join(saveDir, 'per-gene-measures-%s.xlsx' % self.majorMetric), merge_cells=False) df.to_hdf(os.path.join(saveDir, 'per-gene-measures-%s.h5' % self.majorMetric), key='df', mode='a', complevel=4, complib='zlib') if printStages: print('Done', flush=True) return
[docs] def reanalyzeMain(self, case='All', **kwargs): '''Reanalyze case Parameters: Any parameters that function 'analyzeCase' can accept Returns: None Usage: an = Analyze() an.reanalyzeMain() ''' np.random.seed(42) try: print('Re-analyzing %s-data case' % case, flush=True) kwargs.setdefault('toggleAdjustText', True) kwargs.setdefault('dpi', 600) kwargs.setdefault('suffix', case) kwargs.setdefault('saveDir', os.path.join(self.bootstrapDir, '%s/' % case)) kwargs.setdefault('printStages', True) kwargs.setdefault('toggleCalculateMajorMetric', False) kwargs.setdefault('toggleExportFigureData', True) kwargs.setdefault('toggleCalculateMeasures', True) kwargs.setdefault('externalPanelsData', externalPanelsData) kwargs['externalPanelsData'].update({'conservedGenes': pd.read_excel(os.path.join(self.bootstrapDir, case, 'comparison.xlsx'), index_col=1, header=0)['Inter-measures.T50_common_count']}) self.analyzeCase(None, **kwargs) shutil.copyfile(os.path.join(self.bootstrapDir, case, '%s dendrogram-heatmap-%s.png' % (case, self.majorMetric)), os.path.join(self.workingDir, 'results %s %s %s %s.png' % (case, self.majorMetric, self.dendrogramMetric, self.dendrogramLinkageMethod))) shutil.copyfile(os.path.join(self.bootstrapDir, case, 'dendrogram-heatmap-%s-data.xlsx' % (self.majorMetric)), os.path.join(self.workingDir, 'results %s %s %s %s.xlsx' % (case, self.majorMetric, self.dendrogramMetric, self.dendrogramLinkageMethod))) except Exception as exception: print(exception) return
[docs] def analyzeAllPeaksOfCombinationVariant(self, variant, nG = 8, nE = 30, dcutoff = 0.5, fcutoff = 0.5, width = 50): '''Find all peaks and their frequency from the bootstrap experiments Parameters: variant: str Name of combination variant (e.g. 'Avg combo4avgs', 'Avg combo3avgs') nG: int, Default 8 Number of clusters of genes nE: int, Default 30 Number of clusters of bootstrap experiments fcutoff: float, Default 0.5 Lower peak height cutoff width: int, Default 50 Width of peak Returns: None Usage: an = Analyze() an.analyzeAllPeaksOfCombinationVariant('Avg combo4avgs', nG=8, nE=30, fcutoff=0.5, width=50) ''' def getPeaksLists(df_temp): '''Get list of peaks and genes Parameters: df_temp: pandas.DataFrame Dendrogram data Returns: list: Lists of genes in peak for each experiment ndarray: Sorted and unique genes Usage: getPeaksList(df_temp) ''' peaksLists = {} genes = [] for experiment in np.unique(df_temp.index.get_level_values('experiment')): se = df_temp.xs(experiment, level='experiment', axis=0) peaksLists.update({(experiment, i, se.iloc[peak]): getGenesOfPeak(se, peak=peak, maxDistance=int(width/2)) for i, peak in enumerate(getPeaks(se, threshold = 0.05, prominence = 0.05, distance=width))}) genes.extend(se.index.values.tolist()) return peaksLists, np.unique(genes) print('Variant:', variant) df = pd.read_hdf(self.dendroDataName, key='df').fillna(0).set_index('gene', append=True).droplevel('order')[variant] listsDict, allgenes = getPeaksLists(df.xs('species', level='species', axis=0).copy()) se_peakAssignments = pd.Series(listsDict).apply(pd.Series) df_m = pd.DataFrame(index=allgenes, data=0, columns=se_peakAssignments.index) for peak in df_m.columns: df_m.loc[se_peakAssignments.loc[peak].dropna().values, peak] = 1 df_m = df_m.loc[df_m.sum(axis=1) > 0] print(df_m.shape) df_m = df_m.loc[df_m.sum(axis=1) > (0.05 * df_m.shape[1])] print(df_m.shape) se_heights = pd.Series(index=['E' + df_m.columns.get_level_values(0).str.split('Experiment ', expand=True).get_level_values(-1) + '.' + df_m.columns.get_level_values(1).astype(str)], data=df_m.columns.get_level_values(-1)) se_heights.index = se_heights.index.get_level_values(0) # Example plot of determining peaks if False: e = 'All' # 'Experiment 1' 'All' if False: self.reanalyzeMain(case=e, togglePublicationFigure=True, toggleIncludeHeatmap=False, markersLabelsRepelForce=1.5) if e == 'All': se = pd.read_hdf(os.path.join(self.bootstrapDir, 'All', 'dendrogram-heatmap-%s-data.h5' % self.majorMetric), key='df_C') #se = se[variant] se = se['Avg Combination'] se.index.name = 'gene' else: se = df.xs('species', level='species', axis=0).copy().xs(e, level='experiment') fig, ax = plt.subplots(figsize=(8,2)) ax.plot(range(len(se)), se.values, color='coral', linewidth=1.5, zorder=np.inf) if e == 'All': listsDict, allgenes = getPeaksLists(pd.concat([se.to_frame().copy()], keys=['All'], names=['experiment'], axis=0, sort=False)['Avg Combination']) #listsDict, allgenes = getPeaksLists(pd.concat([se.to_frame().copy()], keys=['All'], names=['experiment'], axis=0, sort=False)[variant]) se_peakAssignments = pd.Series(listsDict).apply(pd.Series) df_m_a = pd.DataFrame(index=allgenes, data=0, columns=se_peakAssignments.index) for peak in df_m_a.columns: df_m_a.loc[se_peakAssignments.loc[peak].dropna().values, peak] = 1 se_peaks = df_m_a.xs(e, level=0, axis=1) else: se_peaks = df_m.xs(e, level=0, axis=1) for peak in se_peaks[:]: se_peak = se_peaks[peak] se_peak = se_peak[se_peak == 1] x = np.where(np.isin(se.index.values, se_peak.index.values))[0] y = se.iloc[x].values if peak[1] == 1.: print(len(se_peak)) print(cleanListString(se_peak.index.values.tolist())) c = 'grey' if peak[1] != 1. else 'blue' ax.fill_between(x, y, facecolor=c, edgecolor=c, alpha=0.5) pg = se.index.values[x][np.argmax(y)] ax.text(x[np.argmax(y)], -0.001, pg, fontsize=8, rotation=90, va='top', ha='center') ax.plot([x[np.argmax(y)] - 1, x[np.argmax(y)] - 1], [0, max(y)], color='k', linewidth=1., alpha=1.) ax.set_xticks([]) ax.set_xticklabels([]) yticks = (0, 0.03) ax.set_yticks(yticks) ax.set_yticklabels(yticks) ax.tick_params(axis='y', labelsize=8, width=0.75, length=3) ax.text(-0.05, 0.5, 'Combination\nof measures', fontsize=10, rotation=90, va='center', ha='right', transform=ax.transAxes) ax.set_xlim([0, len(se)]) ax.set_ylim([0, 0.03]) fig.tight_layout() fig.savefig(self.workingDir + 'one example peaks %s.png' % variant, dpi=600) plt.close(fig) exit() Z1 = hierarchy.linkage(df_m, 'ward') Z2 = hierarchy.linkage(df_m.T, 'ward') O1 = hierarchy.dendrogram(Z1, no_plot=True, get_leaves=True)['leaves'][::-1] O2 = hierarchy.dendrogram(Z2, no_plot=True, get_leaves=True)['leaves'] df_m = df_m.iloc[O1] df_m = df_m.T.iloc[O2].T clusters1 = hierarchy.fcluster(Z1, t=nG, criterion='maxclust')[O1] - 1 clusters2 = hierarchy.fcluster(Z2, t=nE, criterion='maxclust')[O2] - 1 u1 = np.unique(clusters1, return_counts=True)[1] u2 = np.unique(clusters2, return_counts=True)[1] df_m.index = pd.MultiIndex.from_arrays([df_m.index, clusters1], names=['gene', 'cluster']) df_m.columns = pd.MultiIndex.from_arrays(['E' + df_m.columns.get_level_values(0).str.split('Experiment ', expand=True).get_level_values(-1) + '.' + df_m.columns.get_level_values(1).astype(str), clusters2], names=['peak', 'cluster']) we = pd.Series(index=df_m.columns, data=se_heights[df_m.columns.get_level_values(0)].values).droplevel(1) # 570 peaks ndict = dict() resf = dict() resh = dict() for ci in df_m.index.levels[-1]: for cj in df_m.columns.levels[-1]: df_temp = df_m.xs(ci, level='cluster', axis=0).xs(cj, level='cluster', axis=1) if df_temp.values.mean() >= dcutoff: resf[(ci, cj)] = df_temp.shape[1] resh[(ci, cj)] = we[df_temp.columns].mean() ndict[ci] = cleanListString(sorted(df_temp.index.values.tolist())) sef = pd.Series(resf) sed = pd.Series(index=sef.index, data=sef.groupby(level=0).sum().reindex(sef.droplevel(1).index).values) dff = pd.concat([pd.Series(resh), sef / sed], axis=1, keys=['h', 'w'], sort=False) seh = dff.groupby(level=0).agg(lambda s: np.average(s, weights=dff.loc[s.index, 'w'])).h #pd.Series(resf).to_excel(self.workingDir + 'resf %s.xlsx' % variant) #pd.Series(resh).to_excel(self.workingDir + 'resh %s.xlsx' % variant) se = pd.Series(resf).groupby(level=0).sum() se.name = 'frequency' df = se.to_frame() df['height'] = seh df['genes'] = pd.Series(se.index).replace(ndict).values df['length'] = df['genes'].str.split(', ').map(len) df['frequency'] = np.zeros(len(df['frequency'])) df = df[['frequency', 'height', 'length', 'genes']] locnbootstap = np.unique(se_peakAssignments.index.get_level_values(0)).shape[0] for i, group in enumerate(df['genes'].values): group = group.split(', ') fractions = df_m.droplevel('cluster').loc[group].sum(axis=0) / len(group) uv = np.unique(fractions[fractions >= fcutoff].index.get_level_values(0).str.split('.', expand=True).get_level_values(0).values) df.loc[df.index[i], 'frequency'] = len(uv) / locnbootstap df = df.sort_values(by='frequency', ascending=False) print(df) df.columns = ['Fequency of group', 'Average peak height', 'Number of genes', 'Genes'] df.index = np.array(range(len(df.index))) + 1 df.index.name = 'Group' df.to_excel(self.workingDir + 'All peaks %s.xlsx' % variant) # Enrichment analysis if False: from pyiomica.enrichmentAnalyses import KEGGAnalysis, GOAnalysis, ReactomeAnalysis, ExportEnrichmentReport, ExportReactomeEnrichmentReport dataForAnalysis = df['Genes'].str.split(', ').to_dict() resKEGG = KEGGAnalysis(dataForAnalysis) ExportEnrichmentReport(resKEGG, AppendString='KEGG11', OutputDirectory=self.workingDir) resGO = GOAnalysis(dataForAnalysis) for key in resGO.keys(): resGO[key] = {k:v for k,v in resGO[key].items() if v[2][1]=='biological_process'} ExportEnrichmentReport(resGO, AppendString='GO11', OutputDirectory=self.workingDir) resReactome = ReactomeAnalysis(dataForAnalysis) resReactome = {k:v.loc[v['Entities FDR'] < 0.05] for k,v in resReactome.items()} ExportReactomeEnrichmentReport(resReactome, AppendString='Reactome11', OutputDirectory=self.workingDir) # Plot all-peaks heatmap if True: fig = plt.figure(figsize=(10, 10)) groupsColors = True # Genes dendrogram if True: n_clusters = nG ax = fig.add_axes([0.1, 0.25, 0.15, 0.5], frame_on=False) origLineWidth = matplotlib.rcParams['lines.linewidth'] matplotlib.rcParams['lines.linewidth'] = 0.5 if groupsColors: D = hierarchy.dendrogram(Z1, ax=ax, color_threshold=Z1[-n_clusters + 1][2], above_threshold_color='k', orientation='left') else: D = hierarchy.dendrogram(Z1, ax=ax, color_threshold=0, above_threshold_color='k', orientation='left') matplotlib.rcParams['lines.linewidth'] = origLineWidth ax.set_xticklabels([]) ax.set_xticks([]) ax.set_yticklabels([]) ax.set_yticks([]) ax.set_title('Receptors') # Peaks dendrogram if True: n_clusters = nE ax = fig.add_axes([0.25, 0.75, 0.5, 0.15], frame_on=False) origLineWidth = matplotlib.rcParams['lines.linewidth'] matplotlib.rcParams['lines.linewidth'] = 0.5 if groupsColors: D = hierarchy.dendrogram(Z2, ax=ax, color_threshold=Z2[-n_clusters + 1][2], above_threshold_color='k', orientation='top') else: D = hierarchy.dendrogram(Z2, ax=ax, color_threshold=0, above_threshold_color='k', orientation='top') matplotlib.rcParams['lines.linewidth'] = origLineWidth ax.set_xticklabels([]) ax.set_xticks([]) ax.set_yticklabels([]) ax.set_yticks([]) ax.set_title('Peaks') # Heatmap if True: heatdata = df_m.values * we.values[None, :] ax = fig.add_axes([0.25, 0.25, 0.5, 0.5], frame_on=True) #cmap = matplotlib.colors.LinearSegmentedColormap.from_list('WB', [(1, 1, 1), (0, 0, 0.5)], N=2) cmap = copy.copy(plt.cm.jet_r) cmap.set_bad('white') im = ax.imshow(np.ma.array(heatdata, mask=(heatdata==0.)), cmap=cmap, aspect='auto', interpolation='None', extent=(-0.5, df_m.shape[0] - 0.5, df_m.shape[1] - 0.5, -0.5), vmin=0., vmax=1.) ax.set_xticklabels([]) ax.set_xticks([]) ax.set_yticklabels([]) ax.set_yticks([]) # Export heatmap data if True: print('Exporting all peaks heatmap', flush=True) (df_m * we.values[None, :]).to_excel(self.workingDir + 'heatmap all peaks %s.xlsx' % variant) # Box annotations if False: xl, yl = ax.get_xlim()[1], ax.get_ylim()[0] sea = pd.Series(index=pd.MultiIndex.from_tuples(df_m.index.values), data=df_m.index.values).apply(pd.Series) gsea = sea.groupby(level=1).count()[0] dexp = pd.Series(index=df_m.columns.get_level_values(1).values, data=df_m.columns.get_level_values(1).values) gexp = dexp.groupby(level=0).count() def ann(xy2, clusterG, clusterE, n = 10): xy1 = xl * np.mean(np.where(dexp == clusterE)[0]) / len(dexp), yl * np.mean(np.where(sea[1] == clusterG)[0]) / len(sea) xy2 = xy2[0] * xl, xy2[1] * yl genes = sea.xs(key=clusterG, level=1)[0].values sgenes = '\n'.join([cleanListString(g) for g in np.array_split(genes, int(len(genes)/n))]) ax.annotate(sgenes, xy=xy1, xytext=xy2, fontsize=5, ha='left', va='center', arrowprops=dict(arrowstyle="-", lw=0.5), bbox=dict(boxstyle="square,pad=0.3", fc="w", ec="k", lw=0.25), zorder=np.inf) return ann((0.76, 1.14), 8, 6, n=5) ann((0.76, 1.05), 7, 7, n=5) ann((0.39, 1.09), 1, 2, n=5) ann((-0.3, 1.125), 0, 0, n=10) ann((-0.3, 1.05), 6, 1, n=10) # Colorbar if True: axColor = fig.add_axes([0.72, 0.25, 0.05, 0.3], frame_on=False) axColor.set_xticks([]) axColor.set_xticklabels([]) axColor.set_yticks([]) axColor.set_yticklabels([]) clb = fig.colorbar(im, ax=axColor) clb.set_label(label='Peak height', fontsize=6) #clb.set_ticks([0.25, 0.75]) #clb.set_ticklabels(['No', 'Yes']) clb.ax.tick_params(labelsize=6) fig.savefig(self.workingDir + 'all peaks %s.png' % variant, dpi=600) plt.close(fig) return
[docs] def analyzePerGeneCombinationVariant(self, variant, hcutoff = 0.2, fcutoff = 0.3, width = 50): '''Find all peaks and their frequency from the bootstrap experiments Parameters: variant: str Name of combination variant (e.g. 'Avg combo4avgs', 'Avg combo3avgs') nG: int, Default 8 Number of clusters of genes nE: int, Default 30 Number of clusters of bootstrap experiments fcutoff: float, Default 0.5 Lower peak height cutoff width: int, Default 50 Width of peak Returns: None Usage: an = Analyze() an.analyzeAllPeaksOfCombinationVariant('Avg combo4avgs', nG=8, nE=30, fcutoff=0.5, width=50) ''' print('Variant:', variant) df = pd.read_hdf(self.dendroDataName, key='df').fillna(0).set_index('gene', append=True).droplevel('order')[variant] df_temp = df.xs('species', level='species', axis=0).copy() experiments = np.unique(df_temp.index.get_level_values('experiment')) genes = np.unique(df_temp.index.get_level_values('gene')) dfs = [] for gene in genes: peaksLists = {} for experiment in experiments: se = df_temp.xs(experiment, level='experiment', axis=0) se = se/se.max() if gene in se.index: i0 = np.where(se.index==gene)[0][0] i1, i2 = max(0, i0-int(width/2)), min(i0+int(width/2), len(se)) # e.g. w=50, then take 25 to the left of the gene, the gene itself, 24 to the right of the gene se_temp = se.iloc[i1: i2] se_temp = se_temp[se_temp >= hcutoff] peaksLists.update({experiment: se_temp.index.values}) df_peaks = pd.Series(peaksLists).apply(pd.Series) se = df_peaks.stack().dropna().value_counts().sort_values(ascending=False)/len(experiments) se = se.reindex(genes).fillna(0.) se.name = gene if se[gene] >= fcutoff: print(gene, end=' ', flush=True) dfs.append(se) dfs = pd.concat(dfs, axis=1, sort=False) print(dfs) dfs.to_excel(self.workingDir + 'near frequency %s %s.xlsx' % (variant, hcutoff)) return
[docs] def bootstrapMaxpeakPlot(self, variant): '''Bootstrap max-peak plot ''' df = pd.read_excel(self.workingDir + '%s bootstrap_in-peak_genes_SD.xlsx' % variant, sheet_name='Bootstrap lists', index_col=None, header=0) df_temp = pd.DataFrame(index=np.unique(df.stack().dropna().values), columns=df.columns).fillna(0.) for col in df.columns: df_temp.loc[df[col].dropna().values, col] = 1. df_temp = df_temp.iloc[scipy.cluster.hierarchy.dendrogram(scipy.cluster.hierarchy.linkage(df_temp, 'ward'), no_plot=True, get_leaves=True)['leaves']] df_temp = df_temp.T.iloc[scipy.cluster.hierarchy.dendrogram(scipy.cluster.hierarchy.linkage(df_temp.T, 'ward'), no_plot=True, get_leaves=True)['leaves']].T df_temp = df_temp.loc[df_temp.sum(axis=1) > (0.1 * df_temp.shape[1])] fig = plt.figure(figsize=(12, 10)) ax = fig.add_axes([0.25, 0.2, 0.65, 0.7]) cmap = matplotlib.colors.LinearSegmentedColormap.from_list('WB', [(1, 1, 1), (0, 0, 0.5)], N=2) im = ax.imshow(df_temp.values, cmap=cmap, interpolation='None', aspect='auto') ax.set_xticks(range(df_temp.shape[1])) ax.set_xticklabels(df_temp.columns, rotation=90, fontsize=5) ax.set_xlim([-0.5, df_temp.shape[1] - 0.5]) ax.set_yticks(range(df_temp.shape[0])) ax.set_yticklabels(df_temp.index, rotation=0, fontsize=5) ax.set_ylim([-0.5, df_temp.shape[0] - 0.5]) if True: axColor = fig.add_axes([0.825, 0.8, 0.1, 0.1], frame_on=False) axColor.set_xticks([]) axColor.set_xticklabels([]) axColor.set_yticks([]) axColor.set_yticklabels([]) clb = fig.colorbar(im, ax=axColor) clb.set_ticks([0.25, 0.75]) clb.set_ticklabels(['No', 'Yes']) clb.ax.tick_params(labelsize=6) fig.savefig(self.workingDir + '%s bootstrap.png' % variant, dpi=600) return
[docs] def generateAnalysisReport(self): '''Generate analysis report. ''' saveDir = os.path.join(self.workingDir, 'results majorMetric=%s, dendrogramMetric=%s, linkageMethod=%s' % (self.majorMetric, self.dendrogramMetric, self.dendrogramLinkageMethod)) if not os.path.exists(saveDir): os.makedirs(saveDir) for file in os.listdir(self.workingDir): if file != 'data.h5' and not os.path.isdir(os.path.join(self.workingDir, file)): try: shutil.copyfile(os.path.join(self.workingDir, file), os.path.join(saveDir, file)) except Exception as exception: print('Cannot copy file:', file, '\n', exception) return
def process(df1main, df1other, df2main, df2other, dir1, dir2, genesOfInterest = None, knownRegulators = None, nCPUs = 4, panels = ['fraction', 'binomial', 'top50', 'markers', 'combo3avgs', 'combo4avgs'], parallelBootstrap = False, exprCutoff1 = 0.05, exprCutoff2 = 0.05, perEachOtherCase = True, doScramble = False, part1 = True, part2 = True, part3 = True, **kwargs): '''Main workflow programmed in two scenaria depending on parameter "perEachOtherCase". Parameters: df1main: pandas.DataFrame Expression data of main group of cells of the first species df1other: pandas.DataFrame Expression data of other cells of the first species df2main: pandas.DataFrame Expression data of main group of cells of the second species df2other: pandas.DataFrame Expression data of other cells of the second species dir1: str Path to the first species working directory dir2: str Path to the second species working directory genesOfInterest: list, Default None Particular genes to analyze, e.g. receptors knownRegulators: list, Default None Known marker genes nCPUs: int, Default 1 Number of CPUs to use for multiprocessing, recommended 10-20 panels: list, Default None Particular measurements to include in the analysis parallelBootstrap: boolean, Default False Whether to generate bootstrap experiments in parallel mode exprCutoff1: float, Default 0.05 Per-batch expression cutoff for the first dataset exprCutoff2: float, Default 0.05 Per-batch expression cutoff for the second dataset perEachOtherCase: boolean, Default True Scenario of comparison Any other parameters that class "Analysis" can take Returns: Analysis First class Analysis instance Analysis Second class Analysis instance ''' if genesOfInterest is None: genesOfInterest = receptorsListHugo_2555 if knownRegulators is None: knownRegulators = gEC22 an1 = Analysis(workingDir=dir1, otherCaseDir=dir2, genesOfInterest=genesOfInterest, knownRegulators=knownRegulators, panels=panels, nCPUs=nCPUs, perEachOtherCase=perEachOtherCase, **kwargs) if perEachOtherCase: an2 = Analysis(workingDir=dir2, otherCaseDir=dir1, genesOfInterest=genesOfInterest, knownRegulators=knownRegulators, panels=panels, nCPUs=nCPUs, perEachOtherCase=perEachOtherCase, **kwargs) if part1: if not df1main is None: an1.prepareDEG(df1main, df1other) an1.preparePerBatchCase(exprCutoff=exprCutoff1) an1.prepareBootstrapExperiments(parallel=parallelBootstrap) if perEachOtherCase: if not df2main is None: an2.prepareDEG(df2main, df2other) an2.preparePerBatchCase(exprCutoff=exprCutoff2) an2.prepareBootstrapExperiments(parallel=parallelBootstrap) if part2: an1.analyzeBootstrapExperiments() if perEachOtherCase: an2.analyzeBootstrapExperiments() an1.analyzeBootstrapExperiments() an1.reanalyzeMain() if an1.nBootstrap > 0: an1.analyzeCombinationVariant('Avg combo3avgs') an1.analyzeCombinationVariant('Avg combo4avgs') an1.bootstrapMaxpeakPlot('Avg combo3avgs') an1.bootstrapMaxpeakPlot('Avg combo4avgs') an1.analyzeAllPeaksOfCombinationVariant('Avg combo3avgs', nG=15, nE=15, fcutoff=0.5, width=50) an1.analyzeAllPeaksOfCombinationVariant('Avg combo4avgs', nG=15, nE=15, fcutoff=0.5, width=50) if doScramble: an1.scramble(['Binomial -log(pvalue)', 'Top50 overlap', 'Fraction'], subDir='combo3/', M=20) an1.scramble(['Markers', 'Binomial -log(pvalue)', 'Top50 overlap', 'Fraction'], subDir='combo4/', M=20) if perEachOtherCase: an2.reanalyzeMain() if an2.nBootstrap > 0: an2.analyzeCombinationVariant('Avg combo3avgs') an2.analyzeCombinationVariant('Avg combo4avgs') an2.bootstrapMaxpeakPlot('Avg combo3avgs') an2.bootstrapMaxpeakPlot('Avg combo4avgs') an2.analyzeAllPeaksOfCombinationVariant('Avg combo3avgs', nG=15, nE=15, fcutoff=0.5, width=50) an2.analyzeAllPeaksOfCombinationVariant('Avg combo4avgs', nG=15, nE=15, fcutoff=0.5, width=50) if doScramble: an2.scramble(['Binomial -log(pvalue)', 'Top50 overlap', 'Fraction'], subDir='combo3/', M=10) an2.scramble(['Markers', 'Binomial -log(pvalue)', 'Top50 overlap', 'Fraction'], subDir='combo4/', M=20) if part3: an1.generateAnalysisReport() if perEachOtherCase: an2.generateAnalysisReport() if perEachOtherCase: return an1, an2 return an1