Source code for decneo.commonFunctions

''' Common functions used in analysis pipeline 
'''

from .general import *
from .genes import *

cleanListString = lambda c: str(list(c)).replace(' ', '').replace("'", '').replace(']', '').replace('[', '').replace(',', ', ')

[docs]def movingAverageCentered(a, halfWindowSize, looped = False):

    '''Function to smooth a 1d signal
        
    Parameters:
        a: ndarray
            Input data

        halfWindowSize: int
            Size of half-window for averaging

        looped: boolean, Default False
            Determined looped behaviour at the boundaries

    Returns:
        ndarray
            Smoothed signal

    Usage:
        movingAverageCentered(a, halfWindowSize)
    '''

    '''
    An intuitive way:
    b = np.append(np.append(a[-n:], a), a[:n])
    a_smoothed = np.array([np.sum(b[i-n:i+n+1]) / (2.*n + 1.) for i in range(n, len(b)-n)])

    Faster way is to use np.cumsum(), especially for long vectors

    if False:
        np.random.seed(0)
        a = np.sin(np.random.rand(100)*2.*180./np.pi) + 1.
        a[a > 0.5] = 0
        print(np.sum(a))
        plt.plot(np.arange(len(a)), a, 'o')

        for n in [1,2,5,10]:
            a_avg = movingAverageCenteredLooped(a, n, looped=False)
            print(np.sum(a_avg))
            plt.plot(np.arange(len(a)), a_avg)

        plt.show()
    '''

    n = halfWindowSize

    if looped:
        b = np.append(np.append(a[-n:], a), a[:n])
    else:
        b = np.append(np.append(np.zeros(n), a), np.zeros(n))

    s = np.cumsum(b) / (2.*n + 1.)
    s[2*n+1:] -= s[:-2*n-1]

    aSm = s[2*n:]

    if not looped:
        for i in range(n):
            e = i + n + 1
            aSm[i] = a[:e].mean()
            aSm[len(a) - i - 1] = a[-e:].mean()

    return aSm

[docs]def get_mean_std_cov_ofDataframe(df):

    '''Calculate mean, standard deviation, and covariance 
        
    Parameters:
        df: pandas.DataFrame
            Data with bootstrap experiment data 
        
    Returns:
        pandas.DataFrame:
            DataFrame with columns expressing mean, standard deviation, and covariance of input columns

    Usage:
        get_mean_std_cov_ofDataFrame(df)
    '''

    se_mean = df.mean(axis=1)
    se_std = df.std(axis=1)

    df['1 mean'] = se_mean
    df['2 std'] = se_std
    df['3 cov'] = se_std / se_mean
    df = df[['1 mean', '2 std', '3 cov']]

    df.columns = ['mean', 'std', 'cov']

    df = df.sort_values('mean', axis=0, ascending=False)

    return df

[docs]def getGenesOfPeak(se, peak=None, heightCutoff = 0.5, maxDistance = None):

    '''Find peak region of greatest value
        
    Parameters:
        se: Series
            Normalized aggregated data 

        peak: ndarray, Default None
            Indices of max values in data

        heighCutoff: float, Default 0.5
            Height/value considered to be in peak 

        maxDistance: int, Default None
            Maximum distance away considered to be in peak
        
    Returns:
        Genes appearing the peak 

    Usage:
        getGenesOfPeak(se)
    '''

    se /= se.max()

    if peak is None:
        i0 = np.argmax(se.values)
    else:
        i0 = peak

    genes = [se.index[i0]]

    if heightCutoff is None:
        heightCutoff = -np.inf

    if maxDistance is None:
        r = range(i0 + 1, i0 + 2*10**3, 1)
    else:
        r = range(i0 + 1, i0 + 1 + maxDistance, 1)
    for i in r:
        try:
            if se.iloc[i] >= (heightCutoff * se.iloc[i0]):
                genes.append(se.index[i])
            else:
                break
        except:
            break

    if maxDistance is None:
        r = range(i0 - 1, 0, -1)
    else:
        r = range(i0 - 1, max(0, i0 - 1 - maxDistance), -1)
    for i in r:
        try:
            if se.iloc[i] >= (heightCutoff * se.iloc[i0]):
                genes.append(se.index[i])
            else:
                break
        except:
            break

    return genes

[docs]def getPeaks(se, threshold = 0.2, distance = 50, prominence = 0.05, returnAllInfo = False):

    '''Find peak regions 
        
    Parameters:
        se: Series
            Normalized aggregated data 

        threshold: float, Default 0.2
            Minimum value to be considered peak 

        distance: int, Default 50
            Minimum horizontal distance between peaks 

    Returns:
        Indices of peaks satisfying input conditions 

    Usage:
        getPeaks(se)
    '''

    se /= se.max()

    allInfo = scipy.signal.find_peaks(np.hstack([0, se.values, 0]), distance=distance, prominence=prominence)
    peaks = allInfo[0] - 1
    peaks = peaks[se[peaks] >= threshold]

    if returnAllInfo:
        return peaks, allInfo
    else:
        return peaks

[docs]def getDistanceOfBatch(args):

    '''Calculate correlation distance of metric for given batch 

    Parameters:
        args: tuple
            Tuple that contains: 
                batch: str
                    Batch identifier
                    
                df_sample: pandas.DataFrame
                    Expression data 

                metric: str
                    Metric name (e.g. 'correlation') 

                genes: list or 1d numpy.array
                    Genes of interest

                minSize: int
                    Minimum size of input pandas.DataFrame

                cutoff: float
                    Cutoff for percent expression of input data 

    Returns:
        tuple:
            Results in form of a tuple:
                pandas.Series 
                    Series containting correlation distance 

                str
                    Batch identifier 

                pandas.Series
                    Series of genes 

    Usage:
        getDistanceOfBatch
    '''

    batch, df_sample, metric, genes, minSize, cutoff = args

    try:
        df_sample = df_sample.loc[((df_sample > 0).sum(axis=1) / df_sample.shape[1]) >= cutoff]
    
        if df_sample.shape[1] < minSize:

            return pd.DataFrame(), '', set()

        print('\t', batch, df_sample.shape, end=' ')

        temp_genes = df_sample.index.intersection(genes)
        print(len(temp_genes), end=' ', flush=True)

        if metric == 'spearman':
            # The Spearman correlation distance is defined as "1. -  Spearman correlation coefficient".
            # The Spearman correlation coefficient is defined as the Pearson correlation coefficient between the ranks of variables.
            # Using specialized function "scipy.stats.spearmanr" is not recommended, since it consumes more than 10x memory and 2x time.
            # i.e. this is not efficient:
            # measure = 1. - scipy.stats.spearmanr(df_sample.loc[temp_genes].values, df_sample.values, axis=1).correlation[-len(df_sample.values):, :len(temp_genes)]

            measure = cdist(df_sample.loc[temp_genes].apply(lambda s: pd.Series(index=s.index, data=scipy.stats.rankdata(s.values)), axis=1), df_sample.apply(lambda s: pd.Series(index=s.index, data=scipy.stats.rankdata(s.values)), axis=1), metric='correlation').T
        else:
            measure = cdist(df_sample.loc[temp_genes].values, df_sample.values, metric=metric).T

        se_batch_corr = pd.DataFrame(measure, index=df_sample.index, columns=temp_genes).stack().reorder_levels([1, 0]).sort_index()
        se_batch_corr = pd.concat([se_batch_corr], keys=[batch], axis=1, sort=False)
        se_batch_corr = pd.concat([se_batch_corr], keys=[metric], axis=1, sort=False)

    except Exception as exception:
        print('\nERROR in correlation calculation:', exception, '\n')

        return pd.DataFrame(), '', set()

    return se_batch_corr.copy(), batch, pd.Series(index=se_batch_corr.index)

[docs]def reindexSeries(args):

    '''Assists in reindexing Series 

    Parameters:
        args: tuple
            Tuple that contains: 
                se: pandas.Series
                    Series to perform reindexing 
                    
                batch: str
                    Batch identifier 

                index: list or 1d numpy.array
                    List of genes 

    Returns:
        tuple:
            Results in form of a tuple:
                pandas.Series
                    Reindexed pandas.Series

                str
                    Batch identifier


    Usage:
        reindexSeries
    '''

    se, batch, index = args

    print('\t', batch, end=' ', flush=True)
    se = se.replace(0., np.nan).reindex(index)

    return se, batch

[docs]def get_df_distance(df, metric = 'correlation', genes = [], analyzeBy = 'batch', minSize = 10, groupBatches = True, pname = None, cutoff = 0.05, nCPUs = 4):

    '''Calculate distance measurement
        
    Parameters:
        df: pandas.DataFrame
            Expression data 

        metric: str, Default 'correlation'
            Metric name (e.g. 'correlation') 

        genes: list, Default []
            Genes for analysis

        analyzeBy: str, Default 'batch'
            Level to analyze data by (e.g. batches)

        minSize: int, Default 10
            Minimum size of input pandas.DataFrame

        groupBatches: boolean, Default True 
            Whether to group batched or save per-batch distance measure 

        pname: Default None
            Deprecated

        cutoff: float, Default 0.05
            Cutoff for percent expression of input data 

        nCPUs: int, Default 4
            Number of CPUs to use for multiprocessing

    Returns:
        pandas.DataFrame 
            Distance measure

    Usage:
        get_df_distance(df)
    '''

    print('\tMetric:', metric, '\tAnalyzeBy:', analyzeBy, '\tminSize:', minSize)
    print('df_expr', '%1.1fMb'%(sys.getsizeof(df) / 1024**2), flush=True)

    genes = np.unique(genes)
    print('\tReceived %s genes for analysis' % len(genes))

    temp_batches = np.unique(df.columns.get_level_values(analyzeBy).values)
    print('Staring multiprocessing for %s batches' % len(temp_batches), flush=True)
    pool = multiprocessing.Pool(processes=nCPUs)
    results = pool.map(getDistanceOfBatch, [(batch, df.xs(key=batch, level=analyzeBy, axis=1, drop_level=False), metric, genes, minSize, cutoff) for batch in temp_batches])
    pool.close()
    pool.join()
    print()

    del df

    index = pd.concat([item[2] for item in results if item[1] != ''], axis=0).index.drop_duplicates()
    print('\tIndex pairs:', len(index), '\tSelected genes:', len(index.levels[0]), '\tAll genes:', len(index.levels[1]))

    print('Starting multiprocessing for reindexing', flush=True)
    pool = multiprocessing.Pool(processes=nCPUs)
    results = pool.map(reindexSeries, [(item[0], item[1], index) for item in results if item[1] != ''])
    pool.close()
    pool.join()
    print()

    print('Merging reindexed batches', flush=True)
    df_batches = pd.DataFrame(index=index, dtype=float)
    for se, batch in results:
        print('\t', batch, end=' ', flush=True)
        df_batches[batch] = se
    print()

    del results
    
    print(df_batches, flush=True)
    print('df_batches', '%1.1fMb'%(sys.getsizeof(df_batches) / 1024**2), flush=True)

    if groupBatches:
        df_batches = pd.Series(data=np.nanmedian(df_batches.values, axis=1, overwrite_input=True), index=df_batches.index).unstack(0)

        for gene in df_batches.columns:
            try:
                df_batches.loc[gene, gene] = 0.
            except:
                pass

    return df_batches

[docs]def reduce(vIn, size = 100):

    '''Interpolate data to reduce the number of data points
        
    Parameters:
        vIn: 1d vector
            Data to resample

        size: int, Default 100
            New data size
        
    Returns:
        Resampled data

    Usage:
        reduce(vIn)
    '''

    '''
    def reduce(v, size = 100):

        bins =  np.linspace(np.min(v), np.max(v), num=size)

        return bins[np.digitize(v, bins) - 1]
    '''

    v = vIn.copy()

    wh = np.where(~np.isnan(v))[0]

    if len(wh) > 0:
        bins =  np.linspace(np.min(v[wh]), np.max(v[wh]), num=size)

        v[wh] = bins[np.digitize(v[wh], bins) - 1]
    else:
        v[0] = 0.

    return v

[docs]def metric_euclidean_missing(u, v):

    '''Metric of euclidean distance between two arrays, excluding missing points
        
    Parameters:
        u: 1d vector
            Data array

        v: 1d vector
            Data array
        
    Returns:
        ndarray 
            Non-negative squareroot of the array, element-wise

    Usage:
        metric_euclidean_missing(u, v)
    '''

    wh = np.where(~np.isnan(u * v))[0]
                    
    return np.sqrt(((u[wh] - v[wh])**2).sum())

[docs]def binomialEnrichmentProbability(nx_obj, enriched_genes, target_genes = False, background_genes = False, PCNpath = 'data/'):

    '''Takes in a network x object and list of enriched genes and calculates the binomial
    enrichment based on the number of enriched interaction there are.
    Uses Survival function (also defined as 1 - cdf): scipy.stats.binom.sf(k, n, p, loc=0)

    Parameters:
        nx_obj: networkx.Graph or str
            A networkx undirected graph or edge list file name.

        enriched_genes: list
            List of enriched genes.

        target_genes: list or boolean, Default False
            List of target_genes. Default use all genes in background.

        background_genes: list or boolean, Default False
            List of genes to use as background probability. Default use all genes in the network.

    Returns:
        pandas.DataFrame
    '''
    
    if not isinstance(nx_obj, nx.Graph):
        if isinstance(nx_obj, str):
            if not os.path.isfile(os.path.join(PCNpath, 'PCN.pklz')):
                try:
                    nx_obj = nx.read_edgelist(nx_obj).to_undirected()
                except Exception as exception:
                    print(exception)

                write(nx_obj, os.path.join(PCNpath, 'PCN'))
            else:
                nx_obj = read(os.path.join(PCNpath, 'PCN'))
        else:
            print("Error: nx_obj needs to be a networkx graph or a network edgelist file name")
            return

    if background_genes is False:
        background_genes = nx_obj.nodes()
        
    if target_genes is False:
        target_genes = background_genes

    enriched_genes = set(enriched_genes).intersection(background_genes)
    
    enr_with_enr = 0
    enr_non_enr = 0
    for gene in enriched_genes:
        gene_neighbors = set(nx_obj.neighbors(gene))
        enr_non_enr += len(gene_neighbors - enriched_genes)
        enr_with_enr += len(gene_neighbors.intersection(enriched_genes))
        
    # Divide by 2 to not double count enriched interaction
    enr_with_enr = int(enr_with_enr/2)
    
    p = float(enr_non_enr + enr_with_enr)/float(len(nx_obj.edges())) 
    nx_nodes = set(nx_obj.nodes())
    df_binom = pd.DataFrame()
    for gene in target_genes:
    
        if gene not in nx_nodes: 
            continue

        gene_neighbors = set(nx_obj.neighbors(gene))
        num_neighbors_enriched = len(gene_neighbors.intersection(enriched_genes))
        num_neighbors_background = len(gene_neighbors.intersection(background_genes))

        if num_neighbors_background == 0: 
            continue
        
        df_binom.loc[gene, 'Enriched_Interact'] = num_neighbors_enriched
        df_binom.loc[gene, 'Total_Interact'] = num_neighbors_background
        df_binom.loc[gene, 'Binomial_Prob'] = scipy.stats.binom.sf(num_neighbors_enriched, num_neighbors_background, p, loc=1)
        
    return df_binom

[docs]def getROC(data):

    '''Calculate axes of ROC (false positive rates and true positive rates) using index as thresholds
        
    Parameters:
        data: 1d vector
            Input data
        
    Returns:
        np.array 
            Array holding false positive rates

        np.array
            Array holding true positive rates

    Usage:
        getROC(data)
    '''

    fpr = []
    for i in range(len(data)):
        FP = np.sum(data[:i] == False)
        TN = np.sum(data[i:] == False)
        fpr.append(FP/(FP+TN))

    tpr = []
    for i in range(len(data)):
        TP = np.sum(data[:i] == True)
        FN = np.sum(data[i:] == True)
        tpr.append(TP/(TP+FN))

    return np.array(fpr), np.array(tpr)

[docs]def clusterData(data, n_clusters = None, method = 'Spectral', random_state = None):

    '''Return cluster labels
    
    Parameters: 
        data: np.array 
            Array of shape (features, objects)

        n_clusters: int, Default None 
            Number of desired clusters 

        method: str or int , Default 'Spectral'
            Method used cluster the data 
            Options: 1 or 'Agglomerative', 2 or 'Spectral', 3 or 'KMeans'

        random_state: int, Default None
            Used to determine randomness deterministic 
            Methods 2 and 3 initial state is random, unless random_state is specified

    Returns:
        list
            Cluster assignment of each object

    Usage:
        clusterData(data)

    '''

    if n_clusters is None:
        
        print('Specify number of clusters via n_clusters')
        return

    if method in ['Agglomerative', 1]:
        model = AgglomerativeClustering(n_clusters=n_clusters).fit(data.T)
        labels = model.labels_

    elif method in ['Spectral', 2]:
        model = SpectralCoclustering(n_clusters=n_clusters, random_state=random_state)
        model.fit(data.T)
        labels =  model.row_labels_

    elif method in ['KMeans', 3]:
        model = KMeans(n_clusters=n_clusters, random_state=random_state).fit(data.T)
        labels =  model.labels_

    print('Cluster sizes:', np.unique(labels, return_counts=True)[1])
                            
    return labels

[docs]def testNumbersOfClusters(data, text = '', n_min = 2, n_max = 20, k = 10):

    '''Test different number of clusters with KMeans, calculate ARI for each
    
    Parameters: 
        data: np.array 
            Array of shape (features, objects)

        text: str, Default ''
            String identifier

        n_min: int, Default 2
            Minimum number of cluster

        n_max: int, Default 20 
            Maximum number of cluster

        k: int, Default 10 
            Number of iterations for each clusters number

    Usage: 
        testNumbersofClusters(data)
    '''

    print()

    for n in range(n_min, n_max):
        labels = [KMeans(n_clusters=n).fit(data.T).labels_ for i in range(k)]

        score = np.zeros((k, k))
        for i in range(k):
            for j in range(k):
                score[i, j] = adjusted_rand_score(labels[i], labels[j])

        score[np.triu_indices(k, k=0, m=k)] = np.nan
        score = np.nanmean(score.flatten())

        print('%s, n=%s, k=%s, ARI=%s' % (text, n, k, score), flush=True)

    print()

    return

[docs]def KeyInStore(key, file):

    '''Check if a key present in HDF store file

    Parameters:
        key: str
            The key to check

        file: str
            Path to HDF store file

    Returns
        True or False
            Whether key is in store or not
    
    Usage:
        KeyInStore(key, file)
    '''

    try:
        with pd.HDFStore(file, mode='r') as hdf5file:
            if "/" + key.strip("/") in hdf5file.keys():
                return True
            else:
                return False

    except Exception as exception:
        print(exception)

    return

[docs]def KeysOfStore(file):

    '''Get list of keys in HDF store file

    Parameters:
        file: str
            Path to HDF store file

    Returns
        list
           Keys
    
    Usage:
        KeysOfStore(file)
    '''

    try:
        with pd.HDFStore(file, mode='r') as hdf5file:
            return list(hdf5file.keys())

    except Exception as exception:
        print(exception)

    return

[docs]def downloadFile(url, saveDir, saveName = None):

    '''Download a file from internet

    Parameters:
        url: str
            URL to download from

        saveDir: str
            Path to save downloaded file to

        saveName: str, Default None
            New name for downloaded file

    Returns
        None
    
    Usage:
        downloadFile(url, saveDir)
    '''

    if not os.path.exists(saveDir): 
        os.makedirs(saveDir)
    
    if saveName is None:
        saveName = url.strip('"').split('/')[-1:][0]

    path = os.path.join(saveDir, saveName)

    if os.path.isfile(path):
        print('File has been downloaded already')
    else:
        print('Downloading file:', url.strip('"'), end='\t', flush=True)

        try:
            urllib.request.urlretrieve(url.strip('"'), path)
            print('Done', flush=True)

        except Exception as exception:
            print(exception)

    return

[docs]def readRDataFile(fullPath, takeGeneSymbolOnly = True, saveToHDF = True, returnSizeOnly = False):

    '''Read R data file

    Parameters:
        fullPath: str
            Path to file

        takeGeneSymbolOnly: boolean, Default True
            Wether to save gene symbol only

        saveToHDF: boolean, Default True
            Wether to save data in HDF format

        returnSizeOnly: boolean, Default False
            Get data size and return, without reading the data itself

    Returns
        None
    
    Usage:
        readRDataFile(path)
    '''

    if (not returnSizeOnly) and os.path.isfile(fullPath):
        print('File already exists:', fullPath)

        df = pd.read_hdf(fullPath, key='df')

        return df

    from rpy2.robjects import r as R

    R['load'](fullPath[:-len('.h5')])
    ls = np.array(R['ls']())

    rSparseMatrix = R[ls[0]]

    print('Matrix size:', end='\t', flush=True)
    size = R('dim')(rSparseMatrix)
    print(size)

    if returnSizeOnly:

        return size[0], size[1]

    columns = pd.Index(np.array(R['colnames'](rSparseMatrix))).astype(str)
    index = pd.Index(np.array(R['rownames'](rSparseMatrix))).astype(str)

    if takeGeneSymbolOnly:
        index = index.str.split('_ENS', expand=True).get_level_values(0)
    
    R['writeMM'](obj=rSparseMatrix, file='%s.mtx' % (fullPath))

    df = pd.DataFrame(index=index, columns=columns, data=scipy.io.mmread('%s.mtx' % (fullPath)).toarray().astype(int))

    os.remove('%s.mtx' % (fullPath))

    df = df.loc[~df.index.duplicated(keep='first')]
    df = df.T.loc[~df.T.index.duplicated(keep='first')].T

    if saveToHDF:
        df.to_hdf(fullPath, key='df', mode='a', complevel=4, complib='zlib')

        return df

    return df
      
[docs]def normSum1(data):

    w = np.nansum(data)
    if w != w or w == 0.:
        w = 1.

    return np.nan_to_num(data) / w

[docs]def silhouette(data, n_clusters, cluster_labels, saveDir, saveName):

    u = np.unique(cluster_labels)
    n_clusters = len(u)

    df = pd.DataFrame(data=data, index=cluster_labels, columns=cluster_labels)
    print(df)

    print('DCS')
    dft = pd.DataFrame()
    for t in u:
        temp = df.loc[df.index==t, df.columns==t].values
        if temp.shape[0]>1:
            inside = temp[np.triu_indices(temp.shape[0], 1)].mean()
            outside = df.loc[df.index==t, df.columns!=t].values.mean()
            print(t, '\t', temp.shape[0], '\t', np.round(inside, 2), '\t', np.round(outside, 2))

    if type(cluster_labels[0]) in [str, np.str_]:
        dnames = {i:l for i,l in enumerate(u)}
        rdnames = {l:i for i,l in enumerate(u)}
        cluster_labels = np.array([rdnames[l] for l in cluster_labels])

    silhouette_avg = silhouette_score(data, cluster_labels)
    sample_silhouette_values = silhouette_samples(data, cluster_labels)

    fig, ax1 = plt.subplots(figsize=(7,7))

    y_lower = 1
    for i in range(n_clusters):
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i
        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)
        try:
            label = str(dnames[i])
        except:
            label = str(i)

        ax1.text(0., y_lower + 0.5 * size_cluster_i, '' + str(dnames[i]) + ', avg=' + str(np.round(np.mean(ith_cluster_silhouette_values), 2)) + ', max=' + str(np.round(np.max(ith_cluster_silhouette_values), 2)), fontsize=10, color='k', va='center', ha='center').set_path_effects([path_effects.Stroke(linewidth=1., foreground='white'), path_effects.Normal()])
        y_lower = y_upper + 1

    ax1.set_xlabel("Silhouette coefficient values")
    ax1.set_xlim([min(sample_silhouette_values) - 0.1, max(sample_silhouette_values) + 0.1])
    ax1.set_ylim([0, len(data) + (n_clusters + 1) * 1])

    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
    ax1.text(silhouette_avg + 0.025, ax1.get_ylim()[1], 'avg=' + str(np.round(silhouette_avg, 2)), color="red")

    ax1.spines['left'].set_visible(False)
    ax1.spines['right'].set_visible(False)
    ax1.spines['top'].set_visible(False)
    ax1.set_yticklabels([])
    ax1.set_yticks([])

    fig.savefig(os.path.join(saveDir, saveName + '.png'), dpi=300)

    return

[docs]def getPanglaoDBAnnotationsSummaryDf(dirName, saveToFile = True, printDf = False):

    df_metadata = pd.read_csv(os.path.join(dirName, 'PanglaoDB', 'data', 'metadata.txt'), index_col=[0, 1], header=None)
    df_metadata.index.names = ['SRA accession', 'SRS accession']
    df_metadata.columns = ['Tissue origin of the sample', 'scRNA-seq protocol', 'Species', 'Sequencing instrument', 'Number of expressed genes', 'Median number of expressed genes per cell', 'Number of cell clusters in this sample', 'Is the sample from a tumor? (1 true otherwise false)', 'Is the sample from primary adult tissue?', 'Is the sample from a cell line?']

    df_counts = pd.read_csv(os.path.join(dirName, 'PanglaoDB', 'data', 'counts.txt'), index_col=[0, 1], header=0).replace(0, np.nan)
    df_metadata = pd.concat([df_metadata, df_counts], axis=1, sort=False)
    df_metadata['Fraction of cells passed QC'] = df_metadata['Number of cells'] / df_metadata['Number of raw cells']

    df_metadata.sort_index(axis=1, inplace=True, ascending=False)

    df_cell_type_annotations = pd.read_csv(os.path.join(dirName, 'PanglaoDB', 'data', 'cell_type_annotations.txt'), index_col=[0, 1, 2], header=None)
    df_cell_type_annotations.index.names = ['SRA accession', 'SRS accession', 'Cluster index']
    df_cell_type_annotations.columns = ['Cell type annotation', 'P-value from Hypergeometric test', 'Adjusted p-value (BH)', 'Cell type Activity Score']

    df_clusters_to_number_of_cells = pd.read_csv(os.path.join(dirName, 'PanglaoDB', 'data', 'clusters_to_number_of_cells.txt'), index_col=[0, 1, 2], header=None)
    df_clusters_to_number_of_cells.index.names = ['SRA accession', 'SRS accession', 'Cluster index']
    df_clusters_to_number_of_cells.columns = ['Number of cells in cluster']
    df_cell_type_annotations = pd.concat([df_clusters_to_number_of_cells, df_cell_type_annotations], axis=1, sort=False)
    
    df_cell_type_annotations = df_cell_type_annotations.reindex(np.hstack([df_cell_type_annotations.columns, df_metadata.columns]), axis=1)
    df_cell_type_annotations.loc[:, df_metadata.columns] = df_metadata.loc[pd.MultiIndex.from_arrays([df_cell_type_annotations.index.get_level_values(0), df_cell_type_annotations.index.get_level_values(1)])].values

    df_cell_type_annotations = df_cell_type_annotations.replace('\\N', np.nan)

    for col in ['Cell type Activity Score',
                'Number of cell clusters in this sample',
                'Is the sample from primary adult tissue?',
                'Is the sample from a tumor? (1 true otherwise false)',
                'Is the sample from a cell line?']:
        df_cell_type_annotations[col] = df_cell_type_annotations[col].astype(float)

    celltypes = df_cell_type_annotations['Cell type annotation'].values
    #df_cell_type_annotations = df_cell_type_annotations.loc[celltypes==celltypes]

    if saveToFile:
        if not os.path.isfile(os.path.join(dirName, 'df_cell_type_annotations.xlsx')):
            df_cell_type_annotations.to_excel(os.path.join(dirName, 'df_cell_type_annotations.xlsx'), merge_cells=False)

    if printDf:
        print(df_cell_type_annotations)

    return df_cell_type_annotations

[docs]def makeBarplot(labels, saveDir, saveName):

    if not os.path.exists(saveDir):
        os.makedirs(saveDir)

    fig, ax = plt.subplots(figsize=(4.5,8), frameon=False)

    ulabels = np.sort(np.unique(labels))
    colors = pd.Series({label:cm.jet(ilabel / len(ulabels)) for ilabel, label in enumerate(ulabels)})
    se = (100. * pd.Series(index=labels, data=0).groupby(level=0).count() / len(labels)).round(1).sort_values(ascending=True)
    colors = colors.reindex(se.index).values

    ax.bar([0]*len(ulabels), se.values, width=1., edgecolor='white', bottom=np.append(0, np.cumsum(se.values)[:-1]), color=colors)

    poss = np.append(0, np.cumsum(se.values)[:-1]) + se.values*0.5

    texts = []
    bottom = 0.
    for i, label, value, y in zip(range(len(se)), se.index, se.values, poss):
        bottom += 2.5 if (value < 5. or bottom > y - 0.5*value) else 0.5*value
        texts.append(ax.text(1., bottom, '%s%% ' % value + label, fontsize=12, color='k'))
        bottom += 2.5 if (value < 5. or bottom > y - 0.5*value) else 0.5*value

    for text, value, y, color in zip(texts, se.values, poss, colors):
        text._x = 1.25
        ax.plot([0.55, text._x - 0.04], [y, text._y + 1.25], color=color, lw=1., clip_on=False)

    ax.set_xlim([-0.5, 6])

    plt.xticks([])
    plt.yticks([0, 20, 40, 60, 80, 100], ['0', '20%', '40%', '60%', '80%', '100%'], fontsize=10)

    ax.spines['left'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.spines['bottom'].set_visible(False)

    ax.yaxis.set_ticks_position('left')

    ax.set_title('%s cells' % len(labels))

    fig.savefig(os.path.join(saveDir, saveName + '.png'), dpi=300)
            
    return

[docs]def autoDetect1dGroups(se, halfWindowSize = 50, gaussianWidth = 15, **kwargs):

    se2 = pd.Series(index=np.arange(len(se)), data=scipy.ndimage.gaussian_filter1d(movingAverageCentered(se.values, halfWindowSize), gaussianWidth))

    kwargs.setdefault('threshold', 0.1)
    kwargs.setdefault('prominence', 0.1)
    kwargs.setdefault('distance', 100)

    peaks = getPeaks(se2.copy(), **kwargs)

    boundaries = []
    for i, peak in enumerate(peaks):
        a = 0 if i==0 else peaks[i-1]
        if i==0:
            boundaries.append([a, np.nan])
        else:
            if a==peak:
                boundaries.append([a, np.nan])
            else:
                v = se2.iloc[a:peak].sort_values().head(1).reset_index().values[0].tolist()
                boundaries.append([int(v[0]), v[1]])
    boundaries.append([len(se)-1, np.nan])

    peaksinfo = []
    for i, peak in enumerate(peaks):
        se_temp = se.iloc[boundaries[i][0]:boundaries[i+1][0]]
        members = se_temp[se_temp==1].index.values.tolist()
        height = np.round(se2[peak], 3)
        prominence = np.round(se2[peak] - np.nanmax([boundaries[i][1], boundaries[i+1][1]]), 3)
        peaksinfo.append([peak, height, prominence, members])
        print(height, '\t', prominence, '\t', ', '.join(members))
    print()

    return peaksinfo

[docs]def adjustTexts1D(texts, fig, ax, w = 'auto', direction = 'auto', maxIterations = 10**3, tolerance = 0.02):
    
    def get_text_position(text, ax):
        x, y = text.get_position()
        return text.get_transform().transform((ax.convert_xunits(x), ax.convert_yunits(y)))
    
    def set_text_position(text, x, y):
        return text.set_position(text.get_transform().inverted().transform((x, y)))

    extent = texts[0].get_window_extent(renderer=fig.canvas.get_renderer())
    
    if direction=='auto':
        direction = 'y' if extent.width > extent.height else 'x'
        
    if w == 'auto':      
        w = (extent.width if direction=='x' else extent.height) + 5
    
    orig_pos = [get_text_position(text, ax)[0 if direction=='x' else 1] for text in texts]
    
    for i, text in enumerate(texts):
        x, y = get_text_position(text, ax)
        set_text_position(text, x + i*w*(1 if direction=='x' else 0), y + i*w*(0 if direction=='x' else 1))
    
    objs = []
    for i_iter in range(maxIterations):
        curr_pos = [get_text_position(text, ax)[0 if direction=='x' else 1] for text in texts]
        
        obj = 0
        for i, (opos, text) in enumerate(zip(orig_pos, texts)):
            x, y = get_text_position(text, ax)
            p, q = (y, x) if direction=='x' else (x, y)
            
            cx = curr_pos[np.argsort(np.abs(curr_pos - q))[1]]
            ov, ovdel = max(0, w + min(q, cx) - max(q, cx)), max(0, w + min(q, cx+0.001) - max(q, cx+0.001))
            
            delta1 = 0.1*ov*np.sign(ovdel - ov)
            delta2 = np.sign(q - opos) * np.abs(q - opos)/25.

            delta = delta1 - delta2 * ((w-ov)/w)**2.5
            
            set_text_position(text, *((q + delta, p) if direction=='x' else (p, q + delta)))
            
            obj += 5.*ov + np.abs(q - opos)
        objs.append(obj)
        
        try:
            if abs(np.mean(objs[-20:]) - np.mean(objs[-40:-20])) < tolerance:
                break
        except:
            pass
        
    return