Source code for scSemiProfiler_dev.singlecell_process

import pdb,sys,os
import anndata
import scanpy as sc
import argparse
import copy
import torch
import numpy as np
import gc
import pandas as pd
import timeit
import scipy
import warnings
warnings.filterwarnings('ignore')
import faiss
from sklearn.cluster import KMeans
import sklearn
from scipy import stats
from sklearn.neighbors import kneighbors_graph
from matplotlib.pyplot import figure

from typing import Tuple
from torch.utils.data import Dataset



def gen_tf_gene_table(genes, tf_list, dTD):
    """
    Adapted from:
    Author: Jun Ding
    Project: SCDIFF2
    Ref: Ding, J., Aronow, B. J., Kaminski, N., Kitzmiller, J., Whitsett, J. A., & Bar-Joseph, Z.
    (2018). Reconstructing differentiation networks and their regulation from time series
    single-cell expression data. Genome research, 28(3), 383-395.
    """
    

    gene_names = [g.upper() for g in genes]
    TF_names = [g.upper() for g in tf_list]
    tf_gene_table = dict.fromkeys(tf_list)

    for i, tf in enumerate(tf_list):
        tf_gene_table[tf] = np.zeros(len(gene_names))
        _genes = dTD[tf]

        _existed_targets = list(set(_genes).intersection(gene_names))
        _idx_targets = map(lambda x: gene_names.index(x), _existed_targets)

        for _g in _idx_targets:
            tf_gene_table[tf][_g] = 1

    del gene_names
    del TF_names
    del _genes
    del _existed_targets
    del _idx_targets

    gc.collect()

    return tf_gene_table



def getGeneSetMatrix(_name, genes_upper, gene_sets_path):
    """

    Adapted from:
    Author: Jun Ding
    Project: SCDIFF2
    Ref: Ding, J., Aronow, B. J., Kaminski, N., Kitzmiller, J., Whitsett, J. A., & Bar-Joseph, Z.
    (2018). Reconstructing differentiation networks and their regulation from time series
    single-cell expression data. Genome research, 28(3), 383-395.

    """
    if _name[-3:] == 'gmt':
        print(f"GMT file {_name} loading ... ")
        filename = _name
        filepath = os.path.join(gene_sets_path, f"{filename}")

        with open(filepath) as genesets:
            pathway2gene = {line.strip().split("\t")[0]: line.strip().split("\t")[2:]
                            for line in genesets.readlines()}

        print(len(pathway2gene))

        gs = []
        for k, v in pathway2gene.items():
            gs += v

        print(f"Number of genes in {_name} {len(set(gs).intersection(genes_upper))}")

        pathway_list = pathway2gene.keys()
        pathway_gene_table = gen_tf_gene_table(genes_upper, pathway_list, pathway2gene)
        gene_set_matrix = np.array(list(pathway_gene_table.values()))
        keys = pathway_gene_table.keys()

        del pathway2gene
        del gs
        del pathway_list
        del pathway_gene_table

        gc.collect()


    elif _name == 'TF-DNA':

        # get TF-DNA dictionary
        # TF->DNA
        def getdTD(tfDNA):
            dTD = {}
            with open(tfDNA, 'r') as f:
                tfRows = f.readlines()
                tfRows = [item.strip().split() for item in tfRows]
                for row in tfRows:
                    itf = row[0].upper()
                    itarget = row[1].upper()
                    if itf not in dTD:
                        dTD[itf] = [itarget]
                    else:
                        dTD[itf].append(itarget)

            del tfRows
            del itf
            del itarget
            gc.collect()

            return dTD

        from collections import defaultdict

        def getdDT(dTD):
            gene_tf_dict = defaultdict(lambda: [])
            for key, val in dTD.items():
                for v in val:
                    gene_tf_dict[v.upper()] += [key.upper()]

            return gene_tf_dict

        tfDNA_file = os.path.join(gene_sets_path, f"Mouse_TF_targets.txt")
        dTD = getdTD(tfDNA_file)
        dDT = getdDT(dTD)

        tf_list = list(sorted(dTD.keys()))
        tf_list.remove('TF')

        tf_gene_table = gen_tf_gene_table(genes_upper, tf_list, dTD)
        gene_set_matrix = np.array(list(tf_gene_table.values()))
        keys = tf_gene_table.keys()

        del dTD
        del dDT
        del tf_list
        del tf_gene_table

        gc.collect()

    else:
        gene_set_matrix = None

    return gene_set_matrix, keys

        
def fast_cellgraph(adata: anndata.AnnData,k: int = 15,diagw: float=1.0) -> Tuple[anndata.AnnData, np.ndarray]:
    """
    Augment an anndata object using a cell neighbor graph. 

    Parameters
    ----------
    adata
        The dataset to be augmented
    k
        The number of neighbors to consider
    diagw
        The weight of the original cell when agregating the information

    Returns
    -------
    adata
        The augmented anndata object. 
    adj
        The adjacency matrix of the cell neighbor graph.
    """
    
    
    adj = kneighbors_graph(np.array(adata.X), k, mode='connectivity', include_self=True)
    adj = adj.toarray()
    diag = np.array(np.identity(adj.shape[0]).astype('float32'))*diagw
    adj = adj + diag
    adj = adj/adj.sum(axis=1)        
    selfw = np.zeros(adj.shape[0])
    for i in range(adj.shape[0]):
        selfw[i] = adj[i,i]
    selfw=selfw.astype('float32')
    adata.obs['selfw']=selfw
    #remove self so that not in neighbors
    for i in range(adj.shape[0]):
        adj[i,i]=0
        
    adata.obsm['adj'] = adj
    adj = torch.from_numpy(adj.astype('float32'))#.type(torch.FloatTensor)
    neighboridx = np.where(adj!=0)
    xs = neighboridx[0]
    ys = neighboridx[1]
        
    maxn=k
    neighbors = np.zeros((adj.shape[0],maxn-1)) - 1
    for i in range(len(adata.obs)):
        ns=np.zeros(maxn-1)-1
        flag=0
        j=0
        k=0
        while flag!=2 and j<xs.shape[0]:
            if xs[j]==i:
                ns[k] = (ys[j])
                k+=1
                flag=1
            elif flag==1:
                flag=2
            j+=1
        neighbors[i] = ns
        
    neighbors = neighbors.astype(int)
    adata.obsm['neighbors']=neighbors
    neighborx = np.array(adata.X)
    
    normchoice = 0
    if normchoice == 0:
        neighborx = np.log(1+neighborx)*selfw[0] + np.log(1+neighborx[neighbors,:]).sum(axis=1)*(1-selfw[0])/(maxn-1)
    else:                                       ## 2*
        neighborx = neighborx*selfw[0] + neighborx[neighbors].sum(axis=1)*(1-selfw[0])/(maxn-1)
        neighborx = np.log(1+neighborx)
    
    adata.obsm['neighborx']=neighborx
    
    return adata,adj



    
[docs]def scprocess(name:str,singlecell:str,normed:str='yes',cellfilter:str='no',threshold:float = 1e-3,geneset:str = 'human',weight:float=0.5,k:int=15) -> None:
    """
    Process the reprsentatives' single-cell data, including preprocessing and feature augmentations.

    Parameters
    ----------
    name
        Project name.
    singlecell
        Path to representatives' single-cell data.
    normed
        Whether the data has been library size normed or not.
    cellfilter
        Whether to perform cell selection.
    threshold
        Threshold for background noise removal.
    geneset
        Gene set file name. 
    weight
        The proportion of top features to increase importance weight.
    k
        K for the K-NN graph built for cells.
        
        
    Example
    -------
    >>> scSemiProfiler.scprocess(name='project_name',singlecell=name+'/representative_sc.h5ad',normed='yes',cellfilter='no',threshold=1e-3,geneset='yes',weight='yes',k=15)
    
    
    """
    
    print('Processing representative single-cell data')
    
    scdata = anndata.read_h5ad(singlecell)
    sids = np.unique(scdata.obs['sample_ids'])
    
    # cell filtering
    if cellfilter == 'yes':
        print('Filtering cells')
        sc.pp.filter_cells(scdata, min_genes=200)
        scdata.var['mt'] = scdata.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
        sc.pp.calculate_qc_metrics(scdata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
        scdata = scdata[scdata.obs.n_genes_by_counts < 2500, :]
        scdata = scdata[scdata.obs.pct_counts_mt < 5, :]
    
    if normed == 'no':
        print('Library size normalization.')
        sc.pp.normalize_total(scdata, target_sum=1e4)    
    
    # convert to dense if sparse
    if type(scdata.X) == scipy.sparse._csr.csr_matrix:
        X = np.array(scdata.X.todense())
        tempdata = anndata.AnnData(X)
        tempdata.obs = scdata.obs
        tempdata.var = scdata.var
        scdata = tempdata
    
    
    # norm remove noise
    if float(threshold) > 0:
        print('Removing background noise')
        X = np.array(scdata.X)
        cutoff = 1e4*threshold
        X = X * np.array(X>cutoff)
        nscdata = anndata.AnnData(X)
        nscdata.obs = scdata.obs
        nscdata.var = scdata.var
        nscdata.uns = scdata.uns
        scdata = nscdata
    
    
    
    # store singlecell data, geneset score
    if (os.path.isdir(name + '/sample_sc')) == False:
        os.system('mkdir ' + name + '/sample_sc')
    if (os.path.isdir(name + '/geneset_scores')) == False:
        os.system('mkdir ' + name + '/geneset_scores')
        
    if geneset != 'none':
        prior_name = "c2.cp.v7.4.symbols.gmt" # "c5.go.bp.v7.4.symbols.gmt+c2.cp.v7.4.symbols.gmt+TF-DNA"
    

        
    print('Computing geneset scores')
    zps=[]
    for sid in sids:
        adata = scdata[scdata.obs['sample_ids'] == sid]
        X = adata.X

        gene_sets_path = "genesets/"
        genes_upper = [g.upper() for g in list(adata.var.index)]
        N = adata.X.shape[0]
        G = len(genes_upper)
        gene_set_matrix, keys_all = getGeneSetMatrix(prior_name, genes_upper, gene_sets_path)
        
        zp = X.dot(np.array(gene_set_matrix).T)
        eps = 1e-6
        den = (np.array(gene_set_matrix.sum(axis=1))+eps)
        zp = (zp+eps)/den
        zp = zp - eps/den
        np.save(name + '/geneset_scores/' + sid,zp)
        zps.append(zp)
    
    if 'hvset.npy' not in os.listdir(name):
        zps=np.concatenate(zps,axis=0)
        zdata = anndata.AnnData(zps)
        sc.pp.log1p(zdata)
        sc.pp.highly_variable_genes(zdata)
        hvset = zdata.var.highly_variable
        np.save(name + '/hvset.npy',hvset)

        
    
        
    # select highly variable genes (genes in preprocessed bulk data)
    hvgenes = np.load(name + '/hvgenes.npy', allow_pickle = True)
    hvmask = []
    for i in scdata.var.index:
        if i in hvgenes:
            hvmask.append(True)
        else:
            hvmask.append(False)
    hvmask = np.array(hvmask)
    scdata = scdata[:,hvmask]
    np.save(name + '/hvmask.npy',hvmask)
    

    print('Augmenting and saving single-cell data.')
    for sid in sids:
        adata = scdata[scdata.obs['sample_ids'] == sid]
        
        # gcn
        adata.obs['cellidx']=range(len(adata.obs))
        adata,adj = fast_cellgraph(adata,k=k,diagw=1.0)
        
        
        # # importance weight
        sample_geneset = np.load(name + '/geneset_scores/'+sid+'.npy')
        setmask = np.load(name + '/hvset.npy')
        sample_geneset = sample_geneset[:,setmask]
        sample_geneset = sample_geneset.astype('float32')
        
        features = np.concatenate([adata.X,sample_geneset],1)
        
        variances = np.var(features,axis=0)
        adata.uns['feature_var'] = variances
        
        adata.write(name + '/sample_sc/' + sid + '.h5ad')
    
    print('Finished processing representative single-cell data')
    return 




def main():
    parser=argparse.ArgumentParser(description="scSemiProfiler scprocess")
    parser._action_groups.pop()
    required = parser.add_argument_group('required arguments')
    optional = parser.add_argument_group('optional arguments')
    
    required.add_argument('--singlecell',required=True,help="Input representatives' single-cell data as a h5ad file. Sample IDs should be stored in obs.['sample_ids']. Cell IDs should be stored in obs.index. Gene symbols should be stored in var.index. Values should either be raw read counts or normalized expression.")
    
    required.add_argument('--name',required=True, help="Project name.")
    
    optional.add_argument('--normed',required=False, default='no', help="Whether the library size normalization has already been done (Default: no)") ###
    
    
    optional.add_argument('--cellfilter',required=False, default='yes', help="Whether to perform cell filtering: 'yes' or 'no'. (Default: yes)")
    optional.add_argument('--threshold',required=False, default='1e-3', help="The threshold for removing extremely low expressed background noise, as a proportion of the library size. (Default: 1e-3)")
    optional.add_argument('--geneset',required=False, default='human', help="Specify the gene set file: 'human', 'mouse', 'none', or path to the file (Default: 'human')")
    optional.add_argument('--weight',required=False, default=0.5, help="The proportion of top highly variable features to increase importance weight. (Default: 0.5)")
    optional.add_argument('--k',required=False, default=15, help="K-nearest cell neighbors used for cell graph convolution. (Default: 15)")
    
    args = parser.parse_args()
    singlecell = args.singlecell
    normed = args.normed
    name = args.name
    cellfilter = args.cellfilter
    threshold = float(args.threshold)
    geneset = args.geneset
    weight = args.weight
    k = args.k
    
    scprocess(name,singlecell,normed,cellfilter,threshold,geneset,weight,k)

if __name__=="__main__":
    main()