Source code for scSemiProfiler_dev.initial_setup

import pdb,sys,os
import anndata
import scanpy as sc
import argparse
import copy
import numpy as np
from sklearn.cluster import KMeans

[docs]def initsetup(name:str, bulk:str,normed:str,geneselection:str,batch:int) -> None:
    """
    Initial setup of the semi-profiling pipeline, including processing the bulk data, clustering for finding the initial representatives.
    
    Parameters
    ----------
    name
        Project name. 
    bulk
        Path to bulk data. 
    normed
        Whether the data has been library size normed or not. 
    geneselection
        Whether to perform gene selection.
    batch 
        Representative selection batch size.
    
    
    Example
    --------
    >>> import scSemiProfiler
    >>> name = 'runexample'
    >>> bulk = 'example_data/bulkdata.h5ad'
    >>> normed = 'yes'
    >>> geneselection = 'no'
    >>> batch = 2
    >>> scSemiProfiler.initsetup(name, bulk,normed,geneselection,batch)

    """
    
    print('Start initial setup')
    
    if (os.path.isdir(name)) == False:
        os.system('mkdir '+name)
    else:
        print(name + ' exists. Please choose another name.')
        return
    
    
    
    bulkdata = anndata.read_h5ad(bulk)

    if normed == 'no':
        sc.pp.normalize_total(bulkdata, target_sum=1e4)
        
    # write sample ids
    sids = list(bulkdata.obs['sample_ids'])
    f = open(name+'/sids.txt','w')
    for sid in sids:
        f.write(sid+'\n')
    f.close()
    
    # log transformation
    sc.pp.log1p(bulkdata)
    
    if geneselection == 'no':
        hvgenes = np.array(bulkdata.var.index)
    else:
        sc.pp.highly_variable_genes(bulkdata, n_top_genes=6000)
        #sc.pp.highly_variable_genes(bulkdata, min_mean=0.0125, max_mean=3, min_disp=0.5)
        bulkdata = bulkdata[:, bulkdata.var.highly_variable]
        hvgenes = (np.array(bulkdata.var.index))[bulkdata.var.highly_variable]
    np.save(name+'/hvgenes.npy',hvgenes)
    
    #dim reduction and clustering
    sc.tl.pca(bulkdata)
    
    bulkdata.write(name + '/processed_bulkdata.h5ad')
    
    #cluster
    BATCH_SIZE = batch
    kmeans = KMeans(n_clusters=BATCH_SIZE, random_state=0).fit(bulkdata.obsm['X_pca'])
    cluster_labels = kmeans.labels_
    #find representatives and cluster labels
    pnums = []
    for i in range(len(bulkdata.X)):
        pnums.append(i)
    pnums=np.array(pnums)
    centers=[]
    representatives=[]
    repredic={}
    for i in range(len(np.unique(cluster_labels))):
        mask = (cluster_labels==i)
        cluster = bulkdata.obsm['X_pca'][mask]
        cluster_patients = pnums[mask]
        center = cluster.mean(axis=0)
        centers.append(center)
        # find the closest patient
        sqdist = ((cluster - center)**2).sum(axis=1)
        cluster_representative = cluster_patients[np.argmin(sqdist)]
        representatives.append(cluster_representative)
        repredic[i] = cluster_representative
    centers = np.array(centers)
    #store representatives cluster labels
    if (os.path.isdir(name + '/status')) == False:
        os.system('mkdir ' + name + '/status')
    

    f=open(name + '/status/init_cluster_labels.txt','w')
    for i in range(len(cluster_labels)):
        f.write(str(cluster_labels[i])+'\n')
    f.close()

    f=open(name + '/status/init_representatives.txt','w')
    for i in range(len(representatives)):
        f.write(str(representatives[i])+'\n')
    f.close()
    
    print('Initial setup finished. Among ' + str(len(sids)) + ' total samples, selected '+str(batch)+' representatives:')
    for i in range(batch):
        print(sids[representatives[i]])
    
    return




def main():
    parser=argparse.ArgumentParser(description="scSemiProfiler initsetup")
    parser._action_groups.pop()
    required = parser.add_argument_group('required arguments')
    optional = parser.add_argument_group('optional arguments')
    
    required.add_argument('--bulk',required=True,help="Input bulk data as a h5ad file. Sample IDs should be stored in obs.['sample_ids']. Gene symbols should be stored in var.index.")
    
    required.add_argument('--name',required=True, help="Project name.")
    
    optional.add_argument('--normed',required=False, default='no', help="Whether the library size normalization has already been done (Default: no)") ###
    
    optional.add_argument('--geneselection',required=False,default='yes', help="Whether to perform highly variable gene selection: 'yes' or 'no'. (Default: yes)")
    
    optional.add_argument('--batch',required=False, default=4, help="The representative sample batch size (Default: 4)")
    
    args = parser.parse_args()
    bulk = args.bulk
    name = args.name
    geneselection = args.geneselection
    normed = args.normed
    batch = int(args.batch)
    
    initsetup(name,bulk,normed,geneselection,batch)

if __name__=="__main__":
    main()