Skip to content

Commit

Permalink
perf(main): Command line tool supports a new input: filtered_feature_…
Browse files Browse the repository at this point in the history
…bc_matrix.h5
  • Loading branch information
Sheng, Caibin committed Aug 1, 2022
1 parent 2faed3e commit 73bc13e
Show file tree
Hide file tree
Showing 2 changed files with 158 additions and 42 deletions.
185 changes: 149 additions & 36 deletions scar/main/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@

import argparse

# from distutils.command.config import config
import os
import pandas as pd
import scanpy as sc
from scipy.sparse import csr_matrix
from ._scar import model
from .__version__ import __version__

Expand All @@ -25,6 +26,7 @@ def main():
nn_layer2 = args.hidden_layer2
latent_dim = args.latent_dim
epochs = args.epochs
device = args.device
sparsity = args.sparsity
save_model = args.save_model
batch_size = args.batchsize
Expand All @@ -33,7 +35,77 @@ def main():
cutoff = args.cutoff
moi = args.moi
round_to_int = args.round2int
count_matrix = pd.read_pickle(count_matrix_path)

_, file_extension = os.path.splitext(count_matrix_path)

if file_extension == ".pickle":
count_matrix = pd.read_pickle(count_matrix_path)

# Denoising transcritomic data
elif file_extension == ".h5":
adata = sc.read_10x_h5(count_matrix_path, gex_only=False)

print(
"unprocessed data contains: {0} cells and {1} genes".format(
adata.shape[0], adata.shape[1]
)
)
adata = adata[:, adata.X.sum(axis=0) > 0] # filter out features of zero counts
print(
"filter out features of zero counts, remaining data contains: {0} cells and {1} genes".format(
adata.shape[0], adata.shape[1]
)
)

if feature_type.lower() == "all":
features = adata.var["feature_types"].unique()
count_matrix = adata.to_df()

# Denoising mRNAs
elif feature_type.lower() in ["mrna", "mrnas"]:
features = "Gene Expression"
adata_fb = adata[:, adata.var["feature_types"] == features]
count_matrix = adata_fb.to_df()

# Denoising sgRNAs
elif feature_type.lower() in ["sgrna", "sgrnas"]:
features = "CRISPR Guide Capture"
adata_fb = adata[:, adata.var["feature_types"] == features]
count_matrix = adata_fb.to_df()

# Denoising CMO tags
elif feature_type.lower() in ["tag", "tags"]:
features = "Multiplexing Capture"
adata_fb = adata[:, adata.var["feature_types"] == features]
count_matrix = adata_fb.to_df()

# Denoising CMO tags
elif feature_type.lower() in ["tag", "tags"]:
features = "Antibody Capture"
adata_fb = adata[:, adata.var["feature_types"] == features]
count_matrix = adata_fb.to_df()

print(f"{len(features)} modalities to denoise: {features}")

else:
raise Exception(file_extension + " files are not supported.")

if ambient_profile_path:

_, ambient_profile_file_extension = os.path.splitext(ambient_profile_path)
if ambient_profile_file_extension == ".pickle":
ambient_profile = pd.read_pickle(ambient_profile_path)

# Currently, use the default approach to calculate the ambient profile in the case of h5
elif ambient_profile_file_extension == ".h5":
ambient_profile = None

else:
raise Exception(
ambient_profile_file_extension + " files are not supported."
)
else:
ambient_profile = None

print("===========================================")
print("feature_type: ", feature_type)
Expand All @@ -49,14 +121,15 @@ def main():

# Run model
scar_model = model(
raw_count=count_matrix_path,
ambient_profile=ambient_profile_path,
raw_count=count_matrix,
ambient_profile=ambient_profile,
nn_layer1=nn_layer1,
nn_layer2=nn_layer2,
latent_dim=latent_dim,
feature_type=feature_type,
count_model=count_model,
sparsity=sparsity,
device=device,
)

scar_model.train(
Expand All @@ -74,38 +147,71 @@ def main():
scar_model.assignment(cutoff=cutoff, moi=moi)

print("===========================================\n Saving results...")
output_path01, output_path02, output_path03, output_path04 = (
os.path.join(output_dir, "denoised_counts.pickle"),
os.path.join(output_dir, "BayesFactor.pickle"),
os.path.join(output_dir, "native_frequency.pickle"),
os.path.join(output_dir, "noise_ratio.pickle"),
)

# save results
pd.DataFrame(
scar_model.native_counts, index=count_matrix.index, columns=count_matrix.columns
).to_pickle(output_path01)
pd.DataFrame(
scar_model.bayesfactor, index=count_matrix.index, columns=count_matrix.columns
).to_pickle(output_path02)
pd.DataFrame(
scar_model.native_frequencies,
index=count_matrix.index,
columns=count_matrix.columns,
).to_pickle(output_path03)
pd.DataFrame(
scar_model.noise_ratio, index=count_matrix.index, columns=["noise_ratio"]
).to_pickle(output_path04)

print("...denoised counts saved in: ", output_path01)
print("...BayesFactor matrix saved in: ", output_path02)
print("...expected native frequencies saved in: ", output_path03)
print("...expected noise ratio saved in: ", output_path04)
if file_extension == ".pickle":

if feature_type.lower() in ["sgrna", "sgrnas", "tag", "tags", "cmo", "cmos"]:
output_path05 = os.path.join(output_dir, "assignment.pickle")
scar_model.feature_assignment.to_pickle(output_path05)
print("...assignment saved in: ", output_path05)
output_path01, output_path02, output_path03, output_path04 = (
os.path.join(output_dir, "denoised_counts.pickle"),
os.path.join(output_dir, "BayesFactor.pickle"),
os.path.join(output_dir, "native_frequency.pickle"),
os.path.join(output_dir, "noise_ratio.pickle"),
)

pd.DataFrame(
scar_model.native_counts,
index=count_matrix.index,
columns=count_matrix.columns,
).to_pickle(output_path01)
pd.DataFrame(
scar_model.bayesfactor,
index=count_matrix.index,
columns=count_matrix.columns,
).to_pickle(output_path02)
pd.DataFrame(
scar_model.native_frequencies,
index=count_matrix.index,
columns=count_matrix.columns,
).to_pickle(output_path03)
pd.DataFrame(
scar_model.noise_ratio, index=count_matrix.index, columns=["noise_ratio"]
).to_pickle(output_path04)

print("...denoised counts saved in: ", output_path01)
print("...BayesFactor matrix saved in: ", output_path02)
print("...expected native frequencies saved in: ", output_path03)
print("...expected noise ratio saved in: ", output_path04)

if feature_type.lower() in ["sgrna", "sgrnas", "tag", "tags", "cmo", "cmos"]:

output_path05 = os.path.join(output_dir, "assignment.pickle")
scar_model.feature_assignment.to_pickle(output_path05)
print("...assignment saved in: ", output_path05)

elif file_extension == ".h5":

output_path_h5ad = os.path.join(
output_dir, "filtered_feature_bc_matrix_denoised_{feature_type}.h5ad"
)

denoised_adata = adata.copy()
denoised_adata.X = csr_matrix(scar_model.native_counts)
denoised_adata.obs["noise_ratio"] = pd.DataFrame(
scar_model.noise_ratio,
index=count_matrix.index,
columns=["noise_ratio"],
)

denoised_adata.layers["native_frequencies"] = csr_matrix(
scar_model.native_frequencies
)
denoised_adata.layers["BayesFactor"] = csr_matrix(scar_model.bayesfactor)

if feature_type.lower() in ["sgrna", "sgrnas", "tag", "tags", "cmo", "cmos"]:
denoised_adata.obs = denoised_adata.obs.join(scar_model.feature_assignment)

denoised_adata.write(output_path_h5ad)
print("the denoised h5ad file saved in: ", output_path_h5ad)

print("===========================================\n Done!!!")

Expand Down Expand Up @@ -141,14 +247,14 @@ def scar_parser():
"count_matrix",
type=str,
nargs="+",
help="The file of observed count matrix, 2D array (cells x genes)",
help="The file of observed count matrix, 2D array (cells x genes) or the path of a filtered_feature_bc_matrix.h5",
)
parser.add_argument(
"-ap",
"--ambient_profile",
type=str,
default=None,
help="The file of empty profile obtained from empty droplets, 1D array",
help="The file of empty profile obtained from empty droplets, 1D array or the path of a raw_feature_bc_matrix.h5",
)
parser.add_argument(
"-ft",
Expand Down Expand Up @@ -197,6 +303,13 @@ def scar_parser():
parser.add_argument(
"-epo", "--epochs", type=int, default=800, help="Training epochs"
)
parser.add_argument(
"-device",
"--device",
type=str,
default="auto",
help="Device used for training, either 'auto', 'cpu', or 'cuda'",
)
parser.add_argument(
"-s",
"--save_model",
Expand All @@ -215,7 +328,7 @@ def scar_parser():
"-batchsize_infer",
"--batchsize_infer",
type=int,
default=None,
default=4096,
help="Batch size for inference, set a small value upon out of memory error",
)
parser.add_argument(
Expand Down
15 changes: 9 additions & 6 deletions scar/main/_scar.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,12 +194,15 @@ def __init__(
dropout_prob: float = 0,
feature_type: str = "mRNA",
count_model: str = "binomial",
sparsity: float = .9
sparsity: float = .9,
device: str = 'auto'
):
"""initialize object"""

self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
"""str, "cuda" if gpu is available
if device == 'auto':
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
else:
self.device = device
"""str, either "auto, "cpu" or "cuda".
"""
self.nn_layer1 = nn_layer1
"""int, number of neurons of the 1st layer.
Expand Down Expand Up @@ -547,14 +550,14 @@ def train(
# Inference
@torch.no_grad()
def inference(
self, batch_size=None, count_model_inf="poisson", adjust="micro", cutoff=3, round_to_int="stochastic_rounding", moi=None
self, batch_size=4096, count_model_inf="poisson", adjust="micro", cutoff=3, round_to_int="stochastic_rounding", moi=None
):
"""inference infering the expected native signals, noise ratios, Bayesfactors and expected native frequencies
Parameters
----------
batch_size : int, optional
batch size, set a small value upon GPU memory issue, by default None
batch size, set a small value upon GPU memory issue, by default 4096
count_model_inf : str, optional
inference model for evaluation of ambient presence, by default "poisson"
adjust : str, optional
Expand Down

0 comments on commit 73bc13e

Please # to comment.