Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
# Conflicts:
#	src/palantir/io.py
  • Loading branch information
awnimo committed May 6, 2020
1 parent 5c88f25 commit 6d80708
Show file tree
Hide file tree
Showing 3 changed files with 131 additions and 3 deletions.
8 changes: 6 additions & 2 deletions src/palantir/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,9 @@ def run_palantir(

# pseudotime and weighting matrix
print("Determining pseudotime...")
pseudotime, W = _compute_pseudotime(data, start_cell, knn, waypoints, n_jobs, max_iterations)
pseudotime, W = _compute_pseudotime(
data, start_cell, knn, waypoints, n_jobs, max_iterations
)

# Entropy and branch probabilities
print("Entropy and branch probabilities...")
Expand Down Expand Up @@ -274,7 +276,9 @@ def identify_terminal_states(
waypoints = pd.Index([start_cell]).append(waypoints)

# Distance to start cell as pseudo pseudotime
pseudotime, _ = _compute_pseudotime(data, start_cell, knn, waypoints, n_jobs, max_iterations)
pseudotime, _ = _compute_pseudotime(
data, start_cell, knn, waypoints, n_jobs, max_iterations
)

# Markov chain
wp_data = data.loc[waypoints, :]
Expand Down
124 changes: 124 additions & 0 deletions src/palantir/io.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
import numpy as np
import pandas as pd
import os.path
import fcsparser
import scanpy as sc
from scipy.io import mmread


def _clean_up(df):
df = df.loc[df.index[df.sum(axis=1) > 0], :]
df = df.loc[:, df.columns[df.sum() > 0]]
return df


def from_csv(counts_csv_file, delimiter=","):
# Read in csv file
df = pd.read_csv(counts_csv_file, sep=delimiter, index_col=0)
clean_df = _clean_up(df)
return clean_df


def from_mtx(mtx_file, gene_name_file):

# Read in mtx file
count_matrix = mmread(mtx_file)

gene_names = np.loadtxt(gene_name_file, dtype=np.dtype("S"))
gene_names = np.array([gene.decode("utf-8") for gene in gene_names])

# remove todense
df = pd.DataFrame(count_matrix.todense(), columns=gene_names)

return _clean_up(df)


def from_10x(data_dir, use_ensemble_id=True):
# loads 10x sparse format data
# data_dir is dir that contains matrix.mtx, genes.tsv and barcodes.tsv
# return_sparse=True -- returns data matrix in sparse format (default = False)

if data_dir is None:
data_dir = "./"
elif data_dir[len(data_dir) - 1] != "/":
data_dir = data_dir + "/"

filename_dataMatrix = os.path.expanduser(data_dir + "matrix.mtx")
filename_genes = os.path.expanduser(data_dir + "genes.tsv")
filename_cells = os.path.expanduser(data_dir + "barcodes.tsv")

# Read in gene expression matrix (sparse matrix)
# Rows = genes, columns = cells
dataMatrix = mmread(filename_dataMatrix)

# Read in row names (gene names / IDs)
gene_names = np.loadtxt(filename_genes, delimiter="\t", dtype=bytes).astype(str)
if use_ensemble_id:
gene_names = [gene[0] for gene in gene_names]
else:
gene_names = [gene[1] for gene in gene_names]
cell_names = np.loadtxt(filename_cells, delimiter="\t", dtype=bytes).astype(str)

dataMatrix = pd.DataFrame(
dataMatrix.todense(), columns=cell_names, index=gene_names
)

# combine duplicate genes
if not use_ensemble_id:
dataMatrix = dataMatrix.groupby(dataMatrix.index).sum()
dataMatrix = dataMatrix.transpose()

return _clean_up(dataMatrix)


def from_10x_HDF5(filename, genome=None):

ad = sc.read_10x_h5(filename, genome, True)

dataMatrix = pd.DataFrame(ad.X.todense(), columns=ad.var_names, index=ad.obs_names)

return _clean_up(dataMatrix)


def from_fcs(
cls,
fcs_file,
cofactor=5,
metadata_channels=[
"Time",
"Event_length",
"DNA1",
"DNA2",
"Cisplatin",
"beadDist",
"bead1",
],
):

# Parse the fcs file
text, data = fcsparser.parse(fcs_file)
data = data.astype(np.float64)

# Extract the S and N features (Indexing assumed to start from 1)
# Assumes channel names are in S
no_channels = text["$PAR"]
channel_names = [""] * no_channels
for i in range(1, no_channels + 1):
# S name
try:
channel_names[i - 1] = text["$P%dS" % i]
except KeyError:
channel_names[i - 1] = text["$P%dN" % i]
data.columns = channel_names

# Metadata and data
metadata_channels = data.columns.intersection(metadata_channels)
data_channels = data.columns.difference(metadata_channels)
# metadata = data[metadata_channels]
data = data[data_channels]

# Transform if necessary
if cofactor is not None or cofactor > 0:
data = np.arcsinh(np.divide(data, cofactor))

return data
2 changes: 1 addition & 1 deletion src/palantir/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,7 +363,7 @@ def plot_palantir_results(pr_res, tsne):

for i, branch in enumerate(pr_res.branch_probs.columns):
row = int(np.floor(i / n_cols))
ax = plt.subplot(gs[row + 2, np.remainder(i,n_cols)])
ax = plt.subplot(gs[row + 2, np.remainder(i, n_cols)])
c = pr_res.branch_probs.loc[tsne.index, branch]
ax.scatter(
tsne.loc[:, "x"], tsne.loc[:, "y"], s=3, cmap=matplotlib.cm.plasma, c=c
Expand Down

0 comments on commit 6d80708

Please # to comment.