Skip to content

[WIP] added a silhouette calculation #876

New issue

Have a question about this project? # for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “#”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? # to your account

Merged
merged 19 commits into from
Jan 21, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 44 additions & 3 deletions deeptools/heatmapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -991,7 +991,10 @@ def save_matrix_values(self, file_name):
def save_BED(self, file_handle):
boundaries = np.array(self.matrix.group_boundaries)
# Add a header
file_handle.write("#chrom\tstart\tend\tname\tscore\tstrand\tthickStart\tthickEnd\titemRGB\tblockCount\tblockSizes\tblockStart\tdeepTools_group\n")
file_handle.write("#chrom\tstart\tend\tname\tscore\tstrand\tthickStart\tthickEnd\titemRGB\tblockCount\tblockSizes\tblockStart\tdeepTools_group")
if self.matrix.silhouette is not None:
file_handle.write("\tsilhouette")
file_handle.write("\n")
for idx, region in enumerate(self.matrix.regions):
# the label id corresponds to the last boundary
# that is smaller than the region index.
Expand All @@ -1013,11 +1016,14 @@ def save_BED(self, file_handle):
region[5],
region[4]))
file_handle.write(
'\t{0}\t{1}\t{2}\t{3}\n'.format(
'\t{0}\t{1}\t{2}\t{3}'.format(
len(region[1]),
",".join([str(int(y) - int(x)) for x, y in region[1]]),
",".join([str(int(x) - int(starts[0])) for x, y in region[1]]),
self.matrix.group_labels[label_idx]))
if self.matrix.silhouette is not None:
file_handle.write("\t{}".format(self.matrix.silhouette[idx]))
file_handle.write("\n")
file_handle.close()

@staticmethod
Expand Down Expand Up @@ -1053,6 +1059,23 @@ def get_num_individual_matrix_cols(self):
return matrixCols


def computeSilhouetteScore(d, idx, labels):
"""
Given a square distance matrix with NaN diagonals, compute the silhouette score
of a given row (idx). Each row should have an associated label (labels).
"""
keep = ~np.isnan(d[idx, ])
foo = np.bincount(labels[keep], weights=d[idx, ][keep])
groupSizes = np.bincount(labels[keep])
intraIdx = labels[idx]
if groupSizes[intraIdx] == 1:
return 0
intra = foo[labels[idx]] / groupSizes[intraIdx]
interMask = np.arange(len(foo))[np.arange(len(foo)) != labels[idx]]
inter = np.min(foo[interMask] / groupSizes[interMask])
return (inter - intra) / max(inter, intra)


class _matrix(object):
"""
class to hold heatmapper matrices
Expand Down Expand Up @@ -1082,6 +1105,7 @@ def __init__(self, regions, matrix, group_boundaries, sample_boundaries,
self.sample_boundaries = sample_boundaries
self.sort_method = None
self.sort_using = None
self.silhouette = None

if group_labels is None:
self.group_labels = ['group {}'.format(x)
Expand Down Expand Up @@ -1225,7 +1249,7 @@ def sort_groups(self, sort_using='mean', sort_method='no', sample_list=None):
self.regions = _sorted_regions
self.set_sorting_method(sort_method, sort_using)

def hmcluster(self, k, method='kmeans', clustering_samples=None):
def hmcluster(self, k, evaluate_silhouette=True, method='kmeans', clustering_samples=None):
matrix = np.asarray(self.matrix)
matrix_to_cluster = matrix
if clustering_samples is not None:
Expand Down Expand Up @@ -1295,8 +1319,25 @@ def hmcluster(self, k, method='kmeans', clustering_samples=None):

self.regions = _clustered_regions
self.matrix = np.vstack(_clustered_matrix)

return idx

def computeSilhouette(self, k):
if k > 1:
from scipy.spatial.distance import pdist, squareform

silhouette = np.repeat(0.0, self.group_boundaries[-1])
groupSizes = np.subtract(self.group_boundaries[1:], self.group_boundaries[:-1])
labels = np.repeat(np.arange(k), groupSizes)

d = pdist(self.matrix)
d2 = squareform(d)
np.fill_diagonal(d2, np.nan) # This excludes the diagonal
for idx in range(len(labels)):
silhouette[idx] = computeSilhouetteScore(d2, idx, labels)
sys.stderr.write("The average silhouette score is: {}\n".format(np.mean(silhouette)))
self.silhouette = silhouette

def removeempty(self):
"""
removes matrix rows containing only zeros or nans
Expand Down
11 changes: 11 additions & 0 deletions deeptools/parserCommon.py
Original file line number Diff line number Diff line change
Expand Up @@ -505,6 +505,17 @@ def heatmapperOptionalArgs(mode=['heatmap', 'profile'][0]):
'fail with an error if a cluster has very few members compared to the '
'total number of regions.',
type=int)
cluster.add_argument(
'--silhouette',
help='Compute the silhouette score for regions. This is only'
' applicable if clustering has been performed. The silhouette score'
' is a measure of how similar a region is to other regions in the'
' same cluster as opposed to those in other clusters. It will be reported'
' in the final column of the BED file with regions. The '
'silhouette evaluation can be very slow when you have more'
'than 100 000 regions.',
action='store_true'
)

optional = parser.add_argument_group('Optional arguments')

Expand Down
19 changes: 11 additions & 8 deletions deeptools/plotHeatmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -780,14 +780,11 @@ def main(args=None):
if args.sortRegions == 'keep':
args.sortRegions = 'no' # These are the same thing
if args.kmeans is not None:
hm.matrix.hmcluster(args.kmeans, method='kmeans',
clustering_samples=args.clusterUsingSamples)
else:
if args.hclust is not None:
print("Performing hierarchical clustering."
"Please note that it might be very slow for large datasets.\n")
hm.matrix.hmcluster(args.hclust, method='hierarchical',
clustering_samples=args.clusterUsingSamples)
hm.matrix.hmcluster(args.kmeans, method='kmeans', clustering_samples=args.clusterUsingSamples)
elif args.hclust is not None:
print("Performing hierarchical clustering."
"Please note that it might be very slow for large datasets.\n")
hm.matrix.hmcluster(args.hclust, method='hierarchical', clustering_samples=args.clusterUsingSamples)

group_len_ratio = np.diff(hm.matrix.group_boundaries) / len(hm.matrix.regions)
if np.any(group_len_ratio < 5.0 / 1000):
Expand Down Expand Up @@ -816,6 +813,12 @@ def main(args=None):
sort_method=args.sortRegions,
sample_list=sortUsingSamples)

if args.silhouette:
if args.kmeans is not None:
hm.matrix.computeSilhouette(args.kmeans)
elif args.hclust is not None:
hm.matrix.computeSilhouette(args.args.hclust)

if args.outFileNameMatrix:
hm.save_matrix_values(args.outFileNameMatrix)

Expand Down
7 changes: 7 additions & 0 deletions galaxy/wrapper/deepTools_macros.xml
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,12 @@
</when>
<when value="none" />
</conditional>
<param argument="--silhouette" type="boolean" truevalue="--silhouette" falsevalue=""
label="Compute the silhouette score for each region"
help="Compute the silhouette score for regions. This is only applicable if clustering has been performed.
The silhouette score is a measure of how similar a region is to other regions in the same cluster as
opposed to those in other clusters. It will be reported in the final column of the BED file with regions.
The silhouette evaluation can be very slow when you have more than 100 000 regions." />
</when>
<when value="yes" />
</conditional>
Expand All @@ -181,6 +187,7 @@
--hclust $advancedOpt.used_multiple_regions.clustering.n_hclust
#end if
#end if
$advancedOpt.used_multiple_regions.silhouette
#end if
</token>

Expand Down