Skip to content

Removed external library and added clustering algorithm #532

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits into from
Jul 19, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions utbot-summary/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,9 @@ dependencies {
api project(':utbot-framework-api')
compile(project(':utbot-instrumentation'))

implementation group: 'com.github.haifengl', name: 'smile-kotlin', version: '2.6.0'
implementation group: 'com.github.haifengl', name: 'smile-core', version: '2.6.0'

implementation group: 'io.github.microutils', name: 'kotlin-logging', version: kotlin_logging_version

implementation group: 'com.github.javaparser', name: 'javaparser-core', version: '3.22.1'

testImplementation("org.junit.jupiter:junit-jupiter:$junit5_version")
}
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ object UtSummarySettings {
* DBSCAN hyperparameter
* Sets radius of search for algorithm
*/
var RADIUS_DBSCAN: Double = 5.0
var RADIUS_DBSCAN: Float = 5.0f
}

object SummarySentenceConstants {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
package org.utbot.summary.clustering

import org.utbot.framework.plugin.api.Step
import smile.math.distance.Distance

class ExecutionDistance : Distance<Iterable<Step>> {
override fun d(x: Iterable<Step>, y: Iterable<Step>): Double {
return compareTwoPaths(x, y)
}
import org.utbot.summary.clustering.dbscan.Metric

/** The existing implementation of [Metric] for the space of [Step]. */
class ExecutionMetric : Metric<Iterable<Step>> {
/**
* Minimum Edit Distance
*/
Expand All @@ -19,7 +16,7 @@ class ExecutionDistance : Distance<Iterable<Step>> {
val stmt1 = path1.elementAt(i)
val stmt2 = path2.elementAt(j)

val d1 = distances[i - 1][j] + 1 //path 1 insert -> diff stmt from path2
val d1 = distances[i - 1][j] + 1 // path 1 insert -> diff stmt from path2
val d2 = distances[i][j - 1] + 1 // path 2 insert -> diff stmt from path1
val d3 = distances[i - 1][j - 1] + distance(stmt1, stmt2) // aligned or diff
distances[i][j] = minOf(d1, d2, d3)
Expand All @@ -31,4 +28,8 @@ class ExecutionDistance : Distance<Iterable<Step>> {
private fun distance(stmt1: Step, stmt2: Step): Int {
return if (stmt1 == stmt2) 0 else 2
}

override fun compute(object1: Iterable<Step>, object2: Iterable<Step>): Double {
return compareTwoPaths(object1, object2)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@ package org.utbot.summary.clustering
import org.utbot.framework.plugin.api.Step
import org.utbot.framework.plugin.api.UtExecution
import org.utbot.summary.UtSummarySettings
import smile.clustering.dbscan
import org.utbot.summary.clustering.dbscan.DBSCANTrainer
import org.utbot.summary.clustering.dbscan.neighbor.LinearRangeQuery

class MatrixUniqueness(executions: List<UtExecution>) {

Expand All @@ -21,7 +22,10 @@ class MatrixUniqueness(executions: List<UtExecution>) {
}

/**
* Creates uniquness matrix. Rows are executions, columns are unique steps from all executions
* Creates uniqueness matrix.
*
* Rows are executions, columns are unique steps from all executions
*
* Every matrix i,j is 1 or 0, as if step in execution or not.
*/
private fun createMatrix(): List<IntArray> {
Expand Down Expand Up @@ -49,10 +53,10 @@ class MatrixUniqueness(executions: List<UtExecution>) {
private fun colSums(matrix: List<IntArray>) = matrix.first().indices.map { col -> this.colSum(matrix, col) }

/**
* Splits all steps into common, partly common and unique
* Splits all steps into common, partly common and unique.
*
* Unique steps are steps that only occur in one execution
* Common steps are steps that occur in all executions
* Unique steps are steps that only occur in one execution.
* Common steps are steps that occur in all executions.
* Partly common steps are steps that occur more than one time, but not in all executions
*/
fun splitSteps(): SplitSteps {
Expand All @@ -74,19 +78,24 @@ class MatrixUniqueness(executions: List<UtExecution>) {
}

companion object {
/**
* Returns map: cluster identifier, List<executions>
* DBSCAN - Density-Based Spatial Clustering of Applications with Noise
* Finds core samples of high density and expands clusters from them
*/
/** Returns map: cluster identifier, List<executions>. */
fun dbscanClusterExecutions(
methodExecutions: List<UtExecution>,
minPts: Int = UtSummarySettings.MIN_EXEC_DBSCAN,
radius: Double = UtSummarySettings.RADIUS_DBSCAN
radius: Float = UtSummarySettings.RADIUS_DBSCAN
): Map<Int, List<UtExecution>> {

val executionPaths = methodExecutions.map { it.path.asIterable() }.toTypedArray()
val cluster = dbscan(executionPaths, ExecutionDistance(), minPts, radius)
return methodExecutions.withIndex().groupBy({ cluster.y[it.index] }, { it.value })

val dbscan = DBSCANTrainer(
eps = radius,
minSamples = minPts,
metric = ExecutionMetric(),
rangeQuery = LinearRangeQuery()
)
val dbscanModel = dbscan.fit(executionPaths)
val clusterLabels = dbscanModel.clusterLabels
return methodExecutions.withIndex().groupBy({ clusterLabels[it.index] }, { it.value })
}
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
package org.utbot.summary.clustering.dbscan

/**
 * Keeps the information about clusters produced by [DBSCANTrainer].
 *
 * Equality and hash code are based on the *content* of [clusterLabels]: two models with the same
 * [numberOfClusters] and element-wise equal labels are equal. (The `equals` generated for a data class
 * would compare the array by reference only.)
 *
 * @property [numberOfClusters] Number of clusters.
 * @property [clusterLabels] It contains labels of clusters in the range ```[0; k)```
 * or [Int.MIN_VALUE] if point could not be assigned to any cluster.
 */
data class DBSCANModel(
    val numberOfClusters: Int = 0,
    val clusterLabels: IntArray
) {
    /** Compares [numberOfClusters] and the element-wise content of [clusterLabels]. */
    override fun equals(other: Any?): Boolean {
        if (this === other) return true
        if (other !is DBSCANModel) return false
        return numberOfClusters == other.numberOfClusters &&
            clusterLabels.contentEquals(other.clusterLabels)
    }

    /** Hash code consistent with [equals]: derived from the label content, not the array identity. */
    override fun hashCode(): Int = 31 * numberOfClusters + clusterLabels.contentHashCode()
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
package org.utbot.summary.clustering.dbscan

import org.utbot.summary.clustering.dbscan.neighbor.LinearRangeQuery
import org.utbot.summary.clustering.dbscan.neighbor.Neighbor
import org.utbot.summary.clustering.dbscan.neighbor.RangeQuery

// Label given to a point that has fewer than `minSamples` neighbors when it is visited;
// such a point may later be relabeled with a cluster label while another cluster is expanded.
private const val NOISE = Int.MIN_VALUE
// Temporary label of a not-yet-processed point that has been queued for the cluster being expanded.
private const val CLUSTER_PART = -2
// Initial label of every point before the algorithm has visited it.
private const val UNDEFINED = -1

/**
 * Trainer for the DBSCAN clustering algorithm.
 *
 * NOTE: When used with [LinearRangeQuery], the complexity is O(n^2) in the worst case.
 *
 * @property [eps] Search radius passed to every neighbor query. Must be greater than 0.0.
 * @property [minSamples] Minimum number of neighbors a point needs to start or extend a cluster.
 * Must be greater than 0.
 * @property [metric] Metric used to calculate distances.
 * @property [rangeQuery] Neighbor search strategy used to look up the points around a query point.
 *
 * @see <a href="https://www.aaai.org/Papers/KDD/1996/KDD96-037.pdf">
 * A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise</a>
 */
class DBSCANTrainer<T>(val eps: Float, val minSamples: Int, val metric: Metric<T>, val rangeQuery: RangeQuery<T>) {
    init {
        require(minSamples > 0) { "MinSamples parameter should be more than 0: $minSamples" }
        require(eps > 0.0f) { "Eps parameter should be more than 0: $eps" }
    }

    /**
     * Clusters [data] and returns the resulting model.
     *
     * Every point ends up either with a cluster label from `[0; k)`, where `k` is the number of
     * clusters found, or with the noise label.
     *
     * @throws IllegalArgumentException if [data] is empty.
     */
    fun fit(data: Array<T>): DBSCANModel {
        require(data.isNotEmpty()) { "Nothing to learn, data is empty." }

        // The linear strategy has to be told which dataset and which metric to scan with.
        // TODO: could be refactored if we add some new variants of RangeQuery
        if (rangeQuery is LinearRangeQuery) {
            rangeQuery.data = data
            rangeQuery.metric = metric
        }

        val labels = IntArray(data.size) { UNDEFINED }

        // Label for the next cluster to be created; after the loop it equals the number of clusters.
        var nextCluster = 0

        for (position in data.indices) {
            // Points already labeled (as noise or as a member of an earlier cluster) are not revisited here.
            if (labels[position] != UNDEFINED) continue

            val seeds = rangeQuery.findNeighbors(data[position], eps).toMutableList()
            if (seeds.size >= minSamples) {
                labels[position] = nextCluster
                expandCluster(seeds, labels, nextCluster)

                // The cluster cannot grow any further, so the following one gets a fresh label.
                nextCluster++
            } else {
                labels[position] = NOISE
            }
        }

        return DBSCANModel(numberOfClusters = nextCluster, clusterLabels = labels)
    }

    /**
     * Grows cluster [k] starting from the seed list [neighbors].
     *
     * The seed list doubles as a work queue: it may get longer while it is being processed.
     */
    private fun expandCluster(
        neighbors: MutableList<Neighbor<T>>,
        labels: IntArray,
        k: Int
    ) {
        // Reserve every still-unvisited seed for this cluster.
        for (seed in neighbors) {
            if (labels[seed.index] == UNDEFINED) {
                labels[seed.index] = CLUSTER_PART
            }
        }

        // Index-based traversal is required because the list can grow during the loop.
        var cursor = 0
        while (cursor < neighbors.size) {
            val current = neighbors[cursor]
            val position = current.index

            when (labels[position]) {
                // A former noise point joins the cluster but is not expanded further.
                NOISE -> labels[position] = k

                UNDEFINED, CLUSTER_PART -> {
                    labels[position] = k

                    val reachable = rangeQuery.findNeighbors(current.key, eps)
                    if (reachable.size >= minSamples) {
                        mergeTwoGroupsInCluster(reachable, labels, neighbors)
                    }
                }
            }
            cursor++
        }
    }

    /**
     * Appends to [neighbors] those points from [qNeighbors] that are still unvisited or marked as noise.
     *
     * Unvisited points are additionally reserved for the current cluster.
     */
    private fun mergeTwoGroupsInCluster(
        qNeighbors: List<Neighbor<T>>,
        labels: IntArray,
        neighbors: MutableList<Neighbor<T>>
    ) {
        for (candidate in qNeighbors) {
            when (labels[candidate.index]) {
                UNDEFINED -> {
                    labels[candidate.index] = CLUSTER_PART
                    neighbors.add(candidate)
                }

                NOISE -> neighbors.add(candidate)
            }
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
package org.utbot.summary.clustering.dbscan

/** Distance function over objects of type [T]. */
interface Metric<T> {
/** Computes the distance between [object1] and [object2] according to the given metric. */
fun compute(object1: T, object2: T): Double
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
package org.utbot.summary.clustering.dbscan.neighbor

import org.utbot.summary.clustering.dbscan.Metric

/**
 * Brute-force neighbor search: every query scans the whole dataset, i.e. O(n) distance computations per query.
 *
 * Both properties have to be assigned before the first call of [findNeighbors].
 *
 * @property [data] The whole dataset to search in.
 * @property [metric] Metric used to measure the distance between the query key and each point.
 */
class LinearRangeQuery<K> : RangeQuery<K> {
    lateinit var data: Array<K>
    lateinit var metric: Metric<K>

    /**
     * Returns, in dataset order, the points whose distance to [queryKey] does not exceed [radius].
     *
     * Points that are equal to [queryKey] (in terms of `equals`) are left out; this removes the query
     * point itself and also any duplicates of it that are present in [data].
     */
    override fun findNeighbors(queryKey: K, radius: Float): List<Neighbor<K>> =
        data.mapIndexedNotNull { position, candidate ->
            val gap = metric.compute(queryKey, candidate)
            val withinRadius = gap <= radius
            if (withinRadius && queryKey != candidate) Neighbor(candidate, position, gap) else null
        }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
package org.utbot.summary.clustering.dbscan.neighbor

/**
 * A point returned by a neighbor search in a metric space.
 *
 * @property [key] Search key.
 * @property [index] Direct index to access the point in the underlying data structure that keeps the set of points.
 * @property [distance] Distance value stored for this neighbor in the chosen metric space.
 *
 * NOTE: Neighbors are ordered via the [Comparable] interface: by distance first, then by index.
 */
class Neighbor<K>(val key: K, val index: Int, private val distance: Double) : Comparable<Neighbor<K>> {
    override fun compareTo(other: Neighbor<K>): Int {
        val byDistance = distance.compareTo(other.distance)
        if (byDistance != 0) return byDistance

        // Equal distances: fall back to the position in the dataset.
        return index.compareTo(other.index)
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
package org.utbot.summary.clustering.dbscan.neighbor

/**
* Basic interface for neighbor search strategies: given a query point, an implementation
* returns the subset of points that lie close to it.
*/
interface RangeQuery<K> {
/**
* Returns the list of the closest neighbors in the [radius] from the [queryKey].
*
* NOTE(review): whether the query point itself may appear in the result is decided by the implementation — confirm per implementation.
*/
fun findNeighbors(queryKey: K, radius: Float): List<Neighbor<K>>
}
Loading