Skip to content
This repository was archived by the owner on Jan 10, 2023. It is now read-only.

Add new algorithm: suffixtree #199

Closed
wants to merge 29 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
f0a93b7
Add files for suffix tree algorithm
olleharstedt Jun 17, 2021
0179eff
Add new command argument: algorithm
olleharstedt Jun 17, 2021
8d3c0cb
Report clone properly
olleharstedt Jun 17, 2021
4224ba3
Remove unused code
olleharstedt Jun 17, 2021
f2dd32a
Free tokens memory
olleharstedt Jun 17, 2021
093a8d0
Add post-process step, so suffixtree algo can collect all tokens first
olleharstedt Jun 17, 2021
793d486
Add algorithm setting to help text
olleharstedt Jun 17, 2021
12279b4
Add description of the rabin karp algorithm
olleharstedt Jun 17, 2021
c9e22de
Move hashes from abstract strategy
olleharstedt Jun 22, 2021
af01d9e
Factor out strategy configuration DTO
olleharstedt Jun 22, 2021
d2e1429
Remove test file
olleharstedt Jun 22, 2021
c2acc9e
Add missing file
olleharstedt Jun 22, 2021
65f22c7
Remove duplicated method; fix argument access
olleharstedt Jun 22, 2021
f060ac3
Indentation
olleharstedt Jun 22, 2021
eaae196
Testing
olleharstedt Jun 22, 2021
c8625c8
Check if we fetch sentinel by mistake
olleharstedt Jun 22, 2021
7142b7b
Apply cs-fixer
olleharstedt Jun 23, 2021
eea6450
Psalm fixes (WIP)
olleharstedt Jun 23, 2021
e24e902
Psalm fixes (WIP)
olleharstedt Jun 23, 2021
fe9e37b
Replace JavaObjectInterface with AbstractToken
olleharstedt Jun 24, 2021
6775c1d
Make it run
olleharstedt Jun 24, 2021
197cde3
Psalm fixes, done
olleharstedt Jun 24, 2021
893dbfd
Run cs-fix
olleharstedt Jun 24, 2021
75d1e22
Phpunit (WIP)
olleharstedt Jun 26, 2021
873a934
Psalm + cs fix
olleharstedt Jun 26, 2021
44b9df9
Some small notes
olleharstedt Jun 26, 2021
bbb3203
Apply cs fix
olleharstedt Jun 26, 2021
33a3b95
Add cover annotations to tests
olleharstedt Jun 30, 2021
6231b93
Apply cs fix
olleharstedt Jul 20, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .psalm/baseline.xml
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,15 @@
<MixedArgumentTypeCoercion occurrences="1">
<code>$argv</code>
</MixedArgumentTypeCoercion>
<MixedArrayAccess occurrences="6">
<MixedArrayAccess occurrences="9">
<code>$option[0]</code>
<code>$option[1]</code>
<code>$option[1]</code>
<code>$option[1]</code>
<code>$option[1]</code>
<code>$option[1]</code>
<code>$option[1]</code>
<code>$option[1]</code>
</MixedArrayAccess>
<MixedAssignment occurrences="5">
<code>$directories</code>
Expand Down
39 changes: 29 additions & 10 deletions src/CLI/Application.php
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,13 @@
use const PHP_EOL;
use function count;
use function printf;
use Exception;
use SebastianBergmann\FileIterator\Facade;
use SebastianBergmann\PHPCPD\Detector\Detector;
use SebastianBergmann\PHPCPD\Detector\Strategy\AbstractStrategy;
use SebastianBergmann\PHPCPD\Detector\Strategy\DefaultStrategy;
use SebastianBergmann\PHPCPD\Detector\Strategy\StrategyConfiguration;
use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTreeStrategy;
use SebastianBergmann\PHPCPD\Log\PMD;
use SebastianBergmann\PHPCPD\Log\Text;
use SebastianBergmann\Timer\ResourceUsageFormatter;
Expand Down Expand Up @@ -62,17 +66,14 @@ public function run(array $argv): int
return 1;
}

$strategy = new DefaultStrategy;
$config = new StrategyConfiguration($arguments);

$strategy = $this->pickStrategy($arguments->algorithm(), $config);

$timer = new Timer;
$timer->start();

$clones = (new Detector($strategy))->copyPasteDetection(
$files,
$arguments->linesThreshold(),
$arguments->tokensThreshold(),
$arguments->fuzzy()
);
$clones = (new Detector($strategy))->copyPasteDetection($files);

(new Text)->printResult($clones, $arguments->verbose());

Expand All @@ -93,6 +94,21 @@ private function printVersion(): void
);
}

/**
 * Resolves the detection strategy that belongs to the requested algorithm name.
 *
 * A missing algorithm name and 'rabin-karp' both select the default strategy;
 * 'suffixtree' selects the suffix tree strategy.
 *
 * @throws Exception when the algorithm name is not one of the supported values
 */
private function pickStrategy(?string $algorithm, StrategyConfiguration $config): AbstractStrategy
{
    // Loose comparison against null is intentional: it mirrors the semantics
    // of a switch statement, where an empty string matches `case null` too.
    $wantsDefault = $algorithm == null || $algorithm === 'rabin-karp';

    if ($wantsDefault) {
        return new DefaultStrategy($config);
    }

    if ($algorithm === 'suffixtree') {
        return new SuffixTreeStrategy($config);
    }

    throw new Exception('Unsupported algorithm: ' . $algorithm);
}

private function help(): void
{
print <<<'EOT'
Expand All @@ -108,9 +124,12 @@ private function help(): void

Options for analysing files:

--fuzzy Fuzz variable names
--min-lines <N> Minimum number of identical lines (default: 5)
--min-tokens <N> Minimum number of identical tokens (default: 70)
--fuzzy Fuzz variable names
--min-lines <N> Minimum number of identical lines (default: 5)
--min-tokens <N> Minimum number of identical tokens (default: 70)
--algorithm <name> Select which algorithm to use ('rabin-karp' (default) or 'suffixtree')
--edit-distance <N> Distance in number of edits between two clones (only for suffixtree; default: 5)
--head-equality <N> Minimum equality at start of clone (only for suffixtree; default 10)

Options for report generation:

Expand Down
35 changes: 34 additions & 1 deletion src/CLI/Arguments.php
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,22 @@ final class Arguments
*/
private $version;

public function __construct(array $directories, array $suffixes, array $exclude, ?string $pmdCpdXmlLogfile, int $linesThreshold, int $tokensThreshold, bool $fuzzy, bool $verbose, bool $help, bool $version)
/**
* @var ?string
*/
private $algorithm;

/**
* @var int
*/
private $editDistance;

/**
* @var int
*/
private $headEquality;

public function __construct(array $directories, array $suffixes, array $exclude, ?string $pmdCpdXmlLogfile, int $linesThreshold, int $tokensThreshold, bool $fuzzy, bool $verbose, bool $help, bool $version, ?string $algorithm, int $editDistance, int $headEquality)
{
$this->directories = $directories;
$this->suffixes = $suffixes;
Expand All @@ -73,6 +88,9 @@ public function __construct(array $directories, array $suffixes, array $exclude,
$this->verbose = $verbose;
$this->help = $help;
$this->version = $version;
$this->algorithm = $algorithm;
$this->editDistance = $editDistance;
$this->headEquality = $headEquality;
}

/**
Expand Down Expand Up @@ -133,4 +151,19 @@ public function version(): bool
{
return $this->version;
}

/**
 * Returns the algorithm name that was handed to the constructor.
 *
 * ArgumentsBuilder fills this from the --algorithm option and falls back
 * to 'rabin-karp' when the option is absent.
 */
public function algorithm(): ?string
{
return $this->algorithm;
}

/**
 * Returns the edit distance that was handed to the constructor.
 *
 * ArgumentsBuilder fills this from the --edit-distance option (default 5).
 * According to the CLI help text it is only relevant for the suffixtree
 * algorithm.
 */
public function editDistance(): int
{
return $this->editDistance;
}

/**
 * Returns the head equality value that was handed to the constructor.
 *
 * ArgumentsBuilder fills this from the --head-equality option (default 10).
 * According to the CLI help text it is only relevant for the suffixtree
 * algorithm.
 */
public function headEquality(): int
{
return $this->headEquality;
}
}
24 changes: 24 additions & 0 deletions src/CLI/ArgumentsBuilder.php
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,12 @@ public function build(array $argv): Arguments
'fuzzy',
'min-lines=',
'min-tokens=',
'head-equality=',
'edit-distance=',
'verbose',
'help',
'version',
'algorithm=',
]
);
} catch (CliParserException $e) {
Expand All @@ -49,10 +52,13 @@ public function build(array $argv): Arguments
$pmdCpdXmlLogfile = null;
$linesThreshold = 5;
$tokensThreshold = 70;
$editDistance = 5;
$headEquality = 10;
$fuzzy = false;
$verbose = false;
$help = false;
$version = false;
$algorithm = 'rabin-karp';

foreach ($options[0] as $option) {
switch ($option[0]) {
Expand Down Expand Up @@ -86,6 +92,16 @@ public function build(array $argv): Arguments

break;

case '--head-equality':
$headEquality = (int) $option[1];

break;

case '--edit-distance':
$editDistance = (int) $option[1];

break;

case '--verbose':
$verbose = true;

Expand All @@ -101,6 +117,11 @@ public function build(array $argv): Arguments
case '--version':
$version = true;

break;

case '--algorithm':
$algorithm = (string) $option[1];

break;
}
}
Expand All @@ -122,6 +143,9 @@ public function build(array $argv): Arguments
$verbose,
$help,
$version,
$algorithm,
$editDistance,
$headEquality
);
}
}
9 changes: 4 additions & 5 deletions src/Detector/Detector.php
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ public function __construct(AbstractStrategy $strategy)
$this->strategy = $strategy;
}

public function copyPasteDetection(iterable $files, int $minLines = 5, int $minTokens = 70, bool $fuzzy = false): CodeCloneMap
public function copyPasteDetection(iterable $files): CodeCloneMap
{
$result = new CodeCloneMap;

Expand All @@ -35,13 +35,12 @@ public function copyPasteDetection(iterable $files, int $minLines = 5, int $minT

$this->strategy->processFile(
$file,
$minLines,
$minTokens,
$result,
$fuzzy
$result
);
}

$this->strategy->postProcess();

return $result;
}
}
21 changes: 16 additions & 5 deletions src/Detector/Strategy/AbstractStrategy.php
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,21 @@ abstract class AbstractStrategy
T_NS_SEPARATOR => true,
];

/**
* @psalm-var array<string,array{0: string, 1: int}>
*/
protected $hashes = [];
/**
 * Settings the strategy reads while processing files.
 *
 * @var StrategyConfiguration
 */
protected $config;

/**
 * Stores the configuration that the concrete strategy consults during
 * processing.
 */
public function __construct(StrategyConfiguration $config)
{
$this->config = $config;
}

/**
 * Replaces the configuration that was supplied at construction time.
 */
public function setConfig(StrategyConfiguration $config): void
{
$this->config = $config;
}

abstract public function processFile(string $file, CodeCloneMap $result): void;

abstract public function processFile(string $file, int $minLines, int $minTokens, CodeCloneMap $result, bool $fuzzy = false): void;
/**
 * Hook that Detector::copyPasteDetection() invokes once, after every file
 * has been passed to processFile().
 *
 * The base implementation intentionally does nothing; strategies that need
 * to see all input before reporting clones can override it.
 */
public function postProcess(): void
{
}
}
33 changes: 25 additions & 8 deletions src/Detector/Strategy/DefaultStrategy.php
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,26 @@
use SebastianBergmann\PHPCPD\CodeCloneFile;
use SebastianBergmann\PHPCPD\CodeCloneMap;

/**
* This is a Rabin-Karp implementation with additional normalization steps
* before the hashing happens.
*
* 1. Tokenization
* 2. Deletion of logic-neutral tokens like T_CLOSE_TAG; T_COMMENT;
* T_DOC_COMMENT; T_INLINE_HTML; T_NS_SEPARATOR; T_OPEN_TAG;
* T_OPEN_TAG_WITH_ECHO; T_USE; T_WHITESPACE;
* 3. If fuzzy matching is enabled, replacement of variable names by a placeholder
* 4. Normalization of token + value using crc32
* 5. Now the classic Rabin-Karp hashing takes place
*/
final class DefaultStrategy extends AbstractStrategy
{
public function processFile(string $file, int $minLines, int $minTokens, CodeCloneMap $result, bool $fuzzy = false): void
/**
* @psalm-var array<string,array{0: string, 1: int}>
*/
protected $hashes = [];

public function processFile(string $file, CodeCloneMap $result): void
{
$buffer = file_get_contents($file);
$currentTokenPositions = [];
Expand Down Expand Up @@ -55,7 +72,7 @@ public function processFile(string $file, int $minLines, int $minTokens, CodeClo

$currentTokenRealPositions[$tokenNr++] = $token[2];

if ($fuzzy && $token[0] === T_VARIABLE) {
if ($this->config->getFuzzy() && $token[0] === T_VARIABLE) {
$token[1] = 'variable';
}

Expand All @@ -73,7 +90,7 @@ public function processFile(string $file, int $minLines, int $minTokens, CodeClo
$found = false;
$tokenNr = 0;

while ($tokenNr <= $count - $minTokens) {
while ($tokenNr <= $count - $this->config->getMinTokens()) {
$line = $currentTokenPositions[$tokenNr];
$realLine = $currentTokenRealPositions[$tokenNr];

Expand All @@ -82,7 +99,7 @@ public function processFile(string $file, int $minLines, int $minTokens, CodeClo
substr(
$currentSignature,
$tokenNr * 5,
$minTokens * 5
$this->config->getMinTokens() * 5
),
true
),
Expand All @@ -103,13 +120,13 @@ public function processFile(string $file, int $minLines, int $minTokens, CodeClo
if ($found) {
$fileA = $this->hashes[$firstHash][0];
$firstLineA = $this->hashes[$firstHash][1];
$lastToken = ($tokenNr - 1) + $minTokens - 1;
$lastToken = ($tokenNr - 1) + $this->config->getMinTokens() - 1;
$lastLine = $currentTokenPositions[$lastToken];
$lastRealLine = $currentTokenRealPositions[$lastToken];
$numLines = $lastLine + 1 - $firstLine;
$realNumLines = $lastRealLine + 1 - $firstRealLine;

if ($numLines >= $minLines &&
if ($numLines >= $this->config->getMinLines() &&
($fileA !== $file ||
$firstLineA !== $firstRealLine)) {
$result->add(
Expand All @@ -135,13 +152,13 @@ public function processFile(string $file, int $minLines, int $minTokens, CodeClo
if ($found) {
$fileA = $this->hashes[$firstHash][0];
$firstLineA = $this->hashes[$firstHash][1];
$lastToken = ($tokenNr - 1) + $minTokens - 1;
$lastToken = ($tokenNr - 1) + $this->config->getMinTokens() - 1;
$lastLine = $currentTokenPositions[$lastToken];
$lastRealLine = $currentTokenRealPositions[$lastToken];
$numLines = $lastLine + 1 - $firstLine;
$realNumLines = $lastRealLine + 1 - $firstRealLine;

if ($numLines >= $minLines &&
if ($numLines >= $this->config->getMinLines() &&
($fileA !== $file || $firstLineA !== $firstRealLine)) {
$result->add(
new CodeClone(
Expand Down
Loading