Page MenuHome GnuPG

No OneTemporary

diff --git a/src/utils/PhutilEditDistanceMatrix.php b/src/utils/PhutilEditDistanceMatrix.php
index 8e8b900..74e6f82 100644
--- a/src/utils/PhutilEditDistanceMatrix.php
+++ b/src/utils/PhutilEditDistanceMatrix.php
@@ -1,523 +1,527 @@
<?php
/**
* Compute edit distance between two scalar sequences. This class uses
* Levenshtein (or Damerau-Levenshtein) to compute the edit distance between
* two inputs. The inputs are arrays containing any scalars (not just strings)
* so it can be used with, e.g., utf8 sequences.
*
* $matrix = id(new PhutilEditDistanceMatrix())
* ->setSequences(str_split('ran'), str_split('rat'));
*
* $cost = $matrix->getEditDistance();
*
* Edit distance computation is slow and requires a large amount of memory;
* both are roughly O(N * M) in the length of the strings.
*
* You can customize the cost of insertions, deletions and replacements. By
* default, each has a cost of 1.
*
* $matrix->setReplaceCost(1);
*
* By default, transpositions are not evaluated (i.e., the algorithm is
* Levenshtein). You can cause transpositions to be evaluated by setting a
* transpose cost (which will change the algorithm to Damerau-Levenshtein):
*
* $matrix->setTransposeCost(1);
*
* You can also provide a cost to alter the type of operation being applied.
* Many strings have several equivalently expensive edit sequences, but one
* some sequences are more readable to humans than others. Providing a small
* cost to alter operation type tends to smooth out the sequence and produce
* long runs of a single operation, which are generally more readable. For
* example, these strings:
*
* (x)
* ((x))
*
* ...have edit strings "issis" and "isssi", which describe edit operations with
* the same cost, but the latter string is more comprehensible to human viewers.
*
* If you set an alter cost, you must call @{method:setComputeString} to enable
* type computation. The alter cost should generally be smaller than `c / N`,
* where `c` is the smallest operational cost and `N` is the length of the
* longest string. For example, if you are using the default costs (insert = 1,
* delete = 1, replace = 1) and computing edit distances for strings of fewer
* than 1,000 characters, you might set the alter cost to 0.001.
*/
final class PhutilEditDistanceMatrix extends Phobject {
private $insertCost = 1;
private $deleteCost = 1;
private $replaceCost = 1;
private $transposeCost = null;
private $alterCost = 0;
private $maximumLength;
private $computeString;
private $applySmoothing;
private $x;
private $y;
private $prefix;
private $suffix;
private $distanceMatrix = null;
private $typeMatrix = null;
public function setMaximumLength($maximum_length) {
$this->maximumLength = $maximum_length;
return $this;
}
public function getMaximumLength() {
return coalesce($this->maximumLength, $this->getInfinity());
}
public function setComputeString($compute_string) {
$this->computeString = $compute_string;
return $this;
}
public function getComputeString() {
return $this->computeString;
}
public function setTransposeCost($transpose_cost) {
$this->transposeCost = $transpose_cost;
return $this;
}
public function getTransposeCost() {
return $this->transposeCost;
}
public function setReplaceCost($replace_cost) {
$this->replaceCost = $replace_cost;
return $this;
}
public function getReplaceCost() {
return $this->replaceCost;
}
public function setDeleteCost($delete_cost) {
$this->deleteCost = $delete_cost;
return $this;
}
public function getDeleteCost() {
return $this->deleteCost;
}
public function setInsertCost($insert_cost) {
$this->insertCost = $insert_cost;
return $this;
}
public function getInsertCost() {
return $this->insertCost;
}
public function setAlterCost($alter_cost) {
$this->alterCost = $alter_cost;
return $this;
}
public function getAlterCost() {
return $this->alterCost;
}
public function setApplySmoothing($apply_smoothing) {
$this->applySmoothing = $apply_smoothing;
return $this;
}
public function getApplySmoothing() {
return $this->applySmoothing;
}
public function setSequences(array $x, array $y) {
// NOTE: We strip common prefixes and suffixes from the inputs because
// the runtime of the edit distance algorithm is large and it is common
// to diff similar strings.
$xl = count($x);
$yl = count($y);
$l = min($xl, $yl);
$prefix_l = 0;
$suffix_l = 0;
for ($ii = 0; $ii < $l; $ii++) {
if ($x[$ii] !== $y[$ii]) {
break;
}
$prefix_l++;
}
for ($ii = 1; $ii <= ($l - $prefix_l); $ii++) {
if ($x[$xl - $ii] !== $y[$yl - $ii]) {
break;
}
$suffix_l++;
}
$this->prefix = array_slice($x, 0, $prefix_l);
$this->suffix = array_slice($x, $xl - $suffix_l);
$this->x = array_slice($x, $prefix_l, $xl - ($suffix_l + $prefix_l));
$this->y = array_slice($y, $prefix_l, $yl - ($suffix_l + $prefix_l));
$this->distanceMatrix = null;
return $this;
}
private function requireSequences() {
if ($this->x === null) {
throw new PhutilInvalidStateException('setSequences');
}
}
public function getEditDistance() {
$this->requireSequences();
$x = $this->x;
$y = $this->y;
if (!$x) {
return $this->insertCost * count($y);
}
if (!$y) {
return $this->deleteCost * count($x);
}
$max = $this->getMaximumLength();
if (count($x) > $max || count($y) > $max) {
return ($this->insertCost * count($y)) + ($this->deleteCost * count($x));
}
if ($x === $y) {
return 0;
}
$matrix = $this->getDistanceMatrix();
return $matrix[count($x)][count($y)];
}
/**
* Return a string representing the edits between the sequences. The string
* has these characters:
*
* - s (same): Same character in both strings.
* - i (insert): Character inserted.
* - d (delete): Character deleted.
* - x (replace): Character replaced.
* - t (transpose): Character transposed.
*/
public function getEditString() {
$this->requireSequences();
$x = $this->x;
$y = $this->y;
if (!$x && !$y) {
return $this->padEditString('');
}
if (!$x) {
return $this->padEditString(str_repeat('i', count($y)));
}
if (!$y) {
return $this->padEditString(str_repeat('d', count($x)));
}
if ($x === $y) {
return $this->padEditString(str_repeat('s', count($x)));
}
$max = $this->getMaximumLength();
if (count($x) > $max || count($y) > $max) {
return $this->padEditString(
str_repeat('d', count($x)).
str_repeat('i', count($y)));
}
$matrix = $this->getTypeMatrix();
$xx = count($x);
$yy = count($y);
$transposes = array();
$str = '';
while (true) {
$type = $matrix[$xx][$yy];
if (is_array($type)) {
$chr = 't';
$transposes[$type[0]][$type[1]] = true;
$type = $type[2];
} else {
$chr = $type;
}
if (isset($transposes[$xx][$yy])) {
$chr = 't';
}
if ($type == 's' || $type == 'x') {
$xx -= 1;
$yy -= 1;
} else if ($type == 'i') {
$yy -= 1;
} else if ($type == 'd') {
$xx -= 1;
} else {
throw new Exception(pht("Unknown type '%s' in type matrix.", $type));
}
$str .= $chr;
if ($xx <= 0 && $yy <= 0) {
break;
}
}
$str = strrev($str);
+ // We pad the edit string before smoothing it, so ranges of similar
+ // characters at the beginning or end of the string can also be smoothed.
+ $str = $this->padEditString($str);
+
if ($this->getApplySmoothing()) {
$str = $this->applySmoothing($str);
}
- return $this->padEditString($str);
+ return $str;
}
private function padEditString($str) {
if ($this->prefix) {
$str = str_repeat('s', count($this->prefix)).$str;
}
if ($this->suffix) {
$str = $str.str_repeat('s', count($this->suffix));
}
return $str;
}
private function getTypeMatrix() {
if (!$this->computeString) {
throw new PhutilInvalidStateException('setComputeString');
}
if ($this->typeMatrix === null) {
$this->computeMatrix($this->x, $this->y);
}
return $this->typeMatrix;
}
private function getDistanceMatrix() {
if ($this->distanceMatrix === null) {
$this->computeMatrix($this->x, $this->y);
}
return $this->distanceMatrix;
}
private function computeMatrix(array $x, array $y) {
$xl = count($x);
$yl = count($y);
$m = array();
$infinity = $this->getInfinity();
$use_damerau = ($this->transposeCost !== null);
if ($use_damerau) {
// Initialize the default cost of a transpose.
$m[-1][-1] = $infinity;
// Initialize the character position dictionary for Damerau.
$d = array();
foreach ($x as $c) {
$d[$c] = -1;
}
foreach ($y as $c) {
$d[$c] = -1;
}
// Initialize the transpose costs for Damerau.
for ($xx = 0; $xx <= $xl; $xx++) {
$m[$xx][-1] = $infinity;
}
for ($yy = 0; $yy <= $yl; $yy++) {
$m[-1][$yy] = $infinity;
}
}
// Initialize the top row of the matrix.
for ($xx = 0; $xx <= $xl; $xx++) {
$m[$xx][0] = $xx * $this->deleteCost;
}
// Initialize the left column of the matrix.
$cost = 0;
for ($yy = 0; $yy <= $yl; $yy++) {
$m[0][$yy] = $yy * $this->insertCost;
}
$use_types = ($this->computeString);
if ($use_types) {
$t = array();
for ($xx = 0; $xx <= $xl; $xx++) {
$t[$xx][0] = 'd';
}
for ($yy = 0; $yy <= $yl; $yy++) {
$t[0][$yy] = 'i';
}
$t[0][0] = 's';
}
$alt_cost = $this->getAlterCost();
if ($alt_cost && !$use_types) {
throw new Exception(
pht(
'If you provide an alter cost with %s, you must enable '.
'type computation with %s.',
'setAlterCost()',
'setComputeStrings()'));
}
// Build the edit distance matrix.
for ($xx = 1; $xx <= $xl; $xx++) {
$last_dy = -1;
for ($yy = 1; $yy <= $yl; $yy++) {
if ($use_damerau) {
$dx = $d[$y[$yy - 1]];
$dy = $last_dy;
}
if ($x[$xx - 1] === $y[$yy - 1]) {
$rep_cost = $m[$xx - 1][$yy - 1] + 0;
$rep_type = 's';
} else {
$rep_cost = $m[$xx - 1][$yy - 1] + $this->replaceCost;
$rep_type = 'x';
}
$del_cost = $m[$xx - 1][$yy ] + $this->deleteCost;
$ins_cost = $m[$xx ][$yy - 1] + $this->insertCost;
if ($alt_cost) {
$del_char = $t[$xx - 1][$yy ];
$ins_char = $t[$xx ][$yy - 1];
$rep_char = $t[$xx - 1][$yy - 1];
if ($del_char !== 'd') {
$del_cost += $alt_cost;
}
if ($ins_char !== 'i') {
$ins_cost += $alt_cost;
}
if ($rep_char !== $rep_type) {
$rep_cost += $alt_cost;
}
}
if ($rep_cost <= $del_cost && $rep_cost <= $ins_cost) {
$cost = $rep_cost;
$type = $rep_type;
if ($rep_type === 's') {
$last_dy = $yy - 1;
}
} else if ($ins_cost <= $del_cost) {
$cost = $ins_cost;
$type = 'i';
} else {
$cost = $del_cost;
$type = 'd';
}
if ($use_damerau) {
$trn_count = (($xx - $dx - 2) + ($yy - $dy - 2) + 1);
$trn_cost = $m[$dx][$dy] + ($trn_count * $this->transposeCost);
if ($trn_cost < $cost) {
$cost = $trn_cost;
$type = array($dx + 1, $dy + 1, $type);
}
}
$m[$xx][$yy] = $cost;
if ($use_types) {
$t[$xx][$yy] = $type;
}
}
if ($use_damerau) {
$d[$x[$xx - 1]] = ($xx - 1);
}
}
$this->distanceMatrix = $m;
if ($use_types) {
$this->typeMatrix = $t;
}
}
private function getInfinity() {
return INF;
}
private function printMatrix(array $m) {
$x = $this->x;
$y = $this->y;
$p = '% 8s ';
printf($p, ' ');
foreach (head($m) as $yk => $yv) {
printf($p, idx($y, $yk - 1, '-'));
}
echo "\n";
foreach ($m as $xk => $xv) {
printf($p, idx($x, $xk - 1, '-'));
foreach ($xv as $yk => $yv) {
printf($p, ($yv == $this->getInfinity() ? 'inf' : $yv));
}
echo "\n";
}
}
private function printTypeMatrix(array $t) {
$x = $this->x;
$y = $this->y;
$p = '% 8s ';
printf($p, ' ');
foreach (head($t) as $yk => $yv) {
printf($p, idx($y, $yk - 1, '-'));
}
echo "\n";
foreach ($t as $xk => $xv) {
printf($p, idx($x, $xk - 1, '-'));
foreach ($xv as $yk => $yv) {
printf($p, ($yv == $this->getInfinity() ? 'inf' : $yv));
}
echo "\n";
}
}
private function applySmoothing($str) {
$result = $str;
// Smooth the string out, by replacing short runs of similar characters
// with 'x' operations. This makes the result more readable to humans,
// since there are fewer choppy runs of short added and removed substrings.
do {
$original = $result;
- $result = preg_replace('/([xdi])(s{3})([xdi])/', '$1xxx$3', $result);
- $result = preg_replace('/([xdi])(s{2})([xdi])/', '$1xx$3', $result);
- $result = preg_replace('/([xdi])(s{1})([xdi])/', '$1x$3', $result);
+ $result = preg_replace('/(^|[xdi])(s{3})([xdi]|\z)/', '$1xxx$3', $result);
+ $result = preg_replace('/(^|[xdi])(s{2})([xdi]|\z)/', '$1xx$3', $result);
+ $result = preg_replace('/(^|[xdi])(s{1})([xdi]|\z)/', '$1x$3', $result);
} while ($result != $original);
return $result;
}
}
diff --git a/src/utils/PhutilProseDiff.php b/src/utils/PhutilProseDiff.php
index 84737fb..3dd279c 100644
--- a/src/utils/PhutilProseDiff.php
+++ b/src/utils/PhutilProseDiff.php
@@ -1,91 +1,195 @@
<?php
final class PhutilProseDiff extends Phobject {
private $parts = array();
public function addPart($type, $text) {
$this->parts[] = array(
'type' => $type,
'text' => $text,
);
return $this;
}
public function getParts() {
return $this->parts;
}
public function reorderParts() {
// Reorder sequences of removed and added sections to put all the "-"
// parts together first, then all the "+" parts together. This produces
// a more human-readable result than intermingling them.
+
$o_run = array();
$n_run = array();
$result = array();
foreach ($this->parts as $part) {
$type = $part['type'];
switch ($type) {
case '-':
$o_run[] = $part;
break;
case '+':
$n_run[] = $part;
break;
default:
- foreach ($o_run as $o) {
- $result[] = $o;
- }
- foreach ($n_run as $n) {
- $result[] = $n;
+ if ($o_run || $n_run) {
+ foreach ($this->combineRuns($o_run, $n_run) as $merged_part) {
+ $result[] = $merged_part;
+ }
+ $o_run = array();
+ $n_run = array();
}
$result[] = $part;
- $o_run = array();
- $n_run = array();
break;
}
}
- foreach ($o_run as $o) {
- $result[] = $o;
- }
- foreach ($n_run as $n) {
- $result[] = $n;
+ if ($o_run || $n_run) {
+ foreach ($this->combineRuns($o_run, $n_run) as $part) {
+ $result[] = $part;
+ }
}
// Now, combine consecuitive runs of the same type of change (like a
// series of "-" parts) into a single run.
$combined = array();
$last = null;
$last_text = null;
foreach ($result as $part) {
$type = $part['type'];
if ($last !== $type) {
if ($last !== null) {
$combined[] = array(
'type' => $last,
'text' => $last_text,
);
}
$last_text = null;
$last = $type;
}
$last_text .= $part['text'];
}
if ($last_text !== null) {
$combined[] = array(
'type' => $last,
'text' => $last_text,
);
}
$this->parts = $combined;
return $this;
}
+ private function combineRuns($o_run, $n_run) {
+ $o_merge = $this->mergeParts($o_run);
+ $n_merge = $this->mergeParts($n_run);
+
+ // When removed and added blocks share a prefix or suffix, we sometimes
+ // want to count it as unchanged (for example, if it is whitespace) but
+ // sometimes want to count it as changed (for example, if it is a word
+ // suffix like "ing"). Find common prefixes and suffixes of these layout
+ // characters and emit them as "=" (unchanged) blocks.
+
+ $layout_characters = array(
+ ' ' => true,
+ "\n" => true,
+ '.' => true,
+ '!' => true,
+ ',' => true,
+ '?' => true,
+ );
+
+ $o_text = $o_merge['text'];
+ $n_text = $n_merge['text'];
+ $o_len = strlen($o_text);
+ $n_len = strlen($n_text);
+ $min_len = min($o_len, $n_len);
+
+ $prefix_len = 0;
+ for ($pos = 0; $pos < $min_len; $pos++) {
+ $o = $o_text[$pos];
+ $n = $n_text[$pos];
+ if ($o !== $n) {
+ break;
+ }
+ if (empty($layout_characters[$o])) {
+ break;
+ }
+ $prefix_len++;
+ }
+
+ $suffix_len = 0;
+ for ($pos = 1; $pos <= $min_len; $pos++) {
+ $o = $o_text[$o_len - $pos];
+ $n = $n_text[$n_len - $pos];
+ if ($o !== $n) {
+ break;
+ }
+ if (empty($layout_characters[$o])) {
+ break;
+ }
+ $suffix_len++;
+ }
+
+ $results = array();
+
+ if ($prefix_len) {
+ $results[] = array(
+ 'type' => '=',
+ 'text' => substr($o_text, 0, $prefix_len),
+ );
+ }
+
+ if ($prefix_len < $o_len) {
+ $results[] = array(
+ 'type' => '-',
+ 'text' => substr($o_text, $prefix_len, $o_len - $suffix_len),
+ );
+ }
+
+ if ($prefix_len < $n_len) {
+ $results[] = array(
+ 'type' => '+',
+ 'text' => substr($n_text, $prefix_len, $n_len - $suffix_len),
+ );
+ }
+
+ if ($suffix_len) {
+ $results[] = array(
+ 'type' => '=',
+ 'text' => substr($o_text, -$suffix_len),
+ );
+ }
+
+ return $results;
+ }
+
+ private function mergeParts(array $parts) {
+ $text = '';
+ $type = null;
+ foreach ($parts as $part) {
+ $part_type = $part['type'];
+ if ($type === null) {
+ $type = $part_type;
+ }
+ if ($type !== $part_type) {
+ throw new Exception(pht('Can not merge parts of dissimilar types!'));
+ }
+ $text .= $part['text'];
+ }
+
+ return array(
+ 'type' => $type,
+ 'text' => $text,
+ );
+ }
+
+
}
diff --git a/src/utils/__tests__/PhutilProseDiffTestCase.php b/src/utils/__tests__/PhutilProseDiffTestCase.php
index 8028b52..918f23a 100644
--- a/src/utils/__tests__/PhutilProseDiffTestCase.php
+++ b/src/utils/__tests__/PhutilProseDiffTestCase.php
@@ -1,77 +1,106 @@
<?php
final class PhutilProseDiffTestCase extends PhutilTestCase {
public function testProseDiffsDistance() {
$this->assertProseParts(
'',
'',
array(),
pht('Empty'));
$this->assertProseParts(
"xxx\nyyy",
"xxx\nzzz\nyyy",
array(
"= xxx\n",
"+ zzz\n",
'= yyy',
),
pht('Add Paragraph'));
$this->assertProseParts(
"xxx\nzzz\nyyy",
"xxx\nyyy",
array(
"= xxx\n",
"- zzz\n",
'= yyy',
),
pht('Remove Paragraph'));
// Without smoothing, the alogorithm identifies that "shark" and "cat"
// both contain the letter "a" and tries to express this as a very
// fine-grained edit which replaces "sh" with "c" and then "rk" with "t".
// This is technically correct, but it is much easier for human viewers to
// parse if we smooth this into a single removal and a single addition.
$this->assertProseParts(
'They say the shark has nine lives.',
'They say the cat has nine lives.',
array(
'= They say the ',
'- shark',
'+ cat',
'= has nine lives.',
),
pht('"Shark/cat" word edit smoothenss.'));
$this->assertProseParts(
'Rising quickly, she says',
'Rising quickly, she remarks:',
array(
'= Rising quickly, she ',
'- says',
'+ remarks:',
),
pht('"Says/remarks" word edit smoothenss.'));
+ $this->assertProseParts(
+ 'See screenshots',
+ 'Viewed video files',
+ array(
+ '- See screenshots',
+ '+ Viewed video files',
+ ),
+ pht('Complete paragraph rewrite.'));
+
+ $this->assertProseParts(
+ 'xaaax',
+ 'xbbbx',
+ array(
+ '- xaaax',
+ '+ xbbbx',
+ ),
+ pht('Whole word rewrite with common prefix and suffix.'));
+
+ $this->assertProseParts(
+ ' aaa ',
+ ' bbb ',
+ array(
+ '= ',
+ '- aaa',
+ '+ bbb',
+ '= ',
+ ),
+ pht('Whole word rewrite with whitespace prefix and suffix.'));
+
}
private function assertProseParts($old, $new, array $expect_parts, $label) {
$engine = new PhutilProseDifferenceEngine();
$diff = $engine->getDiff($old, $new);
$parts = $diff->getParts();
$actual = array();
foreach ($parts as $part) {
$actual[] = $part['type'].' '.$part['text'];
}
$this->assertEqual($expect_parts, $actual, $label);
}
}

File Metadata

Mime Type
text/x-diff
Expires
Fri, Jan 16, 1:07 AM (16 h, 53 m)
Storage Engine
local-disk
Storage Format
Raw Data
Storage Handle
f5/8c/4e08ccc11861a928b144b99a3ddf

Event Timeline