Page Menu
Home
GnuPG
Search
Configure Global Search
Log In
Files
F34555739
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Size
22 KB
Subscribers
None
View Options
diff --git a/src/utils/PhutilEditDistanceMatrix.php b/src/utils/PhutilEditDistanceMatrix.php
index 8e8b900..74e6f82 100644
--- a/src/utils/PhutilEditDistanceMatrix.php
+++ b/src/utils/PhutilEditDistanceMatrix.php
@@ -1,523 +1,527 @@
<?php
/**
* Compute edit distance between two scalar sequences. This class uses
* Levenshtein (or Damerau-Levenshtein) to compute the edit distance between
* two inputs. The inputs are arrays containing any scalars (not just strings)
* so it can be used with, e.g., utf8 sequences.
*
* $matrix = id(new PhutilEditDistanceMatrix())
* ->setSequences(str_split('ran'), str_split('rat'));
*
* $cost = $matrix->getEditDistance();
*
* Edit distance computation is slow and requires a large amount of memory;
* both are roughly O(N * M) in the length of the strings.
*
* You can customize the cost of insertions, deletions and replacements. By
* default, each has a cost of 1.
*
* $matrix->setReplaceCost(1);
*
* By default, transpositions are not evaluated (i.e., the algorithm is
* Levenshtein). You can cause transpositions to be evaluated by setting a
* transpose cost (which will change the algorithm to Damerau-Levenshtein):
*
* $matrix->setTransposeCost(1);
*
* You can also provide a cost to alter the type of operation being applied.
* Many strings have several equivalently expensive edit sequences, but one
* some sequences are more readable to humans than others. Providing a small
* cost to alter operation type tends to smooth out the sequence and produce
* long runs of a single operation, which are generally more readable. For
* example, these strings:
*
* (x)
* ((x))
*
* ...have edit strings "issis" and "isssi", which describe edit operations with
* the same cost, but the latter string is more comprehensible to human viewers.
*
* If you set an alter cost, you must call @{method:setComputeString} to enable
* type computation. The alter cost should generally be smaller than `c / N`,
* where `c` is the smallest operational cost and `N` is the length of the
* longest string. For example, if you are using the default costs (insert = 1,
* delete = 1, replace = 1) and computing edit distances for strings of fewer
* than 1,000 characters, you might set the alter cost to 0.001.
*/
final class PhutilEditDistanceMatrix extends Phobject {
private $insertCost = 1;
private $deleteCost = 1;
private $replaceCost = 1;
private $transposeCost = null;
private $alterCost = 0;
private $maximumLength;
private $computeString;
private $applySmoothing;
private $x;
private $y;
private $prefix;
private $suffix;
private $distanceMatrix = null;
private $typeMatrix = null;
public function setMaximumLength($maximum_length) {
$this->maximumLength = $maximum_length;
return $this;
}
public function getMaximumLength() {
return coalesce($this->maximumLength, $this->getInfinity());
}
public function setComputeString($compute_string) {
$this->computeString = $compute_string;
return $this;
}
public function getComputeString() {
return $this->computeString;
}
public function setTransposeCost($transpose_cost) {
$this->transposeCost = $transpose_cost;
return $this;
}
public function getTransposeCost() {
return $this->transposeCost;
}
public function setReplaceCost($replace_cost) {
$this->replaceCost = $replace_cost;
return $this;
}
public function getReplaceCost() {
return $this->replaceCost;
}
public function setDeleteCost($delete_cost) {
$this->deleteCost = $delete_cost;
return $this;
}
public function getDeleteCost() {
return $this->deleteCost;
}
public function setInsertCost($insert_cost) {
$this->insertCost = $insert_cost;
return $this;
}
public function getInsertCost() {
return $this->insertCost;
}
public function setAlterCost($alter_cost) {
$this->alterCost = $alter_cost;
return $this;
}
public function getAlterCost() {
return $this->alterCost;
}
public function setApplySmoothing($apply_smoothing) {
$this->applySmoothing = $apply_smoothing;
return $this;
}
public function getApplySmoothing() {
return $this->applySmoothing;
}
public function setSequences(array $x, array $y) {
// NOTE: We strip common prefixes and suffixes from the inputs because
// the runtime of the edit distance algorithm is large and it is common
// to diff similar strings.
$xl = count($x);
$yl = count($y);
$l = min($xl, $yl);
$prefix_l = 0;
$suffix_l = 0;
for ($ii = 0; $ii < $l; $ii++) {
if ($x[$ii] !== $y[$ii]) {
break;
}
$prefix_l++;
}
for ($ii = 1; $ii <= ($l - $prefix_l); $ii++) {
if ($x[$xl - $ii] !== $y[$yl - $ii]) {
break;
}
$suffix_l++;
}
$this->prefix = array_slice($x, 0, $prefix_l);
$this->suffix = array_slice($x, $xl - $suffix_l);
$this->x = array_slice($x, $prefix_l, $xl - ($suffix_l + $prefix_l));
$this->y = array_slice($y, $prefix_l, $yl - ($suffix_l + $prefix_l));
$this->distanceMatrix = null;
return $this;
}
private function requireSequences() {
if ($this->x === null) {
throw new PhutilInvalidStateException('setSequences');
}
}
public function getEditDistance() {
$this->requireSequences();
$x = $this->x;
$y = $this->y;
if (!$x) {
return $this->insertCost * count($y);
}
if (!$y) {
return $this->deleteCost * count($x);
}
$max = $this->getMaximumLength();
if (count($x) > $max || count($y) > $max) {
return ($this->insertCost * count($y)) + ($this->deleteCost * count($x));
}
if ($x === $y) {
return 0;
}
$matrix = $this->getDistanceMatrix();
return $matrix[count($x)][count($y)];
}
/**
* Return a string representing the edits between the sequences. The string
* has these characters:
*
* - s (same): Same character in both strings.
* - i (insert): Character inserted.
* - d (delete): Character deleted.
* - x (replace): Character replaced.
* - t (transpose): Character transposed.
*/
public function getEditString() {
$this->requireSequences();
$x = $this->x;
$y = $this->y;
if (!$x && !$y) {
return $this->padEditString('');
}
if (!$x) {
return $this->padEditString(str_repeat('i', count($y)));
}
if (!$y) {
return $this->padEditString(str_repeat('d', count($x)));
}
if ($x === $y) {
return $this->padEditString(str_repeat('s', count($x)));
}
$max = $this->getMaximumLength();
if (count($x) > $max || count($y) > $max) {
return $this->padEditString(
str_repeat('d', count($x)).
str_repeat('i', count($y)));
}
$matrix = $this->getTypeMatrix();
$xx = count($x);
$yy = count($y);
$transposes = array();
$str = '';
while (true) {
$type = $matrix[$xx][$yy];
if (is_array($type)) {
$chr = 't';
$transposes[$type[0]][$type[1]] = true;
$type = $type[2];
} else {
$chr = $type;
}
if (isset($transposes[$xx][$yy])) {
$chr = 't';
}
if ($type == 's' || $type == 'x') {
$xx -= 1;
$yy -= 1;
} else if ($type == 'i') {
$yy -= 1;
} else if ($type == 'd') {
$xx -= 1;
} else {
throw new Exception(pht("Unknown type '%s' in type matrix.", $type));
}
$str .= $chr;
if ($xx <= 0 && $yy <= 0) {
break;
}
}
$str = strrev($str);
+ // We pad the edit string before smoothing it, so ranges of similar
+ // characters at the beginning or end of the string can also be smoothed.
+ $str = $this->padEditString($str);
+
if ($this->getApplySmoothing()) {
$str = $this->applySmoothing($str);
}
- return $this->padEditString($str);
+ return $str;
}
private function padEditString($str) {
if ($this->prefix) {
$str = str_repeat('s', count($this->prefix)).$str;
}
if ($this->suffix) {
$str = $str.str_repeat('s', count($this->suffix));
}
return $str;
}
private function getTypeMatrix() {
if (!$this->computeString) {
throw new PhutilInvalidStateException('setComputeString');
}
if ($this->typeMatrix === null) {
$this->computeMatrix($this->x, $this->y);
}
return $this->typeMatrix;
}
private function getDistanceMatrix() {
if ($this->distanceMatrix === null) {
$this->computeMatrix($this->x, $this->y);
}
return $this->distanceMatrix;
}
private function computeMatrix(array $x, array $y) {
$xl = count($x);
$yl = count($y);
$m = array();
$infinity = $this->getInfinity();
$use_damerau = ($this->transposeCost !== null);
if ($use_damerau) {
// Initialize the default cost of a transpose.
$m[-1][-1] = $infinity;
// Initialize the character position dictionary for Damerau.
$d = array();
foreach ($x as $c) {
$d[$c] = -1;
}
foreach ($y as $c) {
$d[$c] = -1;
}
// Initialize the transpose costs for Damerau.
for ($xx = 0; $xx <= $xl; $xx++) {
$m[$xx][-1] = $infinity;
}
for ($yy = 0; $yy <= $yl; $yy++) {
$m[-1][$yy] = $infinity;
}
}
// Initialize the top row of the matrix.
for ($xx = 0; $xx <= $xl; $xx++) {
$m[$xx][0] = $xx * $this->deleteCost;
}
// Initialize the left column of the matrix.
$cost = 0;
for ($yy = 0; $yy <= $yl; $yy++) {
$m[0][$yy] = $yy * $this->insertCost;
}
$use_types = ($this->computeString);
if ($use_types) {
$t = array();
for ($xx = 0; $xx <= $xl; $xx++) {
$t[$xx][0] = 'd';
}
for ($yy = 0; $yy <= $yl; $yy++) {
$t[0][$yy] = 'i';
}
$t[0][0] = 's';
}
$alt_cost = $this->getAlterCost();
if ($alt_cost && !$use_types) {
throw new Exception(
pht(
'If you provide an alter cost with %s, you must enable '.
'type computation with %s.',
'setAlterCost()',
'setComputeStrings()'));
}
// Build the edit distance matrix.
for ($xx = 1; $xx <= $xl; $xx++) {
$last_dy = -1;
for ($yy = 1; $yy <= $yl; $yy++) {
if ($use_damerau) {
$dx = $d[$y[$yy - 1]];
$dy = $last_dy;
}
if ($x[$xx - 1] === $y[$yy - 1]) {
$rep_cost = $m[$xx - 1][$yy - 1] + 0;
$rep_type = 's';
} else {
$rep_cost = $m[$xx - 1][$yy - 1] + $this->replaceCost;
$rep_type = 'x';
}
$del_cost = $m[$xx - 1][$yy ] + $this->deleteCost;
$ins_cost = $m[$xx ][$yy - 1] + $this->insertCost;
if ($alt_cost) {
$del_char = $t[$xx - 1][$yy ];
$ins_char = $t[$xx ][$yy - 1];
$rep_char = $t[$xx - 1][$yy - 1];
if ($del_char !== 'd') {
$del_cost += $alt_cost;
}
if ($ins_char !== 'i') {
$ins_cost += $alt_cost;
}
if ($rep_char !== $rep_type) {
$rep_cost += $alt_cost;
}
}
if ($rep_cost <= $del_cost && $rep_cost <= $ins_cost) {
$cost = $rep_cost;
$type = $rep_type;
if ($rep_type === 's') {
$last_dy = $yy - 1;
}
} else if ($ins_cost <= $del_cost) {
$cost = $ins_cost;
$type = 'i';
} else {
$cost = $del_cost;
$type = 'd';
}
if ($use_damerau) {
$trn_count = (($xx - $dx - 2) + ($yy - $dy - 2) + 1);
$trn_cost = $m[$dx][$dy] + ($trn_count * $this->transposeCost);
if ($trn_cost < $cost) {
$cost = $trn_cost;
$type = array($dx + 1, $dy + 1, $type);
}
}
$m[$xx][$yy] = $cost;
if ($use_types) {
$t[$xx][$yy] = $type;
}
}
if ($use_damerau) {
$d[$x[$xx - 1]] = ($xx - 1);
}
}
$this->distanceMatrix = $m;
if ($use_types) {
$this->typeMatrix = $t;
}
}
private function getInfinity() {
return INF;
}
private function printMatrix(array $m) {
$x = $this->x;
$y = $this->y;
$p = '% 8s ';
printf($p, ' ');
foreach (head($m) as $yk => $yv) {
printf($p, idx($y, $yk - 1, '-'));
}
echo "\n";
foreach ($m as $xk => $xv) {
printf($p, idx($x, $xk - 1, '-'));
foreach ($xv as $yk => $yv) {
printf($p, ($yv == $this->getInfinity() ? 'inf' : $yv));
}
echo "\n";
}
}
private function printTypeMatrix(array $t) {
$x = $this->x;
$y = $this->y;
$p = '% 8s ';
printf($p, ' ');
foreach (head($t) as $yk => $yv) {
printf($p, idx($y, $yk - 1, '-'));
}
echo "\n";
foreach ($t as $xk => $xv) {
printf($p, idx($x, $xk - 1, '-'));
foreach ($xv as $yk => $yv) {
printf($p, ($yv == $this->getInfinity() ? 'inf' : $yv));
}
echo "\n";
}
}
private function applySmoothing($str) {
$result = $str;
// Smooth the string out, by replacing short runs of similar characters
// with 'x' operations. This makes the result more readable to humans,
// since there are fewer choppy runs of short added and removed substrings.
do {
$original = $result;
- $result = preg_replace('/([xdi])(s{3})([xdi])/', '$1xxx$3', $result);
- $result = preg_replace('/([xdi])(s{2})([xdi])/', '$1xx$3', $result);
- $result = preg_replace('/([xdi])(s{1})([xdi])/', '$1x$3', $result);
+ $result = preg_replace('/(^|[xdi])(s{3})([xdi]|\z)/', '$1xxx$3', $result);
+ $result = preg_replace('/(^|[xdi])(s{2})([xdi]|\z)/', '$1xx$3', $result);
+ $result = preg_replace('/(^|[xdi])(s{1})([xdi]|\z)/', '$1x$3', $result);
} while ($result != $original);
return $result;
}
}
diff --git a/src/utils/PhutilProseDiff.php b/src/utils/PhutilProseDiff.php
index 84737fb..3dd279c 100644
--- a/src/utils/PhutilProseDiff.php
+++ b/src/utils/PhutilProseDiff.php
@@ -1,91 +1,195 @@
<?php
final class PhutilProseDiff extends Phobject {
private $parts = array();
public function addPart($type, $text) {
$this->parts[] = array(
'type' => $type,
'text' => $text,
);
return $this;
}
public function getParts() {
return $this->parts;
}
public function reorderParts() {
// Reorder sequences of removed and added sections to put all the "-"
// parts together first, then all the "+" parts together. This produces
// a more human-readable result than intermingling them.
+
$o_run = array();
$n_run = array();
$result = array();
foreach ($this->parts as $part) {
$type = $part['type'];
switch ($type) {
case '-':
$o_run[] = $part;
break;
case '+':
$n_run[] = $part;
break;
default:
- foreach ($o_run as $o) {
- $result[] = $o;
- }
- foreach ($n_run as $n) {
- $result[] = $n;
+ if ($o_run || $n_run) {
+ foreach ($this->combineRuns($o_run, $n_run) as $merged_part) {
+ $result[] = $merged_part;
+ }
+ $o_run = array();
+ $n_run = array();
}
$result[] = $part;
- $o_run = array();
- $n_run = array();
break;
}
}
- foreach ($o_run as $o) {
- $result[] = $o;
- }
- foreach ($n_run as $n) {
- $result[] = $n;
+ if ($o_run || $n_run) {
+ foreach ($this->combineRuns($o_run, $n_run) as $part) {
+ $result[] = $part;
+ }
}
// Now, combine consecuitive runs of the same type of change (like a
// series of "-" parts) into a single run.
$combined = array();
$last = null;
$last_text = null;
foreach ($result as $part) {
$type = $part['type'];
if ($last !== $type) {
if ($last !== null) {
$combined[] = array(
'type' => $last,
'text' => $last_text,
);
}
$last_text = null;
$last = $type;
}
$last_text .= $part['text'];
}
if ($last_text !== null) {
$combined[] = array(
'type' => $last,
'text' => $last_text,
);
}
$this->parts = $combined;
return $this;
}
+ private function combineRuns($o_run, $n_run) {
+ $o_merge = $this->mergeParts($o_run);
+ $n_merge = $this->mergeParts($n_run);
+
+ // When removed and added blocks share a prefix or suffix, we sometimes
+ // want to count it as unchanged (for example, if it is whitespace) but
+ // sometimes want to count it as changed (for example, if it is a word
+ // suffix like "ing"). Find common prefixes and suffixes of these layout
+ // characters and emit them as "=" (unchanged) blocks.
+
+ $layout_characters = array(
+ ' ' => true,
+ "\n" => true,
+ '.' => true,
+ '!' => true,
+ ',' => true,
+ '?' => true,
+ );
+
+ $o_text = $o_merge['text'];
+ $n_text = $n_merge['text'];
+ $o_len = strlen($o_text);
+ $n_len = strlen($n_text);
+ $min_len = min($o_len, $n_len);
+
+ $prefix_len = 0;
+ for ($pos = 0; $pos < $min_len; $pos++) {
+ $o = $o_text[$pos];
+ $n = $n_text[$pos];
+ if ($o !== $n) {
+ break;
+ }
+ if (empty($layout_characters[$o])) {
+ break;
+ }
+ $prefix_len++;
+ }
+
+ $suffix_len = 0;
+ for ($pos = 1; $pos <= $min_len; $pos++) {
+ $o = $o_text[$o_len - $pos];
+ $n = $n_text[$n_len - $pos];
+ if ($o !== $n) {
+ break;
+ }
+ if (empty($layout_characters[$o])) {
+ break;
+ }
+ $suffix_len++;
+ }
+
+ $results = array();
+
+ if ($prefix_len) {
+ $results[] = array(
+ 'type' => '=',
+ 'text' => substr($o_text, 0, $prefix_len),
+ );
+ }
+
+ if ($prefix_len < $o_len) {
+ $results[] = array(
+ 'type' => '-',
+ 'text' => substr($o_text, $prefix_len, $o_len - $suffix_len),
+ );
+ }
+
+ if ($prefix_len < $n_len) {
+ $results[] = array(
+ 'type' => '+',
+ 'text' => substr($n_text, $prefix_len, $n_len - $suffix_len),
+ );
+ }
+
+ if ($suffix_len) {
+ $results[] = array(
+ 'type' => '=',
+ 'text' => substr($o_text, -$suffix_len),
+ );
+ }
+
+ return $results;
+ }
+
+ private function mergeParts(array $parts) {
+ $text = '';
+ $type = null;
+ foreach ($parts as $part) {
+ $part_type = $part['type'];
+ if ($type === null) {
+ $type = $part_type;
+ }
+ if ($type !== $part_type) {
+ throw new Exception(pht('Can not merge parts of dissimilar types!'));
+ }
+ $text .= $part['text'];
+ }
+
+ return array(
+ 'type' => $type,
+ 'text' => $text,
+ );
+ }
+
+
}
diff --git a/src/utils/__tests__/PhutilProseDiffTestCase.php b/src/utils/__tests__/PhutilProseDiffTestCase.php
index 8028b52..918f23a 100644
--- a/src/utils/__tests__/PhutilProseDiffTestCase.php
+++ b/src/utils/__tests__/PhutilProseDiffTestCase.php
@@ -1,77 +1,106 @@
<?php
final class PhutilProseDiffTestCase extends PhutilTestCase {
public function testProseDiffsDistance() {
$this->assertProseParts(
'',
'',
array(),
pht('Empty'));
$this->assertProseParts(
"xxx\nyyy",
"xxx\nzzz\nyyy",
array(
"= xxx\n",
"+ zzz\n",
'= yyy',
),
pht('Add Paragraph'));
$this->assertProseParts(
"xxx\nzzz\nyyy",
"xxx\nyyy",
array(
"= xxx\n",
"- zzz\n",
'= yyy',
),
pht('Remove Paragraph'));
// Without smoothing, the alogorithm identifies that "shark" and "cat"
// both contain the letter "a" and tries to express this as a very
// fine-grained edit which replaces "sh" with "c" and then "rk" with "t".
// This is technically correct, but it is much easier for human viewers to
// parse if we smooth this into a single removal and a single addition.
$this->assertProseParts(
'They say the shark has nine lives.',
'They say the cat has nine lives.',
array(
'= They say the ',
'- shark',
'+ cat',
'= has nine lives.',
),
pht('"Shark/cat" word edit smoothenss.'));
$this->assertProseParts(
'Rising quickly, she says',
'Rising quickly, she remarks:',
array(
'= Rising quickly, she ',
'- says',
'+ remarks:',
),
pht('"Says/remarks" word edit smoothenss.'));
+ $this->assertProseParts(
+ 'See screenshots',
+ 'Viewed video files',
+ array(
+ '- See screenshots',
+ '+ Viewed video files',
+ ),
+ pht('Complete paragraph rewrite.'));
+
+ $this->assertProseParts(
+ 'xaaax',
+ 'xbbbx',
+ array(
+ '- xaaax',
+ '+ xbbbx',
+ ),
+ pht('Whole word rewrite with common prefix and suffix.'));
+
+ $this->assertProseParts(
+ ' aaa ',
+ ' bbb ',
+ array(
+ '= ',
+ '- aaa',
+ '+ bbb',
+ '= ',
+ ),
+ pht('Whole word rewrite with whitespace prefix and suffix.'));
+
}
private function assertProseParts($old, $new, array $expect_parts, $label) {
$engine = new PhutilProseDifferenceEngine();
$diff = $engine->getDiff($old, $new);
$parts = $diff->getParts();
$actual = array();
foreach ($parts as $part) {
$actual[] = $part['type'].' '.$part['text'];
}
$this->assertEqual($expect_parts, $actual, $label);
}
}
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Fri, Jan 16, 1:07 AM (16 h, 53 m)
Storage Engine
local-disk
Storage Format
Raw Data
Storage Handle
f5/8c/4e08ccc11861a928b144b99a3ddf
Attached To
rPHUTIL libphutil
Event Timeline
Log In to Comment