1 | // Copyright 2022 The Go Authors. All rights reserved. |
---|---|
2 | // Use of this source code is governed by a BSD-style |
3 | // license that can be found in the LICENSE file. |
4 | |
5 | package lcs |
6 | |
7 | // TODO(adonovan): remove unclear references to "old" in this package. |
8 | |
9 | import ( |
10 | "fmt" |
11 | ) |
12 | |
// Diff describes one edit: the half-open range [Start, End) of A is
// replaced by the half-open range [ReplStart, ReplEnd) of B.
type Diff struct {
	Start     int // offset in A where the deleted portion begins
	End       int // offset in A just past the deleted portion
	ReplStart int // offset in B where the replacement begins
	ReplEnd   int // offset in B just past the replacement
}
18 | |
19 | // DiffStrings returns the differences between two strings. |
20 | // It does not respect rune boundaries. |
21 | func DiffStrings(a, b string) []Diff { return diff(stringSeqs{a, b}) } |
22 | |
23 | // DiffBytes returns the differences between two byte sequences. |
24 | // It does not respect rune boundaries. |
25 | func DiffBytes(a, b []byte) []Diff { return diff(bytesSeqs{a, b}) } |
26 | |
27 | // DiffRunes returns the differences between two rune sequences. |
28 | func DiffRunes(a, b []rune) []Diff { return diff(runesSeqs{a, b}) } |
29 | |
30 | func diff(seqs sequences) []Diff { |
31 | // A limit on how deeply the LCS algorithm should search. The value is just a guess. |
32 | const maxDiffs = 30 |
33 | diff, _ := compute(seqs, twosided, maxDiffs/2) |
34 | return diff |
35 | } |
36 | |
37 | // compute computes the list of differences between two sequences, |
38 | // along with the LCS. It is exercised directly by tests. |
39 | // The algorithm is one of {forward, backward, twosided}. |
40 | func compute(seqs sequences, algo func(*editGraph) lcs, limit int) ([]Diff, lcs) { |
41 | if limit <= 0 { |
42 | limit = 1 << 25 // effectively infinity |
43 | } |
44 | alen, blen := seqs.lengths() |
45 | g := &editGraph{ |
46 | seqs: seqs, |
47 | vf: newtriang(limit), |
48 | vb: newtriang(limit), |
49 | limit: limit, |
50 | ux: alen, |
51 | uy: blen, |
52 | delta: alen - blen, |
53 | } |
54 | lcs := algo(g) |
55 | diffs := lcs.toDiffs(alen, blen) |
56 | return diffs, lcs |
57 | } |
58 | |
59 | // editGraph carries the information for computing the lcs of two sequences. |
60 | type editGraph struct { |
61 | seqs sequences |
62 | vf, vb label // forward and backward labels |
63 | |
64 | limit int // maximal value of D |
65 | // the bounding rectangle of the current edit graph |
66 | lx, ly, ux, uy int |
67 | delta int // common subexpression: (ux-lx)-(uy-ly) |
68 | } |
69 | |
70 | // toDiffs converts an LCS to a list of edits. |
71 | func (lcs lcs) toDiffs(alen, blen int) []Diff { |
72 | var diffs []Diff |
73 | var pa, pb int // offsets in a, b |
74 | for _, l := range lcs { |
75 | if pa < l.X || pb < l.Y { |
76 | diffs = append(diffs, Diff{pa, l.X, pb, l.Y}) |
77 | } |
78 | pa = l.X + l.Len |
79 | pb = l.Y + l.Len |
80 | } |
81 | if pa < alen || pb < blen { |
82 | diffs = append(diffs, Diff{pa, alen, pb, blen}) |
83 | } |
84 | return diffs |
85 | } |
86 | |
87 | // --- FORWARD --- |
88 | |
89 | // fdone decides if the forwward path has reached the upper right |
90 | // corner of the rectangle. If so, it also returns the computed lcs. |
91 | func (e *editGraph) fdone(D, k int) (bool, lcs) { |
92 | // x, y, k are relative to the rectangle |
93 | x := e.vf.get(D, k) |
94 | y := x - k |
95 | if x == e.ux && y == e.uy { |
96 | return true, e.forwardlcs(D, k) |
97 | } |
98 | return false, nil |
99 | } |
100 | |
101 | // run the forward algorithm, until success or up to the limit on D. |
102 | func forward(e *editGraph) lcs { |
103 | e.setForward(0, 0, e.lx) |
104 | if ok, ans := e.fdone(0, 0); ok { |
105 | return ans |
106 | } |
107 | // from D to D+1 |
108 | for D := 0; D < e.limit; D++ { |
109 | e.setForward(D+1, -(D + 1), e.getForward(D, -D)) |
110 | if ok, ans := e.fdone(D+1, -(D + 1)); ok { |
111 | return ans |
112 | } |
113 | e.setForward(D+1, D+1, e.getForward(D, D)+1) |
114 | if ok, ans := e.fdone(D+1, D+1); ok { |
115 | return ans |
116 | } |
117 | for k := -D + 1; k <= D-1; k += 2 { |
118 | // these are tricky and easy to get backwards |
119 | lookv := e.lookForward(k, e.getForward(D, k-1)+1) |
120 | lookh := e.lookForward(k, e.getForward(D, k+1)) |
121 | if lookv > lookh { |
122 | e.setForward(D+1, k, lookv) |
123 | } else { |
124 | e.setForward(D+1, k, lookh) |
125 | } |
126 | if ok, ans := e.fdone(D+1, k); ok { |
127 | return ans |
128 | } |
129 | } |
130 | } |
131 | // D is too large |
132 | // find the D path with maximal x+y inside the rectangle and |
133 | // use that to compute the found part of the lcs |
134 | kmax := -e.limit - 1 |
135 | diagmax := -1 |
136 | for k := -e.limit; k <= e.limit; k += 2 { |
137 | x := e.getForward(e.limit, k) |
138 | y := x - k |
139 | if x+y > diagmax && x <= e.ux && y <= e.uy { |
140 | diagmax, kmax = x+y, k |
141 | } |
142 | } |
143 | return e.forwardlcs(e.limit, kmax) |
144 | } |
145 | |
146 | // recover the lcs by backtracking from the farthest point reached |
147 | func (e *editGraph) forwardlcs(D, k int) lcs { |
148 | var ans lcs |
149 | for x := e.getForward(D, k); x != 0 || x-k != 0; { |
150 | if ok(D-1, k-1) && x-1 == e.getForward(D-1, k-1) { |
151 | // if (x-1,y) is labelled D-1, x--,D--,k--,continue |
152 | D, k, x = D-1, k-1, x-1 |
153 | continue |
154 | } else if ok(D-1, k+1) && x == e.getForward(D-1, k+1) { |
155 | // if (x,y-1) is labelled D-1, x, D--,k++, continue |
156 | D, k = D-1, k+1 |
157 | continue |
158 | } |
159 | // if (x-1,y-1)--(x,y) is a diagonal, prepend,x--,y--, continue |
160 | y := x - k |
161 | ans = ans.prepend(x+e.lx-1, y+e.ly-1) |
162 | x-- |
163 | } |
164 | return ans |
165 | } |
166 | |
167 | // start at (x,y), go up the diagonal as far as possible, |
168 | // and label the result with d |
169 | func (e *editGraph) lookForward(k, relx int) int { |
170 | rely := relx - k |
171 | x, y := relx+e.lx, rely+e.ly |
172 | if x < e.ux && y < e.uy { |
173 | x += e.seqs.commonPrefixLen(x, e.ux, y, e.uy) |
174 | } |
175 | return x |
176 | } |
177 | |
178 | func (e *editGraph) setForward(d, k, relx int) { |
179 | x := e.lookForward(k, relx) |
180 | e.vf.set(d, k, x-e.lx) |
181 | } |
182 | |
183 | func (e *editGraph) getForward(d, k int) int { |
184 | x := e.vf.get(d, k) |
185 | return x |
186 | } |
187 | |
188 | // --- BACKWARD --- |
189 | |
190 | // bdone decides if the backward path has reached the lower left corner |
191 | func (e *editGraph) bdone(D, k int) (bool, lcs) { |
192 | // x, y, k are relative to the rectangle |
193 | x := e.vb.get(D, k) |
194 | y := x - (k + e.delta) |
195 | if x == 0 && y == 0 { |
196 | return true, e.backwardlcs(D, k) |
197 | } |
198 | return false, nil |
199 | } |
200 | |
201 | // run the backward algorithm, until success or up to the limit on D. |
202 | func backward(e *editGraph) lcs { |
203 | e.setBackward(0, 0, e.ux) |
204 | if ok, ans := e.bdone(0, 0); ok { |
205 | return ans |
206 | } |
207 | // from D to D+1 |
208 | for D := 0; D < e.limit; D++ { |
209 | e.setBackward(D+1, -(D + 1), e.getBackward(D, -D)-1) |
210 | if ok, ans := e.bdone(D+1, -(D + 1)); ok { |
211 | return ans |
212 | } |
213 | e.setBackward(D+1, D+1, e.getBackward(D, D)) |
214 | if ok, ans := e.bdone(D+1, D+1); ok { |
215 | return ans |
216 | } |
217 | for k := -D + 1; k <= D-1; k += 2 { |
218 | // these are tricky and easy to get wrong |
219 | lookv := e.lookBackward(k, e.getBackward(D, k-1)) |
220 | lookh := e.lookBackward(k, e.getBackward(D, k+1)-1) |
221 | if lookv < lookh { |
222 | e.setBackward(D+1, k, lookv) |
223 | } else { |
224 | e.setBackward(D+1, k, lookh) |
225 | } |
226 | if ok, ans := e.bdone(D+1, k); ok { |
227 | return ans |
228 | } |
229 | } |
230 | } |
231 | |
232 | // D is too large |
233 | // find the D path with minimal x+y inside the rectangle and |
234 | // use that to compute the part of the lcs found |
235 | kmax := -e.limit - 1 |
236 | diagmin := 1 << 25 |
237 | for k := -e.limit; k <= e.limit; k += 2 { |
238 | x := e.getBackward(e.limit, k) |
239 | y := x - (k + e.delta) |
240 | if x+y < diagmin && x >= 0 && y >= 0 { |
241 | diagmin, kmax = x+y, k |
242 | } |
243 | } |
244 | if kmax < -e.limit { |
245 | panic(fmt.Sprintf("no paths when limit=%d?", e.limit)) |
246 | } |
247 | return e.backwardlcs(e.limit, kmax) |
248 | } |
249 | |
250 | // recover the lcs by backtracking |
251 | func (e *editGraph) backwardlcs(D, k int) lcs { |
252 | var ans lcs |
253 | for x := e.getBackward(D, k); x != e.ux || x-(k+e.delta) != e.uy; { |
254 | if ok(D-1, k-1) && x == e.getBackward(D-1, k-1) { |
255 | // D--, k--, x unchanged |
256 | D, k = D-1, k-1 |
257 | continue |
258 | } else if ok(D-1, k+1) && x+1 == e.getBackward(D-1, k+1) { |
259 | // D--, k++, x++ |
260 | D, k, x = D-1, k+1, x+1 |
261 | continue |
262 | } |
263 | y := x - (k + e.delta) |
264 | ans = ans.append(x+e.lx, y+e.ly) |
265 | x++ |
266 | } |
267 | return ans |
268 | } |
269 | |
270 | // start at (x,y), go down the diagonal as far as possible, |
271 | func (e *editGraph) lookBackward(k, relx int) int { |
272 | rely := relx - (k + e.delta) // forward k = k + e.delta |
273 | x, y := relx+e.lx, rely+e.ly |
274 | if x > 0 && y > 0 { |
275 | x -= e.seqs.commonSuffixLen(0, x, 0, y) |
276 | } |
277 | return x |
278 | } |
279 | |
280 | // convert to rectangle, and label the result with d |
281 | func (e *editGraph) setBackward(d, k, relx int) { |
282 | x := e.lookBackward(k, relx) |
283 | e.vb.set(d, k, x-e.lx) |
284 | } |
285 | |
286 | func (e *editGraph) getBackward(d, k int) int { |
287 | x := e.vb.get(d, k) |
288 | return x |
289 | } |
290 | |
291 | // -- TWOSIDED --- |
292 | |
293 | func twosided(e *editGraph) lcs { |
294 | // The termination condition could be improved, as either the forward |
295 | // or backward pass could succeed before Myers' Lemma applies. |
296 | // Aside from questions of efficiency (is the extra testing cost-effective) |
297 | // this is more likely to matter when e.limit is reached. |
298 | e.setForward(0, 0, e.lx) |
299 | e.setBackward(0, 0, e.ux) |
300 | |
301 | // from D to D+1 |
302 | for D := 0; D < e.limit; D++ { |
303 | // just finished a backwards pass, so check |
304 | if got, ok := e.twoDone(D, D); ok { |
305 | return e.twolcs(D, D, got) |
306 | } |
307 | // do a forwards pass (D to D+1) |
308 | e.setForward(D+1, -(D + 1), e.getForward(D, -D)) |
309 | e.setForward(D+1, D+1, e.getForward(D, D)+1) |
310 | for k := -D + 1; k <= D-1; k += 2 { |
311 | // these are tricky and easy to get backwards |
312 | lookv := e.lookForward(k, e.getForward(D, k-1)+1) |
313 | lookh := e.lookForward(k, e.getForward(D, k+1)) |
314 | if lookv > lookh { |
315 | e.setForward(D+1, k, lookv) |
316 | } else { |
317 | e.setForward(D+1, k, lookh) |
318 | } |
319 | } |
320 | // just did a forward pass, so check |
321 | if got, ok := e.twoDone(D+1, D); ok { |
322 | return e.twolcs(D+1, D, got) |
323 | } |
324 | // do a backward pass, D to D+1 |
325 | e.setBackward(D+1, -(D + 1), e.getBackward(D, -D)-1) |
326 | e.setBackward(D+1, D+1, e.getBackward(D, D)) |
327 | for k := -D + 1; k <= D-1; k += 2 { |
328 | // these are tricky and easy to get wrong |
329 | lookv := e.lookBackward(k, e.getBackward(D, k-1)) |
330 | lookh := e.lookBackward(k, e.getBackward(D, k+1)-1) |
331 | if lookv < lookh { |
332 | e.setBackward(D+1, k, lookv) |
333 | } else { |
334 | e.setBackward(D+1, k, lookh) |
335 | } |
336 | } |
337 | } |
338 | |
339 | // D too large. combine a forward and backward partial lcs |
340 | // first, a forward one |
341 | kmax := -e.limit - 1 |
342 | diagmax := -1 |
343 | for k := -e.limit; k <= e.limit; k += 2 { |
344 | x := e.getForward(e.limit, k) |
345 | y := x - k |
346 | if x+y > diagmax && x <= e.ux && y <= e.uy { |
347 | diagmax, kmax = x+y, k |
348 | } |
349 | } |
350 | if kmax < -e.limit { |
351 | panic(fmt.Sprintf("no forward paths when limit=%d?", e.limit)) |
352 | } |
353 | lcs := e.forwardlcs(e.limit, kmax) |
354 | // now a backward one |
355 | // find the D path with minimal x+y inside the rectangle and |
356 | // use that to compute the lcs |
357 | diagmin := 1 << 25 // infinity |
358 | for k := -e.limit; k <= e.limit; k += 2 { |
359 | x := e.getBackward(e.limit, k) |
360 | y := x - (k + e.delta) |
361 | if x+y < diagmin && x >= 0 && y >= 0 { |
362 | diagmin, kmax = x+y, k |
363 | } |
364 | } |
365 | if kmax < -e.limit { |
366 | panic(fmt.Sprintf("no backward paths when limit=%d?", e.limit)) |
367 | } |
368 | lcs = append(lcs, e.backwardlcs(e.limit, kmax)...) |
369 | // These may overlap (e.forwardlcs and e.backwardlcs return sorted lcs) |
370 | ans := lcs.fix() |
371 | return ans |
372 | } |
373 | |
374 | // Does Myers' Lemma apply? |
375 | func (e *editGraph) twoDone(df, db int) (int, bool) { |
376 | if (df+db+e.delta)%2 != 0 { |
377 | return 0, false // diagonals cannot overlap |
378 | } |
379 | kmin := -db + e.delta |
380 | if -df > kmin { |
381 | kmin = -df |
382 | } |
383 | kmax := db + e.delta |
384 | if df < kmax { |
385 | kmax = df |
386 | } |
387 | for k := kmin; k <= kmax; k += 2 { |
388 | x := e.vf.get(df, k) |
389 | u := e.vb.get(db, k-e.delta) |
390 | if u <= x { |
391 | // is it worth looking at all the other k? |
392 | for l := k; l <= kmax; l += 2 { |
393 | x := e.vf.get(df, l) |
394 | y := x - l |
395 | u := e.vb.get(db, l-e.delta) |
396 | v := u - l |
397 | if x == u || u == 0 || v == 0 || y == e.uy || x == e.ux { |
398 | return l, true |
399 | } |
400 | } |
401 | return k, true |
402 | } |
403 | } |
404 | return 0, false |
405 | } |
406 | |
// twolcs assembles the lcs once the forward paths labelled df and the
// backward paths labelled db overlap on forward diagonal kf. twosided calls
// it with the diagonal that twoDone reported.
//
// (x,y) is the forward endpoint on kf and (u,v) the backward endpoint on
// the same diagonal. A series of special cases lets the result be stitched
// together from labels that already exist; only the last case has to run
// forward again on a smaller rectangle.
func (e *editGraph) twolcs(df, db, kf int) lcs {
	// db==df || db+1==df
	x := e.vf.get(df, kf)
	y := x - kf
	kb := kf - e.delta // kf renumbered as a backward diagonal
	u := e.vb.get(db, kb)
	v := u - kf

	// Myers proved there is a df-path from (0,0) to (u,v)
	// and a db-path from (x,y) to (N,M).
	// In the first case the overall path is the forward path
	// to (u,v) followed by the backward path to (N,M).
	// In the second case the path is the backward path to (x,y)
	// followed by the forward path to (x,y) from (0,0).

	// Look for some special cases to avoid computing either of these paths.
	if x == u {
		// "babaab" "cccaba"
		// already patched together
		lcs := e.forwardlcs(df, kf)
		lcs = append(lcs, e.backwardlcs(db, kb)...)
		return lcs.sort()
	}

	// is (u-1,v) or (u,v-1) labelled df-1?
	// if so, that forward df-1-path plus a horizontal or vertical edge
	// is the df-path to (u,v), then plus the db-path to (N,M)
	if u > 0 && ok(df-1, u-1-v) && e.vf.get(df-1, u-1-v) == u-1 {
		// "aabbab" "cbcabc"
		lcs := e.forwardlcs(df-1, u-1-v)
		lcs = append(lcs, e.backwardlcs(db, kb)...)
		return lcs.sort()
	}
	if v > 0 && ok(df-1, (u-(v-1))) && e.vf.get(df-1, u-(v-1)) == u {
		// "abaabb" "bcacab"
		lcs := e.forwardlcs(df-1, u-(v-1))
		lcs = append(lcs, e.backwardlcs(db, kb)...)
		return lcs.sort()
	}

	// The path can't possibly contribute to the lcs because it
	// is all horizontal or vertical edges
	if u == 0 || v == 0 || x == e.ux || y == e.uy {
		// "abaabb" "abaaaa"
		if u == 0 || v == 0 {
			// The backward endpoint is on the lower or left edge.
			return e.backwardlcs(db, kb)
		}
		// Otherwise the forward endpoint is on the upper or right edge.
		return e.forwardlcs(df, kf)
	}

	// is (x+1,y) or (x,y+1) labelled db-1?
	if x+1 <= e.ux && ok(db-1, x+1-y-e.delta) && e.vb.get(db-1, x+1-y-e.delta) == x+1 {
		// "bababb" "baaabb"
		lcs := e.backwardlcs(db-1, kb+1)
		lcs = append(lcs, e.forwardlcs(df, kf)...)
		return lcs.sort()
	}
	if y+1 <= e.uy && ok(db-1, x-(y+1)-e.delta) && e.vb.get(db-1, x-(y+1)-e.delta) == x {
		// "abbbaa" "cabacc"
		lcs := e.backwardlcs(db-1, kb-1)
		lcs = append(lcs, e.forwardlcs(df, kf)...)
		return lcs.sort()
	}

	// need to compute another path
	// "aabbaa" "aacaba"
	// Take the backward part first, while e.ux and e.uy still describe the
	// full rectangle, because backwardlcs stops at (e.ux, e.uy).
	lcs := e.backwardlcs(db, kb)
	// Temporarily shrink the rectangle so that its upper corner is (u,v),
	// rerun the forward search inside it, then restore the corner.
	oldx, oldy := e.ux, e.uy
	e.ux = u
	e.uy = v
	lcs = append(lcs, forward(e)...)
	e.ux, e.uy = oldx, oldy
	return lcs.sort()
}
481 |
Members