source: other-projects/diffcol/trunk/diffcol/Algorithm/Diff.pm@ 21711

Last change on this file since 21711 was 21711, checked in by oranfry, 14 years ago

bringing across the diffcol project

File size: 24.2 KB
Line 
1package Algorithm::Diff;
2use strict;
3use vars qw($VERSION @EXPORT_OK @ISA @EXPORT);
4use integer; # see below in _replaceNextLargerWith() for mod to make
5 # if you don't use this
6require Exporter;
7@ISA = qw(Exporter);
8@EXPORT = qw();
9@EXPORT_OK = qw(LCS diff traverse_sequences traverse_balanced sdiff);
10$VERSION = sprintf('%d.%02d', (q$Revision: 1.15 $ =~ /\d+/g));
11
12# McIlroy-Hunt diff algorithm
13# Adapted from the Smalltalk code of Mario I. Wolczko, <[email protected]>
14# by Ned Konz, [email protected]
15
16=head1 NAME
17
18Algorithm::Diff - Compute `intelligent' differences between two files / lists
19
20=head1 SYNOPSIS
21
22 use Algorithm::Diff qw(diff sdiff LCS traverse_sequences
23 traverse_balanced);
24
25 @lcs = LCS( \@seq1, \@seq2 );
26
27 @lcs = LCS( \@seq1, \@seq2, $key_generation_function );
28
29 $lcsref = LCS( \@seq1, \@seq2 );
30
31 $lcsref = LCS( \@seq1, \@seq2, $key_generation_function );
32
33 @diffs = diff( \@seq1, \@seq2 );
34
35 @diffs = diff( \@seq1, \@seq2, $key_generation_function );
36
37 @sdiffs = sdiff( \@seq1, \@seq2 );
38
39 @sdiffs = sdiff( \@seq1, \@seq2, $key_generation_function );
40
41 traverse_sequences( \@seq1, \@seq2,
42 { MATCH => $callback,
43 DISCARD_A => $callback,
44 DISCARD_B => $callback,
45 } );
46
47 traverse_sequences( \@seq1, \@seq2,
48 { MATCH => $callback,
49 DISCARD_A => $callback,
50 DISCARD_B => $callback,
51 },
52 $key_generation_function );
53
54 traverse_balanced( \@seq1, \@seq2,
55 { MATCH => $callback,
56 DISCARD_A => $callback,
57 DISCARD_B => $callback,
58 CHANGE => $callback,
59 } );
60
61=head1 INTRODUCTION
62
63(by Mark-Jason Dominus)
64
65I once read an article written by the authors of C<diff>; they said
66that they hard worked very hard on the algorithm until they found the
67right one.
68
69I think what they ended up using (and I hope someone will correct me,
70because I am not very confident about this) was the `longest common
71subsequence' method. in the LCS problem, you have two sequences of
72items:
73
74 a b c d f g h j q z
75
76 a b c d e f g i j k r x y z
77
78and you want to find the longest sequence of items that is present in
79both original sequences in the same order. That is, you want to find
80a new sequence I<S> which can be obtained from the first sequence by
81deleting some items, and from the secend sequence by deleting other
82items. You also want I<S> to be as long as possible. In this case
83I<S> is
84
85 a b c d f g j z
86
87From there it's only a small step to get diff-like output:
88
89 e h i k q r x y
90 + - + + - + + +
91
92This module solves the LCS problem. It also includes a canned
93function to generate C<diff>-like output.
94
95It might seem from the example above that the LCS of two sequences is
96always pretty obvious, but that's not always the case, especially when
97the two sequences have many repeated elements. For example, consider
98
99 a x b y c z p d q
100 a b c a x b y c z
101
102A naive approach might start by matching up the C<a> and C<b> that
103appear at the beginning of each sequence, like this:
104
105 a x b y c z p d q
106 a b c a b y c z
107
108This finds the common subsequence C<a b c z>. But actually, the LCS
109is C<a x b y c z>:
110
111 a x b y c z p d q
112 a b c a x b y c z
113
114=head1 USAGE
115
116This module provides three exportable functions, which we'll deal with in
117ascending order of difficulty: C<LCS>,
118C<diff>, C<sdiff>, C<traverse_sequences>, and C<traverse_balanced>.
119
120=head2 C<LCS>
121
122Given references to two lists of items, LCS returns an array containing their
123longest common subsequence. In scalar context, it returns a reference to
124such a list.
125
126 @lcs = LCS( \@seq1, \@seq2 );
127 $lcsref = LCS( \@seq1, \@seq2 );
128
129C<LCS> may be passed an optional third parameter; this is a CODE
130reference to a key generation function. See L</KEY GENERATION
131FUNCTIONS>.
132
133 @lcs = LCS( \@seq1, \@seq2, $keyGen );
134 $lcsref = LCS( \@seq1, \@seq2, $keyGen );
135
136Additional parameters, if any, will be passed to the key generation
137routine.
138
139=head2 C<diff>
140
141 @diffs = diff( \@seq1, \@seq2 );
142 $diffs_ref = diff( \@seq1, \@seq2 );
143
144C<diff> computes the smallest set of additions and deletions necessary
145to turn the first sequence into the second, and returns a description
146of these changes. The description is a list of I<hunks>; each hunk
147represents a contiguous section of items which should be added,
148deleted, or replaced. The return value of C<diff> is a list of
149hunks, or, in scalar context, a reference to such a list.
150
151Here is an example: The diff of the following two sequences:
152
153 a b c e h j l m n p
154 b c d e f j k l m r s t
155
156Result:
157
158 [
159 [ [ '-', 0, 'a' ] ],
160
161 [ [ '+', 2, 'd' ] ],
162
163 [ [ '-', 4, 'h' ] ,
164 [ '+', 4, 'f' ] ],
165
166 [ [ '+', 6, 'k' ] ],
167
168 [ [ '-', 8, 'n' ],
169 [ '-', 9, 'p' ],
170 [ '+', 9, 'r' ],
171 [ '+', 10, 's' ],
172 [ '+', 11, 't' ],
173 ]
174 ]
175
176There are five hunks here. The first hunk says that the C<a> at
177position 0 of the first sequence should be deleted (C<->). The second
178hunk says that the C<d> at position 2 of the second sequence should
179be inserted (C<+>). The third hunk says that the C<h> at position 4
180of the first sequence should be removed and replaced with the C<f>
181from position 4 of the second sequence. The other two hunks similarly.
182
183C<diff> may be passed an optional third parameter; this is a CODE
184reference to a key generation function. See L</KEY GENERATION
185FUNCTIONS>.
186
187Additional parameters, if any, will be passed to the key generation
188routine.
189
190=head2 C<sdiff>
191
192 @sdiffs = sdiff( \@seq1, \@seq2 );
193 $sdiffs_ref = sdiff( \@seq1, \@seq2 );
194
195C<sdiff> computes all necessary components to show two sequences
196and their minimized differences side by side, just like the
197Unix-utility I<sdiff> does:
198
199 same same
200 before | after
201 old < -
202 - > new
203
204It returns a list of array refs, each pointing to an array of
205display instructions. In scalar context it returns a reference
206to such a list.
207
208Display instructions consist of three elements: A modifier indicator
209(C<+>: Element added, C<->: Element removed, C<u>: Element unmodified,
210C<c>: Element changed) and the value of the old and new elements, to
211be displayed side by side.
212
213An C<sdiff> of the following two sequences:
214
215 a b c e h j l m n p
216 b c d e f j k l m r s t
217
218results in
219
220[ [ '-', 'a', '' ],
221 [ 'u', 'b', 'b' ],
222 [ 'u', 'c', 'c' ],
223 [ '+', '', 'd' ],
224 [ 'u', 'e', 'e' ],
225 [ 'c', 'h', 'f' ],
226 [ 'u', 'j', 'j' ],
227 [ '+', '', 'k' ],
228 [ 'u', 'l', 'l' ],
229 [ 'u', 'm', 'm' ],
230 [ 'c', 'n', 'r' ],
231 [ 'c', 'p', 's' ],
232 [ '+', '', 't' ] ]
233
234C<sdiff> may be passed an optional third parameter; this is a CODE
235reference to a key generation function. See L</KEY GENERATION
236FUNCTIONS>.
237
238Additional parameters, if any, will be passed to the key generation
239routine.
240
241=head2 C<traverse_sequences>
242
243C<traverse_sequences> is the most general facility provided by this
244module; C<diff> and C<LCS> are implemented as calls to it.
245
246Imagine that there are two arrows. Arrow A points to an element of sequence A,
247and arrow B points to an element of the sequence B. Initially, the arrows
248point to the first elements of the respective sequences. C<traverse_sequences>
249will advance the arrows through the sequences one element at a time, calling an
250appropriate user-specified callback function before each advance. It
251willadvance the arrows in such a way that if there are equal elements C<$A[$i]>
252and C<$B[$j]> which are equal and which are part of the LCS, there will be
253some moment during the execution of C<traverse_sequences> when arrow A is
254pointing to C<$A[$i]> and arrow B is pointing to C<$B[$j]>. When this happens,
255C<traverse_sequences> will call the C<MATCH> callback function and then it will
256advance both arrows.
257
258Otherwise, one of the arrows is pointing to an element of its sequence that is
259not part of the LCS. C<traverse_sequences> will advance that arrow and will
260call the C<DISCARD_A> or the C<DISCARD_B> callback, depending on which arrow it
261advanced. If both arrows point to elements that are not part of the LCS, then
262C<traverse_sequences> will advance one of them and call the appropriate
263callback, but it is not specified which it will call.
264
265The arguments to C<traverse_sequences> are the two sequences to traverse, and a
266hash which specifies the callback functions, like this:
267
268 traverse_sequences( \@seq1, \@seq2,
269 { MATCH => $callback_1,
270 DISCARD_A => $callback_2,
271 DISCARD_B => $callback_3,
272 } );
273
274Callbacks for MATCH, DISCARD_A, and DISCARD_B are invoked with at least the
275indices of the two arrows as their arguments. They are not expected to return
276any values. If a callback is omitted from the table, it is not called.
277
278Callbacks for A_FINISHED and B_FINISHED are invoked with at least the
279corresponding index in A or B.
280
281If arrow A reaches the end of its sequence, before arrow B does,
282C<traverse_sequences> will call the C<A_FINISHED> callback when it advances
283arrow B, if there is such a function; if not it will call C<DISCARD_B> instead.
284Similarly if arrow B finishes first. C<traverse_sequences> returns when both
285arrows are at the ends of their respective sequences. It returns true on
286success and false on failure. At present there is no way to fail.
287
288C<traverse_sequences> may be passed an optional fourth parameter; this is a
289CODE reference to a key generation function. See L</KEY GENERATION FUNCTIONS>.
290
291Additional parameters, if any, will be passed to the key generation function.
292
293=head2 C<traverse_balanced>
294
295C<traverse_balanced> is an alternative to C<traverse_sequences>. It
296uses a different algorithm to iterate through the entries in the
297computed LCS. Instead of sticking to one side and showing element changes
298as insertions and deletions only, it will jump back and forth between
299the two sequences and report I<changes> occurring as deletions on one
300side followed immediatly by an insertion on the other side.
301
302In addition to the
303C<DISCARD_A>,
304C<DISCARD_B>, and
305C<MATCH>
306callbacks supported by C<traverse_sequences>, C<traverse_balanced> supports
307a C<CHANGE> callback indicating that one element got C<replaced> by another:
308
309 traverse_sequences( \@seq1, \@seq2,
310 { MATCH => $callback_1,
311 DISCARD_A => $callback_2,
312 DISCARD_B => $callback_3,
313 CHANGE => $callback_4,
314 } );
315
316If no C<CHANGE> callback is specified, C<traverse_balanced>
317will map C<CHANGE> events to C<DISCARD_A> and C<DISCARD_B> actions,
318therefore resulting in a similar behaviour as C<traverse_sequences>
319with different order of events.
320
321C<traverse_balanced> might be a bit slower than C<traverse_sequences>,
322noticable only while processing huge amounts of data.
323
324The C<sdiff> function of this module
325is implemented as call to C<traverse_balanced>.
326
327=head1 KEY GENERATION FUNCTIONS
328
329C<diff>, C<LCS>, and C<traverse_sequences> accept an optional last parameter.
330This is a CODE reference to a key generating (hashing) function that should
331return a string that uniquely identifies a given element. It should be the
332case that if two elements are to be considered equal, their keys should be the
333same (and the other way around). If no key generation function is provided,
334the key will be the element as a string.
335
336By default, comparisons will use "eq" and elements will be turned into keys
337using the default stringizing operator '""'.
338
339Where this is important is when you're comparing something other than strings.
340If it is the case that you have multiple different objects that should be
341considered to be equal, you should supply a key generation function. Otherwise,
342you have to make sure that your arrays contain unique references.
343
344For instance, consider this example:
345
346 package Person;
347
348 sub new
349 {
350 my $package = shift;
351 return bless { name => '', ssn => '', @_ }, $package;
352 }
353
354 sub clone
355 {
356 my $old = shift;
357 my $new = bless { %$old }, ref($old);
358 }
359
360 sub hash
361 {
362 return shift()->{'ssn'};
363 }
364
365 my $person1 = Person->new( name => 'Joe', ssn => '123-45-6789' );
366 my $person2 = Person->new( name => 'Mary', ssn => '123-47-0000' );
367 my $person3 = Person->new( name => 'Pete', ssn => '999-45-2222' );
368 my $person4 = Person->new( name => 'Peggy', ssn => '123-45-9999' );
369 my $person5 = Person->new( name => 'Frank', ssn => '000-45-9999' );
370
371If you did this:
372
373 my $array1 = [ $person1, $person2, $person4 ];
374 my $array2 = [ $person1, $person3, $person4, $person5 ];
375 Algorithm::Diff::diff( $array1, $array2 );
376
377everything would work out OK (each of the objects would be converted
378into a string like "Person=HASH(0x82425b0)" for comparison).
379
380But if you did this:
381
382 my $array1 = [ $person1, $person2, $person4 ];
383 my $array2 = [ $person1, $person3, $person4->clone(), $person5 ];
384 Algorithm::Diff::diff( $array1, $array2 );
385
386$person4 and $person4->clone() (which have the same name and SSN)
387would be seen as different objects. If you wanted them to be considered
388equivalent, you would have to pass in a key generation function:
389
390 my $array1 = [ $person1, $person2, $person4 ];
391 my $array2 = [ $person1, $person3, $person4->clone(), $person5 ];
392 Algorithm::Diff::diff( $array1, $array2, \&Person::hash );
393
394This would use the 'ssn' field in each Person as a comparison key, and
395so would consider $person4 and $person4->clone() as equal.
396
397You may also pass additional parameters to the key generation function
398if you wish.
399
400=head1 AUTHOR
401
402This version by Ned Konz, [email protected]
403
404=head1 LICENSE
405
406Copyright (c) 2000-2002 Ned Konz. All rights reserved.
407This program is free software;
408you can redistribute it and/or modify it under the same terms
409as Perl itself.
410
411=head1 CREDITS
412
413Versions through 0.59 (and much of this documentation) were written by:
414
415Mark-Jason Dominus, [email protected]
416
417This version borrows the documentation and names of the routines
418from Mark-Jason's, but has all new code in Diff.pm.
419
420This code was adapted from the Smalltalk code of
421Mario Wolczko <[email protected]>, which is available at
422ftp://st.cs.uiuc.edu/pub/Smalltalk/MANCHESTER/manchester/4.0/diff.st
423
424C<sdiff> and C<traverse_balanced> were written by Mike Schilli
425<[email protected]>.
426
427The algorithm is that described in
428I<A Fast Algorithm for Computing Longest Common Subsequences>,
429CACM, vol.20, no.5, pp.350-353, May 1977, with a few
430minor improvements to improve the speed.
431
432=cut
433
434# Create a hash that maps each element of $aCollection to the set of positions
435# it occupies in $aCollection, restricted to the elements within the range of
436# indexes specified by $start and $end.
437# The fourth parameter is a subroutine reference that will be called to
438# generate a string to use as a key.
439# Additional parameters, if any, will be passed to this subroutine.
440#
441# my $hashRef = _withPositionsOfInInterval( \@array, $start, $end, $keyGen );
442
443sub _withPositionsOfInInterval
444{
445 my $aCollection = shift; # array ref
446 my $start = shift;
447 my $end = shift;
448 my $keyGen = shift;
449 my %d;
450 my $index;
451 for ( $index = $start ; $index <= $end ; $index++ )
452 {
453 my $element = $aCollection->[$index];
454 my $key = &$keyGen( $element, @_ );
455 if ( exists( $d{$key} ) )
456 {
457 unshift ( @{ $d{$key} }, $index );
458 }
459 else
460 {
461 $d{$key} = [$index];
462 }
463 }
464 return wantarray ? %d : \%d;
465}
466
467# Find the place at which aValue would normally be inserted into the array. If
468# that place is already occupied by aValue, do nothing, and return undef. If
469# the place does not exist (i.e., it is off the end of the array), add it to
470# the end, otherwise replace the element at that point with aValue.
471# It is assumed that the array's values are numeric.
472# This is where the bulk (75%) of the time is spent in this module, so try to
473# make it fast!
474
475sub _replaceNextLargerWith
476{
477 my ( $array, $aValue, $high ) = @_;
478 $high ||= $#$array;
479
480 # off the end?
481 if ( $high == -1 || $aValue > $array->[-1] )
482 {
483 push ( @$array, $aValue );
484 return $high + 1;
485 }
486
487 # binary search for insertion point...
488 my $low = 0;
489 my $index;
490 my $found;
491 while ( $low <= $high )
492 {
493 $index = ( $high + $low ) / 2;
494
495 # $index = int(( $high + $low ) / 2); # without 'use integer'
496 $found = $array->[$index];
497
498 if ( $aValue == $found )
499 {
500 return undef;
501 }
502 elsif ( $aValue > $found )
503 {
504 $low = $index + 1;
505 }
506 else
507 {
508 $high = $index - 1;
509 }
510 }
511
512 # now insertion point is in $low.
513 $array->[$low] = $aValue; # overwrite next larger
514 return $low;
515}
516
517# This method computes the longest common subsequence in $a and $b.
518
519# Result is array or ref, whose contents is such that
520# $a->[ $i ] == $b->[ $result[ $i ] ]
521# foreach $i in ( 0 .. $#result ) if $result[ $i ] is defined.
522
523# An additional argument may be passed; this is a hash or key generating
524# function that should return a string that uniquely identifies the given
525# element. It should be the case that if the key is the same, the elements
526# will compare the same. If this parameter is undef or missing, the key
527# will be the element as a string.
528
529# By default, comparisons will use "eq" and elements will be turned into keys
530# using the default stringizing operator '""'.
531
532# Additional parameters, if any, will be passed to the key generation routine.
533
534sub _longestCommonSubsequence
535{
536 my $a = shift; # array ref
537 my $b = shift; # array ref
538 my $keyGen = shift; # code ref
539 my $compare; # code ref
540
541 # set up code refs
542 # Note that these are optimized.
543 if ( !defined($keyGen) ) # optimize for strings
544 {
545 $keyGen = sub { $_[0] };
546 $compare = sub { my ( $a, $b ) = @_; $a eq $b };
547 }
548 else
549 {
550 $compare = sub {
551 my $a = shift;
552 my $b = shift;
553 &$keyGen( $a, @_ ) eq &$keyGen( $b, @_ );
554 };
555 }
556
557 my ( $aStart, $aFinish, $bStart, $bFinish, $matchVector ) =
558 ( 0, $#$a, 0, $#$b, [] );
559
560 # First we prune off any common elements at the beginning
561 while ( $aStart <= $aFinish
562 and $bStart <= $bFinish
563 and &$compare( $a->[$aStart], $b->[$bStart], @_ ) )
564 {
565 $matchVector->[ $aStart++ ] = $bStart++;
566 }
567
568 # now the end
569 while ( $aStart <= $aFinish
570 and $bStart <= $bFinish
571 and &$compare( $a->[$aFinish], $b->[$bFinish], @_ ) )
572 {
573 $matchVector->[ $aFinish-- ] = $bFinish--;
574 }
575
576 # Now compute the equivalence classes of positions of elements
577 my $bMatches =
578 _withPositionsOfInInterval( $b, $bStart, $bFinish, $keyGen, @_ );
579 my $thresh = [];
580 my $links = [];
581
582 my ( $i, $ai, $j, $k );
583 for ( $i = $aStart ; $i <= $aFinish ; $i++ )
584 {
585 $ai = &$keyGen( $a->[$i], @_ );
586 if ( exists( $bMatches->{$ai} ) )
587 {
588 $k = 0;
589 for $j ( @{ $bMatches->{$ai} } )
590 {
591
592 # optimization: most of the time this will be true
593 if ( $k and $thresh->[$k] > $j and $thresh->[ $k - 1 ] < $j )
594 {
595 $thresh->[$k] = $j;
596 }
597 else
598 {
599 $k = _replaceNextLargerWith( $thresh, $j, $k );
600 }
601
602 # oddly, it's faster to always test this (CPU cache?).
603 if ( defined($k) )
604 {
605 $links->[$k] =
606 [ ( $k ? $links->[ $k - 1 ] : undef ), $i, $j ];
607 }
608 }
609 }
610 }
611
612 if (@$thresh)
613 {
614 for ( my $link = $links->[$#$thresh] ; $link ; $link = $link->[0] )
615 {
616 $matchVector->[ $link->[1] ] = $link->[2];
617 }
618 }
619
620 return wantarray ? @$matchVector : $matchVector;
621}
622
623sub traverse_sequences
624{
625 my $a = shift; # array ref
626 my $b = shift; # array ref
627 my $callbacks = shift || {};
628 my $keyGen = shift;
629 my $matchCallback = $callbacks->{'MATCH'} || sub { };
630 my $discardACallback = $callbacks->{'DISCARD_A'} || sub { };
631 my $finishedACallback = $callbacks->{'A_FINISHED'};
632 my $discardBCallback = $callbacks->{'DISCARD_B'} || sub { };
633 my $finishedBCallback = $callbacks->{'B_FINISHED'};
634 my $matchVector = _longestCommonSubsequence( $a, $b, $keyGen, @_ );
635
636 # Process all the lines in @$matchVector
637 my $lastA = $#$a;
638 my $lastB = $#$b;
639 my $bi = 0;
640 my $ai;
641
642 for ( $ai = 0 ; $ai <= $#$matchVector ; $ai++ )
643 {
644 my $bLine = $matchVector->[$ai];
645 if ( defined($bLine) ) # matched
646 {
647 &$discardBCallback( $ai, $bi++, @_ ) while $bi < $bLine;
648 &$matchCallback( $ai, $bi++, @_ );
649 }
650 else
651 {
652 &$discardACallback( $ai, $bi, @_ );
653 }
654 }
655
656 # The last entry (if any) processed was a match.
657 # $ai and $bi point just past the last matching lines in their sequences.
658
659 while ( $ai <= $lastA or $bi <= $lastB )
660 {
661
662 # last A?
663 if ( $ai == $lastA + 1 and $bi <= $lastB )
664 {
665 if ( defined($finishedACallback) )
666 {
667 &$finishedACallback( $lastA, @_ );
668 $finishedACallback = undef;
669 }
670 else
671 {
672 &$discardBCallback( $ai, $bi++, @_ ) while $bi <= $lastB;
673 }
674 }
675
676 # last B?
677 if ( $bi == $lastB + 1 and $ai <= $lastA )
678 {
679 if ( defined($finishedBCallback) )
680 {
681 &$finishedBCallback( $lastB, @_ );
682 $finishedBCallback = undef;
683 }
684 else
685 {
686 &$discardACallback( $ai++, $bi, @_ ) while $ai <= $lastA;
687 }
688 }
689
690 &$discardACallback( $ai++, $bi, @_ ) if $ai <= $lastA;
691 &$discardBCallback( $ai, $bi++, @_ ) if $bi <= $lastB;
692 }
693
694 return 1;
695}
696
697sub traverse_balanced
698{
699 my $a = shift; # array ref
700 my $b = shift; # array ref
701 my $callbacks = shift || {};
702 my $keyGen = shift;
703 my $matchCallback = $callbacks->{'MATCH'} || sub { };
704 my $discardACallback = $callbacks->{'DISCARD_A'} || sub { };
705 my $discardBCallback = $callbacks->{'DISCARD_B'} || sub { };
706 my $changeCallback = $callbacks->{'CHANGE'};
707 my $matchVector = _longestCommonSubsequence( $a, $b, $keyGen, @_ );
708
709 # Process all the lines in match vector
710 my $lastA = $#$a;
711 my $lastB = $#$b;
712 my $bi = 0;
713 my $ai = 0;
714 my $ma = -1;
715 my $mb;
716
717 while (1)
718 {
719
720 # Find next match indices $ma and $mb
721 do { $ma++ } while ( $ma <= $#$matchVector && !defined $matchVector->[$ma] );
722
723 last if $ma > $#$matchVector; # end of matchVector?
724 $mb = $matchVector->[$ma];
725
726 # Proceed with discard a/b or change events until
727 # next match
728 while ( $ai < $ma || $bi < $mb )
729 {
730
731 if ( $ai < $ma && $bi < $mb )
732 {
733
734 # Change
735 if ( defined $changeCallback )
736 {
737 &$changeCallback( $ai++, $bi++, @_ );
738 }
739 else
740 {
741 &$discardACallback( $ai++, $bi, @_ );
742 &$discardBCallback( $ai, $bi++, @_ );
743 }
744 }
745 elsif ( $ai < $ma )
746 {
747 &$discardACallback( $ai++, $bi, @_ );
748 }
749 else
750 {
751
752 # $bi < $mb
753 &$discardBCallback( $ai, $bi++, @_ );
754 }
755 }
756
757 # Match
758 &$matchCallback( $ai++, $bi++, @_ );
759 }
760
761 while ( $ai <= $lastA || $bi <= $lastB )
762 {
763 if ( $ai <= $lastA && $bi <= $lastB )
764 {
765
766 # Change
767 if ( defined $changeCallback )
768 {
769 &$changeCallback( $ai++, $bi++, @_ );
770 }
771 else
772 {
773 &$discardACallback( $ai++, $bi, @_ );
774 &$discardBCallback( $ai, $bi++, @_ );
775 }
776 }
777 elsif ( $ai <= $lastA )
778 {
779 &$discardACallback( $ai++, $bi, @_ );
780 }
781 else
782 {
783
784 # $bi <= $lastB
785 &$discardBCallback( $ai, $bi++, @_ );
786 }
787 }
788
789 return 1;
790}
791
792sub LCS
793{
794 my $a = shift; # array ref
795 my $matchVector = _longestCommonSubsequence( $a, @_ );
796 my @retval;
797 my $i;
798 for ( $i = 0 ; $i <= $#$matchVector ; $i++ )
799 {
800 if ( defined( $matchVector->[$i] ) )
801 {
802 push ( @retval, $a->[$i] );
803 }
804 }
805 return wantarray ? @retval : \@retval;
806}
807
808sub diff
809{
810 my $a = shift; # array ref
811 my $b = shift; # array ref
812 my $retval = [];
813 my $hunk = [];
814 my $discard = sub { push ( @$hunk, [ '-', $_[0], $a->[ $_[0] ] ] ) };
815 my $add = sub { push ( @$hunk, [ '+', $_[1], $b->[ $_[1] ] ] ) };
816 my $match = sub { push ( @$retval, $hunk ) if scalar(@$hunk); $hunk = [] };
817 traverse_sequences( $a, $b,
818 { MATCH => $match, DISCARD_A => $discard, DISCARD_B => $add }, @_ );
819 &$match();
820 return wantarray ? @$retval : $retval;
821}
822
823sub sdiff
824{
825 my $a = shift; # array ref
826 my $b = shift; # array ref
827 my $retval = [];
828 my $discard = sub { push ( @$retval, [ '-', $a->[ $_[0] ], "" ] ) };
829 my $add = sub { push ( @$retval, [ '+', "", $b->[ $_[1] ] ] ) };
830 my $change = sub {
831 push ( @$retval, [ 'c', $a->[ $_[0] ], $b->[ $_[1] ] ] );
832 };
833 my $match = sub {
834 push ( @$retval, [ 'u', $a->[ $_[0] ], $b->[ $_[1] ] ] );
835 };
836 traverse_balanced(
837 $a,
838 $b,
839 {
840 MATCH => $match,
841 DISCARD_A => $discard,
842 DISCARD_B => $add,
843 CHANGE => $change,
844 },
845 @_
846 );
847 return wantarray ? @$retval : $retval;
848}
849
8501;
Note: See TracBrowser for help on using the repository browser.