source: other-projects/diffcol/trunk/diffcol/Text/Diff.pm@ 21711

Last change on this file since 21711 was 21711, checked in by oranfry, 14 years ago

bringing across the diffcol project

File size: 20.7 KB
Line 
1package Text::Diff;
2
3$VERSION = 0.35;
4
5=head1 NAME
6
7Text::Diff - Perform diffs on files and record sets
8
9=head1 SYNOPSIS
10
11 use Text::Diff;
12
13 ## Mix and match filenames, strings, file handles, producer subs,
14 ## or arrays of records; returns diff in a string.
15 ## WARNING: can return B<large> diffs for large files.
16 my $diff = diff "file1.txt", "file2.txt", { STYLE => "Context" };
17 my $diff = diff \$string1, \$string2, \%options;
18 my $diff = diff \*FH1, \*FH2;
19 my $diff = diff \&reader1, \&reader2;
20 my $diff = diff \@records1, \@records2;
21
22 ## May also mix input types:
23 my $diff = diff \@records1, "file_B.txt";
24
25=head1 DESCRIPTION
26
27C<diff()> provides a basic set of services akin to the GNU C<diff> utility. It
28is not anywhere near as feature complete as GNU C<diff>, but it is better
29integrated with Perl and available on all platforms. It is often faster than
30shelling out to a system's C<diff> executable for small files, and generally
31slower on larger files.
32
33Relies on L<Algorithm::Diff> for, well, the algorithm. This may not produce
34the same exact diff as a system's local C<diff> executable, but it will be a
35valid diff and comprehensible by C<patch>. We haven't seen any differences
36between Algorithm::Diff's logic and GNU diff's, but we have not examined them
37to make sure they are indeed identical.
38
39B<Note>: If you don't want to import the C<diff> function, do one of the
40following:
41
42 use Text::Diff ();
43
44 require Text::Diff;
45
46That's a pretty rare occurence, so C<diff()> is exported by default.
47
48=cut
49
50use Exporter;
51@ISA = qw( Exporter );
52@EXPORT = qw( diff );
53
54use strict;
55use Carp;
56use Algorithm::Diff qw( traverse_sequences );
57
58## Hunks are made of ops. An op is the starting index for each
59## sequence and the opcode:
60use constant A => 0; # Array index before match/discard
61use constant B => 1;
62use constant OPCODE => 2; # "-", " ", "+"
63use constant FLAG => 3; # What to display if not OPCODE "!"
64
65
66=head1 OPTIONS
67
68diff() takes two parameters from which to draw input and a set of
69options to control it's output. The options are:
70
71=over
72
73=item FILENAME_A, MTIME_A, FILENAME_B, MTIME_B
74
75The name of the file and the modification time "files"
76
77These are filled in automatically for each file when diff() is passed a
78filename, unless a defined value is passed in.
79
80If a filename is not passed in and FILENAME_A and FILENAME_B are not provided
81or C<undef>, the header will not be printed.
82
83Unused on C<OldStyle> diffs.
84
85=item OFFSET_A, OFFSET_B
86
87The index of the first line / element. These default to 1 for all
88parameter types except ARRAY references, for which the default is 0. This
89is because ARRAY references are presumed to be data structures, while the
90others are line oriented text.
91
92=item STYLE
93
94"Unified", "Context", "OldStyle", or an object or class reference for a class
95providing C<file_header()>, C<hunk_header()>, C<hunk()>, C<hunk_footer()> and
96C<file_footer()> methods. The two footer() methods are provided for
97overloading only; none of the formats provide them.
98
99Defaults to "Unified" (unlike standard C<diff>, but Unified is what's most
100often used in submitting patches and is the most human readable of the three.
101
102If the package indicated by the STYLE has no hunk() method, c<diff()> will
103load it automatically (lazy loading). Since all such packages should inherit
104from Text::Diff::Base, this should be marvy.
105
106Styles may be specified as class names (C<STYLE => "Foo"), in which case they
107will be C<new()>ed with no parameters, or as objects (C<STYLE => Foo->new>).
108
109=item CONTEXT
110
111How many lines before and after each diff to display. Ignored on old-style
112diffs. Defaults to 3.
113
114=item OUTPUT
115
116Examples and their equivalent subroutines:
117
118 OUTPUT => \*FOOHANDLE, # like: sub { print FOOHANDLE shift() }
119 OUTPUT => \$output, # like: sub { $output .= shift }
120 OUTPUT => \@output, # like: sub { push @output, shift }
121 OUTPUT => sub { $output .= shift },
122
123If no C<OUTPUT> is supplied, returns the diffs in a string. If
124C<OUTPUT> is a C<CODE> ref, it will be called once with the (optional)
125file header, and once for each hunk body with the text to emit. If
126C<OUTPUT> is an L<IO::Handle>, output will be emitted to that handle.
127
128=item FILENAME_PREFIX_A, FILENAME_PREFIX_B
129
130The string to print before the filename in the header. Unused on C<OldStyle>
131diffs. Defaults are C<"---">, C<"+++"> for Unified and C<"***">, C<"+++"> for
132Context.
133
134=item KEYGEN, KEYGEN_ARGS
135
136These are passed to L<Algorithm::Diff/traverse_sequences>.
137
138=back
139
140B<Note>: if neither C<FILENAME_> option is defined, the header will not be
141printed. If at one is present, the other and both MTIME_ options must be
142present or "Use of undefined variable" warnings will be generated (except
143on C<OldStyle> diffs, which ignores these options).
144
145=cut
146
147my %internal_styles = (
148 Unified => undef,
149 Context => undef,
150 OldStyle => undef,
151 Table => undef, ## "internal", but in another module
152);
153
154sub diff {
155 my @seqs = ( shift, shift );
156 my $options = shift || {};
157
158 for my $i ( 0..1 ) {
159 my $seq = $seqs[$i];
160 my $type = ref $seq;
161
162 while ( $type eq "CODE" ) {
163 $seqs[$i] = $seq = $seq->( $options );
164 $type = ref $seq;
165 }
166
167 my $AorB = !$i ? "A" : "B";
168
169 if ( $type eq "ARRAY" ) {
170 ## This is most efficient :)
171 $options->{"OFFSET_$AorB"} = 0
172 unless defined $options->{"OFFSET_$AorB"};
173 }
174 elsif ( $type eq "SCALAR" ) {
175 $seqs[$i] = [split( /^/m, $$seq )];
176 $options->{"OFFSET_$AorB"} = 1
177 unless defined $options->{"OFFSET_$AorB"};
178 }
179 elsif ( ! $type ) {
180 $options->{"OFFSET_$AorB"} = 1
181 unless defined $options->{"OFFSET_$AorB"};
182 $options->{"FILENAME_$AorB"} = $seq
183 unless defined $options->{"FILENAME_$AorB"};
184 $options->{"MTIME_$AorB"} = (stat($seq))[9]
185 unless defined $options->{"MTIME_$AorB"};
186
187 local $/ = "\n";
188 open F, "<$seq" or carp "$!: $seq";
189 $seqs[$i] = [<F>];
190 close F;
191
192 }
193 elsif ( $type eq "GLOB" || UNIVERSAL::isa( $seq, "IO::Handle" ) ) {
194 $options->{"OFFSET_$AorB"} = 1
195 unless defined $options->{"OFFSET_$AorB"};
196 local $/ = "\n";
197 $seqs[$i] = [<$seq>];
198 }
199 else {
200 confess "Can't handle input of type ", ref;
201 }
202 }
203
204 ## Config vars
205 my $output;
206 my $output_handler = $options->{OUTPUT};
207 my $type = ref $output_handler ;
208 if ( ! defined $output_handler ) {
209 $output = "";
210 $output_handler = sub { $output .= shift };
211 }
212 elsif ( $type eq "CODE" ) {
213 ## No problems, mate.
214 }
215 elsif ( $type eq "SCALAR" ) {
216 my $out_ref = $output_handler;
217 $output_handler = sub { $$out_ref .= shift };
218 }
219 elsif ( $type eq "ARRAY" ) {
220 my $out_ref = $output_handler;
221 $output_handler = sub { push @$out_ref, shift };
222 }
223 elsif ( $type eq "GLOB" || UNIVERSAL::isa $output_handler, "IO::Handle" ) {
224 my $output_handle = $output_handler;
225 $output_handler = sub { print $output_handle shift };
226 }
227 else {
228 croak "Unrecognized output type: $type";
229 }
230
231 my $style = $options->{STYLE};
232 $style = "Unified" unless defined $options->{STYLE};
233 $style = "Text::Diff::$style" if exists $internal_styles{$style};
234
235 if ( ! $style->can( "hunk" ) ) {
236 eval "require $style; 1" or die $@;
237 }
238
239 $style = $style->new
240 if ! ref $style && $style->can( "new" );
241
242 my $ctx_lines = $options->{CONTEXT};
243 $ctx_lines = 3 unless defined $ctx_lines;
244 $ctx_lines = 0 if $style->isa( "Text::Diff::OldStyle" );
245
246 my @keygen_args = $options->{KEYGEN_ARGS}
247 ? @{$options->{KEYGEN_ARGS}}
248 : ();
249
250 ## State vars
251 my $diffs = 0; ## Number of discards this hunk
252 my $ctx = 0; ## Number of " " (ctx_lines) ops pushed after last diff.
253 my @ops; ## ops (" ", +, -) in this hunk
254 my $hunks = 0; ## Number of hunks
255
256 my $emit_ops = sub {
257 $output_handler->( $style->file_header( @seqs, $options ) )
258 unless $hunks++;
259 $output_handler->( $style->hunk_header( @seqs, @_, $options ) );
260 $output_handler->( $style->hunk ( @seqs, @_, $options ) );
261 $output_handler->( $style->hunk_footer( @seqs, @_, $options ) );
262 };
263
264 ## We keep 2*ctx_lines so that if a diff occurs
265 ## at 2*ctx_lines we continue to grow the hunk instead
266 ## of emitting diffs and context as we go. We
267 ## need to know the total length of both of the two
268 ## subsequences so the line count can be printed in the
269 ## header.
270 my $dis_a = sub {push @ops, [@_[0,1],"-"]; ++$diffs ; $ctx = 0 };
271 my $dis_b = sub {push @ops, [@_[0,1],"+"]; ++$diffs ; $ctx = 0 };
272
273 traverse_sequences(
274 @seqs,
275 {
276 MATCH => sub {
277 push @ops, [@_[0,1]," "];
278
279 if ( $diffs && ++$ctx > $ctx_lines * 2 ) {
280 $emit_ops->( [ splice @ops, 0, $#ops - $ctx_lines ] );
281 $ctx = $diffs = 0;
282 }
283
284 ## throw away context lines that aren't needed any more
285 shift @ops if ! $diffs && @ops > $ctx_lines;
286 },
287 DISCARD_A => $dis_a,
288 DISCARD_B => $dis_b,
289 },
290 $options->{KEYGEN}, # pass in user arguments for key gen function
291 @keygen_args,
292 );
293
294 if ( $diffs ) {
295 $#ops -= $ctx - $ctx_lines if $ctx > $ctx_lines;
296 $emit_ops->( \@ops );
297 }
298
299 $output_handler->( $style->file_footer( @seqs, $options ) ) if $hunks;
300
301 return defined $output ? $output : $hunks;
302}
303
304
305sub _header {
306 my ( $h ) = @_;
307 my ( $p1, $fn1, $t1, $p2, $fn2, $t2 ) = @{$h}{
308 "FILENAME_PREFIX_A",
309 "FILENAME_A",
310 "MTIME_A",
311 "FILENAME_PREFIX_B",
312 "FILENAME_B",
313 "MTIME_B"
314 };
315
316 ## remember to change Text::Diff::Table if this logic is tweaked.
317 return "" unless defined $fn1 && defined $fn2;
318
319 return join( "",
320 $p1, " ", $fn1, defined $t1 ? "\t" . localtime $t1 : (), "\n",
321 $p2, " ", $fn2, defined $t2 ? "\t" . localtime $t2 : (), "\n",
322 );
323}
324
325## _range encapsulates the building of, well, ranges. Turns out there are
326## a few nuances.
327sub _range {
328 my ( $ops, $a_or_b, $format ) = @_;
329
330 my $start = $ops->[ 0]->[$a_or_b];
331 my $after = $ops->[-1]->[$a_or_b];
332
333 ## The sequence indexes in the lines are from *before* the OPCODE is
334 ## executed, so we bump the last index up unless the OP indicates
335 ## it didn't change.
336 ++$after
337 unless $ops->[-1]->[OPCODE] eq ( $a_or_b == A ? "+" : "-" );
338
339 ## convert from 0..n index to 1..(n+1) line number. The unless modifier
340 ## handles diffs with no context, where only one file is affected. In this
341 ## case $start == $after indicates an empty range, and the $start must
342 ## not be incremented.
343 my $empty_range = $start == $after;
344 ++$start unless $empty_range;
345
346 return
347 $start == $after
348 ? $format eq "unified" && $empty_range
349 ? "$start,0"
350 : $start
351 : $format eq "unified"
352 ? "$start,".($after-$start+1)
353 : "$start,$after";
354}
355
356
357sub _op_to_line {
358 my ( $seqs, $op, $a_or_b, $op_prefixes ) = @_;
359
360 my $opcode = $op->[OPCODE];
361 return () unless defined $op_prefixes->{$opcode};
362
363 my $op_sym = defined $op->[FLAG] ? $op->[FLAG] : $opcode;
364 $op_sym = $op_prefixes->{$op_sym};
365 return () unless defined $op_sym;
366
367 $a_or_b = $op->[OPCODE] ne "+" ? 0 : 1 unless defined $a_or_b;
368 return ( $op_sym, $seqs->[$a_or_b][$op->[$a_or_b]] );
369}
370
371
372=head1 Formatting Classes
373
374These functions implement the output formats. They are grouped in to classes
375so diff() can use class names to call the correct set of output routines and so
376that you may inherit from them easily. There are no constructors or instance
377methods for these classes, though subclasses may provide them if need be.
378
379Each class has file_header(), hunk_header(), hunk(), and footer() methods
380identical to those documented in the Text::Diff::Unified section. header() is
381called before the hunk() is first called, footer() afterwards. The default
382footer function is an empty method provided for overloading:
383
384 sub footer { return "End of patch\n" }
385
386Some output formats are provided by external modules (which are loaded
387automatically), such as L<Text::Diff::Table>. These are
388are documented here to keep the documentation simple.
389
390=over
391
392=head2 Text::Diff::Base
393
394Returns "" for all methods (other than C<new()>).
395
396=cut
397
398{
399 package Text::Diff::Base;
400 sub new {
401 my $proto = shift;
402 return bless { @_ }, ref $proto || $proto;
403 }
404
405 sub file_header { return "" }
406 sub hunk_header { return "" }
407 sub hunk { return "" }
408 sub hunk_footer { return "" }
409 sub file_footer { return "" }
410}
411
412
413=head2 Text::Diff::Unified
414
415 --- A Mon Nov 12 23:49:30 2001
416 +++ B Mon Nov 12 23:49:30 2001
417 @@ -2,13 +2,13 @@
418 2
419 3
420 4
421 -5d
422 +5a
423 6
424 7
425 8
426 9
427 +9a
428 10
429 11
430 -11d
431 12
432 13
433
434=over
435
436=item file_header
437
438 $s = Text::Diff::Unified->file_header( $options );
439
440Returns a string containing a unified header. The sole parameter is the
441options hash passed in to diff(), containing at least:
442
443 FILENAME_A => $fn1,
444 MTIME_A => $mtime1,
445 FILENAME_B => $fn2,
446 MTIME_B => $mtime2
447
448May also contain
449
450 FILENAME_PREFIX_A => "---",
451 FILENAME_PREFIX_B => "+++",
452
453to override the default prefixes (default values shown).
454
455=cut
456
457@Text::Diff::Unified::ISA = qw( Text::Diff::Base );
458
459sub Text::Diff::Unified::file_header {
460 shift; ## No instance data
461 my $options = pop ;
462
463 _header(
464 { FILENAME_PREFIX_A => "---", FILENAME_PREFIX_B => "+++", %$options }
465 );
466}
467
468=item hunk_header
469
470 Text::Diff::Unified->hunk_header( \@ops, $options );
471
472Returns a string containing the output of one hunk of unified diff.
473
474=cut
475
476sub Text::Diff::Unified::hunk_header {
477 shift; ## No instance data
478 pop; ## Ignore options
479 my $ops = pop;
480
481 return join( "",
482 "@@ -",
483 _range( $ops, A, "unified" ),
484 " +",
485 _range( $ops, B, "unified" ),
486 " @@\n",
487 );
488}
489
490
491=item Text::Diff::Unified::hunk
492
493 Text::Diff::Unified->hunk( \@seq_a, \@seq_b, \@ops, $options );
494
495Returns a string containing the output of one hunk of unified diff.
496
497=cut
498
499sub Text::Diff::Unified::hunk {
500 shift; ## No instance data
501 pop; ## Ignore options
502 my $ops = pop;
503
504 my $prefixes = { "+" => "+", " " => " ", "-" => "-" };
505
506 return join "", map _op_to_line( \@_, $_, undef, $prefixes ), @$ops
507}
508
509
510=back
511
512=head2 Text::Diff::Table
513
514 +--+----------------------------------+--+------------------------------+
515 | |../Test-Differences-0.2/MANIFEST | |../Test-Differences/MANIFEST |
516 | |Thu Dec 13 15:38:49 2001 | |Sat Dec 15 02:09:44 2001 |
517 +--+----------------------------------+--+------------------------------+
518 | | * 1|Changes *
519 | 1|Differences.pm | 2|Differences.pm |
520 | 2|MANIFEST | 3|MANIFEST |
521 | | * 4|MANIFEST.SKIP *
522 | 3|Makefile.PL | 5|Makefile.PL |
523 | | * 6|t/00escape.t *
524 | 4|t/00flatten.t | 7|t/00flatten.t |
525 | 5|t/01text_vs_data.t | 8|t/01text_vs_data.t |
526 | 6|t/10test.t | 9|t/10test.t |
527 +--+----------------------------------+--+------------------------------+
528
529This format also goes to some pains to highlight "invisible" characters on
530differing elements by selectively escaping whitespace:
531
532 +--+--------------------------+--------------------------+
533 | |demo_ws_A.txt |demo_ws_B.txt |
534 | |Fri Dec 21 08:36:32 2001 |Fri Dec 21 08:36:50 2001 |
535 +--+--------------------------+--------------------------+
536 | 1|identical |identical |
537 * 2| spaced in | also spaced in *
538 * 3|embedded space |embedded tab *
539 | 4|identical |identical |
540 * 5| spaced in |\ttabbed in *
541 * 6|trailing spaces\s\s\n |trailing tabs\t\t\n *
542 | 7|identical |identical |
543 * 8|lf line\n |crlf line\r\n *
544 * 9|embedded ws |embedded\tws *
545 +--+--------------------------+--------------------------+
546
547See L</Text::Diff::Table> for more details, including how the whitespace
548escaping works.
549
550=head2 Text::Diff::Context
551
552 *** A Mon Nov 12 23:49:30 2001
553 --- B Mon Nov 12 23:49:30 2001
554 ***************
555 *** 2,14 ****
556 2
557 3
558 4
559 ! 5d
560 6
561 7
562 8
563 9
564 10
565 11
566 - 11d
567 12
568 13
569 --- 2,14 ----
570 2
571 3
572 4
573 ! 5a
574 6
575 7
576 8
577 9
578 + 9a
579 10
580 11
581 12
582 13
583
584Note: hunk_header() returns only "***************\n".
585
586=cut
587
588
589@Text::Diff::Context::ISA = qw( Text::Diff::Base );
590
591sub Text::Diff::Context::file_header {
592 _header { FILENAME_PREFIX_A=>"***", FILENAME_PREFIX_B=>"---", %{$_[-1]} };
593}
594
595
596sub Text::Diff::Context::hunk_header {
597 return "***************\n";
598}
599
600sub Text::Diff::Context::hunk {
601 shift; ## No instance data
602 pop; ## Ignore options
603 my $ops = pop;
604 ## Leave the sequences in @_[0,1]
605
606 my $a_range = _range( $ops, A, "" );
607 my $b_range = _range( $ops, B, "" );
608
609 ## Sigh. Gotta make sure that differences that aren't adds/deletions
610 ## get prefixed with "!", and that the old opcodes are removed.
611 my $after;
612 for ( my $start = 0; $start <= $#$ops ; $start = $after ) {
613 ## Scan until next difference
614 $after = $start + 1;
615 my $opcode = $ops->[$start]->[OPCODE];
616 next if $opcode eq " ";
617
618 my $bang_it;
619 while ( $after <= $#$ops && $ops->[$after]->[OPCODE] ne " " ) {
620 $bang_it ||= $ops->[$after]->[OPCODE] ne $opcode;
621 ++$after;
622 }
623
624 if ( $bang_it ) {
625 for my $i ( $start..($after-1) ) {
626 $ops->[$i]->[FLAG] = "!";
627 }
628 }
629 }
630
631 my $b_prefixes = { "+" => "+ ", " " => " ", "-" => undef, "!" => "! " };
632 my $a_prefixes = { "+" => undef, " " => " ", "-" => "- ", "!" => "! " };
633
634 return join( "",
635 "*** ", $a_range, " ****\n",
636 map( _op_to_line( \@_, $_, A, $a_prefixes ), @$ops ),
637 "--- ", $b_range, " ----\n",
638 map( _op_to_line( \@_, $_, B, $b_prefixes ), @$ops ),
639 );
640}
641=head2 Text::Diff::OldStyle
642
643 5c5
644 < 5d
645 ---
646 > 5a
647 9a10
648 > 9a
649 12d12
650 < 11d
651
652Note: no file_header().
653
654=cut
655
656@Text::Diff::OldStyle::ISA = qw( Text::Diff::Base );
657
658sub _op {
659 my $ops = shift;
660 my $op = $ops->[0]->[OPCODE];
661 $op = "c" if grep $_->[OPCODE] ne $op, @$ops;
662 $op = "a" if $op eq "+";
663 $op = "d" if $op eq "-";
664 return $op;
665}
666
667sub Text::Diff::OldStyle::hunk_header {
668 shift; ## No instance data
669 pop; ## ignore options
670 my $ops = pop;
671
672 my $op = _op $ops;
673
674 return join "", _range( $ops, A, "" ), $op, _range( $ops, B, "" ), "\n";
675}
676
677sub Text::Diff::OldStyle::hunk {
678 shift; ## No instance data
679 pop; ## ignore options
680 my $ops = pop;
681 ## Leave the sequences in @_[0,1]
682
683 my $a_prefixes = { "+" => undef, " " => undef, "-" => "< " };
684 my $b_prefixes = { "+" => "> ", " " => undef, "-" => undef };
685
686 my $op = _op $ops;
687
688 return join( "",
689 map( _op_to_line( \@_, $_, A, $a_prefixes ), @$ops ),
690 $op eq "c" ? "---\n" : (),
691 map( _op_to_line( \@_, $_, B, $b_prefixes ), @$ops ),
692 );
693}
694
695=head1 LIMITATIONS
696
697Must suck both input files entirely in to memory and store them with a normal
698amount of Perlish overhead (one array location) per record. This is implied by
699the implementation of Algorithm::Diff, which takes two arrays. If
700Algorithm::Diff ever offers an incremental mode, this can be changed (contact
701the maintainers of Algorithm::Diff and Text::Diff if you need this; it
702shouldn't be too terribly hard to tie arrays in this fashion).
703
704Does not provide most of the more refined GNU diff options: recursive directory
705tree scanning, ignoring blank lines / whitespace, etc., etc. These can all be
706added as time permits and need arises, many are rather easy; patches quite
707welcome.
708
709Uses closures internally, this may lead to leaks on C<perl> versions 5.6.1 and
710prior if used many times over a process' life time.
711
712=head1 AUTHOR
713
714Barrie Slaymaker <[email protected]>.
715
716=head1 COPYRIGHT & LICENSE
717
718Copyright 2001, Barrie Slaymaker. All Rights Reserved.
719
720You may use this under the terms of either the Artistic License or GNU Public
721License v 2.0 or greater.
722
723=cut
724
7251;
Note: See TracBrowser for help on using the repository browser.