Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

Diff.pm@ 21711

Last change on this file since 21711 was 21711, checked in by oranfry, 14 years ago
bringing across the diffcol project
File size: 20.7 KB

Line
1	package Text::Diff;
2
3	$VERSION = 0.35;
4
5	=head1 NAME
6
7	Text::Diff - Perform diffs on files and record sets
8
9	=head1 SYNOPSIS
10
11	use Text::Diff;
12
13	## Mix and match filenames, strings, file handles, producer subs,
14	## or arrays of records; returns diff in a string.
15	## WARNING: can return B<large> diffs for large files.
16	my $diff = diff "file1.txt", "file2.txt", { STYLE => "Context" };
17	my $diff = diff \$string1, \$string2, \%options;
18	my $diff = diff \FH1, \FH2;
19	my $diff = diff \&reader1, \&reader2;
20	my $diff = diff \@records1, \@records2;
21
22	## May also mix input types:
23	my $diff = diff \@records1, "file_B.txt";
24
25	=head1 DESCRIPTION
26
27	C<diff()> provides a basic set of services akin to the GNU C<diff> utility. It
28	is not anywhere near as feature complete as GNU C<diff>, but it is better
29	integrated with Perl and available on all platforms. It is often faster than
30	shelling out to a system's C<diff> executable for small files, and generally
31	slower on larger files.
32
33	Relies on L<Algorithm::Diff> for, well, the algorithm. This may not produce
34	the same exact diff as a system's local C<diff> executable, but it will be a
35	valid diff and comprehensible by C<patch>. We haven't seen any differences
36	between Algorithm::Diff's logic and GNU diff's, but we have not examined them
37	to make sure they are indeed identical.
38
39	B<Note>: If you don't want to import the C<diff> function, do one of the
40	following:
41
42	use Text::Diff ();
43
44	require Text::Diff;
45
46	That's a pretty rare occurence, so C<diff()> is exported by default.
47
48	=cut
49
50	use Exporter;
51	@ISA = qw( Exporter );
52	@EXPORT = qw( diff );
53
54	use strict;
55	use Carp;
56	use Algorithm::Diff qw( traverse_sequences );
57
58	## Hunks are made of ops. An op is the starting index for each
59	## sequence and the opcode:
60	use constant A => 0; # Array index before match/discard
61	use constant B => 1;
62	use constant OPCODE => 2; # "-", " ", "+"
63	use constant FLAG => 3; # What to display if not OPCODE "!"
64
65
66	=head1 OPTIONS
67
68	diff() takes two parameters from which to draw input and a set of
69	options to control it's output. The options are:
70
71	=over
72
73	=item FILENAME_A, MTIME_A, FILENAME_B, MTIME_B
74
75	The name of the file and the modification time "files"
76
77	These are filled in automatically for each file when diff() is passed a
78	filename, unless a defined value is passed in.
79
80	If a filename is not passed in and FILENAME_A and FILENAME_B are not provided
81	or C<undef>, the header will not be printed.
82
83	Unused on C<OldStyle> diffs.
84
85	=item OFFSET_A, OFFSET_B
86
87	The index of the first line / element. These default to 1 for all
88	parameter types except ARRAY references, for which the default is 0. This
89	is because ARRAY references are presumed to be data structures, while the
90	others are line oriented text.
91
92	=item STYLE
93
94	"Unified", "Context", "OldStyle", or an object or class reference for a class
95	providing C<file_header()>, C<hunk_header()>, C<hunk()>, C<hunk_footer()> and
96	C<file_footer()> methods. The two footer() methods are provided for
97	overloading only; none of the formats provide them.
98
99	Defaults to "Unified" (unlike standard C<diff>, but Unified is what's most
100	often used in submitting patches and is the most human readable of the three.
101
102	If the package indicated by the STYLE has no hunk() method, c<diff()> will
103	load it automatically (lazy loading). Since all such packages should inherit
104	from Text::Diff::Base, this should be marvy.
105
106	Styles may be specified as class names (C<STYLE => "Foo"), in which case they
107	will be C<new()>ed with no parameters, or as objects (C<STYLE => Foo->new>).
108
109	=item CONTEXT
110
111	How many lines before and after each diff to display. Ignored on old-style
112	diffs. Defaults to 3.
113
114	=item OUTPUT
115
116	Examples and their equivalent subroutines:
117
118	OUTPUT => \*FOOHANDLE, # like: sub { print FOOHANDLE shift() }
119	OUTPUT => \$output, # like: sub { $output .= shift }
120	OUTPUT => \@output, # like: sub { push @output, shift }
121	OUTPUT => sub { $output .= shift },
122
123	If no C<OUTPUT> is supplied, returns the diffs in a string. If
124	C<OUTPUT> is a C<CODE> ref, it will be called once with the (optional)
125	file header, and once for each hunk body with the text to emit. If
126	C<OUTPUT> is an L<IO::Handle>, output will be emitted to that handle.
127
128	=item FILENAME_PREFIX_A, FILENAME_PREFIX_B
129
130	The string to print before the filename in the header. Unused on C<OldStyle>
131	diffs. Defaults are C<"---">, C<"+++"> for Unified and C<"***">, C<"+++"> for
132	Context.
133
134	=item KEYGEN, KEYGEN_ARGS
135
136	These are passed to L<Algorithm::Diff/traverse_sequences>.
137
138	=back
139
140	B<Note>: if neither C<FILENAME_> option is defined, the header will not be
141	printed. If at one is present, the other and both MTIME_ options must be
142	present or "Use of undefined variable" warnings will be generated (except
143	on C<OldStyle> diffs, which ignores these options).
144
145	=cut
146
147	my %internal_styles = (
148	Unified => undef,
149	Context => undef,
150	OldStyle => undef,
151	Table => undef, ## "internal", but in another module
152	);
153
154	sub diff {
155	my @seqs = ( shift, shift );
156	my $options = shift \|\| {};
157
158	for my $i ( 0..1 ) {
159	my $seq = $seqs[$i];
160	my $type = ref $seq;
161
162	while ( $type eq "CODE" ) {
163	$seqs[$i] = $seq = $seq->( $options );
164	$type = ref $seq;
165	}
166
167	my $AorB = !$i ? "A" : "B";
168
169	if ( $type eq "ARRAY" ) {
170	## This is most efficient :)
171	$options->{"OFFSET_$AorB"} = 0
172	unless defined $options->{"OFFSET_$AorB"};
173	}
174	elsif ( $type eq "SCALAR" ) {
175	$seqs[$i] = [split( /^/m, $$seq )];
176	$options->{"OFFSET_$AorB"} = 1
177	unless defined $options->{"OFFSET_$AorB"};
178	}
179	elsif ( ! $type ) {
180	$options->{"OFFSET_$AorB"} = 1
181	unless defined $options->{"OFFSET_$AorB"};
182	$options->{"FILENAME_$AorB"} = $seq
183	unless defined $options->{"FILENAME_$AorB"};
184	$options->{"MTIME_$AorB"} = (stat($seq))[9]
185	unless defined $options->{"MTIME_$AorB"};
186
187	local $/ = "\n";
188	open F, "<$seq" or carp "$!: $seq";
189	$seqs[$i] = [<F>];
190	close F;
191
192	}
193	elsif ( $type eq "GLOB" \|\| UNIVERSAL::isa( $seq, "IO::Handle" ) ) {
194	$options->{"OFFSET_$AorB"} = 1
195	unless defined $options->{"OFFSET_$AorB"};
196	local $/ = "\n";
197	$seqs[$i] = [<$seq>];
198	}
199	else {
200	confess "Can't handle input of type ", ref;
201	}
202	}
203
204	## Config vars
205	my $output;
206	my $output_handler = $options->{OUTPUT};
207	my $type = ref $output_handler ;
208	if ( ! defined $output_handler ) {
209	$output = "";
210	$output_handler = sub { $output .= shift };
211	}
212	elsif ( $type eq "CODE" ) {
213	## No problems, mate.
214	}
215	elsif ( $type eq "SCALAR" ) {
216	my $out_ref = $output_handler;
217	$output_handler = sub { $$out_ref .= shift };
218	}
219	elsif ( $type eq "ARRAY" ) {
220	my $out_ref = $output_handler;
221	$output_handler = sub { push @$out_ref, shift };
222	}
223	elsif ( $type eq "GLOB" \|\| UNIVERSAL::isa $output_handler, "IO::Handle" ) {
224	my $output_handle = $output_handler;
225	$output_handler = sub { print $output_handle shift };
226	}
227	else {
228	croak "Unrecognized output type: $type";
229	}
230
231	my $style = $options->{STYLE};
232	$style = "Unified" unless defined $options->{STYLE};
233	$style = "Text::Diff::$style" if exists $internal_styles{$style};
234
235	if ( ! $style->can( "hunk" ) ) {
236	eval "require $style; 1" or die $@;
237	}
238
239	$style = $style->new
240	if ! ref $style && $style->can( "new" );
241
242	my $ctx_lines = $options->{CONTEXT};
243	$ctx_lines = 3 unless defined $ctx_lines;
244	$ctx_lines = 0 if $style->isa( "Text::Diff::OldStyle" );
245
246	my @keygen_args = $options->{KEYGEN_ARGS}
247	? @{$options->{KEYGEN_ARGS}}
248	: ();
249
250	## State vars
251	my $diffs = 0; ## Number of discards this hunk
252	my $ctx = 0; ## Number of " " (ctx_lines) ops pushed after last diff.
253	my @ops; ## ops (" ", +, -) in this hunk
254	my $hunks = 0; ## Number of hunks
255
256	my $emit_ops = sub {
257	$output_handler->( $style->file_header( @seqs, $options ) )
258	unless $hunks++;
259	$output_handler->( $style->hunk_header( @seqs, @_, $options ) );
260	$output_handler->( $style->hunk ( @seqs, @_, $options ) );
261	$output_handler->( $style->hunk_footer( @seqs, @_, $options ) );
262	};
263
264	## We keep 2*ctx_lines so that if a diff occurs
265	## at 2*ctx_lines we continue to grow the hunk instead
266	## of emitting diffs and context as we go. We
267	## need to know the total length of both of the two
268	## subsequences so the line count can be printed in the
269	## header.
270	my $dis_a = sub {push @ops, [@_[0,1],"-"]; ++$diffs ; $ctx = 0 };
271	my $dis_b = sub {push @ops, [@_[0,1],"+"]; ++$diffs ; $ctx = 0 };
272
273	traverse_sequences(
274	@seqs,
275	{
276	MATCH => sub {
277	push @ops, [@_[0,1]," "];
278
279	if ( $diffs && ++$ctx > $ctx_lines * 2 ) {
280	$emit_ops->( [ splice @ops, 0, $#ops - $ctx_lines ] );
281	$ctx = $diffs = 0;
282	}
283
284	## throw away context lines that aren't needed any more
285	shift @ops if ! $diffs && @ops > $ctx_lines;
286	},
287	DISCARD_A => $dis_a,
288	DISCARD_B => $dis_b,
289	},
290	$options->{KEYGEN}, # pass in user arguments for key gen function
291	@keygen_args,
292	);
293
294	if ( $diffs ) {
295	$#ops -= $ctx - $ctx_lines if $ctx > $ctx_lines;
296	$emit_ops->( \@ops );
297	}
298
299	$output_handler->( $style->file_footer( @seqs, $options ) ) if $hunks;
300
301	return defined $output ? $output : $hunks;
302	}
303
304
305	sub _header {
306	my ( $h ) = @_;
307	my ( $p1, $fn1, $t1, $p2, $fn2, $t2 ) = @{$h}{
308	"FILENAME_PREFIX_A",
309	"FILENAME_A",
310	"MTIME_A",
311	"FILENAME_PREFIX_B",
312	"FILENAME_B",
313	"MTIME_B"
314	};
315
316	## remember to change Text::Diff::Table if this logic is tweaked.
317	return "" unless defined $fn1 && defined $fn2;
318
319	return join( "",
320	$p1, " ", $fn1, defined $t1 ? "\t" . localtime $t1 : (), "\n",
321	$p2, " ", $fn2, defined $t2 ? "\t" . localtime $t2 : (), "\n",
322	);
323	}
324
325	## _range encapsulates the building of, well, ranges. Turns out there are
326	## a few nuances.
327	sub _range {
328	my ( $ops, $a_or_b, $format ) = @_;
329
330	my $start = $ops->[ 0]->[$a_or_b];
331	my $after = $ops->[-1]->[$a_or_b];
332
333	## The sequence indexes in the lines are from before the OPCODE is
334	## executed, so we bump the last index up unless the OP indicates
335	## it didn't change.
336	++$after
337	unless $ops->[-1]->[OPCODE] eq ( $a_or_b == A ? "+" : "-" );
338
339	## convert from 0..n index to 1..(n+1) line number. The unless modifier
340	## handles diffs with no context, where only one file is affected. In this
341	## case $start == $after indicates an empty range, and the $start must
342	## not be incremented.
343	my $empty_range = $start == $after;
344	++$start unless $empty_range;
345
346	return
347	$start == $after
348	? $format eq "unified" && $empty_range
349	? "$start,0"
350	: $start
351	: $format eq "unified"
352	? "$start,".($after-$start+1)
353	: "$start,$after";
354	}
355
356
357	sub _op_to_line {
358	my ( $seqs, $op, $a_or_b, $op_prefixes ) = @_;
359
360	my $opcode = $op->[OPCODE];
361	return () unless defined $op_prefixes->{$opcode};
362
363	my $op_sym = defined $op->[FLAG] ? $op->[FLAG] : $opcode;
364	$op_sym = $op_prefixes->{$op_sym};
365	return () unless defined $op_sym;
366
367	$a_or_b = $op->[OPCODE] ne "+" ? 0 : 1 unless defined $a_or_b;
368	return ( $op_sym, $seqs->[$a_or_b][$op->[$a_or_b]] );
369	}
370
371
372	=head1 Formatting Classes
373
374	These functions implement the output formats. They are grouped in to classes
375	so diff() can use class names to call the correct set of output routines and so
376	that you may inherit from them easily. There are no constructors or instance
377	methods for these classes, though subclasses may provide them if need be.
378
379	Each class has file_header(), hunk_header(), hunk(), and footer() methods
380	identical to those documented in the Text::Diff::Unified section. header() is
381	called before the hunk() is first called, footer() afterwards. The default
382	footer function is an empty method provided for overloading:
383
384	sub footer { return "End of patch\n" }
385
386	Some output formats are provided by external modules (which are loaded
387	automatically), such as L<Text::Diff::Table>. These are
388	are documented here to keep the documentation simple.
389
390	=over
391
392	=head2 Text::Diff::Base
393
394	Returns "" for all methods (other than C<new()>).
395
396	=cut
397
398	{
399	package Text::Diff::Base;
400	sub new {
401	my $proto = shift;
402	return bless { @_ }, ref $proto \|\| $proto;
403	}
404
405	sub file_header { return "" }
406	sub hunk_header { return "" }
407	sub hunk { return "" }
408	sub hunk_footer { return "" }
409	sub file_footer { return "" }
410	}
411
412
413	=head2 Text::Diff::Unified
414
415	--- A Mon Nov 12 23:49:30 2001
416	+++ B Mon Nov 12 23:49:30 2001
417	@@ -2,13 +2,13 @@
418	2
419	3
420	4
421	-5d
422	+5a
423	6
424	7
425	8
426	9
427	+9a
428	10
429	11
430	-11d
431	12
432	13
433
434	=over
435
436	=item file_header
437
438	$s = Text::Diff::Unified->file_header( $options );
439
440	Returns a string containing a unified header. The sole parameter is the
441	options hash passed in to diff(), containing at least:
442
443	FILENAME_A => $fn1,
444	MTIME_A => $mtime1,
445	FILENAME_B => $fn2,
446	MTIME_B => $mtime2
447
448	May also contain
449
450	FILENAME_PREFIX_A => "---",
451	FILENAME_PREFIX_B => "+++",
452
453	to override the default prefixes (default values shown).
454
455	=cut
456
457	@Text::Diff::Unified::ISA = qw( Text::Diff::Base );
458
459	sub Text::Diff::Unified::file_header {
460	shift; ## No instance data
461	my $options = pop ;
462
463	_header(
464	{ FILENAME_PREFIX_A => "---", FILENAME_PREFIX_B => "+++", %$options }
465	);
466	}
467
468	=item hunk_header
469
470	Text::Diff::Unified->hunk_header( \@ops, $options );
471
472	Returns a string containing the output of one hunk of unified diff.
473
474	=cut
475
476	sub Text::Diff::Unified::hunk_header {
477	shift; ## No instance data
478	pop; ## Ignore options
479	my $ops = pop;
480
481	return join( "",
482	"@@ -",
483	_range( $ops, A, "unified" ),
484	" +",
485	_range( $ops, B, "unified" ),
486	" @@\n",
487	);
488	}
489
490
491	=item Text::Diff::Unified::hunk
492
493	Text::Diff::Unified->hunk( \@seq_a, \@seq_b, \@ops, $options );
494
495	Returns a string containing the output of one hunk of unified diff.
496
497	=cut
498
499	sub Text::Diff::Unified::hunk {
500	shift; ## No instance data
501	pop; ## Ignore options
502	my $ops = pop;
503
504	my $prefixes = { "+" => "+", " " => " ", "-" => "-" };
505
506	return join "", map _op_to_line( \@_, $_, undef, $prefixes ), @$ops
507	}
508
509
510	=back
511
512	=head2 Text::Diff::Table
513
514	+--+----------------------------------+--+------------------------------+
515	\| \|../Test-Differences-0.2/MANIFEST \| \|../Test-Differences/MANIFEST \|
516	\| \|Thu Dec 13 15:38:49 2001 \| \|Sat Dec 15 02:09:44 2001 \|
517	+--+----------------------------------+--+------------------------------+
518	\| \| * 1\|Changes *
519	\| 1\|Differences.pm \| 2\|Differences.pm \|
520	\| 2\|MANIFEST \| 3\|MANIFEST \|
521	\| \| * 4\|MANIFEST.SKIP *
522	\| 3\|Makefile.PL \| 5\|Makefile.PL \|
523	\| \| * 6\|t/00escape.t *
524	\| 4\|t/00flatten.t \| 7\|t/00flatten.t \|
525	\| 5\|t/01text_vs_data.t \| 8\|t/01text_vs_data.t \|
526	\| 6\|t/10test.t \| 9\|t/10test.t \|
527	+--+----------------------------------+--+------------------------------+
528
529	This format also goes to some pains to highlight "invisible" characters on
530	differing elements by selectively escaping whitespace:
531
532	+--+--------------------------+--------------------------+
533	\| \|demo_ws_A.txt \|demo_ws_B.txt \|
534	\| \|Fri Dec 21 08:36:32 2001 \|Fri Dec 21 08:36:50 2001 \|
535	+--+--------------------------+--------------------------+
536	\| 1\|identical \|identical \|
537	* 2\| spaced in \| also spaced in *
538	* 3\|embedded space \|embedded tab *
539	\| 4\|identical \|identical \|
540	* 5\| spaced in \|\ttabbed in *
541	* 6\|trailing spaces\s\s\n \|trailing tabs\t\t\n *
542	\| 7\|identical \|identical \|
543	* 8\|lf line\n \|crlf line\r\n *
544	* 9\|embedded ws \|embedded\tws *
545	+--+--------------------------+--------------------------+
546
547	See L</Text::Diff::Table> for more details, including how the whitespace
548	escaping works.
549
550	=head2 Text::Diff::Context
551
552	*** A Mon Nov 12 23:49:30 2001
553	--- B Mon Nov 12 23:49:30 2001
554	***************
555	* 2,14 **
556	2
557	3
558	4
559	! 5d
560	6
561	7
562	8
563	9
564	10
565	11
566	- 11d
567	12
568	13
569	--- 2,14 ----
570	2
571	3
572	4
573	! 5a
574	6
575	7
576	8
577	9
578	+ 9a
579	10
580	11
581	12
582	13
583
584	Note: hunk_header() returns only "***************\n".
585
586	=cut
587
588
589	@Text::Diff::Context::ISA = qw( Text::Diff::Base );
590
591	sub Text::Diff::Context::file_header {
592	_header { FILENAME_PREFIX_A=>"***", FILENAME_PREFIX_B=>"---", %{$_[-1]} };
593	}
594
595
596	sub Text::Diff::Context::hunk_header {
597	return "***************\n";
598	}
599
600	sub Text::Diff::Context::hunk {
601	shift; ## No instance data
602	pop; ## Ignore options
603	my $ops = pop;
604	## Leave the sequences in @_[0,1]
605
606	my $a_range = _range( $ops, A, "" );
607	my $b_range = _range( $ops, B, "" );
608
609	## Sigh. Gotta make sure that differences that aren't adds/deletions
610	## get prefixed with "!", and that the old opcodes are removed.
611	my $after;
612	for ( my $start = 0; $start <= $#$ops ; $start = $after ) {
613	## Scan until next difference
614	$after = $start + 1;
615	my $opcode = $ops->[$start]->[OPCODE];
616	next if $opcode eq " ";
617
618	my $bang_it;
619	while ( $after <= $#$ops && $ops->[$after]->[OPCODE] ne " " ) {
620	$bang_it \|\|= $ops->[$after]->[OPCODE] ne $opcode;
621	++$after;
622	}
623
624	if ( $bang_it ) {
625	for my $i ( $start..($after-1) ) {
626	$ops->[$i]->[FLAG] = "!";
627	}
628	}
629	}
630
631	my $b_prefixes = { "+" => "+ ", " " => " ", "-" => undef, "!" => "! " };
632	my $a_prefixes = { "+" => undef, " " => " ", "-" => "- ", "!" => "! " };
633
634	return join( "",
635	"* ", $a_range, " **\n",
636	map( _op_to_line( \@_, $_, A, $a_prefixes ), @$ops ),
637	"--- ", $b_range, " ----\n",
638	map( _op_to_line( \@_, $_, B, $b_prefixes ), @$ops ),
639	);
640	}
641	=head2 Text::Diff::OldStyle
642
643	5c5
644	< 5d
645	---
646	> 5a
647	9a10
648	> 9a
649	12d12
650	< 11d
651
652	Note: no file_header().
653
654	=cut
655
656	@Text::Diff::OldStyle::ISA = qw( Text::Diff::Base );
657
658	sub _op {
659	my $ops = shift;
660	my $op = $ops->[0]->[OPCODE];
661	$op = "c" if grep $_->[OPCODE] ne $op, @$ops;
662	$op = "a" if $op eq "+";
663	$op = "d" if $op eq "-";
664	return $op;
665	}
666
667	sub Text::Diff::OldStyle::hunk_header {
668	shift; ## No instance data
669	pop; ## ignore options
670	my $ops = pop;
671
672	my $op = _op $ops;
673
674	return join "", _range( $ops, A, "" ), $op, _range( $ops, B, "" ), "\n";
675	}
676
677	sub Text::Diff::OldStyle::hunk {
678	shift; ## No instance data
679	pop; ## ignore options
680	my $ops = pop;
681	## Leave the sequences in @_[0,1]
682
683	my $a_prefixes = { "+" => undef, " " => undef, "-" => "< " };
684	my $b_prefixes = { "+" => "> ", " " => undef, "-" => undef };
685
686	my $op = _op $ops;
687
688	return join( "",
689	map( _op_to_line( \@_, $_, A, $a_prefixes ), @$ops ),
690	$op eq "c" ? "---\n" : (),
691	map( _op_to_line( \@_, $_, B, $b_prefixes ), @$ops ),
692	);
693	}
694
695	=head1 LIMITATIONS
696
697	Must suck both input files entirely in to memory and store them with a normal
698	amount of Perlish overhead (one array location) per record. This is implied by
699	the implementation of Algorithm::Diff, which takes two arrays. If
700	Algorithm::Diff ever offers an incremental mode, this can be changed (contact
701	the maintainers of Algorithm::Diff and Text::Diff if you need this; it
702	shouldn't be too terribly hard to tie arrays in this fashion).
703
704	Does not provide most of the more refined GNU diff options: recursive directory
705	tree scanning, ignoring blank lines / whitespace, etc., etc. These can all be
706	added as time permits and need arises, many are rather easy; patches quite
707	welcome.
708
709	Uses closures internally, this may lead to leaks on C<perl> versions 5.6.1 and
710	prior if used many times over a process' life time.
711
712	=head1 AUTHOR
713
714	Barrie Slaymaker <[email protected]>.
715
716	=head1 COPYRIGHT & LICENSE
717
718	Copyright 2001, Barrie Slaymaker. All Rights Reserved.
719
720	You may use this under the terms of either the Artistic License or GNU Public
721	License v 2.0 or greater.
722
723	=cut
724
725	1;

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: other-projects/diffcol/trunk/diffcol/Text/Diff.pm@ 21711

Download in other formats: