Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

Naturally.pm@ 33236

Last change on this file since 33236 was 33236, checked in by davidb, 5 years ago
CPAN module that provides a generalized form of alpha-numerical sorting. Useful in classifiers
File size: 22.8 KB

Rev	Line
[33236]	1
		# Package preamble for Sort::Naturally: version, export list, pragmas,
		# and the compile-time constants used by the generated comparison code.
	2	require 5;
	3	package Sort::Naturally; # Time-stamp: "2004-12-29 18:30:03 AST"
	4	$VERSION = '1.03';
	5	@EXPORT = ('nsort', 'ncmp');
	6	require Exporter;
	7	@ISA = ('Exporter');
	8
		# strict is enabled only after the package globals above have been
		# assigned, so those assignments need no declarations.
	9	use strict;
	10	use locale;
	11	use integer;
	12
	13	#-----------------------------------------------------------------------------
	14	# constants:
		# DEBUG becomes a constant 0 unless a DEBUG sub already exists in this
		# package by the time this BEGIN block runs.
	15	BEGIN { *DEBUG = sub () {0} unless defined &DEBUG }
	16
	17	use Config ();
	18	BEGIN {
	19	# Make a constant such that if a whole-number string is that long
	20	# or shorter, we KNOW it's treatable as an integer
		# "no integer" turns off the "use integer" in effect above for this
		# block -- presumably so 256 ** intsize is not computed with native
		# integer arithmetic.
	21	no integer;
		# $x is one less than the number of decimal digits in
		# 256 ** intsize / 2 (for intsize 4: 2147483648 has 10 digits, so 9).
	22	my $x = length(256 ** $Config::Config{'intsize'} / 2) - 1;
	23	die "Crazy intsize: <$Config::Config{'intsize'}>" if $x < 4;
		# Built with a string eval so the computed value is baked into a
		# constant sub with an empty prototype.
	24	eval 'sub MAX_INT_SIZE () {' . $x . '}';
	25	die $@ if $@;
	26	print "intsize $Config::Config{'intsize'} => MAX_INT_SIZE $x\n" if DEBUG;
	27	}
	28
		# Comparator results: X_FIRST means the left string sorts first,
		# Y_FIRST means the right string sorts first.
	29	sub X_FIRST () {-1}
	30	sub Y_FIRST () { 1}
	31
		# Labels used in the debug output of nsort and ncmp, indexed by the
		# comparison result: 0 => 'same', 1 => 'swap', -1 (last element) => 'asis'.
	32	my @ORD = ('same', 'swap', 'asis');
	33
	34	#-----------------------------------------------------------------------------
	35	# For lack of a preprocessor:
	36
	37	my($code, $guts);
		# $guts is source text, not code compiled here: maker() substitutes it
		# for every ~COMPARATOR~ placeholder in a sub template before eval'ing
		# that template.
		#
		# The text expects these lexicals to be in scope where it is spliced in:
		#   $x, $y    the two strings to compare; they are consumed
		#             destructively (substr assignment and s///) as the
		#             comparison proceeds
		#   $x2, $y2  scratch variables
		#   $rv       the result; left at 0 when the order is still undecided
		#   $cmp      optional string-comparator coderef; plain cmp is used
		#             when it is false
		#
		# Outline: if exactly one string starts with a digit, that string
		# sorts first.  Otherwise the loop alternates between comparing the
		# common-length leading non-digit runs and the leading digit runs.
		# Digit runs shorter than MAX_INT_SIZE are compared with <=>; longer
		# ones have leading zeros stripped and are compared by length, then
		# with cmp under "no locale".
		#
		# Note the "do { ... } if $rv = ...;" statement: the do-block runs only
		# when the lexical comparison is non-zero, and its "last" leaves the
		# enclosing while loop.  Its "SCREAM!" print is not guarded by DEBUG.
	38	$guts = <<'EOGUTS'; # This is the guts of both ncmp and nsort:
	39
	40	if($x eq $y) {
	41	# trap this expensive case first, and then fall thru to tiebreaker
	42	$rv = 0;
	43
	44	# Convoluted hack to get numerics to sort first, at string start:
	45	} elsif($x =~ m/^\d/s) {
	46	if($y =~ m/^\d/s) {
	47	$rv = 0; # fall thru to normal comparison for the two numbers
	48	} else {
	49	$rv = X_FIRST;
	50	DEBUG > 1 and print "Numeric-initial $x trumps letter-initial $y\n";
	51	}
	52	} elsif($y =~ m/^\d/s) {
	53	$rv = Y_FIRST;
	54	DEBUG > 1 and print "Numeric-initial $y trumps letter-initial $x\n";
	55	} else {
	56	$rv = 0;
	57	}
	58
	59	unless($rv) {
	60	# Normal case:
	61	$rv = 0;
	62	DEBUG and print "<$x> and <$y> compared...\n";
	63
	64	Consideration:
	65	while(length $x and length $y) {
	66
	67	DEBUG > 2 and print " <$x> and <$y>...\n";
	68
	69	# First, non-numeric comparison:
	70	$x2 = ($x =~ m/^(\D+)/s) ? length($1) : 0;
	71	$y2 = ($y =~ m/^(\D+)/s) ? length($1) : 0;
	72	# Now make x2 the min length of the two:
	73	$x2 = $y2 if $x2 > $y2;
	74	if($x2) {
	75	DEBUG > 1 and printf " <%s> and <%s> lexically for length $x2...\n",
	76	substr($x,0,$x2), substr($y,0,$x2);
	77	do {
	78	my $i = substr($x,0,$x2);
	79	my $j = substr($y,0,$x2);
	80	my $sv = $i cmp $j;
	81	print "SCREAM! on <$i><$j> -- $sv != $rv \n" unless $rv == $sv;
	82	last;
	83	}
	84
	85
	86	if $rv =
	87	# The ''. things here force a copy that seems to work around a
	88	# mysterious intermittent bug that 'use locale' provokes in
	89	# many versions of Perl.
	90	$cmp
	91	? $cmp->(substr($x,0,$x2) . '',
	92	substr($y,0,$x2) . '',
	93	)
	94	:
	95	scalar(( substr($x,0,$x2) . '' ) cmp
	96	( substr($y,0,$x2) . '' )
	97	)
	98	;
	99	# otherwise trim and keep going:
	100	substr($x,0,$x2) = '';
	101	substr($y,0,$x2) = '';
	102	}
	103
	104	# Now numeric:
	105	# (actually just using $x2 and $y2 as scratch)
	106
	107	if( $x =~ s/^(\d+)//s ) {
	108	$x2 = $1;
	109	if( $y =~ s/^(\d+)//s ) {
	110	# We have two numbers here.
	111	DEBUG > 1 and print " <$x2> and <$1> numerically\n";
	112	if(length($x2) < MAX_INT_SIZE and length($1) < MAX_INT_SIZE) {
	113	# small numbers: we can compare happily
	114	last if $rv = $x2 <=> $1;
	115	} else {
	116	# ARBITRARILY large integers!
	117
	118	# This saves on loss of precision that could happen
	119	# with actual stringification.
	120	# Also, I sense that very large numbers aren't too
	121	# terribly common in sort data.
	122
	123	# trim leading 0's:
	124	($y2 = $1) =~ s/^0+//s;
	125	$x2 =~ s/^0+//s;
	126	print " Treating $x2 and $y2 as bigint\n" if DEBUG;
	127
	128	no locale; # we want the dumb cmp back.
	129	last if $rv = (
	130	# works only for non-negative whole numbers:
	131	length($x2) <=> length($y2)
	132	# the longer the numeral, the larger the value
	133	or $x2 cmp $y2
	134	# between equals, compare lexically!! amazing but true.
	135	);
	136	}
	137	} else {
	138	# X is numeric but Y isn't
	139	$rv = Y_FIRST;
	140	last;
	141	}
	142	} elsif( $y =~ s/^\d+//s ) { # we don't need to capture the substring
	143	$rv = X_FIRST;
	144	last;
	145	}
	146	# else one of them is 0-length.
	147
	148	# end-while
	149	}
	150	}
	151	EOGUTS
	152
	153	sub maker {
	154	my $code = $_[0];
	155	$code =~ s/~COMPARATOR~/$guts/g \|\| die "Can't find ~COMPARATOR~";
	156	eval $code;
	157	die $@ if $@;
	158	}
	159
	160	##############################################################################
	161
	162	maker(<<'EONSORT');
	163	sub nsort {
	164	# get options:
	165	my($cmp, $lc);
	166	($cmp,$lc) = @{shift @_} if @_ and ref($_[0]) eq 'ARRAY';
	167
	168	return @_ unless @_ > 1 or wantarray; # be clever
	169
	170	my($x, $x2, $y, $y2, $rv); # scratch vars
	171
	172	# We use a Schwartzian xform to memoize the lc'ing and \W-removal
	173
	174	map $_->[0],
	175	sort {
	176	if($a->[0] eq $b->[0]) { 0 } # trap this expensive case
	177	else {
	178
	179	$x = $a->[1];
	180	$y = $b->[1];
	181
	182	~COMPARATOR~
	183
	184	# Tiebreakers...
	185	DEBUG > 1 and print " -<${$a}[0]> cmp <${$b}[0]> is $rv ($ORD[$rv])\n";
	186	$rv \|\|= (length($x) <=> length($y)) # shorter is always first
	187	\|\| ($cmp and $cmp->($x,$y) \|\| $cmp->($a->[0], $b->[0]))
	188	\|\| ($x cmp $y )
	189	\|\| ($a->[0] cmp $b->[0])
	190	;
	191
	192	DEBUG > 1 and print " <${$a}[0]> cmp <${$b}[0]> is $rv ($ORD[$rv])\n";
	193	$rv;
	194	}}
	195
	196	map {;
	197	$x = $lc ? $lc->($_) : lc($_); # x as scratch
	198	$x =~ s/\W+//s;
	199	[$_, $x];
	200	}
	201	@_
	202	}
	203	EONSORT
	204
	205	#-----------------------------------------------------------------------------
	206	maker(<<'EONCMP');
	207	sub ncmp {
	208	# The guts are basically the same as above...
	209
	210	# get options:
	211	my($cmp, $lc);
	212	($cmp,$lc) = @{shift @_} if @_ and ref($_[0]) eq 'ARRAY';
	213
	214	if(@_ == 0) {
	215	@_ = ($a, $b); # bit of a hack!
	216	DEBUG > 1 and print "Hacking in <$a><$b>\n";
	217	} elsif(@_ != 2) {
	218	require Carp;
	219	Carp::croak("Not enough options to ncmp!");
	220	}
	221	my($a,$b) = @_;
	222	my($x, $x2, $y, $y2, $rv); # scratch vars
	223
	224	DEBUG > 1 and print "ncmp args <$a><$b>\n";
	225	if($a eq $b) { # trap this expensive case
	226	0;
	227	} else {
	228	$x = ($lc ? $lc->($a) : lc($a));
	229	$x =~ s/\W+//s;
	230	$y = ($lc ? $lc->($b) : lc($b));
	231	$y =~ s/\W+//s;
	232
	233	~COMPARATOR~
	234
	235
	236	# Tiebreakers...
	237	DEBUG > 1 and print " -<$a> cmp <$b> is $rv ($ORD[$rv])\n";
	238	$rv \|\|= (length($x) <=> length($y)) # shorter is always first
	239	\|\| ($cmp and $cmp->($x,$y) \|\| $cmp->($a,$b))
	240	\|\| ($x cmp $y)
	241	\|\| ($a cmp $b)
	242	;
	243
	244	DEBUG > 1 and print " <$a> cmp <$b> is $rv\n";
	245	$rv;
	246	}
	247	}
	248	EONCMP
	249
	250	# clean up:
		# Both are needed only while the module is being compiled: $guts is
		# the comparison template text and maker() is the helper that splices
		# it into nsort and ncmp.
	251	undef $guts;
	252	undef &maker;
	253
	254	#-----------------------------------------------------------------------------
		# True value returned to whatever loaded this file.
	255	1;
	256
	257	############### END OF MAIN SOURCE ###########################################
	258	__END__
	259
	260	=head1 NAME
	261
	262	Sort::Naturally -- sort lexically, but sort numeral parts numerically
	263
	264	=head1 SYNOPSIS
	265
	266	@them = nsort(qw(
	267	foo12a foo12z foo13a foo 14 9x foo12 fooa foolio Foolio Foo12a
	268	));
	269	print join(' ', @them), "\n";
	270
	271	Prints:
	272
	273	9x 14 foo fooa foolio Foolio foo12 foo12a Foo12a foo12z foo13a
	274
	275	(Or "foo12a" and "Foo12a", and likewise "foolio" and "Foolio", might be
	276	switched, depending on your locale.)
	277
	278	=head1 DESCRIPTION
	279
	280	This module exports two functions, C<nsort> and C<ncmp>; they are used
	281	in implementing my idea of a "natural sorting" algorithm. Under natural
	282	sorting, numeric substrings are compared numerically, and other
	283	word-characters are compared lexically.
	284
	285	This is the way I define natural sorting:
	286
	287	=over
	288
	289	=item *
	290
	291	Non-numeric word-character substrings are sorted lexically,
	292	case-insensitively: "Foo" comes between "fish" and "fowl".
	293
	294	=item *
	295
	296	Numeric substrings are sorted numerically:
	297	"100" comes after "20", not before.
	298
	299	=item *
	300
	301	\W substrings (neither word-characters nor digits) are I<ignored>.
	302
	303	=item *
	304
	305	Our use of \w, \d, \D, and \W is locale-sensitive: Sort::Naturally
	306	uses a C<use locale> statement.
	307
	308	=item *
	309
	310	When comparing two strings, where a numeric substring in one
	311	place is I<not> up against a numeric substring in another,
	312	the non-numeric always comes first. This is fudged by
	313	pretending that the lack of a number substring has
	314	the value -1, like so:
	315
	316	foo => "foo", -1
	317	foobar => "foo", -1, "bar"
	318	foo13 => "foo", 13,
	319	foo13xyz => "foo", 13, "xyz"
	320
	321	That's so that "foo" will come before "foo13", which will come
	322	before "foobar".
	323
	324	=item *
	325
	326	The start of a string is exceptional: leading \W (non-word,
	327	non-digit)
	328	components are ignored, and numbers come I<before> letters.
	329
	330	=item *
	331
	332	I define "numeric substring" just as sequences matching m/\d+/ --
	333	scientific notation, commas, decimals, etc., are not seen. If
	334	your data has thousands separators in numbers
	335	("20,000 Leagues Under The Sea" or "20.000 lieues sous les mers"),
	336	consider stripping them before feeding them to C<nsort> or
	337	C<ncmp>.
	338
	339	=back
	340
	341	=head2 The nsort function
	342
	343	This function takes a list of strings, and returns a copy of the list,
	344	sorted.
	345
	346	This is what most people will want to use:
	347
	348	@stuff = nsort(...list...);
	349
	350	When nsort needs to compare non-numeric substrings, it
	351	uses Perl's C<cmp> operator in scope of a C<use locale>.
	352	And when nsort needs to lowercase things, it uses Perl's
	353	C<lc> function in scope of a C<use locale>. If you want nsort
	354	to use other functions instead, you can specify them in
	355	an arrayref as the first argument to nsort:
	356
	357	@stuff = nsort( [
	358	\&string_comparator, # optional
	359	\&lowercaser_function # optional
	360	],
	361	...list...
	362	);
	363
	364	If you want to specify a string comparator but no lowercaser,
	365	then the options list is C<[\&comparator, '']> or
	366	C<[\&comparator]>. If you want to specify no string comparator
	367	but a lowercaser, then the options list is
	368	C<['', \&lowercaser]>.
	369
	370	Any comparator you specify is called as
	371	C<$comparator-E<gt>($left, $right)>,
	372	and, like a normal Perl C<cmp> replacement, must return
	373	-1, 0, or 1 depending on whether the left argument is stringwise
	374	less than, equal to, or greater than the right argument.
	375
	376	Any lowercaser function you specify is called as
	377	C<$lowercased = $lowercaser-E<gt>($original)>. The routine
	378	must not modify its C<$_[0]>.
	379
	380	=head2 The ncmp function
	381
	382	Often, when sorting non-string values like this:
	383
	384	@objects_sorted = sort { $a->tag cmp $b->tag } @objects;
	385
	386	...or even in a Schwartzian transform, like this:
	387
	388	@strings =
	389	map $_->[0],
	390	sort { $a->[1] cmp $b->[1] }
	391	map { [$_, make_a_sort_key_from($_) ] }
	392	@_
	393	;
	394
	395	...you might want something that replaces not C<sort>, but C<cmp>.
	396	That's what Sort::Naturally's C<ncmp> function is for. Call it with
	397	the syntax C<ncmp($left,$right)> instead of C<$left cmp $right>,
	398	but otherwise it's a fine replacement:
	399
	400	@objects_sorted = sort { ncmp($a->tag,$b->tag) } @objects;
	401
	402	@strings =
	403	map $_->[0],
	404	sort { ncmp($a->[1], $b->[1]) }
	405	map { [$_, make_a_sort_key_from($_) ] }
	406	@_
	407	;
	408
	409	Just as C<nsort> can take a different string-comparator
	410	and/or lowercaser, you can do the same with C<ncmp>, by passing
	411	an arrayref as the first argument:
	412
	413	ncmp( [
	414	\&string_comparator, # optional
	415	\&lowercaser_function # optional
	416	],
	417	$left, $right
	418	)
	419
	420	You might get string comparators from L<Sort::ArbBiLex\|Sort::ArbBiLex>.
	421
	422	=head1 NOTES
	423
	424	=over
	425
	426	=item *
	427
	428	This module is not a substitute for
	429	L<Sort::Versions\|Sort::Versions>! If
	430	you just need proper version sorting, use I<that!>
	431
	432	=item *
	433
	434	If you need something that works I<sort of> like this module's
	435	functions, but not quite the same, consider scouting thru this
	436	module's source code, and adapting what you see. Besides
	437	the functions that actually compile in this module, after the POD,
	438	there's several alternate attempts of mine at natural sorting
	439	routines, which are not compiled as part of the module, but which you
	440	might find useful. They should all be I<working> implementations of
	441	slightly different algorithms
	442	(all of them based on Martin Pool's C<nsort>) which I eventually
	443	discarded in favor of my algorithm. If you are having to
	444	naturally-sort I<very large> data sets, and sorting is getting
	445	ridiculously slow, you might consider trying one of those
	446	discarded functions -- I have a feeling they might be faster on
	447	large data sets. Benchmark them on your data and see. (Unless
	448	you I<need> the speed, don't bother. Hint: substitute C<sort>
	449	for C<nsort> in your code, and unless your program speeds up
	450	drastically, it's not the sorting that's slowing things down.
	451	But if it I<is> C<nsort> that's slowing things down, consider
	452	just:
	453
	454	if(@set >= SOME_VERY_BIG_NUMBER) {
	455	no locale; # vroom vroom
	456	@sorted = sort(@set); # feh, good enough
	457	} elsif(@set >= SOME_BIG_NUMBER) {
	458	use locale;
	459	@sorted = sort(@set); # feh, good enough
	460	} else {
	461	# but keep it pretty for normal cases
	462	@sorted = nsort(@set);
	463	}
	464
	465	=item *
	466
	467	If you do adapt the routines in this module, email me; I'd
	468	just be interested in hearing about it.
	469
	470	=item *
	471
	472	Thanks to the EFNet #perl people for encouraging this module,
	473	especially magister and a-mused.
	474
	475	=back
	476
	477	=head1 COPYRIGHT AND DISCLAIMER
	478
	479	Copyright 2001, Sean M. Burke C<sburke@cpan.org>, all rights
	480	reserved. This program is free software; you can redistribute it
	481	and/or modify it under the same terms as Perl itself.
	482
	483	This program is distributed in the hope that it will be useful, but
	484	without any warranty; without even the implied warranty of
	485	merchantability or fitness for a particular purpose.
	486
	487	=head1 AUTHOR
	488
	489	Sean M. Burke C<sburke@cpan.org>
	490
	491	=cut
	492
	493	############ END OF DOCS ############
	494
	495	############################################################################
	496	############################################################################
	497
	498	############ BEGIN OLD STUFF ############
	499
	500	# We can't have "use integer;", or else (5 <=> 5.1) comes out "0" !
	501
	502	#-----------------------------------------------------------------------------
	503	sub nsort {
	504	my($cmp, $lc);
	505	return @_ if @_ < 2; # Just to be CLEVER.
	506
	507	my($x, $i); # scratch vars
	508
	509	# And now, the GREAT BIG Schwartzian transform:
	510
	511	map
	512	$_->[0],
	513
	514	sort {
	515	# Uses $i as the index variable, $x as the result.
	516	$x = 0;
	517	$i = 1;
	518	DEBUG and print "\nComparing ", map("{$_}", @$a),
	519	' : ', map("{$_}", @$b), , "...\n";
	520
	521	while($i < @$a and $i < @$b) {
	522	DEBUG and print " comparing $i: {$a->[$i]} cmp {$b->[$i]} => ",
	523	$a->[$i] cmp $b->[$i], "\n";
	524	last if ($x = ($a->[$i] cmp $b->[$i])); # lexicographic
	525	++$i;
	526
	527	DEBUG and print " comparing $i: {$a->[$i]} <=> {$b->[$i]} => ",
	528	$a->[$i] <=> $b->[$i], "\n";
	529	last if ($x = ($a->[$i] <=> $b->[$i])); # numeric
	530	++$i;
	531	}
	532
	533	DEBUG and print "{$a->[0]} : {$b->[0]} is ",
	534	$x \|\| (@$a <=> @$b) \|\| 0
	535	,"\n"
	536	;
	537	$x \|\| (@$a <=> @$b) \|\| ($a->[0] cmp $b->[0]);
	538	# unless we found a result for $x in the while loop,
	539	# use length as a tiebreaker, otherwise use cmp
	540	# on the original string as a fallback tiebreaker.
	541	}
	542
	543	map {
	544	my @bit = ($x = defined($_) ? $_ : '');
	545
	546	if($x =~ m/^[+-]?(?=\d\|\.\d)\d(?:\.\d)?(?:[Ee](?:[+-]?\d+))?\z/s) {
	547	# It's entirely purely numeric, so treat it specially:
	548	push @bit, '', $x;
	549	} else {
	550	# Consume the string.
	551	while(length $x) {
	552	push @bit, ($x =~ s/^(\D+)//s) ? lc($1) : '';
	553	push @bit, ($x =~ s/^(\d+)//s) ? $1 : 0;
	554	}
	555	}
	556	DEBUG and print "$bit[0] => ", map("{$_} ", @bit), "\n";
	557
	558	# End result: [original bit , (text, number), (text, number), ...]
	559	# Minimally: [0-length original bit,]
	560	# Examples:
	561	# ['10' => '' , 10, ]
	562	# ['fo900' => 'fo' , 900, ]
	563	# ['foo10' => 'foo', 10, ]
	564	# ['foo9.pl' => 'foo', 9, , '.pl', 0 ]
	565	# ['foo32.pl' => 'foo', 32, , '.pl', 0 ]
	566	# ['foo325.pl' => 'foo', 325, , '.pl', 0 ]
	567	# Yes, always an ODD number of elements.
	568
	569	\@bit;
	570	}
	571	@_;
	572	}
	573
	574	#-----------------------------------------------------------------------------
	575	# Same as before, except without the pure-number trap.
	576
	577	sub nsorts {
	578	return @_ if @_ < 2; # Just to be CLEVER.
	579
	580	my($x, $i); # scratch vars
	581
	582	# And now, the GREAT BIG Schwartzian transform:
	583
	584	map
	585	$_->[0],
	586
	587	sort {
	588	# Uses $i as the index variable, $x as the result.
	589	$x = 0;
	590	$i = 1;
	591	DEBUG and print "\nComparing ", map("{$_}", @$a),
	592	' : ', map("{$_}", @$b), , "...\n";
	593
	594	while($i < @$a and $i < @$b) {
	595	DEBUG and print " comparing $i: {$a->[$i]} cmp {$b->[$i]} => ",
	596	$a->[$i] cmp $b->[$i], "\n";
	597	last if ($x = ($a->[$i] cmp $b->[$i])); # lexicographic
	598	++$i;
	599
	600	DEBUG and print " comparing $i: {$a->[$i]} <=> {$b->[$i]} => ",
	601	$a->[$i] <=> $b->[$i], "\n";
	602	last if ($x = ($a->[$i] <=> $b->[$i])); # numeric
	603	++$i;
	604	}
	605
	606	DEBUG and print "{$a->[0]} : {$b->[0]} is ",
	607	$x \|\| (@$a <=> @$b) \|\| 0
	608	,"\n"
	609	;
	610	$x \|\| (@$a <=> @$b) \|\| ($a->[0] cmp $b->[0]);
	611	# unless we found a result for $x in the while loop,
	612	# use length as a tiebreaker, otherwise use cmp
	613	# on the original string as a fallback tiebreaker.
	614	}
	615
	616	map {
	617	my @bit = ($x = defined($_) ? $_ : '');
	618
	619	while(length $x) {
	620	push @bit, ($x =~ s/^(\D+)//s) ? lc($1) : '';
	621	push @bit, ($x =~ s/^(\d+)//s) ? $1 : 0;
	622	}
	623	DEBUG and print "$bit[0] => ", map("{$_} ", @bit), "\n";
	624
	625	# End result: [original bit , (text, number), (text, number), ...]
	626	# Minimally: [0-length original bit,]
	627	# Examples:
	628	# ['10' => '' , 10, ]
	629	# ['fo900' => 'fo' , 900, ]
	630	# ['foo10' => 'foo', 10, ]
	631	# ['foo9.pl' => 'foo', 9, , '.pl', 0 ]
	632	# ['foo32.pl' => 'foo', 32, , '.pl', 0 ]
	633	# ['foo325.pl' => 'foo', 325, , '.pl', 0 ]
	634	# Yes, always an ODD number of elements.
	635
	636	\@bit;
	637	}
	638	@_;
	639	}
	640
	641	#-----------------------------------------------------------------------------
	642	# Same as before, except for the sort-key-making
	643
	644	sub nsort0 {
	645	return @_ if @_ < 2; # Just to be CLEVER.
	646
	647	my($x, $i); # scratch vars
	648
	649	# And now, the GREAT BIG Schwartzian transform:
	650
	651	map
	652	$_->[0],
	653
	654	sort {
	655	# Uses $i as the index variable, $x as the result.
	656	$x = 0;
	657	$i = 1;
	658	DEBUG and print "\nComparing ", map("{$_}", @$a),
	659	' : ', map("{$_}", @$b), , "...\n";
	660
	661	while($i < @$a and $i < @$b) {
	662	DEBUG and print " comparing $i: {$a->[$i]} cmp {$b->[$i]} => ",
	663	$a->[$i] cmp $b->[$i], "\n";
	664	last if ($x = ($a->[$i] cmp $b->[$i])); # lexicographic
	665	++$i;
	666
	667	DEBUG and print " comparing $i: {$a->[$i]} <=> {$b->[$i]} => ",
	668	$a->[$i] <=> $b->[$i], "\n";
	669	last if ($x = ($a->[$i] <=> $b->[$i])); # numeric
	670	++$i;
	671	}
	672
	673	DEBUG and print "{$a->[0]} : {$b->[0]} is ",
	674	$x \|\| (@$a <=> @$b) \|\| 0
	675	,"\n"
	676	;
	677	$x \|\| (@$a <=> @$b) \|\| ($a->[0] cmp $b->[0]);
	678	# unless we found a result for $x in the while loop,
	679	# use length as a tiebreaker, otherwise use cmp
	680	# on the original string as a fallback tiebreaker.
	681	}
	682
	683	map {
	684	my @bit = ($x = defined($_) ? $_ : '');
	685
	686	if($x =~ m/^[+-]?(?=\d\|\.\d)\d(?:\.\d)?(?:[Ee](?:[+-]?\d+))?\z/s) {
	687	# It's entirely purely numeric, so treat it specially:
	688	push @bit, '', $x;
	689	} else {
	690	# Consume the string.
	691	while(length $x) {
	692	push @bit, ($x =~ s/^(\D+)//s) ? lc($1) : '';
	693	# Secret sauce:
	694	if($x =~ s/^(\d+)//s) {
	695	if(substr($1,0,1) eq '0' and $1 != 0) {
	696	push @bit, $1 / (10 ** length($1));
	697	} else {
	698	push @bit, $1;
	699	}
	700	} else {
	701	push @bit, 0;
	702	}
	703	}
	704	}
	705	DEBUG and print "$bit[0] => ", map("{$_} ", @bit), "\n";
	706
	707	\@bit;
	708	}
	709	@_;
	710	}
	711
	712	#-----------------------------------------------------------------------------
	713	# Like nsort0, but WITHOUT pure number handling, and WITH special treatment
	714	# of pulling off extensions and version numbers.
	715
	716	sub nsortf {
	717	return @_ if @_ < 2; # Just to be CLEVER.
	718
	719	my($x, $i); # scratch vars
	720
	721	# And now, the GREAT BIG Schwartzian transform:
	722
	723	map
	724	$_->[0],
	725
	726	sort {
	727	# Uses $i as the index variable, $x as the result.
	728	$x = 0;
	729	$i = 3;
	730	DEBUG and print "\nComparing ", map("{$_}", @$a),
	731	' : ', map("{$_}", @$b), , "...\n";
	732
	733	while($i < @$a and $i < @$b) {
	734	DEBUG and print " comparing $i: {$a->[$i]} cmp {$b->[$i]} => ",
	735	$a->[$i] cmp $b->[$i], "\n";
	736	last if ($x = ($a->[$i] cmp $b->[$i])); # lexicographic
	737	++$i;
	738
	739	DEBUG and print " comparing $i: {$a->[$i]} <=> {$b->[$i]} => ",
	740	$a->[$i] <=> $b->[$i], "\n";
	741	last if ($x = ($a->[$i] <=> $b->[$i])); # numeric
	742	++$i;
	743	}
	744
	745	DEBUG and print "{$a->[0]} : {$b->[0]} is ",
	746	$x \|\| (@$a <=> @$b) \|\| 0
	747	,"\n"
	748	;
	749	$x \|\| (@$a <=> @$b ) \|\| ($a->[1] cmp $b->[1])
	750	\|\| ($a->[2] <=> $b->[2]) \|\| ($a->[0] cmp $b->[0]);
	751	# unless we found a result for $x in the while loop,
	752	# use length as a tiebreaker, otherwise use the
	753	# lc'd extension, otherwise the verison, otherwise use
	754	# the original string as a fallback tiebreaker.
	755	}
	756
	757	map {
	758	my @bit = ( ($x = defined($_) ? $_ : ''), '',0 );
	759
	760	{
	761	# Consume the string.
	762
	763	# First, pull off any VAX-style version
	764	$bit[2] = $1 if $x =~ s/;(\d+)$//;
	765
	766	# Then pull off any apparent extension
	767	if( $x !~ m/^\.+$/s and # don't mangle ".", "..", or "..."
	768	$x =~ s/(\.[^\.\;]*)$//sg
	769	# We could try to avoid catching all-digit extensions,
	770	# but I think that's getting /too/ clever.
	771	) {
	772	$i = $1;
	773	if($x =~ m<[^\\\://]$>s) {
	774	# We didn't take the whole basename.
	775	$bit[1] = lc $i;
	776	DEBUG and print "Consuming extension \"$1\"\n";
	777	} else {
	778	# We DID take the whole basename. Fix it.
	779	$x = $1; # Repair it.
	780	}
	781	}
	782
	783	push @bit, '', -1 if $x =~ m/^\./s;
	784	# A hack to make .-initial filenames sort first, regardless of locale.
	785	# And -1 is always a sort-firster, since in the code below, there's
	786	# no allowance for filenames containing negative numbers: -1.dat
	787	# will be read as string '-' followed by number 1.
	788
	789	while(length $x) {
	790	push @bit, ($x =~ s/^(\D+)//s) ? lc($1) : '';
	791	# Secret sauce:
	792	if($x =~ s/^(\d+)//s) {
	793	if(substr($1,0,1) eq '0' and $1 != 0) {
	794	push @bit, $1 / (10 ** length($1));
	795	} else {
	796	push @bit, $1;
	797	}
	798	} else {
	799	push @bit, 0;
	800	}
	801	}
	802	}
	803
	804	DEBUG and print "$bit[0] => ", map("{$_} ", @bit), "\n";
	805
	806	\@bit;
	807	}
	808	@_;
	809	}
	810
	811	# yowza yowza yowza.
	812

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone2/perllib/cpan/Sort/Naturally.pm@ 33236

Download in other formats: