Context Navigation

Naturally.pm

Last change on this file was 33236, checked in by davidb, 5 years ago
CPAN module that provides a generalized form of alpha-numerical sorting. Useful in classifiers
File size: 22.8 KB

Line
1
2	require 5;
3	package Sort::Naturally; # Time-stamp: "2004-12-29 18:30:03 AST"
4	$VERSION = '1.03';
5	@EXPORT = ('nsort', 'ncmp');
6	require Exporter;
7	@ISA = ('Exporter');
8
9	use strict;
10	use locale;
11	use integer;
12
13	#-----------------------------------------------------------------------------
14	# constants:
BEGIN {
  # DEBUG is a compile-time constant so that "DEBUG and ..." statements are
  # optimized away when it is 0.  A DEBUG sub that already exists in this
  # package when this block runs is left alone; otherwise debugging is off.
  unless (defined &DEBUG) {
    *DEBUG = sub () { 0 };
  }
}
16
17	use Config ();
BEGIN {
  # Define MAX_INT_SIZE: a digit count such that any whole-number string
  # that long or shorter is KNOWN to be treatable as an integer.
  # It is derived from the platform's C int size as reported by Config.
  no integer;  # the power/division below must not be integer-truncated
  my $x = length(256 ** $Config::Config{'intsize'} / 2) - 1;
  if ($x < 4) {
    die "Crazy intsize: <$Config::Config{'intsize'}>";
  }

  # String-eval so the computed value becomes an inlinable constant sub.
  eval 'sub MAX_INT_SIZE () {' . $x . '}';
  die $@ if $@;

  if (DEBUG) {
    print "intsize $Config::Config{'intsize'} => MAX_INT_SIZE $x\n";
  }
}
28
# Comparator results, named from the point of view of the pair (X, Y):
sub X_FIRST () { -1 }   # X sorts before Y
sub Y_FIRST () {  1 }   # Y sorts before X

# Labels for debug output, indexed by a comparator result:
# $ORD[0] is 'same', $ORD[1] is 'swap', $ORD[-1] is 'asis'.
my @ORD = qw(same swap asis);
33
34	#-----------------------------------------------------------------------------
35	# For lack of a preprocessor:
36
my($code, $guts);

# $guts is the comparison core shared by nsort and ncmp.  It is plain Perl
# source text, spliced by maker() into each routine's template in place of
# the ~COMPARATOR~ marker, and so runs in that routine's lexical scope.
#
# It expects these lexicals to exist where it is spliced in:
#   $x, $y    the two (already lowercased) strings to compare; both are
#             consumed from the front as the comparison proceeds
#   $x2, $y2  scratch variables
#   $cmp      optional custom string comparator (code ref), or false
#   $rv       receives the result: X_FIRST (-1), 0, or Y_FIRST (1); a
#             custom $cmp's return value is stored as-is
# A result of 0 means "no difference found"; the caller applies tiebreakers.
$guts = <<'EOGUTS';  # This is the guts of both ncmp and nsort:

    if($x eq $y) {
      # trap this expensive case first, and then fall thru to tiebreaker
      $rv = 0;

    # Convoluted hack to get numerics to sort first, at string start:
    } elsif($x =~ m/^\d/s) {
      if($y =~ m/^\d/s) {
        $rv = 0;    # fall thru to normal comparison for the two numbers
      } else {
        $rv = X_FIRST;
        DEBUG > 1 and print "Numeric-initial $x trumps letter-initial $y\n";
      }
    } elsif($y =~ m/^\d/s) {
      $rv = Y_FIRST;
      DEBUG > 1 and print "Numeric-initial $y trumps letter-initial $x\n";
    } else {
      $rv = 0;
    }

    unless($rv) {
      # Normal case:
      $rv = 0;
      DEBUG and print "<$x> and <$y> compared...\n";

     Consideration:
      while(length $x and length $y) {

        DEBUG > 2 and print " <$x> and <$y>...\n";

        # First, non-numeric comparison:
        $x2 = ($x =~ m/^(\D+)/s) ? length($1) : 0;
        $y2 = ($y =~ m/^(\D+)/s) ? length($1) : 0;
        # Now make x2 the min length of the two:
        $x2 = $y2 if $x2 > $y2;
        if($x2) {
          DEBUG > 1 and printf " <%s> and <%s> lexically for length $x2...\n",
            substr($x,0,$x2), substr($y,0,$x2);

          # Compare the two text prefixes of equal length, using the
          # caller's comparator if there is one.
          # The ''. things here force a copy that seems to work around a
          # mysterious intermittent bug that 'use locale' provokes in
          # many versions of Perl.
          $rv = $cmp
            ? $cmp->(substr($x,0,$x2) . '',
                     substr($y,0,$x2) . '',
                    )
            : scalar(( substr($x,0,$x2) . '' ) cmp
                     ( substr($y,0,$x2) . '' )
                    );

          if($rv) {
            # The prefixes differ, so we have our answer.
            if(DEBUG) {
              # Diagnostic only: report when the result disagrees with a
              # plain cmp.  A custom comparator legitimately disagrees, so
              # this must never print outside of debugging.
              my $i = substr($x,0,$x2);
              my $j = substr($y,0,$x2);
              my $sv = $i cmp $j;
              print "SCREAM! on <$i><$j> -- $sv != $rv \n" unless $rv == $sv;
            }
            last;
          }

          # otherwise trim and keep going:
          substr($x,0,$x2) = '';
          substr($y,0,$x2) = '';
        }

        # Now numeric:
        #  (actually just using $x2 and $y2 as scratch)

        if( $x =~ s/^(\d+)//s ) {
          $x2 = $1;
          if( $y =~ s/^(\d+)//s ) {
            # We have two numbers here.
            DEBUG > 1 and print " <$x2> and <$1> numerically\n";
            if(length($x2) < MAX_INT_SIZE and length($1) < MAX_INT_SIZE) {
              # small numbers: we can compare happily
              last if $rv = $x2 <=> $1;
            } else {
              # ARBITRARILY large integers!

              # This saves on loss of precision that could happen
              #  with actual stringification.
              # Also, I sense that very large numbers aren't too
              #  terribly common in sort data.

              # trim leading 0's:
              ($y2 = $1) =~ s/^0+//s;
              $x2 =~ s/^0+//s;
              print " Treating $x2 and $y2 as bigint\n" if DEBUG;

              no locale; # we want the dumb cmp back.
              last if $rv = (
                # works only for non-negative whole numbers:
                length($x2) <=> length($y2)
                  # the longer the numeral, the larger the value
                or $x2 cmp $y2
                  # between equals, compare lexically!!  amazing but true.
              );
            }
          } else {
            # X is numeric but Y isn't
            $rv = Y_FIRST;
            last;
          }
        } elsif( $y =~ s/^\d+//s ) {  # we don't need to capture the substring
          $rv = X_FIRST;
          last;
        }
         # else one of them is 0-length.

       # end-while
      }
    }
EOGUTS
152
# maker($template)
# Compiles one routine from source text: every ~COMPARATOR~ marker in
# $template is replaced by the shared comparison code held in $guts, and the
# result is eval'd (which defines the sub named in the template).
# Dies if the template contains no marker, or if the generated code does
# not compile.
sub maker {
  my($code) = @_;
  $code =~ s/~COMPARATOR~/$guts/g or die "Can't find ~COMPARATOR~";
  eval $code;
  die $@ if $@;
}
159
160	##############################################################################
161
# nsort( [ \&comparator, \&lowercaser ], LIST )   -- options arrayref optional
# Returns LIST sorted "naturally".  The sub is generated by maker() from the
# template below, with the shared comparison code spliced in at the marker.
#
# Each item is decorated once as [original, key], where key is the
# lowercased item (via the custom lowercaser if given, else lc) with a \W
# run removed; pairs are compared on their keys, then undecorated.
# NOTE(review): s/\W+//s has no /g, so only the FIRST run of non-word
# characters is removed from each key -- confirm whether that is intended.
maker(<<'EONSORT');
sub nsort {
  # get options:
  my($cmp, $lc);
  ($cmp,$lc) = @{shift @_} if @_ and ref($_[0]) eq 'ARRAY';

  return @_ unless @_ > 1 or wantarray; # be clever

  my($x, $x2, $y, $y2, $rv);  # scratch vars

  # We use a Schwartzian xform to memoize the lc'ing and \W-removal

  map $_->[0],
  sort {
    if($a->[0] eq $b->[0]) { 0 }   # trap this expensive case
    else {

    $x = $a->[1];
    $y = $b->[1];

~COMPARATOR~

    # Tiebreakers...
    DEBUG > 1 and print " -<${$a}[0]> cmp <${$b}[0]> is $rv ($ORD[$rv])\n";
    $rv ||= (length($x) <=> length($y))  # shorter is always first
        || ($cmp and $cmp->($x,$y) || $cmp->($a->[0], $b->[0]))
        || ($x      cmp $y     )
        || ($a->[0] cmp $b->[0])
    ;

    DEBUG > 1 and print " <${$a}[0]> cmp <${$b}[0]> is $rv ($ORD[$rv])\n";
    $rv;
  }}

  map {;
    $x = $lc ? $lc->($_) : lc($_); # x as scratch
    $x =~ s/\W+//s;
    [$_, $x];
  }
  @_
}
EONSORT
204
205	#-----------------------------------------------------------------------------
# ncmp( [ \&comparator, \&lowercaser ], $left, $right )  -- arrayref optional
# A "natural" replacement for cmp: returns a negative, zero, or positive
# value as $left sorts before, the same as, or after $right.  The sub is
# generated by maker() from the template below, with the shared comparison
# code spliced in at the marker.
#
# With no string arguments at all, the calling package's $a and $b are
# compared, so that "sort { ncmp } @list" works from any package.
# Croaks if given any other number of strings than zero or two.
maker(<<'EONCMP');
sub ncmp {
  # The guts are basically the same as in nsort...

  # get options:
  my($cmp, $lc);
  ($cmp,$lc) = @{shift @_} if @_ and ref($_[0]) eq 'ARRAY';

  if(@_ == 0) {
    # Bit of a hack: borrow the sort variables.  sort() sets $a and $b in
    # the package it was called from, so look them up in our caller's
    # package rather than in this one.
    my $pkg = caller;
    $pkg = 'main' unless defined $pkg;
    no strict 'refs';
    @_ = (${$pkg . '::a'}, ${$pkg . '::b'});
    DEBUG > 1 and print "Hacking in <$_[0]><$_[1]>\n";
  } elsif(@_ != 2) {
    require Carp;
    Carp::croak("Not enough options to ncmp!");
  }
  my($a,$b) = @_;
  my($x, $x2, $y, $y2, $rv);  # scratch vars

  DEBUG > 1 and print "ncmp args <$a><$b>\n";
  if($a eq $b) { # trap this expensive case
    0;
  } else {
    $x = ($lc ? $lc->($a) : lc($a));
    $x =~ s/\W+//s;
    $y = ($lc ? $lc->($b) : lc($b));
    $y =~ s/\W+//s;

~COMPARATOR~


    # Tiebreakers...
    DEBUG > 1 and print " -<$a> cmp <$b> is $rv ($ORD[$rv])\n";
    $rv ||= (length($x) <=> length($y))  # shorter is always first
        || ($cmp and $cmp->($x,$y) || $cmp->($a,$b))
        || ($x cmp $y)
        || ($a cmp $b)
    ;

    DEBUG > 1 and print " <$a> cmp <$b> is $rv\n";
    $rv;
  }
}
EONCMP
249
250	# clean up:
251	undef $guts;
252	undef &maker;
253
254	#-----------------------------------------------------------------------------
255	1;
256
257	############### END OF MAIN SOURCE ###########################################
258	__END__
259
260	=head1 NAME
261
262	Sort::Naturally -- sort lexically, but sort numeral parts numerically
263
264	=head1 SYNOPSIS
265
266	@them = nsort(qw(
267	foo12a foo12z foo13a foo 14 9x foo12 fooa foolio Foolio Foo12a
268	));
269	print join(' ', @them), "\n";
270
271	Prints:
272
273	9x 14 foo fooa foolio Foolio foo12 foo12a Foo12a foo12z foo13a
274
275	(Or "foo12a" + "Foo12a" and "foolio" + "Foolio" might be
276	switched, depending on your locale.)
277
278	=head1 DESCRIPTION
279
280	This module exports two functions, C<nsort> and C<ncmp>; they are used
281	in implementing my idea of a "natural sorting" algorithm. Under natural
282	sorting, numeric substrings are compared numerically, and other
283	word-characters are compared lexically.
284
285	This is the way I define natural sorting:
286
287	=over
288
289	=item *
290
291	Non-numeric word-character substrings are sorted lexically,
292	case-insensitively: "Foo" comes between "fish" and "fowl".
293
294	=item *
295
296	Numeric substrings are sorted numerically:
297	"100" comes after "20", not before.
298
299	=item *
300
301	\W substrings (neither word-characters nor digits) are I<ignored>.
302
303	=item *
304
305	Our use of \w, \d, \D, and \W is locale-sensitive: Sort::Naturally
306	uses a C<use locale> statement.
307
308	=item *
309
310	When comparing two strings, where a numeric substring in one
311	place is I<not> up against a numeric substring in another,
312	the non-numeric always comes first. This is fudged by
313	pretending that the lack of a number substring has
314	the value -1, like so:
315
316	foo => "foo", -1
317	foobar => "foo", -1, "bar"
318	foo13 => "foo", 13,
319	foo13xyz => "foo", 13, "xyz"
320
321	That's so that "foo" will come before "foo13", which will come
322	before "foobar".
323
324	=item *
325
326	The start of a string is exceptional: leading \W (non-word,
327	non-digit)
328	components are ignored, and numbers come I<before> letters.
329
330	=item *
331
332	I define "numeric substring" just as sequences matching m/\d+/ --
333	scientific notation, commas, decimals, etc., are not seen. If
334	your data has thousands separators in numbers
335	("20,000 Leagues Under The Sea" or "20.000 lieues sous les mers"),
336	consider stripping them before feeding them to C<nsort> or
337	C<ncmp>.
338
339	=back
340
341	=head2 The nsort function
342
343	This function takes a list of strings, and returns a copy of the list,
344	sorted.
345
346	This is what most people will want to use:
347
348	@stuff = nsort(...list...);
349
350	When nsort needs to compare non-numeric substrings, it
351	uses Perl's C<cmp> operator in scope of a C<use locale>.
352	And when nsort needs to lowercase things, it uses Perl's
353	C<lc> function in scope of a C<use locale>. If you want nsort
354	to use other functions instead, you can specify them in
355	an arrayref as the first argument to nsort:
356
357	@stuff = nsort( [
358	\&string_comparator, # optional
359	\&lowercaser_function # optional
360	],
361	...list...
362	);
363
364	If you want to specify a string comparator but no lowercaser,
365	then the options list is C<[\&comparator, '']> or
366	C<[\&comparator]>. If you want to specify no string comparator
367	but a lowercaser, then the options list is
368	C<['', \&lowercaser]>.
369
370	Any comparator you specify is called as
371	C<$comparator-E<gt>($left, $right)>,
372	and, like a normal Perl C<cmp> replacement, must return
373	-1, 0, or 1 depending on whether the left argument is stringwise
374	less than, equal to, or greater than the right argument.
375
376	Any lowercaser function you specify is called as
377	C<$lowercased = $lowercaser-E<gt>($original)>. The routine
378	must not modify its C<$_[0]>.
379
380	=head2 The ncmp function
381
382	Often, when sorting non-string values like this:
383
384	@objects_sorted = sort { $a->tag cmp $b->tag } @objects;
385
386	...or even in a Schwartzian transform, like this:
387
388	@strings =
389	map $_->[0],
390	sort { $a->[1] cmp $b->[1] }
391	map { [$_, make_a_sort_key_from($_) ] }
392	@_
393	;
394
395	...you might want something that replaces not C<sort>, but C<cmp>.
396	That's what Sort::Naturally's C<ncmp> function is for. Call it with
397	the syntax C<ncmp($left,$right)> instead of C<$left cmp $right>,
398	but otherwise it's a fine replacement:
399
400	@objects_sorted = sort { ncmp($a->tag,$b->tag) } @objects;
401
402	@strings =
403	map $_->[0],
404	sort { ncmp($a->[1], $b->[1]) }
405	map { [$_, make_a_sort_key_from($_) ] }
406	@_
407	;
408
409	Just as C<nsort> can take a different string-comparator
410	and/or lowercaser, you can do the same with C<ncmp>, by passing
411	an arrayref as the first argument:
412
413	ncmp( [
414	\&string_comparator, # optional
415	\&lowercaser_function # optional
416	],
417	$left, $right
418	)
419
420	You might get string comparators from L<Sort::ArbBiLex|Sort::ArbBiLex>.
421
422	=head1 NOTES
423
424	=over
425
426	=item *
427
428	This module is not a substitute for
429	L<Sort::Versions|Sort::Versions>! If
430	you just need proper version sorting, use I<that!>
431
432	=item *
433
434	If you need something that works I<sort of> like this module's
435	functions, but not quite the same, consider scouting thru this
436	module's source code, and adapting what you see. Besides
437	the functions that actually compile in this module, after the POD,
438	there's several alternate attempts of mine at natural sorting
439	routines, which are not compiled as part of the module, but which you
440	might find useful. They should all be I<working> implementations of
441	slightly different algorithms
442	(all of them based on Martin Pool's C<nsort>) which I eventually
443	discarded in favor of my algorithm. If you are having to
444	naturally-sort I<very large> data sets, and sorting is getting
445	ridiculously slow, you might consider trying one of those
446	discarded functions -- I have a feeling they might be faster on
447	large data sets. Benchmark them on your data and see. (Unless
448	you I<need> the speed, don't bother. Hint: substitute C<sort>
449	for C<nsort> in your code, and unless your program speeds up
450	drastically, it's not the sorting that's slowing things down.
451	But if it I<is> C<nsort> that's slowing things down, consider
452	just:
453
454	if(@set >= SOME_VERY_BIG_NUMBER) {
455	no locale; # vroom vroom
456	@sorted = sort(@set); # feh, good enough
457	} elsif(@set >= SOME_BIG_NUMBER) {
458	use locale;
459	@sorted = sort(@set); # feh, good enough
460	} else {
461	# but keep it pretty for normal cases
462	@sorted = nsort(@set);
463	}
464
465	=item *
466
467	If you do adapt the routines in this module, email me; I'd
468	just be interested in hearing about it.
469
470	=item *
471
472	Thanks to the EFNet #perl people for encouraging this module,
473	especially magister and a-mused.
474
475	=back
476
477	=head1 COPYRIGHT AND DISCLAIMER
478
479	Copyright 2001, Sean M. Burke C<[email protected]>, all rights
480	reserved. This program is free software; you can redistribute it
481	and/or modify it under the same terms as Perl itself.
482
483	This program is distributed in the hope that it will be useful, but
484	without any warranty; without even the implied warranty of
485	merchantability or fitness for a particular purpose.
486
487	=head1 AUTHOR
488
489	Sean M. Burke C<[email protected]>
490
491	=cut
492
493	############ END OF DOCS ############
494
495	############################################################################
496	############################################################################
497
498	############ BEGIN OLD STUFF ############
499
500	# We can't have "use integer;", or else (5 <=> 5.1) comes out "0" !
501
502	#-----------------------------------------------------------------------------
# nsort(LIST)  -- discarded early implementation, kept for reference.
# Returns LIST sorted naturally.  Each item is decorated as
#   [original, text, number, text, number, ...]
# and items are compared pairwise: text parts with cmp, number parts
# with <=>.  An item that is entirely one number (sign, decimal point and
# exponent allowed) becomes ['original', '', number].
# Relies on the DEBUG constant defined in the main source.
sub nsort {
  return @_ if @_ < 2;   # Just to be CLEVER.

  my($x, $i);  # scratch vars

  # And now, the GREAT BIG Schwartzian transform:

  map
    $_->[0],

  sort {
    # Uses $i as the index variable, $x as the result.
    $x = 0;
    $i = 1;
    DEBUG and print "\nComparing ", map("{$_}", @$a),
                 ' : ', map("{$_}", @$b), , "...\n";

    while($i < @$a and $i < @$b) {
      DEBUG and print " comparing $i: {$a->[$i]} cmp {$b->[$i]} => ",
        $a->[$i] cmp $b->[$i], "\n";
      last if ($x = ($a->[$i] cmp $b->[$i])); # lexicographic
      ++$i;

      DEBUG and print " comparing $i: {$a->[$i]} <=> {$b->[$i]} => ",
        $a->[$i] <=> $b->[$i], "\n";
      last if ($x = ($a->[$i] <=> $b->[$i])); # numeric
      ++$i;
    }

    DEBUG and print "{$a->[0]} : {$b->[0]} is ",
      $x || (@$a <=> @$b) || 0
      ,"\n"
    ;
    $x || (@$a <=> @$b) || ($a->[0] cmp $b->[0]);
      # unless we found a result for $x in the while loop,
      #  use length as a tiebreaker, otherwise use cmp
      #  on the original string as a fallback tiebreaker.
  }

  map {
    my @bit = ($x = defined($_) ? $_ : '');

    if($x =~ m/^[+-]?(?=\d|\.\d)\d*(?:\.\d*)?(?:[Ee](?:[+-]?\d+))?\z/s) {
      # It's entirely purely numeric, so treat it specially:
      push @bit, '', $x;
    } else {
      # Consume the string.
      while(length $x) {
        push @bit, ($x =~ s/^(\D+)//s) ? lc($1) : '';
        push @bit, ($x =~ s/^(\d+)//s) ?    $1  :  0;
      }
    }
    DEBUG and print "$bit[0] => ", map("{$_} ", @bit), "\n";

    # End result: [original bit         , (text, number), (text, number), ...]
    # Minimally:  [0-length original bit,]
    # Examples:
    #    ['10'         => ''   ,  10,              ]
    #    ['fo900'      => 'fo' , 900,              ]
    #    ['foo10'      => 'foo',  10,              ]
    #    ['foo9.pl'    => 'foo',   9,   , '.pl', 0 ]
    #    ['foo32.pl'   => 'foo',  32,   , '.pl', 0 ]
    #    ['foo325.pl'  => 'foo', 325,   , '.pl', 0 ]
    #  Yes, always an ODD number of elements.

    \@bit;
  }
  @_;
}
573
574	#-----------------------------------------------------------------------------
575	# Same as before, except without the pure-number trap.
576
# nsorts(LIST)  -- discarded early implementation, kept for reference.
# Returns LIST sorted naturally.  Each item is decorated as
#   [original, text, number, text, number, ...]
# and items are compared pairwise: text parts with cmp, number parts
# with <=>.  There is no special case for items that are wholly numeric.
# Relies on the DEBUG constant defined in the main source.
sub nsorts {
  return @_ if @_ < 2;   # Just to be CLEVER.

  my($x, $i);  # scratch vars

  # And now, the GREAT BIG Schwartzian transform:

  map
    $_->[0],

  sort {
    # Uses $i as the index variable, $x as the result.
    $x = 0;
    $i = 1;
    DEBUG and print "\nComparing ", map("{$_}", @$a),
                 ' : ', map("{$_}", @$b), , "...\n";

    while($i < @$a and $i < @$b) {
      DEBUG and print " comparing $i: {$a->[$i]} cmp {$b->[$i]} => ",
        $a->[$i] cmp $b->[$i], "\n";
      last if ($x = ($a->[$i] cmp $b->[$i])); # lexicographic
      ++$i;

      DEBUG and print " comparing $i: {$a->[$i]} <=> {$b->[$i]} => ",
        $a->[$i] <=> $b->[$i], "\n";
      last if ($x = ($a->[$i] <=> $b->[$i])); # numeric
      ++$i;
    }

    DEBUG and print "{$a->[0]} : {$b->[0]} is ",
      $x || (@$a <=> @$b) || 0
      ,"\n"
    ;
    $x || (@$a <=> @$b) || ($a->[0] cmp $b->[0]);
      # unless we found a result for $x in the while loop,
      #  use length as a tiebreaker, otherwise use cmp
      #  on the original string as a fallback tiebreaker.
  }

  map {
    my @bit = ($x = defined($_) ? $_ : '');

    while(length $x) {
      push @bit, ($x =~ s/^(\D+)//s) ? lc($1) : '';
      push @bit, ($x =~ s/^(\d+)//s) ?    $1  :  0;
    }
    DEBUG and print "$bit[0] => ", map("{$_} ", @bit), "\n";

    # End result: [original bit         , (text, number), (text, number), ...]
    # Minimally:  [0-length original bit,]
    # Examples:
    #    ['10'         => ''   ,  10,              ]
    #    ['fo900'      => 'fo' , 900,              ]
    #    ['foo10'      => 'foo',  10,              ]
    #    ['foo9.pl'    => 'foo',   9,   , '.pl', 0 ]
    #    ['foo32.pl'   => 'foo',  32,   , '.pl', 0 ]
    #    ['foo325.pl'  => 'foo', 325,   , '.pl', 0 ]
    #  Yes, always an ODD number of elements.

    \@bit;
  }
  @_;
}
640
641	#-----------------------------------------------------------------------------
642	# Same as before, except for the sort-key-making
643
# nsort0(LIST)  -- discarded early implementation, kept for reference.
# Returns LIST sorted naturally.  Each item is decorated as
#   [original, text, number, text, number, ...]
# and items are compared pairwise: text parts with cmp, number parts
# with <=>.  An item that is entirely one number becomes
# ['original', '', number].  A digit run with a leading zero and a non-zero
# value is keyed as a fraction (e.g. "05" as 0.05), so it sorts below the
# whole numbers.
# Relies on the DEBUG constant defined in the main source, and must not be
# compiled under "use integer" (the fraction would be truncated).
sub nsort0 {
  return @_ if @_ < 2;   # Just to be CLEVER.

  my($x, $i);  # scratch vars

  # And now, the GREAT BIG Schwartzian transform:

  map
    $_->[0],

  sort {
    # Uses $i as the index variable, $x as the result.
    $x = 0;
    $i = 1;
    DEBUG and print "\nComparing ", map("{$_}", @$a),
                 ' : ', map("{$_}", @$b), , "...\n";

    while($i < @$a and $i < @$b) {
      DEBUG and print " comparing $i: {$a->[$i]} cmp {$b->[$i]} => ",
        $a->[$i] cmp $b->[$i], "\n";
      last if ($x = ($a->[$i] cmp $b->[$i])); # lexicographic
      ++$i;

      DEBUG and print " comparing $i: {$a->[$i]} <=> {$b->[$i]} => ",
        $a->[$i] <=> $b->[$i], "\n";
      last if ($x = ($a->[$i] <=> $b->[$i])); # numeric
      ++$i;
    }

    DEBUG and print "{$a->[0]} : {$b->[0]} is ",
      $x || (@$a <=> @$b) || 0
      ,"\n"
    ;
    $x || (@$a <=> @$b) || ($a->[0] cmp $b->[0]);
      # unless we found a result for $x in the while loop,
      #  use length as a tiebreaker, otherwise use cmp
      #  on the original string as a fallback tiebreaker.
  }

  map {
    my @bit = ($x = defined($_) ? $_ : '');

    if($x =~ m/^[+-]?(?=\d|\.\d)\d*(?:\.\d*)?(?:[Ee](?:[+-]?\d+))?\z/s) {
      # It's entirely purely numeric, so treat it specially:
      push @bit, '', $x;
    } else {
      # Consume the string.
      while(length $x) {
        push @bit, ($x =~ s/^(\D+)//s) ? lc($1) : '';
        # Secret sauce:
        if($x =~ s/^(\d+)//s) {
          if(substr($1,0,1) eq '0' and $1 != 0) {
            push @bit, $1 / (10 ** length($1));
          } else {
            push @bit, $1;
          }
        } else {
          push @bit, 0;
        }
      }
    }
    DEBUG and print "$bit[0] => ", map("{$_} ", @bit), "\n";

    \@bit;
  }
  @_;
}
711
712	#-----------------------------------------------------------------------------
713	# Like nsort0, but WITHOUT pure number handling, and WITH special treatment
714	# of pulling off extensions and version numbers.
715
# nsortf(LIST)  -- discarded early implementation aimed at filenames, kept
# for reference.  Returns LIST sorted naturally.  Each item is decorated as
#   [original, lc'd extension, version, text, number, text, number, ...]
# where "version" is a trailing VAX-style ";digits" suffix (default 0) and
# "extension" is the final ".xyz" part (default '').  The text/number pairs
# (from index 3 on) are compared first; array length, extension, version
# and the original string are successive tiebreakers.
# Relies on the DEBUG constant defined in the main source, and must not be
# compiled under "use integer" (leading-zero digit runs are keyed as
# fractions).
sub nsortf {
  return @_ if @_ < 2;   # Just to be CLEVER.

  my($x, $i);  # scratch vars

  # And now, the GREAT BIG Schwartzian transform:

  map
    $_->[0],

  sort {
    # Uses $i as the index variable, $x as the result.
    $x = 0;
    $i = 3;
    DEBUG and print "\nComparing ", map("{$_}", @$a),
                 ' : ', map("{$_}", @$b), , "...\n";

    while($i < @$a and $i < @$b) {
      DEBUG and print " comparing $i: {$a->[$i]} cmp {$b->[$i]} => ",
        $a->[$i] cmp $b->[$i], "\n";
      last if ($x = ($a->[$i] cmp $b->[$i])); # lexicographic
      ++$i;

      DEBUG and print " comparing $i: {$a->[$i]} <=> {$b->[$i]} => ",
        $a->[$i] <=> $b->[$i], "\n";
      last if ($x = ($a->[$i] <=> $b->[$i])); # numeric
      ++$i;
    }

    DEBUG and print "{$a->[0]} : {$b->[0]} is ",
      $x || (@$a <=> @$b) || 0
      ,"\n"
    ;
    $x || (@$a     <=> @$b    ) || ($a->[1] cmp $b->[1])
       || ($a->[2] <=> $b->[2]) || ($a->[0] cmp $b->[0]);
      # unless we found a result for $x in the while loop,
      #  use length as a tiebreaker, otherwise use the
      #  lc'd extension, otherwise the version, otherwise use
      #  the original string as a fallback tiebreaker.
  }

  map {
    my @bit = ( ($x = defined($_) ? $_ : ''), '',0 );

    {
      # Consume the string.

      # First, pull off any VAX-style version
      $bit[2] = $1 if $x =~ s/;(\d+)$//;

      # Then pull off any apparent extension
      if( $x !~ m/^\.+$/s and     # don't mangle ".", "..", or "..."
          $x =~ s/(\.[^\.\;]*)$//sg
          # We could try to avoid catching all-digit extensions,
          #  but I think that's getting /too/ clever.
      ) {
        # Save the extension now: the next successful match resets $1.
        $i = $1;
        if($x =~ m<[^\\\://]$>s) {
          # We didn't take the whole basename.
          $bit[1] = lc $i;
          DEBUG and print "Consuming extension \"$i\"\n";
        } else {
          # We DID take the whole basename.  Fix it.
          # NOTE(review): this keeps only the removed part and drops any
          # directory prefix still in $x -- confirm that is intended.
          $x = $i;  # Repair it.
        }
      }

      push @bit, '', -1   if $x =~ m/^\./s;
       # A hack to make .-initial filenames sort first, regardless of locale.
       # And -1 is always a sort-firster, since in the code below, there's
       # no allowance for filenames containing negative numbers: -1.dat
       # will be read as string '-' followed by number 1.

      while(length $x) {
        push @bit, ($x =~ s/^(\D+)//s) ? lc($1) : '';
        # Secret sauce:
        if($x =~ s/^(\d+)//s) {
          if(substr($1,0,1) eq '0' and $1 != 0) {
            push @bit, $1 / (10 ** length($1));
          } else {
            push @bit, $1;
          }
        } else {
          push @bit, 0;
        }
      }
    }

    DEBUG and print "$bit[0] => ", map("{$_} ", @bit), "\n";

    \@bit;
  }
  @_;
}
810
811	# yowza yowza yowza.
812

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone2/perllib/cpan/Sort/Naturally.pm

Download in other formats: