root/main/trunk/greenstone2/perllib/cpan/Sort/Naturally.pm @ 33236

Revision 33236, 22.8 KB (checked in by davidb, 14 months ago)

CPAN module that provides a generalized form of alpha-numerical sorting. Useful in classifiers

Line 
1
2require 5;
3package Sort::Naturally;  # Time-stamp: "2004-12-29 18:30:03 AST"
4$VERSION = '1.03';
5@EXPORT = ('nsort', 'ncmp');
6require Exporter;
7@ISA = ('Exporter');
8
9use strict;
10use locale;
11use integer;
12
13#-----------------------------------------------------------------------------
14# constants:
15BEGIN { *DEBUG = sub () {0} unless defined &DEBUG }
16
17use Config ();
18BEGIN {
19  # Make a constant such that if a whole-number string is that long
20  #  or shorter, we KNOW it's treatable as an integer
21  no integer;
22  my $x = length(256 ** $Config::Config{'intsize'} / 2) - 1;
23  die "Crazy intsize: <$Config::Config{'intsize'}>" if $x < 4;
24  eval 'sub MAX_INT_SIZE () {' . $x . '}';
25  die $@ if $@;
26  print "intsize $Config::Config{'intsize'} => MAX_INT_SIZE $x\n" if DEBUG;
27}
28
29sub X_FIRST () {-1}
30sub Y_FIRST () { 1}
31
32my @ORD = ('same', 'swap', 'asis');
33
34#-----------------------------------------------------------------------------
35# For lack of a preprocessor:
36
37my($code, $guts);
38$guts = <<'EOGUTS';  # This is the guts of both ncmp and nsort:
39
40    if($x eq $y) {
41      # trap this expensive case first, and then fall thru to tiebreaker
42      $rv = 0;
43
44    # Convoluted hack to get numerics to sort first, at string start:
45    } elsif($x =~ m/^\d/s) {
46      if($y =~ m/^\d/s) {
47        $rv = 0;    # fall thru to normal comparison for the two numbers
48      } else {
49        $rv = X_FIRST;
50        DEBUG > 1 and print "Numeric-initial $x trumps letter-initial $y\n";
51      }
52    } elsif($y =~ m/^\d/s) {
53      $rv = Y_FIRST;
54      DEBUG > 1 and print "Numeric-initial $y trumps letter-initial $x\n";
55    } else {
56      $rv = 0;
57    }
58
59    unless($rv) {
60      # Normal case:
61      $rv = 0;
62      DEBUG and print "<$x> and <$y> compared...\n";
63
64     Consideration:
65      while(length $x and length $y) {
66
67        DEBUG > 2 and print " <$x> and <$y>...\n";
68
69        # First, non-numeric comparison:
70        $x2 = ($x =~ m/^(\D+)/s) ? length($1) : 0;
71        $y2 = ($y =~ m/^(\D+)/s) ? length($1) : 0;
72        # Now make x2 the min length of the two:
73        $x2 = $y2 if $x2 > $y2;
74        if($x2) {
75          DEBUG > 1 and printf " <%s> and <%s> lexically for length $x2...\n",
76            substr($x,0,$x2), substr($y,0,$x2);
77          do {
78           my $i = substr($x,0,$x2);
79           my $j = substr($y,0,$x2);
80           my $sv = $i cmp $j;
81           print "SCREAM! on <$i><$j> -- $sv != $rv \n" unless $rv == $sv;
82           last;
83          }
84
85
86           if $rv =
87           # The ''. things here force a copy that seems to work around a
88           #  mysterious intermittent bug that 'use locale' provokes in
89           #  many versions of Perl.
90                   $cmp
91                   ? $cmp->(substr($x,0,$x2) . '',
92                            substr($y,0,$x2) . '',
93                           )
94                   :
95                   scalar(( substr($x,0,$x2) . '' ) cmp
96                          ( substr($y,0,$x2) . '' )
97                          )
98          ;
99          # otherwise trim and keep going:
100          substr($x,0,$x2) = '';
101          substr($y,0,$x2) = '';
102        }
103
104        # Now numeric:
105        #  (actually just using $x2 and $y2 as scratch)
106
107        if( $x =~ s/^(\d+)//s ) {
108          $x2 = $1;
109          if( $y =~ s/^(\d+)//s ) {
110            # We have two numbers here.
111            DEBUG > 1 and print " <$x2> and <$1> numerically\n";
112            if(length($x2) < MAX_INT_SIZE and length($1) < MAX_INT_SIZE) {
113              # small numbers: we can compare happily
114              last if $rv = $x2 <=> $1;
115            } else {
116              # ARBITRARILY large integers!
117
118              # This saves on loss of precision that could happen
119              #  with actual stringification.
120              # Also, I sense that very large numbers aren't too
121              #  terribly common in sort data.
122
123              # trim leading 0's:
124              ($y2 = $1) =~ s/^0+//s;
125              $x2 =~ s/^0+//s;
126              print "   Treating $x2 and $y2 as bigint\n" if DEBUG;
127
128              no locale; # we want the dumb cmp back.
129              last if $rv = (
130                 # works only for non-negative whole numbers:
131                 length($x2) <=> length($y2)
132                   # the longer the numeral, the larger the value
133                 or $x2 cmp $y2
134                   # between equals, compare lexically!!  amazing but true.
135              );
136            }
137          } else {
138            # X is numeric but Y isn't
139            $rv = Y_FIRST;
140            last;
141          }
142        } elsif( $y =~ s/^\d+//s ) {  # we don't need to capture the substring
143          $rv = X_FIRST;
144          last;
145        }
146         # else one of them is 0-length.
147
148       # end-while
149      }
150    }
151EOGUTS
152
153sub maker {
154  my $code = $_[0];
155  $code =~ s/~COMPARATOR~/$guts/g || die "Can't find ~COMPARATOR~";
156  eval $code;
157  die $@ if $@;
158}
159
160##############################################################################
161
162maker(<<'EONSORT');
163sub nsort {
164  # get options:
165  my($cmp, $lc);
166  ($cmp,$lc) = @{shift @_} if @_ and ref($_[0]) eq 'ARRAY';
167
168  return @_ unless @_ > 1 or wantarray; # be clever
169
170  my($x, $x2, $y, $y2, $rv);  # scratch vars
171
172  # We use a Schwartzian xform to memoize the lc'ing and \W-removal
173
174  map $_->[0],
175  sort {
176    if($a->[0] eq $b->[0]) { 0 }   # trap this expensive case
177    else {
178
179    $x = $a->[1];
180    $y = $b->[1];
181
182~COMPARATOR~
183
184    # Tiebreakers...
185    DEBUG > 1 and print " -<${$a}[0]> cmp <${$b}[0]> is $rv ($ORD[$rv])\n";
186    $rv ||= (length($x) <=> length($y))  # shorter is always first
187        ||  ($cmp and $cmp->($x,$y) || $cmp->($a->[0], $b->[0]))
188        ||  ($x      cmp $y     )
189        ||  ($a->[0] cmp $b->[0])
190    ;
191
192    DEBUG > 1 and print "  <${$a}[0]> cmp <${$b}[0]> is $rv ($ORD[$rv])\n";
193    $rv;
194  }}
195
196  map {;
197    $x = $lc ? $lc->($_) : lc($_); # x as scratch
198    $x =~ s/\W+//s;
199    [$_, $x];
200  }
201  @_
202}
203EONSORT
204
205#-----------------------------------------------------------------------------
206maker(<<'EONCMP');
207sub ncmp {
208  # The guts are basically the same as above...
209
210  # get options:
211  my($cmp, $lc);
212  ($cmp,$lc) = @{shift @_} if @_ and ref($_[0]) eq 'ARRAY';
213
214  if(@_ == 0) {
215    @_ = ($a, $b); # bit of a hack!
216    DEBUG > 1 and print "Hacking in <$a><$b>\n";
217  } elsif(@_ != 2) {
218    require Carp;
219    Carp::croak("Not enough options to ncmp!");
220  }
221  my($a,$b) = @_;
222  my($x, $x2, $y, $y2, $rv);  # scratch vars
223
224  DEBUG > 1 and print "ncmp args <$a><$b>\n";
225  if($a eq $b) { # trap this expensive case
226    0;
227  } else {
228    $x = ($lc ? $lc->($a) : lc($a));
229    $x =~ s/\W+//s;
230    $y = ($lc ? $lc->($b) : lc($b));
231    $y =~ s/\W+//s;
232
233~COMPARATOR~
234
235
236    # Tiebreakers...
237    DEBUG > 1 and print " -<$a> cmp <$b> is $rv ($ORD[$rv])\n";
238    $rv ||= (length($x) <=> length($y))  # shorter is always first
239        ||  ($cmp and $cmp->($x,$y) || $cmp->($a,$b))
240        ||  ($x cmp $y)
241        ||  ($a cmp $b)
242    ;
243
244    DEBUG > 1 and print "  <$a> cmp <$b> is $rv\n";
245    $rv;
246  }
247}
248EONCMP
249
250# clean up:
251undef $guts;
252undef &maker;
253
254#-----------------------------------------------------------------------------
2551;
256
257############### END OF MAIN SOURCE ###########################################
258__END__
259
260=head1 NAME
261
262Sort::Naturally -- sort lexically, but sort numeral parts numerically
263
264=head1 SYNOPSIS
265
266  @them = nsort(qw(
267   foo12a foo12z foo13a foo 14 9x foo12 fooa foolio Foolio Foo12a
268  ));
269  print join(' ', @them), "\n";
270
271Prints:
272
273  9x 14 foo fooa foolio Foolio foo12 foo12a Foo12a foo12z foo13a
274
275(Or "foo12a" + "Foo12a" and "foolio" + "Foolio" and might be
276switched, depending on your locale.)
277
278=head1 DESCRIPTION
279
280This module exports two functions, C<nsort> and C<ncmp>; they are used
281in implementing my idea of a "natural sorting" algorithm.  Under natural
282sorting, numeric substrings are compared numerically, and other
283word-characters are compared lexically.
284
285This is the way I define natural sorting:
286
287=over
288
289=item *
290
291Non-numeric word-character substrings are sorted lexically,
292case-insensitively: "Foo" comes between "fish" and "fowl".
293
294=item *
295
296Numeric substrings are sorted numerically:
297"100" comes after "20", not before.
298
299=item *
300
301\W substrings (neither words-characters nor digits) are I<ignored>.
302
303=item *
304
305Our use of \w, \d, \D, and \W is locale-sensitive:  Sort::Naturally
306uses a C<use locale> statement.
307
308=item *
309
310When comparing two strings, where a numeric substring in one
311place is I<not> up against a numeric substring in another,
312the non-numeric always comes first.  This is fudged by
313reading pretending that the lack of a number substring has
314the value -1, like so:
315
316  foo       =>  "foo",  -1
317  foobar    =>  "foo",  -1,  "bar"
318  foo13     =>  "foo",  13,
319  foo13xyz  =>  "foo",  13,  "xyz"
320
321That's so that "foo" will come before "foo13", which will come
322before "foobar".
323
324=item *
325
326The start of a string is exceptional: leading non-\W (non-word,
327non-digit)
328components are are ignored, and numbers come I<before> letters.
329
330=item *
331
332I define "numeric substring" just as sequences matching m/\d+/ --
333scientific notation, commas, decimals, etc., are not seen.  If
334your data has thousands separators in numbers
335("20,000 Leagues Under The Sea" or "20.000 lieues sous les mers"),
336consider stripping them before feeding them to C<nsort> or
337C<ncmp>.
338
339=back
340
341=head2 The nsort function
342
343This function takes a list of strings, and returns a copy of the list,
344sorted.
345
346This is what most people will want to use:
347
348  @stuff = nsort(...list...);
349
350When nsort needs to compare non-numeric substrings, it
351uses Perl's C<lc> function in scope of a <use locale>.
352And when nsort needs to lowercase things, it uses Perl's
353C<lc> function in scope of a <use locale>.  If you want nsort
354to use other functions instead, you can specify them in
355an arrayref as the first argument to nsort:
356
357  @stuff = nsort( [
358                    \&string_comparator,   # optional
359                    \&lowercaser_function  # optional
360                  ],
361                  ...list...
362                );
363
364If you want to specify a string comparator but no lowercaser,
365then the options list is C<[\&comparator, '']> or
366C<[\&comparator]>.  If you want to specify no string comparator
367but a lowercaser, then the options list is
368C<['', \&lowercaser]>.
369
370Any comparator you specify is called as
371C<$comparator-E<gt>($left, $right)>,
372and, like a normal Perl C<cmp> replacement, must return
373-1, 0, or 1 depending on whether the left argument is stringwise
374less than, equal to, or greater than the right argument.
375
376Any lowercaser function you specify is called as
377C<$lowercased = $lowercaser-E<gt>($original)>.  The routine
378must not modify its C<$_[0]>.
379
380=head2 The ncmp function
381
382Often, when sorting non-string values like this:
383
384   @objects_sorted = sort { $a->tag cmp $b->tag } @objects;
385
386...or even in a Schwartzian transform, like this:
387
388   @strings =
389     map $_->[0]
390     sort { $a->[1] cmp $b->[1] }
391     map { [$_, make_a_sort_key_from($_) ]
392     @_
393   ;
394
395...you wight want something that replaces not C<sort>, but C<cmp>.
396That's what Sort::Naturally's C<ncmp> function is for.  Call it with
397the syntax C<ncmp($left,$right)> instead of C<$left cmp $right>,
398but otherwise it's a fine replacement:
399
400   @objects_sorted = sort { ncmp($a->tag,$b->tag) } @objects;
401
402   @strings =
403     map $_->[0]
404     sort { ncmp($a->[1], $b->[1]) }
405     map { [$_, make_a_sort_key_from($_) ]
406     @_
407   ;
408
409Just as with C<nsort> can take different a string-comparator
410and/or lowercaser, you can do the same with C<ncmp>, by passing
411an arrayref as the first argument:
412
413  ncmp( [
414          \&string_comparator,   # optional
415          \&lowercaser_function  # optional
416        ],
417        $left, $right
418      )
419
420You might get string comparators from L<Sort::ArbBiLex|Sort::ArbBiLex>.
421
422=head1 NOTES
423
424=over
425
426=item *
427
428This module is not a substitute for
429L<Sort::Versions|Sort::Versions>!  If
430you just need proper version sorting, use I<that!>
431
432=item *
433
434If you need something that works I<sort of> like this module's
435functions, but not quite the same, consider scouting thru this
436module's source code, and adapting what you see.  Besides
437the functions that actually compile in this module, after the POD,
438there's several alternate attempts of mine at natural sorting
439routines, which are not compiled as part of the module, but which you
440might find useful.  They should all be I<working> implementations of
441slightly different algorithms
442(all of them based on Martin Pool's C<nsort>) which I eventually
443discarded in favor of my algorithm.  If you are having to
444naturally-sort I<very large> data sets, and sorting is getting
445ridiculously slow, you might consider trying one of those
446discarded functions -- I have a feeling they might be faster on
447large data sets.  Benchmark them on your data and see.  (Unless
448you I<need> the speed, don't bother.  Hint: substitute C<sort>
449for C<nsort> in your code, and unless your program speeds up
450drastically, it's not the sorting that's slowing things down.
451But if it I<is> C<nsort> that's slowing things down, consider
452just:
453
454      if(@set >= SOME_VERY_BIG_NUMBER) {
455        no locale; # vroom vroom
456        @sorted = sort(@set);  # feh, good enough
457      } elsif(@set >= SOME_BIG_NUMBER) {
458        use locale;
459        @sorted = sort(@set);  # feh, good enough
460      } else {
461        # but keep it pretty for normal cases
462        @sorted = nsort(@set);
463      }
464
465=item *
466
467If you do adapt the routines in this module, email me; I'd
468just be interested in hearing about it.
469
470=item *
471
472Thanks to the EFNet #perl people for encouraging this module,
473especially magister and a-mused.
474
475=back
476
477=head1 COPYRIGHT AND DISCLAIMER
478
479Copyright 2001, Sean M. Burke C<sburke@cpan.org>, all rights
480reserved.  This program is free software; you can redistribute it
481and/or modify it under the same terms as Perl itself.
482
483This program is distributed in the hope that it will be useful, but
484without any warranty; without even the implied warranty of
485merchantability or fitness for a particular purpose.
486
487=head1 AUTHOR
488
489Sean M. Burke C<sburke@cpan.org>
490
491=cut
492
493############   END OF DOCS   ############
494
495############################################################################
496############################################################################
497
498############ BEGIN OLD STUFF ############
499
500# We can't have "use integer;", or else (5 <=> 5.1) comes out "0" !
501
502#-----------------------------------------------------------------------------
503sub nsort {
504  my($cmp, $lc);
505  return @_ if @_ < 2;   # Just to be CLEVER.
506
507  my($x, $i);  # scratch vars
508
509  # And now, the GREAT BIG Schwartzian transform:
510
511  map
512    $_->[0],
513
514  sort {
515    # Uses $i as the index variable, $x as the result.
516    $x = 0;
517    $i = 1;
518    DEBUG and print "\nComparing ", map("{$_}", @$a),
519                 ' : ', map("{$_}", @$b), , "...\n";
520
521    while($i < @$a and $i < @$b) {
522      DEBUG and print "  comparing $i: {$a->[$i]} cmp {$b->[$i]} => ",
523        $a->[$i] cmp $b->[$i], "\n";
524      last if ($x = ($a->[$i] cmp $b->[$i])); # lexicographic
525      ++$i;
526
527      DEBUG and print "  comparing $i: {$a->[$i]} <=> {$b->[$i]} => ",
528        $a->[$i] <=> $b->[$i], "\n";
529      last if ($x = ($a->[$i] <=> $b->[$i])); # numeric
530      ++$i;
531    }
532
533    DEBUG and print "{$a->[0]} : {$b->[0]} is ",
534      $x || (@$a <=> @$b) || 0
535      ,"\n"
536    ;
537    $x || (@$a <=> @$b) || ($a->[0] cmp $b->[0]);
538      # unless we found a result for $x in the while loop,
539      #  use length as a tiebreaker, otherwise use cmp
540      #  on the original string as a fallback tiebreaker.
541  }
542
543  map {
544    my @bit = ($x = defined($_) ? $_ : '');
545
546    if($x =~ m/^[+-]?(?=\d|\.\d)\d*(?:\.\d*)?(?:[Ee](?:[+-]?\d+))?\z/s) {
547      # It's entirely purely numeric, so treat it specially:
548      push @bit, '', $x;
549    } else {
550      # Consume the string.
551      while(length $x) {
552        push @bit, ($x =~ s/^(\D+)//s) ? lc($1) : '';
553        push @bit, ($x =~ s/^(\d+)//s) ?    $1  :  0;
554      }
555    }
556    DEBUG and print "$bit[0] => ", map("{$_} ", @bit), "\n";
557
558    # End result: [original bit         , (text, number), (text, number), ...]
559    # Minimally:  [0-length original bit,]
560    # Examples:
561    #    ['10'         => ''   ,  10,              ]
562    #    ['fo900'      => 'fo' , 900,              ]
563    #    ['foo10'      => 'foo',  10,              ]
564    #    ['foo9.pl'    => 'foo',   9,   , '.pl', 0 ]
565    #    ['foo32.pl'   => 'foo',  32,   , '.pl', 0 ]
566    #    ['foo325.pl'  => 'foo', 325,   , '.pl', 0 ]
567    #  Yes, always an ODD number of elements.
568
569    \@bit;
570  }
571  @_;
572}
573
574#-----------------------------------------------------------------------------
575# Same as before, except without the pure-number trap.
576
577sub nsorts {
578  return @_ if @_ < 2;   # Just to be CLEVER.
579
580  my($x, $i);  # scratch vars
581
582  # And now, the GREAT BIG Schwartzian transform:
583
584  map
585    $_->[0],
586
587  sort {
588    # Uses $i as the index variable, $x as the result.
589    $x = 0;
590    $i = 1;
591    DEBUG and print "\nComparing ", map("{$_}", @$a),
592                 ' : ', map("{$_}", @$b), , "...\n";
593
594    while($i < @$a and $i < @$b) {
595      DEBUG and print "  comparing $i: {$a->[$i]} cmp {$b->[$i]} => ",
596        $a->[$i] cmp $b->[$i], "\n";
597      last if ($x = ($a->[$i] cmp $b->[$i])); # lexicographic
598      ++$i;
599
600      DEBUG and print "  comparing $i: {$a->[$i]} <=> {$b->[$i]} => ",
601        $a->[$i] <=> $b->[$i], "\n";
602      last if ($x = ($a->[$i] <=> $b->[$i])); # numeric
603      ++$i;
604    }
605
606    DEBUG and print "{$a->[0]} : {$b->[0]} is ",
607      $x || (@$a <=> @$b) || 0
608      ,"\n"
609    ;
610    $x || (@$a <=> @$b) || ($a->[0] cmp $b->[0]);
611      # unless we found a result for $x in the while loop,
612      #  use length as a tiebreaker, otherwise use cmp
613      #  on the original string as a fallback tiebreaker.
614  }
615
616  map {
617    my @bit = ($x = defined($_) ? $_ : '');
618
619    while(length $x) {
620      push @bit, ($x =~ s/^(\D+)//s) ? lc($1) : '';
621      push @bit, ($x =~ s/^(\d+)//s) ?    $1  :  0;
622    }
623    DEBUG and print "$bit[0] => ", map("{$_} ", @bit), "\n";
624
625    # End result: [original bit         , (text, number), (text, number), ...]
626    # Minimally:  [0-length original bit,]
627    # Examples:
628    #    ['10'         => ''   ,  10,              ]
629    #    ['fo900'      => 'fo' , 900,              ]
630    #    ['foo10'      => 'foo',  10,              ]
631    #    ['foo9.pl'    => 'foo',   9,   , '.pl', 0 ]
632    #    ['foo32.pl'   => 'foo',  32,   , '.pl', 0 ]
633    #    ['foo325.pl'  => 'foo', 325,   , '.pl', 0 ]
634    #  Yes, always an ODD number of elements.
635
636    \@bit;
637  }
638  @_;
639}
640
641#-----------------------------------------------------------------------------
642# Same as before, except for the sort-key-making
643
644sub nsort0 {
645  return @_ if @_ < 2;   # Just to be CLEVER.
646
647  my($x, $i);  # scratch vars
648
649  # And now, the GREAT BIG Schwartzian transform:
650
651  map
652    $_->[0],
653
654  sort {
655    # Uses $i as the index variable, $x as the result.
656    $x = 0;
657    $i = 1;
658    DEBUG and print "\nComparing ", map("{$_}", @$a),
659                 ' : ', map("{$_}", @$b), , "...\n";
660
661    while($i < @$a and $i < @$b) {
662      DEBUG and print "  comparing $i: {$a->[$i]} cmp {$b->[$i]} => ",
663        $a->[$i] cmp $b->[$i], "\n";
664      last if ($x = ($a->[$i] cmp $b->[$i])); # lexicographic
665      ++$i;
666
667      DEBUG and print "  comparing $i: {$a->[$i]} <=> {$b->[$i]} => ",
668        $a->[$i] <=> $b->[$i], "\n";
669      last if ($x = ($a->[$i] <=> $b->[$i])); # numeric
670      ++$i;
671    }
672
673    DEBUG and print "{$a->[0]} : {$b->[0]} is ",
674      $x || (@$a <=> @$b) || 0
675      ,"\n"
676    ;
677    $x || (@$a <=> @$b) || ($a->[0] cmp $b->[0]);
678      # unless we found a result for $x in the while loop,
679      #  use length as a tiebreaker, otherwise use cmp
680      #  on the original string as a fallback tiebreaker.
681  }
682
683  map {
684    my @bit = ($x = defined($_) ? $_ : '');
685
686    if($x =~ m/^[+-]?(?=\d|\.\d)\d*(?:\.\d*)?(?:[Ee](?:[+-]?\d+))?\z/s) {
687      # It's entirely purely numeric, so treat it specially:
688      push @bit, '', $x;
689    } else {
690      # Consume the string.
691      while(length $x) {
692        push @bit, ($x =~ s/^(\D+)//s) ? lc($1) : '';
693        # Secret sauce:
694        if($x =~ s/^(\d+)//s) {
695          if(substr($1,0,1) eq '0' and $1 != 0) {
696            push @bit, $1 / (10 ** length($1));
697          } else {
698            push @bit, $1;
699          }
700        } else {
701          push @bit, 0;
702        }
703      }
704    }
705    DEBUG and print "$bit[0] => ", map("{$_} ", @bit), "\n";
706
707    \@bit;
708  }
709  @_;
710}
711
712#-----------------------------------------------------------------------------
713# Like nsort0, but WITHOUT pure number handling, and WITH special treatment
714# of pulling off extensions and version numbers.
715
716sub nsortf {
717  return @_ if @_ < 2;   # Just to be CLEVER.
718
719  my($x, $i);  # scratch vars
720
721  # And now, the GREAT BIG Schwartzian transform:
722
723  map
724    $_->[0],
725
726  sort {
727    # Uses $i as the index variable, $x as the result.
728    $x = 0;
729    $i = 3;
730    DEBUG and print "\nComparing ", map("{$_}", @$a),
731                 ' : ', map("{$_}", @$b), , "...\n";
732
733    while($i < @$a and $i < @$b) {
734      DEBUG and print "  comparing $i: {$a->[$i]} cmp {$b->[$i]} => ",
735        $a->[$i] cmp $b->[$i], "\n";
736      last if ($x = ($a->[$i] cmp $b->[$i])); # lexicographic
737      ++$i;
738
739      DEBUG and print "  comparing $i: {$a->[$i]} <=> {$b->[$i]} => ",
740        $a->[$i] <=> $b->[$i], "\n";
741      last if ($x = ($a->[$i] <=> $b->[$i])); # numeric
742      ++$i;
743    }
744
745    DEBUG and print "{$a->[0]} : {$b->[0]} is ",
746      $x || (@$a <=> @$b) || 0
747      ,"\n"
748    ;
749    $x || (@$a     <=> @$b    ) || ($a->[1] cmp $b->[1])
750       || ($a->[2] <=> $b->[2]) || ($a->[0] cmp $b->[0]);
751      # unless we found a result for $x in the while loop,
752      #  use length as a tiebreaker, otherwise use the
753      #  lc'd extension, otherwise the verison, otherwise use
754      #  the original string as a fallback tiebreaker.
755  }
756
757  map {
758    my @bit = ( ($x = defined($_) ? $_ : ''), '',0 );
759
760    {
761      # Consume the string.
762
763      # First, pull off any VAX-style version
764      $bit[2] = $1 if $x =~ s/;(\d+)$//;
765
766      # Then pull off any apparent extension
767      if( $x !~ m/^\.+$/s and     # don't mangle ".", "..", or "..."
768          $x =~ s/(\.[^\.\;]*)$//sg
769          # We could try to avoid catching all-digit extensions,
770          #  but I think that's getting /too/ clever.
771      ) {
772        $i = $1;
773        if($x =~ m<[^\\\://]$>s) {
774          # We didn't take the whole basename.
775          $bit[1] = lc $i;
776          DEBUG and print "Consuming extension \"$1\"\n";
777        } else {
778          # We DID take the whole basename.  Fix it.
779          $x = $1;  # Repair it.
780        }
781      }
782
783      push @bit, '', -1   if $x =~ m/^\./s;
784       # A hack to make .-initial filenames sort first, regardless of locale.
785       # And -1 is always a sort-firster, since in the code below, there's
786       # no allowance for filenames containing negative numbers: -1.dat
787       # will be read as string '-' followed by number 1.
788
789      while(length $x) {
790        push @bit, ($x =~ s/^(\D+)//s) ? lc($1) : '';
791        # Secret sauce:
792        if($x =~ s/^(\d+)//s) {
793          if(substr($1,0,1) eq '0' and $1 != 0) {
794            push @bit, $1 / (10 ** length($1));
795          } else {
796            push @bit, $1;
797          }
798        } else {
799          push @bit, 0;
800        }
801      }
802    }
803
804    DEBUG and print "$bit[0] => ", map("{$_} ", @bit), "\n";
805
806    \@bit;
807  }
808  @_;
809}
810
811# yowza yowza yowza.
812
Note: See TracBrowser for help on using the browser.