[33236] | 1 |
|
---|
| 2 | require 5;
|
---|
| 3 | package Sort::Naturally; # Time-stamp: "2004-12-29 18:30:03 AST"
|
---|
| 4 | $VERSION = '1.03';
|
---|
| 5 | @EXPORT = ('nsort', 'ncmp');
|
---|
| 6 | require Exporter;
|
---|
| 7 | @ISA = ('Exporter');
|
---|
| 8 |
|
---|
| 9 | use strict;
|
---|
| 10 | use locale;
|
---|
| 11 | use integer;
|
---|
| 12 |
|
---|
| 13 | #-----------------------------------------------------------------------------
|
---|
| 14 | # constants:
|
---|
| 15 | BEGIN { *DEBUG = sub () {0} unless defined &DEBUG }
|
---|
| 16 |
|
---|
| 17 | use Config ();
|
---|
| 18 | BEGIN {
|
---|
| 19 | # Make a constant such that if a whole-number string is that long
|
---|
| 20 | # or shorter, we KNOW it's treatable as an integer
|
---|
| 21 | no integer;
|
---|
| 22 | my $x = length(256 ** $Config::Config{'intsize'} / 2) - 1;
|
---|
| 23 | die "Crazy intsize: <$Config::Config{'intsize'}>" if $x < 4;
|
---|
| 24 | eval 'sub MAX_INT_SIZE () {' . $x . '}';
|
---|
| 25 | die $@ if $@;
|
---|
| 26 | print "intsize $Config::Config{'intsize'} => MAX_INT_SIZE $x\n" if DEBUG;
|
---|
| 27 | }
|
---|
| 28 |
|
---|
| 29 | sub X_FIRST () {-1}
|
---|
| 30 | sub Y_FIRST () { 1}
|
---|
| 31 |
|
---|
| 32 | my @ORD = ('same', 'swap', 'asis');
|
---|
| 33 |
|
---|
| 34 | #-----------------------------------------------------------------------------
|
---|
| 35 | # For lack of a preprocessor:
|
---|
| 36 |
|
---|
| 37 | my($code, $guts);
|
---|
| 38 | $guts = <<'EOGUTS'; # This is the guts of both ncmp and nsort:
|
---|
| 39 |
|
---|
| 40 | if($x eq $y) {
|
---|
| 41 | # trap this expensive case first, and then fall thru to tiebreaker
|
---|
| 42 | $rv = 0;
|
---|
| 43 |
|
---|
| 44 | # Convoluted hack to get numerics to sort first, at string start:
|
---|
| 45 | } elsif($x =~ m/^\d/s) {
|
---|
| 46 | if($y =~ m/^\d/s) {
|
---|
| 47 | $rv = 0; # fall thru to normal comparison for the two numbers
|
---|
| 48 | } else {
|
---|
| 49 | $rv = X_FIRST;
|
---|
| 50 | DEBUG > 1 and print "Numeric-initial $x trumps letter-initial $y\n";
|
---|
| 51 | }
|
---|
| 52 | } elsif($y =~ m/^\d/s) {
|
---|
| 53 | $rv = Y_FIRST;
|
---|
| 54 | DEBUG > 1 and print "Numeric-initial $y trumps letter-initial $x\n";
|
---|
| 55 | } else {
|
---|
| 56 | $rv = 0;
|
---|
| 57 | }
|
---|
| 58 |
|
---|
| 59 | unless($rv) {
|
---|
| 60 | # Normal case:
|
---|
| 61 | $rv = 0;
|
---|
| 62 | DEBUG and print "<$x> and <$y> compared...\n";
|
---|
| 63 |
|
---|
| 64 | Consideration:
|
---|
| 65 | while(length $x and length $y) {
|
---|
| 66 |
|
---|
| 67 | DEBUG > 2 and print " <$x> and <$y>...\n";
|
---|
| 68 |
|
---|
| 69 | # First, non-numeric comparison:
|
---|
| 70 | $x2 = ($x =~ m/^(\D+)/s) ? length($1) : 0;
|
---|
| 71 | $y2 = ($y =~ m/^(\D+)/s) ? length($1) : 0;
|
---|
| 72 | # Now make x2 the min length of the two:
|
---|
| 73 | $x2 = $y2 if $x2 > $y2;
|
---|
| 74 | if($x2) {
|
---|
| 75 | DEBUG > 1 and printf " <%s> and <%s> lexically for length $x2...\n",
|
---|
| 76 | substr($x,0,$x2), substr($y,0,$x2);
|
---|
| 77 | do {
|
---|
| 78 | my $i = substr($x,0,$x2);
|
---|
| 79 | my $j = substr($y,0,$x2);
|
---|
| 80 | my $sv = $i cmp $j;
|
---|
| 81 | print "SCREAM! on <$i><$j> -- $sv != $rv \n" unless $rv == $sv;
|
---|
| 82 | last;
|
---|
| 83 | }
|
---|
| 84 |
|
---|
| 85 |
|
---|
| 86 | if $rv =
|
---|
| 87 | # The ''. things here force a copy that seems to work around a
|
---|
| 88 | # mysterious intermittent bug that 'use locale' provokes in
|
---|
| 89 | # many versions of Perl.
|
---|
| 90 | $cmp
|
---|
| 91 | ? $cmp->(substr($x,0,$x2) . '',
|
---|
| 92 | substr($y,0,$x2) . '',
|
---|
| 93 | )
|
---|
| 94 | :
|
---|
| 95 | scalar(( substr($x,0,$x2) . '' ) cmp
|
---|
| 96 | ( substr($y,0,$x2) . '' )
|
---|
| 97 | )
|
---|
| 98 | ;
|
---|
| 99 | # otherwise trim and keep going:
|
---|
| 100 | substr($x,0,$x2) = '';
|
---|
| 101 | substr($y,0,$x2) = '';
|
---|
| 102 | }
|
---|
| 103 |
|
---|
| 104 | # Now numeric:
|
---|
| 105 | # (actually just using $x2 and $y2 as scratch)
|
---|
| 106 |
|
---|
| 107 | if( $x =~ s/^(\d+)//s ) {
|
---|
| 108 | $x2 = $1;
|
---|
| 109 | if( $y =~ s/^(\d+)//s ) {
|
---|
| 110 | # We have two numbers here.
|
---|
| 111 | DEBUG > 1 and print " <$x2> and <$1> numerically\n";
|
---|
| 112 | if(length($x2) < MAX_INT_SIZE and length($1) < MAX_INT_SIZE) {
|
---|
| 113 | # small numbers: we can compare happily
|
---|
| 114 | last if $rv = $x2 <=> $1;
|
---|
| 115 | } else {
|
---|
| 116 | # ARBITRARILY large integers!
|
---|
| 117 |
|
---|
| 118 | # This saves on loss of precision that could happen
|
---|
| 119 | # with actual stringification.
|
---|
| 120 | # Also, I sense that very large numbers aren't too
|
---|
| 121 | # terribly common in sort data.
|
---|
| 122 |
|
---|
| 123 | # trim leading 0's:
|
---|
| 124 | ($y2 = $1) =~ s/^0+//s;
|
---|
| 125 | $x2 =~ s/^0+//s;
|
---|
| 126 | print " Treating $x2 and $y2 as bigint\n" if DEBUG;
|
---|
| 127 |
|
---|
| 128 | no locale; # we want the dumb cmp back.
|
---|
| 129 | last if $rv = (
|
---|
| 130 | # works only for non-negative whole numbers:
|
---|
| 131 | length($x2) <=> length($y2)
|
---|
| 132 | # the longer the numeral, the larger the value
|
---|
| 133 | or $x2 cmp $y2
|
---|
| 134 | # between equals, compare lexically!! amazing but true.
|
---|
| 135 | );
|
---|
| 136 | }
|
---|
| 137 | } else {
|
---|
| 138 | # X is numeric but Y isn't
|
---|
| 139 | $rv = Y_FIRST;
|
---|
| 140 | last;
|
---|
| 141 | }
|
---|
| 142 | } elsif( $y =~ s/^\d+//s ) { # we don't need to capture the substring
|
---|
| 143 | $rv = X_FIRST;
|
---|
| 144 | last;
|
---|
| 145 | }
|
---|
| 146 | # else one of them is 0-length.
|
---|
| 147 |
|
---|
| 148 | # end-while
|
---|
| 149 | }
|
---|
| 150 | }
|
---|
| 151 | EOGUTS
|
---|
| 152 |
|
---|
| 153 | sub maker {
|
---|
| 154 | my $code = $_[0];
|
---|
| 155 | $code =~ s/~COMPARATOR~/$guts/g || die "Can't find ~COMPARATOR~";
|
---|
| 156 | eval $code;
|
---|
| 157 | die $@ if $@;
|
---|
| 158 | }
|
---|
| 159 |
|
---|
| 160 | ##############################################################################
|
---|
| 161 |
|
---|
| 162 | maker(<<'EONSORT');
|
---|
| 163 | sub nsort {
|
---|
| 164 | # get options:
|
---|
| 165 | my($cmp, $lc);
|
---|
| 166 | ($cmp,$lc) = @{shift @_} if @_ and ref($_[0]) eq 'ARRAY';
|
---|
| 167 |
|
---|
| 168 | return @_ unless @_ > 1 or wantarray; # be clever
|
---|
| 169 |
|
---|
| 170 | my($x, $x2, $y, $y2, $rv); # scratch vars
|
---|
| 171 |
|
---|
| 172 | # We use a Schwartzian xform to memoize the lc'ing and \W-removal
|
---|
| 173 |
|
---|
| 174 | map $_->[0],
|
---|
| 175 | sort {
|
---|
| 176 | if($a->[0] eq $b->[0]) { 0 } # trap this expensive case
|
---|
| 177 | else {
|
---|
| 178 |
|
---|
| 179 | $x = $a->[1];
|
---|
| 180 | $y = $b->[1];
|
---|
| 181 |
|
---|
| 182 | ~COMPARATOR~
|
---|
| 183 |
|
---|
| 184 | # Tiebreakers...
|
---|
| 185 | DEBUG > 1 and print " -<${$a}[0]> cmp <${$b}[0]> is $rv ($ORD[$rv])\n";
|
---|
| 186 | $rv ||= (length($x) <=> length($y)) # shorter is always first
|
---|
| 187 | || ($cmp and $cmp->($x,$y) || $cmp->($a->[0], $b->[0]))
|
---|
| 188 | || ($x cmp $y )
|
---|
| 189 | || ($a->[0] cmp $b->[0])
|
---|
| 190 | ;
|
---|
| 191 |
|
---|
| 192 | DEBUG > 1 and print " <${$a}[0]> cmp <${$b}[0]> is $rv ($ORD[$rv])\n";
|
---|
| 193 | $rv;
|
---|
| 194 | }}
|
---|
| 195 |
|
---|
| 196 | map {;
|
---|
| 197 | $x = $lc ? $lc->($_) : lc($_); # x as scratch
|
---|
| 198 | $x =~ s/\W+//s;
|
---|
| 199 | [$_, $x];
|
---|
| 200 | }
|
---|
| 201 | @_
|
---|
| 202 | }
|
---|
| 203 | EONSORT
|
---|
| 204 |
|
---|
| 205 | #-----------------------------------------------------------------------------
|
---|
| 206 | maker(<<'EONCMP');
|
---|
| 207 | sub ncmp {
|
---|
| 208 | # The guts are basically the same as above...
|
---|
| 209 |
|
---|
| 210 | # get options:
|
---|
| 211 | my($cmp, $lc);
|
---|
| 212 | ($cmp,$lc) = @{shift @_} if @_ and ref($_[0]) eq 'ARRAY';
|
---|
| 213 |
|
---|
| 214 | if(@_ == 0) {
|
---|
| 215 | @_ = ($a, $b); # bit of a hack!
|
---|
| 216 | DEBUG > 1 and print "Hacking in <$a><$b>\n";
|
---|
| 217 | } elsif(@_ != 2) {
|
---|
| 218 | require Carp;
|
---|
| 219 | Carp::croak("Not enough options to ncmp!");
|
---|
| 220 | }
|
---|
| 221 | my($a,$b) = @_;
|
---|
| 222 | my($x, $x2, $y, $y2, $rv); # scratch vars
|
---|
| 223 |
|
---|
| 224 | DEBUG > 1 and print "ncmp args <$a><$b>\n";
|
---|
| 225 | if($a eq $b) { # trap this expensive case
|
---|
| 226 | 0;
|
---|
| 227 | } else {
|
---|
| 228 | $x = ($lc ? $lc->($a) : lc($a));
|
---|
| 229 | $x =~ s/\W+//s;
|
---|
| 230 | $y = ($lc ? $lc->($b) : lc($b));
|
---|
| 231 | $y =~ s/\W+//s;
|
---|
| 232 |
|
---|
| 233 | ~COMPARATOR~
|
---|
| 234 |
|
---|
| 235 |
|
---|
| 236 | # Tiebreakers...
|
---|
| 237 | DEBUG > 1 and print " -<$a> cmp <$b> is $rv ($ORD[$rv])\n";
|
---|
| 238 | $rv ||= (length($x) <=> length($y)) # shorter is always first
|
---|
| 239 | || ($cmp and $cmp->($x,$y) || $cmp->($a,$b))
|
---|
| 240 | || ($x cmp $y)
|
---|
| 241 | || ($a cmp $b)
|
---|
| 242 | ;
|
---|
| 243 |
|
---|
| 244 | DEBUG > 1 and print " <$a> cmp <$b> is $rv\n";
|
---|
| 245 | $rv;
|
---|
| 246 | }
|
---|
| 247 | }
|
---|
| 248 | EONCMP
|
---|
| 249 |
|
---|
| 250 | # clean up:
|
---|
| 251 | undef $guts;
|
---|
| 252 | undef &maker;
|
---|
| 253 |
|
---|
| 254 | #-----------------------------------------------------------------------------
|
---|
| 255 | 1;
|
---|
| 256 |
|
---|
| 257 | ############### END OF MAIN SOURCE ###########################################
|
---|
| 258 | __END__
|
---|
| 259 |
|
---|
| 260 | =head1 NAME
|
---|
| 261 |
|
---|
| 262 | Sort::Naturally -- sort lexically, but sort numeral parts numerically
|
---|
| 263 |
|
---|
| 264 | =head1 SYNOPSIS
|
---|
| 265 |
|
---|
| 266 | @them = nsort(qw(
|
---|
| 267 | foo12a foo12z foo13a foo 14 9x foo12 fooa foolio Foolio Foo12a
|
---|
| 268 | ));
|
---|
| 269 | print join(' ', @them), "\n";
|
---|
| 270 |
|
---|
| 271 | Prints:
|
---|
| 272 |
|
---|
| 273 | 9x 14 foo fooa foolio Foolio foo12 foo12a Foo12a foo12z foo13a
|
---|
| 274 |
|
---|
| 275 | (Or "foo12a" + "Foo12a" and "foolio" + "Foolio" and might be
|
---|
| 276 | switched, depending on your locale.)
|
---|
| 277 |
|
---|
| 278 | =head1 DESCRIPTION
|
---|
| 279 |
|
---|
| 280 | This module exports two functions, C<nsort> and C<ncmp>; they are used
|
---|
| 281 | in implementing my idea of a "natural sorting" algorithm. Under natural
|
---|
| 282 | sorting, numeric substrings are compared numerically, and other
|
---|
| 283 | word-characters are compared lexically.
|
---|
| 284 |
|
---|
| 285 | This is the way I define natural sorting:
|
---|
| 286 |
|
---|
| 287 | =over
|
---|
| 288 |
|
---|
| 289 | =item *
|
---|
| 290 |
|
---|
| 291 | Non-numeric word-character substrings are sorted lexically,
|
---|
| 292 | case-insensitively: "Foo" comes between "fish" and "fowl".
|
---|
| 293 |
|
---|
| 294 | =item *
|
---|
| 295 |
|
---|
| 296 | Numeric substrings are sorted numerically:
|
---|
| 297 | "100" comes after "20", not before.
|
---|
| 298 |
|
---|
| 299 | =item *
|
---|
| 300 |
|
---|
| 301 | \W substrings (neither words-characters nor digits) are I<ignored>.
|
---|
| 302 |
|
---|
| 303 | =item *
|
---|
| 304 |
|
---|
| 305 | Our use of \w, \d, \D, and \W is locale-sensitive: Sort::Naturally
|
---|
| 306 | uses a C<use locale> statement.
|
---|
| 307 |
|
---|
| 308 | =item *
|
---|
| 309 |
|
---|
| 310 | When comparing two strings, where a numeric substring in one
|
---|
| 311 | place is I<not> up against a numeric substring in another,
|
---|
| 312 | the non-numeric always comes first. This is fudged by
|
---|
| 313 | reading pretending that the lack of a number substring has
|
---|
| 314 | the value -1, like so:
|
---|
| 315 |
|
---|
| 316 | foo => "foo", -1
|
---|
| 317 | foobar => "foo", -1, "bar"
|
---|
| 318 | foo13 => "foo", 13,
|
---|
| 319 | foo13xyz => "foo", 13, "xyz"
|
---|
| 320 |
|
---|
| 321 | That's so that "foo" will come before "foo13", which will come
|
---|
| 322 | before "foobar".
|
---|
| 323 |
|
---|
| 324 | =item *
|
---|
| 325 |
|
---|
| 326 | The start of a string is exceptional: leading non-\W (non-word,
|
---|
| 327 | non-digit)
|
---|
| 328 | components are are ignored, and numbers come I<before> letters.
|
---|
| 329 |
|
---|
| 330 | =item *
|
---|
| 331 |
|
---|
| 332 | I define "numeric substring" just as sequences matching m/\d+/ --
|
---|
| 333 | scientific notation, commas, decimals, etc., are not seen. If
|
---|
| 334 | your data has thousands separators in numbers
|
---|
| 335 | ("20,000 Leagues Under The Sea" or "20.000 lieues sous les mers"),
|
---|
| 336 | consider stripping them before feeding them to C<nsort> or
|
---|
| 337 | C<ncmp>.
|
---|
| 338 |
|
---|
| 339 | =back
|
---|
| 340 |
|
---|
| 341 | =head2 The nsort function
|
---|
| 342 |
|
---|
| 343 | This function takes a list of strings, and returns a copy of the list,
|
---|
| 344 | sorted.
|
---|
| 345 |
|
---|
| 346 | This is what most people will want to use:
|
---|
| 347 |
|
---|
| 348 | @stuff = nsort(...list...);
|
---|
| 349 |
|
---|
| 350 | When nsort needs to compare non-numeric substrings, it
|
---|
| 351 | uses Perl's C<lc> function in scope of a <use locale>.
|
---|
| 352 | And when nsort needs to lowercase things, it uses Perl's
|
---|
| 353 | C<lc> function in scope of a <use locale>. If you want nsort
|
---|
| 354 | to use other functions instead, you can specify them in
|
---|
| 355 | an arrayref as the first argument to nsort:
|
---|
| 356 |
|
---|
| 357 | @stuff = nsort( [
|
---|
| 358 | \&string_comparator, # optional
|
---|
| 359 | \&lowercaser_function # optional
|
---|
| 360 | ],
|
---|
| 361 | ...list...
|
---|
| 362 | );
|
---|
| 363 |
|
---|
| 364 | If you want to specify a string comparator but no lowercaser,
|
---|
| 365 | then the options list is C<[\&comparator, '']> or
|
---|
| 366 | C<[\&comparator]>. If you want to specify no string comparator
|
---|
| 367 | but a lowercaser, then the options list is
|
---|
| 368 | C<['', \&lowercaser]>.
|
---|
| 369 |
|
---|
| 370 | Any comparator you specify is called as
|
---|
| 371 | C<$comparator-E<gt>($left, $right)>,
|
---|
| 372 | and, like a normal Perl C<cmp> replacement, must return
|
---|
| 373 | -1, 0, or 1 depending on whether the left argument is stringwise
|
---|
| 374 | less than, equal to, or greater than the right argument.
|
---|
| 375 |
|
---|
| 376 | Any lowercaser function you specify is called as
|
---|
| 377 | C<$lowercased = $lowercaser-E<gt>($original)>. The routine
|
---|
| 378 | must not modify its C<$_[0]>.
|
---|
| 379 |
|
---|
| 380 | =head2 The ncmp function
|
---|
| 381 |
|
---|
| 382 | Often, when sorting non-string values like this:
|
---|
| 383 |
|
---|
| 384 | @objects_sorted = sort { $a->tag cmp $b->tag } @objects;
|
---|
| 385 |
|
---|
| 386 | ...or even in a Schwartzian transform, like this:
|
---|
| 387 |
|
---|
| 388 | @strings =
|
---|
| 389 | map $_->[0]
|
---|
| 390 | sort { $a->[1] cmp $b->[1] }
|
---|
| 391 | map { [$_, make_a_sort_key_from($_) ]
|
---|
| 392 | @_
|
---|
| 393 | ;
|
---|
| 394 |
|
---|
| 395 | ...you wight want something that replaces not C<sort>, but C<cmp>.
|
---|
| 396 | That's what Sort::Naturally's C<ncmp> function is for. Call it with
|
---|
| 397 | the syntax C<ncmp($left,$right)> instead of C<$left cmp $right>,
|
---|
| 398 | but otherwise it's a fine replacement:
|
---|
| 399 |
|
---|
| 400 | @objects_sorted = sort { ncmp($a->tag,$b->tag) } @objects;
|
---|
| 401 |
|
---|
| 402 | @strings =
|
---|
| 403 | map $_->[0]
|
---|
| 404 | sort { ncmp($a->[1], $b->[1]) }
|
---|
| 405 | map { [$_, make_a_sort_key_from($_) ]
|
---|
| 406 | @_
|
---|
| 407 | ;
|
---|
| 408 |
|
---|
| 409 | Just as with C<nsort> can take different a string-comparator
|
---|
| 410 | and/or lowercaser, you can do the same with C<ncmp>, by passing
|
---|
| 411 | an arrayref as the first argument:
|
---|
| 412 |
|
---|
| 413 | ncmp( [
|
---|
| 414 | \&string_comparator, # optional
|
---|
| 415 | \&lowercaser_function # optional
|
---|
| 416 | ],
|
---|
| 417 | $left, $right
|
---|
| 418 | )
|
---|
| 419 |
|
---|
| 420 | You might get string comparators from L<Sort::ArbBiLex|Sort::ArbBiLex>.
|
---|
| 421 |
|
---|
| 422 | =head1 NOTES
|
---|
| 423 |
|
---|
| 424 | =over
|
---|
| 425 |
|
---|
| 426 | =item *
|
---|
| 427 |
|
---|
| 428 | This module is not a substitute for
|
---|
| 429 | L<Sort::Versions|Sort::Versions>! If
|
---|
| 430 | you just need proper version sorting, use I<that!>
|
---|
| 431 |
|
---|
| 432 | =item *
|
---|
| 433 |
|
---|
| 434 | If you need something that works I<sort of> like this module's
|
---|
| 435 | functions, but not quite the same, consider scouting thru this
|
---|
| 436 | module's source code, and adapting what you see. Besides
|
---|
| 437 | the functions that actually compile in this module, after the POD,
|
---|
| 438 | there's several alternate attempts of mine at natural sorting
|
---|
| 439 | routines, which are not compiled as part of the module, but which you
|
---|
| 440 | might find useful. They should all be I<working> implementations of
|
---|
| 441 | slightly different algorithms
|
---|
| 442 | (all of them based on Martin Pool's C<nsort>) which I eventually
|
---|
| 443 | discarded in favor of my algorithm. If you are having to
|
---|
| 444 | naturally-sort I<very large> data sets, and sorting is getting
|
---|
| 445 | ridiculously slow, you might consider trying one of those
|
---|
| 446 | discarded functions -- I have a feeling they might be faster on
|
---|
| 447 | large data sets. Benchmark them on your data and see. (Unless
|
---|
| 448 | you I<need> the speed, don't bother. Hint: substitute C<sort>
|
---|
| 449 | for C<nsort> in your code, and unless your program speeds up
|
---|
| 450 | drastically, it's not the sorting that's slowing things down.
|
---|
| 451 | But if it I<is> C<nsort> that's slowing things down, consider
|
---|
| 452 | just:
|
---|
| 453 |
|
---|
| 454 | if(@set >= SOME_VERY_BIG_NUMBER) {
|
---|
| 455 | no locale; # vroom vroom
|
---|
| 456 | @sorted = sort(@set); # feh, good enough
|
---|
| 457 | } elsif(@set >= SOME_BIG_NUMBER) {
|
---|
| 458 | use locale;
|
---|
| 459 | @sorted = sort(@set); # feh, good enough
|
---|
| 460 | } else {
|
---|
| 461 | # but keep it pretty for normal cases
|
---|
| 462 | @sorted = nsort(@set);
|
---|
| 463 | }
|
---|
| 464 |
|
---|
| 465 | =item *
|
---|
| 466 |
|
---|
| 467 | If you do adapt the routines in this module, email me; I'd
|
---|
| 468 | just be interested in hearing about it.
|
---|
| 469 |
|
---|
| 470 | =item *
|
---|
| 471 |
|
---|
| 472 | Thanks to the EFNet #perl people for encouraging this module,
|
---|
| 473 | especially magister and a-mused.
|
---|
| 474 |
|
---|
| 475 | =back
|
---|
| 476 |
|
---|
| 477 | =head1 COPYRIGHT AND DISCLAIMER
|
---|
| 478 |
|
---|
| 479 | Copyright 2001, Sean M. Burke C<[email protected]>, all rights
|
---|
| 480 | reserved. This program is free software; you can redistribute it
|
---|
| 481 | and/or modify it under the same terms as Perl itself.
|
---|
| 482 |
|
---|
| 483 | This program is distributed in the hope that it will be useful, but
|
---|
| 484 | without any warranty; without even the implied warranty of
|
---|
| 485 | merchantability or fitness for a particular purpose.
|
---|
| 486 |
|
---|
| 487 | =head1 AUTHOR
|
---|
| 488 |
|
---|
| 489 | Sean M. Burke C<[email protected]>
|
---|
| 490 |
|
---|
| 491 | =cut
|
---|
| 492 |
|
---|
| 493 | ############ END OF DOCS ############
|
---|
| 494 |
|
---|
| 495 | ############################################################################
|
---|
| 496 | ############################################################################
|
---|
| 497 |
|
---|
| 498 | ############ BEGIN OLD STUFF ############
|
---|
| 499 |
|
---|
| 500 | # We can't have "use integer;", or else (5 <=> 5.1) comes out "0" !
|
---|
| 501 |
|
---|
| 502 | #-----------------------------------------------------------------------------
|
---|
| 503 | sub nsort {
|
---|
| 504 | my($cmp, $lc);
|
---|
| 505 | return @_ if @_ < 2; # Just to be CLEVER.
|
---|
| 506 |
|
---|
| 507 | my($x, $i); # scratch vars
|
---|
| 508 |
|
---|
| 509 | # And now, the GREAT BIG Schwartzian transform:
|
---|
| 510 |
|
---|
| 511 | map
|
---|
| 512 | $_->[0],
|
---|
| 513 |
|
---|
| 514 | sort {
|
---|
| 515 | # Uses $i as the index variable, $x as the result.
|
---|
| 516 | $x = 0;
|
---|
| 517 | $i = 1;
|
---|
| 518 | DEBUG and print "\nComparing ", map("{$_}", @$a),
|
---|
| 519 | ' : ', map("{$_}", @$b), , "...\n";
|
---|
| 520 |
|
---|
| 521 | while($i < @$a and $i < @$b) {
|
---|
| 522 | DEBUG and print " comparing $i: {$a->[$i]} cmp {$b->[$i]} => ",
|
---|
| 523 | $a->[$i] cmp $b->[$i], "\n";
|
---|
| 524 | last if ($x = ($a->[$i] cmp $b->[$i])); # lexicographic
|
---|
| 525 | ++$i;
|
---|
| 526 |
|
---|
| 527 | DEBUG and print " comparing $i: {$a->[$i]} <=> {$b->[$i]} => ",
|
---|
| 528 | $a->[$i] <=> $b->[$i], "\n";
|
---|
| 529 | last if ($x = ($a->[$i] <=> $b->[$i])); # numeric
|
---|
| 530 | ++$i;
|
---|
| 531 | }
|
---|
| 532 |
|
---|
| 533 | DEBUG and print "{$a->[0]} : {$b->[0]} is ",
|
---|
| 534 | $x || (@$a <=> @$b) || 0
|
---|
| 535 | ,"\n"
|
---|
| 536 | ;
|
---|
| 537 | $x || (@$a <=> @$b) || ($a->[0] cmp $b->[0]);
|
---|
| 538 | # unless we found a result for $x in the while loop,
|
---|
| 539 | # use length as a tiebreaker, otherwise use cmp
|
---|
| 540 | # on the original string as a fallback tiebreaker.
|
---|
| 541 | }
|
---|
| 542 |
|
---|
| 543 | map {
|
---|
| 544 | my @bit = ($x = defined($_) ? $_ : '');
|
---|
| 545 |
|
---|
| 546 | if($x =~ m/^[+-]?(?=\d|\.\d)\d*(?:\.\d*)?(?:[Ee](?:[+-]?\d+))?\z/s) {
|
---|
| 547 | # It's entirely purely numeric, so treat it specially:
|
---|
| 548 | push @bit, '', $x;
|
---|
| 549 | } else {
|
---|
| 550 | # Consume the string.
|
---|
| 551 | while(length $x) {
|
---|
| 552 | push @bit, ($x =~ s/^(\D+)//s) ? lc($1) : '';
|
---|
| 553 | push @bit, ($x =~ s/^(\d+)//s) ? $1 : 0;
|
---|
| 554 | }
|
---|
| 555 | }
|
---|
| 556 | DEBUG and print "$bit[0] => ", map("{$_} ", @bit), "\n";
|
---|
| 557 |
|
---|
| 558 | # End result: [original bit , (text, number), (text, number), ...]
|
---|
| 559 | # Minimally: [0-length original bit,]
|
---|
| 560 | # Examples:
|
---|
| 561 | # ['10' => '' , 10, ]
|
---|
| 562 | # ['fo900' => 'fo' , 900, ]
|
---|
| 563 | # ['foo10' => 'foo', 10, ]
|
---|
| 564 | # ['foo9.pl' => 'foo', 9, , '.pl', 0 ]
|
---|
| 565 | # ['foo32.pl' => 'foo', 32, , '.pl', 0 ]
|
---|
| 566 | # ['foo325.pl' => 'foo', 325, , '.pl', 0 ]
|
---|
| 567 | # Yes, always an ODD number of elements.
|
---|
| 568 |
|
---|
| 569 | \@bit;
|
---|
| 570 | }
|
---|
| 571 | @_;
|
---|
| 572 | }
|
---|
| 573 |
|
---|
| 574 | #-----------------------------------------------------------------------------
|
---|
| 575 | # Same as before, except without the pure-number trap.
|
---|
| 576 |
|
---|
| 577 | sub nsorts {
|
---|
| 578 | return @_ if @_ < 2; # Just to be CLEVER.
|
---|
| 579 |
|
---|
| 580 | my($x, $i); # scratch vars
|
---|
| 581 |
|
---|
| 582 | # And now, the GREAT BIG Schwartzian transform:
|
---|
| 583 |
|
---|
| 584 | map
|
---|
| 585 | $_->[0],
|
---|
| 586 |
|
---|
| 587 | sort {
|
---|
| 588 | # Uses $i as the index variable, $x as the result.
|
---|
| 589 | $x = 0;
|
---|
| 590 | $i = 1;
|
---|
| 591 | DEBUG and print "\nComparing ", map("{$_}", @$a),
|
---|
| 592 | ' : ', map("{$_}", @$b), , "...\n";
|
---|
| 593 |
|
---|
| 594 | while($i < @$a and $i < @$b) {
|
---|
| 595 | DEBUG and print " comparing $i: {$a->[$i]} cmp {$b->[$i]} => ",
|
---|
| 596 | $a->[$i] cmp $b->[$i], "\n";
|
---|
| 597 | last if ($x = ($a->[$i] cmp $b->[$i])); # lexicographic
|
---|
| 598 | ++$i;
|
---|
| 599 |
|
---|
| 600 | DEBUG and print " comparing $i: {$a->[$i]} <=> {$b->[$i]} => ",
|
---|
| 601 | $a->[$i] <=> $b->[$i], "\n";
|
---|
| 602 | last if ($x = ($a->[$i] <=> $b->[$i])); # numeric
|
---|
| 603 | ++$i;
|
---|
| 604 | }
|
---|
| 605 |
|
---|
| 606 | DEBUG and print "{$a->[0]} : {$b->[0]} is ",
|
---|
| 607 | $x || (@$a <=> @$b) || 0
|
---|
| 608 | ,"\n"
|
---|
| 609 | ;
|
---|
| 610 | $x || (@$a <=> @$b) || ($a->[0] cmp $b->[0]);
|
---|
| 611 | # unless we found a result for $x in the while loop,
|
---|
| 612 | # use length as a tiebreaker, otherwise use cmp
|
---|
| 613 | # on the original string as a fallback tiebreaker.
|
---|
| 614 | }
|
---|
| 615 |
|
---|
| 616 | map {
|
---|
| 617 | my @bit = ($x = defined($_) ? $_ : '');
|
---|
| 618 |
|
---|
| 619 | while(length $x) {
|
---|
| 620 | push @bit, ($x =~ s/^(\D+)//s) ? lc($1) : '';
|
---|
| 621 | push @bit, ($x =~ s/^(\d+)//s) ? $1 : 0;
|
---|
| 622 | }
|
---|
| 623 | DEBUG and print "$bit[0] => ", map("{$_} ", @bit), "\n";
|
---|
| 624 |
|
---|
| 625 | # End result: [original bit , (text, number), (text, number), ...]
|
---|
| 626 | # Minimally: [0-length original bit,]
|
---|
| 627 | # Examples:
|
---|
| 628 | # ['10' => '' , 10, ]
|
---|
| 629 | # ['fo900' => 'fo' , 900, ]
|
---|
| 630 | # ['foo10' => 'foo', 10, ]
|
---|
| 631 | # ['foo9.pl' => 'foo', 9, , '.pl', 0 ]
|
---|
| 632 | # ['foo32.pl' => 'foo', 32, , '.pl', 0 ]
|
---|
| 633 | # ['foo325.pl' => 'foo', 325, , '.pl', 0 ]
|
---|
| 634 | # Yes, always an ODD number of elements.
|
---|
| 635 |
|
---|
| 636 | \@bit;
|
---|
| 637 | }
|
---|
| 638 | @_;
|
---|
| 639 | }
|
---|
| 640 |
|
---|
| 641 | #-----------------------------------------------------------------------------
|
---|
| 642 | # Same as before, except for the sort-key-making
|
---|
| 643 |
|
---|
| 644 | sub nsort0 {
|
---|
| 645 | return @_ if @_ < 2; # Just to be CLEVER.
|
---|
| 646 |
|
---|
| 647 | my($x, $i); # scratch vars
|
---|
| 648 |
|
---|
| 649 | # And now, the GREAT BIG Schwartzian transform:
|
---|
| 650 |
|
---|
| 651 | map
|
---|
| 652 | $_->[0],
|
---|
| 653 |
|
---|
| 654 | sort {
|
---|
| 655 | # Uses $i as the index variable, $x as the result.
|
---|
| 656 | $x = 0;
|
---|
| 657 | $i = 1;
|
---|
| 658 | DEBUG and print "\nComparing ", map("{$_}", @$a),
|
---|
| 659 | ' : ', map("{$_}", @$b), , "...\n";
|
---|
| 660 |
|
---|
| 661 | while($i < @$a and $i < @$b) {
|
---|
| 662 | DEBUG and print " comparing $i: {$a->[$i]} cmp {$b->[$i]} => ",
|
---|
| 663 | $a->[$i] cmp $b->[$i], "\n";
|
---|
| 664 | last if ($x = ($a->[$i] cmp $b->[$i])); # lexicographic
|
---|
| 665 | ++$i;
|
---|
| 666 |
|
---|
| 667 | DEBUG and print " comparing $i: {$a->[$i]} <=> {$b->[$i]} => ",
|
---|
| 668 | $a->[$i] <=> $b->[$i], "\n";
|
---|
| 669 | last if ($x = ($a->[$i] <=> $b->[$i])); # numeric
|
---|
| 670 | ++$i;
|
---|
| 671 | }
|
---|
| 672 |
|
---|
| 673 | DEBUG and print "{$a->[0]} : {$b->[0]} is ",
|
---|
| 674 | $x || (@$a <=> @$b) || 0
|
---|
| 675 | ,"\n"
|
---|
| 676 | ;
|
---|
| 677 | $x || (@$a <=> @$b) || ($a->[0] cmp $b->[0]);
|
---|
| 678 | # unless we found a result for $x in the while loop,
|
---|
| 679 | # use length as a tiebreaker, otherwise use cmp
|
---|
| 680 | # on the original string as a fallback tiebreaker.
|
---|
| 681 | }
|
---|
| 682 |
|
---|
| 683 | map {
|
---|
| 684 | my @bit = ($x = defined($_) ? $_ : '');
|
---|
| 685 |
|
---|
| 686 | if($x =~ m/^[+-]?(?=\d|\.\d)\d*(?:\.\d*)?(?:[Ee](?:[+-]?\d+))?\z/s) {
|
---|
| 687 | # It's entirely purely numeric, so treat it specially:
|
---|
| 688 | push @bit, '', $x;
|
---|
| 689 | } else {
|
---|
| 690 | # Consume the string.
|
---|
| 691 | while(length $x) {
|
---|
| 692 | push @bit, ($x =~ s/^(\D+)//s) ? lc($1) : '';
|
---|
| 693 | # Secret sauce:
|
---|
| 694 | if($x =~ s/^(\d+)//s) {
|
---|
| 695 | if(substr($1,0,1) eq '0' and $1 != 0) {
|
---|
| 696 | push @bit, $1 / (10 ** length($1));
|
---|
| 697 | } else {
|
---|
| 698 | push @bit, $1;
|
---|
| 699 | }
|
---|
| 700 | } else {
|
---|
| 701 | push @bit, 0;
|
---|
| 702 | }
|
---|
| 703 | }
|
---|
| 704 | }
|
---|
| 705 | DEBUG and print "$bit[0] => ", map("{$_} ", @bit), "\n";
|
---|
| 706 |
|
---|
| 707 | \@bit;
|
---|
| 708 | }
|
---|
| 709 | @_;
|
---|
| 710 | }
|
---|
| 711 |
|
---|
| 712 | #-----------------------------------------------------------------------------
|
---|
| 713 | # Like nsort0, but WITHOUT pure number handling, and WITH special treatment
|
---|
| 714 | # of pulling off extensions and version numbers.
|
---|
| 715 |
|
---|
| 716 | sub nsortf {
|
---|
| 717 | return @_ if @_ < 2; # Just to be CLEVER.
|
---|
| 718 |
|
---|
| 719 | my($x, $i); # scratch vars
|
---|
| 720 |
|
---|
| 721 | # And now, the GREAT BIG Schwartzian transform:
|
---|
| 722 |
|
---|
| 723 | map
|
---|
| 724 | $_->[0],
|
---|
| 725 |
|
---|
| 726 | sort {
|
---|
| 727 | # Uses $i as the index variable, $x as the result.
|
---|
| 728 | $x = 0;
|
---|
| 729 | $i = 3;
|
---|
| 730 | DEBUG and print "\nComparing ", map("{$_}", @$a),
|
---|
| 731 | ' : ', map("{$_}", @$b), , "...\n";
|
---|
| 732 |
|
---|
| 733 | while($i < @$a and $i < @$b) {
|
---|
| 734 | DEBUG and print " comparing $i: {$a->[$i]} cmp {$b->[$i]} => ",
|
---|
| 735 | $a->[$i] cmp $b->[$i], "\n";
|
---|
| 736 | last if ($x = ($a->[$i] cmp $b->[$i])); # lexicographic
|
---|
| 737 | ++$i;
|
---|
| 738 |
|
---|
| 739 | DEBUG and print " comparing $i: {$a->[$i]} <=> {$b->[$i]} => ",
|
---|
| 740 | $a->[$i] <=> $b->[$i], "\n";
|
---|
| 741 | last if ($x = ($a->[$i] <=> $b->[$i])); # numeric
|
---|
| 742 | ++$i;
|
---|
| 743 | }
|
---|
| 744 |
|
---|
| 745 | DEBUG and print "{$a->[0]} : {$b->[0]} is ",
|
---|
| 746 | $x || (@$a <=> @$b) || 0
|
---|
| 747 | ,"\n"
|
---|
| 748 | ;
|
---|
| 749 | $x || (@$a <=> @$b ) || ($a->[1] cmp $b->[1])
|
---|
| 750 | || ($a->[2] <=> $b->[2]) || ($a->[0] cmp $b->[0]);
|
---|
| 751 | # unless we found a result for $x in the while loop,
|
---|
| 752 | # use length as a tiebreaker, otherwise use the
|
---|
| 753 | # lc'd extension, otherwise the verison, otherwise use
|
---|
| 754 | # the original string as a fallback tiebreaker.
|
---|
| 755 | }
|
---|
| 756 |
|
---|
| 757 | map {
|
---|
| 758 | my @bit = ( ($x = defined($_) ? $_ : ''), '',0 );
|
---|
| 759 |
|
---|
| 760 | {
|
---|
| 761 | # Consume the string.
|
---|
| 762 |
|
---|
| 763 | # First, pull off any VAX-style version
|
---|
| 764 | $bit[2] = $1 if $x =~ s/;(\d+)$//;
|
---|
| 765 |
|
---|
| 766 | # Then pull off any apparent extension
|
---|
| 767 | if( $x !~ m/^\.+$/s and # don't mangle ".", "..", or "..."
|
---|
| 768 | $x =~ s/(\.[^\.\;]*)$//sg
|
---|
| 769 | # We could try to avoid catching all-digit extensions,
|
---|
| 770 | # but I think that's getting /too/ clever.
|
---|
| 771 | ) {
|
---|
| 772 | $i = $1;
|
---|
| 773 | if($x =~ m<[^\\\://]$>s) {
|
---|
| 774 | # We didn't take the whole basename.
|
---|
| 775 | $bit[1] = lc $i;
|
---|
| 776 | DEBUG and print "Consuming extension \"$1\"\n";
|
---|
| 777 | } else {
|
---|
| 778 | # We DID take the whole basename. Fix it.
|
---|
| 779 | $x = $1; # Repair it.
|
---|
| 780 | }
|
---|
| 781 | }
|
---|
| 782 |
|
---|
| 783 | push @bit, '', -1 if $x =~ m/^\./s;
|
---|
| 784 | # A hack to make .-initial filenames sort first, regardless of locale.
|
---|
| 785 | # And -1 is always a sort-firster, since in the code below, there's
|
---|
| 786 | # no allowance for filenames containing negative numbers: -1.dat
|
---|
| 787 | # will be read as string '-' followed by number 1.
|
---|
| 788 |
|
---|
| 789 | while(length $x) {
|
---|
| 790 | push @bit, ($x =~ s/^(\D+)//s) ? lc($1) : '';
|
---|
| 791 | # Secret sauce:
|
---|
| 792 | if($x =~ s/^(\d+)//s) {
|
---|
| 793 | if(substr($1,0,1) eq '0' and $1 != 0) {
|
---|
| 794 | push @bit, $1 / (10 ** length($1));
|
---|
| 795 | } else {
|
---|
| 796 | push @bit, $1;
|
---|
| 797 | }
|
---|
| 798 | } else {
|
---|
| 799 | push @bit, 0;
|
---|
| 800 | }
|
---|
| 801 | }
|
---|
| 802 | }
|
---|
| 803 |
|
---|
| 804 | DEBUG and print "$bit[0] => ", map("{$_} ", @bit), "\n";
|
---|
| 805 |
|
---|
| 806 | \@bit;
|
---|
| 807 | }
|
---|
| 808 | @_;
|
---|
| 809 | }
|
---|
| 810 |
|
---|
| 811 | # yowza yowza yowza.
|
---|
| 812 |
|
---|