Changeset 1287


Ignore:
Timestamp:
2000-07-14T12:24:20+12:00 (24 years ago)
Author:
sjboddie
Message:

Implemented a -sortmeta option for import.pl that sorts the archives.inf
file (generated at the end of the import process) alphabetically by the
given metadata element. This may be useful for some collections because
boolean queries currently return matches in build (fairly random) order, and
changing the order of archives.inf changes the order in which documents are built.
This option has a couple of important limitations:

  1. Can't be used in conjunction with the groupsize option (i.e. when groupsize > 1), as it would then only change the build order of groups of documents, which doesn't seem very useful.
  2. Is of limited use when building indexes at a section level, as the build order is sorted only by document, not by section.
Location:
trunk/gsdl
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/bin/script/import.pl

    r1269 r1287  
    6060    print STDERR "   -maxdocs number        Maximum number of documents to import\n";
    6161    print STDERR "   -groupsize number      Number of GML documents to group into one file\n";
     62    print STDERR "   -sortmeta metadata     Sort documents alphabetically by metadata for\n";
     63    print STDERR "                          building. This will be disabled if groupsize > 1\n";
    6264    print STDERR "   -debug                 Print imported text to STDOUT\n\n";
    6365}
     
    6971    my ($verbosity, $importdir, $archivedir, $keepold,
    7072    $removeold, $gzip, $groupsize, $debug, $maxdocs, $collection,
    71     $configfilename, $collectcfg, $pluginfo,
     73    $configfilename, $collectcfg, $pluginfo, $sortmeta,
    7274    $archive_info_filename, $archive_info, $processor);
    7375    if (!parsargv::parse(\@ARGV,
     
    7981             'gzip', \$gzip,
    8082             'groupsize/\d+/1', \$groupsize,
     83             'sortmeta/.*/', \$sortmeta,
    8184             'debug', \$debug,
    8285             'maxdocs/^\-?\d+/-1', \$maxdocs)) {
     
    9295    &print_usage();
    9396    die "\n";
     97    }
     98
     99    # check sortmeta
     100    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
     101    if (defined $sortmeta && $groupsize > 1) {
     102    print STDERR "WARNING: import.pl cannot sort documents when groupsize > 1\n";
     103    print STDERR "         sortmeta option will be ignored\n\n";
     104    $sortmeta = undef;
    94105    }
    95106
     
    160171    $processor = new docsave ($collection, $archive_info, $verbosity, $gzip, $groupsize);
    161172    $processor->setarchivedir ($archivedir);
     173    $processor->set_sortmeta ($sortmeta) if defined $sortmeta;
    162174    } else {
    163175    $processor = new docprint ();
     
    173185    # write out the archive information file
    174186    if (!$debug) {
    175     $processor->close_file_output();
     187    $processor->close_file_output() if $groupsize > 1;
    176188    $archive_info->save_info($archive_info_filename);
    177189    }
  • trunk/gsdl/perllib/arcinfo.pm

    r537 r1287  
    9393    my $i = 0;
    9494    while ($i < scalar (@{$self->{'order'}})) {
    95         if ($self->{'order'}->[$i] eq $OID) {
     95        if ($self->{'order'}->[$i]->[0] eq $OID) {
    9696        splice (@{$self->{'order'}}, $i, 1);
    9797        last;
     
    105105sub add_info {
    106106    my $self = shift (@_);
    107     my ($OID, $doc_file) = @_;
     107    my ($OID, $doc_file, $sortmeta) = @_;
     108    $sortmeta = "" unless defined $sortmeta;
    108109
    109110    $self->delete_info ($OID);
    110111    $self->{'info'}->{$OID} = [$doc_file];
    111     push (@{$self->{'order'}}, $OID);
     112    push (@{$self->{'order'}}, [$OID, $sortmeta]);
    112113}
    113114
     
    119120    my @list = ();
    120121
    121     foreach $OID (@{$self->{'order'}}) {
    122     push (@list, [$OID, $self->{'info'}->{$OID}->[0]]);
     122    foreach $OID (sort {$a->[1] cmp $b->[1]} @{$self->{'order'}}) {
     123    push (@list, [$OID->[0], $self->{'info'}->{$OID->[0]}->[0]]);
    123124    }
    124125
     
    133134    my @list = ();
    134135
    135     foreach $OID (@{$self->{'order'}}) {
    136     push (@list, [$self->{'info'}->{$OID}->[0], $OID]);
     136    foreach $OID (sort {$a->[1] cmp $b->[1]} @{$self->{'order'}}) {
     137    push (@list, [$self->{'info'}->{$OID->[0]}->[0], $OID->[0]]);
    137138    }
    138139
  • trunk/gsdl/perllib/docsave.pm

    r898 r1287  
    5656    $self->{'archive_dir'} = "$ENV{'GSDLHOME'}/collect/$self->{'collection'}/archives";
    5757
     58    $self->{'sortmeta'} = undef;
     59
    5860    return bless $self, $class;
    5961}
     
    6668}
    6769
     70sub set_sortmeta {
     71    my $self = shift (@_);
     72    my ($sortmeta) = @_;
     73
     74    $self->{'sortmeta'} = $sortmeta;
     75}
     76
    6877sub process {
    6978    my $self = shift (@_);
    7079    my ($doc_obj) = @_;
    7180
    72     my $archive_dir = $self->{'archive_dir'};
     81    if ($self->{'groupsize'} > 1) {
     82    $self->group_process ($doc_obj);
     83
     84    } else {
     85    # groupsize is 1 (i.e. one document per GML file) so sortmeta
     86    # may be used
     87
     88    my $OID = $doc_obj->get_OID();
     89    $OID = "NULL" unless defined $OID;
     90   
     91    # get document's directory
     92    my $doc_dir = $self->get_doc_dir ($OID);
     93   
     94    # copy all the associated files, add this information as metadata
     95    # to the document
     96    $self->process_assoc_files ($doc_obj, $doc_dir);
     97
     98    my $doc_file
     99        = &util::filename_cat ($self->{'archive_dir'}, $doc_dir, "doc.gml");
     100    my $short_doc_file = &util::filename_cat ($doc_dir, "doc.gml");
     101       
     102    if (!open (OUTDOC, ">$doc_file")) {
     103        print STDERR "docsave::process could not write to file $doc_file\n";
     104        return;
     105    }
     106
     107    # save this document
     108    $doc_obj->output_section('docsave::OUTDOC', $doc_obj->get_top_section());
     109    close OUTDOC;
     110
     111    if ($self->{'gzip'}) {
     112        my $doc_file = $self->{'gs_filename'};
     113        `gzip $doc_file`;
     114        $doc_file .= ".gz";
     115        $short_doc_file .= ".gz";
     116        if (!-e $doc_file) {
     117        print STDERR "error while gzipping: $doc_file doesn't exist\n";
     118        return 0;
     119        }
     120    }
     121
     122    # do the sortmeta thing
     123    my ($metadata);
     124    if (defined ($self->{'sortmeta'})) {
     125        $metadata = $doc_obj->get_metadata_element ($doc_obj->get_top_section(), $self->{'sortmeta'});
     126    }
     127
     128    # store reference in the archive_info
     129    $self->{'archive_info'}->add_info($OID, $short_doc_file, $metadata);
     130    }
     131}
     132
     133sub group_process {
     134    my $self = shift (@_);
     135    my ($doc_obj) = @_;
     136
    73137    my $OID = $doc_obj->get_OID();
    74138    $OID = "NULL" unless defined $OID;
     
    79143
    80144    # opening a new file, or document has assoicated files => directory needed
    81     if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0))
    82     {
    83     # get the document's directory.
    84     my $doc_info = $self->{'archive_info'}->get_info($OID);
    85     my $doc_dir = "";
    86     if (defined $doc_info && scalar(@$doc_info) >= 1) {
    87         # this OID already has an assigned directory, use the
    88         # same one.
    89         $doc_dir = $doc_info->[0];
    90         $doc_dir =~ s/\/?doc\.gml(\.gz)?$//;
    91     } else {
    92         # have to get a new document directory
    93         my $doc_dir_rest = $OID;
    94         my $doc_dir_num = 0;
    95         do {
    96         $doc_dir .= "/" if $doc_dir_num > 0;
    97         if ($doc_dir_rest =~ s/^(.{1,8})//) {
    98             $doc_dir .= $1;
    99             $doc_dir_num++;
    100         }
    101         } while ($doc_dir_rest ne "" &&
    102              ((-d &util::filename_cat ($archive_dir, "$doc_dir.dir")) ||
    103               ($self->{'archive_info'}->size() >= 1024 && $doc_dir_num < 2)));
    104         $doc_dir .= ".dir";
    105        
    106     }
    107    
    108     &util::mk_all_dir ("$archive_dir/$doc_dir");
    109    
     145    if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {
     146
     147    # get document's directory
     148    my $doc_dir = $self->get_doc_dir ($OID);
     149
    110150    # copy all the associated files, add this information as metadata
    111151    # to the document
    112     my @assoc_files = ();
    113     foreach $assoc_file (@{$doc_obj->get_assoc_files()}) {
    114         my ($dir, $afile) = $assoc_file->[1] =~ /^(.*?)([^\/\\]+)$/;
    115         $dir = "" unless defined $dir;
    116         if (-e $assoc_file->[0]) {
    117         my $filepath = &util::filename_cat($archive_dir, $doc_dir, $afile);
    118         &util::hard_link ($assoc_file->[0], $filepath);
    119         $doc_obj->add_metadata ($doc_obj->get_top_section(),
    120                     "gsdlassocfile",
    121                     "$afile:$assoc_file->[2]:$dir");
    122         } else {
    123         print STDERR "docsave::process couldn't copy the associated file " .
    124             "$assoc_file->[0] to $afile\n";
    125         }
    126     }
    127 
    128     if ($open_new_file)
    129     {
     152    $self->process_assoc_files ($doc_obj, $doc_dir);
     153
     154
     155    if ($open_new_file) {
    130156        # only if opening new file
    131157        my $doc_file
    132         = &util::filename_cat ($archive_dir, $doc_dir, "doc.gml");
     158        = &util::filename_cat ($self->{'archive_dir'}, $doc_dir, "doc.gml");
    133159        my $short_doc_file = &util::filename_cat ($doc_dir, "doc.gml");
    134160       
     
    139165       
    140166        if (!open (OUTDOC, ">$doc_file")) {
    141         print STDERR "docsave::process could not write to file $doc_file\n";
     167        print STDERR "docsave::group_process could not write to file $doc_file\n";
    142168        return;
    143169        }
     
    153179    $self->{'gs_count'}++;
    154180}
     181
     182
     183sub get_doc_dir {
     184    my $self = shift (@_);
     185    my ($OID) = @_;
     186
     187    my $doc_info = $self->{'archive_info'}->get_info($OID);
     188    my $doc_dir = "";
     189    if (defined $doc_info && scalar(@$doc_info) >= 1) {
     190    # this OID already has an assigned directory, use the
     191    # same one.
     192    $doc_dir = $doc_info->[0];
     193    $doc_dir =~ s/\/?doc\.gml(\.gz)?$//;
     194    } else {
     195    # have to get a new document directory
     196    my $doc_dir_rest = $OID;
     197    my $doc_dir_num = 0;
     198    do {
     199        $doc_dir .= "/" if $doc_dir_num > 0;
     200        if ($doc_dir_rest =~ s/^(.{1,8})//) {
     201        $doc_dir .= $1;
     202        $doc_dir_num++;
     203        }
     204    } while ($doc_dir_rest ne "" &&
     205         ((-d &util::filename_cat ($self->{'archive_dir'}, "$doc_dir.dir")) ||
     206          ($self->{'archive_info'}->size() >= 1024 && $doc_dir_num < 2)));
     207    $doc_dir .= ".dir";
     208   
     209    }
     210   
     211    &util::mk_all_dir (&util::filename_cat ($self->{'archive_dir'}, $doc_dir));
     212
     213    return $doc_dir;
     214}
     215
     216
     217sub process_assoc_files {
     218    my $self = shift (@_);
     219    my ($doc_obj, $doc_dir) = @_;
     220
     221    my @assoc_files = ();
     222    foreach $assoc_file (@{$doc_obj->get_assoc_files()}) {
     223    my ($dir, $afile) = $assoc_file->[1] =~ /^(.*?)([^\/\\]+)$/;
     224    $dir = "" unless defined $dir;
     225    if (-e $assoc_file->[0]) {
     226        my $filepath = &util::filename_cat($self->{'archive_dir'}, $doc_dir, $afile);
     227        &util::hard_link ($assoc_file->[0], $filepath);
     228        $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
     229                     "gsdlassocfile",
     230                     "$afile:$assoc_file->[2]:$dir");
     231    } else {
     232        print STDERR "docsave::process couldn't copy the associated file " .
     233        "$assoc_file->[0] to $afile\n";
     234    }
     235    }
     236}
     237   
    155238
    156239sub close_file_output
Note: See TracChangeset for help on using the changeset viewer.