Ignore:
Timestamp:
2000-07-14T12:24:20+12:00 (24 years ago)
Author:
sjboddie
Message:

Implemented a -sortmeta option for import.pl that sorts the archives.inf file
(generated at the end of the import process) alphabetically by the given
metadata element. This may be useful for some collections, as boolean
queries currently return matches in build (fairly random) order. Changing
the order of archives.inf changes the order in which documents are built.
This option has a couple of important limitations:

  1. It can't be used in conjunction with the groupsize option, as it would then only change the build order of groups of documents, which doesn't seem very useful.
  2. It is of limited use when building indexes at the section level, as the build order is sorted only by document, not by section.
File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/docsave.pm

    r898 r1287  
    5656    $self->{'archive_dir'} = "$ENV{'GSDLHOME'}/collect/$self->{'collection'}/archives";
    5757
     58    $self->{'sortmeta'} = undef;
     59
    5860    return bless $self, $class;
    5961}
     
    6668}
    6769
     70sub set_sortmeta {
     71    my $self = shift (@_);
     72    my ($sortmeta) = @_;
     73
     74    $self->{'sortmeta'} = $sortmeta;
     75}
     76
    6877sub process {
    6978    my $self = shift (@_);
    7079    my ($doc_obj) = @_;
    7180
    72     my $archive_dir = $self->{'archive_dir'};
     81    if ($self->{'groupsize'} > 1) {
     82    $self->group_process ($doc_obj);
     83
     84    } else {
     85    # groupsize is 1 (i.e. one document per GML file) so sortmeta
     86    # may be used
     87
     88    my $OID = $doc_obj->get_OID();
     89    $OID = "NULL" unless defined $OID;
     90   
     91    # get document's directory
     92    my $doc_dir = $self->get_doc_dir ($OID);
     93   
     94    # copy all the associated files, add this information as metadata
     95    # to the document
     96    $self->process_assoc_files ($doc_obj, $doc_dir);
     97
     98    my $doc_file
     99        = &util::filename_cat ($self->{'archive_dir'}, $doc_dir, "doc.gml");
     100    my $short_doc_file = &util::filename_cat ($doc_dir, "doc.gml");
     101       
     102    if (!open (OUTDOC, ">$doc_file")) {
     103        print STDERR "docsave::process could not write to file $doc_file\n";
     104        return;
     105    }
     106
     107    # save this document
     108    $doc_obj->output_section('docsave::OUTDOC', $doc_obj->get_top_section());
     109    close OUTDOC;
     110
     111    if ($self->{'gzip'}) {
     112        my $doc_file = $self->{'gs_filename'};
     113        `gzip $doc_file`;
     114        $doc_file .= ".gz";
     115        $short_doc_file .= ".gz";
     116        if (!-e $doc_file) {
     117        print STDERR "error while gzipping: $doc_file doesn't exist\n";
     118        return 0;
     119        }
     120    }
     121
     122    # do the sortmeta thing
     123    my ($metadata);
     124    if (defined ($self->{'sortmeta'})) {
     125        $metadata = $doc_obj->get_metadata_element ($doc_obj->get_top_section(), $self->{'sortmeta'});
     126    }
     127
     128    # store reference in the archive_info
     129    $self->{'archive_info'}->add_info($OID, $short_doc_file, $metadata);
     130    }
     131}
     132
     133sub group_process {
     134    my $self = shift (@_);
     135    my ($doc_obj) = @_;
     136
    73137    my $OID = $doc_obj->get_OID();
    74138    $OID = "NULL" unless defined $OID;
     
    79143
    80144    # opening a new file, or document has assoicated files => directory needed
    81     if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0))
    82     {
    83     # get the document's directory.
    84     my $doc_info = $self->{'archive_info'}->get_info($OID);
    85     my $doc_dir = "";
    86     if (defined $doc_info && scalar(@$doc_info) >= 1) {
    87         # this OID already has an assigned directory, use the
    88         # same one.
    89         $doc_dir = $doc_info->[0];
    90         $doc_dir =~ s/\/?doc\.gml(\.gz)?$//;
    91     } else {
    92         # have to get a new document directory
    93         my $doc_dir_rest = $OID;
    94         my $doc_dir_num = 0;
    95         do {
    96         $doc_dir .= "/" if $doc_dir_num > 0;
    97         if ($doc_dir_rest =~ s/^(.{1,8})//) {
    98             $doc_dir .= $1;
    99             $doc_dir_num++;
    100         }
    101         } while ($doc_dir_rest ne "" &&
    102              ((-d &util::filename_cat ($archive_dir, "$doc_dir.dir")) ||
    103               ($self->{'archive_info'}->size() >= 1024 && $doc_dir_num < 2)));
    104         $doc_dir .= ".dir";
    105        
    106     }
    107    
    108     &util::mk_all_dir ("$archive_dir/$doc_dir");
    109    
     145    if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {
     146
     147    # get document's directory
     148    my $doc_dir = $self->get_doc_dir ($OID);
     149
    110150    # copy all the associated files, add this information as metadata
    111151    # to the document
    112     my @assoc_files = ();
    113     foreach $assoc_file (@{$doc_obj->get_assoc_files()}) {
    114         my ($dir, $afile) = $assoc_file->[1] =~ /^(.*?)([^\/\\]+)$/;
    115         $dir = "" unless defined $dir;
    116         if (-e $assoc_file->[0]) {
    117         my $filepath = &util::filename_cat($archive_dir, $doc_dir, $afile);
    118         &util::hard_link ($assoc_file->[0], $filepath);
    119         $doc_obj->add_metadata ($doc_obj->get_top_section(),
    120                     "gsdlassocfile",
    121                     "$afile:$assoc_file->[2]:$dir");
    122         } else {
    123         print STDERR "docsave::process couldn't copy the associated file " .
    124             "$assoc_file->[0] to $afile\n";
    125         }
    126     }
    127 
    128     if ($open_new_file)
    129     {
     152    $self->process_assoc_files ($doc_obj, $doc_dir);
     153
     154
     155    if ($open_new_file) {
    130156        # only if opening new file
    131157        my $doc_file
    132         = &util::filename_cat ($archive_dir, $doc_dir, "doc.gml");
     158        = &util::filename_cat ($self->{'archive_dir'}, $doc_dir, "doc.gml");
    133159        my $short_doc_file = &util::filename_cat ($doc_dir, "doc.gml");
    134160       
     
    139165       
    140166        if (!open (OUTDOC, ">$doc_file")) {
    141         print STDERR "docsave::process could not write to file $doc_file\n";
     167        print STDERR "docsave::group_process could not write to file $doc_file\n";
    142168        return;
    143169        }
     
    153179    $self->{'gs_count'}++;
    154180}
     181
     182
     183sub get_doc_dir {
     184    my $self = shift (@_);
     185    my ($OID) = @_;
     186
     187    my $doc_info = $self->{'archive_info'}->get_info($OID);
     188    my $doc_dir = "";
     189    if (defined $doc_info && scalar(@$doc_info) >= 1) {
     190    # this OID already has an assigned directory, use the
     191    # same one.
     192    $doc_dir = $doc_info->[0];
     193    $doc_dir =~ s/\/?doc\.gml(\.gz)?$//;
     194    } else {
     195    # have to get a new document directory
     196    my $doc_dir_rest = $OID;
     197    my $doc_dir_num = 0;
     198    do {
     199        $doc_dir .= "/" if $doc_dir_num > 0;
     200        if ($doc_dir_rest =~ s/^(.{1,8})//) {
     201        $doc_dir .= $1;
     202        $doc_dir_num++;
     203        }
     204    } while ($doc_dir_rest ne "" &&
     205         ((-d &util::filename_cat ($self->{'archive_dir'}, "$doc_dir.dir")) ||
     206          ($self->{'archive_info'}->size() >= 1024 && $doc_dir_num < 2)));
     207    $doc_dir .= ".dir";
     208   
     209    }
     210   
     211    &util::mk_all_dir (&util::filename_cat ($self->{'archive_dir'}, $doc_dir));
     212
     213    return $doc_dir;
     214}
     215
     216
     217sub process_assoc_files {
     218    my $self = shift (@_);
     219    my ($doc_obj, $doc_dir) = @_;
     220
     221    my @assoc_files = ();
     222    foreach $assoc_file (@{$doc_obj->get_assoc_files()}) {
     223    my ($dir, $afile) = $assoc_file->[1] =~ /^(.*?)([^\/\\]+)$/;
     224    $dir = "" unless defined $dir;
     225    if (-e $assoc_file->[0]) {
     226        my $filepath = &util::filename_cat($self->{'archive_dir'}, $doc_dir, $afile);
     227        &util::hard_link ($assoc_file->[0], $filepath);
     228        $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
     229                     "gsdlassocfile",
     230                     "$afile:$assoc_file->[2]:$dir");
     231    } else {
     232        print STDERR "docsave::process couldn't copy the associated file " .
     233        "$assoc_file->[0] to $afile\n";
     234    }
     235    }
     236}
     237   
    155238
    156239sub close_file_output
Note: See TracChangeset for help on using the changeset viewer.