Changeset 7686


Ignore:
Timestamp:
2004-07-01T14:48:55+12:00 (20 years ago)
Author:
mdewsnip
Message:

First cut at upgrading the CDS/ISIS plugin to obtain and index documents specified in the database (for the UNESCO contract).

Location:
trunk/gsdl/perllib/plugins
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/ISISPlug.pm

    r7049 r7686  
    77# University of Waikato, New Zealand.
    88#
    9 # Copyright 1999-2003 New Zealand Digital Library Project
     9# Copyright 1999-2004 New Zealand Digital Library Project
    1010#
    1111# This program is free software; you can redistribute it and/or modify
     
    3939
    4040my $arguments =
    41     [ { 'name' => "entry_separator",
    42     'desc' => "{ISISPlug.entry_separator}",
    43     'type' => "string",
    44     'reqd' => "no",
    45     'deft' => "<br>" },
    46       { 'name' => "process_exp",
     41    [ { 'name' => "process_exp",
    4742    'desc' => "{BasPlug.process_exp}",
    4843    'type' => "regexp",
     
    5247    'desc' => "{BasPlug.block_exp}",
    5348    'type' => "regexp",
     49    'reqd' => "no",
    5450    'deft' => &get_default_block_exp() },
     51      { 'name' => "split_exp",
     52    'desc' => "{SplitPlug.split_exp}",
     53    'type' => "regexp",
     54    'reqd' => "no",
     55    'deft' => &get_default_split_exp() },
     56
     57      # The interesting options
     58      { 'name' => "entry_separator",
     59    'desc' => "{ISISPlug.entry_separator}",
     60    'type' => "string",
     61    'reqd' => "no",
     62    'deft' => "<br>" },
    5563      { 'name' => "subfield_separator",
    5664    'desc' => "{ISISPlug.subfield_separator}",
     
    5866    'reqd' => "no",
    5967    'deft' => ", " },
    60       { 'name' => "split_exp",
    61     'desc' => "{SplitPlug.split_exp}",
    62     'type' => "regexp",
    63     'deft' => &get_default_split_exp(),
    64     'reqd' => "no" }
     68      { 'name' => "document_field",
     69    'desc' => "{ISISPlug.document_field}",
     70    'type' => "string",
     71    'reqd' => "no",
     72    'deft' => "" },
     73      { 'name' => "document_prefix",
     74    'desc' => "{ISISPlug.document_prefix}",
     75    'type' => "string",
     76    'reqd' => "no",
     77    'deft' => "" },
     78      { 'name' => "document_suffix",
     79    'desc' => "{ISISPlug.document_suffix}",
     80    'type' => "string",
     81    'reqd' => "no",
     82    'deft' => "" }
    6583      ];
    6684
     
    97115             q^subfield_separator/.*/, ^, \$self->{'subfield_separator'},
    98116             q^entry_separator/.*/<br>^, \$self->{'entry_separator'},
     117             q^document_field/.*/^, \$self->{'document_field'},
     118             q^document_prefix/.*/^, \$self->{'document_prefix'},
     119             q^document_suffix/.*/^, \$self->{'document_suffix'},
    99120             "allow_extra_options")) {
    100121    print STDERR "\nIncorrect options passed to ISISPlug, check your collect.cfg configuration file\n";
     
    108129
    109130    return bless $self, $class;
     131}
     132
     133
     134sub read
     135{
     136    my $self = shift(@_);
     137    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
     138
     139    my $result = &SplitPlug::read($self, @_);
     140    if ($file =~ /$self->{'process_exp'}/ && $self->{'document_field'}) {
     141    &end_metadata_xml_file($self->{'documents_metadata_xml_file'});
     142    }
     143
     144    return $result;
    110145}
    111146
     
    145180    $$textref =~ s/\ntag=(\d+) /\ntag=$fdtmapping{$1}{'title'} /g;
    146181
    147     # Add a newline at the start so it is split properly
    148     $$textref = "\n" . $$textref;
     182    # Remove the line at the start so it is split and processed properly
     183    $$textref =~ s/^----------\n//;
     184
     185    # Obtain the documents specified in the CDS/ISIS database, if requested
     186    if ($self->{'document_field'}) {
     187    # Create a directory to store the document files
     188    $self->{'documents_directory'} = $databasename . ".all";
     189    if (-e $self->{'documents_directory'}) {
     190        &util::rm_r($self->{'documents_directory'});
     191    }
     192    &util::mk_dir($self->{'documents_directory'});
     193
     194    # ...and a metadata.xml file for the document metadata (extracted from the database)
     195    $self->{'documents_metadata_xml_file'} = &util::filename_cat($self->{'documents_directory'}, "metadata.xml");
     196    if (-e $self->{'documents_metadata_xml_file'}) {
     197        &util::rm($self->{'documents_metadata_xml_file'});
     198    }
     199    &begin_metadata_xml_file($self->{'documents_metadata_xml_file'});
     200    }
    149201}
    150202
     
    167219    foreach $line (split(/\n/, $$textref)) {
    168220    $line =~ /^tag=(.+) data=(.+)$/;
    169     local $rawtagname = $1;
    170     local $rawtagdata = $2;
     221    my $rawtagname = $1;
     222    my $rawtagdata = $2;
    171223    # print "Raw tag: $rawtagname, Raw data: $rawtagdata\n";
    172224
    173225    # Metadata field names: title case, then remove spaces
    174     local $tagname = "";
     226    my $tagname = "";
    175227    foreach $word (split(/\s+/, $rawtagname)) {
    176228        substr($word, 0, 1) =~ tr/a-z/A-Z/;
     
    182234
    183235    # Handle each piece of metadata ('%' separated)
    184     local $completetagvalue = "";
     236    my $completetagvalue = "";
    185237    foreach $rawtagvalue (split(/%/, $rawtagdata)) {
    186238        $completetagvalue .= $entry_separator unless ($completetagvalue eq "");
    187239
    188240        # Metadata field values: take care with subfields
    189         local $completeentryvalue = "";
     241        my $completeentryvalue = "";
    190242        while ($rawtagvalue ne "") {
    191243        # If there is a subfield specifier, parse it off
    192         local $subfieldname = "";
     244        my $subfieldname = "";
    193245        if ($rawtagvalue =~ s/^\^([a-z])//) {
    194246            $subfieldname = "." . $1;
     
    197249        # Parse the metadata value off
    198250        $rawtagvalue =~ s/^([^\^]*)//;
    199         local $metadatafieldname = $tagname . $subfieldname;
    200         local $metadatafieldvalue = $1;
     251        my $metadatafieldname = $tagname . $subfieldname;
     252        my $metadatafieldvalue = $1;
    201253        # print "Metadata: $metadatafieldname -> $metadatafieldvalue\n";
    202254
    203255        # Handle Keywords specially
    204256        if ($metadatafieldname eq "Keywords") {
    205             local $keywordmetadatavalue = $metadatafieldvalue;
    206             local $keywordlist = "";
     257            my $keywordmetadatavalue = $metadatafieldvalue;
     258            my $keywordlist = "";
    207259            while ($keywordmetadatavalue =~ s/\<([^\>]+)\>//) {
    208             local $keyword = $1;
     260            my $keyword = $1;
    209261            $doc_obj->add_utf8_metadata($cursection, $metadatafieldname, $keyword);
    210262            $keywordlist .= ", " unless ($keywordlist eq "");
     
    236288    $doc_obj->add_utf8_text ($cursection, $$textref);
    237289
    238     # Document was processed successfully
     290    # Obtain the documents specified in the CDS/ISIS database, if requested
     291    if ($self->{'document_field'}) {
     292    my $document_field = $self->{'document_field'};
     293    my $document_prefix = $self->{'document_prefix'} || "";
     294    my $document_suffix = $self->{'document_suffix'} || "";
     295
     296    my $documents_directory = $self->{'documents_directory'};
     297    my $document_obtained = 0;
     298
     299    # Look at all the metadata assigned to this record
     300    my $record_metadata = $doc_obj->get_all_metadata($cursection);
     301    foreach my $pair (@$record_metadata) {
     302        my ($field, $value) = (@$pair);
     303
     304        # Does this metadata element specify a document to obtain?
     305        if ($field eq $document_field) {
     306        my $document_file_full = $document_prefix . $value . $document_suffix;
     307        my ($document_file) = ($document_file_full =~ /([^\/]+)$/);
     308        if (&obtain_document($self, $document_file_full, $document_file,
     309                     $documents_directory)) {
     310            $document_obtained = 1;
     311            &write_metadata_xml_file($self->{'documents_metadata_xml_file'},
     312                         $document_file, $record_metadata);
     313        }
     314        }
     315    }
     316
     317    # If there was a document obtained for this record we don't want the record as well
     318    if ($document_obtained) {
     319        return 0;
     320    }
     321    }
     322
     323    # Record was processed successfully (and there was no document obtained)
    239324    return 1;
    240325}
     
    243328sub parse_field_definition_table
    244329{
    245     local $fdtfilename = shift(@_);
    246 
    247     local %fdtmapping = ();
     330    my $fdtfilename = shift(@_);
     331
     332    my %fdtmapping = ();
    248333
    249334    open(FDT_FILE, "<$fdtfilename") || die "Error: Could not open file $fdtfilename.\n";
    250335
    251     local $amongstdefinitions = 0;
     336    my $amongstdefinitions = 0;
    252337    foreach $fdtfileline (<FDT_FILE>) {
    253338    $fdtfileline =~ s/(\s*)$//;  # Remove any nasty spaces at the end of the lines
    254339
    255340    if ($amongstdefinitions) {
    256         local $fieldtitle     = substr($fdtfileline,  0, 30);
    257         local $fieldsubfields = substr($fdtfileline, 30, 20);
    258         local $fieldspecs     = substr($fdtfileline, 50);
     341        my $fieldtitle     = substr($fdtfileline,  0, 30);
     342        my $fieldsubfields = substr($fdtfileline, 30, 20);
     343        my $fieldspecs     = substr($fdtfileline, 50);
    259344
    260345        # Remove extra spaces
     
    263348
    264349        # Map from tag number to metadata field title and subfields
    265         local $fieldtag = (split(/ /, $fieldspecs))[0];
     350        my $fieldtag = (split(/ /, $fieldspecs))[0];
    266351        $fdtmapping{$fieldtag} = { 'title' => $fieldtitle,
    267352                       'subfields' => $fieldsubfields };
     
    278363
    279364
     365sub obtain_document
     366{
     367    my $self = shift(@_);
     368    my $document_file_full = shift(@_);
     369    my $document_file_name = shift(@_);
     370    my $documents_directory = shift(@_);
     371    my $local_document_file = &util::filename_cat($documents_directory, $document_file_name);
     372
     373    my $outhandle = $self->{'outhandle'};
     374    print $outhandle "Obtaining document file $document_file_full...\n"
     375    if ($self->{'verbosity'} > 1);
     376
     377    # Document specified is on the web
     378    if ($document_file_full =~ /^http:/ || $document_file_full =~ /^ftp:/) {
     379    my $wget_options = "--quiet";
     380    $wget_options = "--verbose" if ($self->{'verbosity'} > 2);
     381    $wget_options .= " --timestamping";  # Only re-download files if they're newer
     382    `wget $wget_options $document_file_full --output-document $local_document_file`;
     383    }
     384    # Document specified is on the disk
     385    else {
     386    &util::cp($document_file_full, $documents_directory);
     387    }
     388
     389    # Check the document was obtained successfully
     390    if (!-e $local_document_file) {
     391    print STDERR "WARNING: Could not obtain document file $document_file_full\n";
     392    return 0;
     393    }
     394
     395    return 1;
     396}
     397
     398
     399sub begin_metadata_xml_file
     400{
     401    my $metadata_xml_file = shift(@_);
     402
     403    open(METADATA_XML_FILE, ">$metadata_xml_file");
     404    print METADATA_XML_FILE
     405    "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n" .
     406    "<!DOCTYPE DirectoryMetadata SYSTEM \"http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd\">\n" .
     407    "<DirectoryMetadata>\n";
     408    close(METADATA_XML_FILE);
     409}
     410
     411
     412sub write_metadata_xml_file
     413{
     414    my $metadata_xml_file = shift(@_);
     415    my $file_name = shift(@_);
     416    my $record_metadata = shift(@_);
     417
     418    # Make $file_name XML-safe
     419    $file_name =~ s/</&lt;/g;
     420    $file_name =~ s/>/&gt;/g;
     421
     422    open(METADATA_XML_FILE, ">>$metadata_xml_file");
     423
     424    print METADATA_XML_FILE
     425    "\n" .
     426        "  <FileSet>\n" .
     427    "    <FileName>$file_name</FileName>\n" .
     428    "    <Description>\n";
     429
     430    foreach my $pair (@$record_metadata) {
     431    my ($field, $value) = (@$pair);
     432
     433    # We're only interested in metadata from the database
     434    next if ($field eq "gsdlsourcefilename");
     435    next if ($field eq "gsdldoctype");
     436    next if ($field eq "Language");
     437    next if ($field eq "Encoding");
     438    next if ($field eq "Identifier");
     439    next if ($field eq "Source");
     440    next if ($field eq "SourceSegment");
     441    next if ($field eq "Plugin");
     442
     443    # Make $value XML-safe
     444    $value =~ s/</&lt;/g;
     445    $value =~ s/>/&gt;/g;
     446   
     447    print METADATA_XML_FILE "      <Metadata name=\"$field\">$value</Metadata>\n";
     448    }
     449
     450    print METADATA_XML_FILE
     451    "    </Description>\n" .
     452        "  </FileSet>\n";
     453
     454    close(METADATA_XML_FILE);
     455}
     456
     457
     458sub end_metadata_xml_file
     459{
     460    my $metadata_xml_file = shift(@_);
     461
     462    open(METADATA_XML_FILE, ">>$metadata_xml_file");
     463    print METADATA_XML_FILE "\n</DirectoryMetadata>\n";
     464    close(METADATA_XML_FILE);
     465}
     466
     467
    2804681;
  • trunk/gsdl/perllib/plugins/RecPlug.pm

    r7362 r7686  
    115115    'desc' => "{RecPlug.use_metadata_files}",
    116116    'type' => "flag",
     117    'reqd' => "no" },
     118      { 'name' => "recheck_directories",
     119    'desc' => "{RecPlug.recheck_directories}",
     120    'type' => "flag",
    117121    'reqd' => "no" } ];
    118122
     
    137141    if (!parsargv::parse(\@_,
    138142             q^use_metadata_files^, \$self->{'use_metadata_files'},
     143             q^recheck_directories^, \$self->{'recheck_directories'},
    139144             "allow_extra_options")) {
    140145    print STDERR "\nRecPlug uses an incorrect option.\n";
     
    232237    @dir = readdir (DIR);
    233238    closedir (DIR);
    234    
     239
     240    # Re-order the files in the list so any directories ending with .all are moved to the end
     241    for ($i = scalar(@dir) - 1; $i >= 0; $i--) {
     242    if (-d $dir[$i] && $dir[$i] =~ /\.all$/) {
     243        push(@dir, splice(@dir, $i, 1));
     244    }
     245    }
     246
    235247    # read XML metadata files (if supplied)
    236248    my $additionalmetadata = 0;      # is there extra metadata available?
     
    252264    # import each of the files in the directory
    253265    my $out_metadata;
    254     foreach $subfile (@dir) {
    255        
     266    my $num_files = scalar(@dir);
     267    for (my $i = 0; $i <= scalar(@dir); $i++) {
     268    # When every file in the directory has been done, pause for a moment (figuratively!)
     269    # If the -recheck_directories argument hasn't been provided, stop now (default)
     270    # Otherwise, re-read the contents of the directory to check for new files
     271    #   Any new files are added to the @dir list and are processed as normal
     272    #   This is necessary when documents to be indexed are specified in bibliographic DBs
     273    #   These files are copied/downloaded and stored in a new folder at import time
     274    if ($i == $num_files) {
     275        last unless $self->{'recheck_directories'};
     276
     277        # Re-read the files in the directory to see if there are any new files
     278        last if (!opendir (DIR, $dirname));
     279        my @dirnow = readdir (DIR);
     280        closedir (DIR);
     281
     282        # We're only interested if there are more files than there were before
     283        last if (scalar(@dirnow) <= scalar(@dir));
     284
     285        # Any new files are added to the end of @dir to get processed by the loop
     286        foreach my $subfilenow (@dirnow) {
     287        for ($j = 0; $j < $num_files; $j++) {
     288            last if ($subfilenow eq $dir[$j]);
     289        }
     290        if ($j == $num_files) {
     291            # New file
     292            push(@dir, $subfilenow);
     293        }
     294        }
     295
     296        # When the new files have been processed, check again
     297        $num_files = scalar(@dir);
     298    }
     299
     300    my $subfile = $dir[$i];
    256301    last if ($maxdocs != -1 && $count >= $maxdocs);
    257302    next if ($subfile =~ /^\.\.?$/);
     
    304349                 $out_metadata, $processor, $maxdocs, $gli);
    305350    }
    306     return $count;
    307    
     351
     352    return $count;   
    308353}
    309354
Note: See TracChangeset for help on using the changeset viewer.