Context Navigation

← Previous Changeset
Next Changeset →

Changeset 7686

Timestamp:

2004-07-01T14:48:55+12:00 (20 years ago)

Author:

mdewsnip

Message:

First cut at upgrading the CDS/ISIS plugin to obtain and index documents specified in the database (for the UNESCO contract).

Location:

trunk/gsdl/perllib/plugins

Files:

: 2 edited

ISISPlug.pm (modified) (14 diffs)
RecPlug.pm (modified) (5 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/perllib/plugins/ISISPlug.pm

-              r7049
+              r7686
 # University of Waikato, New Zealand.
+#
 # Copyright 1999-2003 New Zealand Digital Library Project
+# Copyright 1999-2004 New Zealand Digital Library Project
+#
 # This program is free software; you can redistribute it and/or modify
 …
 my $arguments =
+    [ { 'name' => "entry_separator",
+    'desc' => "{ISISPlug.entry_separator}",
+    'type' => "string",
+    'reqd' => "no",
+    'deft' => "<br>" },
+      { 'name' => "process_exp",
+    [ { 'name' => "process_exp",
     'desc' => "{BasPlug.process_exp}",
     'type' => "regexp",
 …
     'desc' => "{BasPlug.block_exp}",
     'type' => "regexp",
+    'reqd' => "no",
     'deft' => &get_default_block_exp() },
+      { 'name' => "split_exp",
+    'desc' => "{SplitPlug.split_exp}",
+    'type' => "regexp",
+    'reqd' => "no",
+    'deft' => &get_default_split_exp() },
+      # The interesting options
+      { 'name' => "entry_separator",
+    'desc' => "{ISISPlug.entry_separator}",
+    'type' => "string",
+    'reqd' => "no",
+    'deft' => "<br>" },
       { 'name' => "subfield_separator",
     'desc' => "{ISISPlug.subfield_separator}",
 …
     'reqd' => "no",
     'deft' => ", " },
+      { 'name' => "split_exp",
+    'desc' => "{SplitPlug.split_exp}",
+    'type' => "regexp",
+    'deft' => &get_default_split_exp(),
+    'reqd' => "no" }
+      { 'name' => "document_field",
+    'desc' => "{ISISPlug.document_field}",
+    'type' => "string",
+    'reqd' => "no",
+    'deft' => "" },
+      { 'name' => "document_prefix",
+    'desc' => "{ISISPlug.document_prefix}",
+    'type' => "string",
+    'reqd' => "no",
+    'deft' => "" },
+      { 'name' => "document_suffix",
+    'desc' => "{ISISPlug.document_suffix}",
+    'type' => "string",
+    'reqd' => "no",
+    'deft' => "" }
       ];
 …
              q^subfield_separator/.*/, ^, \$self->{'subfield_separator'},
              q^entry_separator/.*/<br>^, \$self->{'entry_separator'},
+             q^document_field/.*/^, \$self->{'document_field'},
+             q^document_prefix/.*/^, \$self->{'document_prefix'},
+             q^document_suffix/.*/^, \$self->{'document_suffix'},
              "allow_extra_options")) {
     print STDERR "\nIncorrect options passed to ISISPlug, check your collect.cfg configuration file\n";
 …
     return bless $self, $class;
+}
+# Read a file by delegating to SplitPlug::read, then finish off the
+# metadata.xml file written for any documents obtained from the database.
+#
+# Arguments (after $self) are passed through to SplitPlug::read unchanged:
+#   $pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs
+# Returns whatever SplitPlug::read returned.
+sub read
+{
+    my $self = shift(@_);
+    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
+
+    my $result = &SplitPlug::read($self, @_);
+
+    # The closing tag is only wanted when this plugin handled the file and the
+    # -document_field option is in use. Also make sure the metadata.xml file was
+    # actually started: appending to a file that was never begun would create a
+    # stray file containing nothing but the closing tag.
+    my $metadata_xml_file = $self->{'documents_metadata_xml_file'};
+    if ($file =~ /$self->{'process_exp'}/ && $self->{'document_field'}
+        && defined($metadata_xml_file) && -e $metadata_xml_file) {
+        &end_metadata_xml_file($metadata_xml_file);
+    }
+
+    return $result;
+}
 …
     $$textref =~ s/\ntag=(\d+) /\ntag=$fdtmapping{$1}{'title'} /g;
+    # Add a newline at the start so it is split properly
+    $$textref = "\n" . $$textref;
+    # Remove the line at the start so it is split and processed properly
+    $$textref =~ s/^----------\n//;
+    # Obtain the documents specified in the CDS/ISIS database, if requested
+    if ($self->{'document_field'}) {
+    # Create a directory to store the document files
+    $self->{'documents_directory'} = $databasename . ".all";
+    if (-e $self->{'documents_directory'}) {
+        &util::rm_r($self->{'documents_directory'});
+    }
+    &util::mk_dir($self->{'documents_directory'});
+    # ...and a metadata.xml file for the document metadata (extracted from the database)
+    $self->{'documents_metadata_xml_file'} = &util::filename_cat($self->{'documents_directory'}, "metadata.xml");
+    if (-e $self->{'documents_metadata_xml_file'}) {
+        &util::rm($self->{'documents_metadata_xml_file'});
+    }
+    &begin_metadata_xml_file($self->{'documents_metadata_xml_file'});
+    }
+}
 …
     foreach $line (split(/\n/, $$textref)) {
     $line =~ /^tag=(.+) data=(.+)$/;
     local $rawtagname = $1;
     local $rawtagdata = $2;
+    my $rawtagname = $1;
+    my $rawtagdata = $2;
     # print "Raw tag: $rawtagname, Raw data: $rawtagdata\n";
     # Metadata field names: title case, then remove spaces
     local $tagname = "";
+    my $tagname = "";
     foreach $word (split(/\s+/, $rawtagname)) {
         substr($word, 0, 1) =~ tr/a-z/A-Z/;
 …
     # Handle each piece of metadata ('%' separated)
     local $completetagvalue = "";
+    my $completetagvalue = "";
     foreach $rawtagvalue (split(/%/, $rawtagdata)) {
         $completetagvalue .= $entry_separator unless ($completetagvalue eq "");
         # Metadata field values: take care with subfields
         local $completeentryvalue = "";
+        my $completeentryvalue = "";
         while ($rawtagvalue ne "") {
         # If there is a subfield specifier, parse it off
         local $subfieldname = "";
+        my $subfieldname = "";
         if ($rawtagvalue =~ s/^\^([a-z])//) {
             $subfieldname = "." . $1;
 …
         # Parse the metadata value off
         $rawtagvalue =~ s/^([^\^]*)//;
         local $metadatafieldname = $tagname . $subfieldname;
         local $metadatafieldvalue = $1;
+        my $metadatafieldname = $tagname . $subfieldname;
+        my $metadatafieldvalue = $1;
         # print "Metadata: $metadatafieldname -> $metadatafieldvalue\n";
         # Handle Keywords specially
         if ($metadatafieldname eq "Keywords") {
             local $keywordmetadatavalue = $metadatafieldvalue;
             local $keywordlist = "";
+            my $keywordmetadatavalue = $metadatafieldvalue;
+            my $keywordlist = "";
             while ($keywordmetadatavalue =~ s/\<([^\>]+)\>//) {
             local $keyword = $1;
+            my $keyword = $1;
             $doc_obj->add_utf8_metadata($cursection, $metadatafieldname, $keyword);
             $keywordlist .= ", " unless ($keywordlist eq "");
 …
     $doc_obj->add_utf8_text ($cursection, $$textref);
+    # Document was processed successfully
+    # Obtain the documents specified in the CDS/ISIS database, if requested
+    if ($self->{'document_field'}) {
+    my $document_field = $self->{'document_field'};
+    my $document_prefix = $self->{'document_prefix'} || "";
+    my $document_suffix = $self->{'document_suffix'} || "";
+    my $documents_directory = $self->{'documents_directory'};
+    my $document_obtained = 0;
+    # Look at all the metadata assigned to this record
+    my $record_metadata = $doc_obj->get_all_metadata($cursection);
+    foreach my $pair (@$record_metadata) {
+        my ($field, $value) = (@$pair);
+        # Does this metadata element specify a document to obtain?
+        if ($field eq $document_field) {
+        my $document_file_full = $document_prefix . $value . $document_suffix;
+        my ($document_file) = ($document_file_full =~ /([^\/]+)$/);
+        if (&obtain_document($self, $document_file_full, $document_file,
+                     $documents_directory)) {
+            $document_obtained = 1;
+            &write_metadata_xml_file($self->{'documents_metadata_xml_file'},
+                         $document_file, $record_metadata);
+        }
+        }
+    }
+    # If there was a document obtained for this record we don't want the record as well
+    if ($document_obtained) {
+        return 0;
+    }
+    }
+    # Record was processed successfully (and there was no document obtained)
     return 1;
+}
 …
 sub parse_field_definition_table
+{
     local $fdtfilename = shift(@_);
     local %fdtmapping = ();
+    my $fdtfilename = shift(@_);
+    my %fdtmapping = ();
     open(FDT_FILE, "<$fdtfilename") || die "Error: Could not open file $fdtfilename.\n";
     local $amongstdefinitions = 0;
+    my $amongstdefinitions = 0;
     foreach $fdtfileline (<FDT_FILE>) {
     $fdtfileline =~ s/(\s*)$//;  # Remove any nasty spaces at the end of the lines
     if ($amongstdefinitions) {
         local $fieldtitle     = substr($fdtfileline,  0, 30);
         local $fieldsubfields = substr($fdtfileline, 30, 20);
         local $fieldspecs     = substr($fdtfileline, 50);
+        my $fieldtitle     = substr($fdtfileline,  0, 30);
+        my $fieldsubfields = substr($fdtfileline, 30, 20);
+        my $fieldspecs     = substr($fdtfileline, 50);
         # Remove extra spaces
 …
         # Map from tag number to metadata field title and subfields
         local $fieldtag = (split(/ /, $fieldspecs))[0];
+        my $fieldtag = (split(/ /, $fieldspecs))[0];
         $fdtmapping{$fieldtag} = { 'title' => $fieldtitle,
                        'subfields' => $fieldsubfields };
 …
+# Obtain one document named in the database and store it in the documents
+# directory, either by downloading it with wget or by copying it from disk.
+#
+#   $self                - the plugin object ('outhandle' and 'verbosity' are read)
+#   $document_file_full  - full location of the document (URL or path on disk)
+#   $document_file_name  - file name to store the document under
+#   $documents_directory - directory the document is stored in
+#
+# Returns 1 if the document is present in $documents_directory afterwards,
+# 0 (after printing a warning to STDERR) otherwise.
+sub obtain_document
+{
+    my $self = shift(@_);
+    my $document_file_full = shift(@_);
+    my $document_file_name = shift(@_);
+    my $documents_directory = shift(@_);
+
+    # Without a file name there is nowhere sensible to store the document
+    # (this happens when the location ends with a '/')
+    if (!defined($document_file_name) || $document_file_name eq "") {
+        print STDERR "WARNING: Could not obtain document file $document_file_full\n";
+        return 0;
+    }
+
+    my $local_document_file = &util::filename_cat($documents_directory, $document_file_name);
+
+    my $outhandle = $self->{'outhandle'};
+    print $outhandle "Obtaining document file $document_file_full...\n"
+        if ($self->{'verbosity'} > 1);
+
+    # Document specified is on the web
+    if ($document_file_full =~ /^(?:https?|ftp):/) {
+        my @wget_options = ($self->{'verbosity'} > 2) ? ("--verbose") : ("--quiet");
+        push(@wget_options, "--timestamping");  # Only re-download files if they're newer
+
+        # The location comes straight from the database, so it must never be
+        # seen by a shell: the list form of system() runs wget directly
+        my $status = system("wget", @wget_options, $document_file_full,
+                            "--output-document", $local_document_file);
+
+        # wget can leave an output file behind even when the download fails,
+        # so remove it to stop the check below reporting success
+        if ($status != 0 && -e $local_document_file) {
+            unlink($local_document_file);
+        }
+    }
+    # Document specified is on the disk
+    else {
+        &util::cp($document_file_full, $documents_directory);
+    }
+
+    # Check the document was obtained successfully
+    if (!-e $local_document_file) {
+        print STDERR "WARNING: Could not obtain document file $document_file_full\n";
+        return 0;
+    }
+
+    return 1;
+}
+# Create (or overwrite) a metadata.xml file and write the XML declaration,
+# the DOCTYPE and the opening <DirectoryMetadata> tag to it.
+#
+#   $metadata_xml_file - path of the metadata.xml file to start
+#
+# Returns true on success; prints a warning to STDERR and returns 0 if the
+# file could not be opened.
+sub begin_metadata_xml_file
+{
+    my $metadata_xml_file = shift(@_);
+
+    # Three-argument open with a lexical handle: the file name can never be
+    # mistaken for an open mode, and no global handle is shared between subs
+    my $metadata_xml_handle;
+    if (!open($metadata_xml_handle, ">", $metadata_xml_file)) {
+        print STDERR "WARNING: Could not create metadata file $metadata_xml_file: $!\n";
+        return 0;
+    }
+
+    print $metadata_xml_handle
+        "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n" .
+        "<!DOCTYPE DirectoryMetadata SYSTEM \"http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd\">\n" .
+        "<DirectoryMetadata>\n";
+
+    return close($metadata_xml_handle);
+}
+# Append one <FileSet> element to a metadata.xml file, assigning a record's
+# database metadata to an obtained document.
+#
+#   $metadata_xml_file - path of the metadata.xml file to append to
+#   $file_name         - name of the document file the metadata applies to
+#   $record_metadata   - reference to a list of [field, value] pairs
+#
+# Returns true on success; prints a warning to STDERR and returns 0 if the
+# file could not be opened.
+sub write_metadata_xml_file
+{
+    my $metadata_xml_file = shift(@_);
+    my $file_name = shift(@_);
+    my $record_metadata = shift(@_);
+
+    # We're only interested in metadata from the database, so these fields
+    # are left out of the file
+    my %skipped_field = map { $_ => 1 }
+        ("gsdlsourcefilename", "gsdldoctype", "Language", "Encoding",
+         "Identifier", "Source", "SourceSegment", "Plugin");
+
+    # Make $file_name XML-safe ('&' must be done first, otherwise the
+    # ampersands of the entities added afterwards would be escaped again)
+    $file_name =~ s/&/&amp;/g;
+    $file_name =~ s/</&lt;/g;
+    $file_name =~ s/>/&gt;/g;
+
+    my $metadata_xml_handle;
+    if (!open($metadata_xml_handle, ">>", $metadata_xml_file)) {
+        print STDERR "WARNING: Could not append to metadata file $metadata_xml_file: $!\n";
+        return 0;
+    }
+
+    print $metadata_xml_handle
+        "\n" .
+        "  <FileSet>\n" .
+        "    <FileName>$file_name</FileName>\n" .
+        "    <Description>\n";
+
+    foreach my $pair (@$record_metadata) {
+        # Work on copies so the caller's metadata list is left untouched
+        my ($field, $value) = (@$pair);
+        next if ($skipped_field{$field});
+
+        # Make $field safe for use inside a double-quoted attribute
+        $field =~ s/&/&amp;/g;
+        $field =~ s/</&lt;/g;
+        $field =~ s/>/&gt;/g;
+        $field =~ s/"/&quot;/g;
+
+        # Make $value XML-safe
+        $value =~ s/&/&amp;/g;
+        $value =~ s/</&lt;/g;
+        $value =~ s/>/&gt;/g;
+
+        print $metadata_xml_handle "      <Metadata name=\"$field\">$value</Metadata>\n";
+    }
+
+    print $metadata_xml_handle
+        "    </Description>\n" .
+        "  </FileSet>\n";
+
+    return close($metadata_xml_handle);
+}
+# Finish a metadata.xml file by appending the closing </DirectoryMetadata> tag.
+#
+#   $metadata_xml_file - path of the metadata.xml file to finish
+#
+# Returns true on success; prints a warning to STDERR and returns 0 if the
+# file could not be opened.
+sub end_metadata_xml_file
+{
+    my $metadata_xml_file = shift(@_);
+
+    # Three-argument open with a lexical handle, checked so that a failure
+    # is reported instead of silently leaving the file unterminated
+    my $metadata_xml_handle;
+    if (!open($metadata_xml_handle, ">>", $metadata_xml_file)) {
+        print STDERR "WARNING: Could not append to metadata file $metadata_xml_file: $!\n";
+        return 0;
+    }
+
+    print $metadata_xml_handle "\n</DirectoryMetadata>\n";
+
+    return close($metadata_xml_handle);
+}
 ;

trunk/gsdl/perllib/plugins/RecPlug.pm

-              r7362
+              r7686
     'desc' => "{RecPlug.use_metadata_files}",
     'type' => "flag",
+    'reqd' => "no" },
+      { 'name' => "recheck_directories",
+    'desc' => "{RecPlug.recheck_directories}",
+    'type' => "flag",
     'reqd' => "no" } ];
 …
     if (!parsargv::parse(\@_,
              q^use_metadata_files^, \$self->{'use_metadata_files'},
+             q^recheck_directories^, \$self->{'recheck_directories'},
              "allow_extra_options")) {
     print STDERR "\nRecPlug uses an incorrect option.\n";
 …
     @dir = readdir (DIR);
     closedir (DIR);
+    # Re-order the files in the list so any directories ending with .all are moved to the end
+    for ($i = scalar(@dir) - 1; $i >= 0; $i--) {
+    if (-d $dir[$i] && $dir[$i] =~ /\.all$/) {
+        push(@dir, splice(@dir, $i, 1));
+    }
+    }
     # read XML metadata files (if supplied)
     my $additionalmetadata = 0;      # is there extra metadata available?
 …
     # import each of the files in the directory
     my $out_metadata;
+    foreach $subfile (@dir) {
+    my $num_files = scalar(@dir);
+    for (my $i = 0; $i <= scalar(@dir); $i++) {
+    # When every file in the directory has been done, pause for a moment (figuratively!)
+    # If the -recheck_directories argument hasn't been provided, stop now (default)
+    # Otherwise, re-read the contents of the directory to check for new files
+    #   Any new files are added to the @dir list and are processed as normal
+    #   This is necessary when documents to be indexed are specified in bibliographic DBs
+    #   These files are copied/downloaded and stored in a new folder at import time
+    if ($i == $num_files) {
+        last unless $self->{'recheck_directories'};
+        # Re-read the files in the directory to see if there are any new files
+        last if (!opendir (DIR, $dirname));
+        my @dirnow = readdir (DIR);
+        closedir (DIR);
+        # We're only interested if there are more files than there were before
+        last if (scalar(@dirnow) <= scalar(@dir));
+        # Any new files are added to the end of @dir to get processed by the loop
+        foreach my $subfilenow (@dirnow) {
+        for ($j = 0; $j < $num_files; $j++) {
+            last if ($subfilenow eq $dir[$j]);
+        }
+        if ($j == $num_files) {
+            # New file
+            push(@dir, $subfilenow);
+        }
+        }
+        # When the new files have been processed, check again
+        $num_files = scalar(@dir);
+    }
+    my $subfile = $dir[$i];
     last if ($maxdocs != -1 && $count >= $maxdocs);
     next if ($subfile =~ /^\.\.?$/);
 …
                  $out_metadata, $processor, $maxdocs, $gli);
+    }
+    return $count;
+    return $count;
+}

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 7686

Legend:

trunk/gsdl/perllib/plugins/ISISPlug.pm

trunk/gsdl/perllib/plugins/RecPlug.pm

Download in other formats: