Context Navigation

← Previous Change
Next Change →

Changeset 12610 for trunk/gsdl

Timestamp:

2006-08-30T15:05:30+12:00 (18 years ago)

Author:

mdewsnip

Message:

Essentially a brand-new plugin (the old CSVPlug has been renamed to MetadataCSVPlug). This plugin uses SplitPlug to split CSV files into lines, and creates a new document for each line, with the metadata specified. The first line of the CSV file must contain the metadata element names.

File:

: 1 edited

trunk/gsdl/perllib/plugins/CSVPlug.pm (modified) (6 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/perllib/plugins/CSVPlug.pm

-              r12169
+              r12610
 ###########################################################################
+#
 # CSVPlug.pm -- A plugin for metadata in comma-separated value format
+# CSVPlug.pm -- A plugin for files in comma-separated value format
+#
 # A component of the Greenstone digital library software
 …
 use BasPlug;
+use SplitPlug;
 use strict;
+no strict 'refs'; # allow filehandles to be variables and vice versa
+# CSVPlug is a sub-class of SplitPlug.
 sub BEGIN {
     @CSVPlug::ISA = ('BasPlug');
+    @CSVPlug::ISA = ('SplitPlug');
+}
 my $arguments =
     [ { 'name' => "block_exp",
     'desc' => "{BasPlug.block_exp}",
+my $arguments =
+    [ { 'name' => "process_exp",
+    'desc' => "{BasPlug.process_exp}",
     'type' => "regexp",
     'reqd' => "no",
+    'deft' => &get_default_block_exp() } ];
+    'deft' => &get_default_process_exp() },
+      { 'name' => "split_exp",
+    'desc' => "{SplitPlug.split_exp}",
+    'type' => "regexp",
+    'reqd' => "no",
+    'deft' => &get_default_split_exp(),
+        'hiddengli' => "yes" }
+      ];
 …
         'abstract' => "no",
         'inherits' => "yes",
+        'explodes' => "yes",
         'args'     => $arguments };
+# This plugin processes files with the suffix ".csv"
+sub get_default_process_exp {
+    return q^(?i)(\.csv)$^;
+}
+# This plugin splits the input text by line
+sub get_default_split_exp {
+    return q^\r?\n^;
+}
 …
     push(@$pluginlist, $class);
     if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
     if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
+    if (defined $arguments) { push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});}
+    if (defined $options) { push(@{$hashArgOptLists->{"OptList"}}, $options)};
     my $self = new BasPlug($pluginlist, $inputargs, $hashArgOptLists);
+    my $self = new SplitPlug($pluginlist, $inputargs, $hashArgOptLists);
     return bless $self, $class;
 …
+# Not used, just here to prevent a warning
+sub get_default_process_exp
+{
+    return q^(?i)\.csv$^;
+}
+# We don't want any other plugins to see .csv files
+sub get_default_block_exp
+{
+    return q^(?i)\.csv$^;
+}
+sub metadata_read
+sub read_file
+{
     my $self = shift (@_);
+    my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
+    my ($filename, $encoding, $language, $textref) = @_;
+    my $outhandle = $self->{'outhandle'};
     # Read metadata from CSV files
     my $filename = &util::filename_cat($base_dir, $file);
     if ($filename !~ /\.csv$/ || !-f $filename) {
     return undef;
+    }
     print STDERR "\n<Processing n='$file' p='CSVPlug'>\n" if ($gli);
     print STDERR "CSVPlug: processing $file\n" if ($self->{'verbosity'}) > 1;
+    # Read the CSV file content
+    open(FILE, $filename);
+    my $reader = new multiread();
+    $reader->set_handle('CSVPlug::FILE');
+    $reader->set_encoding($encoding);
+    $reader->read_file($textref);
+    close(FILE);
+    # Read the CSV file to get the metadata
+    my $csv_file_content;
+    open(CSV_FILE, "$filename");
+    my $csv_file_reader = new multiread();
+    $csv_file_reader->set_handle('CSVPlug::CSV_FILE');
+    $csv_file_reader->read_file(\$csv_file_content);
+    close(CSV_FILE);
+    # Remove any blank lines so the data is split and processed properly
+    $$textref =~ s/\n(\s*)\n/\n/g;
+    # Split the file into lines and read the first line (contains the metadata names)
+    $csv_file_content =~ s/\n//g;
+    my @csv_file_lines = split(/\r/, $csv_file_content);
+    my $csv_file_field_line = shift(@csv_file_lines);
+    # The first line contains the metadata element names
+    $$textref =~ s/^(.*?)\r?\n//;
+    my $csv_file_field_line = $1;
     my @csv_file_fields = split(/\,/, $csv_file_field_line);
     for (my $i = 0; $i < scalar(@csv_file_fields); $i++) {
 …
     $csv_file_fields[$i] =~ s/ //g;
+    }
+    $self->{'csv_file_fields'} = \@csv_file_fields;
+}
-    # Read each line of the file and assign the metadata appropriately
-    foreach my $csv_line (@csv_file_lines) {
-    # Ignore lines containing only whitespace
-    next if ($csv_line =~ /^\s*$/);
+    # Build a hash of metadata name to metadata value for this line
+    my %csv_line_metadata;
+    my $i = 0;
+    $csv_line .= ",";  # To make the regular expressions simpler
+    while ($csv_line ne "") {
+        # Metadata values containing commas are quoted
+        if ($csv_line =~ s/^\"(.*?)\"\,//) {
+        # Only bother with non-empty values
+        if ($1 ne "" && defined($csv_file_fields[$i])) {
+            $csv_line_metadata{$csv_file_fields[$i]} = $1;
+        }
+sub process
+{
+    my $self = shift (@_);
+    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
+    my $outhandle = $self->{'outhandle'};
+    my $section = $doc_obj->get_top_section();
+    my $csv_line = $$textref;
+    my @csv_file_fields = @{$self->{'csv_file_fields'}};
+    # Report that we're processing the file
+    print STDERR "\n<Processing n='$file' p='CSVPlug'>\n" if ($gli);
+    print $outhandle "CSVPlug: processing $file\n" if ($self->{'verbosity'}) > 1;
+    # Add the raw line as the document text
+    $doc_obj->add_utf8_text($section, $csv_line);
+    # Build a hash of metadata name to metadata value for this line
+    my $i = 0;
+    $csv_line .= ",";  # To make the regular expressions simpler
+    while ($csv_line ne "") {
+    # Metadata values containing commas are quoted
+    if ($csv_line =~ s/^\"(.*?)\"\,//) {
+        # Only bother with non-empty values
+        if ($1 ne "" && defined($csv_file_fields[$i])) {
+        $doc_obj->add_utf8_metadata($section, $csv_file_fields[$i], $1);
+        }
         # Normal comma-separated case
         elsif ($csv_line =~ s/^(.*?)\,//) {
         # Only bother with non-empty values
         if ($1 ne "" && defined($csv_file_fields[$i])) {
             $csv_line_metadata{$csv_file_fields[$i]} = $1;
+        }
+    }
+    # Normal comma-separated case
+    elsif ($csv_line =~ s/^(.*?)\,//) {
+        # Only bother with non-empty values
+        if ($1 ne "" && defined($csv_file_fields[$i])) {
+        $doc_obj->add_utf8_metadata($section, $csv_file_fields[$i], $1);
+        }
+        # The line must be formatted incorrectly
+        else {
+        print STDERR "Error: Badly formatted CSV line: $csv_line.\n";
+        last;
+        }
+        $i++;
+    }
+    # The line must be formatted incorrectly
+    else {
+        print STDERR "Error: Badly formatted CSV line: $csv_line.\n";
+        last;
+    }
+    # We can't associate any metadata without knowing the file to associate it with
+    my $csv_line_filename = $csv_line_metadata{"Filename"};
+    if (!defined($csv_line_filename)) {
+        print STDERR "Error: No Filename metadata in CSV line: $csv_line\n";
+        next;
+    }
+    delete $csv_line_metadata{"Filename"};
+    $i++;
+    }
+    # Associate the metadata now
+    $extrametadata->{$csv_line_filename} = \%csv_line_metadata;
+    push(@$extrametakeys, $csv_line_filename);
+    }
+    # Record was processed successfully
+    return 1;
+}

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 12610 for trunk/gsdl

Legend:

trunk/gsdl/perllib/plugins/CSVPlug.pm

Download in other formats: