Changeset 12610 for trunk/gsdl


Ignore:
Timestamp:
2006-08-30T15:05:30+12:00 (18 years ago)
Author:
mdewsnip
Message:

Essentially a brand-new plugin (the old CSVPlug has been renamed to MetadataCSVPlug). This plugin uses SplitPlug to split each CSV file into lines and creates a new document for each line, with metadata assigned from that line's values. The first line of the CSV file must contain the metadata element names.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/CSVPlug.pm

    r12169 r12610  
    11###########################################################################
    22#
    3 # CSVPlug.pm -- A plugin for metadata in comma-separated value format
     3# CSVPlug.pm -- A plugin for files in comma-separated value format
    44#
    55# A component of the Greenstone digital library software
     
    2828
    2929
    30 use BasPlug;
     30use SplitPlug;
    3131use strict;
     32no strict 'refs'; # allow filehandles to be variables and vice versa
    3233
    3334
     35# CSVPlug is a sub-class of SplitPlug.
    3436sub BEGIN {
    35     @CSVPlug::ISA = ('BasPlug');
     37    @CSVPlug::ISA = ('SplitPlug');
    3638}
    3739
    3840
    39 my $arguments =
    40     [ { 'name' => "block_exp",
    41     'desc' => "{BasPlug.block_exp}",
     41my $arguments = 
     42    [ { 'name' => "process_exp",
     43    'desc' => "{BasPlug.process_exp}",
    4244    'type' => "regexp",
    4345    'reqd' => "no",
    44     'deft' => &get_default_block_exp() } ];
     46    'deft' => &get_default_process_exp() },
     47      { 'name' => "split_exp",
     48    'desc' => "{SplitPlug.split_exp}",
     49    'type' => "regexp",
     50    'reqd' => "no",
     51    'deft' => &get_default_split_exp(),
     52        'hiddengli' => "yes" }
     53      ];
    4554
    4655
     
    4958        'abstract' => "no",
    5059        'inherits' => "yes",
     60        'explodes' => "yes",
    5161        'args'     => $arguments };
     62
     63
     64# This plugin processes files with the suffix ".csv"
     65sub get_default_process_exp {
     66    return q^(?i)(\.csv)$^;
     67}
     68
     69   
     70# This plugin splits the input text by line
     71sub get_default_split_exp {
     72    return q^\r?\n^;
     73}
    5274
    5375
     
    5880    push(@$pluginlist, $class);
    5981
    60     if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
    61     if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
     82    if (defined $arguments) { push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});}
     83    if (defined $options) { push(@{$hashArgOptLists->{"OptList"}}, $options)};
    6284
    63     my $self = new BasPlug($pluginlist, $inputargs, $hashArgOptLists);
     85    my $self = new SplitPlug($pluginlist, $inputargs, $hashArgOptLists);
    6486
    6587    return bless $self, $class;
     
    6789
    6890
    69 # Not used, just here to prevent a warning
    70 sub get_default_process_exp
    71 {
    72     return q^(?i)\.csv$^;
    73 }
    74 
    75 
    76 # We don't want any other plugins to see .csv files
    77 sub get_default_block_exp
    78 {
    79     return q^(?i)\.csv$^;
    80 }
    81 
    82 
    83 sub metadata_read
     91sub read_file
    8492{
    8593    my $self = shift (@_);
    86     my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
     94    my ($filename, $encoding, $language, $textref) = @_;
     95    my $outhandle = $self->{'outhandle'};
    8796
    88     # Read metadata from CSV files
    89     my $filename = &util::filename_cat($base_dir, $file);
    90     if ($filename !~ /\.csv$/ || !-f $filename) {
    91     return undef;
    92     }
    93     print STDERR "\n<Processing n='$file' p='CSVPlug'>\n" if ($gli);
    94     print STDERR "CSVPlug: processing $file\n" if ($self->{'verbosity'}) > 1;
     97    # Read the CSV file content
     98    open(FILE, $filename);
     99    my $reader = new multiread();
     100    $reader->set_handle('CSVPlug::FILE');
     101    $reader->set_encoding($encoding);
     102    $reader->read_file($textref);
     103    close(FILE);
    95104
    96     # Read the CSV file to get the metadata
    97     my $csv_file_content;
    98     open(CSV_FILE, "$filename");
    99     my $csv_file_reader = new multiread();
    100     $csv_file_reader->set_handle('CSVPlug::CSV_FILE');
    101     $csv_file_reader->read_file(\$csv_file_content);
    102     close(CSV_FILE);
     105    # Remove any blank lines so the data is split and processed properly
     106    $$textref =~ s/\n(\s*)\n/\n/g;
    103107
    104     # Split the file into lines and read the first line (contains the metadata names)
    105     $csv_file_content =~ s/\n//g;
    106     my @csv_file_lines = split(/\r/, $csv_file_content);
    107     my $csv_file_field_line = shift(@csv_file_lines);
     108    # The first line contains the metadata element names
     109    $$textref =~ s/^(.*?)\r?\n//;
     110    my $csv_file_field_line = $1;
    108111    my @csv_file_fields = split(/\,/, $csv_file_field_line);
    109112    for (my $i = 0; $i < scalar(@csv_file_fields); $i++) {
     
    111114    $csv_file_fields[$i] =~ s/ //g;
    112115    }
     116    $self->{'csv_file_fields'} = \@csv_file_fields;
     117}
    113118
    114     # Read each line of the file and assign the metadata appropriately
    115     foreach my $csv_line (@csv_file_lines) {
    116     # Ignore lines containing only whitespace
    117     next if ($csv_line =~ /^\s*$/);
    118119
    119     # Build a hash of metadata name to metadata value for this line
    120     my %csv_line_metadata;
    121     my $i = 0;
    122     $csv_line .= ",";  # To make the regular expressions simpler
    123     while ($csv_line ne "") {
    124         # Metadata values containing commas are quoted
    125         if ($csv_line =~ s/^\"(.*?)\"\,//) {
    126         # Only bother with non-empty values
    127         if ($1 ne "" && defined($csv_file_fields[$i])) {
    128             $csv_line_metadata{$csv_file_fields[$i]} = $1;
    129         }
     120sub process
     121{
     122    my $self = shift (@_);
     123    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
     124    my $outhandle = $self->{'outhandle'};
     125
     126    my $section = $doc_obj->get_top_section();
     127    my $csv_line = $$textref;
     128    my @csv_file_fields = @{$self->{'csv_file_fields'}};
     129
     130    # Report that we're processing the file
     131    print STDERR "\n<Processing n='$file' p='CSVPlug'>\n" if ($gli);
     132    print $outhandle "CSVPlug: processing $file\n" if ($self->{'verbosity'}) > 1;
     133
     134    # Add the raw line as the document text
     135    $doc_obj->add_utf8_text($section, $csv_line);
     136
     137    # Build a hash of metadata name to metadata value for this line
     138    my $i = 0;
     139    $csv_line .= ",";  # To make the regular expressions simpler
     140    while ($csv_line ne "") {
     141    # Metadata values containing commas are quoted
     142    if ($csv_line =~ s/^\"(.*?)\"\,//) {
     143        # Only bother with non-empty values
     144        if ($1 ne "" && defined($csv_file_fields[$i])) {
     145        $doc_obj->add_utf8_metadata($section, $csv_file_fields[$i], $1);
    130146        }
    131         # Normal comma-separated case
    132         elsif ($csv_line =~ s/^(.*?)\,//) {
    133         # Only bother with non-empty values
    134         if ($1 ne "" && defined($csv_file_fields[$i])) {
    135             $csv_line_metadata{$csv_file_fields[$i]} = $1;
    136         }
     147    }
     148    # Normal comma-separated case
     149    elsif ($csv_line =~ s/^(.*?)\,//) {
     150        # Only bother with non-empty values
     151        if ($1 ne "" && defined($csv_file_fields[$i])) {
     152        $doc_obj->add_utf8_metadata($section, $csv_file_fields[$i], $1);
    137153        }
    138         # The line must be formatted incorrectly
    139         else {
    140         print STDERR "Error: Badly formatted CSV line: $csv_line.\n";
    141         last;
    142         }
    143 
    144         $i++;
     154    }
     155    # The line must be formatted incorrectly
     156    else {
     157        print STDERR "Error: Badly formatted CSV line: $csv_line.\n";
     158        last;
    145159    }
    146160
    147     # We can't associate any metadata without knowing the file to associate it with
    148     my $csv_line_filename = $csv_line_metadata{"Filename"};
    149     if (!defined($csv_line_filename)) {
    150         print STDERR "Error: No Filename metadata in CSV line: $csv_line\n";
    151         next;
    152     }
    153     delete $csv_line_metadata{"Filename"};
     161    $i++;
     162    }
    154163
    155     # Associate the metadata now
    156     $extrametadata->{$csv_line_filename} = \%csv_line_metadata;
    157     push(@$extrametakeys, $csv_line_filename);
    158     }
     164    # Record was processed successfully
     165    return 1;
    159166}
    160167
Note: See TracChangeset for help on using the changeset viewer.