Changeset 12610 for trunk/gsdl
- Timestamp:
- 2006-08-30T15:05:30+12:00 (18 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/CSVPlug.pm
r12169 r12610 1 1 ########################################################################### 2 2 # 3 # CSVPlug.pm -- A plugin for metadatain comma-separated value format3 # CSVPlug.pm -- A plugin for files in comma-separated value format 4 4 # 5 5 # A component of the Greenstone digital library software … … 28 28 29 29 30 use BasPlug;30 use SplitPlug; 31 31 use strict; 32 no strict 'refs'; # allow filehandles to be variables and viceversa 32 33 33 34 35 # CSVPlug is a sub-class of SplitPlug. 34 36 sub BEGIN { 35 @CSVPlug::ISA = (' BasPlug');37 @CSVPlug::ISA = ('SplitPlug'); 36 38 } 37 39 38 40 39 my $arguments = 40 [ { 'name' => " block_exp",41 'desc' => "{BasPlug. block_exp}",41 my $arguments = 42 [ { 'name' => "process_exp", 43 'desc' => "{BasPlug.process_exp}", 42 44 'type' => "regexp", 43 45 'reqd' => "no", 44 'deft' => &get_default_block_exp() } ]; 46 'deft' => &get_default_process_exp() }, 47 { 'name' => "split_exp", 48 'desc' => "{SplitPlug.split_exp}", 49 'type' => "regexp", 50 'reqd' => "no", 51 'deft' => &get_default_split_exp(), 52 'hiddengli' => "yes" } 53 ]; 45 54 46 55 … … 49 58 'abstract' => "no", 50 59 'inherits' => "yes", 60 'explodes' => "yes", 51 61 'args' => $arguments }; 62 63 64 # This plugin processes files with the suffix ".csv" 65 sub get_default_process_exp { 66 return q^(?i)(\.csv)$^; 67 } 68 69 70 # This plugin splits the input text by line 71 sub get_default_split_exp { 72 return q^\r?\n^; 73 } 52 74 53 75 … … 58 80 push(@$pluginlist, $class); 59 81 60 if (defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}61 if (defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};82 if (defined $arguments) { push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});} 83 if (defined $options) { push(@{$hashArgOptLists->{"OptList"}}, $options)}; 62 84 63 my $self = new BasPlug($pluginlist, $inputargs, $hashArgOptLists);85 my $self = new SplitPlug($pluginlist, $inputargs, $hashArgOptLists); 64 86 65 87 return bless 
$self, $class; … … 67 89 68 90 69 # Not used, just here to prevent a warning 70 sub get_default_process_exp 71 { 72 return q^(?i)\.csv$^; 73 } 74 75 76 # We don't want any other plugins to see .csv files 77 sub get_default_block_exp 78 { 79 return q^(?i)\.csv$^; 80 } 81 82 83 sub metadata_read 91 sub read_file 84 92 { 85 93 my $self = shift (@_); 86 my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_; 94 my ($filename, $encoding, $language, $textref) = @_; 95 my $outhandle = $self->{'outhandle'}; 87 96 88 # Read metadata from CSV files89 my $filename = &util::filename_cat($base_dir, $file);90 if ($filename !~ /\.csv$/ || !-f $filename) {91 return undef;92 }93 print STDERR "\n<Processing n='$file' p='CSVPlug'>\n" if ($gli);94 print STDERR "CSVPlug: processing $file\n" if ($self->{'verbosity'}) > 1;97 # Read the CSV file content 98 open(FILE, $filename); 99 my $reader = new multiread(); 100 $reader->set_handle('CSVPlug::FILE'); 101 $reader->set_encoding($encoding); 102 $reader->read_file($textref); 103 close(FILE); 95 104 96 # Read the CSV file to get the metadata 97 my $csv_file_content; 98 open(CSV_FILE, "$filename"); 99 my $csv_file_reader = new multiread(); 100 $csv_file_reader->set_handle('CSVPlug::CSV_FILE'); 101 $csv_file_reader->read_file(\$csv_file_content); 102 close(CSV_FILE); 105 # Remove any blank lines so the data is split and processed properly 106 $$textref =~ s/\n(\s*)\n/\n/g; 103 107 104 # Split the file into lines and read the first line (contains the metadata names) 105 $csv_file_content =~ s/\n//g; 106 my @csv_file_lines = split(/\r/, $csv_file_content); 107 my $csv_file_field_line = shift(@csv_file_lines); 108 # The first line contains the metadata element names 109 $$textref =~ s/^(.*?)\r?\n//; 110 my $csv_file_field_line = $1; 108 111 my @csv_file_fields = split(/\,/, $csv_file_field_line); 109 112 for (my $i = 0; $i < scalar(@csv_file_fields); $i++) { … … 111 114 $csv_file_fields[$i] =~ 
s/ //g; 112 115 } 116 $self->{'csv_file_fields'} = \@csv_file_fields; 117 } 113 118 114 # Read each line of the file and assign the metadata appropriately115 foreach my $csv_line (@csv_file_lines) {116 # Ignore lines containing only whitespace117 next if ($csv_line =~ /^\s*$/);118 119 119 # Build a hash of metadata name to metadata value for this line 120 my %csv_line_metadata; 121 my $i = 0; 122 $csv_line .= ","; # To make the regular expressions simpler 123 while ($csv_line ne "") { 124 # Metadata values containing commas are quoted 125 if ($csv_line =~ s/^\"(.*?)\"\,//) { 126 # Only bother with non-empty values 127 if ($1 ne "" && defined($csv_file_fields[$i])) { 128 $csv_line_metadata{$csv_file_fields[$i]} = $1; 129 } 120 sub process 121 { 122 my $self = shift (@_); 123 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_; 124 my $outhandle = $self->{'outhandle'}; 125 126 my $section = $doc_obj->get_top_section(); 127 my $csv_line = $$textref; 128 my @csv_file_fields = @{$self->{'csv_file_fields'}}; 129 130 # Report that we're processing the file 131 print STDERR "\n<Processing n='$file' p='CSVPlug'>\n" if ($gli); 132 print $outhandle "CSVPlug: processing $file\n" if ($self->{'verbosity'}) > 1; 133 134 # Add the raw line as the document text 135 $doc_obj->add_utf8_text($section, $csv_line); 136 137 # Build a hash of metadata name to metadata value for this line 138 my $i = 0; 139 $csv_line .= ","; # To make the regular expressions simpler 140 while ($csv_line ne "") { 141 # Metadata values containing commas are quoted 142 if ($csv_line =~ s/^\"(.*?)\"\,//) { 143 # Only bother with non-empty values 144 if ($1 ne "" && defined($csv_file_fields[$i])) { 145 $doc_obj->add_utf8_metadata($section, $csv_file_fields[$i], $1); 130 146 } 131 # Normal comma-separated case132 elsif ($csv_line =~ s/^(.*?)\,//) {133 # Only bother with non-empty values134 if ($1 ne "" && defined($csv_file_fields[$i])) {135 $csv_line_metadata{$csv_file_fields[$i]} = $1;136 
}147 } 148 # Normal comma-separated case 149 elsif ($csv_line =~ s/^(.*?)\,//) { 150 # Only bother with non-empty values 151 if ($1 ne "" && defined($csv_file_fields[$i])) { 152 $doc_obj->add_utf8_metadata($section, $csv_file_fields[$i], $1); 137 153 } 138 # The line must be formatted incorrectly 139 else { 140 print STDERR "Error: Badly formatted CSV line: $csv_line.\n"; 141 last; 142 } 143 144 $i++; 154 } 155 # The line must be formatted incorrectly 156 else { 157 print STDERR "Error: Badly formatted CSV line: $csv_line.\n"; 158 last; 145 159 } 146 160 147 # We can't associate any metadata without knowing the file to associate it with 148 my $csv_line_filename = $csv_line_metadata{"Filename"}; 149 if (!defined($csv_line_filename)) { 150 print STDERR "Error: No Filename metadata in CSV line: $csv_line\n"; 151 next; 152 } 153 delete $csv_line_metadata{"Filename"}; 161 $i++; 162 } 154 163 155 # Associate the metadata now 156 $extrametadata->{$csv_line_filename} = \%csv_line_metadata; 157 push(@$extrametakeys, $csv_line_filename); 158 } 164 # Record was processed successfully 165 return 1; 159 166 } 160 167
Note: See TracChangeset for help on using the changeset viewer.