Changeset 24290
- Timestamp:
- 2011-07-19T14:02:17+12:00 (12 years ago)
- Location:
- main/trunk/greenstone2/perllib/plugins
- Files:
-
- 4 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/plugins/BasePlugin.pm
r24225 r24290 95 95 { 'name' => "hash", 96 96 'desc' => "{import.OIDtype.hash}" }, 97 { 'name' => "hash_on_ga_xml", 98 'desc' => "{import.OIDtype.hash_on_ga_xml}" }, 97 99 { 'name' => "assigned", 98 100 'desc' => "{import.OIDtype.assigned}" }, -
main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm
r24225 r24290 289 289 print STDERR "calling cmd $cmd\n"; 290 290 $output_type = `$cmd`; 291 291 292 292 # remove symbolic link to original file 293 293 &util::rm($tmp_filename); … … 368 368 } 369 369 } 370 370 371 371 my $secondary_plugins = $self->{'secondary_plugins'}; 372 372 my $num_secondary_plugins = scalar(keys %$secondary_plugins); … … 488 488 my $tmp_dir = $self->{'tmp_dir'}; 489 489 if (defined $tmp_dir && -d $tmp_dir) { 490 ##print STDERR "**** Supressing clean up of tmp dir\n";491 &util::rm_r($tmp_dir);490 print STDERR "**** Supressing clean up of tmp dir\n"; 491 ##&util::rm_r($tmp_dir); 492 492 $self->{'tmp_dir'} = undef; 493 493 } -
main/trunk/greenstone2/perllib/plugins/EmbeddedMetadataPlugin.pm
r23810 r24290 39 39 sub BEGIN 40 40 { 41 42 43 } 44 45 my $encoding_plus_auto_list = 46 [ {'name' => "auto",47 'desc' => "{ReadTextFile.input_encoding.auto}" }];41 @EmbeddedMetadataPlugin::ISA = ('BasePlugin'); 42 binmode(STDERR, ":utf8"); 43 } 44 45 my $encoding_plus_auto_list = [{ 46 'name' => "auto", 47 'desc' => "{ReadTextFile.input_encoding.auto}" }]; 48 48 push(@{$encoding_plus_auto_list},@{$BasePlugin::encoding_list}); 49 49 50 my $arguments = 51 [ {'name' => "metadata_field_separator",50 my $arguments = [{ 51 'name' => "metadata_field_separator", 52 52 'desc' => "{HTMLPlugin.metadata_field_separator}", 53 53 'type' => "string", 54 'deft' => "" }, 55 { 'name' => "input_encoding", 54 'deft' => "" 55 },{ 56 'name' => "input_encoding", 56 57 'desc' => "{ReadTextFile.input_encoding}", 57 58 'type' => "enum", 58 59 'list' => $encoding_plus_auto_list, 59 60 'reqd' => "no", 60 'deft' => "auto" } 61 ]; 62 63 64 my $options = { 'name' => "EmbeddedMetadataPlugin", 65 'desc' => "{EmbeddedMetadataPlugin.desc}", 66 'abstract' => "no", 67 'inherits' => "yes", 68 'args' => $arguments }; 61 'deft' => "auto" 62 },{ 63 'name' => "join_before_split", 64 'desc' => "{EmbeddedMetadataPlugin.join_before_split}", 65 'type' => "flag" 66 },{ 67 'name' => "join_character", 68 'desc' => "{EmbeddedMetadataPlugin.join_character}", 69 'type' => "string", 70 'deft' => " " 71 },{ 72 'name' => "trim_whitespace", 73 'desc' => "{EmbeddedMetadataPlugin.trim_whitespace}", 74 'type' => "enum", 75 'list' => [{'name' => "true", 'desc' => "{common.true}"}, {'name' => "false", 'desc' => "{common.false}"}], 76 'deft' => "true" 77 }]; 78 79 my $options = { 80 'name' => "EmbeddedMetadataPlugin", 81 'desc' => "{EmbeddedMetadataPlugin.desc}", 82 'abstract' => "no", 83 'inherits' => "yes", 84 'args' => $arguments }; 69 85 70 86 sub new() … … 79 95 my $self = new BasePlugin($pluginlist, $inputargs, $hashArgOptLists); 80 96 81 82 97 # Create a new Image::ExifTool object 83 98 my $exifTool = new 
Image::ExifTool; 84 99 $exifTool->Options(Duplicates => 0); 85 100 $exifTool->Options(PrintConv => 0); 86 101 $exifTool->Options(Unknown => 1); 87 102 $exifTool->Options('Verbose'); 88 103 $self->{'exiftool'} = $exifTool; 89 90 104 91 105 return bless $self, $class; … … 122 136 my $separator = $self->{'metadata_field_separator'}; 123 137 if ($separator eq "") { 124 undef $separator;138 undef $separator; 125 139 } 126 140 127 141 my @group_list = Image::ExifTool::GetAllGroups(0); 128 foreach my $group (@group_list) 129 { 130 ## print STDERR "**** group = $group\n"; 131 132 # Extract meta information from an image 133 $self->{'exiftool'}->Options(Group0 => [$group]); 134 $self->{'exiftool'}->ExtractInfo($filename); 135 136 # Get list of tags in the order they were found in the file 137 my @tag_list = $self->{'exiftool'}->GetFoundTags('File'); 138 foreach my $tag (@tag_list) 139 { 140 ### print STDERR "**** tag = $tag\n"; 142 foreach my $group (@group_list) { 143 ## print STDERR "**** group = $group\n"; 144 145 # Extract meta information from an image 146 $self->{'exiftool'}->Options(Group0 => [$group]); 147 $self->{'exiftool'}->ExtractInfo($filename); 148 149 # Get list of tags in the order they were found in the file 150 my @tag_list = $self->{'exiftool'}->GetFoundTags('File'); 151 foreach my $tag (@tag_list) { 141 152 142 153 # Strip any numbering suffix 143 $tag =~ s/^([^\s]+)\s.*$/$1/i; 144 my $value = $self->{'exiftool'}->GetValue($tag); 145 if (defined $value && $value =~ /[a-z0-9]+/i) { 146 147 my $field = "ex.$group.$tag"; 154 $tag =~ s/^([^\s]+)\s.*$/$1/i; 155 my $value = $self->{'exiftool'}->GetValue($tag); 156 if (defined $value && $value =~ /[a-z0-9]+/i) { 157 my $field = "ex.$group.$tag"; 148 158 149 my $encoding = $self->{'input_encoding'}; 150 if($encoding eq "auto") 151 { 152 $encoding = "utf8" 159 my $encoding = $self->{'input_encoding'}; 160 if($encoding eq "auto") 161 { 162 $encoding = "utf8" 163 } 164 165 if (!defined $exif_metadata{$field}) 166 { 
167 $exif_metadata{$field} = []; 168 } 169 170 $field = Encode::decode($encoding,$field); 171 my $metadata_done = 0; 172 if (ref $value eq 'SCALAR') { 173 if ($$value =~ /^Binary data/) { 174 $value = "($$value)"; 175 } 176 else { 177 my $len = length($$value); 178 $value = "(Binary data $len bytes)"; 179 } 180 } 181 elsif (ref $value eq 'ARRAY') { 182 $metadata_done = 1; 183 184 my $allvals = ""; 185 foreach my $v (@$value) { 186 $v = Encode::decode($encoding,$v); 187 188 if(!$self->{'join_before_split'}){ 189 if (defined $separator) { 190 my @vs = split($separator, $v); 191 foreach my $val (@vs) { 192 if ($val =~ /\S/) { 193 push (@{$exif_metadata{$field}}, $self->gsSafe($val)); 194 ++$metadata_count; 195 } 196 } 197 } 198 else 199 { 200 push (@{$exif_metadata{$field}}, $self->gsSafe($v)); 201 ++$metadata_count; 202 } 203 } 204 else{ 205 if($allvals ne ""){ 206 $allvals = $allvals . $self->{'join_character'}; 207 } 208 $allvals = $allvals . $v; 209 } 210 } 211 212 if($self->{'join_before_split'}){ 213 if (defined $separator) { 214 my @vs = split($separator, $allvals); 215 foreach my $val (@vs) { 216 if ($val =~ /\S/) { 217 push (@{$exif_metadata{$field}}, $self->gsSafe($val)); 218 ++$metadata_count; 219 } 220 } 221 } 222 else 223 { 224 push (@{$exif_metadata{$field}}, $self->gsSafe($allvals)); 225 ++$metadata_count; 226 } 227 } 228 } 229 else { 230 $value = Encode::decode($encoding,$value); 231 if (defined $separator) { 232 my @vs = split($separator, $value); 233 $metadata_done = 1; 234 foreach my $v (@vs) { 235 if ($v =~ /\S/) { 236 push (@{$exif_metadata{$field}}, $self->gsSafe($v)); 237 ++$metadata_count; 238 } 239 } 240 } 241 } 242 if (!$metadata_done) { 243 push (@{$exif_metadata{$field}}, $self->gsSafe($value)); 244 ++$metadata_count; 245 } 153 246 } 154 155 156 157 if (!defined $exif_metadata{$field})158 {159 $exif_metadata{$field} = [];160 247 } 161 162 $field = Encode::decode($encoding,$field); 163 my $metadata_done = 0; 164 if (ref $value eq 'SCALAR') { 
165 166 if ($$value =~ /^Binary data/) { 167 168 $value = "($$value)"; 169 } 170 else { 171 172 my $len = length($$value); 173 $value = "(Binary data $len bytes)"; 174 } 175 } 176 elsif (ref $value eq 'ARRAY') { 177 $metadata_done = 1; 178 foreach my $v (@$value) { 179 $v = Encode::decode($encoding,$v); 180 push (@{$exif_metadata{$field}}, $self->gsSafe($v)); 181 ++$metadata_count; 182 } 183 } 184 else { 185 $value = Encode::decode($encoding,$value); 186 if (defined $separator) { 187 my @vs = split($separator, $value); 188 $metadata_done = 1; 189 foreach my $v (@vs) { 190 if ($v =~ /\S/) { 191 push (@{$exif_metadata{$field}}, $self->gsSafe($v)); 192 ++$metadata_count; 193 } 194 } 195 } 196 } 197 if (!$metadata_done) { 198 push (@{$exif_metadata{$field}}, $self->gsSafe($value)); 199 ++$metadata_count; 200 } 201 } 202 } 203 } 204 248 } 205 249 206 250 if ($metadata_count > 0) { 207 print $outhandle " Extracted $metadata_count pieces of metadata from $filename EXIF block\n";251 print $outhandle " Extracted $metadata_count pieces of metadata from $filename EXIF block\n"; 208 252 } 209 253 … … 221 265 sub metadata_read() 222 266 { 223 224 225 226 267 my $self = shift (@_); 268 my ($pluginfo, $base_dir, $file, $block_hash, 269 $extrametakeys, $extrametadata, $extrametafile, 270 $processor, $gli, $aux) = @_; 227 271 228 272 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file); 229 273 230 # we don't want to process directories 231 if (!-f $filename_full_path || !$self->can_process_this_file($filename_full_path)) { 232 return undef; 233 } 234 print STDERR "\n<Processing n='$file' p='EmbeddedMetadataPlugin'>\n" if ($gli); 235 print STDERR "EmbeddedMetadataPlugin: processing $file\n" if ($self->{'verbosity'}) > 1; 236 237 238 $self->extractEmbeddedMetadata($filename_no_path,$filename_full_path, 239 $extrametadata,$extrametakeys); 240 241 242 return undef; 274 # we don't want to process directories 275 if (!-f $filename_full_path || 
!$self->can_process_this_file($filename_full_path)) { 276 return undef; 277 } 278 print STDERR "\n<Processing n='$file' p='EmbeddedMetadataPlugin'>\n" if ($gli); 279 print STDERR "EmbeddedMetadataPlugin: processing $file\n" if ($self->{'verbosity'}) > 1; 280 281 $self->extractEmbeddedMetadata($filename_no_path,$filename_full_path, 282 $extrametadata,$extrametakeys); 283 284 return undef; 243 285 } 244 286 … … 250 292 } 251 293 252 sub gsSafe() 253 { 254 my $self = shift(@_); 255 my ($text) = @_; 256 # Replace dangerous characters 257 $text =~ s/\(/(/g; 258 $text =~ s/\)/)/g; 259 $text =~ s/,/,/g; 260 $text =~ s/\</</g; 261 $text =~ s/\</>/g; 262 $text =~ s/\[/[/g; 263 $text =~ s/\]/]/g; 264 # Done 265 return $text; 266 } 294 sub gsSafe() { 295 my $self = shift(@_); 296 my ($text) = @_; 297 298 # Replace potentially problematic characters 299 $text =~ s/\(/(/g; 300 $text =~ s/\)/)/g; 301 $text =~ s/,/,/g; 302 $text =~ s/\</</g; 303 $text =~ s/\>/>/g; 304 $text =~ s/\[/[/g; 305 $text =~ s/\]/]/g; 306 $text =~ s/\{/{/g; 307 $text =~ s/\}/}/g; 308 # Done 309 310 if ($self->{'trim_whitespace'} eq "true"){ 311 $text =~ s/^\s+//; 312 $text =~ s/\s+$//; 313 } 314 315 return $text; 316 } 267 317 268 318 1; -
main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm
r24199 r24290 173 173 if ($secondary_plugin_name eq "HTMLPlugin") { 174 174 # pdftohtml always produces utf8 - What about pdfbox??? 175 push(@$specific_options, "-input_encoding", "utf8");175 # push(@$specific_options, "-input_encoding", "utf8"); 176 176 push(@$specific_options, "-extract_language") if $self->{'extract_language'}; 177 177 push(@$specific_options, "-processing_tmp_files"); … … 238 238 } 239 239 240 240 # By setting hashing to be on ga xml this ensures that two 241 # PDF files that are identical except for the metadata 242 # hash to different values. Without this, when each PDF 243 # file is converted to HTML there is a chance that they 244 # will both be *identical* if the conversion utility does 245 # not embed the metadata in the generated HTML. This is 246 # certainly the case when PDFBOX is being used. 247 248 # This change makes this convert to based plugin more 249 # consistent with the original vision that the same document 250 # with different metadata should 251 # be seen as different. 252 253 sub get_oid_hash_type { 254 my $self = shift (@_); 255 return "hash_on_ga_xml"; 256 } 257 258 241 259 sub tmp_area_convert_file { 242 260
Note:
See TracChangeset
for help on using the changeset viewer.