Changeset 22597
- Timestamp:
- 2010-08-10T14:31:53+12:00 (14 years ago)
- Location:
- main/trunk/greenstone2/perllib/plugins
- Files:
-
- 8 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/plugins/CONTENTdmPlugin.pm
r20790 r22597 44 44 45 45 my $convert_to_list = 46 [ { 'name' => "auto", 47 'desc' => "{ConvertBinaryFile.convert_to.auto}" }, 48 { 'name' => "html", 49 'desc' => "{ConvertBinaryFile.convert_to.html}" }, 50 { 'name' => "text", 51 'desc' => "{ConvertBinaryFile.convert_to.text}" }, 46 [ 47 # { 'name' => "auto", 48 # 'desc' => "{ConvertBinaryFile.convert_to.auto}" }, 49 # { 'name' => "html", 50 # 'desc' => "{ConvertBinaryFile.convert_to.html}" }, 51 # { 'name' => "text", 52 # 'desc' => "{ConvertBinaryFile.convert_to.text}" }, 52 53 { 'name' => "pagedimg", 53 54 'desc' => "{ConvertBinaryFile.convert_to.pagedimg}"}, … … 124 125 $self->{'metadata_value'} = undef; 125 126 126 $self->{'convert_to'} = "PagedImage"; 127 # do we only allow one option?? 128 $self->{'convert_to'} = "pagedimg"; 129 $self->{'convert_to_plugin'} = "PagedImagePlugin"; 130 $self->{'convert_to_ext'} = "jpg"; 131 132 my $secondary_plugin_name = $self->{'convert_to_plugin'}; 127 133 my $secondary_plugin_options = $self->{'secondary_plugin_options'}; 128 134 129 if (!defined $secondary_plugin_options->{'PagedImagePlugin'}){ 130 $secondary_plugin_options->{'PagedImagePlugin'} = []; 131 } 132 my $pagedimg_options = $secondary_plugin_options->{'PagedImagePlugin'}; 133 push(@$pagedimg_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?'); 134 push(@$pagedimg_options, "-create_thumbnail", "true", "-create_screenview", "true"); 135 push(@$pagedimg_options, "-file_rename_method", "none"); 136 push(@$pagedimg_options, "-processing_tmp_files"); 135 if (!defined $secondary_plugin_options->{$secondary_plugin_name}) { 136 $secondary_plugin_options->{$secondary_plugin_name} = []; 137 } 138 my $specific_options = $secondary_plugin_options->{$secondary_plugin_name}; 139 140 push(@$specific_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?'); 141 push(@$specific_options, "-create_thumbnail", "true", "-create_screenview", "true"); 142 push(@$specific_options, "-file_rename_method", "none"); 143 push(@$specific_options, "-processing_tmp_files"); 144 145 # my $secondary_plugin_options = $self->{'secondary_plugin_options'}; 146 147 # if (!defined $secondary_plugin_options->{'PagedImagePlugin'}){ 148 # $secondary_plugin_options->{'PagedImagePlugin'} = []; 149 # } 150 # my $pagedimg_options = $secondary_plugin_options->{'PagedImagePlugin'}; 151 # push(@$pagedimg_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?'); 152 # push(@$pagedimg_options, "-create_thumbnail", "true", "-create_screenview", "true"); 153 # push(@$pagedimg_options, "-file_rename_method", "none"); 154 # push(@$pagedimg_options, "-processing_tmp_files"); 137 155 $self = bless $self, $class; 138 139 # ***** no longer needed!140 # # This needs to be done after blss, to $self passed to XML::Parser141 # # can correctly resolve the right call-back methods during XML parsing142 143 144 156 $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists); 145 157 return $self; -
main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm
r22504 r22597 95 95 my ($class,$input_args,$hashArgOptLists) = @_; 96 96 97 my @convert_to_list = split(",",$self->{'convert_to '});97 my @convert_to_list = split(",",$self->{'convert_to_plugin'}); 98 98 my $secondary_plugins = {}; 99 99 # find the plugin … … 101 101 foreach my $convert_to (@convert_to_list) { 102 102 # load in "convert_to" plugin package 103 my $plugin_class = $convert_to ."Plugin";103 my $plugin_class = $convert_to; 104 104 my $plugin_package = $plugin_class.".pm"; 105 105 … … 143 143 144 144 my $self = new AutoExtractMetadata($pluginlist, $inputargs, $hashArgOptLists); 145 146 if ($self->{'info_only'}) { 147 # don't worry about any options etc 148 return bless $self, $class; 149 } 150 151 my $convert_to_type = $self->{'convert_to'}; 152 if (!defined $convert_to_type || $convert_to_type eq "") { 153 $convert_to_type = "auto"; 154 } 155 my $windows_scripting = $self->{'windows_scripting'}; 156 $windows_scripting = 0 unless defined $windows_scripting; 157 if ($classPluginName eq "PDFPlugin") { 158 if ($convert_to_type eq "text" && 159 $ENV{'GSDLOS'} =~ /^windows$/i) { 160 print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n"; 161 $convert_to_type = "html"; 162 } 163 } elsif ($classPluginName eq "WordPlugin") { 164 if (($windows_scripting && $ENV{'GSDLOS'} =~ /^windows$/i && $convert_to_type =~ /^(html|auto)$/) || defined $self->{'openoffice_scripting'}) { 165 # we use structured HTML, not normal html 166 $convert_to_type = "structuredhtml"; 167 } 168 } elsif ($classPluginName eq "PowerPointPlugin") { 169 if ($windows_scripting && $ENV{'GSDLOS'} =~ /^windows$/i && $convert_to_type eq "auto") { 170 # we use paged img 171 $convert_to_type = "pagedimg_jpg"; 172 } 173 } elsif ($classPluginName eq "PostScriptPlugin") { 174 if ($convert_to_type eq "auto") { 175 # we use text 176 $convert_to_type = "text"; 177 } 178 } 179 180 if ($convert_to_type eq "auto") { 181 # choose html for now - should choose a format based on doc type 182 $convert_to_type = "html"; 183 } 184 185 if ($convert_to_type eq "html") { 186 $self->{'convert_to'} = "HTML"; 145 146 return bless $self, $class; 147 } 148 149 # should be called by subclasses after checking and setting 150 # $self->{'convert_to'} 151 sub set_standard_convert_settings { 152 my $self =shift (@_); 153 154 my $convert_to = $self->{'convert_to'}; 155 if ($convert_to eq "auto") { 156 $convert_to = "html"; 157 $self->{'convert_to'} = "html"; 158 } 159 160 if ($convert_to eq "html") { 161 $self->{'convert_to_plugin'} = "HTMLPlugin"; 187 162 $self->{'convert_to_ext'} = "html"; 188 } elsif ($convert_to _typeeq "text") {189 $self->{'convert_to '} = "Text";163 } elsif ($convert_to eq "text") { 164 $self->{'convert_to_plugin'} = "TextPlugin"; 190 165 $self->{'convert_to_ext'} = "txt"; 191 } elsif ($convert_to _typeeq "structuredhtml") {192 $self->{'convert_to '} = "StructuredHTML";166 } elsif ($convert_to eq "structuredhtml") { 167 $self->{'convert_to_plugin'} = "StructuredHTMLPlugin"; 193 168 $self->{'convert_to_ext'} = "html"; 194 } elsif ($convert_to _type=~ /^pagedimg/) {195 $self->{'convert_to '} = "PagedImage";196 my ($convert_to_ext) = $convert_to _type=~ /pagedimg\_(jpg|gif|png)/i;169 } elsif ($convert_to =~ /^pagedimg/) { 170 $self->{'convert_to_plugin'} = "PagedImagePlugin"; 171 my ($convert_to_ext) = $convert_to =~ /pagedimg\_(jpg|gif|png)/i; 197 172 $convert_to_ext = 'jpg' unless defined $convert_to_ext; 198 173 $self->{'convert_to_ext'} = $convert_to_ext; 199 174 } 200 201 return bless $self, $class; 202 } 203 204 175 176 } 205 177 sub init { 206 178 my $self = shift (@_); … … 316 288 # Execute the conversion command and get the type of the result, 317 289 # making sure the converter gives us the appropriate output type 318 my $output_type= "";319 if ($convert_to =~ m/PagedImage/i) {320 $output_type = lc($convert_to)."_".lc($convert_to_ext);321 } else {322 $output_type = lc($convert_to);323 }290 my $output_type=$self->{'convert_to'}; 291 # if ($convert_to =~ m/PagedImage/i) { 292 # $output_type = lc($convert_to)."_".lc($convert_to_ext); 293 # } else { 294 # $output_type = lc($convert_to); 295 # } 324 296 325 297 my $cmd = "perl -S gsConvert.pl -verbose $verbosity "; … … 331 303 } 332 304 $cmd .= "-errlog \"$errlog\" -output $output_type \"$tmp_filename\""; 333 305 print STDERR "calling cmd $cmd\n"; 334 306 $output_type = `$cmd`; 335 307 -
main/trunk/greenstone2/perllib/plugins/ExcelPlugin.pm
r22515 r22597 33 33 no strict 'subs'; 34 34 use gsprintf 'gsprintf'; 35 36 #sub BEGIN {37 # @ExcelPlugin::ISA = ('ConvertBinaryFile');38 #}39 35 40 36 # @ISA dynamically configured to be either OpenOfficeConverter or ConvertBinaryFile … … 94 90 push(@$pluginlist, $class); 95 91 96 #my $self = new ConvertBinaryFile($pluginlist, $inputargs, $hashArgOptLists);97 92 if ($openoffice_ext_installed) { 98 93 print STDERR "ExcelPlugin: OpenOffice Extension to Greenstone detected\n"; … … 126 121 } 127 122 128 my $outhandle = $self->{'outhandle'};129 130 123 $self->{'filename_extension'} = "xls"; 131 124 $self->{'file_type'} = "Excel"; 132 125 126 my $outhandle = $self->{'outhandle'}; 127 128 # check convert_to 129 if ($self->{'convert_to'} eq "auto") { 130 $self->{'convert_to'} = "html"; 131 } 132 133 133 $self->{'convert_options'} = "-openoffice_scripting" if $self->{'openoffice_scripting'}; 134 134 135 # other options for HTML if using open office??? 135 # set convert_to_plugin and convert_to_ext 136 $self->ConvertBinaryFile::set_standard_convert_settings(); 137 print STDERR "final convert-to $self->{'convert_to'}\n"; 138 139 my $secondary_plugin_name = $self->{'convert_to_plugin'}; 136 140 my $secondary_plugin_options = $self->{'secondary_plugin_options'}; 137 if (!defined $secondary_plugin_options->{'HTMLPlugin'}) { 138 $secondary_plugin_options->{'HTMLPlugin'} = []; 141 142 if (!defined $secondary_plugin_options->{$secondary_plugin_name}) { 143 $secondary_plugin_options->{$secondary_plugin_name} = []; 139 144 } 140 if (!defined $secondary_plugin_options->{'TextPlugin'}) { 141 $secondary_plugin_options->{'TextPlugin'} = []; 145 my $specific_options = $secondary_plugin_options->{$secondary_plugin_name}; 146 147 push(@$specific_options,"-extract_language") if $self->{'extract_language'}; 148 push(@$specific_options, "-file_rename_method", "none"); 149 150 if ($secondary_plugin_name eq "HTMLPlugin") { 151 push(@$specific_options, "-processing_tmp_files"); 142 152 } 143 my $html_options = $secondary_plugin_options->{'HTMLPlugin'};144 my $text_options = $secondary_plugin_options->{'TextPlugin'};145 146 # xslhtml doesn't output utf8, let Greenstone work out the encoding147 #push(@$html_options, "-input_encoding", "utf8");148 push(@$html_options,"-extract_language") if $self->{'extract_language'};149 push(@$html_options, "-file_rename_method", "none");150 push(@$html_options, "-processing_tmp_files");151 152 #push(@$text_options, "-input_encoding", "utf8");153 push(@$text_options,"-extract_language") if $self->{'extract_language'};154 push(@$text_options, "-file_rename_method", "none");155 153 156 154 $self = bless $self, $class; -
main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm
r21800 r22597 119 119 push(@{$hashArgOptLists->{"OptList"}},$options); 120 120 121 my @arg_array = @$inputargs;122 121 my $self = new ConvertBinaryFile($pluginlist, $inputargs, $hashArgOptLists); 123 122 … … 138 137 $self->{'convert_options'} .= " -pdf_allow_images_only" if $self->{"allowimagesonly"}; 139 138 139 # check convert_to 140 if ($self->{'convert_to'} eq "text" && $ENV{'GSDLOS'} =~ /^windows$/i) { 141 print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n"; 142 $self->{'convert_to'} = "html"; 143 } 144 elsif ($self->{'convert_to'} eq "auto") { 145 # choose html ?? is this the best option 146 $self->{'convert_to'} = "html"; 147 } 148 # set convert_to_plugin and convert_to_ext 149 $self->ConvertBinaryFile::set_standard_convert_settings(); 150 151 my $secondary_plugin_name = $self->{'convert_to_plugin'}; 140 152 my $secondary_plugin_options = $self->{'secondary_plugin_options'}; 141 153 142 if (!defined $secondary_plugin_options->{'HTMLPlugin'}) { 143 $secondary_plugin_options->{'HTMLPlugin'} = []; 144 } 145 if (!defined $secondary_plugin_options->{'TextPlugin'}) { 146 $secondary_plugin_options->{'TextPlugin'} = []; 147 } 148 if (defined $self->{'convert_to'} && $self->{'convert_to'} =~ m/(pagedimage|pagedimg).*/i) { 149 if (!defined $secondary_plugin_options->{'PagedImagePlugin'}){ 150 $secondary_plugin_options->{'PagedImagePlugin'} = []; 151 my $pagedimg_options = $secondary_plugin_options->{'PagedImagePlugin'}; 152 push(@$pagedimg_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?'); 153 push(@$pagedimg_options, "-screenviewsize", "1000"); 154 push(@$pagedimg_options, "-enable_cache"); 155 } 156 } 157 my $html_options = $secondary_plugin_options->{'HTMLPlugin'}; 158 my $text_options = $secondary_plugin_options->{'TextPlugin'}; 159 my $pagedimg_options = $secondary_plugin_options->{'PagedImagePlugin'}; 160 161 # if ($self->{'input_encoding'} eq "auto") { 162 # $self->{'input_encoding'} = "utf8"; 163 # } 164 165 # if pdftohtml is always producing utf8, then htmlplug always needs this option 166 push(@$html_options,"-input_encoding", "utf8"); 167 push(@$html_options,"-extract_language") if $self->{'extract_language'}; 168 169 push(@$html_options, "-processing_tmp_files"); 170 push(@$pagedimg_options, "-processing_tmp_files"); 171 172 # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj) 173 # to extract these metadata fields from the HEAD META fields 174 my $required_metadata; 175 if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) { 176 push(@$html_options,"-metadata_fields",$self->{'metadata_fields'}); 177 } else { 178 push(@$html_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>"); 179 } 180 if (defined $self->{'metadata_field_separator'} && $self->{'metadata_field_separator'} =~ /\S/) { 181 push(@$html_options,"-metadata_field_separator",$self->{'metadata_field_separator'}); 182 } 183 184 if ($self->{'use_sections'} || $self->{'description_tags'}) { 185 $self->{'description_tags'} = 1; 186 push(@$html_options,"-description_tags"); 187 } 154 if (!defined $secondary_plugin_options->{$secondary_plugin_name}) { 155 $secondary_plugin_options->{$secondary_plugin_name} = []; 156 } 157 my $specific_options = $secondary_plugin_options->{$secondary_plugin_name}; 188 158 189 159 # following title_sub removes "Page 1" added by pdftohtml, and a leading 190 160 # "1", which is often the page number at the top of the page. Bad Luck 191 161 # if your document title actually starts with "1 " - is there a better way? 192 push(@$html_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?'); 193 push(@$text_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?'); 194 162 push(@$specific_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?'); 195 163 my $associate_tail_re = $self->{'associate_tail_re'}; 196 164 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) { 197 push(@$html_options, "-associate_tail_re", $associate_tail_re); 198 push(@$text_options, "-associate_tail_re", $associate_tail_re); 199 push(@$pagedimg_options, "-associate_tail_re", $associate_tail_re) if defined $pagedimg_options; 200 } 201 202 push(@$html_options, "-file_rename_method", "none"); 203 push(@$text_options, "-file_rename_method", "none"); 204 push(@$pagedimg_options, "-file_rename_method", "none") if defined $pagedimg_options; 165 push(@$specific_options, "-associate_tail_re", $associate_tail_re); 166 } 167 push(@$specific_options, "-file_rename_method", "none"); 168 169 if ($secondary_plugin_name eq "HTMLPlugin") { 170 # pdftohtml always produces utf8 171 push(@$specific_options, "-input_encoding", "utf8"); 172 push(@$specific_options, "-extract_language") if $self->{'extract_language'}; 173 push(@$specific_options, "-processing_tmp_files"); 174 # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj) 175 # to extract these metadata fields from the HEAD META fields 176 if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) { 177 push(@$specific_options,"-metadata_fields",$self->{'metadata_fields'}); 178 } else { 179 push(@$specific_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>"); 180 } 181 if (defined $self->{'metadata_field_separator'} && $self->{'metadata_field_separator'} =~ /\S/) { 182 push(@$specific_options,"-metadata_field_separator",$self->{'metadata_field_separator'}); 183 } 184 if ($self->{'use_sections'} || $self->{'description_tags'}) { 185 $self->{'description_tags'} = 1; 186 push(@$specific_options, "-description_tags"); 187 } 188 } 189 elsif ($secondary_plugin_name eq "PagedImagePlugin") { 190 push(@$specific_options, "-screenviewsize", "1000"); 191 push(@$specific_options, "-enable_cache"); 192 push(@$specific_options, "-processing_tmp_files"); 193 } 205 194 206 195 $self = bless $self, $class; -
main/trunk/greenstone2/perllib/plugins/PostScriptPlugin.pm
r20790 r22597 88 88 push(@$pluginlist, $class); 89 89 90 #push(@$inputargs,"-convert_to");91 #push(@$inputargs,"text");92 90 push(@$inputargs,"-title_sub"); 93 91 push(@$inputargs,'^(Page\s+\d+)?(\s*1\s+)?'); … … 106 104 $self->{'file_type'} = "PS"; 107 105 106 if ($self->{'convert_to'} eq "auto") { 107 $self->{'convert_to'} = "text"; 108 } 109 110 # set convert_to_plugin and convert_to_ext 111 $self->ConvertBinaryFile::set_standard_convert_settings(); 112 my $secondary_plugin_name = $self->{'convert_to_plugin'}; 108 113 my $secondary_plugin_options = $self->{'secondary_plugin_options'}; 109 114 110 if (!defined $secondary_plugin_options->{'TextPlugin'}) { 111 $secondary_plugin_options->{'TextPlugin'} = []; 112 } 113 114 if (!defined $secondary_plugin_options->{'HTMLPlugin'}) { 115 $secondary_plugin_options->{'HTMLPlugin'} = []; 116 } 117 118 my $text_options = $secondary_plugin_options->{'TextPlugin'}; 119 my $html_options = $secondary_plugin_options->{'HTMLPlugin'}; 120 121 if (defined $self->{'convert_to'} && $self->{'convert_to'} =~ m/(pagedimage|pagedimg).*/i) { 122 if (!defined $secondary_plugin_options->{'PagedImagePlugin'}){ 123 $secondary_plugin_options->{'PagedImagePlugin'} = []; 124 my $pagedimg_options = $secondary_plugin_options->{'PagedImagePlugin'}; 125 push(@$pagedimg_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?'); 126 push(@$pagedimg_options, "-file_rename_method", "none"); 127 push(@$pagedimg_options, "-processing_tmp_files"); 128 } 129 } 115 if (!defined $secondary_plugin_options->{$secondary_plugin_name}) { 116 $secondary_plugin_options->{$secondary_plugin_name} = []; 117 } 118 my $specific_options = $secondary_plugin_options->{$secondary_plugin_name}; 119 130 120 # following title_sub removes "Page 1" added by ps2ascii, and a leading 131 121 # "1", which is often the page number at the top of the page. Bad Luck 132 122 # if your document title actually starts with "1 " - is there a better way? 133 #$self->{'input_encoding'} = "utf8"; 134 #$self->{'extract_language'} = 1; 135 push(@$text_options, "-input_encoding", "utf8"); 136 push(@$text_options,"-extract_language") if $self->{'extract_language'}; 137 push(@$text_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?'); 138 139 push(@$text_options, "-file_rename_method", "none"); 140 push(@$html_options, "-file_rename_method", "none"); 141 142 # tell the secondary plugins that they are processing tmp files 143 push(@$html_options, "-processing_tmp_files"); 123 push(@$specific_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?'); 124 push(@$specific_options, "-file_rename_method", "none"); 125 126 if ($secondary_plugin_name eq "TextPlugin") { 127 push(@$specific_options, "-input_encoding", "utf8"); 128 push(@$specific_options,"-extract_language") if $self->{'extract_language'}; 129 } elsif ($secondary_plugin_name eq "PagedImagePlugin") { 130 push(@$specific_options, "-processing_tmp_files"); 131 } 144 132 145 133 $self = bless $self, $class; -
main/trunk/greenstone2/perllib/plugins/PowerPointPlugin.pm
r22515 r22597 34 34 no strict 'subs'; 35 35 use gsprintf 'gsprintf'; 36 37 #sub BEGIN {38 # @PowerPointPlugin::ISA = ('ConvertBinaryFile');39 #}40 36 41 37 # @ISA dynamically configured to be either OpenOfficeConverter or ConvertBinaryFile … … 157 153 } 158 154 159 my $outhandle = $self->{'outhandle'};160 161 155 $self->{'filename_extension'} = "ppt"; 162 156 $self->{'file_type'} = "PPT"; 157 158 if ($self->{'convert_to'} eq "auto") { 159 if ($self->{'windows_scripting'}) { 160 $self->{'convert_to'} = "pagedimg_jpg"; 161 } 162 else { 163 $self->{'convert_to'} = "html"; 164 } 165 } 166 167 my $outhandle = $self->{'outhandle'}; 163 168 164 169 # can't have windows_scripting and openoffice_scripting at the same time … … 172 177 $self->{'convert_options'} = "-windows_scripting" if $self->{'windows_scripting'}; 173 178 $self->{'convert_options'} = "-openoffice_scripting" if $self->{'openoffice_scripting'}; 179 180 # set convert_to_plugin and convert_to_ext 181 $self->ConvertBinaryFile::set_standard_convert_settings(); 182 183 my $secondary_plugin_name = $self->{'convert_to_plugin'}; 174 184 my $secondary_plugin_options = $self->{'secondary_plugin_options'}; 175 185 176 if ($self->{'windows_scripting'} && ($self->{'convert_to'} =~ m/(pagedimage|pagedimg).*/i)) { 177 $secondary_plugin_options->{'PagedImagePlugin'} = []; 178 } else { 179 $secondary_plugin_options->{'HTMLPlugin'} = []; 180 $secondary_plugin_options->{'TextPlugin'} = []; 181 } 182 my $html_options = $secondary_plugin_options->{'HTMLPlugin'}; 183 my $text_options = $secondary_plugin_options->{'TextPlugin'}; 184 my $pageimg_options = $secondary_plugin_options->{'PagedImagePlugin'}; 185 186 if (defined $html_options){ 187 # ppthtml doesn't output utf-8 necessarily - let Greenstone determine the encoding 188 #push(@$html_options,"-input_encoding", "utf8"); 189 push(@$html_options,"-extract_language") if $self->{'extract_language'}; 190 push(@$html_options,"-file_rename_method", "none"); 191 192 push(@$html_options, "-processing_tmp_files"); 193 194 # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj) 195 # to extract these metadata fields from the HEAD META fields 196 push(@$html_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>"); 197 } 198 if (defined $text_options){ 199 #push(@$text_options,"-input_encoding", "utf8"); 200 push(@$text_options,"-extract_language") if $self->{'extract_language'}; 201 push(@$text_options,"-file_rename_method", "none"); 202 } 203 if (defined $pageimg_options){ 186 if (!defined $secondary_plugin_options->{$secondary_plugin_name}) { 187 $secondary_plugin_options->{$secondary_plugin_name} = []; 188 } 189 my $specific_options = $secondary_plugin_options->{$secondary_plugin_name}; 190 191 push(@$specific_options, "-file_rename_method", "none"); 192 push(@$specific_options, "-extract_language") if $self->{'extract_language'}; 193 194 if ($secondary_plugin_name eq "HTMLPlugin") { 195 push(@$specific_options, "-processing_tmp_files"); 196 push(@$specific_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>"); 197 } 198 elsif ($secondary_plugin_name eq "PagedImagePlugin") { 199 push(@$specific_options, "-processing_tmp_files"); 204 200 #is this true?? 205 push(@$pageimg_options,"-input_encoding", "utf8"); 206 push(@$pageimg_options,"-extract_language") if $self->{'extract_language'}; 207 push(@$pageimg_options,"-file_rename_method", "none"); 208 push(@$pageimg_options, "-processing_tmp_files"); 201 push(@$specific_options,"-input_encoding", "utf8"); 209 202 } 210 203 -
main/trunk/greenstone2/perllib/plugins/RTFPlugin.pm
r20790 r22597 35 35 } 36 36 37 # currently only converts to HTML 38 my $convert_to_list = 39 [ { 'name' => "html", 40 'desc' => "{ConvertBinaryFile.convert_to.html}" } ]; 41 37 42 my $arguments = 38 [ { 'name' => "process_exp", 43 [ { 'name' => "convert_to", 44 'desc' => "{ConvertBinaryFile.convert_to}", 45 'type' => "enum", 46 'reqd' => "yes", 47 'list' => $convert_to_list, 48 'deft' => "html" }, 49 { 'name' => "process_exp", 39 50 'desc' => "{BasePlugin.process_exp}", 40 51 'type' => "regexp", … … 71 82 $self->{'file_type'} = "RTF"; 72 83 84 # set convert_to_plugin and convert_to_ext 85 $self->ConvertBinaryFile::set_standard_convert_settings(); 86 my $secondary_plugin_name = $self->{'convert_to_plugin'}; 73 87 my $secondary_plugin_options = $self->{'secondary_plugin_options'}; 74 if (!defined $secondary_plugin_options->{'TextPlugin'}) { 75 $secondary_plugin_options->{'TextPlugin'} = []; 88 89 if (!defined $secondary_plugin_options->{$secondary_plugin_name}) { 90 $secondary_plugin_options->{$secondary_plugin_name} = []; 76 91 } 77 if (!defined $secondary_plugin_options->{'HTMLPlugin'}) { 78 $secondary_plugin_options->{'HTMLPlugin'} = []; 92 my $specific_options = $secondary_plugin_options->{$secondary_plugin_name}; 93 94 push(@$specific_options, "-file_rename_method", "none"); 95 push(@$specific_options, "-extract_language") if $self->{'extract_language'}; 96 if ($secondary_plugin_name eq "TextPlugin") { 97 push(@$specific_options, "-input_encoding", "utf8"); 79 98 } 80 my $text_options = $secondary_plugin_options->{'TextPlugin'}; 81 my $html_options = $secondary_plugin_options->{'HTMLPlugin'}; 82 83 #$self->{'input_encoding'} = "utf8"; 84 #$self->{'extract_language'} = 1; 85 push(@$text_options, "-input_encoding", "utf8"); 86 push(@$text_options,"-extract_language") if $self->{'extract_language'}; 87 push(@$html_options, "-description_tags") if $self->{'description_tags'}; 88 push(@$html_options,"-extract_language") if $self->{'extract_language'}; 89 90 push(@$html_options, "-file_rename_method", "none"); 91 push(@$text_options, "-file_rename_method", "none"); 92 93 # tell the secondary plugins that they are processing tmp files 94 push(@$html_options, "-processing_tmp_files"); 99 elsif ($secondary_plugin_name eq "HTMLPlugin") { 100 push(@$specific_options, "-description_tags") if $self->{'description_tags'}; 101 push(@$specific_options, "-processing_tmp_files"); 102 } 95 103 96 104 $self = bless $self, $class; -
main/trunk/greenstone2/perllib/plugins/WordPlugin.pm
r22514 r22597 41 41 eval("require OpenOfficeConverter"); 42 42 if ($@) { 43 # Useful debugging statement if there is a syntax error in OpenOfficeConverter 43 # Useful debugging statement if there is a syntax error in OpenOfficeConverter: 44 44 #print STDERR "$@\n"; 45 45 @WordPlugin::ISA = ('ConvertBinaryFile'); … … 174 174 } 175 175 176 my $outhandle = $self->{'outhandle'};177 176 $self->{'filename_extension'} = "doc"; 178 177 $self->{'file_type'} = "Word"; 178 179 my $outhandle = $self->{'outhandle'}; 179 180 180 181 if ($self->{'windows_scripting'}) { … … 193 194 } 194 195 195 # we always save as utf-8 196 # if ($self->{'input_encoding'} eq "auto") { 197 # $self->{'input_encoding'} = "utf8"; 198 # } 199 196 # check convert_to 197 if ($self->{'convert_to'} eq "auto") { 198 $self->{'convert_to'} = "html"; 199 } 200 # windows or open office scripting, outputs structuredHTML 201 if (defined $self->{'office_scripting'}) { 202 $self->{'convert_to'} = "structuredhtml"; 203 } 204 205 # set convert_to_plugin and convert_to_ext 206 $self->ConvertBinaryFile::set_standard_convert_settings(); 207 208 my $secondary_plugin_name = $self->{'convert_to_plugin'}; 200 209 my $secondary_plugin_options = $self->{'secondary_plugin_options'}; 201 if (defined $self->{'office_scripting'}) { 202 if (!defined $secondary_plugin_options->{'StructuredHTMLPlugin'}){ 203 $secondary_plugin_options->{'StructuredHTMLPlugin'} = []; 204 my $structhtml_options = $secondary_plugin_options->{'StructuredHTMLPlugin'}; 205 206 # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj) 207 # to extract these metadata fields from the HEAD META fields 208 push (@$structhtml_options, "-metadata_fields","Title,GENERATOR,date,author<Creator>"); 209 push (@$structhtml_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?'); 210 push (@$structhtml_options, "-description_tags") if $self->{'office_scripting'}; 211 push (@$structhtml_options, "-extract_language") if $self->{'extract_language'}; 212 push (@$structhtml_options, "-delete_toc") if $self->{'delete_toc'}; 213 push (@$structhtml_options, "-toc_header", $self->{'toc_header'}) if $self->{'toc_header'}; 214 push (@$structhtml_options, "-title_header", $self->{'title_header'}) if $self->{'title_header'}; 215 push (@$structhtml_options, "-level1_header", $self->{'level1_header'}) if $self->{'level1_header'}; 216 push (@$structhtml_options, "-level2_header", $self->{'level2_header'})if $self->{'level2_header'}; 217 push (@$structhtml_options, "-level3_header", $self->{'level3_header'}) if $self->{'level3_header'}; 218 push (@$structhtml_options, "-metadata_fields", $self->{'metadata_fields'}) if $self->{'metadata_fields'}; 219 push (@$structhtml_options, "-metadata_field_separator", $self->{'metadata_field_separator'}) if $self->{'metadata_field_separator'}; 220 } 221 } 222 if (!defined $secondary_plugin_options->{'HTMLPlugin'}) { 223 $secondary_plugin_options->{'HTMLPlugin'} = []; 224 } 225 if (!defined $secondary_plugin_options->{'TextPlugin'}) { 226 $secondary_plugin_options->{'TextPlugin'} = []; 227 } 228 229 my $html_options = $secondary_plugin_options->{'HTMLPlugin'}; 230 my $text_options = $secondary_plugin_options->{'TextPlugin'}; 231 my $structhtml_options = $secondary_plugin_options->{'StructuredHTMLPlugin'}; 232 # tell the secondary plugins that they are processing tmp files 233 push(@$html_options, "-processing_tmp_files"); 234 push(@$structhtml_options, "-processing_tmp_files"); 235 236 # wvWare will always produce html files encoded as utf-8, so make sure the secondary HTMLPlugin knows this 237 push(@$html_options,"-input_encoding", "utf8"); 238 push(@$html_options,"-extract_language") if $self->{'extract_language'}; 239 push(@$html_options, "-description_tags") if $self->{'description_tags'}; 240 241 # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj) 242 # to extract these metadata fields from the HEAD META fields 243 push(@$html_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>"); 244 push(@$html_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?'); 210 211 if (!defined $secondary_plugin_options->{$secondary_plugin_name}) { 212 $secondary_plugin_options->{$secondary_plugin_name} = []; 213 } 214 my $specific_options = $secondary_plugin_options->{$secondary_plugin_name}; 215 216 # following title_sub removes "Page 1" and a leading 217 # "1", which is often the page number at the top of the page. Bad Luck 218 # if your document title actually starts with "1 " - is there a better way? 219 push(@$specific_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?'); 245 220 246 221 my $associate_tail_re = $self->{'associate_tail_re'}; 247 222 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) { 248 push(@$html_options, "-associate_tail_re", $associate_tail_re); 249 push(@$text_options, "-associate_tail_re", $associate_tail_re); 250 push(@$structhtml_options, "-associate_tail_re", $associate_tail_re) if defined $structhtml_options; 251 } 252 253 push(@$html_options, "-file_rename_method", "none"); 254 push(@$text_options, "-file_rename_method", "none"); 255 push(@$structhtml_options, "-file_rename_method", "none") if defined $structhtml_options; 223 push(@$specific_options, "-associate_tail_re", $associate_tail_re); 224 } 225 push(@$specific_options, "-file_rename_method", "none"); 226 227 if ($secondary_plugin_name eq "StructuredHTMLPlugin") { 228 # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj) 229 # to extract these metadata fields from the HEAD META fields 230 push (@$specific_options, "-metadata_fields","Title,GENERATOR,date,author<Creator>"); 231 push (@$specific_options, "-description_tags") if $self->{'office_scripting'}; 232 push (@$specific_options, "-extract_language") if $self->{'extract_language'}; 233 push (@$specific_options, "-delete_toc") if $self->{'delete_toc'}; 234 push (@$specific_options, "-toc_header", $self->{'toc_header'}) if $self->{'toc_header'}; 235 push (@$specific_options, "-title_header", $self->{'title_header'}) if $self->{'title_header'}; 236 push (@$specific_options, "-level1_header", $self->{'level1_header'}) if $self->{'level1_header'}; 237 push (@$specific_options, "-level2_header", $self->{'level2_header'})if $self->{'level2_header'}; 238 push (@$specific_options, "-level3_header", $self->{'level3_header'}) if $self->{'level3_header'}; 239 push (@$specific_options, "-metadata_fields", $self->{'metadata_fields'}) if $self->{'metadata_fields'}; 240 push (@$specific_options, "-metadata_field_separator", $self->{'metadata_field_separator'}) if $self->{'metadata_field_separator'}; 241 push(@$specific_options, "-processing_tmp_files"); 242 243 } 244 245 elsif ($secondary_plugin_name eq "HTMLPlugin") { 246 push(@$specific_options, "-processing_tmp_files"); 247 push(@$specific_options,"-input_encoding", "utf8"); 248 push(@$specific_options,"-extract_language") if $self->{'extract_language'}; 249 push(@$specific_options, "-description_tags") if $self->{'description_tags'}; 250 # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj) 251 # to extract these metadata fields from the HEAD META fields 252 push(@$specific_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>"); 253 } 256 254 257 255 $self = bless $self, $class;
Note:
See TracChangeset
for help on using the changeset viewer.