Changeset 22597 for main/trunk/greenstone2/perllib/plugins/WordPlugin.pm
- Timestamp: 2010-08-10T14:31:53+12:00 (14 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/plugins/WordPlugin.pm
r22514 r22597 41 41 eval("require OpenOfficeConverter"); 42 42 if ($@) { 43 # Useful debugging statement if there is a syntax error in OpenOfficeConverter 43 # Useful debugging statement if there is a syntax error in OpenOfficeConverter: 44 44 #print STDERR "$@\n"; 45 45 @WordPlugin::ISA = ('ConvertBinaryFile'); … … 174 174 } 175 175 176 my $outhandle = $self->{'outhandle'};177 176 $self->{'filename_extension'} = "doc"; 178 177 $self->{'file_type'} = "Word"; 178 179 my $outhandle = $self->{'outhandle'}; 179 180 180 181 if ($self->{'windows_scripting'}) { … … 193 194 } 194 195 195 # we always save as utf-8 196 # if ($self->{'input_encoding'} eq "auto") { 197 # $self->{'input_encoding'} = "utf8"; 198 # } 199 196 # check convert_to 197 if ($self->{'convert_to'} eq "auto") { 198 $self->{'convert_to'} = "html"; 199 } 200 # windows or open office scripting, outputs structuredHTML 201 if (defined $self->{'office_scripting'}) { 202 $self->{'convert_to'} = "structuredhtml"; 203 } 204 205 # set convert_to_plugin and convert_to_ext 206 $self->ConvertBinaryFile::set_standard_convert_settings(); 207 208 my $secondary_plugin_name = $self->{'convert_to_plugin'}; 200 209 my $secondary_plugin_options = $self->{'secondary_plugin_options'}; 201 if (defined $self->{'office_scripting'}) { 202 if (!defined $secondary_plugin_options->{'StructuredHTMLPlugin'}){ 203 $secondary_plugin_options->{'StructuredHTMLPlugin'} = []; 204 my $structhtml_options = $secondary_plugin_options->{'StructuredHTMLPlugin'}; 205 206 # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj) 207 # to extract these metadata fields from the HEAD META fields 208 push (@$structhtml_options, "-metadata_fields","Title,GENERATOR,date,author<Creator>"); 209 push (@$structhtml_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?'); 210 push (@$structhtml_options, "-description_tags") if $self->{'office_scripting'}; 211 push (@$structhtml_options, "-extract_language") if $self->{'extract_language'}; 212 push 
(@$structhtml_options, "-delete_toc") if $self->{'delete_toc'}; 213 push (@$structhtml_options, "-toc_header", $self->{'toc_header'}) if $self->{'toc_header'}; 214 push (@$structhtml_options, "-title_header", $self->{'title_header'}) if $self->{'title_header'}; 215 push (@$structhtml_options, "-level1_header", $self->{'level1_header'}) if $self->{'level1_header'}; 216 push (@$structhtml_options, "-level2_header", $self->{'level2_header'})if $self->{'level2_header'}; 217 push (@$structhtml_options, "-level3_header", $self->{'level3_header'}) if $self->{'level3_header'}; 218 push (@$structhtml_options, "-metadata_fields", $self->{'metadata_fields'}) if $self->{'metadata_fields'}; 219 push (@$structhtml_options, "-metadata_field_separator", $self->{'metadata_field_separator'}) if $self->{'metadata_field_separator'}; 220 } 221 } 222 if (!defined $secondary_plugin_options->{'HTMLPlugin'}) { 223 $secondary_plugin_options->{'HTMLPlugin'} = []; 224 } 225 if (!defined $secondary_plugin_options->{'TextPlugin'}) { 226 $secondary_plugin_options->{'TextPlugin'} = []; 227 } 228 229 my $html_options = $secondary_plugin_options->{'HTMLPlugin'}; 230 my $text_options = $secondary_plugin_options->{'TextPlugin'}; 231 my $structhtml_options = $secondary_plugin_options->{'StructuredHTMLPlugin'}; 232 # tell the secondary plugins that they are processing tmp files 233 push(@$html_options, "-processing_tmp_files"); 234 push(@$structhtml_options, "-processing_tmp_files"); 235 236 # wvWare will always produce html files encoded as utf-8, so make sure the secondary HTMLPlugin knows this 237 push(@$html_options,"-input_encoding", "utf8"); 238 push(@$html_options,"-extract_language") if $self->{'extract_language'}; 239 push(@$html_options, "-description_tags") if $self->{'description_tags'}; 240 241 # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj) 242 # to extract these metadata fields from the HEAD META fields 243 
push(@$html_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>"); 244 push(@$html_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?'); 210 211 if (!defined $secondary_plugin_options->{$secondary_plugin_name}) { 212 $secondary_plugin_options->{$secondary_plugin_name} = []; 213 } 214 my $specific_options = $secondary_plugin_options->{$secondary_plugin_name}; 215 216 # following title_sub removes "Page 1" and a leading 217 # "1", which is often the page number at the top of the page. Bad Luck 218 # if your document title actually starts with "1 " - is there a better way? 219 push(@$specific_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?'); 245 220 246 221 my $associate_tail_re = $self->{'associate_tail_re'}; 247 222 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) { 248 push(@$html_options, "-associate_tail_re", $associate_tail_re); 249 push(@$text_options, "-associate_tail_re", $associate_tail_re); 250 push(@$structhtml_options, "-associate_tail_re", $associate_tail_re) if defined $structhtml_options; 251 } 252 253 push(@$html_options, "-file_rename_method", "none"); 254 push(@$text_options, "-file_rename_method", "none"); 255 push(@$structhtml_options, "-file_rename_method", "none") if defined $structhtml_options; 223 push(@$specific_options, "-associate_tail_re", $associate_tail_re); 224 } 225 push(@$specific_options, "-file_rename_method", "none"); 226 227 if ($secondary_plugin_name eq "StructuredHTMLPlugin") { 228 # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj) 229 # to extract these metadata fields from the HEAD META fields 230 push (@$specific_options, "-metadata_fields","Title,GENERATOR,date,author<Creator>"); 231 push (@$specific_options, "-description_tags") if $self->{'office_scripting'}; 232 push (@$specific_options, "-extract_language") if $self->{'extract_language'}; 233 push (@$specific_options, "-delete_toc") if $self->{'delete_toc'}; 234 push (@$specific_options, "-toc_header", 
$self->{'toc_header'}) if $self->{'toc_header'}; 235 push (@$specific_options, "-title_header", $self->{'title_header'}) if $self->{'title_header'}; 236 push (@$specific_options, "-level1_header", $self->{'level1_header'}) if $self->{'level1_header'}; 237 push (@$specific_options, "-level2_header", $self->{'level2_header'})if $self->{'level2_header'}; 238 push (@$specific_options, "-level3_header", $self->{'level3_header'}) if $self->{'level3_header'}; 239 push (@$specific_options, "-metadata_fields", $self->{'metadata_fields'}) if $self->{'metadata_fields'}; 240 push (@$specific_options, "-metadata_field_separator", $self->{'metadata_field_separator'}) if $self->{'metadata_field_separator'}; 241 push(@$specific_options, "-processing_tmp_files"); 242 243 } 244 245 elsif ($secondary_plugin_name eq "HTMLPlugin") { 246 push(@$specific_options, "-processing_tmp_files"); 247 push(@$specific_options,"-input_encoding", "utf8"); 248 push(@$specific_options,"-extract_language") if $self->{'extract_language'}; 249 push(@$specific_options, "-description_tags") if $self->{'description_tags'}; 250 # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj) 251 # to extract these metadata fields from the HEAD META fields 252 push(@$specific_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>"); 253 } 256 254 257 255 $self = bless $self, $class;
Note: See TracChangeset for help on using the changeset viewer.