source: main/trunk/greenstone2/perllib/plugins/PagedImagePlugin.pm@ 22705

Last change on this file since 22705 was 22565, checked in by kjdon, 14 years ago

removed block exp. now it scans the item file to work out which files to block

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 25.1 KB
Line 
1###########################################################################
2#
3# PagedImagePlugin.pm -- plugin for sets of images and OCR text that
4# make up a document
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27# PagedImagePlugin
28# processes sequences of images, with optional OCR text
29#
30# This plugin takes *.item files, which contain metadata and lists of image
31# files, and produces a document containing sections, one for each page.
32# The files should be named something.item, then you can have more than one
33# book in a directory. You will need to create these files, one for each
34# document/book.
35#
36#There are two formats for the item files: a plain text format, and an xml
37#format. You can use either format, and can have both formats in the same
38#collection if you like. If you use the plain format, you must not start the
39#file off with <PagedDocument>
40
41#### PLAIN FORMAT
42# The format of the xxx.item file is as follows:
43# The first lines contain any metadata for the whole document
44# <metadata-name>metadata-value
45# eg.
46# <Title>Snail farming
47# <Date>19230102
48# Then comes a list of pages, one page per line, each line has the format
49#
50# pagenum:imagefile:textfile:r
51#
52# page num and imagefile are required. pagenum is used for the Title
53# of the section, and in the display is shown as page <pagenum>.
54# imagefile is the image for the page. textfile is an optional text
55# file containing the OCR (or any) text for the page - this gets added
56# as the text for the section. r is optional, and signals that the image
57# should be rotated 180deg. Eg use this if the image has been made upside down.
58# So an example item file looks like:
59# <Title>Snail farming
60# <Date>19960403
61# 1:p1.gif:p1.txt:
62# 2:p2.gif::
63# 3:p3.gif:p3.txt:
64# 3b:p3b.gif:p3b.txt:r
65# The second page has no text, the fourth page is a back page, and
66# should be rotated.
67#
68
69#### XML FORMAT
70# The xml format looks like the following
71#<PagedDocument>
72#<Metadata name="Title">The Title of the entire document</Metadata>
73#<Page pagenum="1" imgfile="xxx.jpg" txtfile="yyy.txt">
74#<Metadata name="Title">The Title of this page</Metadata>
75#</Page>
76#... more pages
77#</PagedDocument>
78#PagedDocument contains a list of Pages, Metadata and PageGroups. Any metadata
79#that is not inside another tag will belong to the document.
80#Each Page has a pagenum (stored as PageNum metadata, and used as the section Title if no Title is given), an imgfile and/or a txtfile.
81#These are both optional - if neither is used, the section will have no content.
82#Pages can also have metadata associated with them.
83#PageGroups can be introduced at any point - they can contain Metadata and Pages and other PageGroups. They are used to introduce hierarchical structure into the document.
84#For example
85#<PagedDocument>
86#<PageGroup>
87#<Page>
88#<Page>
89#</PageGroup>
90#<Page>
91#</PagedDocument>
92#would generate a structure like
93#X
94#--X
95# --X
96# --X
97#--X
98#PageGroup tags can also have imgfile/txtfile attributes if you like - this way they get some content themselves.
99
100#Currently the XML structure doesn't work very well with the paged document type, unless you use numerical Titles for each section.
101#There is still a bit of work to do on this format:
102#* enable other text file types, eg html, pdf etc
103#* make the document paging work properly
104#* add pagenum as Title unless a Title is present?
105
106# All the supplementary image and text files should be in the same folder as
107# the .item file.
108#
109# To display the images instead of the document text, you can use [srcicon]
110# in the DocumentText format statement.
111# For example,
112#
113# format DocumentText "<center><table width=_pagewidth_><tr><td>[srcicon]</td></tr></table></center>"
114#
115# To have it create thumbnail size images, use the '-create_thumbnail' option.
116# To have it create medium size images for display, use the '-create_screenview'
117# option. As usual, running
118# 'perl -S pluginfo.pl PagedImagePlugin' will list all the options.
119
120# If you want the resulting documents to be presented with a table of
121# contents, use '-documenttype hierarchy', otherwise they will have
122# next and previous arrows, and a goto page X box.
123
124# If you have used -create_screenview, you can also use [screenicon] in the format
125# statement to display the smaller image. Here is an example that switches
126# between the two:
127#
128# format DocumentText "<center><table width=_pagewidth_><tr><td>{If}{_cgiargp_ eq full,<a href='_httpdocument_&d=_cgiargd_&p=small'>Switch to small version.</a>,<a href='_httpdocument_&d=_cgiargd_&p=full'>Switch to fullsize version</a>}</td></tr><tr><td>{If}{_cgiargp_ eq full,<a href='_httpdocument_&d=_cgiargd_&p=small' title='Switch to small version'>[srcicon]</a>,<a href='_httpdocument_&d=_cgiargd_&p=full' title='Switch to fullsize version'>[screenicon]</a>}</td></tr></table></center>"
129#
130# Additional metadata can be added into the .item files, alternatively you can
131# use normal metadata.xml files, with the name of the xxx.item file as the
132# FileName (only for document level metadata).
133
134package PagedImagePlugin;
135
136use ReadXMLFile;
137use ReadTextFile;
138use ImageConverter;
139
140use strict;
141no strict 'refs'; # allow filehandles to be variables and viceversa
142
# Set up the inheritance chain at compile time: this plugin derives from
# ReadXMLFile, ReadTextFile and ImageConverter (in that method-lookup order).
sub BEGIN {
    @PagedImagePlugin::ISA = qw(ReadXMLFile ReadTextFile ImageConverter);
}
146
# Values accepted by the -documenttype option.
my $type_list = [
    {
        'name' => "paged",
        'desc' => "{PagedImagePlugin.documenttype.paged}",
    },
    {
        'name' => "hierarchy",
        'desc' => "{PagedImagePlugin.documenttype.hierarchy}",
    },
];

# Command-line arguments understood by this plugin (in addition to those
# contributed by the parent plugins).
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasePlugin.process_exp}",
        'type' => "string",
        'deft' => get_default_process_exp(),
        'reqd' => "no",
    },
    {
        'name' => "title_sub",
        'desc' => "{HTMLPlugin.title_sub}",
        'type' => "string",
        'deft' => "",
    },
    {
        'name' => "headerpage",
        'desc' => "{PagedImagePlugin.headerpage}",
        'type' => "flag",
        'reqd' => "no",
    },
    {
        'name' => "documenttype",
        'desc' => "{PagedImagePlugin.documenttype}",
        'type' => "enum",
        'list' => $type_list,
        'deft' => "paged",
        'reqd' => "no",
    },
    {
        'name'      => "processing_tmp_files",
        'desc'      => "{BasePlugin.processing_tmp_files}",
        'type'      => "flag",
        'hiddengli' => "yes",
    },
];

# Plugin description record pushed onto the option list in new().
my $options = {
    'name'     => "PagedImagePlugin",
    'desc'     => "{PagedImagePlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
185
# Constructor.
#
# Registers this plugin's arguments and options, builds one object per
# parent class, and merges them into a single object blessed into $class.
sub new {
    my $class = shift;
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push @{$pluginlist}, $class;
    push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments};
    push @{ $hashArgOptLists->{"OptList"} }, $options;

    my $imc_self = ImageConverter->new($pluginlist, $inputargs, $hashArgOptLists);
    my $rtf_self = ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists, 1);
    my $rxf_self = ReadXMLFile->new($pluginlist, $inputargs, $hashArgOptLists);

    my $self = BasePlugin::merge_inheritance($imc_self, $rtf_self, $rxf_self);

    # The XML parser was created while $self was still the ReadXMLFile
    # object, so point it at the merged object; otherwise callbacks such as
    # start_document would be looked up in ReadXMLFile rather than here.
    # (Not needed when the constructor returns the very object ReadXMLFile
    # used, e.g. in GreenstoneXMLPlugin.  Folding this assignment into
    # merge_inheritance might catch every case.)
    $rxf_self->{'parser'}->{'PluginObj'} = $self;

    return bless $self, $class;
}
214
215
# Initialise the plugin: run the inherited init with the caller's arguments
# (verbosity, outhandle, failhandle), then ImageConverter's own init.
sub init {
    my $self = shift;

    $self->SUPER::init(@_);
    return $self->ImageConverter::init();
}
223
# Start-of-import hook: forwards the caller's arguments (pluginfo, base_dir,
# processor, maxdocs) to the inherited begin and then to ImageConverter's.
sub begin {
    my $self = shift;

    $self->SUPER::begin(@_);
    return $self->ImageConverter::begin(@_);
}
231
# Default regular expression (as a string) selecting the files this plugin
# processes: anything whose name ends in ".item".
sub get_default_process_exp {
    return '\.item$';
}
237
# Name of the root element expected in XML-format item files.
sub get_doctype {
    return 'PagedDocument';
}
243
244
# Deliberately bypass ReadXMLFile's override and use BasePlugin's
# implementation of this check.
sub can_process_this_file {
    my $self = shift;

    return $self->BasePlugin::can_process_this_file(@_);
}
250
# Rather than relying on a block expression, scan the item file and record
# every image and text file it mentions in $block_hash so they are blocked.
#
# The format check is made first, then the file is tidied (so it does not
# need tidying again later), then the format-specific scanner is run.
sub store_block_files {
    my $self = shift;
    my ($filename_full_path, $block_hash) = @_;

    my $is_xml = $self->is_xml_item_file($filename_full_path);
    $self->tidy_item_file($filename_full_path);

    # everything up to and including the last path separator
    my ($dir) = $filename_full_path =~ /^(.*?)([^\/\\]*)$/;

    my $scanner = $is_xml
        ? 'scan_xml_for_files_to_block'
        : 'scan_item_for_files_to_block';
    $self->$scanner($filename_full_path, $dir, $block_hash);
}
273
# Deliberately use BasePlugin's read rather than the one inherited from
# ReadXMLFile; all arguments are passed through untouched.
sub read {
    my $self = shift;

    return $self->BasePlugin::read(@_);
}
281
282
283
# Build the document object for one .item file.
#
# Decides whether the item file is in XML or plain format, builds the
# document with the matching reader, then attaches the standard top-level
# metadata.  Returns the list (1, $doc_obj).
sub read_into_doc_obj {
    my $self = shift;
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    if ($self->{'verbosity'} > 1) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "PagedImagePlugin processing \"$filename_full_path\"\n";
    }
    if ($gli) {
        print STDERR "<Processing n='$file' p='PagedImagePlugin'>\n";
    }

    # reset the running maxima before any page image is processed
    $self->{'MaxImageWidth'}  = 0;
    $self->{'MaxImageHeight'} = 0;

    # The item file has already been tidied in store_block_files, so it is
    # not tidied again here.
    my $doc_obj;
    if ($self->is_xml_item_file($filename_full_path)) {
        # new-style XML item file: the xml_* handlers in this package build
        # the document and leave it in $self->{'doc_obj'}
        $self->ReadXMLFile::read($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
        $doc_obj = $self->{'doc_obj'};
    }
    else {
        # old-style plain text item file
        my ($dir, $item_name) = $filename_full_path =~ /^(.*?)([^\/\\]*)$/;
        $doc_obj = $self->process_item($filename_full_path, $dir, $item_name, $processor);
    }

    my $section = $doc_obj->get_top_section();

    $doc_obj->add_utf8_metadata($section, "Plugin", "$self->{'plugin_type'}");
    $doc_obj->add_metadata($section, "FileFormat", "PagedImage");

    # metadata handed down from earlier plugins is attached to the top
    # level section
    $self->add_associated_files($doc_obj, $filename_full_path);
    $self->extra_metadata($doc_obj, $section, $metadata);
    $self->auto_extract_metadata($doc_obj);

    # make sure the document ends up with some Title
    $self->title_fallback($doc_obj, $section, $filename_no_path);

    $self->add_OID($doc_obj);
    return (1, $doc_obj);
}
337
# Decide whether an item file is in the XML format.
#
# The file is XML when its first line containing any word character
# mentions <PagedDocument.  A leading <?xml ... ?> declaration is skipped
# first, so a well-formed XML item file that starts with one is not
# mistaken for the plain format.
#
# Parameters: $filename - full path of the .item file.
# Returns:    1 for an XML item file, 0 otherwise (including an empty file).
# Dies if the file cannot be opened.
sub is_xml_item_file {
    my $self = shift(@_);
    my ($filename) = @_;

    # three-argument open with a lexical handle: the name is never parsed
    # for a mode, and no global handle is shared with the other readers
    open(my $item_fh, '<', $filename) or die "couldn't open $filename\n";

    my $xml_version = 0;
    while (defined(my $line = <$item_fh>)) {
        $line =~ s/<\?xml\b.*?\?>//;    # ignore an XML declaration
        next unless $line =~ /\w/;      # skip lines with no content
        $xml_version = 1 if $line =~ /<PagedDocument/;
        last;                           # only the first real line counts
    }
    close $item_fh;

    return $xml_version;
}
364
# Clean up an item file in place.
#
#  * strips a UTF-8 byte order mark from the start of the file
#  * removes vertical tab characters (some metadata titles contain them)
#  * escapes bare '&' as '&amp;'
#
# The escaping is idempotent: an '&' that already starts one of the
# predefined XML entities (&amp; &lt; &gt; &quot; &apos;) or a numeric
# character reference is left alone, so importing the same collection
# repeatedly does not turn '&amp;' into '&amp;amp;'.
#
# The file is only rewritten when tidying actually changes it, and no
# scratch file is created.
#
# Parameters: $filename - full path of the .item file.
# Dies if the file cannot be read or written.
sub tidy_item_file {
    my $self = shift(@_);
    my ($filename) = @_;

    open(my $in_fh, '<', $filename) or die "couldn't open $filename\n";
    my @original = <$in_fh>;
    close $in_fh;

    my @tidied = @original;
    $tidied[0] =~ s/^\xEF\xBB\xBF// if @tidied;    # strip BOM
    foreach my $line (@tidied) {
        $line =~ s/\x0B+//g;
        $line =~ s/&(?!(?:amp|lt|gt|quot|apos|#[0-9]+|#x[0-9A-Fa-f]+);)/&amp;/g;
    }

    # nothing to do (this also covers an empty file)
    return if join('', @tidied) eq join('', @original);

    open(my $out_fh, '>', $filename) or die "couldn't write to $filename\n";
    print {$out_fh} @tidied;
    close($out_fh) or die "couldn't write to $filename\n";
    return;
}
390
# Rotate an image by 180 degrees using the inherited convert method.
#
# Parameters: $filename_full_path - image to rotate.
# Returns:    the path of the rotated file when the convert result names
#             one (after "=>") and that file exists; otherwise the
#             original path unchanged.
sub rotate_image {
    my $self = shift(@_);
    my ($filename_full_path) = @_;

    # file extension, without the dot; undefined when the name has none
    my ($this_filetype) = $filename_full_path =~ /\.([^\.]*)$/;
    my $result = $self->convert($filename_full_path, $this_filetype, "-rotate 180", "ROTATE");

    # quote the extension so it is matched literally, and tolerate a
    # missing extension or an undefined convert result
    my $type_re = defined $this_filetype ? quotemeta($this_filetype) : '';
    my $new_filename;
    if (defined $result && $result =~ /=>(.*\.$type_re)/) {
        $new_filename = $1;
    }

    if (defined $new_filename && -e $new_filename) {
        return $new_filename;
    }
    # something has gone wrong; fall back to the unrotated image
    return $filename_full_path;
}
405
# Attach one page image to a section of the document.
#
# Parameters: full path and bare name of the image, the document object,
#             the section to attach to, and an optional rotate flag ("r").
# Returns:    0 when the name is empty or the file does not exist, or when
#             image conversion is unavailable; otherwise whatever
#             generate_images returns.
sub process_image {
    my $self = shift;
    my ($filename_full_path, $filename_no_path, $doc_obj, $section, $rotate) = @_;

    # nothing to do without a usable file
    if ($filename_no_path eq "" || !-f $filename_full_path) {
        return 0;
    }

    # record the image as one of the source files, except when working on
    # temporary files
    unless ($self->{'processing_tmp_files'}) {
        $doc_obj->associate_source_file($filename_full_path);
    }

    # a rotated image is a new temporary file, used from here on
    if (defined $rotate && $rotate eq "r") {
        $filename_full_path = $self->rotate_image($filename_full_path);
    }

    my $result = 0;
    if ($self->{'image_conversion_available'} == 1) {
        # NOTE(review): $filename_no_path was read from the item file; its
        # encoding is not established here - confirm whether it needs
        # converting to utf8
        $result = $self->generate_images($filename_full_path, $filename_no_path, $doc_obj, $section);
    }

    # replace the FileFormat value set by ImageConverter
    $doc_obj->set_metadata_element($section, "FileFormat", "PagedImage");
    return $result;
}
433
434
# XML parser callback for an opening tag.  The tag's attributes are read
# from the global hash %_.
#
#  PagedDocument   - current section becomes the document's top section
#  Page, PageGroup - a new child section is appended and becomes current;
#                    its pagenum, imgfile and txtfile attributes are handled
#  Metadata        - the name attribute is remembered for xml_end_tag
sub xml_start_tag {
    my $self = shift;
    my ($expat, $element) = @_;

    $self->{'element'} = $element;
    my $doc_obj = $self->{'doc_obj'};

    if ($element eq "PagedDocument") {
        $self->{'current_section'} = $doc_obj->get_top_section();
    }
    elsif ($element eq "PageGroup" || $element eq "Page") {
        # append a new section below the current one and descend into it
        my $parent  = $self->{'current_section'};
        my $section = $doc_obj->insert_section($doc_obj->get_end_child($parent));
        $self->{'current_section'} = $section;
        $self->{'num_pages'}++;

        # TODO: decide what else the page number should be used for
        my $pagenum = $_{'pagenum'};
        if (defined $pagenum) {
            $doc_obj->set_utf8_metadata_element($section, 'PageNum', $pagenum);
        }

        my $imgfile = $_{'imgfile'};
        if (defined $imgfile) {
            # no rotate flag is passed here, so the XML format has no
            # equivalent of the plain format's ':r' yet
            $self->process_image($self->{'xml_file_dir'} . $imgfile, $imgfile, $doc_obj, $section);
        }

        my $txtfile = $_{'txtfile'};
        if (defined($txtfile) && $txtfile ne "") {
            $self->process_text($self->{'xml_file_dir'} . $txtfile, $txtfile, $doc_obj, $section);
        }
        else {
            $self->add_dummy_text($doc_obj, $section);
        }
    }
    elsif ($element eq "Metadata") {
        $self->{'metadata_name'} = $_{'name'};
    }
}
468
# XML parser callback for a closing tag.
#
#  Page, PageGroup - give the section its PageNum as Title when it has no
#                    Title, then make the parent section current again
#  Metadata        - store the collected value under the remembered name
#                    (names containing a '.' get an "ex." prefix) and reset
#                    the collection state
# Any other closing tag is ignored.
sub xml_end_tag {
    my $self = shift;
    my ($expat, $element) = @_;

    my $doc_obj = $self->{'doc_obj'};

    if ($element eq "Page" || $element eq "PageGroup") {
        my $section = $self->{'current_section'};

        # fall back to the page number when no Title was supplied
        if (   !defined $doc_obj->get_metadata_element($section, "Title")
            && defined $doc_obj->get_metadata_element($section, "PageNum"))
        {
            $doc_obj->add_utf8_metadata($section, "Title", $doc_obj->get_metadata_element($section, "PageNum"));
        }

        # step back up to the parent section
        $self->{'current_section'} = $doc_obj->get_parent_section($section);
    }
    elsif ($element eq "Metadata") {
        my $meta_name = $self->{'metadata_name'};
        $meta_name = "ex.$meta_name" if $meta_name =~ /\./;

        $doc_obj->add_utf8_metadata($self->{'current_section'}, $meta_name, $self->{'metadata_value'});

        $self->{'metadata_name'}  = "";
        $self->{'metadata_value'} = "";
    }
}
493
494
# XML parser callback for character data (delivered in $_).  Text is
# appended to the pending metadata value, but only while the most recently
# opened element is a Metadata element with a non-empty name.
sub xml_text {
    my $self = shift;
    my ($expat) = @_;

    my $collecting = $self->{'element'} eq "Metadata" && $self->{'metadata_name'};
    if ($collecting) {
        $self->{'metadata_value'} .= $_;
    }
}
503
# XML parser callback for a DOCTYPE declaration: intentionally does nothing.
sub xml_doctype {
    return;
}
506
# Called at the start of an XML item file: creates the document object,
# fills in its initial fields, remembers the directory of the item file
# (used to locate page images and text files) and resets the page count.
sub open_document {
    my $self = shift;

    my $doc_obj = doc->new($self->{'filename'}, "indexed_doc", $self->{'file_rename_method'});
    $self->{'doc_obj'} = $doc_obj;

    # NOTE(review): $self->{'file'} is passed as the path-less filename;
    # confirm that is what it holds
    $self->set_initial_doc_fields($doc_obj, $self->{'file'}, $self->{'processor'});

    # everything up to and including the last path separator
    my ($dir) = $self->{'filename'} =~ /^(.*?)([^\/\\]*)$/;
    $self->{'xml_file_dir'} = $dir;
    $self->{'num_pages'}    = 0;
}
520
# Called at the end of an XML item file: records the page count and the
# largest image dimensions on the top section, then clears the running
# maxima.
sub close_document {
    my $self = shift;

    my $doc_obj = $self->{'doc_obj'};
    my $top     = $doc_obj->get_top_section();

    $doc_obj->set_utf8_metadata_element($top, 'NumPages', $self->{'num_pages'});
    foreach my $dimension ("MaxImageWidth", "MaxImageHeight") {
        $doc_obj->set_utf8_metadata_element($top, $dimension, $self->{$dimension});
    }

    $self->{'MaxImageWidth'}  = undef;
    $self->{'MaxImageHeight'} = undef;
}
536
537
# Set the fields every new document starts with: its gsdlthistype, its
# Source metadata and, when a header page was requested, dummy text for the
# top section.
sub set_initial_doc_fields {
    my $self = shift;
    my ($doc_obj, $filename_no_path, $processor) = @_;

    my $topsection = $doc_obj->get_top_section();

    # "Paged" makes the document be treated as a paged one even when its
    # section Titles are not numeric; anything else becomes a hierarchy
    my $this_type = ($self->{'documenttype'} eq 'paged') ? "Paged" : "Hierarchy";
    $doc_obj->set_utf8_metadata_element($topsection, "gsdlthistype", $this_type);

    $self->set_Source_metadata($doc_obj, $filename_no_path);

    # a header page needs some text in the top section, otherwise that
    # section would be invisible
    if ($self->{'headerpage'}) {
        $self->add_dummy_text($doc_obj, $topsection);
    }
}
561
# Scan an XML item file and mark every file named by an imgfile or txtfile
# attribute as blocked.
#
# Every occurrence on a line is recorded (several Page tags may share one
# line), and both double- and single-quoted attribute values are accepted.
#
# Parameters: $filename_full_path - the .item file to scan
#             $dir                - directory prefix (with trailing
#                                   separator) prepended to each name
#             $block_hash         - hash ref; names are recorded under its
#                                   'file_blocks' key
# Dies if the file cannot be opened.
sub scan_xml_for_files_to_block
{
    my $self = shift (@_);
    my ($filename_full_path, $dir, $block_hash) = @_;

    open(my $item_fh, '<', $filename_full_path)
        or die "couldn't open $filename_full_path to work out which files to block\n";

    while (defined(my $line = <$item_fh>)) {
        next unless $line =~ /\w/;

        while ($line =~ /(?:imgfile|txtfile)\s*=\s*(?:"([^"]+)"|'([^']+)')/g) {
            my $name = defined $1 ? $1 : $2;
            $block_hash->{'file_blocks'}->{$dir.$name} = 1;
        }
    }
    close $item_fh;

    return;
}
582
# Scan a plain-format item file and mark every image and text file it
# lists as blocked.
#
# Blank lines, comment lines (starting with '#') and metadata lines
# (<name>value) are skipped; every other line is expected to look like
#   pagenum:imagefile:textfile:r
#
# Parameters: $filename_full_path - the .item file to scan
#             $dir                - directory prefix (with trailing
#                                   separator) prepended to each name
#             $block_hash         - hash ref; names are recorded under its
#                                   'file_blocks' key
# Dies if the file cannot be opened.
sub scan_item_for_files_to_block
{
    my $self = shift (@_);
    my ($filename_full_path, $dir, $block_hash) = @_;

    # three-argument open with a lexical handle: the name is never parsed
    # for a mode, and no global handle is shared with the other readers
    open(my $item_fh, '<', $filename_full_path)
        or die "couldn't open $filename_full_path to work out which files to block\n";

    while (defined(my $line = <$item_fh>)) {
        next unless $line =~ /\w/;
        chomp $line;
        next if $line =~ /^#/;          # ignore comment lines
        next if $line =~ /^<[^>]*>/;    # ignore metadata lines

        $line =~ s/^\s+//;    # remove space at the front
        $line =~ s/\s+$//;    # remove space at the end
        my (undef, $imgname, $txtname) = split /:/, $line;

        foreach my $name ($imgname, $txtname) {
            next unless defined $name && $name ne "";
            $block_hash->{'file_blocks'}->{$dir.$name} = 1;
        }
    }
    close $item_fh;

    return;
}
612
# Build a document from a plain-format item file.
#
# Metadata lines (<name>value) are attached to the top section; names that
# contain a '.' get an "ex." prefix.  Every other non-blank, non-comment
# line is expected to look like
#   pagenum:imagefile:textfile:r
# and becomes one child section whose Title is the page number.  The
# trailing 'r' is optional and asks for the image to be rotated by 180
# degrees.
#
# A section always receives some text: the text file's content when it
# could be processed, dummy text otherwise (including when the listed text
# file is missing).  Missing image files are reported.
#
# Parameters: $filename_full_path - the .item file
#             $dir                - directory prefix (with trailing
#                                   separator) of the page files
#             $filename_no_path   - bare name of the .item file
#             $processor          - passed on to set_initial_doc_fields
# Returns:    the new document object.
# Dies if the item file cannot be opened.
sub process_item {
    my $self = shift (@_);
    my ($filename_full_path, $dir, $filename_no_path, $processor) = @_;

    my $doc_obj = doc->new($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
    $self->set_initial_doc_fields($doc_obj, $filename_no_path, $processor);
    my $topsection = $doc_obj->get_top_section();

    open(my $item_fh, '<', $filename_full_path) or die "couldn't open $filename_full_path\n";

    my $num = 0;    # number of pages found
    while (defined(my $line = <$item_fh>)) {
        next unless $line =~ /\w/;
        chomp $line;
        next if $line =~ /^#/; # ignore comment lines

        if ($line =~ /^<([^>]*)>\s*(.*?)\s*$/) {
            # document level metadata
            my $meta_name = $1;
            my $meta_value = $2;
            if ($meta_name =~ /\./) {
                $meta_name = "ex.$meta_name";
            }
            $doc_obj->set_utf8_metadata_element ($topsection, $meta_name, $meta_value);
            next;
        }

        $num++;
        $line =~ s/^\s+//; #remove space at the front
        $line =~ s/\s+$//; #remove space at the end
        my ($pagenum, $imgname, $txtname, $rotate) = split /:/, $line;

        # create a new section for each page; the page number is its Title
        my $cursection = $doc_obj->insert_section($doc_obj->get_end_child($topsection));
        $doc_obj->set_utf8_metadata_element($cursection, 'Title', $pagenum);

        # process the image for this page if there is one
        if (defined $imgname && $imgname ne "") {
            my $image_path = $dir.$imgname;
            my $result1 = $self->process_image($image_path, $imgname, $doc_obj, $cursection, $rotate);
            # process_image returns 0 both for a missing file and when no
            # conversion was attempted, so the missing file is detected here
            if (!defined $result1 || !-f $image_path) {
                print "PagedImagePlugin: couldn't process image \"$image_path\" for item \"$filename_full_path\"\n";
            }
        }

        # process the text file if one is listed
        my $have_text = 0;
        if (defined $txtname && $txtname ne "") {
            # process_text returns 0 (not undef) on failure, so test truth
            $have_text = $self->process_text ($dir.$txtname, $txtname, $doc_obj, $cursection);
            if (!$have_text) {
                print "PagedImagePlugin: couldn't process text file \"$dir$txtname\" for item \"$filename_full_path\"\n";
            }
        }
        # a section must not be left without text
        if (!$have_text) {
            $self->add_dummy_text($doc_obj, $cursection);
        }
    }

    close $item_fh;

    # add numpages metadata
    $doc_obj->set_utf8_metadata_element ($topsection, 'NumPages', "$num");

    $doc_obj->set_utf8_metadata_element($topsection,"MaxImageWidth",$self->{'MaxImageWidth'});
    $doc_obj->set_utf8_metadata_element($topsection,"MaxImageHeight",$self->{'MaxImageHeight'});
    $self->{'MaxImageWidth'} = undef;
    $self->{'MaxImageHeight'} = undef;

    return $doc_obj;
}
683
# Add the content of one text file to a section of the document.
#
# Text that looks like a complete HTML page contributes only its <body>
# content; anything else has '<' and '>' escaped and is wrapped in <pre>
# tags.
#
# Returns 0 when the file does not exist, 1 otherwise.
sub process_text {
    my $self = shift;
    my ($filename_full_path, $file, $doc_obj, $cursection) = @_;

    # the text file has to exist
    unless (-f $filename_full_path) {
        print "PagedImagePlugin: ERROR: File $filename_full_path does not exist, skipping\n";
        return 0;
    }

    # record the text file as one of the source files, except when working
    # on temporary files
    unless ($self->{'processing_tmp_files'}) {
        $doc_obj->associate_source_file($filename_full_path);
    }

    # work out language and encoding, then read the file
    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename_full_path);

    my $text = "";
    &ReadTextFile::read_file($self, $filename_full_path, $encoding, $language, \$text);

    # an empty file is unusual but possible, so only warn
    print "PagedImagePlugin: WARNING: $filename_full_path contains no text\n"
        if !length ($text);

    # escape the escape character, or else mg will turn e.g. '\n' into a
    # literal newline; '_' is special in the macro language too
    $text =~ s/\\/\\\\/g;
    $text =~ s/_/\\_/g;

    if ($text =~ m/<html.*?>\s*<head.*?>.*<\/head>\s*<body.*?>(.*)<\/body>\s*<\/html>\s*$/is) {
        # HTML input: keep the body as it is, no escaping or <pre> needed
        my $body = $1;
        $doc_obj->add_utf8_text($cursection, "$body");
    }
    else {
        # plain text: escape angle brackets and keep the layout
        $text =~ s/</&lt;/g;
        $text =~ s/>/&gt;/g;
        $doc_obj->add_utf8_text($cursection, "<pre>\n$text\n</pre>");
    }

    return 1;
}
735
736
# After a document has been processed, let ImageConverter remove the
# temporary files it created.
sub clean_up_after_doc_obj_processing {
    my $self = shift;

    return $self->ImageConverter::clean_up_temporary_files();
}
742
7431;
Note: See TracBrowser for help on using the repository browser.