Context Navigation

source: main/trunk/greenstone2/perllib/plugins/PagedImagePlugin.pm@ 22705

Last change on this file since 22705 was 22565, checked in by kjdon, 14 years ago
removed block exp. now it scans the item file to work out which files to block
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 25.1 KB

Line
1	###########################################################################
2	#
3	# PagedImagePlugin.pm -- plugin for sets of images and OCR text that
4	# make up a document
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	# PagedImagePlugin
28	# processes sequences of images, with optional OCR text
29	#
30	# This plugin takes *.item files, which contain metadata and lists of image
31	# files, and produces a document containing sections, one for each page.
32	# The files should be named something.item, then you can have more than one
33	# book in a directory. You will need to create these files, one for each
34	# document/book.
35	#
36	#There are two formats for the item files: a plain text format, and an xml
37	#format. You can use either format, and can have both formats in the same
38	#collection if you like. If you use the plain format, you must not start the
39	#file off with <PagedDocument>
40
41	#### PLAIN FORMAT
42	# The format of the xxx.item file is as follows:
43	# The first lines contain any metadata for the whole document
44	# <metadata-name>metadata-value
45	# eg.
46	# <Title>Snail farming
47	# <Date>19230102
48	# Then comes a list of pages, one page per line, each line has the format
49	#
50	# pagenum:imagefile:textfile:r
51	#
52	# page num and imagefile are required. pagenum is used for the Title
53	# of the section, and in the display is shown as page <pagenum>.
54	# imagefile is the image for the page. textfile is an optional text
55	# file containing the OCR (or any) text for the page - this gets added
56	# as the text for the section. r is optional, and signals that the image
57	# should be rotated 180deg. Eg use this if the image has been made upside down.
58	# So an example item file looks like:
59	# <Title>Snail farming
60	# <Date>19960403
61	# 1:p1.gif:p1.txt:
62	# 2:p2.gif::
63	# 3:p3.gif:p3.txt:
64	# 3b:p3b.gif:p3b.txt:r
65	# The second page has no text, the fourth page is a back page, and
66	# should be rotated.
67	#
68
69	#### XML FORMAT
70	# The xml format looks like the following
71	#<PagedDocument>
72	#<Metadata name="Title">The Title of the entire document</Metadata>
73	#<Page pagenum="1" imgfile="xxx.jpg" txtfile="yyy.txt">
74	#<Metadata name="Title">The Title of this page</Metadata>
75	#</Page>
76	#... more pages
77	#</PagedDocument>
78	#PagedDocument contains a list of Pages, Metadata and PageGroups. Any metadata
79	#that is not inside another tag will belong to the document.
80	#Each Page has a pagenum (not used at the moment), an imgfile and/or a txtfile.
81	#These are both optional - if neither is used, the section will have no content.
82	#Pages can also have metadata associated with them.
83	#PageGroups can be introduced at any point - they can contain Metadata and Pages and other PageGroups. They are used to introduce hierarchical structure into the document.
84	#For example
85	#<PagedDocument>
86	#<PageGroup>
87	#<Page>
88	#<Page>
89	#</PageGroup>
90	#<Page>
91	#</PagedDocument>
92	#would generate a structure like
93	#X
94	#--X
95	# --X
96	# --X
97	#--X
98	#PageGroup tags can also have imgfile/textfile metadata if you like - this way they get some content themselves.
99
100	#Currently the XML structure doesn't work very well with the paged document type, unless you use numerical Titles for each section.
101	#There is still a bit of work to do on this format:
102	#* enable other text file types, eg html, pdf etc
103	#* make the document paging work properly
104	#* add pagenum as Title unless a Title is present?
105
106	# All the supplementary image and text files should be in the same folder as
107	# the .item file.
108	#
109	# To display the images instead of the document text, you can use [srcicon]
110	# in the DocumentText format statement.
111	# For example,
112	#
113	# format DocumentText "<center><table width=_pagewidth_><tr><td>[srcicon]</td></tr></table></center>"
114	#
115	# To have it create thumbnail size images, use the '-create_thumbnail' option.
116	# To have it create medium size images for display, use the '-create_screenview'
117	# option. As usual, running
118	# 'perl -S pluginfo.pl PagedImagePlugin' will list all the options.
119
120	# If you want the resulting documents to be presented with a table of
121	# contents, use '-documenttype hierarchy', otherwise they will have
122	# next and previous arrows, and a goto page X box.
123
124	# If you have used -create_screenview, you can also use [screenicon] in the format
125	# statement to display the smaller image. Here is an example that switches
126	# between the two:
127	#
128	# format DocumentText "<center><table width=_pagewidth_><tr><td>{If}{_cgiargp_ eq full,<a href='_httpdocument_&d=_cgiargd_&p=small'>Switch to small version.</a>,<a href='_httpdocument_&d=_cgiargd_&p=full'>Switch to fullsize version</a>}</td></tr><tr><td>{If}{_cgiargp_ eq full,<a href='_httpdocument_&d=_cgiargd_&p=small' title='Switch to small version'>[srcicon]</a>,<a href='_httpdocument_&d=_cgiargd_&p=full' title='Switch to fullsize version'>[screenicon]</a>}</td></tr></table></center>"
129	#
130	# Additional metadata can be added into the .item files, alternatively you can
131	# use normal metadata.xml files, with the name of the xxx.item file as the
132	# FileName (only for document level metadata).
133
134	package PagedImagePlugin;
135
136	use ReadXMLFile;
137	use ReadTextFile;
138	use ImageConverter;
139
140	use strict;
141	no strict 'refs'; # allow filehandles to be variables and viceversa
142
# Establish the plugin's parent classes at compile time.
BEGIN {
    @PagedImagePlugin::ISA = qw(ReadXMLFile ReadTextFile ImageConverter);
}
146
# Permitted values for the 'documenttype' option.
my $type_list = [
    {
        'name' => "paged",
        'desc' => "{PagedImagePlugin.documenttype.paged}",
    },
    {
        'name' => "hierarchy",
        'desc' => "{PagedImagePlugin.documenttype.hierarchy}",
    },
];

# Option descriptions contributed by this plugin; pushed onto the shared
# "ArgList" by the constructor.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasePlugin.process_exp}",
        'type' => "string",
        'deft' => &get_default_process_exp(),
        'reqd' => "no",
    },
    {
        'name' => "title_sub",
        'desc' => "{HTMLPlugin.title_sub}",
        'type' => "string",
        'deft' => "",
    },
    {
        'name' => "headerpage",
        'desc' => "{PagedImagePlugin.headerpage}",
        'type' => "flag",
        'reqd' => "no",
    },
    {
        'name' => "documenttype",
        'desc' => "{PagedImagePlugin.documenttype}",
        'type' => "enum",
        'list' => $type_list,
        'deft' => "paged",
        'reqd' => "no",
    },
    {
        'name'      => "processing_tmp_files",
        'desc'      => "{BasePlugin.processing_tmp_files}",
        'type'      => "flag",
        'hiddengli' => "yes",
    },
];

# Plugin description record; pushed onto the shared "OptList" by the
# constructor.
my $options = {
    'name'     => "PagedImagePlugin",
    'desc'     => "{PagedImagePlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
185
# Constructor.
#
# Arguments: the class name, followed by the plugin list, the input
# arguments and the hash of argument/option lists.  The class name is
# appended to the plugin list and this plugin's argument and option
# descriptions are appended to the "ArgList" and "OptList" entries.
#
# One object is built from each of the three parent classes, and the three
# are combined with BasePlugin::merge_inheritance.  Returns the merged
# object blessed into $class.
sub new {
    my $class = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@{$pluginlist}, $class);
    push(@{ $hashArgOptLists->{"ArgList"} }, @{$arguments});
    push(@{ $hashArgOptLists->{"OptList"} }, $options);

    my $image_parent = ImageConverter->new($pluginlist, $inputargs, $hashArgOptLists);
    my $text_parent  = ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists, 1);
    my $xml_parent   = ReadXMLFile->new($pluginlist, $inputargs, $hashArgOptLists);

    my $self = BasePlugin::merge_inheritance($image_parent, $text_parent, $xml_parent);

    # The XML parser was created while ReadXMLFile's own object was "self".
    # Point it at the merged object so that callback functions such as
    # start_document are looked up here rather than in ReadXMLFile.
    # (Unnecessary when the constructor's object is the very one used by
    # ReadXMLFile; folding this into merge_inheritance might catch all cases.)
    $xml_parent->{'parser'}->{'PluginObj'} = $self;

    return bless $self, $class;
}
214
215
# Initialise the plugin: forwards all arguments (verbosity, outhandle,
# failhandle) to the parent init, then runs ImageConverter's init with no
# arguments.
sub init {
    my ($self, @init_args) = @_;

    $self->SUPER::init(@init_args);
    return $self->ImageConverter::init();
}
223
# Start-of-import hook: forwards all arguments (pluginfo, base_dir,
# processor, maxdocs) first to the parent begin and then to
# ImageConverter's begin.
sub begin {
    my ($self, @begin_args) = @_;

    $self->SUPER::begin(@begin_args);
    return $self->ImageConverter::begin(@begin_args);
}
231
# Returns the default process_exp pattern: file names ending in ".item".
sub get_default_process_exp {
    my ($self) = @_;

    return '\.item$';
}
237
# Returns the name of the XML document type handled by this plugin.
sub get_doctype {
    my ($self) = @_;

    my $doctype = "PagedDocument";
    return $doctype;
}
243
244
245	# want to use BasePlugin's version of this, not ReadXMLFile's
# Delegates explicitly to BasePlugin's can_process_this_file so that
# ReadXMLFile's version is bypassed.  All arguments are passed through.
sub can_process_this_file {
    my ($self, @check_args) = @_;

    return $self->BasePlugin::can_process_this_file(@check_args);
}
250
251	# instead of a block exp, now we scan the file and record all text and img files mentioned there for blocking.
# Records in $block_hash every image and text file referenced by an item
# file, so those files are blocked.  This replaces the former block
# expression.
#
# Arguments:
#   $filename_full_path - full path of the .item file
#   $block_hash         - hash ref; entries are added under 'file_blocks'
#                         by the scan_* helpers
#
# Side effect: the item file is rewritten in place by tidy_item_file.
sub store_block_files
{
    my $self = shift (@_);
    my ($filename_full_path, $block_hash) = @_;

    my $xml_version = $self->is_xml_item_file($filename_full_path);

    # do we need to do this? if we do it here, then don't need to do it later
    $self->tidy_item_file($filename_full_path);

    # Everything up to and including the last path separator (either '/' or
    # '\') is the directory; it is prepended to each referenced file name.
    my ($dir) = $filename_full_path =~ /^(.*?)[^\/\\]*$/;

    if ($xml_version) {
        $self->scan_xml_for_files_to_block($filename_full_path, $dir, $block_hash);
    } else {
        $self->scan_item_for_files_to_block($filename_full_path, $dir, $block_hash);
    }
}
273
274	# we want to use BasePlugin's read, not ReadXMLFile's
# Delegates explicitly to BasePlugin's read so that ReadXMLFile's version is
# bypassed.  All arguments are passed through unchanged.
sub read
{
    my ($self, @read_args) = @_;

    return $self->BasePlugin::read(@read_args);
}
281
282
283
# Builds the document object for one .item file.
#
# Arguments: ($pluginfo, $base_dir, $file, $block_hash, $metadata,
#             $processor, $maxdocs, $total_count, $gli)
#
# The item file is either in the XML format (handled by ReadXMLFile::read,
# which leaves the document in $self->{'doc_obj'}) or in the plain text
# format (handled by process_item).  Plugin/FileFormat metadata, associated
# files, passed-in metadata, auto-extracted metadata, a fallback Title and
# the OID are then added.
#
# Returns the list (1, $doc_obj).
sub read_into_doc_obj {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    print $outhandle "PagedImagePlugin processing \"$filename_full_path\"\n"
        if $verbosity > 1;
    print STDERR "<Processing n='$file' p='PagedImagePlugin'>\n" if ($gli);

    $self->{'MaxImageWidth'} = 0;
    $self->{'MaxImageHeight'} = 0;

    # here we need to decide if we have an old text .item file, or a new xml
    # .item file
    my $xml_version = $self->is_xml_item_file($filename_full_path);

    # tidy_item_file has already been run on this file in store_block_files

    my $doc_obj;
    if ($xml_version) {
        # careful checking needed here!! are we using local xml handlers or super ones
        $self->ReadXMLFile::read($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
        $doc_obj = $self->{'doc_obj'};
    } else {
        # Split the full path into directory (including the trailing
        # separator, '/' or '\') and bare file name.
        my ($dir);
        ($dir, $file) = $filename_full_path =~ /^(.*?)([^\/\\]*)$/;

        # process the .item file
        $doc_obj = $self->process_item($filename_full_path, $dir, $file, $processor);
    }

    my $section = $doc_obj->get_top_section();

    $doc_obj->add_utf8_metadata($section, "Plugin", "$self->{'plugin_type'}");
    $doc_obj->add_metadata($section, "FileFormat", "PagedImage");

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    $self->add_associated_files($doc_obj, $filename_full_path);
    $self->extra_metadata ($doc_obj, $section, $metadata);
    $self->auto_extract_metadata ($doc_obj);

    # if we haven't found any Title so far, assign one
    $self->title_fallback($doc_obj,$section,$filename_no_path);

    $self->add_OID($doc_obj);
    return (1,$doc_obj);
}
337
338	# for now, the test is if the first non-empty line is <PagedDocument>, then its xml
# Decides whether an item file is in the XML format.
#
# The test is whether the first line containing a word character mentions
# "<PagedDocument".
#
# Arguments:
#   $filename - path of the item file
# Returns 1 for an XML item file, 0 otherwise (including an empty file).
# Dies if the file cannot be opened.
sub is_xml_item_file {
    my $self = shift(@_);
    my ($filename) = @_;

    # Three-argument open: the file name is never interpreted as a mode.
    open(my $item_fh, '<', $filename) or die "couldn't open $filename\n";

    # Skip leading lines that contain no word characters.
    my $line = <$item_fh>;
    while (defined($line) && ($line !~ /\w/)) {
        $line = <$item_fh>;
    }
    close $item_fh;

    my $xml_version = 0;
    if (defined($line) && $line =~ /<PagedDocument/) {
        $xml_version = 1;
    }
    return $xml_version;
}
364
# Cleans an item file in place.
#
# For every line: vertical-tab characters (which some metadata titles
# contain) are removed and bare ampersands are escaped as "&amp;".  A UTF-8
# byte order mark at the start of the first line is stripped.
#
# The cleaned text is written to "backup.item" in the current working
# directory, copied over the original and the backup removed.
#
# Arguments:
#   $filename - path of the item file to tidy
# Dies if the item file cannot be opened or the backup cannot be written.
sub tidy_item_file {
    my $self = shift(@_);
    my ($filename) = @_;

    # Make sure File::Copy is loaded before it is called below.
    require File::Copy;

    open(my $item_fh, '<', $filename) or die "couldn't open $filename\n";
    my $backup_filename = "backup.item";
    open(my $backup_fh, '>', $backup_filename) or die "couldn't write to $backup_filename\n";

    my $first_line = 1;
    while (defined(my $line = <$item_fh>)) {
        if ($first_line) {
            $line =~ s/^\xEF\xBB\xBF//; # strip BOM
            $first_line = 0;
        }
        $line =~ s/\x0B+//g;
        # Escape only ampersands that do not already start an entity, so that
        # tidying the same file again does not double-escape it.
        $line =~ s/&(?!(?:[A-Za-z][A-Za-z0-9]*|#[0-9]+|#x[0-9A-Fa-f]+);)/&amp;/g;
        print $backup_fh $line;
    }
    close $item_fh;
    # Buffered write errors surface at close time.
    close $backup_fh or die "couldn't write to $backup_filename\n";

    &File::Copy::copy ($backup_filename, $filename);
    &util::rm($backup_filename);
}
390
# Produces a copy of an image rotated by 180 degrees.
#
# Arguments:
#   $filename_full_path - full path of the source image
# Returns the path of the rotated file as reported by $self->convert, or the
# original path if the conversion output names no existing file.
sub rotate_image {
    my $self = shift (@_);
    my ($filename_full_path) = @_;

    # The file type is whatever follows the last '.' in the path.
    my ($this_filetype) = $filename_full_path =~ /\.([^\.]*)$/;
    my $result = $self->convert($filename_full_path, $this_filetype, "-rotate 180", "ROTATE");

    # The new file name is expected after "=>" in the convert result.  The
    # file type is quoted so characters such as '+' are matched literally.
    my $new_filename;
    if (defined $result && defined $this_filetype) {
        ($new_filename) = ($result =~ /=>(.*\.\Q$this_filetype\E)/);
    }
    if (defined $new_filename && -e $new_filename) {
        return $new_filename;
    }

    # something has gone wrong; fall back to the unrotated image
    return $filename_full_path;
}
405
# Attaches one page image to a section of the document.
#
# Arguments:
#   $filename_full_path - full path of the image
#   $filename_no_path   - image file name without directory
#   $doc_obj            - document object being built
#   $section            - section the image belongs to
#   $rotate             - optional; "r" requests a 180 degree rotation
#
# Returns 0 if the file name is empty, the file does not exist, or image
# conversion is unavailable; otherwise the result of generate_images.
sub process_image {
    my $self = shift(@_);
    my ($filename_full_path, $filename_no_path, $doc_obj, $section, $rotate) = @_;

    # check the filenames
    return 0 if ($filename_no_path eq "" || !-f $filename_full_path);

    # remember that this image file was one of our source files, but only
    # if we are not processing a tmp file
    if (!$self->{'processing_tmp_files'} ) {
        $doc_obj->associate_source_file($filename_full_path);
    }

    # do rotation
    if ((defined $rotate) && ($rotate eq "r")) {
        # we get a new temporary file which is rotated
        $filename_full_path = $self->rotate_image($filename_full_path);
    }

    # do generate images
    my $result = 0;
    if ($self->{'image_conversion_available'} == 1) {
        # do we need to convert $filename_no_path to utf8? We are already reading in from a file, what encoding is it in???
        $result = $self->generate_images($filename_full_path, $filename_no_path, $doc_obj, $section);
    }

    # overwrite the FileFormat set in ImageConverter
    $doc_obj->set_metadata_element ($section, "FileFormat", "PagedImage");
    return $result;
}
433
434
# XML start-tag callback.
#
# Arguments:
#   $expat   - parser object (unused here)
#   $element - name of the element being opened
#
# Element attributes are read from the %_ hash.
# NOTE(review): %_ is presumably populated by the XML parser before this
# callback runs -- confirm against ReadXMLFile.
#
#   PagedDocument  - the current section becomes the top section
#   PageGroup/Page - a new child section is appended and made current; its
#                    pagenum, imgfile and txtfile attributes are processed
#   Metadata       - the metadata name is remembered for xml_end_tag
sub xml_start_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;
    $self->{'element'} = $element;

    my $doc_obj = $self->{'doc_obj'};
    if ($element eq "PagedDocument") {
        $self->{'current_section'} = $doc_obj->get_top_section();
    } elsif ($element eq "PageGroup" || $element eq "Page") {
        # create a new section as a child
        $self->{'current_section'} = $doc_obj->insert_section($doc_obj->get_end_child($self->{'current_section'}));
        $self->{'num_pages'}++;

        # assign pagenum as what??
        my $pagenum = $_{'pagenum'}; #TODO!!
        if (defined $pagenum) {
            $doc_obj->set_utf8_metadata_element($self->{'current_section'}, 'PageNum', $pagenum);
        }

        my ($imgfile) = $_{'imgfile'};
        if (defined $imgfile) {
            # *****
            # What about support for rotate image (e.g. old ':r' notation)?
            $self->process_image($self->{'xml_file_dir'}.$imgfile, $imgfile, $doc_obj, $self->{'current_section'});
        }

        # A section with no text file still gets dummy text.
        my ($txtfile) = $_{'txtfile'};
        if (defined($txtfile) && $txtfile ne "") {
            $self->process_text ($self->{'xml_file_dir'}.$txtfile, $txtfile, $doc_obj, $self->{'current_section'});
        } else {
            $self->add_dummy_text($doc_obj, $self->{'current_section'});
        }
    } elsif ($element eq "Metadata") {
        $self->{'metadata_name'} = $_{'name'};
    }
}
468
# XML end-tag callback.
#
# Arguments:
#   $expat   - parser object (unused here)
#   $element - name of the element being closed
#
#   Page/PageGroup - if the section has no Title but has a PageNum, the
#                    PageNum becomes the Title; the current section then
#                    moves back to the parent
#   Metadata       - the collected value is stored under the remembered
#                    name (prefixed with "ex." when the name contains a
#                    dot) and the name/value accumulators are reset
# Any other end tag is ignored.
sub xml_end_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;

    my $doc_obj = $self->{'doc_obj'};
    if ($element eq "Page" || $element eq "PageGroup") {
        my $section = $self->{'current_section'};

        # if Title hasn't been assigned, set PageNum as Title
        if (!defined($doc_obj->get_metadata_element($section, "Title"))
            && defined($doc_obj->get_metadata_element($section, "PageNum"))) {
            $doc_obj->add_utf8_metadata($section, "Title", $doc_obj->get_metadata_element($section, "PageNum"));
        }

        # move the current section back to the parent
        $self->{'current_section'} = $doc_obj->get_parent_section($section);
    } elsif ($element eq "Metadata") {
        my $meta_name = $self->{'metadata_name'};
        if ($meta_name =~ /\./) {
            $meta_name = "ex.$meta_name";
        }
        $doc_obj->add_utf8_metadata ($self->{'current_section'}, $meta_name, $self->{'metadata_value'});
        $self->{'metadata_name'} = "";
        $self->{'metadata_value'} = "";
    }
    # otherwise we ignore the end tag
}
493
494
# XML character-data callback.  While inside a Metadata element that has a
# name, the text held in $_ is appended to the value being collected.
sub xml_text {
    my $self = shift(@_);
    my ($expat) = @_;

    my $collecting = ($self->{'element'} eq "Metadata") && $self->{'metadata_name'};
    $self->{'metadata_value'} .= $_ if $collecting;
}
503
# Doctype callback: deliberately does nothing.
sub xml_doctype {
    return;
}
506
# Start-of-document hook for XML item files.
#
# Creates a fresh document object in $self->{'doc_obj'}, sets its initial
# fields, records the directory of the item file in $self->{'xml_file_dir'}
# (used to locate image and text files) and resets the page counter.
sub open_document {
    my $self = shift(@_);

    # create a new document
    $self->{'doc_obj'} = doc->new($self->{'filename'}, "indexed_doc", $self->{'file_rename_method'});
    # TODO is file filename_no_path??
    $self->set_initial_doc_fields($self->{'doc_obj'}, $self->{'file'}, $self->{'processor'});

    # Everything up to and including the last path separator ('/' or '\').
    my ($dir) = $self->{'filename'} =~ /^(.*?)[^\/\\]*$/;
    $self->{'xml_file_dir'} = $dir;
    $self->{'num_pages'} = 0;
}
520
# End-of-document hook for XML item files: stores the page count and the
# maximum image dimensions on the top section, then clears the dimension
# trackers.
sub close_document {
    my ($self) = @_;

    my $doc_obj = $self->{'doc_obj'};
    my $top     = $doc_obj->get_top_section();

    # add numpages metadata
    $doc_obj->set_utf8_metadata_element($top, 'NumPages', $self->{'num_pages'});

    for my $dimension ("MaxImageWidth", "MaxImageHeight") {
        $doc_obj->set_utf8_metadata_element($top, $dimension, $self->{$dimension});
    }

    $self->{'MaxImageWidth'} = undef;
    $self->{'MaxImageHeight'} = undef;
}
536
537
# Sets the metadata every new document starts with.
#
# Arguments:
#   $doc_obj          - document object to initialise
#   $filename_no_path - file name handed to set_Source_metadata
#   $processor        - accepted but not used here
sub set_initial_doc_fields {
    my ($self, $doc_obj, $filename_no_path, $processor) = @_;

    my $topsection = $doc_obj->get_top_section();

    # Setting gsdlthistype to Paged ensures the document is treated as a
    # Paged doc even if Titles are not numeric.
    my $this_type = ($self->{'documenttype'} eq 'paged') ? "Paged" : "Hierarchy";
    $doc_obj->set_utf8_metadata_element($topsection, "gsdlthistype", $this_type);

    $self->set_Source_metadata($doc_obj, $filename_no_path);

    # A header page needs some text in the top section, otherwise that
    # section becomes invisible.
    $self->add_dummy_text($doc_obj, $topsection) if $self->{'headerpage'};
}
561
# Scans an XML item file and blocks every file named in an imgfile="..." or
# txtfile="..." attribute.
#
# Arguments:
#   $filename_full_path - path of the item file
#   $dir                - directory prefix prepended to each file name
#   $block_hash         - hash ref; $block_hash->{'file_blocks'}{path} is
#                         set to 1 for each referenced file
# Dies if the item file cannot be opened.
sub scan_xml_for_files_to_block
{
    my $self = shift (@_);
    my ($filename_full_path, $dir, $block_hash) = @_;

    open(my $item_fh, '<', $filename_full_path)
        or die "couldn't open $filename_full_path to work out which files to block\n";

    while (defined(my $line = <$item_fh>)) {
        next unless $line =~ /\w/;

        # /g so that several Page elements on one line are all picked up.
        while ($line =~ /imgfile=\"([^\"]+)\"/g) {
            $block_hash->{'file_blocks'}->{$dir.$1} = 1;
        }
        while ($line =~ /txtfile=\"([^\"]+)\"/g) {
            $block_hash->{'file_blocks'}->{$dir.$1} = 1;
        }
    }
    close $item_fh;
}
582
# Scans a plain-format item file and blocks every image and text file named
# on a page line (pagenum:imagefile:textfile:r).
#
# Arguments:
#   $filename_full_path - path of the item file
#   $dir                - directory prefix prepended to each file name
#   $block_hash         - hash ref; $block_hash->{'file_blocks'}{path} is
#                         set to 1 for each referenced file
# Dies if the item file cannot be opened.
sub scan_item_for_files_to_block
{
    my $self = shift (@_);
    my ($filename_full_path, $dir, $block_hash) = @_;

    open(my $item_fh, '<', $filename_full_path)
        or die "couldn't open $filename_full_path to work out which files to block\n";

    while (defined(my $line = <$item_fh>)) {
        next unless $line =~ /\w/;
        chomp $line;
        next if $line =~ /^#/; # ignore comment lines
        next if ($line =~ /^<([^>]*)>\s*(.*?)\s*$/); # ignore metadata lines

        # line should be like page:imagefilename:textfilename:r
        $line =~ s/^\s+//; #remove space at the front
        $line =~ s/\s+$//; #remove space at the end
        my ($pagenum, $imgname, $txtname, $rotate) = split /:/, $line;

        # find the image file if there is one
        if (defined $imgname && $imgname ne "") {
            $block_hash->{'file_blocks'}->{$dir.$imgname} = 1;
        }
        # find the text file if there is one
        if (defined $txtname && $txtname ne "") {
            $block_hash->{'file_blocks'}->{$dir.$txtname} = 1;
        }
    }
    close $item_fh;
}
612
# Builds a document object from a plain-format item file.
#
# Arguments:
#   $filename_full_path - full path of the item file
#   $dir                - directory prefix for the page image/text files
#   $filename_no_path   - item file name, passed to set_initial_doc_fields
#   $processor          - passed to set_initial_doc_fields
#
# Lines of the form <name>value become metadata on the top section (names
# containing a dot are prefixed with "ex.").  Every other non-comment line
# is taken to be pagenum:imagefile:textfile:r and becomes one new section
# whose Title is the page number.
#
# Returns the document object.  Dies if the item file cannot be opened.
sub process_item {
    my $self = shift (@_);
    my ($filename_full_path, $dir, $filename_no_path, $processor) = @_;

    my $doc_obj = doc->new($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
    $self->set_initial_doc_fields($doc_obj, $filename_no_path, $processor);
    my $topsection = $doc_obj->get_top_section();

    open(my $item_fh, '<', $filename_full_path) or die "couldn't open $filename_full_path\n";

    my $num = 0;
    while (defined(my $line = <$item_fh>)) {
        next unless $line =~ /\w/;
        chomp $line;
        next if $line =~ /^#/; # ignore comment lines

        if ($line =~ /^<([^>]*)>\s*(.*?)\s*$/) {
            my $meta_name = $1;
            my $meta_value = $2;
            if ($meta_name =~ /\./) {
                $meta_name = "ex.$meta_name";
            }
            $doc_obj->set_utf8_metadata_element ($topsection, $meta_name, $meta_value);
        } else {
            $num++;
            # line should be like page:imagefilename:textfilename:r - the r is optional -> means rotate the image 180 deg
            $line =~ s/^\s+//; #remove space at the front
            $line =~ s/\s+$//; #remove space at the end
            my ($pagenum, $imgname, $txtname, $rotate) = split /:/, $line;

            # create a new section for each image file
            my $cursection = $doc_obj->insert_section($doc_obj->get_end_child($topsection));
            # the page number becomes the Title
            $doc_obj->set_utf8_metadata_element($cursection, 'Title', $pagenum);

            # process the image for this page if there is one
            if (defined $imgname && $imgname ne "") {
                my $result1 = $self->process_image($dir.$imgname, $imgname, $doc_obj, $cursection, $rotate);
                # NOTE(review): process_image returns 0 (not undef) for a
                # missing file, so this message is only printed if
                # generate_images itself returns undef -- confirm intent.
                if (!defined $result1) {
                    print "PagedImagePlugin: couldn't process image \"$dir$imgname\" for item \"$filename_full_path\"\n";
                }
            }

            # process the text file if one is there
            if (defined $txtname && $txtname ne "") {
                my $result2 = $self->process_text ($dir.$txtname, $txtname, $doc_obj, $cursection);

                # process_text returns 0 on failure, so test for falseness;
                # the section then still receives dummy text.
                if (!$result2) {
                    print "PagedImagePlugin: couldn't process text file \"$dir$txtname\" for item \"$filename_full_path\"\n";
                    $self->add_dummy_text($doc_obj, $cursection);
                }
            } else {
                # otherwise add in some dummy text
                $self->add_dummy_text($doc_obj, $cursection);
            }
        }
    }

    close $item_fh;

    # add numpages metadata
    $doc_obj->set_utf8_metadata_element ($topsection, 'NumPages', "$num");

    $doc_obj->set_utf8_metadata_element($topsection,"MaxImageWidth",$self->{'MaxImageWidth'});
    $doc_obj->set_utf8_metadata_element($topsection,"MaxImageHeight",$self->{'MaxImageHeight'});
    $self->{'MaxImageWidth'} = undef;
    $self->{'MaxImageHeight'} = undef;

    return $doc_obj;
}
683
# Reads the text file for one page and adds its content to a section.
#
# Arguments:
#   $filename_full_path - full path of the text file
#   $file               - file name without directory (unused here)
#   $doc_obj            - document object being built
#   $cursection         - section that receives the text
#
# Text that looks like a complete HTML page contributes the content of its
# <body>; any other text has '<' and '>' escaped and is wrapped in <pre>.
#
# Returns 1 on success, 0 if the file does not exist.
sub process_text {
    my $self = shift (@_);
    my ($filename_full_path, $file, $doc_obj, $cursection) = @_;

    # check that the text file exists!!
    if (!-f $filename_full_path) {
        print "PagedImagePlugin: ERROR: File $filename_full_path does not exist, skipping\n";
        return 0;
    }

    # remember that this text file was one of our source files, but only
    # if we are not processing a tmp file
    if (!$self->{'processing_tmp_files'} ) {
        $doc_obj->associate_source_file($filename_full_path);
    }

    # Do encoding stuff
    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename_full_path);

    my $text="";
    &ReadTextFile::read_file($self, $filename_full_path, $encoding, $language, \$text);
    if (!length ($text)) {
        # It's a bit unusual but not out of the question to have no text, so just give a warning
        print "PagedImagePlugin: WARNING: $filename_full_path contains no text\n";
    }

    # we need to escape the escape character, or else mg will convert into
    # eg literal newlines, instead of leaving the text as '\n'
    $text =~ s/\\/\\\\/g; # macro language
    $text =~ s/_/\\_/g; # macro language

    if ($text =~ m/<html.*?>\s*<head.*?>.*<\/head>\s*<body.*?>(.*)<\/body>\s*<\/html>\s*$/is) {
        # looks like HTML input
        # no need to escape < and > or put in <pre> tags
        $text = $1;

        # add text to document object
        $doc_obj->add_utf8_text($cursection, "$text");
    }
    else {
        # Plain text: angle brackets must not be read as markup.
        $text =~ s/</&lt;/g;
        $text =~ s/>/&gt;/g;

        # insert preformat tags and add text to document object
        $doc_obj->add_utf8_text($cursection, "<pre>\n$text\n</pre>");
    }

    return 1;
}
735
736
# Post-processing hook: delegates to ImageConverter to clean up its
# temporary files.
sub clean_up_after_doc_obj_processing {
    my ($self) = @_;

    return $self->ImageConverter::clean_up_temporary_files();
}
742
743	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: