Context Navigation

source: gsdl/trunk/perllib/plugins/PagedImagePlugin.pm@ 20778

Last change on this file since 20778 was 20778, checked in by kjdon, 15 years ago
plugins now need to add any auxiliary source files as source assoc files, so we know when to reimport for incremental import. Have started this, but not finished and not tested :-)
Property svn:executable set to `*`
Property svn:keywords set to `Author Date Id Revision`
File size: 22.8 KB

Line
1	###########################################################################
2	#
3	# PagedImagePlugin.pm -- plugin for sets of images and OCR text that
4	# make up a document
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	# PagedImagePlugin
28	# processes sequences of images, with optional OCR text
29	#
30	# This plugin takes *.item files, which contain metadata and lists of image
31	# files, and produces a document containing sections, one for each page.
32	# The files should be named something.item, then you can have more than one
33	# book in a directory. You will need to create these files, one for each
34	# document/book.
35	#
36	#There are two formats for the item files: a plain text format, and an xml
37	#format. You can use either format, and can have both formats in the same
38	#collection if you like. If you use the plain format, you must not start the
39	#file off with <PagedDocument>
40
41	#### PLAIN FORMAT
42	# The format of the xxx.item file is as follows:
43	# The first lines contain any metadata for the whole document
44	# <metadata-name>metadata-value
45	# eg.
46	# <Title>Snail farming
47	# <Date>19230102
48	# Then comes a list of pages, one page per line, each line has the format
49	#
50	# pagenum:imagefile:textfile:r
51	#
52	# page num and imagefile are required. pagenum is used for the Title
53	# of the section, and in the display is shown as page <pagenum>.
54	# imagefile is the image for the page. textfile is an optional text
55	# file containing the OCR (or any) text for the page - this gets added
56	# as the text for the section. r is optional, and signals that the image
57	# should be rotated 180deg. Eg use this if the image has been made upside down.
58	# So an example item file looks like:
59	# <Title>Snail farming
60	# <Date>19960403
61	# 1:p1.gif:p1.txt:
62	# 2:p2.gif::
63	# 3:p3.gif:p3.txt:
64	# 3b:p3b.gif:p3b.txt:r
65	# The second page has no text, the fourth page is a back page, and
66	# should be rotated.
67	#
68
69	#### XML FORMAT
70	# The xml format looks like the following
71	#<PagedDocument>
72	#<Metadata name="Title">The Title of the entire document</Metadata>
73	#<Page pagenum="1" imgfile="xxx.jpg" txtfile="yyy.txt">
74	#<Metadata name="Title">The Title of this page</Metadata>
75	#</Page>
76	#... more pages
77	#</PagedDocument>
78	#PagedDocument contains a list of Pages, Metadata and PageGroups. Any metadata
79	#that is not inside another tag will belong to the document.
80	#Each Page has a pagenum (not used at the moment), an imgfile and/or a txtfile.
81	#These are both optional - if neither is used, the section will have no content.
82	#Pages can also have metadata associated with them.
83	#PageGroups can be introduced at any point - they can contain Metadata and Pages and other PageGroups. They are used to introduce hierarchical structure into the document.
84	#For example
85	#<PagedDocument>
86	#<PageGroup>
87	#<Page>
88	#<Page>
89	#</PageGroup>
90	#<Page>
91	#</PagedDocument>
92	#would generate a structure like
93	#X
94	#--X
95	# --X
96	# --X
97	#--X
98	#PageGroup tags can also have imgfile/textfile metadata if you like - this way they get some content themselves.
99
100	#Currently the XML structure doesn't work very well with the paged document type, unless you use numerical Titles for each section.
101	#There is still a bit of work to do on this format:
102	#* enable other text file types, eg html, pdf etc
103	#* make the document paging work properly
104	#* add pagenum as Title unless a Title is present?
105
106	# All the supplementary image and text files should be in the same folder as
107	# the .item file.
108	#
109	# To display the images instead of the document text, you can use [srcicon]
110	# in the DocumentText format statement.
111	# For example,
112	#
113	# format DocumentText "<center><table width=_pagewidth_><tr><td>[srcicon]</td></tr></table></center>"
114	#
115	# To have it create thumbnail size images, use the '-create_thumbnail' option.
116	# To have it create medium size images for display, use the '-create_screenview'
117	# option. As usual, running
118	# 'perl -S pluginfo.pl PagedImagePlugin' will list all the options.
119
120	# If you want the resulting documents to be presented with a table of
121	# contents, use '-documenttype hierarchy', otherwise they will have
122	# next and previous arrows, and a goto page X box.
123
124	# If you have used -create_screenview, you can also use [screenicon] in the format
125	# statement to display the smaller image. Here is an example that switches
126	# between the two:
127	#
128	# format DocumentText "<center><table width=_pagewidth_><tr><td>{If}{_cgiargp_ eq full,<a href='_httpdocument_&d=_cgiargd_&p=small'>Switch to small version.</a>,<a href='_httpdocument_&d=_cgiargd_&p=full'>Switch to fullsize version</a>}</td></tr><tr><td>{If}{_cgiargp_ eq full,<a href='_httpdocument_&d=_cgiargd_&p=small' title='Switch to small version'>[srcicon]</a>,<a href='_httpdocument_&d=_cgiargd_&p=full' title='Switch to fullsize version'>[screenicon]</a>}</td></tr></table></center>"
129	#
130	# Additional metadata can be added into the .item files, alternatively you can
131	# use normal metadata.xml files, with the name of the xxx.item file as the
132	# FileName (only for document level metadata).
133
134	package PagedImagePlugin;
135
136	use ReadXMLFile;
137	use ReadTextFile;
138	use ImageConverter;
139
140	use strict;
141	no strict 'refs'; # allow filehandles to be variables and viceversa
142
# Set up multiple inheritance at compile time: this plugin derives from
# ReadXMLFile, ReadTextFile and ImageConverter, searched in that order.
BEGIN {
    @PagedImagePlugin::ISA = ('ReadXMLFile', 'ReadTextFile', 'ImageConverter');
}
146
# Values accepted by the "documenttype" option.  The 'desc' strings look
# like lookup keys rather than display text (presumably resolved elsewhere
# -- confirm).
my $type_list = [
    {
        'name' => "paged",
        'desc' => "{PagedImagePlugin.documenttype.paged}",
    },
    {
        'name' => "hierarchy",
        'desc' => "{PagedImagePlugin.documenttype.hierarchy}",
    },
];

# Option descriptors contributed by this plugin.  The process_exp and
# block_exp defaults come from the subs defined further down this file.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasePlugin.process_exp}",
        'type' => "string",
        'deft' => get_default_process_exp(),
        'reqd' => "no",
    },
    {
        'name' => "block_exp",
        'desc' => "{BasePlugin.block_exp}",
        'type' => "string",
        'deft' => get_default_block_exp(),
        'reqd' => "no",
    },
    {
        'name' => "title_sub",
        'desc' => "{HTMLPlugin.title_sub}",
        'type' => "string",
        'deft' => "",
    },
    {
        'name' => "headerpage",
        'desc' => "{PagedImagePlugin.headerpage}",
        'type' => "flag",
        'reqd' => "no",
    },
    {
        'name' => "documenttype",
        'desc' => "{PagedImagePlugin.documenttype}",
        'type' => "enum",
        'list' => $type_list,
        'deft' => "paged",
        'reqd' => "no",
    },
];


# Plugin-level description record; pushed onto the caller's "OptList"
# by new().
my $options = {
    'name'     => "PagedImagePlugin",
    'desc'     => "{PagedImagePlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
185
# Constructor.
#
# Parameters (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the parent constructors
#   $hashArgOptLists - hash ref; this plugin's argument and option
#                      descriptors are appended to its "ArgList" and
#                      "OptList" entries
#
# Builds one object per parent class, merges them with
# BasePlugin::merge_inheritance, and returns the result blessed into $class.
sub new {
    my $class = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@$pluginlist, $class);
    push(@{ $hashArgOptLists->{"ArgList"} }, @{$arguments});
    push(@{ $hashArgOptLists->{"OptList"} }, $options);

    # One instance per parent class.  NOTE(review): the meaning of the
    # trailing 1 passed to ReadTextFile is not visible in this file.
    my $imc_self = ImageConverter->new($pluginlist, $inputargs, $hashArgOptLists);
    my $rtf_self = ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists, 1);
    my $rxf_self = ReadXMLFile->new($pluginlist, $inputargs, $hashArgOptLists);

    my $self = BasePlugin::merge_inheritance($imc_self, $rtf_self, $rxf_self);

    # Update $self used by XML::Parser so it finds callback functions
    # such as start_document here and not in ReadXMLFile (which is what
    # $self was when new XML::Parser was done).
    #
    # If the $self returned by this constructor is the same as the one
    # used in ReadXMLFile (e.g. in the GreenstoneXMLPlugin) then this step
    # isn't necessary.
    #
    # Consider embedding this type of assignment into merge_inheritance
    # to help catch all cases?
    $rxf_self->{'parser'}->{'PluginObj'} = $self;

    return bless $self, $class;
}
214
215
# Initialise the plugin: forwards the caller's arguments to the inherited
# init (via SUPER) and then runs ImageConverter's init with no arguments.
sub init {
    my $self = shift(@_);

    $self->SUPER::init(@_);
    $self->ImageConverter::init();
}
223
# Start-of-run hook: passes the caller's arguments to both the inherited
# begin (via SUPER) and ImageConverter's begin.
sub begin {
    my $self = shift(@_);

    $self->SUPER::begin(@_);
    $self->ImageConverter::begin(@_);
}
231
# Default process_exp: a regular expression (returned as a string) that
# matches file names ending in ".item".
sub get_default_process_exp {
    return '\.item$';
}
237
# Name of the root element used by XML-format item files.
sub get_doctype {
    return 'PagedDocument';
}
243
244
# Default block_exp.
#
# We want to block everything except the .item files, but instead we block
# the auxiliary image and text files an item file refers to, plus "~"
# backup files.  Returns the expression as a string; it is
# case-insensitive and anchored at the end of the file name.
sub get_default_block_exp {
    return q^(?i)(\.jpe?g|\.gif|\.png|\.tif?f|\.te?xt|\.html?|~)$^;
    ### return q^(?i)(\.jpe?g|\.gif|\.png|\.tif?f|\.te?xt|\.html?|\.css|\.opd|\.pdf|~)$^;
}
253
254
# Use BasePlugin's implementation of this check rather than the one that
# would be inherited from ReadXMLFile; all arguments pass straight through.
sub can_process_this_file {
    my ($self, @args) = @_;

    return $self->BasePlugin::can_process_this_file(@args);
}
261
# Use BasePlugin's read rather than ReadXMLFile's; all arguments pass
# straight through and BasePlugin's result is handed back to the caller.
sub read {
    my ($self, @args) = @_;

    $self->BasePlugin::read(@args);
}
269
270
271
# Build a document object from one .item file.
#
# Parameters: the standard plugin read arguments ($pluginfo, $base_dir,
# $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli).
#
# Steps:
#   1. decide whether the item file is XML or plain text,
#   2. tidy the item file in place (see tidy_item_file),
#   3. build the document via ReadXMLFile::read (XML) or process_item
#      (plain text),
#   4. add Plugin/FileFormat metadata, associated files, metadata passed in
#      from earlier plugins, auto-extracted metadata, a fallback Title and
#      an OID.
#
# Returns the list (1, $doc_obj).
sub read_into_doc_obj {
    my $self = shift(@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    print $outhandle "PagedImagePlugin processing \"$filename_full_path\"\n"
        if $verbosity > 1;
    print STDERR "<Processing n='$file' p='PagedImagePlugin'>\n" if ($gli);

    # reset the per-document image size maxima
    $self->{'MaxImageWidth'}  = 0;
    $self->{'MaxImageHeight'} = 0;

    # here we need to decide if we have an old text .item file, or a new xml
    # .item file
    my $xml_version = $self->is_xml_item_file($filename_full_path);

    $self->tidy_item_file($filename_full_path);

    my $doc_obj;
    if ($xml_version) {
        # careful checking needed here!! are we using local xml handlers or super ones
        $self->ReadXMLFile::read($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
        $doc_obj = $self->{'doc_obj'};
    } else {
        # Split the full path into its directory part (separator included)
        # and the bare file name; $file is deliberately overwritten with
        # the bare name.
        my ($dir);
        ($dir, $file) = $filename_full_path =~ /^(.*?)([^\/\\]*)$/;

        # process the .item file
        $doc_obj = $self->process_item($filename_full_path, $dir, $file, $processor);
    }

    my $section = $doc_obj->get_top_section();

    $doc_obj->add_utf8_metadata($section, "Plugin", "$self->{'plugin_type'}");
    $doc_obj->add_metadata($section, "FileFormat", "PagedImage");

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    $self->add_associated_files($doc_obj, $filename_full_path);
    $self->extra_metadata($doc_obj, $section, $metadata);
    $self->auto_extract_metadata($doc_obj);

    # if we haven't found any Title so far, assign one
    $self->title_fallback($doc_obj, $section, $filename_no_path);

    $self->add_OID($doc_obj);
    return (1, $doc_obj);
}
324
# Decide whether an item file uses the XML format.
#
# For now the test is: the first line containing at least one word
# character mentions "<PagedDocument".
#
# Parameters: $filename - path of the item file.
# Returns 1 for an XML item file, 0 otherwise (including an empty file).
# Dies with "couldn't open ..." if the file cannot be opened.
sub is_xml_item_file {
    my $self = shift(@_);
    my ($filename) = @_;

    open(my $item_fh, '<', $filename) or die "couldn't open $filename\n";

    # skip leading lines that contain no word characters at all
    my $line = <$item_fh>;
    while (defined($line) && ($line !~ /\w/)) {
        $line = <$item_fh>;
    }
    close($item_fh);

    my $xml_version = 0;
    if (defined($line) && $line =~ /<PagedDocument/) {
        $xml_version = 1;
    }
    return $xml_version;
}
351
# Clean up an item file in place.
#
# Writes a cleaned copy to "backup.item" (relative to the current working
# directory), copies it back over $filename and removes the backup.
# Cleaning consists of:
#   * stripping a UTF-8 byte order mark from the first line,
#   * removing vertical tab characters (some metadata titles contain them),
#   * escaping every "&" as "&amp;".
#
# NOTE(review): the "&" escaping is not idempotent, so an item file that
# is tidied twice ends up with "&amp;amp;".  Confirm whether callers rely
# on files being tidied only once.
#
# Dies if either file cannot be opened.
sub tidy_item_file {
    my $self = shift(@_);
    my ($filename) = @_;

    open(my $item_fh, '<', $filename) or die "couldn't open $filename\n";
    my $backup_filename = "backup.item";
    open(my $backup_fh, '>', $backup_filename) or die "couldn't write to $backup_filename\n";

    my $is_first_line = 1;
    while (defined(my $line = <$item_fh>)) {
        if ($is_first_line) {
            $line =~ s/^\xEF\xBB\xBF//; # strip BOM
            $is_first_line = 0;
        }
        # Tidy up the item file: some metadata title contains \vt-vertical tab
        $line =~ s/\x0B+//g;
        $line =~ s/&/&amp;/g;
        print $backup_fh $line;
    }
    close($item_fh);
    close($backup_fh);

    &File::Copy::copy($backup_filename, $filename);
    &util::rm($backup_filename);
}
377
# Produce a copy of an image rotated by 180 degrees.
#
# Parameters: $filename_full_path - path of the image to rotate.
# Returns the path of the rotated file as reported by convert(), or the
# original path unchanged when that file does not exist.
sub rotate_image {
    my $self = shift(@_);
    my ($filename_full_path) = @_;

    # the file type is whatever follows the last dot in the path
    my ($extension) = $filename_full_path =~ /\.([^\.]*)$/;
    my $convert_output = $self->convert($filename_full_path, $extension, "-rotate 180", "ROTATE");

    # convert() reports the new file after a "=>" marker
    my ($rotated_path) = ($convert_output =~ /=>(.*\.$extension)/);

    # something's gone wrong if the rotated file isn't there
    return (-e "$rotated_path") ? $rotated_path : $filename_full_path;
}
392
# Attach one page image to a section of the document.
#
# Parameters:
#   $filename_full_path - path of the image file
#   $filename_no_path   - the image file name as written in the item file
#   $doc_obj            - document object being built
#   $section            - section the image belongs to
#   $rotate             - optional; "r" requests a 180 degree rotation
#
# Returns 0 if the file name is empty or the file does not exist, or if
# image conversion is not available; otherwise returns whatever
# generate_images returns.
sub process_image {
    my $self = shift(@_);
    my ($filename_full_path, $filename_no_path, $doc_obj, $section, $rotate) = @_;

    # check the filenames
    return 0 if ($filename_no_path eq "" || !-f $filename_full_path);

    # remember that this image file was one of our source files
    $doc_obj->associate_source_file($filename_full_path);

    # do rotation
    if ((defined $rotate) && ($rotate eq "r")) {
        # we get a new temporary file which is rotated
        $filename_full_path = $self->rotate_image($filename_full_path);
    }

    # do generate images
    my $result = 0;
    if ($self->{'image_conversion_available'} == 1) {
        # do we need to convert $filename_no_path to utf8? We are already
        # reading in from a file, what encoding is it in???
        $result = $self->generate_images($filename_full_path, $filename_no_path, $doc_obj, $section);
    }

    # overwrite the one set in ImageConverter
    $doc_obj->set_metadata_element($section, "FileFormat", "PagedImage");
    return $result;
}
418
419
# XML start-tag handler for XML-format item files.
#
# Parameters: $expat (unused here) and the element name.  The element's
# attributes are read from the global hash %_ (presumably populated by the
# XML parser before this callback runs -- confirm in ReadXMLFile).
#
#   PagedDocument  - makes the document's top section the current section
#   PageGroup/Page - adds a child section under the current one, counts it
#                    as a page, records PageNum, and processes the optional
#                    imgfile/txtfile attributes
#   Metadata       - remembers the metadata name until the end tag
sub xml_start_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;

    $self->{'element'} = $element;

    my $doc_obj = $self->{'doc_obj'};
    if ($element eq "PagedDocument") {
        $self->{'current_section'} = $doc_obj->get_top_section();
    } elsif ($element eq "PageGroup" || $element eq "Page") {
        # create a new section as a child
        $self->{'current_section'} = $doc_obj->insert_section($doc_obj->get_end_child($self->{'current_section'}));
        $self->{'num_pages'}++;

        # assign pagenum as what??
        my $pagenum = $_{'pagenum'}; #TODO!!
        if (defined $pagenum) {
            $doc_obj->set_utf8_metadata_element($self->{'current_section'}, 'PageNum', $pagenum);
        }

        my ($imgfile) = $_{'imgfile'};
        if (defined $imgfile) {
            # *****
            # What about support for rotate image (e.g. old ':r' notation)?
            $self->process_image($self->{'xml_file_dir'}.$imgfile, $imgfile, $doc_obj, $self->{'current_section'});
        }

        my ($txtfile) = $_{'txtfile'};
        if (defined($txtfile) && $txtfile ne "") {
            $self->process_text($self->{'xml_file_dir'}.$txtfile, $txtfile, $doc_obj, $self->{'current_section'});
        } else {
            # no text file given: add dummy text to the section instead
            $self->add_dummy_text($doc_obj, $self->{'current_section'});
        }
    } elsif ($element eq "Metadata") {
        $self->{'metadata_name'} = $_{'name'};
    }
}
453
# XML end-tag handler for XML-format item files.
#
# Parameters: $expat (unused here) and the element name.
#
#   Page/PageGroup - if the section has PageNum but no Title, PageNum is
#                    copied into Title; the current section then moves back
#                    to its parent
#   Metadata       - stores the accumulated name/value pair on the current
#                    section and clears the accumulators
#
# Any other end tag is ignored.
sub xml_end_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;

    my $doc_obj = $self->{'doc_obj'};
    if ($element eq "Page" || $element eq "PageGroup") {
        # if Title hasn't been assigned, set PageNum as Title
        if (!defined($doc_obj->get_metadata_element($self->{'current_section'}, "Title"))
            && defined($doc_obj->get_metadata_element($self->{'current_section'}, "PageNum"))) {
            $doc_obj->add_utf8_metadata($self->{'current_section'}, "Title", $doc_obj->get_metadata_element($self->{'current_section'}, "PageNum"));
        }
        # move the current section back to the parent
        $self->{'current_section'} = $doc_obj->get_parent_section($self->{'current_section'});
    } elsif ($element eq "Metadata") {
        $doc_obj->add_utf8_metadata($self->{'current_section'}, $self->{'metadata_name'}, $self->{'metadata_value'});
        $self->{'metadata_name'}  = "";
        $self->{'metadata_value'} = "";
    }
    # otherwise we ignore the end tag
}
475
476
# XML character-data handler.  The text is read from $_.  It is appended
# to the pending metadata value only while the most recently opened element
# is a Metadata element that has a name.
sub xml_text {
    my $self = shift(@_);
    my ($expat) = @_;

    $self->{'metadata_value'} .= $_
        if ($self->{'element'} eq "Metadata" && $self->{'metadata_name'});
}
485
# Intentionally a no-op: this plugin takes no action in this callback
# (presumably invoked for the document's DOCTYPE declaration -- confirm in
# ReadXMLFile).
sub xml_doctype {
    return;
}
488
# Start a new document for an XML-format item file.
#
# Creates the doc object for $self->{'filename'}, sets its initial fields,
# remembers the directory part of the item file's path (used to locate the
# image and text files it names) and resets the page counter.
sub open_document {
    my $self = shift(@_);

    # create a new document
    $self->{'doc_obj'} = doc->new($self->{'filename'}, "indexed_doc", $self->{'file_rename_method'});
    # TODO is file filename_no_path??
    $self->set_initial_doc_fields($self->{'doc_obj'}, $self->{'file'}, $self->{'processor'});

    # split the path into directory (separator included) and file name;
    # only the directory is needed here
    my ($dir, $file) = $self->{'filename'} =~ /^(.*?)([^\/\\]*)$/;
    $self->{'xml_file_dir'} = $dir;
    $self->{'num_pages'} = 0;
}
502
# Finish a document built from an XML-format item file: records NumPages,
# MaxImageWidth and MaxImageHeight on the top section, then clears the two
# image size maxima held on the plugin.
sub close_document {
    my $self = shift(@_);

    my $doc_obj = $self->{'doc_obj'};
    my $top     = $doc_obj->get_top_section();

    # metadata name => plugin field holding its value
    my @summary_fields = (
        [ 'NumPages',       'num_pages' ],
        [ "MaxImageWidth",  'MaxImageWidth' ],
        [ "MaxImageHeight", 'MaxImageHeight' ],
    );
    for my $field (@summary_fields) {
        my ($meta_name, $self_key) = @$field;
        $doc_obj->set_utf8_metadata_element($top, $meta_name, $self->{$self_key});
    }

    $self->{'MaxImageWidth'}  = undef;
    $self->{'MaxImageHeight'} = undef;
}
518
519
# Set the fields every new document needs.
#
# Parameters:
#   $doc_obj          - the document object being built
#   $filename_no_path - passed on to set_Source_metadata
#   $processor        - accepted but not used here
sub set_initial_doc_fields {
    my $self = shift(@_);
    my ($doc_obj, $filename_no_path, $processor) = @_;

    my $topsection = $doc_obj->get_top_section();

    # Setting gsdlthistype to Paged ensures this document will be treated
    # as a Paged doc, even if Titles are not numeric; any other
    # documenttype value gives Hierarchy.
    my $this_type = ($self->{'documenttype'} eq 'paged') ? "Paged" : "Hierarchy";
    $doc_obj->set_utf8_metadata_element($topsection, "gsdlthistype", $this_type);

    $self->set_Source_metadata($doc_obj, $filename_no_path);

    # if we want a header page, we need to add some text into the top
    # section, otherwise this section will become invisible
    $self->add_dummy_text($doc_obj, $topsection) if ($self->{'headerpage'});
}
543
544
# Build a document from a plain-text item file.
#
# Parameters:
#   $filename_full_path - path of the item file
#   $dir                - directory part of that path, separator included;
#                         image and text file names are appended to it
#   $filename_no_path   - item file name, used for the Source metadata
#   $processor          - passed on to set_initial_doc_fields
#
# Lines without any word character and lines starting with "#" are
# skipped.  A line of the form "<name>value" sets document-level metadata.
# Every other line describes one page as "pagenum:imagefile:textfile:r"
# and becomes a new section whose Title is the page number.
#
# Returns the document object.  Dies if the item file cannot be opened.
sub process_item {
    my $self = shift(@_);
    my ($filename_full_path, $dir, $filename_no_path, $processor) = @_;

    my $doc_obj = doc->new($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
    $self->set_initial_doc_fields($doc_obj, $filename_no_path, $processor);
    my $topsection = $doc_obj->get_top_section();

    open(my $item_fh, '<', $filename_full_path) or die "couldn't open $filename_full_path\n";

    my $num = 0;
    while (defined(my $line = <$item_fh>)) {
        next unless $line =~ /\w/;
        chomp $line;
        next if $line =~ /^#/; # ignore comment lines

        if ($line =~ /^<([^>]*)>\s*(.*?)\s*$/) {
            # document-level metadata: <name>value
            $doc_obj->set_utf8_metadata_element($topsection, $1, $2);
            next;
        }

        $num++;
        # line should be like page:imagefilename:textfilename:r - the r is
        # optional -> means rotate the image 180 deg
        $line =~ s/^\s+//; # remove space at the front
        $line =~ s/\s+$//; # remove space at the end
        my ($pagenum, $imgname, $txtname, $rotate) = split /:/, $line;

        # create a new section for each image file
        my $cursection = $doc_obj->insert_section($doc_obj->get_end_child($topsection));
        # the page number becomes the Title
        $doc_obj->set_utf8_metadata_element($cursection, 'Title', $pagenum);

        # process the image for this page if there is one
        if (defined $imgname && $imgname ne "") {
            my $result1 = $self->process_image($dir.$imgname, $imgname, $doc_obj, $cursection, $rotate);

            # NOTE(review): process_image returns 0 (not undef) for a
            # missing file, so this branch may never fire; left as-is
            # because generate_images' return values are not visible here.
            if (!defined $result1) {
                print "PagedImagePlugin: couldn't process image \"$dir$imgname\" for item \"$filename_full_path\"\n";
            }
        }

        # process the text file if one is there
        if (defined $txtname && $txtname ne "") {
            my $result2 = $self->process_text($dir.$txtname, $txtname, $doc_obj, $cursection);

            # process_text returns 0 on failure and 1 on success, so test
            # for falsity; the section still needs some text to be visible
            if (!$result2) {
                print "PagedImagePlugin: couldn't process text file \"$dir$txtname\" for item \"$filename_full_path\"\n";
                $self->add_dummy_text($doc_obj, $cursection);
            }
        } else {
            # otherwise add in some dummy text
            $self->add_dummy_text($doc_obj, $cursection);
        }
    }

    close($item_fh);

    # add numpages metadata
    $doc_obj->set_utf8_metadata_element($topsection, 'NumPages', "$num");

    $doc_obj->set_utf8_metadata_element($topsection, "MaxImageWidth", $self->{'MaxImageWidth'});
    $doc_obj->set_utf8_metadata_element($topsection, "MaxImageHeight", $self->{'MaxImageHeight'});
    $self->{'MaxImageWidth'}  = undef;
    $self->{'MaxImageHeight'} = undef;

    return $doc_obj;
}
611
# Add the contents of a page's text file to a section.
#
# Parameters:
#   $filename_full_path - path of the text file
#   $file               - the file name as written in the item file
#                         (not used here)
#   $doc_obj            - document object being built
#   $cursection         - section that receives the text
#
# Returns 0 (after printing an error) if the file does not exist, 1
# otherwise.  Text that looks like a complete HTML page contributes only
# its <body> content; anything else has "<" and ">" escaped and is wrapped
# in <pre> tags.
sub process_text {
    my $self = shift(@_);
    my ($filename_full_path, $file, $doc_obj, $cursection) = @_;

    # check that the text file exists!!
    if (!-f $filename_full_path) {
        print "PagedImagePlugin: ERROR: File $filename_full_path does not exist, skipping\n";
        return 0;
    }

    # remember that this text file was one of our source files
    $doc_obj->associate_source_file($filename_full_path);

    # Do encoding stuff
    my ($language, $encoding) = $self->textcat_get_language_encoding($filename_full_path);

    my $text = "";
    &ReadTextFile::read_file($self, $filename_full_path, $encoding, $language, \$text);
    if (!length($text)) {
        # It's a bit unusual but not out of the question to have no text,
        # so just give a warning
        print "PagedImagePlugin: WARNING: $filename_full_path contains no text\n";
    }

    # we need to escape the escape character, or else mg will convert into
    # eg literal newlines, instead of leaving the text as '\n'
    $text =~ s/\\/\\\\/g; # macro language
    $text =~ s/_/\\_/g;   # macro language

    if ($text =~ m/<html.*?>\s*<head.*?>.*<\/head>\s*<body.*?>(.*)<\/body>\s*<\/html>\s*$/is) {
        # looks like HTML input
        # no need to escape < and > or put in <pre> tags
        $text = $1;

        # add text to document object
        $doc_obj->add_utf8_text($cursection, "$text");
    } else {
        $text =~ s/</&lt;/g;
        $text =~ s/>/&gt;/g;

        # insert preformat tags and add text to document object
        $doc_obj->add_utf8_text($cursection, "<pre>\n$text\n</pre>");
    }

    return 1;
}
660
661
# Hook run once a document object has been processed: asks ImageConverter
# to clean up its temporary files.
sub clean_up_after_doc_obj_processing {
    my ($self) = @_;

    $self->ImageConverter::clean_up_temporary_files();
}
667
668	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: