Context Navigation

source: gsdl/trunk/perllib/plugins/PagedImagePlugin.pm@ 15905

Last change on this file since 15905 was 15905, checked in by kjdon, 16 years ago
changed some comments, also, new ReadTextFile, need to pass in extra arg so argument parsing is not done yet
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 23.0 KB

Line
1	###########################################################################
2	#
3	# PagedImagePlugin.pm -- plugin for sets of images and OCR text that
4	# make up a document
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	# PagedImagePlugin
28	# processes sequences of images, with optional OCR text
29	#
30	# This plugin takes *.item files, which contain metadata and lists of image
31	# files, and produces a document containing sections, one for each page.
32	# The files should be named something.item, then you can have more than one
33	# book in a directory. You will need to create these files, one for each
34	# document/book.
35	#
36	#There are two formats for the item files: a plain text format, and an xml
37	#format. You can use either format, and can have both formats in the same
38	#collection if you like. If you use the plain format, you must not start the
39	#file off with <PagedDocument>
40
41	#### PLAIN FORMAT
42	# The format of the xxx.item file is as follows:
43	# The first lines contain any metadata for the whole document
44	# <metadata-name>metadata-value
45	# eg.
46	# <Title>Snail farming
47	# <Date>19230102
48	# Then comes a list of pages, one page per line, each line has the format
49	#
50	# pagenum:imagefile:textfile:r
51	#
52	# page num and imagefile are required. pagenum is used for the Title
53	# of the section, and in the display is shown as page <pagenum>.
54	# imagefile is the image for the page. textfile is an optional text
55	# file containing the OCR (or any) text for the page - this gets added
56	# as the text for the section. r is optional, and signals that the image
57	# should be rotated 180deg. Eg use this if the image has been made upside down.
58	# So an example item file looks like:
59	# <Title>Snail farming
60	# <Date>19960403
61	# 1:p1.gif:p1.txt:
62	# 2:p2.gif::
63	# 3:p3.gif:p3.txt:
64	# 3b:p3b.gif:p3b.txt:r
65	# The second page has no text, the fourth page is a back page, and
66	# should be rotated.
67	#
68
69	#### XML FORMAT
70	# The xml format looks like the following
71	#<PagedDocument>
72	#<Metadata name="Title">The Title of the entire document</Metadata>
73	#<Page pagenum="1" imgfile="xxx.jpg" txtfile="yyy.txt">
74	#<Metadata name="Title">The Title of this page</Metadata>
75	#</Page>
76	#... more pages
77	#</PagedDocument>
78	#PagedDocument contains a list of Pages, Metadata and PageGroups. Any metadata
79	#that is not inside another tag will belong to the document.
80	#Each Page has a pagenum (not used at the moment), an imgfile and/or a txtfile.
81	#These are both optional - if neither is used, the section will have no content.
82	#Pages can also have metadata associated with them.
83	#PageGroups can be introduced at any point - they can contain Metadata and Pages and other PageGroups. They are used to introduce hierarchical structure into the document.
84	#For example
85	#<PagedDocument>
86	#<PageGroup>
87	#<Page>
88	#<Page>
89	#</PageGroup>
90	#<Page>
91	#</PagedDocument>
92	#would generate a structure like
93	#X
94	#--X
95	# --X
96	# --X
97	#--X
98	#PageGroup tags can also have imgfile/textfile metadata if you like - this way they get some content themselves.
99
100	#Currently the XML structure doesn't work very well with the paged document type, unless you use numerical Titles for each section.
101	#There is still a bit of work to do on this format:
102	#* enable other text file types, eg html, pdf etc
103	#* make the document paging work properly
104	#* add pagenum as Title unless a Title is present?
105
106	# All the supplementary image and text files should be in the same folder as
107	# the .item file.
108	#
109	# To display the images instead of the document text, you can use [srcicon]
110	# in the DocumentText format statement.
111	# For example,
112	#
113	# format DocumentText "<center><table width=_pagewidth_><tr><td>[srcicon]</td></tr></table></center>"
114	#
115	# To have it create thumbnail size images, use the '-create_thumbnail' option.
116	# To have it create medium size images for display, use the '-create_screenview'
117	# option. As usual, running
118	# 'perl -S pluginfo.pl PagedImagePlugin' will list all the options.
119
120	# If you want the resulting documents to be presented with a table of
121	# contents, use '-documenttype hierarchy', otherwise they will have
122	# next and previous arrows, and a goto page X box.
123
124	# If you have used -create_screenview, you can also use [screenicon] in the format
125	# statement to display the smaller image. Here is an example that switches
126	# between the two:
127	#
128	# format DocumentText "<center><table width=_pagewidth_><tr><td>{If}{_cgiargp_ eq full,<a href='_httpdocument_&d=_cgiargd_&p=small'>Switch to small version.</a>,<a href='_httpdocument_&d=_cgiargd_&p=full'>Switch to fullsize version</a>}</td></tr><tr><td>{If}{_cgiargp_ eq full,<a href='_httpdocument_&d=_cgiargd_&p=small' title='Switch to small version'>[srcicon]</a>,<a href='_httpdocument_&d=_cgiargd_&p=full' title='Switch to fullsize version'>[screenicon]</a>}</td></tr></table></center>"
129	#
130	# Additional metadata can be added into the .item files, alternatively you can
131	# use normal metadata.xml files, with the name of the xxx.item file as the
132	# FileName (only for document level metadata).
133
134	package PagedImagePlugin;
135
136	use ReadXMLFile;
137	use ReadTextFile;
138	use ImageConverter;
139
140	use strict;
141	no strict 'refs'; # allow filehandles to be variables and viceversa
142
# Set up the inheritance chain at compile time: this plugin inherits from
# ReadXMLFile, ReadTextFile and ImageConverter, in that method-resolution order.
BEGIN {
    @PagedImagePlugin::ISA = qw(ReadXMLFile ReadTextFile ImageConverter);
}
146
# Allowed values for the -documenttype option.
my $type_list = [
    {   'name' => "paged",
        'desc' => "{PagedImagePlugin.documenttype.paged}",
    },
    {   'name' => "hierarchy",
        'desc' => "{PagedImagePlugin.documenttype.hierarchy}",
    },
];

# Options this plugin adds on top of those of its parent classes.
# The 'desc' values are resource-string keys, not literal descriptions.
my $arguments = [
    {   'name' => "process_exp",
        'desc' => "{BasPlug.process_exp}",
        'type' => "string",
        'deft' => get_default_process_exp(),
        'reqd' => "no",
    },
    {   'name' => "block_exp",
        'desc' => "{BasPlug.block_exp}",
        'type' => "string",
        'deft' => get_default_block_exp(),
        'reqd' => "no",
    },
    {   'name' => "title_sub",
        'desc' => "{HTMLPlug.title_sub}",
        'type' => "string",
        'deft' => "",
    },
    {   'name' => "headerpage",
        'desc' => "{PagedImagePlugin.headerpage}",
        'type' => "flag",
        'reqd' => "no",
    },
    {   'name' => "documenttype",
        'desc' => "{PagedImagePlugin.documenttype}",
        'type' => "enum",
        'list' => $type_list,
        'deft' => "paged",
        'reqd' => "no",
    },
];

# Plugin descriptor; new() pushes it onto the shared option list.
my $options = {
    'name'     => "PagedImagePlugin",
    'desc'     => "{PagedImagePlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
185
# Constructor.
#
# Registers this class and its option tables in the shared plugin/argument
# lists, then runs each parent constructor against those lists.  Only the
# object returned by ReadXMLFile is kept; it is re-blessed into $class.
# The trailing 1 given to ReadTextFile is, per the file's change log, an
# extra argument so that argument parsing is not done at that point.
sub new {
    my $class = shift;
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push @{$pluginlist}, $class;
    push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments};
    push @{ $hashArgOptLists->{"OptList"} }, $options;

    # return values of the first two constructors are deliberately discarded
    ImageConverter->new($pluginlist, $inputargs, $hashArgOptLists);
    ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists, 1);
    my $plugin = ReadXMLFile->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $plugin, $class;
}
200
201
# Initialise the plugin: forward the caller's arguments (verbosity and the
# output/fail handles) to the parent init, then initialise the image
# converter, which is called with no arguments.
sub init {
    my ($self, @init_args) = @_;

    $self->SUPER::init(@init_args);
    $self->ImageConverter::init();
}
209
# Default regular expression (as a string) selecting the files this plugin
# processes: anything ending in ".item".
sub get_default_process_exp {
    return '\.item$';
}
215
# Name of the root element expected in XML-format item files.
sub get_doctype {
    return 'PagedDocument';
}
221
222
223	# want to block everything except the .item ones
224	# but instead we will block images and txt files
# Default regular expression (as a string) for files that must be blocked
# from other plugins: the page images and text/HTML files that accompany an
# .item file, plus editor backup files ending in "~".  Case-insensitive.
#
# The alternatives are separated by unescaped '|'; escaping the bars would
# make the pattern match only a literal pipe-joined string.
sub get_default_block_exp {
    my $self = shift (@_);

    return q^(?i)(\.jpe?g|\.gif|\.png|\.tif?f|\.te?xt|\.html?|~)$^;
}
230
# Rotate an image by 180 degrees via the inherited convert() method.
#
# $filename_full_path - path of the image to rotate.
#
# Returns the path of the rotated file when convert() reports one (text of
# the form "...=>newfile.ext") and that file exists; otherwise returns the
# original path unchanged so processing can continue with the unrotated image.
sub rotate_image {
    my $self = shift (@_);
    my ($filename_full_path) = @_;

    my ($this_filetype) = $filename_full_path =~ /\.([^\.]*)$/;
    my $result = $self->convert($filename_full_path, $this_filetype, "-rotate 180", "ROTATE");

    my $new_filename;
    if (defined $result && defined $this_filetype) {
        # \Q..\E: the extension is data, not a pattern
        ($new_filename) = ($result =~ /=>(.*\.\Q$this_filetype\E)/);
    }
    if (defined $new_filename && -e $new_filename) {
        return $new_filename;
    }

    # something has gone wrong - fall back to the unrotated file
    return $filename_full_path;
}
245
# Attach one page image to a section of the document.
#
# $filename_full_path - full path of the image file
# $filename_no_path   - image file name as written in the item file
# $doc_obj, $section  - document object and the section to attach to
# $rotate             - optional; "r" means rotate the image 180 degrees first
#                       (callers handling XML item files do not pass it)
#
# Returns 0 if rotation was requested but the image name is empty or the
# file does not exist; otherwise returns whatever generate_images() returns.
sub process_image {
    my $self = shift(@_);
    my ($filename_full_path, $filename_no_path, $doc_obj, $section, $rotate) = @_;

    # do rotation
    if (defined $rotate && $rotate eq "r") {
        # check the filenames
        return 0 if ($filename_no_path eq "" || !-f $filename_full_path);

        # we get a new temporary file which is rotated
        $filename_full_path = $self->rotate_image($filename_full_path);
    }

    # do generate images
    my $result = $self->generate_images($filename_full_path, $filename_no_path, $doc_obj, $section);

    # overwrite the FileFormat value set in ImageConverter
    $doc_obj->set_metadata_element ($section, "FileFormat", "PagedImage");
    return $result;
}
264
# Legacy XML parsing code, retained from an older read() implementation.
# Nothing in this file calls it.
#
# $filename - full path of the XML file to parse
# $file     - file name used in messages
# $gli      - true to also emit a <ProcessingError> marker on STDERR
#
# Parses the file (after applying $self->{'xslt'} if one is set) with
# $self->{'parser'}.  Returns -1 if parsing died; otherwise returns a false
# value.
#
# NOTE(review): the lexicals are now populated from @_.  Previously they
# were declared but never assigned, so the sub could not have worked.
sub old_read_stuff_for_xml_version {
    my ($self, $filename, $file, $gli) = @_;

    # this bit is the same as ReadXMLFile's read
    eval {
        my $xslt = $self->{'xslt'};
        if (defined $xslt && ($xslt ne "")) {
            # perform xslt
            my $transformed_xml = $self->apply_xslt($xslt,$filename);

            # feed transformed file (now in memory as string) into XML parser
            $self->{'parser'}->parse($transformed_xml);
        }
        else {
            $self->{'parser'}->parsefile($filename);
        }
    };
    # take a copy straight away: $@ is a global that later code may clobber
    my $parse_error = $@;

    if ($parse_error) {
        # parsefile may either croak somewhere in XML::Parser (e.g. because
        # the document is not well formed) or die somewhere in XMLPlug or a
        # derived plugin (e.g. because we're attempting to process a
        # document whose DOCTYPE is not meant for this plugin). For the
        # first case we'll print a warning and continue, for the second
        # we'll just continue quietly
        print STDERR "**** XML Parse Error is: $parse_error\n";

        my ($msg) = $parse_error =~ /Carp::croak\(\'(.*?)\'\)/;
        if (defined $msg) {
            my $outhandle = $self->{'outhandle'};
            my $plugin_name = ref ($self);
            print $outhandle "$plugin_name failed to process $file ($msg)\n";
        }

        # reset ourself for the next document
        $self->{'section_level'}=0;
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1; # error during processing
    }

    return 0;
}
319
320
321	# The PagedImagePlugin read() function. This function does all the right things
322	# to make general options work for a given plugin. It calls the process()
323	# function which does all the work specific to a plugin (like the old
324	# read functions used to do). Most plugins should define their own
325	# process() function and let this read() function keep control.
326	#
327	# PagedImagePlugin overrides read() because there is no need to read the actual
328	# text of the file in, because the contents of the file is not text...
329	#
330	# Return number of files processed, undef if can't process
331	# Note that $base_dir might be "" and that $file might
332	# include directories
333
# Build a document object from one .item file.
#
# Decides whether the item file is in the XML or the plain-text format,
# tidies the file in place, builds the document through the matching code
# path, then adds plugin/format metadata, associated files, inherited
# metadata, a fallback Title and an OID.
#
# Arguments are the standard plugin read() arguments; $base_dir may be ""
# and $file may include directories.
#
# Returns the list (1, $doc_obj).
sub read_into_doc_obj {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    my ($filename_full_path, $filename_no_path) = $self->get_full_filenames($base_dir, $file);

    print $outhandle "PagedImagePlugin processing \"$filename_full_path\"\n"
        if $self->{'verbosity'} > 1;
    print STDERR "<Processing n='$file' p='PagedImagePlugin'>\n" if ($gli);

    # here we need to decide if we have an old text .item file, or a new xml
    # .item file
    my $xml_version = $self->is_xml_item_file($filename_full_path);

    $self->tidy_item_file($filename_full_path);

    my $doc_obj;
    if ($xml_version) {
        # careful checking needed here!! are we using local xml handlers or super ones
        $self->ReadXMLFile::read($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli);
        $doc_obj = $self->{'doc_obj'};
    } else {
        # split the full path into directory (with trailing separator) and
        # bare file name; the quantifiers are required for any real path
        my ($dir);
        ($dir, $file) = $filename_full_path =~ /^(.*?)([^\/\\]*)$/;

        # process the .item file
        $doc_obj = $self->process_item($filename_full_path, $dir, $file, $processor);
    }

    # NOTE(review): if the XML branch failed to produce a document,
    # $doc_obj is undef here and the next call dies - confirm how
    # ReadXMLFile::read reports parse failures.
    my $section = $doc_obj->get_top_section();

    $doc_obj->add_utf8_metadata($section, "Plugin", "$self->{'plugin_type'}");
    $doc_obj->add_metadata($section, "FileFormat", "PagedImage");

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    $self->add_associated_files($doc_obj, $filename_full_path);
    $self->extra_metadata ($doc_obj, $section, $metadata);
    $self->auto_extract_metadata ($doc_obj);

    # if we haven't found any Title so far, assign one
    $self->title_fallback($doc_obj,$section,$filename_no_path);

    $self->add_OID($doc_obj);
    return (1,$doc_obj);
}
383
384	# For now the test is: if the first non-empty line contains <PagedDocument, the file is XML
# Decide whether an item file uses the XML format.
#
# $filename - path of the .item file.
#
# Returns 1 if the first line containing a word character also contains
# "<PagedDocument", 0 otherwise.  A file that is empty or has no such line
# gives 0; reading stops at end of file instead of looping forever.
# Dies if the file cannot be opened.
sub is_xml_item_file {
    my $self = shift(@_);
    my ($filename) = @_;

    open (my $item_fh, '<', $filename) || die "couldn't open $filename\n";

    my $xml_version = 0;
    while (defined (my $line = <$item_fh>)) {
        next unless $line =~ /\w/;   # skip blank lines
        $xml_version = 1 if ($line =~ /<PagedDocument/);
        last;                        # only the first real line matters
    }
    close $item_fh;

    return $xml_version;
}
405
# Clean up an item file in place.
#
# $filename - path of the .item file; it is overwritten with the tidied copy.
#
# Removes a UTF-8 byte-order mark from the first line, strips vertical tabs
# (some metadata titles contain them) and escapes bare ampersands as &amp;.
# The result is written to "backup.item" in the current directory, copied
# over the original, and the backup is removed.
#
# NOTE(review): the ampersand substitution was a no-op in the source as
# extracted (replacement identical to pattern); it is restored here as an
# escape to &amp; - confirm against the repository.  Ampersands that already
# start an entity are left alone so that re-importing does not double-escape.
#
# Dies if either file cannot be opened or the tidied copy cannot be
# completely written.
sub tidy_item_file {
    my $self = shift(@_);
    my ($filename) = @_;

    my $backup_filename = "backup.item";
    open (my $item_fh, '<', $filename) || die "couldn't open $filename\n";
    open (my $backup_fh, '>', $backup_filename) || die "couldn't write to $backup_filename\n";

    my $first_line = 1;
    while (defined (my $line = <$item_fh>)) {
        if ($first_line) {
            $line =~ s/^\xEF\xBB\xBF//; # strip BOM
            $first_line = 0;
        }
        $line =~ s/\x0B+//g;
        $line =~ s/&(?!(?:[A-Za-z][A-Za-z0-9]*|#[0-9]+|#x[0-9A-Fa-f]+);)/&amp;/g;
        print $backup_fh ($line);
    }
    close $item_fh;
    # a failed close means a truncated backup; never copy that over the original
    close ($backup_fh) || die "couldn't write to $backup_filename\n";

    &File::Copy::copy ($backup_filename, $filename);
    &util::rm($backup_filename);
}
431	# Do we need this? The old read was the same as BasePlugin's read, not the same as ReadXMLFile's read
# Force the BasePlugin implementation of read(), bypassing whatever the
# inheritance chain would otherwise resolve to; all arguments are forwarded.
sub read {
    my ($self, @read_args) = @_;
    return $self->BasePlugin::read(@read_args);
}
437
# XML handler called at each opening tag of an XML-format item file.
#
# $expat   - the parser object (unused here)
# $element - name of the element being opened
#
# <PagedDocument> selects the document's top section.  <PageGroup> and
# <Page> each create a new child section of the current one, count it as a
# page, and attach the pagenum, image and text named in the tag's
# attributes (dummy text is added when no txtfile is given).  <Metadata>
# records the metadata name so that the element's text can be stored when
# the tag closes.
#
# Attributes are read from the global hash %_ - presumably filled in by the
# XML parser before this handler runs; confirm in ReadXMLFile.
sub xml_start_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;
    $self->{'element'} = $element;

    my $doc_obj = $self->{'doc_obj'};
    if ($element eq "PagedDocument") {
        $self->{'current_section'} = $doc_obj->get_top_section();
    } elsif ($element eq "PageGroup" || $element eq "Page") {
        # create a new section as a child
        $self->{'current_section'} = $doc_obj->insert_section($doc_obj->get_end_child($self->{'current_section'}));
        $self->{'num_pages'}++;

        # assign pagenum as what??
        my $pagenum = $_{'pagenum'}; #TODO!!
        if (defined $pagenum) {
            $doc_obj->set_utf8_metadata_element($self->{'current_section'}, 'PageNum', $pagenum);
        }

        my ($imgfile) = $_{'imgfile'};
        if (defined $imgfile) {
            # no rotate argument: the XML format has no rotation flag
            $self->process_image($self->{'base_dir'}.$imgfile, $imgfile, $doc_obj, $self->{'current_section'});
        }

        my ($txtfile) = $_{'txtfile'};
        if (defined($txtfile) && $txtfile ne "") {
            $self->process_text ($self->{'base_dir'}.$txtfile, $txtfile, $doc_obj, $self->{'current_section'});
        } else {
            $self->add_dummy_text($doc_obj, $self->{'current_section'});
        }
    } elsif ($element eq "Metadata") {
        $self->{'metadata_name'} = $_{'name'};
    }
}
469
# XML handler called at each closing tag of an XML-format item file.
#
# $expat   - the parser object (unused here)
# $element - name of the element being closed
#
# Closing a <Page> or <PageGroup> gives the section a Title equal to its
# PageNum when no Title was supplied, then makes the parent section current
# again.  Closing a <Metadata> stores the accumulated text under the
# recorded name on the current section and resets both accumulators.
# All other closing tags are ignored.
sub xml_end_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;

    my $doc_obj = $self->{'doc_obj'};
    if ($element eq "Page" || $element eq "PageGroup") {
        # if Title hasn't been assigned, set PageNum as Title
        my $section = $self->{'current_section'};
        if (!defined $doc_obj->get_metadata_element ($section, "Title")
            && defined $doc_obj->get_metadata_element ($section, "PageNum")) {
            $doc_obj->add_utf8_metadata ($section, "Title", $doc_obj->get_metadata_element ($section, "PageNum"));
        }
        # move the current section back to the parent
        $self->{'current_section'} = $doc_obj->get_parent_section($section);
    } elsif ($element eq "Metadata") {
        $doc_obj->add_utf8_metadata ($self->{'current_section'}, $self->{'metadata_name'}, $self->{'metadata_value'});
        $self->{'metadata_name'} = "";
        $self->{'metadata_value'} = "";
    }
    # otherwise we ignore the end tag
}
491
492
# XML handler for character data.  While inside a <Metadata> element that
# has a name, the text chunk (taken from $_) is appended to the pending
# metadata value; text anywhere else is dropped.
sub xml_text {
    my ($self, $expat) = @_;

    my $collecting = $self->{'element'} eq "Metadata" && $self->{'metadata_name'};
    if ($collecting) {
        $self->{'metadata_value'} .= $_;
    }
}
501
# XML handler for the DOCTYPE declaration: deliberately does nothing.
sub xml_doctype {
    return;
}
504
# Start a new document for an XML-format item file.
#
# Creates the document object for $self->{'filename'}, sets its initial
# fields, records the item file's directory in $self->{'base_dir'} (image
# and text file names are later appended to it) and resets the page count.
sub open_document {
    my $self = shift(@_);

    # create a new document
    $self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc");
    # TODO is file filename_no_path??
    $self->set_initial_doc_fields($self->{'doc_obj'}, $self->{'file'}, $self->{'processor'});

    # split into directory (with trailing separator) and bare file name;
    # the quantifiers are required for the pattern to match any real path
    my ($dir, $file) = $self->{'filename'} =~ /^(.*?)([^\/\\]*)$/;
    $self->{'base_dir'} = $dir;
    $self->{'num_pages'} = 0;
}
518
# Finish an XML-format document: store the number of pages counted during
# parsing as NumPages metadata on the top section.
sub close_document {
    my ($self) = @_;

    my $document    = $self->{'doc_obj'};
    my $top_section = $document->get_top_section();
    $document->set_utf8_metadata_element($top_section, 'NumPages', $self->{'num_pages'});
}
528
529
# Set the fields every new document needs, whichever item format it came from.
#
# $doc_obj          - the freshly created document
# $filename_no_path - item file name, used for the Source metadata
# $processor        - supplies the OIDtype and OIDmetadata settings
sub set_initial_doc_fields {
    my ($self, $doc_obj, $filename_no_path, $processor) = @_;

    $doc_obj->set_OIDtype($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
    my $top = $doc_obj->get_top_section();

    # Setting gsdlthistype to Paged ensures the document is treated as a
    # Paged doc even if Titles are not numeric; any other documenttype
    # value gives Hierarchy.
    my $this_type = ($self->{'documenttype'} eq 'paged') ? "Paged" : "Hierarchy";
    $doc_obj->set_utf8_metadata_element($top, "gsdlthistype", $this_type);

    $self->set_Source_metadata($doc_obj, $filename_no_path);

    # A header page needs some text in the top section, otherwise that
    # section would become invisible.
    if ($self->{'headerpage'}) {
        $doc_obj->add_text($top, &gsprintf::lookup_string("{BasePlugin.dummy_text}"));
    }
}
554
555
# Build a document from a plain-text format item file.
#
# $filename_full_path - full path of the .item file
# $dir                - its directory, including the trailing separator;
#                       image and text file names are appended to it
# $filename_no_path   - bare .item file name
# $processor          - passed through to set_initial_doc_fields
#
# Lines of the form "<Name>value" become document-level metadata.  Every
# other non-blank, non-comment line is a page, "pagenum:imagefile:textfile:r",
# which becomes a new section titled with the page number.  Sections
# without usable text get dummy text.
#
# Returns the document object.  Dies if the item file cannot be opened.
sub process_item {
    my $self = shift (@_);
    my ($filename_full_path, $dir, $filename_no_path, $processor) = @_;

    my $doc_obj = new doc ($filename_full_path, "indexed_doc");
    $self->set_initial_doc_fields($doc_obj, $filename_no_path, $processor);
    my $topsection = $doc_obj->get_top_section();

    open (my $item_fh, '<', $filename_full_path) || die "couldn't open $filename_full_path\n";
    my $num = 0;
    while (defined (my $line = <$item_fh>)) {
        next unless $line =~ /\w/;
        chomp $line;
        next if $line =~ /^#/; # ignore comment lines

        if ($line =~ /^<([^>]*)>\s*(.*?)\s*$/) {
            # document-level metadata: <name>value
            $doc_obj->set_utf8_metadata_element ($topsection, $1, $2);
            next;
        }

        $num++;
        # line should be like page:imagefilename:textfilename:r - the r is
        # optional -> means rotate the image 180 deg
        $line =~ s/^\s+//; # remove space at the front
        $line =~ s/\s+$//; # remove space at the end
        my ($pagenum, $imgname, $txtname, $rotate) = split /:/, $line;

        # create a new section for each image file
        my $cursection = $doc_obj->insert_section($doc_obj->get_end_child($topsection));
        # the page number becomes the Title
        $doc_obj->set_utf8_metadata_element($cursection, 'Title', $pagenum);

        # process the image for this page if there is one
        if (defined $imgname && $imgname ne "") {
            my $result1 = $self->process_image($dir.$imgname, $imgname, $doc_obj, $cursection, $rotate);

            # NOTE(review): process_image returns 0 for a missing file when
            # rotation is requested; whether generate_images signals failure
            # with undef is not visible here, so this test is left as it was.
            if (!defined $result1) {
                print "PagedImagePlugin: couldn't process image \"$dir$imgname\" for item \"$filename_full_path\"\n";
            }
        }

        # process the text file if one is there
        if (defined $txtname && $txtname ne "") {
            my $result2 = $self->process_text ($dir.$txtname, $txtname, $doc_obj, $cursection);

            # process_text returns 0 (never undef) when the file is missing,
            # so test for falsehood to make sure the section still gets text
            if (!$result2) {
                print "PagedImagePlugin: couldn't process text file \"$dir$txtname\" for item \"$filename_full_path\"\n";
                $self->add_dummy_text($doc_obj, $cursection);
            }
        } else {
            # otherwise add in some dummy text
            $self->add_dummy_text($doc_obj, $cursection);
        }
    }
    close $item_fh;

    # add numpages metadata
    $doc_obj->set_utf8_metadata_element ($topsection, 'NumPages', "$num");
    return $doc_obj;
}
615
# Add the contents of a page's text file to a section.
#
# $filename_full_path - full path of the text file
# $file               - its bare name (unused here)
# $doc_obj            - document to add the text to
# $cursection         - section that receives the text
#
# A complete HTML document contributes the contents of its <body> as-is.
# Anything else is treated as plain text: < and > are escaped and the text
# is wrapped in <pre> tags.  In both cases backslashes and underscores are
# escaped first.
#
# Returns 0 if the file does not exist, 1 otherwise (an empty file only
# produces a warning).
sub process_text {
    my $self = shift (@_);
    my ($filename_full_path, $file, $doc_obj, $cursection) = @_;

    # check that the text file exists!!
    if (!-f $filename_full_path) {
        print "PagedImagePlugin: ERROR: File $filename_full_path does not exist, skipping\n";
        return 0;
    }

    # Do encoding stuff
    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename_full_path);

    my $text="";
    &ReadTextFile::read_file($self, $filename_full_path, $encoding, $language, \$text);
    if (!length ($text)) {
        # It's a bit unusual but not out of the question to have no text, so just give a warning
        print "PagedImagePlugin: WARNING: $filename_full_path contains no text\n";
    }

    # we need to escape the escape character, or else mg will convert into
    # eg literal newlines, instead of leaving the text as '\n'
    $text =~ s/\\/\\\\/g; # macro language
    $text =~ s/_/\\_/g; # macro language

    if ($text =~ m/<html.*?>\s*<head.*?>.*<\/head>\s*<body.*?>(.*)<\/body>\s*<\/html>\s*$/s) {
        # looks like HTML input
        # no need to escape < and > or put in <pre> tags
        $text = $1;

        # add the body content to the document object unchanged
        $doc_obj->add_utf8_text($cursection, "$text");
    }
    else {
        $text =~ s/</&lt;/g;
        $text =~ s/>/&gt;/g;

        # insert preformat tags and add text to document object
        $doc_obj->add_utf8_text($cursection, "<pre>\n$text\n</pre>");
    }

    return 1;
}
662
663	# do plugin specific processing of doc_obj
# Leftover plugin-specific processing hook; it does no work and always
# reports success.
sub process_old {
    my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;

    return 1;
}
671
# Hook run once a document object has been processed: asks ImageConverter
# to remove its temporary files.
sub clean_up_after_doc_obj_processing {
    my ($self) = @_;

    $self->ImageConverter::clean_up_temporary_files();
}
677
678	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: