Context Navigation

source: main/trunk/greenstone2/perllib/plugins/PagedImagePlugin.pm@ 24056

Last change on this file since 24056 was 24056, checked in by ak19, 13 years ago
One of the two changes made in this file's previous commit to fix the unicode handling of PagedImagePlugin was wrong: text item files don't need additional decoding into UTF-8; that's already handled by the ReadTextFile read operation that's called.
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 26.1 KB

Line
1	###########################################################################
2	#
3	# PagedImagePlugin.pm -- plugin for sets of images and OCR text that
4	# make up a document
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	# PagedImagePlugin
28	# processes sequences of images, with optional OCR text
29	#
30	# This plugin takes *.item files, which contain metadata and lists of image
31	# files, and produces a document containing sections, one for each page.
32	# The files should be named something.item, then you can have more than one
33	# book in a directory. You will need to create these files, one for each
34	# document/book.
35	#
36	#There are two formats for the item files: a plain text format, and an xml
37	#format. You can use either format, and can have both formats in the same
38	#collection if you like. If you use the plain format, you must not start the
39	#file off with <PagedDocument>
40
41	#### PLAIN FORMAT
42	# The format of the xxx.item file is as follows:
43	# The first lines contain any metadata for the whole document
44	# <metadata-name>metadata-value
45	# eg.
46	# <Title>Snail farming
47	# <Date>19230102
48	# Then comes a list of pages, one page per line, each line has the format
49	#
50	# pagenum:imagefile:textfile:r
51	#
52	# page num and imagefile are required. pagenum is used for the Title
53	# of the section, and in the display is shown as page <pagenum>.
54	# imagefile is the image for the page. textfile is an optional text
55	# file containing the OCR (or any) text for the page - this gets added
56	# as the text for the section. r is optional, and signals that the image
57	# should be rotated 180deg. Eg use this if the image has been made upside down.
58	# So an example item file looks like:
59	# <Title>Snail farming
60	# <Date>19960403
61	# 1:p1.gif:p1.txt:
62	# 2:p2.gif::
63	# 3:p3.gif:p3.txt:
64	# 3b:p3b.gif:p3b.txt:r
65	# The second page has no text, the fourth page is a back page, and
66	# should be rotated.
67	#
68
69	#### XML FORMAT
70	# The xml format looks like the following
71	#<PagedDocument>
72	#<Metadata name="Title">The Title of the entire document</Metadata>
73	#<Page pagenum="1" imgfile="xxx.jpg" txtfile="yyy.txt">
74	#<Metadata name="Title">The Title of this page</Metadata>
75	#</Page>
76	#... more pages
77	#</PagedDocument>
78	#PagedDocument contains a list of Pages, Metadata and PageGroups. Any metadata
79	#that is not inside another tag will belong to the document.
80	#Each Page has a pagenum (not used at the moment), an imgfile and/or a txtfile.
81	#These are both optional - if neither is used, the section will have no content.
82	#Pages can also have metadata associated with them.
83	#PageGroups can be introduced at any point - they can contain Metadata and Pages and other PageGroups. They are used to introduce hierarchical structure into the document.
84	#For example
85	#<PagedDocument>
86	#<PageGroup>
87	#<Page>
88	#<Page>
89	#</PageGroup>
90	#<Page>
91	#</PagedDocument>
92	#would generate a structure like
93	#X
94	#--X
95	# --X
96	# --X
97	#--X
98	#PageGroup tags can also have imgfile/textfile metadata if you like - this way they get some content themselves.
99
100	#Currently the XML structure doesn't work very well with the paged document type, unless you use numerical Titles for each section.
101	#There is still a bit of work to do on this format:
102	#* enable other text file types, eg html, pdf etc
103	#* make the document paging work properly
104	#* add pagenum as Title unless a Title is present?
105
106	# All the supplementary image and text files should be in the same folder as
107	# the .item file.
108	#
109	# To display the images instead of the document text, you can use [srcicon]
110	# in the DocumentText format statement.
111	# For example,
112	#
113	# format DocumentText "<center><table width=_pagewidth_><tr><td>[srcicon]</td></tr></table></center>"
114	#
115	# To have it create thumbnail size images, use the '-create_thumbnail' option.
116	# To have it create medium size images for display, use the '-create_screenview'
117	# option. As usual, running
118	# 'perl -S pluginfo.pl PagedImagePlugin' will list all the options.
119
120	# If you want the resulting documents to be presented with a table of
121	# contents, use '-documenttype hierarchy', otherwise they will have
122	# next and previous arrows, and a goto page X box.
123
124	# If you have used -create_screenview, you can also use [screenicon] in the format
125	# statement to display the smaller image. Here is an example that switches
126	# between the two:
127	#
128	# format DocumentText "<center><table width=_pagewidth_><tr><td>{If}{_cgiargp_ eq full,<a href='_httpdocument_&d=_cgiargd_&p=small'>Switch to small version.</a>,<a href='_httpdocument_&d=_cgiargd_&p=full'>Switch to fullsize version</a>}</td></tr><tr><td>{If}{_cgiargp_ eq full,<a href='_httpdocument_&d=_cgiargd_&p=small' title='Switch to small version'>[srcicon]</a>,<a href='_httpdocument_&d=_cgiargd_&p=full' title='Switch to fullsize version'>[screenicon]</a>}</td></tr></table></center>"
129	#
130	# Additional metadata can be added into the .item files, alternatively you can
131	# use normal metadata.xml files, with the name of the xxx.item file as the
132	# FileName (only for document level metadata).
133
134	package PagedImagePlugin;
135
136	use Encode;
137	use ReadXMLFile;
138	use ReadTextFile;
139	use ImageConverter;
140
141	use strict;
142	no strict 'refs'; # allow filehandles to be variables and viceversa
143
# Set up the inheritance chain at compile time: this plugin mixes in the
# XML-reading, text-reading and image-converting base classes.
BEGIN {
    @PagedImagePlugin::ISA = qw(ReadXMLFile ReadTextFile ImageConverter);
}
147
# Values accepted by the 'documenttype' option.
my $type_list = [
    {
        'name' => 'paged',
        'desc' => '{PagedImagePlugin.documenttype.paged}',
    },
    {
        'name' => 'hierarchy',
        'desc' => '{PagedImagePlugin.documenttype.hierarchy}',
    },
];

# Option descriptors contributed by this plugin; new() appends them to the
# shared "ArgList".
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BasePlugin.process_exp}',
        'type' => 'string',
        'deft' => get_default_process_exp(),
        'reqd' => 'no',
    },
    {
        'name' => 'title_sub',
        'desc' => '{HTMLPlugin.title_sub}',
        'type' => 'string',
        'deft' => '',
    },
    {
        'name' => 'headerpage',
        'desc' => '{PagedImagePlugin.headerpage}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name' => 'documenttype',
        'desc' => '{PagedImagePlugin.documenttype}',
        'type' => 'enum',
        'list' => $type_list,
        'deft' => 'paged',
        'reqd' => 'no',
    },
    {
        'name'      => 'processing_tmp_files',
        'desc'      => '{BasePlugin.processing_tmp_files}',
        'type'      => 'flag',
        'hiddengli' => 'yes',
    },
];

# Plugin descriptor; new() appends it to the shared "OptList".
my $options = {
    'name'     => 'PagedImagePlugin',
    'desc'     => '{PagedImagePlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
186
# Constructor.
#
# Registers this plugin's argument and option descriptors in the shared
# lists, builds one object per parent class (ImageConverter, ReadTextFile,
# ReadXMLFile), merges them into a single hash via
# BasePlugin::merge_inheritance, and blesses the result into $class.
#
# Parameters:
#   $pluginlist      - array ref; $class is pushed onto it
#   $inputargs       - passed through unchanged to the parent constructors
#   $hashArgOptLists - hash ref holding the "ArgList" and "OptList" arrays
#
# Returns the blessed plugin object.
sub new {
    my ($class) = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{ $hashArgOptLists->{"ArgList"} }, @{$arguments});
    push(@{ $hashArgOptLists->{"OptList"} }, $options);

    # Use Class->new rather than the indirect-object form "new Class (...)":
    # this package defines its own new(), which makes the indirect form
    # ambiguous to the parser.
    my $imc_self = ImageConverter->new($pluginlist, $inputargs, $hashArgOptLists);
    my $rtf_self = ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists, 1);
    my $rxf_self = ReadXMLFile->new($pluginlist, $inputargs, $hashArgOptLists);

    my $self = BasePlugin::merge_inheritance($imc_self, $rtf_self, $rxf_self);

    # Update $self used by XML::Parser so it finds callback functions
    # such as start_document here and not in ReadXMLFile (which is what
    # $self was when new XML::Parser was done).
    #
    # If the $self returned by this constructor is the same as the one
    # used in ReadXMLFile (e.g. in the GreenstoneXMLPlugin) then this step
    # isn't necessary.
    #
    # Consider embedding this type of assignment into merge_inheritance
    # to help catch all cases?
    $rxf_self->{'parser'}->{'PluginObj'} = $self;

    return bless $self, $class;
}
215
216
# Initialise the plugin: forward the caller's arguments to the inherited
# init(), then run ImageConverter's init() with no arguments.
sub init {
    my ($self, @init_args) = @_;

    $self->SUPER::init(@init_args);
    $self->ImageConverter::init();
}
224
# Start-of-import hook: forward the same arguments first to the inherited
# begin() and then to ImageConverter's begin().
sub begin {
    my ($self, @begin_args) = @_;

    $self->SUPER::begin(@begin_args);
    $self->ImageConverter::begin(@begin_args);
}
232
# Return the default 'process_exp' pattern: file names ending in ".item".
# Any arguments (including an invocant) are ignored.
sub get_default_process_exp {
    return '\.item$';
}
238
# Return the name of the root element of an XML-format item file.
sub get_doctype {
    my ($self) = @_;

    return 'PagedDocument';
}
244
245
# We want BasePlugin's version of this check, not ReadXMLFile's, so call it
# explicitly and hand back whatever it returns.
sub can_process_this_file {
    my ($self, @check_args) = @_;

    return $self->BasePlugin::can_process_this_file(@check_args);
}
251
# Instead of a block expression, scan the item file and record every text
# and image file it mentions in $block_hash.
#
# Parameters:
#   $filename_full_path - full path of the .item file
#   $block_hash         - passed through to the scan_*_for_files_to_block
#                         helpers, which record the blocked files in it
sub store_block_files {
    my $self = shift(@_);
    my ($filename_full_path, $block_hash) = @_;

    my $xml_version = $self->is_xml_item_file($filename_full_path);

    # do we need to do this?
    # does BOM interfere just with XML parsing? In that case don't need it here
    # if we do it here, we are modifying the file before we have worked out if
    # its new or not, so it will always be reimported.
    #$self->tidy_item_file($filename_full_path);

    # Split into directory (with its trailing separator) and file name.
    # Both groups need their quantifiers: without them the pattern only
    # matches paths of at most two characters and $dir stays undefined.
    my ($dir, $file) = $filename_full_path =~ /^(.*?)([^\/\\]*)$/;

    if ($xml_version) {
        $self->scan_xml_for_files_to_block($filename_full_path, $dir, $block_hash);
    }
    else {
        $self->scan_item_for_files_to_block($filename_full_path, $dir, $block_hash);
    }
}
277
# We want BasePlugin's read, not ReadXMLFile's, so call it explicitly and
# hand back whatever it returns.
sub read {
    my ($self, @read_args) = @_;

    return $self->BasePlugin::read(@read_args);
}
285
286
287
# Build a document object from one .item file.
#
# Decides whether the item file is in XML or plain format, tidies it in
# place, builds the document through the matching code path, then adds the
# Plugin and FileFormat metadata, associated files, extra metadata, a
# fallback title and an OID.
#
# Returns the list (1, $doc_obj).
sub read_into_doc_obj {
    my $self = shift(@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    print $outhandle "PagedImagePlugin processing \"$filename_full_path\"\n"
        if $verbosity > 1;
    print STDERR "<Processing n='$file' p='PagedImagePlugin'>\n" if ($gli);

    # reset the running maxima; the per-document code paths write them out
    # as top-level metadata
    $self->{'MaxImageWidth'}  = 0;
    $self->{'MaxImageHeight'} = 0;

    # here we need to decide if we have an old text .item file, or a new xml
    # .item file
    my $xml_version = $self->is_xml_item_file($filename_full_path);

    $self->tidy_item_file($filename_full_path);

    my $doc_obj;
    if ($xml_version) {
        # careful checking needed here!! are we using local xml handlers or super ones
        $self->ReadXMLFile::read($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
        $doc_obj = $self->{'doc_obj'};
    }
    else {
        # Split the full path into directory (with trailing separator) and
        # file name. The quantifiers are essential: without them the match
        # fails for any real path and both values come back undefined.
        my ($dir);
        ($dir, $file) = $filename_full_path =~ /^(.*?)([^\/\\]*)$/;

        # process the .item file
        $doc_obj = $self->process_item($filename_full_path, $dir, $file, $processor, $metadata);
    }

    my $section = $doc_obj->get_top_section();

    $doc_obj->add_utf8_metadata($section, "Plugin", "$self->{'plugin_type'}");
    $doc_obj->add_metadata($section, "FileFormat", "PagedImage");

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    $self->add_associated_files($doc_obj, $filename_full_path);
    $self->extra_metadata($doc_obj, $section, $metadata);
    $self->auto_extract_metadata($doc_obj);

    # if we haven't found any Title so far, assign one
    $self->title_fallback($doc_obj, $section, $filename_no_path);

    $self->add_OID($doc_obj);
    return (1, $doc_obj);
}
340
# Decide whether an item file is in the XML format.
#
# For now, the test is: if the first line containing a word character
# contains "<PagedDocument", then it's XML.
#
# Parameters:
#   $filename - path of the item file to inspect
#
# Returns 1 for an XML item file, 0 otherwise (including when the file has
# no line with a word character). Dies if the file cannot be opened.
sub is_xml_item_file {
    my $self = shift(@_);
    my ($filename) = @_;

    open(my $item_fh, '<', $filename) or die "couldn't open $filename\n";

    # skip forward to the first line that contains a word character
    my $line;
    while (defined($line = <$item_fh>)) {
        last if $line =~ /\w/;
    }
    close($item_fh);

    return (defined $line && $line =~ /<PagedDocument/) ? 1 : 0;
}
367
# Clean up an item file in place.
#
# Writes a cleaned copy to "backup.item" (relative to the current working
# directory), copies that back over the original, then removes the backup.
# The cleaning steps are:
#   * strip a UTF-8 byte order mark from the start of the first line
#   * remove vertical tabs (some metadata titles contain them)
#   * escape bare '&' as '&amp;'
#
# Ampersands that already start an entity (e.g. '&amp;', '&#38;') are left
# alone: read_into_doc_obj calls this every time the file is read, so the
# escape has to be idempotent.
#
# Dies if the item file cannot be read or the backup cannot be written.
sub tidy_item_file {
    my $self = shift(@_);
    my ($filename) = @_;

    open(my $item_fh, '<', $filename) or die "couldn't open $filename\n";
    my $backup_filename = "backup.item";
    open(my $backup_fh, '>', $backup_filename) or die "couldn't write to $backup_filename\n";

    my $on_first_line = 1;
    while (defined(my $line = <$item_fh>)) {
        if ($on_first_line) {
            $line =~ s/^\xEF\xBB\xBF//;    # strip BOM
            $on_first_line = 0;
        }
        $line =~ s/\x0B+//g;               # vertical tabs
        $line =~ s/&(?!(?:[A-Za-z][A-Za-z0-9]*|#[0-9]+|#[xX][0-9A-Fa-f]+);)/&amp;/g;
        print {$backup_fh} $line;
    }
    close($item_fh);
    close($backup_fh);

    &File::Copy::copy($backup_filename, $filename);
    &util::rm($backup_filename);
}
393
# Rotate an image by 180 degrees using the inherited convert() method.
#
# Parameters:
#   $filename_full_path - path of the image to rotate
#
# Returns the path of the rotated file, taken from the text after "=>" in
# convert()'s result, if that file exists. Otherwise returns the original
# path unchanged.
sub rotate_image {
    my $self = shift(@_);
    my ($filename_full_path) = @_;

    # the file type is whatever follows the last '.' in the name
    my ($this_filetype) = $filename_full_path =~ /\.([^\.]*)$/;
    my $result = $self->convert($filename_full_path, $this_filetype, "-rotate 180", "ROTATE");

    # Quote the extension so characters such as '+' in it are matched
    # literally, and tolerate a missing extension or an undefined result.
    my $quoted_type = defined $this_filetype ? quotemeta($this_filetype) : '';
    my ($new_filename) = defined $result ? ($result =~ /=>(.*\.$quoted_type)/) : ();

    if (defined $new_filename && -e $new_filename) {
        return $new_filename;
    }

    # something's gone wrong
    return $filename_full_path;
}
408
# Attach one page image to a section of the document.
#
# Parameters:
#   $filename_full_path - full path of the image file
#   $filename_no_path   - image file name as given in the item file
#   $doc_obj            - document object being built
#   $section            - section the image belongs to
#   $rotate             - optional; "r" means rotate the image 180 degrees
#
# Returns 0 if the file name is empty or the file does not exist, 0 if
# image conversion is not available, otherwise whatever generate_images()
# returns.
sub process_image {
    my $self = shift(@_);
    my ($filename_full_path, $filename_no_path, $doc_obj, $section, $rotate) = @_;

    # check the filenames
    return 0 if ($filename_no_path eq "" || !-f $filename_full_path);

    # remember that this image file was one of our source files, but only
    # if we are not processing a tmp file
    if (!$self->{'processing_tmp_files'}) {
        $doc_obj->associate_source_file($filename_full_path);
    }

    # do rotation
    if ((defined $rotate) && ($rotate eq "r")) {
        # we get a new temporary file which is rotated
        $filename_full_path = $self->rotate_image($filename_full_path);
    }

    # do generate images
    my $result = 0;
    if ($self->{'image_conversion_available'} == 1) {
        # do we need to convert $filename_no_path to utf8/url encoded?
        # We are already reading in from a file, what encoding is it in???
        my $url_encoded_full_filename
            = &unicode::raw_filename_to_url_encoded($filename_full_path);
        $result = $self->generate_images($filename_full_path, $url_encoded_full_filename, $doc_obj, $section);
    }

    # overwrite the one set in ImageConverter
    $doc_obj->set_metadata_element($section, "FileFormat", "PagedImage");
    return $result;
}
439
440
# XML start-tag callback.
#
#   PagedDocument   - current section becomes the document's top section
#   PageGroup, Page - a new child section is appended and becomes current;
#                     its pagenum, imgfile and txtfile attributes are
#                     processed
#   Metadata        - remembers the metadata name for xml_end_tag
#
# The element's attributes are read from the global hash %_.
sub xml_start_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;
    $self->{'element'} = $element;

    my $doc_obj = $self->{'doc_obj'};
    if ($element eq "PagedDocument") {
        $self->{'current_section'} = $doc_obj->get_top_section();
    }
    elsif ($element eq "PageGroup" || $element eq "Page") {
        # create a new section as a child
        $self->{'current_section'} = $doc_obj->insert_section($doc_obj->get_end_child($self->{'current_section'}));
        $self->{'num_pages'}++;

        # assign pagenum as what??
        my $pagenum = $_{'pagenum'};    #TODO!!
        if (defined $pagenum) {
            $doc_obj->set_utf8_metadata_element($self->{'current_section'}, 'PageNum', $pagenum);
        }

        my ($imgfile) = $_{'imgfile'};
        if (defined $imgfile) {
            # *****
            # What about support for rotate image (e.g. old ':r' notation)?
            $self->process_image($self->{'xml_file_dir'} . $imgfile, $imgfile, $doc_obj, $self->{'current_section'});
        }

        my ($txtfile) = $_{'txtfile'};
        if (defined($txtfile) && $txtfile ne "") {
            $self->process_text($self->{'xml_file_dir'} . $txtfile, $txtfile, $doc_obj, $self->{'current_section'});
        }
        else {
            # no text file: give the section some dummy text
            $self->add_dummy_text($doc_obj, $self->{'current_section'});
        }
    }
    elsif ($element eq "Metadata") {
        $self->{'metadata_name'} = $_{'name'};
    }
}
474
# XML end-tag callback.
#
#   Page, PageGroup - if the section has no Title but has a PageNum, the
#                     PageNum becomes the Title; then the current section
#                     moves back to its parent
#   Metadata        - the collected name/value pair is added to the current
#                     section (names containing '.' get an "ex." prefix)
#
# Any other end tag is ignored.
sub xml_end_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;

    my $doc_obj = $self->{'doc_obj'};
    if ($element eq "Page" || $element eq "PageGroup") {
        my $section = $self->{'current_section'};

        # if Title hasn't been assigned, set PageNum as Title
        if (!defined $doc_obj->get_metadata_element($section, "Title")) {
            my $pagenum = $doc_obj->get_metadata_element($section, "PageNum");
            if (defined $pagenum) {
                $doc_obj->add_utf8_metadata($section, "Title", $pagenum);
            }
        }

        # move the current section back to the parent
        $self->{'current_section'} = $doc_obj->get_parent_section($section);
    }
    elsif ($element eq "Metadata") {
        # text read in by XML::Parser is in Perl's binary byte value
        # form ... need to explicitly make it UTF-8
        my $meta_name      = decode("utf-8", $self->{'metadata_name'});
        my $metadata_value = decode("utf-8", $self->{'metadata_value'});

        if ($meta_name =~ /\./) {
            $meta_name = "ex.$meta_name";
        }

        $doc_obj->add_utf8_metadata($self->{'current_section'}, $meta_name, $metadata_value);

        # reset the accumulators for the next Metadata element
        $self->{'metadata_name'}  = "";
        $self->{'metadata_value'} = "";
    }
    # otherwise we ignore the end tag
}
505
506
# XML character-data callback: while inside a Metadata element that has a
# name, append the text chunk (held in $_) to the metadata value being
# collected.
sub xml_text {
    my ($self) = @_;

    $self->{'metadata_value'} .= $_
        if ($self->{'element'} eq "Metadata" && $self->{'metadata_name'});
}
515
# XML doctype callback: deliberately does nothing.
sub xml_doctype {
    return;
}
518
# Start a new document for an XML-format item file.
#
# Creates the doc object, sets its initial fields, records the directory of
# the item file in 'xml_file_dir' (xml_start_tag prepends it to image and
# text file names) and resets the page counter.
sub open_document {
    my $self = shift(@_);

    # create a new document (Class->new rather than the indirect-object form)
    $self->{'doc_obj'} = doc->new($self->{'filename'}, "indexed_doc", $self->{'file_rename_method'});
    # TODO: should this be the filename without its path?
    $self->set_initial_doc_fields($self->{'doc_obj'}, $self->{'filename'}, $self->{'processor'}, $self->{'metadata'});

    # Split into directory (with its trailing separator) and file name.
    # Without the quantifiers the match fails for any real path and
    # 'xml_file_dir' ends up undefined.
    my ($dir, $file) = $self->{'filename'} =~ /^(.*?)([^\/\\]*)$/;
    $self->{'xml_file_dir'} = $dir;
    $self->{'num_pages'}    = 0;
}
532
# Finish an XML-format document: record the page count and the maximum
# image dimensions on the top section, then clear the stored maxima.
sub close_document {
    my ($self) = @_;

    my $doc_obj    = $self->{'doc_obj'};
    my $topsection = $doc_obj->get_top_section();

    # add numpages metadata
    $doc_obj->set_utf8_metadata_element($topsection, 'NumPages', $self->{'num_pages'});

    for my $dimension ("MaxImageWidth", "MaxImageHeight") {
        $doc_obj->set_utf8_metadata_element($topsection, $dimension, $self->{$dimension});
    }

    $self->{'MaxImageWidth'}  = undef;
    $self->{'MaxImageHeight'} = undef;
}
548
549
# Set the metadata every new document starts with: the gsdlthistype
# ("Paged" or "Hierarchy", depending on the 'documenttype' option), the
# Source metadata, and - when the 'headerpage' option is on - dummy text in
# the top section.
sub set_initial_doc_fields {
    my ($self, $doc_obj, $filename_full_path, $processor, $metadata) = @_;

    my $topsection = $doc_obj->get_top_section();

    # Setting gsdlthistype to Paged ensures the document is treated as a
    # Paged doc even if the Titles are not numeric.
    my $this_type = ($self->{'documenttype'} eq 'paged') ? "Paged" : "Hierarchy";
    $doc_obj->set_utf8_metadata_element($topsection, "gsdlthistype", $this_type);

    my $filename_encoding = $self->deduce_filename_encoding(
        $filename_full_path, $metadata, $self->{'filename_encoding'});
    $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);

    # if we want a header page, we need to add some text into the top
    # section, otherwise this section will become invisible
    if ($self->{'headerpage'}) {
        $self->add_dummy_text($doc_obj, $topsection);
    }
}
573
# Scan an XML-format item file and block every file named in an
# imgfile="..." or txtfile="..." attribute.
#
# Parameters:
#   $filename_full_path - full path of the item file
#   $dir                - prefix prepended to each attribute value
#   $block_hash         - passed to util::block_filename for each file
#
# All occurrences on a line are blocked, so item files with several Page
# elements on one line are handled too. Dies if the file cannot be opened.
sub scan_xml_for_files_to_block {
    my $self = shift(@_);
    my ($filename_full_path, $dir, $block_hash) = @_;

    open(my $item_fh, '<', $filename_full_path)
        or die "couldn't open $filename_full_path to work out which files to block\n";

    while (defined(my $line = <$item_fh>)) {
        next unless $line =~ /\w/;

        while ($line =~ /(?:imgfile|txtfile)=\"([^\"]+)\"/g) {
            &util::block_filename($block_hash, $dir . $1);
        }
    }
    close($item_fh);
}
594
# Scan a plain-format item file and block every image and text file listed
# on its page lines.
#
# Parameters:
#   $filename_full_path - full path of the item file
#   $dir                - directory joined to each file name
#   $block_hash         - passed to util::block_filename for each file
#
# Blank lines, '#' comment lines and <name>value metadata lines are
# skipped. Dies if the file cannot be opened.
sub scan_item_for_files_to_block {
    my $self = shift(@_);
    my ($filename_full_path, $dir, $block_hash) = @_;

    open(my $item_fh, '<', $filename_full_path)
        or die "couldn't open $filename_full_path to work out which files to block\n";

    while (defined(my $line = <$item_fh>)) {
        next unless $line =~ /\w/;
        chomp $line;
        next if $line =~ /^#/;    # ignore comment lines

        # Ignore metadata lines. The quantifiers matter: without them only
        # lines such as "<T> x " match, and real metadata lines would be
        # split on ':' as if they were page lines.
        next if ($line =~ /^<([^>]*)>\s*(.*?)\s*$/);

        # line should be like page:imagefilename:textfilename:r
        $line =~ s/^\s+//;        # remove space at the front
        $line =~ s/\s+$//;        # remove space at the end
        my ($pagenum, $imgname, $txtname, $rotate) = split /:/, $line;

        # block the image file if there is one
        if (defined $imgname && $imgname ne "") {
            &util::block_filename($block_hash, &util::filename_cat($dir, $imgname));
        }

        # block the text file if there is one
        if (defined $txtname && $txtname ne "") {
            &util::block_filename($block_hash, &util::filename_cat($dir, $txtname));
        }
    }
    close($item_fh);
}
624
# Build a document object from a plain-format item file.
#
# Each <name>value line becomes metadata on the top section (names
# containing '.' get an "ex." prefix). Every other non-blank, non-comment
# line is treated as a page line of the form
#     pagenum:imagefilename:textfilename:r
# and produces one child section titled with the page number.
#
# Parameters:
#   $filename_full_path - full path of the item file
#   $dir                - prefix prepended to image/text file names
#   $filename_no_path   - item file name (unused here)
#   $processor, $metadata - passed on to set_initial_doc_fields
#
# Returns the new document object. Dies if the item file cannot be opened.
sub process_item {
    my $self = shift(@_);
    my ($filename_full_path, $dir, $filename_no_path, $processor, $metadata) = @_;

    my $doc_obj = doc->new($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
    $self->set_initial_doc_fields($doc_obj, $filename_full_path, $processor, $metadata);
    my $topsection = $doc_obj->get_top_section();

    open(my $item_fh, '<', $filename_full_path) or die "couldn't open $filename_full_path\n";
    my $num = 0;
    while (defined(my $line = <$item_fh>)) {

        # Since process_item is called not on an XML item file, but a text item file
        # don't decode into UTF8 the text that was read in, since it's already UTF-8
        #$line = decode("utf-8",$line);

        next unless $line =~ /\w/;
        chomp $line;
        next if $line =~ /^#/;    # ignore comment lines

        # Metadata line: <name>value. Both groups need their quantifiers,
        # otherwise real metadata lines fall through to the page branch.
        if ($line =~ /^<([^>]*)>\s*(.*?)\s*$/) {
            my $meta_name  = $1;
            my $meta_value = $2;
            if ($meta_name =~ /\./) {
                $meta_name = "ex.$meta_name";
            }
            $doc_obj->set_utf8_metadata_element($topsection, $meta_name, $meta_value);
        }
        else {
            $num++;
            # line should be like page:imagefilename:textfilename:r - the r is
            # optional -> means rotate the image 180 deg
            $line =~ s/^\s+//;    # remove space at the front
            $line =~ s/\s+$//;    # remove space at the end
            my ($pagenum, $imgname, $txtname, $rotate) = split /:/, $line;

            # create a new section for each image file
            my $cursection = $doc_obj->insert_section($doc_obj->get_end_child($topsection));
            # the page number becomes the Title
            $doc_obj->set_utf8_metadata_element($cursection, 'Title', $pagenum);

            # process the image for this page if there is one
            if (defined $imgname && $imgname ne "") {
                my $result1 = $self->process_image($dir . $imgname, $imgname, $doc_obj, $cursection, $rotate);
                # NOTE(review): process_image returns 0 (not undef) for a
                # missing file, so this message is only printed if
                # generate_images returns undef - confirm what is intended.
                if (!defined $result1) {
                    print "PagedImagePlugin: couldn't process image \"$dir$imgname\" for item \"$filename_full_path\"\n";
                }
            }

            # process the text file if one is there
            if (defined $txtname && $txtname ne "") {
                my $result2 = $self->process_text($dir . $txtname, $txtname, $doc_obj, $cursection);

                # process_text returns 0 on failure and 1 on success, so test
                # for falseness; a definedness test would never fire and the
                # section would be left without any text.
                if (!$result2) {
                    print "PagedImagePlugin: couldn't process text file \"$dir$txtname\" for item \"$filename_full_path\"\n";
                    $self->add_dummy_text($doc_obj, $cursection);
                }
            }
            else {
                # otherwise add in some dummy text
                $self->add_dummy_text($doc_obj, $cursection);
            }
        }
    }
    close($item_fh);

    # add numpages metadata
    $doc_obj->set_utf8_metadata_element($topsection, 'NumPages', "$num");

    $doc_obj->set_utf8_metadata_element($topsection, "MaxImageWidth",  $self->{'MaxImageWidth'});
    $doc_obj->set_utf8_metadata_element($topsection, "MaxImageHeight", $self->{'MaxImageHeight'});
    $self->{'MaxImageWidth'}  = undef;
    $self->{'MaxImageHeight'} = undef;

    return $doc_obj;
}
700
# Read a page's text file and add its contents to a section.
#
# Parameters:
#   $filename_full_path - full path of the text file
#   $file               - text file name as given in the item file (unused)
#   $doc_obj            - document object being built
#   $cursection         - section that receives the text
#
# If the file looks like a complete HTML page, only the contents of its
# <body> are added, unescaped. Otherwise '<' and '>' are escaped and the
# text is wrapped in <pre> tags.
#
# Returns 0 if the file does not exist, 1 otherwise.
sub process_text {
    my $self = shift(@_);
    my ($filename_full_path, $file, $doc_obj, $cursection) = @_;

    # check that the text file exists!!
    if (!-f $filename_full_path) {
        print "PagedImagePlugin: ERROR: File $filename_full_path does not exist, skipping\n";
        return 0;
    }

    # remember that this text file was one of our source files, but only
    # if we are not processing a tmp file
    if (!$self->{'processing_tmp_files'}) {
        $doc_obj->associate_source_file($filename_full_path);
    }

    # Do encoding stuff
    my ($language, $encoding) = $self->textcat_get_language_encoding($filename_full_path);

    my $text = "";
    &ReadTextFile::read_file($self, $filename_full_path, $encoding, $language, \$text);    # already decoded as utf8
    if (!length($text)) {
        # It's a bit unusual but not out of the question to have no text, so just give a warning
        print "PagedImagePlugin: WARNING: $filename_full_path contains no text\n";
    }

    # we need to escape the escape character, or else mg will convert into
    # eg literal newlines, instead of leaving the text as '\n'
    $text =~ s/\\/\\\\/g;    # macro language
    $text =~ s/_/\\_/g;      # macro language

    if ($text =~ m/<html.*?>\s*<head.*?>.*<\/head>\s*<body.*?>(.*)<\/body>\s*<\/html>\s*$/is) {
        # looks like HTML input
        # no need to escape < and > or put in <pre> tags
        $text = $1;

        # add text to document object
        $doc_obj->add_utf8_text($cursection, "$text");
    }
    else {
        # plain text: escape angle brackets so they display literally
        $text =~ s/</&lt;/g;
        $text =~ s/>/&gt;/g;

        # insert preformat tags and add text to document object
        $doc_obj->add_utf8_text($cursection, "<pre>\n$text\n</pre>");
    }

    return 1;
}
752
753
# Post-processing hook: have ImageConverter remove its temporary files.
sub clean_up_after_doc_obj_processing {
    my ($self) = @_;

    return $self->ImageConverter::clean_up_temporary_files();
}
759
760	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: