Context Navigation

source: main/trunk/greenstone2/perllib/plugins/PagedImagePlugin.pm@ 24548

Last change on this file since 24548 was 24548, checked in by ak19, 13 years ago
Part 2 of previous commit (r24547). Added new abstract plugin MetadataRead that defines can_process_this_file_for_metadata that MetadataPlugin subclasses can inherit (if MetadataRead is listed first in the ISA inheritance list) and which will then override the one defined in BasePlugin. For now committing MARC, ISIS and OAIPlugins which now additionally inherit from MetadataRead. Other metadata plugins also need to be committed.
Property svn:executable set to `*`
Property svn:keywords set to `Author Date Id Revision`
File size: 26.2 KB

Line
1	###########################################################################
2	#
3	# PagedImagePlugin.pm -- plugin for sets of images and OCR text that
4	# make up a document
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	# PagedImagePlugin
28	# processes sequences of images, with optional OCR text
29	#
30	# This plugin takes *.item files, which contain metadata and lists of image
31	# files, and produces a document containing sections, one for each page.
32	# The files should be named something.item, then you can have more than one
33	# book in a directory. You will need to create these files, one for each
34	# document/book.
35	#
36	#There are two formats for the item files: a plain text format, and an xml
37	#format. You can use either format, and can have both formats in the same
38	#collection if you like. If you use the plain format, you must not start the
39	#file off with <PagedDocument>
40
41	#### PLAIN FORMAT
42	# The format of the xxx.item file is as follows:
43	# The first lines contain any metadata for the whole document
44	# <metadata-name>metadata-value
45	# eg.
46	# <Title>Snail farming
47	# <Date>19230102
48	# Then comes a list of pages, one page per line, each line has the format
49	#
50	# pagenum:imagefile:textfile:r
51	#
52	# page num and imagefile are required. pagenum is used for the Title
53	# of the section, and in the display is shown as page <pagenum>.
54	# imagefile is the image for the page. textfile is an optional text
55	# file containing the OCR (or any) text for the page - this gets added
56	# as the text for the section. r is optional, and signals that the image
57	# should be rotated 180deg. Eg use this if the image has been made upside down.
58	# So an example item file looks like:
59	# <Title>Snail farming
60	# <Date>19960403
61	# 1:p1.gif:p1.txt:
62	# 2:p2.gif::
63	# 3:p3.gif:p3.txt:
64	# 3b:p3b.gif:p3b.txt:r
65	# The second page has no text, the fourth page is a back page, and
66	# should be rotated.
67	#
68
69	#### XML FORMAT
70	# The xml format looks like the following
71	#<PagedDocument>
72	#<Metadata name="Title">The Title of the entire document</Metadata>
73	#<Page pagenum="1" imgfile="xxx.jpg" txtfile="yyy.txt">
74	#<Metadata name="Title">The Title of this page</Metadata>
75	#</Page>
76	#... more pages
77	#</PagedDocument>
78	#PagedDocument contains a list of Pages, Metadata and PageGroups. Any metadata
79	#that is not inside another tag will belong to the document.
80	#Each Page has a pagenum (not used at the moment), an imgfile and/or a txtfile.
81	#These are both optional - if neither is used, the section will have no content.
82	#Pages can also have metadata associated with them.
83	#PageGroups can be introduced at any point - they can contain Metadata and Pages and other PageGroups. They are used to introduce hierarchical structure into the document.
84	#For example
85	#<PagedDocument>
86	#<PageGroup>
87	#<Page>
88	#<Page>
89	#</PageGroup>
90	#<Page>
91	#</PagedDocument>
92	#would generate a structure like
93	#X
94	#--X
95	# --X
96	# --X
97	#--X
98	#PageGroup tags can also have imgfile/textfile metadata if you like - this way they get some content themselves.
99
100	#Currently the XML structure doesn't work very well with the paged document type, unless you use numerical Titles for each section.
101	#There is still a bit of work to do on this format:
102	#* enable other text file types, eg html, pdf etc
103	#* make the document paging work properly
104	#* add pagenum as Title unless a Title is present?
105
106	# All the supplementary image and text files should be in the same folder as
107	# the .item file.
108	#
109	# To display the images instead of the document text, you can use [srcicon]
110	# in the DocumentText format statement.
111	# For example,
112	#
113	# format DocumentText "<center><table width=_pagewidth_><tr><td>[srcicon]</td></tr></table></center>"
114	#
115	# To have it create thumbnail size images, use the '-create_thumbnail' option.
116	# To have it create medium size images for display, use the '-create_screenview'
117	# option. As usual, running
118	# 'perl -S pluginfo.pl PagedImagePlugin' will list all the options.
119
120	# If you want the resulting documents to be presented with a table of
121	# contents, use '-documenttype hierarchy', otherwise they will have
122	# next and previous arrows, and a goto page X box.
123
124	# If you have used -create_screenview, you can also use [screenicon] in the format
125	# statement to display the smaller image. Here is an example that switches
126	# between the two:
127	#
128	# format DocumentText "<center><table width=_pagewidth_><tr><td>{If}{_cgiargp_ eq full,<a href='_httpdocument_&d=_cgiargd_&p=small'>Switch to small version.</a>,<a href='_httpdocument_&d=_cgiargd_&p=full'>Switch to fullsize version</a>}</td></tr><tr><td>{If}{_cgiargp_ eq full,<a href='_httpdocument_&d=_cgiargd_&p=small' title='Switch to small version'>[srcicon]</a>,<a href='_httpdocument_&d=_cgiargd_&p=full' title='Switch to fullsize version'>[screenicon]</a>}</td></tr></table></center>"
129	#
130	# Additional metadata can be added into the .item files, alternatively you can
131	# use normal metadata.xml files, with the name of the xxx.item file as the
132	# FileName (only for document level metadata).
133
134	package PagedImagePlugin;
135
136	use Encode;
137	use ReadXMLFile;
138	use ReadTextFile;
139	use ImageConverter;
140	use MetadataRead;
141
142	use strict;
143	no strict 'refs'; # allow filehandles to be variables and viceversa
144
# Inheritance chain. MetadataRead is listed first so that the methods it
# defines are found before the ones reachable through the other parents.
BEGIN {
    @PagedImagePlugin::ISA = ('MetadataRead', 'ReadXMLFile', 'ReadTextFile', 'ImageConverter');
}

# Allowed values for the -documenttype option.
my $type_list = [
    {
        'name' => "paged",
        'desc' => "{PagedImagePlugin.documenttype.paged}",
    },
    {
        'name' => "hierarchy",
        'desc' => "{PagedImagePlugin.documenttype.hierarchy}",
    },
];

# Command-line arguments this plugin adds on top of those of its parents.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasePlugin.process_exp}",
        'type' => "string",
        'deft' => &get_default_process_exp(),
        'reqd' => "no",
    },
    {
        'name' => "title_sub",
        'desc' => "{HTMLPlugin.title_sub}",
        'type' => "string",
        'deft' => "",
    },
    {
        'name' => "headerpage",
        'desc' => "{PagedImagePlugin.headerpage}",
        'type' => "flag",
        'reqd' => "no",
    },
    {
        'name' => "documenttype",
        'desc' => "{PagedImagePlugin.documenttype}",
        'type' => "enum",
        'list' => $type_list,
        'deft' => "paged",
        'reqd' => "no",
    },
    {
        'name'      => "processing_tmp_files",
        'desc'      => "{BasePlugin.processing_tmp_files}",
        'type'      => "flag",
        'hiddengli' => "yes",
    },
];


# Plugin description record pushed onto the option list by new().
my $options = {
    'name'     => "PagedImagePlugin",
    'desc'     => "{PagedImagePlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
187
# Constructor: registers this plugin's arguments and options, constructs one
# object per parent class and merges them into the object that is returned.
sub new {
    my $class = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@$pluginlist, $class);
    push(@{ $hashArgOptLists->{"ArgList"} }, @{$arguments});
    push(@{ $hashArgOptLists->{"OptList"} }, $options);

    # Only the ReadTextFile constructor receives the extra trailing 1.
    my $image_converter = ImageConverter->new($pluginlist, $inputargs, $hashArgOptLists);
    my $text_reader     = ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists, 1);
    my $xml_reader      = ReadXMLFile->new($pluginlist, $inputargs, $hashArgOptLists);

    my $self = BasePlugin::merge_inheritance($image_converter, $text_reader, $xml_reader);

    # Update $self used by XML::Parser so it finds callback functions
    # such as start_document here and not in ReadXMLFile (which is what
    # $self was when new XML::Parser was done).
    #
    # If the $self returned by this constructor is the same as the one
    # used in ReadXMLFile (e.g. in the GreenstoneXMLPlugin) then this step
    # isn't necessary.
    #
    # Consider embedding this type of assignment into merge_inheritance
    # to help catch all cases?
    $xml_reader->{'parser'}->{'PluginObj'} = $self;

    return bless($self, $class);
}
216
217
# Initialise the plugin: run the inherited init with the caller's arguments
# (verbosity, output handle, fail handle), then the ImageConverter one.
sub init {
    my $self = shift(@_);

    $self->SUPER::init(@_);
    return $self->ImageConverter::init();
}
225
# Called with (pluginfo, base_dir, processor, maxdocs); forwards the same
# arguments to the inherited begin and then to ImageConverter's begin.
sub begin {
    my $self = shift(@_);

    $self->SUPER::begin(@_);
    return $self->ImageConverter::begin(@_);
}
233
# Default value of the process_exp option: a regular expression (as a
# string) matching file names that end in ".item".
sub get_default_process_exp {
    return '\.item$';
}
239
# Name of the root element of an XML-format item file.
sub get_doctype {
    return "PagedDocument";
}
245
246
# Deliberately bypass ReadXMLFile's implementation and dispatch straight to
# the BasePlugin one, passing the caller's arguments through untouched.
sub can_process_this_file {
    my $self = shift(@_);

    return $self->BasePlugin::can_process_this_file(@_);
}
252
# Instead of a block exp, scan the item file and record every text and image
# file it mentions in $block_hash, so those files are not processed on their
# own by other plugins.
#
#   $filename_full_path - full path of the .item file
#   $block_hash         - hash passed on to the scan_* helpers to be filled in
sub store_block_files
{
    my $self = shift(@_);
    my ($filename_full_path, $block_hash) = @_;

    my $xml_version = $self->is_xml_item_file($filename_full_path);

    # do we need to do this?
    # does BOM interfere just with XML parsing? In that case don't need it here
    # if we do it here, we are modifying the file before we have worked out if
    # its new or not, so it will always be reimported.
    #$self->tidy_item_file($filename_full_path);

    # Everything up to and including the last path separator is the directory
    # that the files named in the item file are relative to.
    my ($dir) = $filename_full_path =~ /^(.*?)[^\/\\]*$/;

    if ($xml_version) {
        $self->scan_xml_for_files_to_block($filename_full_path, $dir, $block_hash);
    }
    else {
        $self->scan_item_for_files_to_block($filename_full_path, $dir, $block_hash);
    }
}
278
# Use BasePlugin's read rather than the one ReadXMLFile provides; all
# arguments are handed on unchanged.
sub read
{
    my $self = shift(@_);

    return $self->BasePlugin::read(@_);
}
286
287
288
# Build the document object for one .item file.
#
# Decides whether the item file is in XML or plain-text format, tidies the
# file on disk, builds the document through the matching code path and then
# adds the document-level metadata.
#
# Returns the list (1, $doc_obj).
sub read_into_doc_obj {
    my $self = shift(@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    print $outhandle "PagedImagePlugin processing \"$filename_full_path\"\n"
        if $verbosity > 1;
    print STDERR "<Processing n='$file' p='PagedImagePlugin'>\n" if ($gli);

    # reset before the pages are processed; both values are written into the
    # document's top section once all pages are done
    $self->{'MaxImageWidth'}  = 0;
    $self->{'MaxImageHeight'} = 0;

    # here we need to decide if we have an old text .item file, or a new xml
    # .item file
    my $xml_version = $self->is_xml_item_file($filename_full_path);

    $self->tidy_item_file($filename_full_path);

    my $doc_obj;
    if ($xml_version) {
        # careful checking needed here!! are we using local xml handlers or super ones
        $self->ReadXMLFile::read($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
        $doc_obj = $self->{'doc_obj'};
    }
    else {
        # split the full path into its directory (with trailing separator)
        # and the bare item file name
        my ($dir, $item_file) = $filename_full_path =~ /^(.*?)([^\/\\]*)$/;

        # process the .item file
        $doc_obj = $self->process_item($filename_full_path, $dir, $item_file, $processor, $metadata);
    }

    my $section = $doc_obj->get_top_section();

    $doc_obj->add_utf8_metadata($section, "Plugin", "$self->{'plugin_type'}");
    $doc_obj->add_metadata($section, "FileFormat", "PagedImage");

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    $self->add_associated_files($doc_obj, $filename_full_path);
    $self->extra_metadata($doc_obj, $section, $metadata);
    $self->auto_extract_metadata($doc_obj);

    # if we haven't found any Title so far, assign one
    $self->title_fallback($doc_obj, $section, $filename_no_path);

    $self->add_OID($doc_obj);
    return (1, $doc_obj);
}
341
# Decide which item-file format a file uses.
#
# The test is whether the first line containing a word character mentions
# <PagedDocument: if so the file is treated as XML.
#
#   $filename - path of the item file
#
# Returns 1 for an XML item file, 0 otherwise (including an empty file).
# Dies if the file cannot be opened.
sub is_xml_item_file {
    my $self = shift(@_);
    my ($filename) = @_;

    # three-argument open: the file name is never interpreted as a mode
    open(my $item_fh, '<', $filename) or die "couldn't open $filename\n";

    my $xml_version = 0;
    while (defined(my $line = <$item_fh>)) {
        next unless $line =~ /\w/;      # skip blank lines
        $xml_version = 1 if $line =~ /<PagedDocument/;
        last;                           # only the first real line counts
    }

    close($item_fh);
    return $xml_version;
}
368
# Rewrite an item file in place after cleaning it up:
#   * a UTF-8 byte order mark at the very start of the file is removed,
#   * vertical-tab characters (seen in some metadata titles) are removed,
#   * bare '&' characters are escaped as '&amp;'.
#
# An '&' that already starts an entity or character reference is left alone,
# so tidying the same file again on a later import does not double-escape it.
#
# The cleaned text goes to "backup.item" in the current working directory,
# is copied over $filename and the backup is then removed.
# Dies if either file cannot be opened or the backup cannot be written.
sub tidy_item_file {
    my $self = shift(@_);
    my ($filename) = @_;

    require File::Copy;

    my $backup_filename = "backup.item";
    open(my $item_fh, '<', $filename) or die "couldn't open $filename\n";
    open(my $backup_fh, '>', $backup_filename) or die "couldn't write to $backup_filename\n";

    my $at_first_line = 1;
    while (defined(my $line = <$item_fh>)) {
        if ($at_first_line) {
            $line =~ s/^\xEF\xBB\xBF//;    # strip BOM
            $at_first_line = 0;
        }
        # Tidy up the item file: some metadata titles contain \vt (vertical tab)
        $line =~ s/\x0B+//g;
        $line =~ s/&(?!(?:[A-Za-z][A-Za-z0-9]*|#[0-9]+|#x[0-9A-Fa-f]+);)/&amp;/g;
        print $backup_fh $line;
    }
    close($item_fh);
    # buffered write errors only surface when the handle is closed
    close($backup_fh) or die "couldn't write to $backup_filename\n";

    File::Copy::copy($backup_filename, $filename);
    &util::rm($backup_filename);
}
394
# Produce a copy of an image rotated by 180 degrees.
#
#   $filename_full_path - path of the image to rotate
#
# Asks $self->convert for a rotated version in the same file type and pulls
# the name of the new file out of the text that comes back (the part after
# "=>" that ends in the same extension).
#
# Returns the path of the rotated file if it exists, otherwise the path that
# was passed in.
sub rotate_image {
    my $self = shift(@_);
    my ($filename_full_path) = @_;

    my ($this_filetype) = $filename_full_path =~ /\.([^\.]*)$/;
    my $result = $self->convert($filename_full_path, $this_filetype, "-rotate 180", "ROTATE");

    # The extension is data, not a pattern: quote it so that characters such
    # as '+' in it are matched literally.
    my $quoted_filetype = defined $this_filetype ? quotemeta($this_filetype) : "";
    my ($new_filename);
    if (defined $result) {
        ($new_filename) = ($result =~ /=>(.*\.$quoted_filetype)/);
    }

    if (defined $new_filename && -e $new_filename) {
        return $new_filename;
    }

    # something's gone wrong
    return $filename_full_path;
}
409
# Attach one page image to a section of the document.
#
#   $filename_full_path - full path of the image file
#   $filename_no_path   - image file name as written in the item file
#   $doc_obj            - document object being built
#   $section            - section the image belongs to
#   $rotate             - optional; "r" means rotate the image by 180 degrees
#
# Returns 0 if the name is empty or the file does not exist, or if image
# conversion is not available; otherwise whatever generate_images returns.
sub process_image {
    my $self = shift(@_);
    my ($filename_full_path, $filename_no_path, $doc_obj, $section, $rotate) = @_;

    # check the filenames
    return 0 if ($filename_no_path eq "" || !-f $filename_full_path);

    # remember that this image file was one of our source files, but only
    # if we are not processing a tmp file
    if (!$self->{'processing_tmp_files'}) {
        $doc_obj->associate_source_file($filename_full_path);
    }

    # do rotation
    if ((defined $rotate) && ($rotate eq "r")) {
        # we get a new temporary file which is rotated
        $filename_full_path = $self->rotate_image($filename_full_path);
    }

    # do generate images
    my $result = 0;
    if ($self->{'image_conversion_available'} == 1) {
        # do we need to convert $filename_no_path to utf8/url encoded?
        # We are already reading in from a file, what encoding is it in???
        my $url_encoded_full_filename
            = &unicode::raw_filename_to_url_encoded($filename_full_path);
        $result = $self->generate_images($filename_full_path, $url_encoded_full_filename, $doc_obj, $section);
    }

    # overwrite the one set in ImageConverter
    $doc_obj->set_metadata_element($section, "FileFormat", "PagedImage");
    return $result;
}
440
441
# XML start-tag handler.
#
#   PagedDocument   - makes the document's top section the current section
#   PageGroup, Page - appends a new child section to the current section,
#                     makes it current, and attaches the image and text
#                     files named in the tag's attributes
#   Metadata        - remembers the metadata name until the end tag
#
# Attributes are read from %_ (presumably filled in by the XML parser before
# this handler is called - confirm against ReadXMLFile).
sub xml_start_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;
    $self->{'element'} = $element;

    my $doc_obj = $self->{'doc_obj'};
    if ($element eq "PagedDocument") {
        $self->{'current_section'} = $doc_obj->get_top_section();
    }
    elsif ($element eq "PageGroup" || $element eq "Page") {
        # create a new section as a child
        my $section = $doc_obj->insert_section($doc_obj->get_end_child($self->{'current_section'}));
        $self->{'current_section'} = $section;
        $self->{'num_pages'}++;

        # assign pagenum as what??
        my $pagenum = $_{'pagenum'}; #TODO!!
        if (defined $pagenum) {
            $doc_obj->set_utf8_metadata_element($section, 'PageNum', $pagenum);
        }

        my $imgfile = $_{'imgfile'};
        if (defined $imgfile) {
            # *****
            # What about support for rotate image (e.g. old ':r' notation)?
            $self->process_image($self->{'xml_file_dir'} . $imgfile, $imgfile, $doc_obj, $section);
        }

        # process_text returns 0 when the text file is missing; fall back to
        # dummy text in that case too, so the section never ends up without
        # any text at all
        my $txtfile = $_{'txtfile'};
        my $text_added = 0;
        if (defined($txtfile) && $txtfile ne "") {
            $text_added = $self->process_text($self->{'xml_file_dir'} . $txtfile, $txtfile, $doc_obj, $section);
        }
        if (!$text_added) {
            $self->add_dummy_text($doc_obj, $section);
        }
    }
    elsif ($element eq "Metadata") {
        $self->{'metadata_name'} = $_{'name'};
    }
}
475
# XML end-tag handler.
#
#   Page, PageGroup - if the section has a PageNum but no Title, the PageNum
#                     becomes the Title; then the parent section becomes the
#                     current section again
#   Metadata        - stores the collected name/value pair on the current
#                     section and resets the collectors
#
# Any other end tag is ignored.
sub xml_end_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;

    my $doc_obj = $self->{'doc_obj'};
    if ($element eq "Page" || $element eq "PageGroup") {
        my $section = $self->{'current_section'};

        # if Title hasn't been assigned, set PageNum as Title
        if (!defined $doc_obj->get_metadata_element($section, "Title")
            && defined $doc_obj->get_metadata_element($section, "PageNum")) {
            $doc_obj->add_utf8_metadata($section, "Title", $doc_obj->get_metadata_element($section, "PageNum"));
        }

        # move the current section back to the parent
        $self->{'current_section'} = $doc_obj->get_parent_section($section);
    }
    elsif ($element eq "Metadata") {
        # text read in by XML::Parser is in Perl's binary byte value
        # form ... need to explicitly make it UTF-8
        my $meta_name      = decode("utf-8", $self->{'metadata_name'});
        my $metadata_value = decode("utf-8", $self->{'metadata_value'});

        # names containing a dot get the "ex." prefix
        if ($meta_name =~ /\./) {
            $meta_name = "ex.$meta_name";
        }

        $doc_obj->add_utf8_metadata($self->{'current_section'}, $meta_name, $metadata_value);
        $self->{'metadata_name'}  = "";
        $self->{'metadata_value'} = "";
    }
    # otherwise we ignore the end tag
}
506
507
# XML character-data handler: while inside a Metadata element that has a
# name, append the current chunk of text (found in $_) to the value being
# collected.
sub xml_text {
    my ($self, $expat) = @_;

    my $collecting = $self->{'element'} eq "Metadata" && $self->{'metadata_name'};
    $self->{'metadata_value'} .= $_ if $collecting;
}
516
# Doctype handler: intentionally does nothing.
sub xml_doctype {
    return;
}
519
# Start a new document for the XML item file named in $self->{'filename'}:
# creates the document object, sets its initial fields, remembers the
# directory of the item file (page files are looked up relative to it) and
# resets the page counter.
sub open_document {
    my $self = shift(@_);

    # create a new document
    $self->{'doc_obj'} = doc->new($self->{'filename'}, "indexed_doc", $self->{'file_rename_method'});
    # TODO is file filename_no_path??
    $self->set_initial_doc_fields($self->{'doc_obj'}, $self->{'filename'}, $self->{'processor'}, $self->{'metadata'});

    # everything up to and including the last path separator
    my ($dir) = $self->{'filename'} =~ /^(.*?)[^\/\\]*$/;
    $self->{'xml_file_dir'} = $dir;
    $self->{'num_pages'} = 0;
}
533
# Finish the current XML document: record the page count and the largest
# image dimensions on the top section, then clear the stored dimensions.
sub close_document {
    my ($self) = @_;

    my $doc_obj    = $self->{'doc_obj'};
    my $topsection = $doc_obj->get_top_section();

    # add numpages metadata, followed by the image size summary
    my @summary = (
        [ 'NumPages',       $self->{'num_pages'} ],
        [ "MaxImageWidth",  $self->{'MaxImageWidth'} ],
        [ "MaxImageHeight", $self->{'MaxImageHeight'} ],
    );
    foreach my $entry (@summary) {
        my ($name, $value) = @$entry;
        $doc_obj->set_utf8_metadata_element($topsection, $name, $value);
    }

    $self->{'MaxImageWidth'}  = undef;
    $self->{'MaxImageHeight'} = undef;
}
549
550
# Set the fields every new document starts out with: its gsdlthistype, its
# Source metadata and, when -headerpage is on, placeholder text for the top
# section.
sub set_initial_doc_fields {
    my ($self, $doc_obj, $filename_full_path, $processor, $metadata) = @_;

    my $topsection = $doc_obj->get_top_section();

    # "Paged" ensures the document is treated as a Paged doc even if Titles
    # are not numeric; every other documenttype value gives "Hierarchy"
    my $this_type = ($self->{'documenttype'} eq 'paged') ? "Paged" : "Hierarchy";
    $doc_obj->set_utf8_metadata_element($topsection, "gsdlthistype", $this_type);

    my $filename_encoding = $self->deduce_filename_encoding(
        $filename_full_path, $metadata, $self->{'filename_encoding'});
    $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);

    # if we want a header page, we need to add some text into the top
    # section, otherwise this section will become invisible
    if ($self->{'headerpage'}) {
        $self->add_dummy_text($doc_obj, $topsection);
    }
}
574
# Scan an XML-format item file and block every file it names in an imgfile
# or txtfile attribute (double-quoted values only).
#
#   $filename_full_path - path of the item file
#   $dir                - directory the named files are relative to
#   $block_hash         - hash handed to util::block_filename
#
# All attributes on a line are picked up, so several Page elements may share
# a line. Dies if the item file cannot be opened.
sub scan_xml_for_files_to_block
{
    my $self = shift(@_);
    my ($filename_full_path, $dir, $block_hash) = @_;

    open(my $item_fh, '<', $filename_full_path)
        or die "couldn't open $filename_full_path to work out which files to block\n";

    while (defined(my $line = <$item_fh>)) {
        next unless $line =~ /\w/;

        while ($line =~ /(?:imgfile|txtfile)=\"([^\"]+)\"/g) {
            &util::block_filename($block_hash, &util::filename_cat($dir, $1));
        }
    }
    close($item_fh);
}
595
# Scan a plain-format item file and block the image and text file named on
# each page line.
#
#   $filename_full_path - path of the item file
#   $dir                - directory the named files are relative to
#   $block_hash         - hash handed to util::block_filename
#
# Blank lines, comment lines (#...) and metadata lines (<name>value) are
# skipped. Dies if the item file cannot be opened.
sub scan_item_for_files_to_block
{
    my $self = shift(@_);
    my ($filename_full_path, $dir, $block_hash) = @_;

    open(my $item_fh, '<', $filename_full_path)
        or die "couldn't open $filename_full_path to work out which files to block\n";

    while (defined(my $line = <$item_fh>)) {
        next unless $line =~ /\w/;
        chomp $line;
        next if $line =~ /^#/;                      # ignore comment lines
        next if $line =~ /^<([^>]*)>\s*(.*?)\s*$/;  # ignore metadata lines

        # line should be like page:imagefilename:textfilename:r
        $line =~ s/^\s+//;    # remove space at the front
        $line =~ s/\s+$//;    # remove space at the end
        my ($pagenum, $imgname, $txtname, $rotate) = split /:/, $line;

        # block the image file and the text file, whichever are present
        foreach my $name ($imgname, $txtname) {
            next unless defined $name && $name ne "";
            &util::block_filename($block_hash, &util::filename_cat($dir, $name));
        }
    }
    close($item_fh);
}
625
# Build a document from a plain-format item file.
#
#   $filename_full_path - path of the item file
#   $dir                - directory of the item file, including the trailing
#                         separator (page files are looked up as $dir.$name)
#   $filename_no_path   - item file name without its directory
#   $processor, $metadata - passed on to set_initial_doc_fields
#
# Metadata lines (<name>value) are set on the top section; every other
# non-blank, non-comment line is a page (pagenum:imagefile:textfile:r) and
# becomes a child section titled with its page number.
#
# Returns the document object. Dies if the item file cannot be opened.
sub process_item {
    my $self = shift(@_);
    my ($filename_full_path, $dir, $filename_no_path, $processor, $metadata) = @_;

    my $doc_obj = doc->new($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
    $self->set_initial_doc_fields($doc_obj, $filename_full_path, $processor, $metadata);
    my $topsection = $doc_obj->get_top_section();

    open(my $item_fh, '<', $filename_full_path) or die "couldn't open $filename_full_path\n";
    my $num = 0;
    while (defined(my $line = <$item_fh>)) {

        # Since process_item is called not on an XML item file, but a text item file
        # don't decode into UTF8 the text that was read in, since it's already UTF-8
        #$line = decode("utf-8",$line);

        next unless $line =~ /\w/;
        chomp $line;
        next if $line =~ /^#/;    # ignore comment lines

        if ($line =~ /^<([^>]*)>\s*(.*?)\s*$/) {
            # document-level metadata; names containing a dot get "ex."
            my $meta_name  = $1;
            my $meta_value = $2;
            if ($meta_name =~ /\./) {
                $meta_name = "ex.$meta_name";
            }
            $doc_obj->set_utf8_metadata_element($topsection, $meta_name, $meta_value);
            next;
        }

        $num++;
        # line should be like page:imagefilename:textfilename:r - the r is
        # optional -> means rotate the image 180 deg
        $line =~ s/^\s+//;    # remove space at the front
        $line =~ s/\s+$//;    # remove space at the end
        my ($pagenum, $imgname, $txtname, $rotate) = split /:/, $line;

        # create a new section for each image file
        my $cursection = $doc_obj->insert_section($doc_obj->get_end_child($topsection));
        # the page number becomes the Title
        $doc_obj->set_utf8_metadata_element($cursection, 'Title', $pagenum);

        # process the image for this page if there is one
        if (defined $imgname && $imgname ne "") {
            my $image_result = $self->process_image($dir . $imgname, $imgname, $doc_obj, $cursection, $rotate);
            # NOTE(review): process_image returns 0 rather than undef for a
            # missing file, so this message may never be printed - confirm
            # what generate_images returns before tightening the check
            if (!defined $image_result) {
                print "PagedImagePlugin: couldn't process image \"$dir$imgname\" for item \"$filename_full_path\"\n";
            }
        }

        # process the text file if one is there
        if (defined $txtname && $txtname ne "") {
            my $text_result = $self->process_text($dir . $txtname, $txtname, $doc_obj, $cursection);

            # process_text returns 0 (not undef) when it fails
            if (!$text_result) {
                print "PagedImagePlugin: couldn't process text file \"$dir$txtname\" for item \"$filename_full_path\"\n";
                $self->add_dummy_text($doc_obj, $cursection);
            }
        }
        else {
            # otherwise add in some dummy text
            $self->add_dummy_text($doc_obj, $cursection);
        }
    }

    close($item_fh);

    # add numpages metadata
    $doc_obj->set_utf8_metadata_element($topsection, 'NumPages', "$num");

    $doc_obj->set_utf8_metadata_element($topsection, "MaxImageWidth", $self->{'MaxImageWidth'});
    $doc_obj->set_utf8_metadata_element($topsection, "MaxImageHeight", $self->{'MaxImageHeight'});
    $self->{'MaxImageWidth'}  = undef;
    $self->{'MaxImageHeight'} = undef;

    return $doc_obj;
}
701
# Add the contents of one text file as the text of a section.
#
#   $filename_full_path - full path of the text file
#   $file               - text file name as written in the item file
#   $doc_obj            - document object being built
#   $cursection         - section that receives the text
#
# Text that looks like a complete HTML page contributes the contents of its
# <body>; anything else has < and > escaped and is wrapped in <pre> tags.
#
# Returns 0 if the file does not exist, 1 otherwise.
sub process_text {
    my $self = shift(@_);
    my ($filename_full_path, $file, $doc_obj, $cursection) = @_;

    # check that the text file exists!!
    if (!-f $filename_full_path) {
        print "PagedImagePlugin: ERROR: File $filename_full_path does not exist, skipping\n";
        return 0;
    }

    # remember that this text file was one of our source files, but only
    # if we are not processing a tmp file
    if (!$self->{'processing_tmp_files'}) {
        $doc_obj->associate_source_file($filename_full_path);
    }

    # Do encoding stuff
    my ($language, $encoding) = $self->textcat_get_language_encoding($filename_full_path);

    my $text = "";
    &ReadTextFile::read_file($self, $filename_full_path, $encoding, $language, \$text); # already decoded as utf8
    if (!length($text)) {
        # It's a bit unusual but not out of the question to have no text, so just give a warning
        print "PagedImagePlugin: WARNING: $filename_full_path contains no text\n";
    }

    # we need to escape the escape character, or else mg will convert into
    # eg literal newlines, instead of leaving the text as '\n'
    $text =~ s/\\/\\\\/g;    # macro language
    $text =~ s/_/\\_/g;      # macro language

    if ($text =~ m/<html.*?>\s*<head.*?>.*<\/head>\s*<body.*?>(.*)<\/body>\s*<\/html>\s*$/is) {
        # looks like HTML input
        # no need to escape < and > or put in <pre> tags
        $text = $1;

        # add text to document object
        $doc_obj->add_utf8_text($cursection, "$text");
    }
    else {
        # plain text: keep < and > from being read as markup
        $text =~ s/</&lt;/g;
        $text =~ s/>/&gt;/g;

        # insert preformat tags and add text to document object
        $doc_obj->add_utf8_text($cursection, "<pre>\n$text\n</pre>");
    }

    return 1;
}
753
754
# Post-processing hook: delegates to ImageConverter to remove its temporary
# files.
sub clean_up_after_doc_obj_processing {
    my ($self) = @_;

    return $self->ImageConverter::clean_up_temporary_files();
}
760
761	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: