Context Navigation

source: main/trunk/greenstone2/perllib/plugins/BaseImporter.pm@ 37048

Last change on this file since 37048 was 37048, checked in by davidb, 16 months ago
Useful support routine added that only sets the document field to say it has no text if the text field is empty. This routine helps plugins such as ImagePlugin that never used to have any text and would therefore set this field to 'no text'. Newer work such as GoogleVisionImagePlugin in Inherits from ImagePlugin and as a result of calling the Google Visions API can now find text in a image. This new support routine helps with setting dummy text on when needed, but if there is evidence of text that is stored in 'doc_obj' then it does not set dummy text; there is also a useful debugging statement added that prints out which inheritend plugin in checking the process expression -- commented out
Property svn:keywords set to `Author Date Id Revision`
File size: 36.4 KB

Line
1	###########################################################################
2	#
3	# BaseImporter.pm -- base class for all the import plugins
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2005 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package BaseImporter;
27
28	use strict;
29	no strict 'subs';
30	no strict 'refs'; # allow filehandles to be variables and viceversa
31
32	use File::Basename;
33	use Encode;
34	use Unicode::Normalize 'normalize';
35
36	use encodings;
37	use unicode;
38	use doc;
39	use ghtml;
40	use gsprintf 'gsprintf';
41	use util;
42	use FileUtils;
43
44	use CommonUtil;
45
46	BEGIN {
47	@BaseImporter::ISA = ( 'CommonUtil' );
48	}
49
50	# the different methods that can be applied when renaming
51	# imported documents and their associated files
52	our $file_rename_method_list =
53	[ { 'name' => "url",
54	'desc' => "{BaseImporter.rename_method.url}" },
55	{ 'name' => "base64",
56	'desc' => "{BaseImporter.rename_method.base64}" },
57	{ 'name' => "none",
58	'desc' => "{BaseImporter.rename_method.none}",
59	'hiddengli' => "yes" } ];
60
61	# here went encoding list stuff
62
63	our $oidtype_list =
64	[ { 'name' => "auto",
65	'desc' => "{BaseImporter.OIDtype.auto}" },
66	{ 'name' => "hash",
67	'desc' => "{import.OIDtype.hash}" },
68	{ 'name' => "hash_on_ga_xml",
69	'desc' => "{import.OIDtype.hash_on_ga_xml}" },
70	{ 'name' => "hash_on_full_filename",
71	'desc' => "{import.OIDtype.hash_on_full_filename}" },
72	{ 'name' => "assigned",
73	'desc' => "{import.OIDtype.assigned}" },
74	{ 'name' => "incremental",
75	'desc' => "{import.OIDtype.incremental}" },
76	{ 'name' => "filename",
77	'desc' => "{import.OIDtype.filename}" },
78	{ 'name' => "dirname",
79	'desc' => "{import.OIDtype.dirname}" },
80	{ 'name' => "full_filename",
81	'desc' => "{import.OIDtype.full_filename}" } ];
82
83	my $arguments =
84	[ { 'name' => "process_exp",
85	'desc' => "{BaseImporter.process_exp}",
86	'type' => "regexp",
87	'deft' => "",
88	'reqd' => "no" },
89	{ 'name' => "store_original_file",
90	'desc' => "{BaseImporter.store_original_file}",
91	'type' => "flag",
92	'reqd' => "no" },
93	{ 'name' => "associate_ext",
94	'desc' => "{BaseImporter.associate_ext}",
95	'type' => "string",
96	'reqd' => "no" },
97	{ 'name' => "associate_tail_re",
98	'desc' => "{BaseImporter.associate_tail_re}",
99	'type' => "string",
100	'reqd' => "no" },
101	{ 'name' => "OIDtype",
102	'desc' => "{import.OIDtype}",
103	'type' => "enum",
104	'list' => $oidtype_list,
105	# leave default empty so we can tell if its been set or not - if not set will use option from import.pl
106	'deft' => "auto",
107	'reqd' => "no" },
108	{ 'name' => "OIDmetadata",
109	'desc' => "{import.OIDmetadata}",
110	'type' => "metadata",
111	'deft' => "dc.Identifier",
112	'reqd' => "no" },
113	# { 'name' => "use_as_doc_identifier",
114	# 'desc' => "{BaseImporter.use_as_doc_identifier}",
115	# 'type' => "string",
116	# 'reqd' => "no" ,
117	# 'deft' => "" } ,
118	{ 'name' => "no_cover_image",
119	'desc' => "{BaseImporter.no_cover_image}",
120	'type' => "flag",
121	'reqd' => "no" },
122	{ 'name' => "file_rename_method",
123	'desc' => "{BaseImporter.file_rename_method}",
124	'type' => "enum",
125	'deft' => &get_default_file_rename_method(), # by default rename imported files and assoc files using this encoding
126	'list' => $file_rename_method_list,
127	'reqd' => "no"
128	}
129
130	# These options take a metadata field, and store its values as individual
131	#fields with true value.
132	#eg wmtb.TKLabel=tk.Clan,tk.Attribution =>tk.Clan=true, tk.Attribution=true.
133	#Implemented for tk lables, but then not needed as this is handled by
134	# xslt instead. Leaving this here in case useful in the future
135	# { 'name' => "store_metadata_values_as_fields_for",
136	# 'desc' => "For these (comma-separated list of) fields, take all values and store them as metadata fields in their own write, value=true ",
137	# 'type' => "metadata",
138	# 'reqd' => "no" },
139	# { 'name' => "namespace_for_new_fields",
140	# 'desc' => "add the specified namespace to the new fields (comma separated list, corresponding to the fields above",
141	# 'type'=> "string",
142	# 'reqd' => "no",
143	# 'deft' => ""},
144	# { 'name' => "include_section_level",
145	# 'desc' => "do we look for this metadata at subsection level too?",
146	# 'type' => "string",
147	# 'deft' => "false"}
148	];
149
150
151	my $options = { 'name' => "BaseImporter",
152	'desc' => "{BaseImporter.desc}",
153	'abstract' => "yes",
154	'inherits' => "yes",
155	'args' => $arguments };
156
157	sub new {
158
159	my ($class) = shift (@_);
160	my ($pluginlist,$inputargs,$hashArgOptLists,$auxiliary) = @_;
161	push(@$pluginlist, $class);
162
163	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
164	push(@{$hashArgOptLists->{"OptList"}},$options);
165
166	my $self = new CommonUtil($pluginlist, $inputargs, $hashArgOptLists,$auxiliary);
167
168	if ($self->{'info_only'}) {
169	# don't worry about any options etc
170	return bless $self, $class;
171	}
172
173	my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
174	$self->{'plugin_type'} = $plugin_name;
175
176	# remove ex. from OIDmetadata iff it's the only namespace prefix
177	$self->{'OIDmetadata'} =~ s/^ex\.([^.]+)$/$1/ if defined $self->{'OIDmetadata'};
178	$self->{'num_processed'} = 0;
179	$self->{'num_not_processed'} = 0;
180	$self->{'num_blocked'} = 0;
181	$self->{'num_archives'} = 0;
182	$self->{'cover_image'} = 1; # cover image is on by default
183	$self->{'cover_image'} = 0 if ($self->{'no_cover_image'});
184	$self->{'can_process_directories'} = 0;
185	#$self->{'option_list'} = $hashArgOptLists->{"OptList"};
186
187	my $associate_ext = $self->{'associate_ext'};
188	if ((defined $associate_ext) && ($associate_ext ne "")) {
189
190	my $associate_tail_re = $self->{'associate_tail_re'};
191	if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
192	my $outhandle = $self->{'outhandle'};
193	print $outhandle "Warning: can only specify 'associate_ext' or 'associate_tail_re'\n";
194	print $outhandle " defaulting to 'associate_tail_re'\n";
195	}
196	else {
197	my @exts = split(/,/,$associate_ext);
198
199	my @exts_bracketed = map { $_ = "(?:\\.$_)" } @exts;
200	my $associate_tail_re = join("\|",@exts_bracketed);
201	$self->{'associate_tail_re'} = $associate_tail_re;
202	}
203
204	delete $self->{'associate_ext'};
205	}
206
207	return bless $self, $class;
208
209	}
210
211	sub merge_inheritance
212	{
213	my $self = {};
214	my @child_selfs = @_;
215
216	foreach my $child_self (@child_selfs) {
217	foreach my $key (keys %$child_self) {
218	if (defined $self->{$key}) {
219	if ($self->{$key} ne $child_self->{$key}) {
220	# print STDERR "Warning: Conflicting value in multiple inheritance for '$key'\n";
221	# print STDERR "Existing stored value = $self->{$key}\n";
222	# print STDERR "New (child) value = $child_self->{$key}\n";
223	# print STDERR "Keeping existing value\n";
224	# Existing value seems to be option specified in collect.cfg
225
226	### $self->{$key} = $child_self->{$key};
227
228	}
229	else {
230	## print STDERR "****Info: Value $self->{$key} for $key already defined through multiple inheritance as the same value\n";
231	}
232
233	}
234	else {
235	$self->{$key} = $child_self->{$key};
236	}
237	}
238	}
239
240	return $self;
241	}
242
243	# initialize BaseImporter options
244	# if init() is overridden in a sub-class, remember to call BaseImporter::init()
245	sub init {
246	my $self = shift (@_);
247	my ($verbosity, $outhandle, $failhandle) = @_;
248
249	$self->SUPER::init(@_);
250
251	# set process_exp and block_exp to defaults unless they were
252	# explicitly set
253
254	if ((!$self->is_recursive()) and
255	(!defined $self->{'process_exp'}) \|\| ($self->{'process_exp'} eq "")) {
256
257	$self->{'process_exp'} = $self->get_default_process_exp ();
258	if ($self->{'process_exp'} eq "") {
259	warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
260	}
261	}
262
263	if ((!defined $self->{'block_exp'}) \|\| ($self->{'block_exp'} eq "")) {
264	$self->{'block_exp'} = $self->get_default_block_exp ();
265	}
266
267	}
268
269	sub begin {
270	my $self = shift (@_);
271	my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
272
273	if ($self->{'OIDtype'} eq "auto") {
274	# hasn't been set in the plugin, use the processor values
275	$self->{'OIDtype'} = $processor->{'OIDtype'};
276	$self->{'OIDmetadata'} = $processor->{'OIDmetadata'};
277	}
278	if ($self->{'OIDtype'} eq "hash") {
279	# should we hash on the file or on the doc xml??
280	$self->{'OIDtype'} = $self->get_oid_hash_type();
281	if ($self->{'OIDtype'} !~ /^(hash_on_file\|hash_on_ga_xml)$/) {
282	$self->{'OIDtype'} = "hash_on_file";
283	}
284	}
285	}
286
287	# This is called once if removeold is set with import.pl. Most plugins will do
288	# nothing but if a plugin does any stuff outside of creating doc obj, then
289	# it may need to clear something.
290	sub remove_all {
291	my $self = shift (@_);
292	my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
293	}
294
295	# This is called per document for docs that have been deleted from the
296	# collection. Most plugins will do nothing
297	# but if a plugin does any stuff outside of creating doc obj, then it may need
298	# to clear something.
299	sub remove_one {
300	my $self = shift (@_);
301
302	my ($file, $oids, $archivedir) = @_;
303	return 0 if $self->can_process_this_file($file);
304	return undef;
305	}
306
307	sub end {
308	# potentially called at the end of each plugin pass
309	# import.pl only has one plugin pass, but buildcol.pl has multiple ones
310
311	my ($self) = shift (@_);
312	}
313
314	sub deinit {
315	# called only once, after all plugin passes have been done
316
317	my ($self) = @_;
318	}
319
320	# default hashing type is to hash on the original file (or converted file)
321	# override this to return hash_on_ga_xml for filetypes where hashing on the
322	# file is no good eg video
323	sub get_oid_hash_type {
324
325	my $self = shift (@_);
326
327	return "hash_on_file";
328	}
329
330
331	# this function should be overridden to return 1
332	# in recursive plugins
333	sub is_recursive {
334	my $self = shift (@_);
335
336	return 0;
337	}
338
339	sub get_default_block_exp {
340	my $self = shift (@_);
341
342	return "";
343	}
344
345	sub get_default_process_exp {
346	my $self = shift (@_);
347
348	return "";
349	}
350
351
352	# rename imported files and assoc files using URL encoding by default
353	# as this will work for most plugins and give more legible filenames
354	sub get_default_file_rename_method() {
355	my $self = shift (@_);
356	return "url";
357	}
358
359	# returns this plugin's active (possibly user-selected) file_rename_method
360	sub get_file_rename_method() {
361	my $self = shift (@_);
362	my $rename_method = $self->{'file_rename_method'};
363	if($rename_method) {
364	return $rename_method;
365	} else {
366	return $self->get_default_file_rename_method();
367	}
368	}
369
370	# default implementation is to do nothing
371	sub store_block_files {
372
373	my $self =shift (@_);
374	my ($filename_full_path, $block_hash) = @_;
375
376	}
377
378	# put files to block into hash
379	sub use_block_expressions {
380
381	my $self =shift (@_);
382	my ($filename_full_path, $block_hash) = @_;
383
384	$filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);
385
386	if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
387	$self->block_filename($block_hash,$filename_full_path);
388	}
389
390	}
391
392	#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
393	sub block_cover_image
394	{
395	my $self =shift;
396	my ($filename, $block_hash) = @_;
397
398	$filename = &util::upgrade_if_dos_filename($filename);
399
400	if ($self->{'cover_image'}) {
401	my $coverfile = $filename;
402	$coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
403
404	#if there is no file extension, coverfile will be the same as filename
405	return if $coverfile eq $filename;
406
407	if (!&FileUtils::fileExists($coverfile)) {
408	$coverfile =~ s/jpg$/JPG/;
409	}
410	if (&FileUtils::fileExists($coverfile)) {
411	$self->block_filename($block_hash,$coverfile);
412	}
413	}
414
415	return;
416	}
417
418
419	# discover all the files that should be blocked by this plugin
420	# check the args ...
421	sub file_block_read {
422
423	my $self = shift (@_);
424	my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;
425	# Keep track of filenames with same root but different extensions
426	# Used to support -associate_ext and the more generalised
427	# -associate_tail_re
428	my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
429
430	if (!-d $filename_full_path) {
431	$block_hash->{'all_files'}->{$file} = 1;
432	}
433
434	my $associate_tail_re = $self->{'associate_tail_re'};
435	if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
436	my ($file_prefix,$file_ext)
437	= &util::get_prefix_and_tail_by_regex($filename_full_path,$associate_tail_re);
438	if ((defined $file_prefix) && (defined $file_ext)) {
439	my $shared_fileroot = $block_hash->{'shared_fileroot'};
440	if (!defined $shared_fileroot->{$file_prefix}) {
441	my $file_prefix_rec = { 'tie_to' => undef,
442	'exts' => {} };
443	$shared_fileroot->{$file_prefix} = $file_prefix_rec;
444	}
445
446	my $file_prefix_rec = $shared_fileroot->{$file_prefix};
447
448	if ($self->can_process_this_file($filename_full_path) && $file_ext !~ m/.\./) {
449	# This is the document the others should be tied to
450	$file_prefix_rec->{'tie_to'} = $file_ext;
451	}
452	else {
453	if ($file_ext =~ m/$associate_tail_re$/) {
454	# this file should be associated to the main one
455	$file_prefix_rec->{'exts'}->{$file_ext} = 1;
456	}
457	}
458
459	}
460	}
461
462	# check block expressions
463	$self->use_block_expressions($filename_full_path, $block_hash) unless $self->{'no_blocking'};
464
465	# now check whether we are actually processing this
466	if (!-f $filename_full_path \|\| !$self->can_process_this_file($filename_full_path)) {
467	return undef; # can't recognise
468	}
469
470	# if we have a block_exp, then this overrides the normal 'smart' blocking
471	$self->store_block_files($filename_full_path, $block_hash) unless ($self->{'no_blocking'} \|\| $self->{'block_exp'} ne "");
472
473	# block the cover image if there is one
474	if ($self->{'cover_image'}) {
475	$self->block_cover_image($filename_full_path, $block_hash);
476	}
477
478	return 1;
479	}
480
481	# plugins that rely on more than process_exp (eg XML plugins) can override this method
482	sub can_process_this_file {
483	my $self = shift(@_);
484	my ($filename) = @_;
485
486	if (-d $filename && !$self->{'can_process_directories'}) {
487	return 0;
488	}
489
490	# print STDERR "**** BaseImport::can_process_this_file(): ", ref($self), " checking $filename =~ /$self->{'process_exp'}/\n";
491
492	if ($self->{'process_exp'} ne "" && $filename =~ /$self->{'process_exp'}/) {
493	return 1;
494	}
495	return 0;
496
497	}
498
499	# Even if a plugin can extract metadata in its metadata_read pass,
500	# make the default return 'undef' so processing of the file continues
501	# down the pipeline, so other plugins can also have the opportunity to
502	# locate metadata and set it up in the extrametakeys variables that
503	# are passed around.
504
505	sub can_process_this_file_for_metadata {
506	my $self = shift(@_);
507
508	return undef;
509	}
510
511
512
513	# Notionally written to be called once for each document, it is however safe to
514	# call multiple times (as in the case of ImagePlugin) which calls this later on
515	# after the original image has potentially been converted to a new source image
516	# format (e.g. TIFF to PNG)
517
518	sub set_Source_metadata {
519	my $self = shift (@_);
520	my ($doc_obj, $raw_filename, $filename_encoding, $section) = @_;
521
522	# 1. Sets the filename (Source) for display encoded as Unicode if possible,
523	# and (as a fallback) using %xx if not for non-ascii chars
524	# 2. Sets the url ref (SourceFile) to the URL encoded version
525	# of filename for generated files
526
527	my ($unused_full_rf, $raw_file) = &util::get_full_filenames("", $raw_filename);
528
529	my $this_section = (defined $section)? $section : $doc_obj->get_top_section();
530
531	my $octet_file = $raw_file;
532
533	# UTF-8 version of filename
534	# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
535	# print STDERR "**** Setting Source Metadata given: $octet_file\n";
536	# }
537
538	# Deal with (on Windows) raw filenames that are in their
539	# abbreviated DOS form
540
541	if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
542	if ((defined $filename_encoding) && ($filename_encoding eq "unicode")) {
543	if (-e $raw_filename) {
544	my $unicode_filename = Win32::GetLongPathName($raw_filename);
545
546	my $unused_full_uf;
547	($unused_full_uf, $octet_file) = &util::get_full_filenames("", $unicode_filename);
548	}
549	}
550	}
551
552	my $url_encoded_filename;
553	if ((defined $filename_encoding) && ($filename_encoding ne "ascii")) {
554	# => Generate a pretty print version of filename that is mapped to Unicode
555
556	# Use filename_encoding to map raw filename to a Perl unicode-aware string
557	$url_encoded_filename = decode($filename_encoding,$octet_file);
558	}
559	else {
560	# otherwise generate %xx encoded version of filename for char > 127
561	$url_encoded_filename = &unicode::raw_filename_to_url_encoded($octet_file);
562	}
563
564	# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
565	# print STDERR "****** saving Source as: $url_encoded_filename\n";
566	# }
567
568	# In the case of converted files and (generalized) exploded documents, there
569	# will already be a source filename => store as OrigSource before overriding
570	my $orig_source = $doc_obj->get_metadata_element ($this_section, "Source");
571	if ((defined $orig_source) && ($orig_source !~ m/^\s*$/)) {
572	$doc_obj->set_utf8_metadata_element($this_section, "OrigSource", $orig_source);
573	}
574
575	# Source is the UTF8 display name - not necessarily the name of the file on the system
576	if ($ENV{'GSDLOS'} =~ m/^darwin$/i) {
577	# on Darwin want all display strings to be in composed form, then can search on that
578	$url_encoded_filename = normalize('C', $url_encoded_filename); # Normalisation Form 'C' (composition)
579	}
580	# set_utf8_metadata actually sets perl unicode aware strings. not utf8
581	$doc_obj->set_utf8_metadata_element($this_section, "Source", $url_encoded_filename);
582
583
584	my $renamed_raw_file = &util::rename_file($raw_file, $self->{'file_rename_method'});
585	# If using URL encoding, then SourceFile is the url-reference to url-encoded
586	# renamed_raw_url: it's a url that refers to the actual file on the system
587	# this call just replaces % with %25
588	my $renamed_raw_url = &unicode::filename_to_url($renamed_raw_file);
589
590	$doc_obj->set_utf8_metadata_element($this_section, "SourceFile",
591	$renamed_raw_url);
592
593	# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
594	# print STDERR "****** saving SourceFile as: $renamed_raw_url\n";
595	# }
596	}
597
598
599	# this should be called by all plugins to set the oid of the doc obj, rather
600	# than calling doc_obj->set_OID directly
601	sub add_OID {
602	my $self = shift (@_);
603	my ($doc_obj, $force) = @_;
604
605	# don't add one if there is one already set, unless we are forced to do so
606	return unless ($doc_obj->get_OID() =~ /^NULL$/ \|\| $force);
607	$doc_obj->set_OIDtype($self->{'OIDtype'}, $self->{'OIDmetadata'});
608
609	# see if there is a plugin specific set_OID function
610	if (defined ($self->can('set_OID'))) {
611	$self->set_OID(@_); # pass through doc_obj and any extra arguments
612	}
613	else {
614	# use the default set_OID() in doc.pm
615	$doc_obj->set_OID();
616	}
617
618	}
619
620	# The BaseImporter read_into_doc_obj() function. This function does all the
621	# right things to make general options work for a given plugin. It doesn't do anything with the file other than setting reads in
622	# a file and sets up a slew of metadata all saved in doc_obj, which
623	# it then returns as part of a tuple (process_status,doc_obj)
624	#
625	# Much of this functionality used to reside in read, but it was broken
626	# down into a supporting routine to make the code more flexible.
627	#
628	# recursive plugins (e.g. RecPlug) and specialized plugins like those
629	# capable of processing many documents within a single file (e.g.
630	# GMLPlug) will normally want to implement their own version of
631	# read_into_doc_obj()
632	#
633	# Note that $base_dir might be "" and that $file might
634	# include directories
635
636	# currently blocking has been done before it gets here - does this affect secondary plugin stuff??
637	sub read_into_doc_obj {
638	my $self = shift (@_);
639	my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
640
641	my $outhandle = $self->{'outhandle'};
642
643	# should we move this to read? What about secondary plugins?
644	my $pp_file = &util::prettyprint_file($base_dir,$file,$gli);
645	print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
646	print $outhandle "$self->{'plugin_type'} processing $pp_file\n"
647	if $self->{'verbosity'} > 1;
648
649	my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
650
651	# create a new document
652	my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
653	my $top_section = $doc_obj->get_top_section();
654
655	$doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
656	$doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
657
658
659	my $plugin_filename_encoding = $self->{'filename_encoding'};
660	my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
661	$self->set_Source_metadata($doc_obj,$filename_full_path,$filename_encoding,$top_section);
662
663	# plugin specific stuff - what args do we need here??
664	unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
665	print STDERR "<ProcessingError n='$file'>\n" if ($gli);
666	return -1;
667	}
668
669	# include any metadata passed in from previous plugins
670	# note that this metadata is associated with the top level section
671	my $section = $doc_obj->get_top_section();
672	# can we merge these two methods??
673	$self->add_associated_files($doc_obj, $filename_full_path);
674	$self->extra_metadata ($doc_obj, $section, $metadata);
675	$self->auto_extract_metadata($doc_obj);
676
677	# to use tk label options commented out at top
678	#$self->convert_special_metadata($doc_obj);
679
680	# if we haven't found any Title so far, assign one
681	# this was shifted to here from inside read()
682	$self->title_fallback($doc_obj,$section,$filename_no_path);
683
684	$self->add_OID($doc_obj);
685
686	$self->post_process_doc_obj($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
687	return (1,$doc_obj);
688	}
689
690	sub post_process_doc_obj {
691	my $self = shift (@_);
692	my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
693
694	return 1;
695	}
696
697	sub add_dummy_text {
698	my $self = shift(@_);
699	my ($doc_obj, $section) = @_;
700
701	# add NoText metadata so we can hide this dummy text in format statements
702	$doc_obj->add_metadata($section, "NoText", "1");
703
704	# lookup_string with extra '1' arg returns perl internal unicode aware text, so we use add_utf8_text so no encoding is done on it.
705	$doc_obj->add_utf8_text($section, &gsprintf::lookup_string("{BaseImporter.dummy_text}",1));
706	#$doc_obj->add_text($section, &gsprintf::lookup_string("{BaseImporter.dummy_text}",1));
707	}
708
709	sub add_dummy_text_if_empty {
710	my $self = shift(@_);
711	my ($doc_obj, $section) = @_;
712
713	my $section_text_len = $doc_obj->get_text_length($section);
714
715	if ($section_text_len == 0) {
716	$self->add_dummy_text($doc_obj,$section);
717	}
718	}
719
720	# does nothing. Can be overridden by subclass
721	sub auto_extract_metadata {
722	my $self = shift(@_);
723	my ($doc_obj) = @_;
724	}
725
726	# used with the tk label options commented out at the start
727	sub convert_special_metadata {
728	my $self = shift(@_);
729	my ($doc_obj) = @_;
730	print STDERR "in convert special meta\n";
731	return unless defined $self->{'store_metadata_values_as_fields_for'};
732	print STDERR "we have fields to convert\n";
733
734	my @existing_meta_fields = split(',', $self->{'store_metadata_values_as_fields_for'});
735	my @new_ns = split(',', $self->{'namespace_for_new_fields'});
736	my $include_section_level = $self->{'include_section_level'};
737
738	foreach my $old_meta (@existing_meta_fields) {
739	my $ns = shift(@new_ns);
740	my $ns_prefix = "";
741	if (defined $ns && $ns =~/\S/) {
742	$ns_prefix = "$ns.";
743	}
744
745
746	my $section = $doc_obj->get_top_section();
747	while (defined $section) {
748	my @section_meta = @{$doc_obj->get_metadata($section, $old_meta)};
749	foreach my $meta (@section_meta) {
750	foreach my $submeta (split /,/, $meta) {
751	# add meta, ns+submeta, value = true
752	# remove whitepace
753	$submeta =~ s/\s+//g;
754	$doc_obj->add_utf8_metadata($section, "$ns_prefix$submeta", "true");
755	}
756	}
757
758	last if (!$include_section_level);
759	$section = $doc_obj->get_next_section($section);
760	}
761	}
762	}
763
764	# adds cover image, associate_file options stuff. Should be called by sub class
765	# read_into_doc_obj
766	sub add_associated_files {
767	my $self = shift(@_);
768	# whatis filename??
769	my ($doc_obj, $filename) = @_;
770
771	# add in the cover image
772	if ($self->{'cover_image'}) {
773	$self->associate_cover_image($doc_obj, $filename);
774	}
775	# store the original (used for eg TextPlugin to store the original for OAI)
776	if ($self->{'store_original_file'}) {
777	$self->associate_source_file($doc_obj, $filename);
778	}
779
780
781	}
782
783	# implement this if you are extracting metadata for other documents
784	sub metadata_read {
785	my $self = shift (@_);
786	my ($pluginfo, $base_dir, $file, $block_hash,
787	$extrametakeys, $extrametadata, $extrametafile,
788	$processor, $gli, $aux) = @_;
789
790	# can we process this file??
791	my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
792	return undef unless $self->can_process_this_file_for_metadata($filename_full_path);
793
794	return 1; # we recognise the file, but don't actually do anything with it
795	}
796
797
798	# The BaseImporter read() function. This function calls read_into_doc_obj()
799	# to ensure all the right things to make general options work for a
800	# given plugin are done. It then calls the process() function which
801	# does all the work specific to a plugin (like the old read functions
802	# used to do). Most plugins should define their own process() function
803	# and let this read() function keep control.
804	#
805	# recursive plugins (e.g. RecPlug) and specialized plugins like those
806	# capable of processing many documents within a single file (e.g.
807	# GMLPlug) might want to implement their own version of read(), but
808	# more likely need to implement their own version of read_into_doc_obj()
809	#
810	# Return number of files processed, undef if can't recognise, -1 if can't
811	# process
812
813	sub read {
814	my $self = shift (@_);
815	my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
816
817	# can we process this file??
818	my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
819
820	return undef unless $self->can_process_this_file($filename_full_path);
821
822	#print STDERR "**** BEFORE READ INTO DOC OBJ: $file\n";
823	my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);
824	#print STDERR "**** AFTER READ INTO DOC OBJ: $file\n";
825
826	if ((defined $process_status) && ($process_status == 1)) {
827
828	# process the document
829	$processor->process($doc_obj);
830
831	$self->{'num_processed'} ++;
832	undef $doc_obj;
833	}
834	# delete any temp files that we may have created
835	$self->clean_up_after_doc_obj_processing();
836
837
838	# if process_status == 1, then the file has been processed.
839	return $process_status;
840
841	}
842
843	# returns undef if file is rejected by the plugin
844	sub process {
845	my $self = shift (@_);
846	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
847
848	gsprintf(STDERR, "BaseImporter::process {common.must_be_implemented}\n");
849
850	my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
851	print STDERR "Calling method: $cfilename:$cline $cpackage->$csubr\n";
852
853	die "\n";
854
855	return undef; # never gets here
856	}
857
858	# overwrite this method to delete any temp files that we have created
859	sub clean_up_after_doc_obj_processing {
860	my $self = shift(@_);
861
862	}
863
864
865
866	sub filename_based_title
867	{
868	my $self = shift (@_);
869	my ($file) = @_;
870
871	my $file_derived_title = $file;
872	$file_derived_title =~ s/_/ /g;
873	$file_derived_title =~ s/\.[^.]+$//;
874
875	return $file_derived_title;
876	}
877
878
879	sub title_fallback
880	{
881	my $self = shift (@_);
882	my ($doc_obj,$section,$file) = @_;
883
884	if (!defined $doc_obj->get_metadata_element ($section, "Title")
885	\|\| $doc_obj->get_metadata_element($section, "Title") eq "") {
886
887	my $source_file = $doc_obj->get_metadata_element($section, "Source");
888	my $file_derived_title;
889	if (defined $source_file) {
890	$file_derived_title = $self->filename_based_title($source_file);
891	}
892	else {
893	# pp = pretty print
894	my $pp_file = (defined $source_file) ? $source_file : $file;
895
896	my $raw_title = $self->filename_based_title($file);
897	my $file_derived_title = &unicode::raw_filename_to_url_encoded($raw_title);
898	}
899
900
901	if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
902	$doc_obj->add_utf8_metadata ($section, "Title", $file_derived_title);
903	}
904	else {
905	$doc_obj->set_utf8_metadata_element ($section, "Title", $file_derived_title);
906	}
907	}
908
909	}
910
911	# add any extra metadata that's been passed around from one
912	# plugin to another.
913	# extra_metadata uses add_utf8_metadata so it expects metadata values
914	# to already be in utf8
915	sub extra_metadata {
916	my $self = shift (@_);
917	my ($doc_obj, $cursection, $metadata) = @_;
918
919	my $associate_tail_re = $self->{'associate_tail_re'};
920
921	# Sort the extra metadata for diffcol so these meta appear in a consistent order
922	# in doc.xml. Necessary for the ex.PDF.* and ex.File.* meta that's extracted in
923	# the PDFBox collection, as the order of these varies between CentOS and Ubuntu.
924	foreach my $field (sort keys(%$metadata)) {
925	# foreach my $field (keys(%$metadata)) {
926	# $metadata->{$field} may be an array reference
927	if ($field eq "gsdlassocfile_tobe") {
928	# 'gsdlassocfile_tobe' is artificially introduced metadata
929	# that is used to signal that certain additional files should
930	# be tied to this document. Useful in situations where a
931	# metadata pass in the plugin pipeline works out some files
932	# need to be associated with a document, but the document hasn't
933	# been formed yet.
934	my $equiv_form = "";
935	foreach my $gaf (@{$metadata->{$field}}) {
936	my ($full_filename,$mimetype) = ($gaf =~ m/^(.):(.):$/);
937	my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
938
939	# we need to make sure the filename is valid utf-8 - we do
940	# this by url or base64 encoding it
941	# $tail_filename is the name that we store the file as
942	$tail_filename = &util::rename_file($tail_filename, $self->{'file_rename_method'});
943	$doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
944	$doc_obj->associate_source_file($full_filename);
945	# If the filename is url_encoded, we need to encode the % signs
946	# in the filename, so that it works in a url
947	my $url_tail_filename = &unicode::filename_to_url($tail_filename);
948	# work out extended tail extension (i.e. matching tail re)
949
950	my ($file_prefix,$file_extended_ext)
951	= &util::get_prefix_and_tail_by_regex($tail_filename,$associate_tail_re);
952	my ($pre_doc_ext) = ($file_extended_ext =~ m/^(.)\..$/);
953	my ($doc_ext) = ($tail_filename =~ m/^.\.(.)$/);
954
955	# the greenstone 2 stuff
956	my $start_doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/{Or}{[parent(Top):assocfilepath],[assocfilepath]}/$url_tail_filename\">";
957	#my $start_doclink = "<a href=\'_httpprefix_/collect/[collection]/index/assoc/[assocfilepath]/$url_tail_filename\'>";
958	my $start_doclink_gs3 = "<a href=\'_httpprefix_/collect/[collection]/index/assoc/[assocfilepath]/$url_tail_filename\'>";
959
960	my $srcicon = "_icon".$doc_ext."_";
961	my $end_doclink = "</a>";
962
963	my $assoc_form = "$start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
964
965
966	if (defined $pre_doc_ext && $pre_doc_ext ne "") {
967	# for metadata such as [mp3._edited] [mp3._full] ...
968	$doc_obj->add_utf8_metadata ($cursection, "$doc_ext.$pre_doc_ext", $assoc_form);
969	}
970
971	# for multiple metadata such as [mp3.assoclink]
972	$doc_obj->add_utf8_metadata ($cursection, "$doc_ext.assoclink", $assoc_form);
973
974	$equiv_form .= " $assoc_form";
975
976	# following are used for greenstone 3,
977	$doc_obj->add_utf8_metadata ($cursection, "equivDocLink", $start_doclink_gs3);
978	$doc_obj->add_utf8_metadata ($cursection, "equivDocIcon", $srcicon);
979	$doc_obj->add_utf8_metadata ($cursection, "/equivDocLink", $end_doclink);
980
981	}
982	$doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
983	}
984	elsif ($field eq "gsdlzipfilename") {
985	# special case for when files have come out of a zip. source_path
986	# (used for archives dbs and keeping track for incremental import)
987	# must be set to the zip file name
988	my $zip_filename = $metadata->{$field};
989	# overwrite the source_path
990	$doc_obj->set_source_path($zip_filename);
991	# and set the metadata
992	$zip_filename = &util::filename_within_collection($zip_filename);
993	$zip_filename = $doc_obj->encode_filename($zip_filename, $self->{'file_rename_method'});
994	$doc_obj->add_utf8_metadata ($cursection, $field, $zip_filename);
995	}
996	elsif (ref ($metadata->{$field}) eq "ARRAY") {
997	if ($field =~ /(.+?)\/\/\/Section\/([\d.]*)/m){
998	my $field_new_name = $1;
999	my $specified_section = $2;
1000	map {
1001	$doc_obj->add_utf8_metadata ($specified_section, $field_new_name, $_);
1002	} @{$metadata->{$field}};
1003	} else {
1004	map {
1005	$doc_obj->add_utf8_metadata ($cursection, $field, $_);
1006	} @{$metadata->{$field}};
1007	}
1008
1009	} else {
1010	if ($field =~ /(.+?)\/\/\/Section\/([\d.]*)/m){
1011	my $field_new_name = $1;
1012	my $specified_section = $2;
1013	$doc_obj->add_utf8_metadata ($specified_section, $field_new_name, $metadata->{$field});
1014	} else {
1015	$doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
1016	}
1017	}
1018	}
1019	}
1020
1021
1022	sub compile_stats {
1023	my $self = shift(@_);
1024	my ($stats) = @_;
1025
1026	$stats->{'num_processed'} += $self->{'num_processed'};
1027	$stats->{'num_not_processed'} += $self->{'num_not_processed'};
1028	$stats->{'num_archives'} += $self->{'num_archives'};
1029
1030	}
1031	sub associate_source_file {
1032	my $self = shift(@_);
1033
1034	my ($doc_obj, $filename) = @_;
1035	my $cursection = $doc_obj->get_top_section();
1036	my $assocfilename = $doc_obj->get_assocfile_from_sourcefile();
1037
1038	$doc_obj->associate_file($filename, $assocfilename, undef, $cursection);
1039	# srclink_file is now deprecated because of the "_" in the metadataname. Use srclinkFile
1040	$doc_obj->add_utf8_metadata ($cursection, "srclink_file", $doc_obj->get_sourcefile());
1041	$doc_obj->add_utf8_metadata ($cursection, "srclinkFile", $doc_obj->get_sourcefile());
1042	}
1043
1044	sub associate_cover_image {
1045	my $self = shift(@_);
1046	my ($doc_obj, $filename) = @_;
1047
1048	my $upgraded_filename = &util::upgrade_if_dos_filename($filename);
1049
1050	$filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1051	$upgraded_filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1052
1053	if (exists $self->{'covers_missing_cache'}->{$upgraded_filename}) {
1054	# don't stat() for existence e.g. for multiple document input files
1055	# (eg SplitPlug)
1056	return;
1057	}
1058
1059	my $top_section=$doc_obj->get_top_section();
1060
1061	if (&FileUtils::fileExists($upgraded_filename)) {
1062	$doc_obj->associate_source_file($filename);
1063	$doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1064	$doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1065	} else {
1066	my $upper_filename = $filename;
1067	my $upgraded_upper_filename = $upgraded_filename;
1068
1069	$upper_filename =~ s/jpg$/JPG/;
1070	$upgraded_upper_filename =~ s/jpg$/JPG/;
1071
1072	if (&FileUtils::fileExists($upgraded_upper_filename)) {
1073	$doc_obj->associate_source_file($upper_filename);
1074	$doc_obj->associate_file($upper_filename, "cover.jpg",
1075	"image/jpeg");
1076	$doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1077	} else {
1078	# file doesn't exist, so record the fact that it's missing so
1079	# we don't stat() again (stat is slow)
1080	$self->{'covers_missing_cache'}->{$upgraded_filename} = 1;
1081	}
1082	}
1083
1084	}
1085
1086
1087	# Overridden by exploding plugins (eg. ISISPlug)
1088	sub clean_up_after_exploding
1089	{
1090	my $self = shift(@_);
1091	}
1092
1093
1094
1095	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: