Context Navigation

source: main/trunk/greenstone2/perllib/plugins/BaseImporter.pm@ 32501

Last change on this file since 32501 was 32501, checked in by Georgiy Litvinov, 6 years ago
Workaround to set assign metadata via csv metadata plugin. "Section" column could be used in csv file to specify section for metadata to assign
Property svn:keywords set to `Author Date Id Revision`
File size: 33.6 KB

Rev	Line
[537]	1	###########################################################################
	2	#
[31492]	3	# BaseImporter.pm -- base class for all the import plugins
[537]	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
[9413]	8	# Copyright (C) 1999-2005 New Zealand Digital Library Project
[537]	9	#
	10	# This program is free software; you can redistribute it and/or modify
	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
[4]	25
[31492]	26	package BaseImporter;
[2219]	27
[10254]	28	use strict;
	29	no strict 'subs';
	30	no strict 'refs'; # allow filehandles to be variables and viceversa
[9413]	31
[8892]	32	use File::Basename;
[23335]	33	use Encode;
[23832]	34	use Unicode::Normalize 'normalize';
[8892]	35
[1870]	36	use encodings;
[11389]	37	use unicode;
[1242]	38	use doc;
[2751]	39	use ghtml;
[9413]	40	use gsprintf 'gsprintf';
[27306]	41	use util;
	42	use FileUtils;
[4]	43
[31492]	44	use CommonUtil;
[10218]	45
# Set up inheritance at compile time: BaseImporter derives from CommonUtil.
BEGIN {
    @BaseImporter::ISA = qw(CommonUtil);
}
[5681]	49
# The renaming strategies that can be applied to imported documents and
# their associated files. Used as the 'list' of the file_rename_method
# option declared in $arguments.
our $file_rename_method_list = [
    {
        'name' => 'url',
        'desc' => '{BaseImporter.rename_method.url}',
    },
    {
        'name' => 'base64',
        'desc' => '{BaseImporter.rename_method.base64}',
    },
    {
        'name'      => 'none',
        'desc'      => '{BaseImporter.rename_method.none}',
        'hiddengli' => 'yes',
    },
];
[18320]	60
[31457]	61	# here went encoding list stuff
[3540]	62
# The values accepted by the OIDtype option (see $arguments). Each entry
# pairs the option value with the resource key of its description.
our $oidtype_list = [
    {
        'name' => 'auto',
        'desc' => '{BaseImporter.OIDtype.auto}',
    },
    {
        'name' => 'hash',
        'desc' => '{import.OIDtype.hash}',
    },
    {
        'name' => 'hash_on_ga_xml',
        'desc' => '{import.OIDtype.hash_on_ga_xml}',
    },
    {
        'name' => 'hash_on_full_filename',
        'desc' => '{import.OIDtype.hash_on_full_filename}',
    },
    {
        'name' => 'assigned',
        'desc' => '{import.OIDtype.assigned}',
    },
    {
        'name' => 'incremental',
        'desc' => '{import.OIDtype.incremental}',
    },
    {
        'name' => 'filename',
        'desc' => '{import.OIDtype.filename}',
    },
    {
        'name' => 'dirname',
        'desc' => '{import.OIDtype.dirname}',
    },
    {
        'name' => 'full_filename',
        'desc' => '{import.OIDtype.full_filename}',
    },
];
[16698]	82
# Option declarations contributed by BaseImporter. new() appends these to
# the "ArgList" entry of the hash of argument/option lists it is given.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BaseImporter.process_exp}",
        'type' => "regexp",
        'deft' => "",
        'reqd' => "no",
    },
    {
        'name' => "store_original_file",
        'desc' => "{BaseImporter.store_original_file}",
        'type' => "flag",
        'reqd' => "no",
    },
    {
        'name' => "associate_ext",
        'desc' => "{BaseImporter.associate_ext}",
        'type' => "string",
        'reqd' => "no",
    },
    {
        'name' => "associate_tail_re",
        'desc' => "{BaseImporter.associate_tail_re}",
        'type' => "string",
        'reqd' => "no",
    },
    {
        'name' => "OIDtype",
        'desc' => "{import.OIDtype}",
        'type' => "enum",
        'list' => $oidtype_list,
        # "auto" acts as the "not set on this plugin" marker: begin()
        # replaces it with the OIDtype/OIDmetadata of the processor.
        'deft' => "auto",
        'reqd' => "no",
    },
    {
        'name' => "OIDmetadata",
        'desc' => "{import.OIDmetadata}",
        'type' => "metadata",
        'deft' => "dc.Identifier",
        'reqd' => "no",
    },
    # Disabled option, kept for reference:
    # { 'name' => "use_as_doc_identifier",
    #   'desc' => "{BaseImporter.use_as_doc_identifier}",
    #   'type' => "string",
    #   'reqd' => "no" ,
    #   'deft' => "" } ,
    {
        'name' => "no_cover_image",
        'desc' => "{BaseImporter.no_cover_image}",
        'type' => "flag",
        'reqd' => "no",
    },
    {
        'name' => "file_rename_method",
        'desc' => "{BaseImporter.file_rename_method}",
        'type' => "enum",
        # by default rename imported files and assoc files using this encoding
        'deft' => get_default_file_rename_method(),
        'list' => $file_rename_method_list,
        'reqd' => "no",
    },
];
[3540]	131
[9398]	132
# Plugin descriptor for BaseImporter. new() appends it to the "OptList"
# entry of the hash of argument/option lists it is given.
my $options = {
    'name'     => "BaseImporter",
    'desc'     => "{BaseImporter.desc}",
    'abstract' => "yes",
    'inherits' => "yes",
    'args'     => $arguments,
};
[3540]	138
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed through to CommonUtil's constructor
#   $hashArgOptLists - hash ref; BaseImporter's $arguments are appended to
#                      its "ArgList" and $options to its "OptList"
#   $auxiliary       - passed through to CommonUtil's constructor
#
# Returns the object built by CommonUtil, re-blessed into $class. When the
# object has 'info_only' set, it is returned without further option handling.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists, $auxiliary) = @_;

    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Direct method call instead of the ambiguous indirect-object form
    # "new CommonUtil(...)".
    my $self = CommonUtil->new($pluginlist, $inputargs, $hashArgOptLists, $auxiliary);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
    $self->{'plugin_type'} = $plugin_name;

    # remove ex. from OIDmetadata iff it's the only namespace prefix
    $self->{'OIDmetadata'} =~ s/^ex\.([^.]+)$/$1/ if defined $self->{'OIDmetadata'};

    $self->{'num_processed'}     = 0;
    $self->{'num_not_processed'} = 0;
    $self->{'num_blocked'}       = 0;
    $self->{'num_archives'}      = 0;

    # cover image is on by default; -no_cover_image turns it off
    $self->{'cover_image'} = 1;
    $self->{'cover_image'} = 0 if ($self->{'no_cover_image'});

    $self->{'can_process_directories'} = 0;

    # -associate_ext is a comma separated list of extensions; it is turned
    # into the equivalent associate_tail_re unless the latter was given too.
    my $associate_ext = $self->{'associate_ext'};
    if ((defined $associate_ext) && ($associate_ext ne "")) {

        my $associate_tail_re = $self->{'associate_tail_re'};
        if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
            my $outhandle = $self->{'outhandle'};
            print $outhandle "Warning: can only specify 'associate_ext' or 'associate_tail_re'\n";
            print $outhandle " defaulting to 'associate_tail_re'\n";
        }
        else {
            # Build one non-capturing "\.ext" group per extension without
            # modifying the split list in place.
            my @exts_bracketed = map { "(?:\\.$_)" } split(/,/, $associate_ext);
            $self->{'associate_tail_re'} = join("|", @exts_bracketed);
        }

        delete $self->{'associate_ext'};
    }

    return bless $self, $class;
}
	192
# Merge several hash refs (the "selfs" produced by multiple parent
# constructors) into one new, unblessed hash ref.
#
# For each key the first defined value encountered is kept; later values
# for the same key are ignored, whether or not they agree with it.
# (The original author's note: the existing value seems to be the option
# specified in collect.cfg.)
sub merge_inheritance
{
    my @child_selfs = @_;
    my $merged = {};

    for my $child (@child_selfs) {
        for my $field (keys %$child) {
            # an already defined value always wins
            next if defined $merged->{$field};

            $merged->{$field} = $child->{$field};
        }
    }

    return $merged;
}
	224
# Initialize BaseImporter options.
# If init() is overridden in a sub-class, remember to call BaseImporter::init()
#
# All arguments ($verbosity, $outhandle, $failhandle) are handed on to the
# parent class's init(). Afterwards process_exp and block_exp are set to the
# plugin's defaults unless they were explicitly set.
sub init {
    my $self = shift (@_);
    my ($verbosity, $outhandle, $failhandle) = @_;

    $self->SUPER::init(@_);

    # Only non-recursive plugins get a default process_exp.
    if ((!$self->is_recursive())
        && ((!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq ""))) {

        $self->{'process_exp'} = $self->get_default_process_exp ();
        if ($self->{'process_exp'} eq "") {
            warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
        }
    }

    if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
        $self->{'block_exp'} = $self->get_default_block_exp ();
    }
}
	250
# Resolve the effective OIDtype of this plugin.
#
# Arguments: ($pluginfo, $base_dir, $processor, $maxdocs); only $processor
# is used here.
#
#  - "auto" means nothing was set on the plugin, so OIDtype and OIDmetadata
#    are taken from $processor.
#  - "hash" is refined through get_oid_hash_type(); any answer other than
#    "hash_on_file" or "hash_on_ga_xml" falls back to "hash_on_file".
sub begin {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;

    if ($self->{'OIDtype'} eq "auto") {
        # hasn't been set in the plugin, use the processor values
        $self->{'OIDtype'} = $processor->{'OIDtype'};
        $self->{'OIDmetadata'} = $processor->{'OIDmetadata'};
    }
    if ($self->{'OIDtype'} eq "hash") {
        # should we hash on the file or on the doc xml??
        $self->{'OIDtype'} = $self->get_oid_hash_type();
        # Unescaped '|' so this is a real alternation between the two
        # supported hash types.
        if ($self->{'OIDtype'} !~ /^(hash_on_file|hash_on_ga_xml)$/) {
            $self->{'OIDtype'} = "hash_on_file";
        }
    }
}
	268
# Called once when removeold is set with import.pl. The base
# implementation does nothing; a plugin that does work outside of creating
# doc objects may need to override this to clear that state.
#
# Arguments: ($pluginfo, $base_dir, $processor, $maxdocs) - unused here.
sub remove_all {
    my $self = shift(@_);
    my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
}
	276
# Called per document for documents that have been deleted from the
# collection. The base implementation removes nothing; a plugin that does
# work outside of creating doc objects may need to override this.
#
# Arguments: ($file, $oids, $archivedir); only $file is used here.
# Returns 0 if this plugin can process $file, undef otherwise.
sub remove_one {
    my $self = shift(@_);
    my ($file, $oids, $archivedir) = @_;

    return $self->can_process_this_file($file) ? 0 : undef;
}
	288
# Potentially called at the end of each plugin pass. import.pl only has
# one plugin pass, but buildcol.pl has multiple ones.
# The base implementation does nothing.
sub end {
    my ($self) = shift(@_);
}
	295
# Called only once, after all plugin passes have been done.
# The base implementation does nothing.
sub deinit {
    my ($self) = @_;
}
	301
# The default hashing type is to hash on the original file (or converted
# file). Override this to return "hash_on_ga_xml" for file types where
# hashing on the file is no good, e.g. video.
sub get_oid_hash_type {
    my ($self) = @_;

    return 'hash_on_file';
}
	311
[15868]	312
# Returns 0 here; recursive plugins should override this to return 1.
sub is_recursive {
    my ($self) = @_;

    return 0;
}
	320
# Default block expression used by init() when none was explicitly set.
# The base class has none, hence the empty string.
sub get_default_block_exp {
    my ($self) = @_;

    return '';
}
	326
	327	sub get_default_process_exp {
	328	my $self = shift (@_);
	329
	330	return "";
	331	}
	332
[23419]	333
# Rename imported files and assoc files using URL encoding by default,
# as this will work for most plugins and give more legible filenames.
#
# Declared without a prototype: the sub reads its invocant from @_, so an
# empty "()" prototype was misleading and made a plain function call with
# an argument a compile-time error.
sub get_default_file_rename_method {
    my $self = shift (@_);
    return "url";
}
	340
# Returns this plugin's active (possibly user-selected) file_rename_method:
# the 'file_rename_method' value of the object when it is set to a true
# value, otherwise the result of get_default_file_rename_method().
#
# Declared without a prototype: the sub reads its invocant from @_, so an
# empty "()" prototype was misleading.
sub get_file_rename_method {
    my $self = shift (@_);

    my $rename_method = $self->{'file_rename_method'};
    return $rename_method if $rename_method;

    return $self->get_default_file_rename_method();
}
	351
# Hook called by file_block_read() for a file this plugin will process
# (unless blocking is off or a block_exp is in force).
# The default implementation does nothing.
#
# Arguments: ($filename_full_path, $block_hash) - unused here.
sub store_block_files {
    my $self = shift(@_);
    my ($filename_full_path, $block_hash) = @_;
}
	359
# Put files to block into the block hash: when the plugin has a non-empty
# block_exp and the (DOS-name upgraded) full path matches it, the file is
# handed to block_filename().
#
# Arguments: ($filename_full_path, $block_hash)
sub use_block_expressions {
    my $self = shift(@_);
    my ($filename_full_path, $block_hash) = @_;

    $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);

    my $block_exp = $self->{'block_exp'};
    if ($block_exp ne "" && $filename_full_path =~ /$block_exp/) {
        $self->block_filename($block_hash, $filename_full_path);
    }
}
	373
# Default implementation: if cover images are on, block the file with the
# same name as $filename but with extension .jpg (or .JPG when the .jpg
# variant does not exist), provided that file exists.
#
# Arguments: ($filename, $block_hash)
# Always returns nothing.
sub block_cover_image
{
    my $self = shift;
    my ($filename, $block_hash) = @_;

    $filename = &util::upgrade_if_dos_filename($filename);

    return unless $self->{'cover_image'};

    (my $coverfile = $filename) =~ s/\.[^\\\/\.]+$/\.jpg/;

    # if there is no file extension (or the file is itself the .jpg),
    # coverfile will be the same as filename
    return if $coverfile eq $filename;

    # fall back to the upper-case extension
    $coverfile =~ s/jpg$/JPG/ unless &FileUtils::fileExists($coverfile);

    $self->block_filename($block_hash, $coverfile) if &FileUtils::fileExists($coverfile);

    return;
}
[11122]	399
	400
# Discover all the files that should be blocked by this plugin.
#
# Arguments: ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli)
#
# Steps:
#  1. record every non-directory in $block_hash->{'all_files'};
#  2. with an associate_tail_re in force, track files sharing a root in
#     $block_hash->{'shared_fileroot'} (supports -associate_ext and the
#     more generalised -associate_tail_re);
#  3. apply the block expression unless 'no_blocking' is set;
#  4. if this plugin will process the file, let store_block_files() and
#     block_cover_image() block its companions.
#
# Returns 1 if the file is a plain file this plugin can process, undef
# otherwise.
sub file_block_read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;

    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    if (!-d $filename_full_path) {
        $block_hash->{'all_files'}->{$file} = 1;
    }

    # Keep track of filenames with same root but different extensions
    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        my ($file_prefix, $file_ext)
            = &util::get_prefix_and_tail_by_regex($filename_full_path, $associate_tail_re);

        if ((defined $file_prefix) && (defined $file_ext)) {
            my $shared_fileroot = $block_hash->{'shared_fileroot'};
            if (!defined $shared_fileroot->{$file_prefix}) {
                $shared_fileroot->{$file_prefix} = { 'tie_to' => undef,
                                                     'exts'   => {} };
            }

            my $file_prefix_rec = $shared_fileroot->{$file_prefix};

            # A processable file whose tail has no '.' after its first
            # character is treated as the primary document.
            if ($self->can_process_this_file($filename_full_path) && $file_ext !~ m/.\./) {
                # This is the document the others should be tied to
                $file_prefix_rec->{'tie_to'} = $file_ext;
            }
            elsif ($file_ext =~ m/$associate_tail_re$/) {
                # this file should be associated to the main one
                $file_prefix_rec->{'exts'}->{$file_ext} = 1;
            }
        }
    }

    # check block expressions
    $self->use_block_expressions($filename_full_path, $block_hash) unless $self->{'no_blocking'};

    # now check whether we are actually processing this
    if (!-f $filename_full_path || !$self->can_process_this_file($filename_full_path)) {
        return undef; # can't recognise
    }

    # if we have a block_exp, then this overrides the normal 'smart' blocking
    $self->store_block_files($filename_full_path, $block_hash)
        unless ($self->{'no_blocking'} || $self->{'block_exp'} ne "");

    # block the cover image if there is one
    if ($self->{'cover_image'}) {
        $self->block_cover_image($filename_full_path, $block_hash);
    }

    return 1;
}
	462
# Decide whether this plugin handles $filename. Plugins that rely on more
# than process_exp (e.g. XML plugins) can override this method.
#
# Returns 0 for a directory unless 'can_process_directories' is set;
# otherwise 1 if process_exp is non-empty and matches $filename, else 0.
sub can_process_this_file {
    my $self = shift(@_);
    my ($filename) = @_;

    return 0 if (-d $filename && !$self->{'can_process_directories'});

    my $process_exp = $self->{'process_exp'};
    return ($process_exp ne "" && $filename =~ /$process_exp/) ? 1 : 0;
}
	478
# Even if a plugin can extract metadata in its metadata_read pass, the
# default answer is undef so that processing of the file continues down
# the pipeline, giving other plugins the opportunity to locate metadata
# and set it up in the extrametakeys variables that are passed around.
sub can_process_this_file_for_metadata {
    my ($self) = @_;

    return undef;
}
	490
	491
[10280]	492
# Sets the Source, SourceFile and (when a Source already exists)
# OrigSource metadata of a section.
#
# Notionally written to be called once for each document, it is however
# safe to call multiple times (as in the case of ImagePlugin), which calls
# this later on after the original image has potentially been converted to
# a new source image format (e.g. TIFF to PNG).
#
# Arguments:
#   $doc_obj           - document object the metadata is set on
#   $raw_filename      - filename as found on the file system
#   $filename_encoding - encoding of the filename; may be undefined
#   $section           - target section; defaults to the top section
#
#  1. Source is the filename for display, mapped to Unicode if possible
#     and (as a fallback) using %xx for non-ascii chars.
#  2. SourceFile is the url reference to the renamed version of the
#     filename, as used for generated files.
sub set_Source_metadata {
    my $self = shift (@_);
    my ($doc_obj, $raw_filename, $filename_encoding, $section) = @_;

    my ($unused_full_rf, $raw_file) = &util::get_full_filenames("", $raw_filename);

    my $this_section = $section;
    $this_section = $doc_obj->get_top_section() unless defined $this_section;

    my $octet_file = $raw_file;

    # Deal with (on Windows) raw filenames that are in their abbreviated
    # DOS form: replace them with the long path name.
    my $on_native_windows = ($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin");
    if ($on_native_windows
        && (defined $filename_encoding) && ($filename_encoding eq "unicode")
        && (-e $raw_filename)) {

        my $unicode_filename = Win32::GetLongPathName($raw_filename);

        my $unused_full_uf;
        ($unused_full_uf, $octet_file) = &util::get_full_filenames("", $unicode_filename);
    }

    my $display_filename;
    if ((defined $filename_encoding) && ($filename_encoding ne "ascii")) {
        # Use filename_encoding to map the raw filename to a Perl
        # unicode-aware string (a pretty print version of the filename)
        $display_filename = decode($filename_encoding, $octet_file);
    }
    else {
        # otherwise generate %xx encoded version of filename for char > 127
        $display_filename = &unicode::raw_filename_to_url_encoded($octet_file);
    }

    # In the case of converted files and (generalized) exploded documents,
    # there will already be a source filename => store as OrigSource
    # before overriding
    my $orig_source = $doc_obj->get_metadata_element($this_section, "Source");
    if ((defined $orig_source) && ($orig_source !~ m/^\s*$/)) {
        $doc_obj->set_utf8_metadata_element($this_section, "OrigSource", $orig_source);
    }

    # Source is the display name - not necessarily the name of the file
    # on the system
    if ($ENV{'GSDLOS'} =~ m/^darwin$/i) {
        # on Darwin want all display strings to be in composed form
        # (Normalisation Form 'C'), then can search on that
        $display_filename = normalize('C', $display_filename);
    }
    # set_utf8_metadata actually sets perl unicode aware strings, not utf8
    $doc_obj->set_utf8_metadata_element($this_section, "Source", $display_filename);

    my $renamed_raw_file = &util::rename_file($raw_file, $self->{'file_rename_method'});

    # If using URL encoding, then SourceFile is the url-reference to the
    # url-encoded renamed file: a url that refers to the actual file on
    # the system. This call just replaces % with %25.
    my $renamed_raw_url = &unicode::filename_to_url($renamed_raw_file);

    $doc_obj->set_utf8_metadata_element($this_section, "SourceFile", $renamed_raw_url);
}
[23457]	577
# This should be called by all plugins to set the OID of the doc obj,
# rather than calling $doc_obj->set_OID directly.
#
# Arguments:
#   $doc_obj - document object to assign an OID to
#   $force   - when true, assign an OID even if one is already set
#
# Nothing is done when the document already has an OID (anything other
# than "NULL") and $force is false. Otherwise the plugin's OIDtype and
# OIDmetadata are pushed onto the document and the OID is set, using the
# plugin's own set_OID() when it has one.
sub add_OID {
    my $self = shift (@_);
    my ($doc_obj, $force) = @_;

    # don't add one if there is one already set, unless we are forced to do so
    return unless ($doc_obj->get_OID() =~ /^NULL$/ || $force);
    $doc_obj->set_OIDtype($self->{'OIDtype'}, $self->{'OIDmetadata'});

    # see if there is a plugin specific set_OID function
    if (defined ($self->can('set_OID'))) {
        $self->set_OID(@_); # pass through doc_obj and any extra arguments
    }
    else {
        # use the default set_OID() in doc.pm
        $doc_obj->set_OID();
    }
}
[23457]	598
# The BaseImporter read_into_doc_obj() function. This function does all the
# right things to make general options work for a given plugin. It creates
# a doc object for the file, hands it to the plugin's process(), and sets
# up a slew of metadata, all saved in doc_obj, which it then returns as
# part of a tuple (process_status, doc_obj).
#
# Much of this functionality used to reside in read, but it was broken
# down into a supporting routine to make the code more flexible.
#
# Recursive plugins (e.g. RecPlug) and specialized plugins like those
# capable of processing many documents within a single file (e.g.
# GMLPlug) will normally want to implement their own version of
# read_into_doc_obj().
#
# Note that $base_dir might be "" and that $file might include directories.
#
# Returns (1, $doc_obj) on success, or -1 when process() returns undef.
#
# currently blocking has been done before it gets here - does this affect
# secondary plugin stuff??
sub read_into_doc_obj {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    my $outhandle = $self->{'outhandle'};

    # should we move this to read? What about secondary plugins?
    my $pp_file = &util::prettyprint_file($base_dir, $file, $gli);
    print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
    print $outhandle "$self->{'plugin_type'} processing $pp_file\n"
        if $self->{'verbosity'} > 1;

    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    # create a new document (direct method call rather than the ambiguous
    # indirect-object form "new doc (...)")
    my $doc_obj = doc->new($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
    my $top_section = $doc_obj->get_top_section();

    $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
    $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));

    my $plugin_filename_encoding = $self->{'filename_encoding'};
    my $filename_encoding = $self->deduce_filename_encoding($file, $metadata, $plugin_filename_encoding);
    $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding, $top_section);

    # plugin specific stuff - what args do we need here??
    unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1;
    }

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    my $section = $doc_obj->get_top_section();
    # can we merge these two methods??
    $self->add_associated_files($doc_obj, $filename_full_path);
    $self->extra_metadata ($doc_obj, $section, $metadata);
    $self->auto_extract_metadata($doc_obj);

    # if we haven't found any Title so far, assign one
    # this was shifted to here from inside read()
    $self->title_fallback($doc_obj, $section, $filename_no_path);

    $self->add_OID($doc_obj);

    $self->post_process_doc_obj($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
    return (1, $doc_obj);
}
[2785]	665
# Hook called at the end of read_into_doc_obj(), once the doc object has
# its metadata and OID. The base implementation does nothing.
#
# Arguments: ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli)
# Returns 1.
sub post_process_doc_obj {
    my $self = shift(@_);
    my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    return 1;
}
	672
# Give a section placeholder text, flagged so it can be recognised later.
#
# Arguments: ($doc_obj, $section)
sub add_dummy_text {
    my $self = shift(@_);
    my ($doc_obj, $section) = @_;

    # add NoText metadata so we can hide this dummy text in format statements
    $doc_obj->add_metadata($section, "NoText", "1");

    # lookup_string with the extra '1' arg returns perl internal unicode
    # aware text, so add_utf8_text (rather than add_text) is used so that
    # no encoding is done on it.
    $doc_obj->add_utf8_text(
        $section,
        &gsprintf::lookup_string("{BaseImporter.dummy_text}", 1)
    );
}
[8510]	686
# Hook called by read_into_doc_obj() after extra metadata has been added.
# Does nothing here; can be overridden by a subclass.
#
# Arguments: ($doc_obj) - unused here.
sub auto_extract_metadata {
    my $self = shift(@_);
    my ($doc_obj) = @_;
}
[11122]	692
# Adds the cover image and original-file associations that the general
# options ask for. Should be called by a sub class's read_into_doc_obj.
#
# Arguments: ($doc_obj, $filename); read_into_doc_obj() passes the full
# path of the file being processed as $filename.
sub add_associated_files {
    my $self = shift(@_);
    my ($doc_obj, $filename) = @_;

    # add in the cover image
    if ($self->{'cover_image'}) {
        $self->associate_cover_image($doc_obj, $filename);
    }

    # store the original (used for eg TextPlugin to store the original for OAI)
    if ($self->{'store_original_file'}) {
        $self->associate_source_file($doc_obj, $filename);
    }
}
[1242]	711
# Implement this if you are extracting metadata for other documents.
#
# Arguments: ($pluginfo, $base_dir, $file, $block_hash, $extrametakeys,
#             $extrametadata, $extrametafile, $processor, $gli, $aux)
#
# Returns undef when can_process_this_file_for_metadata() rejects the
# file; otherwise 1, meaning the file is recognised although nothing is
# actually done with it here.
sub metadata_read {
    my $self = shift(@_);
    my ($pluginfo, $base_dir, $file, $block_hash,
        $extrametakeys, $extrametadata, $extrametafile,
        $processor, $gli, $aux) = @_;

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    return undef unless $self->can_process_this_file_for_metadata($filename_full_path);

    # we recognise the file, but don't actually do anything with it
    return 1;
}
	725
	726
# The BaseImporter read() function. This function calls read_into_doc_obj()
# to ensure all the right things to make general options work for a
# given plugin are done; that in turn calls the process() function which
# does all the work specific to a plugin (like the old read functions
# used to do). Most plugins should define their own process() function
# and let this read() function keep control.
#
# Recursive plugins (e.g. RecPlug) and specialized plugins like those
# capable of processing many documents within a single file (e.g.
# GMLPlug) might want to implement their own version of read(), but
# more likely need to implement their own version of read_into_doc_obj().
#
# Arguments: ($pluginfo, $base_dir, $file, $block_hash, $metadata,
#             $processor, $maxdocs, $total_count, $gli)
#
# Return number of files processed, undef if can't recognise, -1 if can't
# process.
sub read {
    my $self = shift(@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    return undef unless $self->can_process_this_file($filename_full_path);

    my ($process_status, $doc_obj) = $self->read_into_doc_obj(@_);

    my $doc_ready = (defined $process_status) && ($process_status == 1);
    if ($doc_ready) {
        # process the document
        $processor->process($doc_obj);

        $self->{'num_processed'}++;
        undef $doc_obj;
    }

    # delete any temp files that we may have created
    $self->clean_up_after_doc_obj_processing();

    # if process_status == 1, then the file has been processed.
    return $process_status;
}
	771
# Plugin specific processing; must be implemented by every concrete
# plugin. An implementation returns undef if the file is rejected by the
# plugin.
#
# This base version only reports that it must be implemented, prints the
# location it was called from, and dies.
sub process {
    my $self = shift(@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    gsprintf(STDERR, "BaseImporter::process {common.must_be_implemented}\n");

    # identify the caller of the method that called process()
    my ($cpackage, $cfilename, $cline, $csubr, $chas_args, $cwantarray) = caller(1);
    print STDERR "Calling method: $cfilename:$cline $cpackage->$csubr\n";

    die "\n";

    return undef; # never gets here
}
	786
# Called by read() after a document has been handled. Override this
# method to delete any temp files that have been created; the base
# implementation does nothing.
sub clean_up_after_doc_obj_processing {
    my $self = shift(@_);
}
[16390]	792
[1219]	793
[10280]	794
# Derive a title from a filename: every underscore becomes a space and
# the final ".extension" (if any) is removed.
#
# Arguments: ($file)
# Returns the derived title; $file itself is left untouched.
sub filename_based_title
{
    my $self = shift(@_);
    my ($file) = @_;

    (my $title = $file) =~ tr/_/ /;
    $title =~ s/\.[^.]+$//;

    return $title;
}
	806
[9398]	807
# If the section has no Title yet (undefined or empty), assign one derived
# from a filename.
#
# Arguments:
#   $doc_obj - document object to update
#   $section - section whose Title is checked and set
#   $file    - raw filename used when the section has no Source metadata
#
# The title is derived from the section's Source metadata when present;
# otherwise from $file, passed through unicode::raw_filename_to_url_encoded.
sub title_fallback
{
    my $self = shift (@_);
    my ($doc_obj, $section, $file) = @_;

    my $existing_title = $doc_obj->get_metadata_element($section, "Title");

    if (!defined $existing_title || $existing_title eq "") {

        my $source_file = $doc_obj->get_metadata_element($section, "Source");
        my $file_derived_title;
        if (defined $source_file) {
            $file_derived_title = $self->filename_based_title($source_file);
        }
        else {
            my $raw_title = $self->filename_based_title($file);
            # Assign to the outer variable: re-declaring it with "my" here
            # would leave the title that gets stored below undefined.
            $file_derived_title = &unicode::raw_filename_to_url_encoded($raw_title);
        }

        if (!defined $existing_title) {
            $doc_obj->add_utf8_metadata ($section, "Title", $file_derived_title);
        }
        else {
            # an empty Title exists: overwrite it rather than add a second one
            $doc_obj->set_utf8_metadata_element ($section, "Title", $file_derived_title);
        }
    }
}
[23457]	839
# add any extra metadata that's been passed around from one
# plugin to another.
# extra_metadata uses add_utf8_metadata so it expects metadata values
# to already be in utf8
#
#   $doc_obj    - document object that receives the metadata
#   $cursection - section the metadata is added to by default
#   $metadata   - hash reference: field name => value or array ref of values
#
# Fields are handled in sorted order. Two field names are special:
#   gsdlassocfile_tobe - list of "filename:mimetype:" entries naming files
#                        to associate with the document
#   gsdlzipfilename    - name of the zip file the document came out of
# Any other field is added as metadata. A field named
# "<name>///Section/<number>" is added as <name> to section <number>
# instead of to $cursection.
sub extra_metadata {
    my $self = shift (@_);
    my ($doc_obj, $cursection, $metadata) = @_;

    my $associate_tail_re = $self->{'associate_tail_re'};

    # Sort the extra metadata for diffcol so these meta appear in a consistent order
    # in doc.xml. Necessary for the ex.PDF.* and ex.File.* meta that's extracted in
    # the PDFBox collection, as the order of these varies between CentOS and Ubuntu.
    foreach my $field (sort keys(%$metadata)) {
        # $metadata->{$field} may be an array reference
        if ($field eq "gsdlassocfile_tobe") {
            # 'gsdlassocfile_tobe' is artificially introduced metadata
            # that is used to signal that certain additional files should
            # be tied to this document. Useful in situations where a
            # metadata pass in the plugin pipeline works out some files
            # need to be associated with a document, but the document hasn't
            # been formed yet.
            my $equiv_form = "";
            foreach my $gaf (@{$metadata->{$field}}) {
                # each entry has the form "<filename>:<mimetype>:"; the greedy
                # first group keeps any colons that are part of the filename
                my ($full_filename,$mimetype) = ($gaf =~ m/^(.*):(.*):$/);
                if (!defined $full_filename) {
                    # nothing sensible can be associated for a malformed entry
                    print STDERR "BaseImporter::extra_metadata: ignoring malformed gsdlassocfile_tobe entry '$gaf'\n";
                    next;
                }

                my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
                # a name without any directory part is its own tail
                $tail_filename = $full_filename unless defined $tail_filename;

                # we need to make sure the filename is valid utf-8 - we do
                # this by url or base64 encoding it
                # $tail_filename is the name that we store the file as
                $tail_filename = &util::rename_file($tail_filename, $self->{'file_rename_method'});
                $doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
                $doc_obj->associate_source_file($full_filename);
                # If the filename is url_encoded, we need to encode the % signs
                # in the filename, so that it works in a url
                my $url_tail_filename = &unicode::filename_to_url($tail_filename);

                # work out extended tail extension (i.e. matching tail re)
                my ($file_prefix,$file_extended_ext)
                    = &util::get_prefix_and_tail_by_regex($tail_filename,$associate_tail_re);
                # the part of the extended extension in front of its last dot
                my ($pre_doc_ext) = ($file_extended_ext =~ m/^(.*)\..*$/);
                # everything after the last dot of the stored filename
                my ($doc_ext) = ($tail_filename =~ m/^.*\.(.*)$/);

                # the greenstone 2 stuff
                my $start_doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/{Or}{[parent(Top):assocfilepath],[assocfilepath]}/$url_tail_filename\">";
                my $start_doclink_gs3 = "<a href=\'_httpprefix_/collect/[collection]/index/assoc/[assocfilepath]/$url_tail_filename\'>";

                my $srcicon = "_icon".$doc_ext."_";
                my $end_doclink = "</a>";

                my $assoc_form = "$start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";

                if (defined $pre_doc_ext && $pre_doc_ext ne "") {
                    # for metadata such as [mp3._edited] [mp3._full] ...
                    $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.$pre_doc_ext", $assoc_form);
                }

                # for multiple metadata such as [mp3.assoclink]
                $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.assoclink", $assoc_form);

                $equiv_form .= " $assoc_form";

                # following are used for greenstone 3,
                $doc_obj->add_utf8_metadata ($cursection, "equivDocLink", $start_doclink_gs3);
                $doc_obj->add_utf8_metadata ($cursection, "equivDocIcon", $srcicon);
                $doc_obj->add_utf8_metadata ($cursection, "/equivDocLink", $end_doclink);
            }
            $doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
        }
        elsif ($field eq "gsdlzipfilename") {
            # special case for when files have come out of a zip. source_path
            # (used for archives dbs and keeping track for incremental import)
            # must be set to the zip file name
            my $zip_filename = $metadata->{$field};
            # overwrite the source_path
            $doc_obj->set_source_path($zip_filename);
            # and set the metadata
            $zip_filename = &util::filename_within_collection($zip_filename);
            $zip_filename = $doc_obj->encode_filename($zip_filename, $self->{'file_rename_method'});
            $doc_obj->add_utf8_metadata ($cursection, $field, $zip_filename);
        }
        else {
            # A field named "<name>///Section/<number>" carries its own
            # target section; everything else goes into $cursection.
            my ($target_section, $target_field) = ($cursection, $field);
            if ($field =~ /(.+?)\/\/\/Section\/([\d.]*)/m) {
                ($target_field, $target_section) = ($1, $2);
            }

            # a field holds either one value or an array reference of values
            my $values = $metadata->{$field};
            foreach my $value ((ref ($values) eq "ARRAY") ? @$values : ($values)) {
                $doc_obj->add_utf8_metadata ($target_section, $target_field, $value);
            }
        }
    }
}
	949
[1396]	950
# Add this plugin's counters to a running total.
#   $stats - hash reference whose 'num_processed', 'num_not_processed' and
#            'num_archives' entries are increased by this plugin's values
# Returns the updated 'num_archives' total.
sub compile_stats {
    my ($self, $stats) = @_;

    foreach my $counter ('num_processed', 'num_not_processed', 'num_archives') {
        $stats->{$counter} += $self->{$counter};
    }

    return $stats->{'num_archives'};
}
# Associate the original source file with the top section of a document and
# record the document's source file name as metadata.
#   $doc_obj  - document object to update
#   $filename - file to associate
sub associate_source_file {
    my ($self, $doc_obj, $filename) = @_;

    my $top_section = $doc_obj->get_top_section();
    my $stored_name = $doc_obj->get_assocfile_from_sourcefile();

    # no mime type is given for the associated file
    $doc_obj->associate_file($filename, $stored_name, undef, $top_section);

    # srclink_file is now deprecated because of the "_" in the metadataname.
    # Use srclinkFile; both are still written.
    $doc_obj->add_utf8_metadata($top_section, "srclink_file", $doc_obj->get_sourcefile());
    return $doc_obj->add_utf8_metadata($top_section, "srclinkFile", $doc_obj->get_sourcefile());
}
[2785]	972
# Look for a cover image next to a document and, if one exists, associate it
# with the document as "cover.jpg" and flag the top section with hascover=1.
#   $doc_obj  - document object to update
#   $filename - document filename; its extension is swapped for .jpg, and
#               then .JPG, to find the candidate image
# Filenames for which neither image exists are remembered in
# $self->{'covers_missing_cache'} so they are not checked again.
sub associate_cover_image {
    my ($self, $doc_obj, $filename) = @_;

    my $upgraded_filename = &util::upgrade_if_dos_filename($filename);

    # both spellings of the name get the .jpg extension
    foreach my $candidate ($filename, $upgraded_filename) {
        $candidate =~ s/\.[^\\\/\.]+$/\.jpg/;
    }

    # don't stat() for existence e.g. for multiple document input files
    # (eg SplitPlug)
    return if exists $self->{'covers_missing_cache'}->{$upgraded_filename};

    my $top_section = $doc_obj->get_top_section();

    # lower case extension first
    if (&FileUtils::fileExists($upgraded_filename)) {
        $doc_obj->associate_source_file($filename);
        $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
        return $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
    }

    # then the upper case variant
    (my $upper_filename = $filename) =~ s/jpg$/JPG/;
    (my $upgraded_upper_filename = $upgraded_filename) =~ s/jpg$/JPG/;

    if (&FileUtils::fileExists($upgraded_upper_filename)) {
        $doc_obj->associate_source_file($upper_filename);
        $doc_obj->associate_file($upper_filename, "cover.jpg", "image/jpeg");
        return $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
    }

    # file doesn't exist, so record the fact that it's missing so
    # we don't stat() again (stat is slow)
    $self->{'covers_missing_cache'}->{$upgraded_filename} = 1;
    return 1;
}
	1014
[11332]	1015
	1016	# Overridden by exploding plugins (eg. ISISPlug)
	1017	sub clean_up_after_exploding
	1018	{
	1019	my $self = shift(@_);
	1020	}
	1021
	1022
[16390]	1023
[4]	1024	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: