Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

source: main/trunk/greenstone2/perllib/plugins/CSVPlugin.pm@ 36480

Last change on this file since 36480 was 36480, checked in by kjdon, 20 months ago
new and improved CSVPlugin - handles input encoding, spaces inside fields, assigning metadata to source docs as well as generating dummy docs
File size: 13.3 KB

Rev	Line
[36480]	1	###########################################################################
	2	#
	3	# CSVPlugin.pm -- A plugin for files in comma-separated value format
	4	#
	5	# A component of the Greenstone digital library software
	6	# from the New Zealand Digital Library Project at the
	7	# University of Waikato, New Zealand.
	8	#
	9	# Copyright 2006 New Zealand Digital Library Project
	10	#
	11	# This program is free software; you can redistribute it and/or modify
	12	# it under the terms of the GNU General Public License as published by
	13	# the Free Software Foundation; either version 2 of the License, or
	14	# (at your option) any later version.
	15	#
	16	# This program is distributed in the hope that it will be useful,
	17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	19	# GNU General Public License for more details.
	20	#
	21	# You should have received a copy of the GNU General Public License
	22	# along with this program; if not, write to the Free Software
	23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	24	#
	25	###########################################################################
	26
	27	package NewCSVPlugin;
	28
	29	use extrametautil;
	30
	31	use ReadTextFile;
	32	use SplitTextFile; # for a couple routines, but we not inheriting
	33	use MetadataRead;
	34	use CSVFieldSeparator;
	35
	36	use strict;
	37	no strict 'refs'; # allow filehandles to be variables and viceversa
	38
	39	use Text::CSV;
	40
	41	sub BEGIN {
	42	@NewCSVPlugin::ISA = ('MetadataRead', 'ReadTextFile', 'CSVFieldSeparator');
	43	binmode(STDERR, ":utf8");
	44
	45	}
	46
	47
	48	my $arguments =
	49	[
	50	{ 'name' => "process_exp",
	51	'desc' => "{BaseImporter.process_exp}",
	52	'type' => "regexp",
	53	'reqd' => "no",
	54	'deft' => &get_default_process_exp() },
	55	{ 'name' => "filename_field",
	56	'desc' => "{CSVPlugin.filename_field}",
	57	'type' => "string",
	58	'reqd' => "no",
	59	'deft' => "Filename" }
	60	];
	61
	62
	63	my $options = { 'name' => "NewCSVPlugin",
	64	'desc' => "{CSVPlugin.desc}",
	65	'abstract' => "no",
	66	'inherits' => "yes",
	67	'explodes' => "yes",
	68	'args' => $arguments };
	69
	70
	71	# This plugin processes files with the suffix ".csv"
	72	sub get_default_process_exp {
	73	return q^(?i)(\.csv)$^;
	74	}
	75
	76	sub new
	77	{
	78	my ($class) = shift (@_);
	79	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
	80	push(@$pluginlist, $class);
	81
	82	push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
	83	push(@{$hashArgOptLists->{"OptList"}}, $options);
	84
	85	new CSVFieldSeparator($pluginlist, $inputargs, $hashArgOptLists);
	86	my $self = new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists);
	87
	88	$self->{'textcat_store'} = {};
	89	$self->{'metapass_srcdoc'} = {}; # which segments have valid metadata_srcdoc
	90	return bless $self, $class;
	91	}
	92
	93	# mark the file as a metadata file
	94	sub file_block_read {
	95	my $self = shift (@_);
	96	my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;
	97
	98	my $filename_full_path = &FileUtils::filenameConcatenate($base_dir, $file);
	99	return undef unless $self->can_process_this_file($filename_full_path);
	100
	101	if (($ENV{'GSDLOS'} =~ m/^windows$/) && ($^O ne "cygwin")) {
	102	# convert to full name - paths stored in block hash are long filenames
	103	$filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);
	104	}
	105	# kjdon - upgrade method converts everyhting to lower case drive letter.
	106	# so would we need the following stuff???
	107	# my $lower_drive = $filename_full_path;
	108	# $lower_drive =~ s/^([A-Z]):/\l$1:/i;
	109
	110	# my $upper_drive = $filename_full_path;
	111	# $upper_drive =~ s/^([A-Z]):/\u$1:/i;
	112
	113	# $block_hash->{'metadata_files'}->{$lower_drive} = 1;
	114	# $block_hash->{'metadata_files'}->{$upper_drive} = 1;
	115
	116	# }
	117	# else {
	118	### $block_hash->{'metadata_files'}->{$filename_full_path} = 1;
	119	# }
	120	$block_hash->{'metadata_files'}->{$filename_full_path} = 1;
	121	return undef; #1
	122	}
	123
	124	sub metadata_read
	125	{
	126	my $self = shift (@_);
	127	my ($pluginfo, $base_dir, $file, $block_hash,
	128	$extrametakeys, $extrametadata, $extrametafile,
	129	$processor, $gli, $aux) = @_;
	130
	131	# can we process this file??
	132	my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
	133	return undef unless $self->can_process_this_file_for_metadata($filename_full_path);
	134
	135	print STDERR "\n<Processing n='$file' p='NewCSVPlugin'>\n" if ($gli);
	136	print STDERR "NewCSVPlugin: processing $file\n" if ($self->{'verbosity'}) > 1;
	137
	138	my $outhandle = $self->{'outhandle'};
	139	my $failhandle = $self->{'failhandle'};
	140	my $verbosity = $self->{'verbosity'};
	141
	142	# don't add to block list, as we may do some processing in read.
	143
	144	# Do encoding stuff
	145	my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path);
	146	if ($self->{'verbosity'} > 2) {
	147	print $outhandle "NewCSVPlugin: reading $file as ($content_encoding,$language)\n";
	148	}
	149	# store these values for read
	150	my $le_rec = { 'language' => $language, 'encoding' => $content_encoding };
	151	$self->{'textcat_store'}->{$file} = $le_rec;
	152
	153	my $metadata_store = {};
	154	$self->{'metadata_store'}->{$file} = $metadata_store; # used to record metadata for segments with no src doc
	155
	156	my $CSV_FILE;
	157	open($CSV_FILE, "<:encoding($content_encoding)", "$filename_full_path");
	158	my $separate_char = $self->{'csv_field_separator'};
	159
	160	my $md_val_sep = $self->{'metadata_value_separator'};
	161	undef $md_val_sep if ($md_val_sep eq "");
	162
	163	my $csv_file_field_line;
	164	if ($separate_char =~ m/^auto$/i) {
	165
	166	$csv_file_field_line = <$CSV_FILE>;
	167	$separate_char = $self->resolve_auto($csv_file_field_line,$self->{'plugin_type'});
	168	seek $CSV_FILE, 0, 0; # move pointer back to start of file, as we want to read in the fields using csv.
	169	}
	170
	171	my $csv = Text::CSV->new();
	172	$csv->sep_char($separate_char);
	173	$csv->binary(1);
	174
	175	my @csv_file_fields = undef;
	176
	177	my $first_row = $csv->getline ($CSV_FILE);
	178	if (defined $first_row) {
	179	@csv_file_fields = @$first_row;
	180	}
	181	else {
	182	$self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "Error: Badly formatted CSV header line: $csv_file_field_line");
	183	return -1;
	184	}
	185
	186	my $found_filename_field = 0;
	187	my $filename_field = $self->{'filename_field'};
	188	print STDERR "looking for $filename_field field\n";
	189	for (my $i = 0; $i < scalar(@csv_file_fields); $i++) {
	190	# Remove any spaces from the field names, and surrounding quotes too
	191	$csv_file_fields[$i] =~ s/ //g;
	192	$csv_file_fields[$i] =~ s/^"//;
	193	$csv_file_fields[$i] =~ s/"$//;
	194
	195	if ($csv_file_fields[$i] eq $filename_field) {
	196	$found_filename_field = 1;
	197	}
	198	}
	199
	200
	201	if (!$found_filename_field) {
	202	$self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "No $filename_field field in CSV file, metadata cannot be assigned to documents, will use metadata only dummy documents");
	203
	204	}
	205
	206	my $count = 0;
	207	while (my $csv_line = $csv->getline($CSV_FILE)) {
	208	my @md_vals = @$csv_line;
	209	$count++;
	210
	211	# Build a hash of metadata name to metadata value for this line
	212	my %csv_line_metadata;
	213
	214	my $md_vals_len = scalar(@md_vals);
	215
	216	for (my $i=0; $i<$md_vals_len; $i++) {
	217	my $md_val = $md_vals[$i];
	218	print STDERR "$count: md val = $md_val\n";
	219	# Only bother with non-empty values
	220	if ($md_val ne "" && defined($csv_file_fields[$i])) {
	221
	222	my $md_name = $csv_file_fields[$i];
	223	$csv_line_metadata{$md_name} = [];
	224	if (defined $md_val_sep) {
	225
	226	my @within_md_vals = split(/${md_val_sep}/,$md_val);
	227
	228	# protect square brackets in metadata values by hex entity encoding them
	229	# As unescaped square bracket chars in metadata
	230	# have special meaning in GS' Java runtime code
	231	my @escaped_within_md_vals = ();
	232	for my $meta_value (@within_md_vals) {
	233
	234	$meta_value =~ s/\[/&\#091;/g;
	235	$meta_value =~ s/\]/&\#093;/g;
	236	push(@escaped_within_md_vals, $meta_value);
	237	}
	238	push (@{$csv_line_metadata{$md_name}}, @escaped_within_md_vals);
	239	}
	240	else {
	241	# protect square brackets in metadata values by hex entity encoding them
	242	my $escaped_metadata_value = $md_val;
	243	$escaped_metadata_value =~ s/\[/&\#091;/g;
	244	$escaped_metadata_value =~ s/\]/&\#093;/g;
	245	push (@{$csv_line_metadata{$md_name}}, $escaped_metadata_value);
	246	}
	247	}
	248	}
	249
	250	my $csv_line_section_array = $csv_line_metadata{"Section"};
	251	my $section_suffix = "";
	252	if (defined $csv_line_section_array) {
	253	my $section_value = shift(@$csv_line_section_array);
	254	if ($section_value =~ /[\d.]+/m){
	255	my $section_suffix = "///Section/" . $section_value;
	256	foreach my $metaname (keys %csv_line_metadata) {
	257	my $new_name = $metaname . $section_suffix;
	258	$csv_line_metadata{$new_name} = delete $csv_line_metadata{$metaname};
	259	}
	260	} else{
	261	unshift(@$csv_line_section_array, $section_value);
	262	}
	263	}
	264
	265	# do we have filename field?
	266	# We can't associate any metadata without knowing the file to associate it with
	267	my $has_srcdoc = 0;
	268	my $csv_line_filename="";;
	269	if ($found_filename_field) {
	270	# is there a srcdoc mentioned?
	271	my $csv_line_filename_array = $csv_line_metadata{$filename_field};
	272	if (!defined $csv_line_filename_array) {
	273	$self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "No $filename_field metadata in CSV line num $count");
	274	} else {
	275	$csv_line_filename = shift(@$csv_line_filename_array);
	276	# TODO - have an option for whether we do this or not
	277	if (&FileUtils::fileExists(&FileUtils::filenameConcatenate($base_dir, $csv_line_filename))) {
	278	$has_srcdoc = 1;
	279
	280	delete $csv_line_metadata{$filename_field};
	281	}
	282	}
	283
	284	}
	285	if ($has_srcdoc) {
	286	print STDERR "storing meta in extra meta for $csv_line_filename\n";
	287	$self->store_meta_in_extrametadata($csv_line_filename, \%csv_line_metadata, $file, $filename_full_path, $extrametakeys, $extrametadata, $extrametafile);
	288	} else {
	289	print STDERR "storing meta for $count, $csv_line_filename\n";
	290	$metadata_store->{$count} = \%csv_line_metadata;
	291	}
	292	} # while csv_line = csv->getline
	293	close ($CSV_FILE);
	294	}
	295
	296	#adapted from read in splittextfile
	297	sub read {
	298	my $self = shift (@_);
	299	my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
	300	my $outhandle = $self->{'outhandle'};
	301	my $verbosity = $self->{'verbosity'};
	302
	303	# can we process this file??
	304	my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
	305	return undef unless $self->can_process_this_file($filename_full_path);
	306
	307	$file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
	308
	309	my $le_rec = $self->{'textcat_store'}->{$file};
	310	if (!defined $le_rec) {
	311	# means no text was found;
	312	return 0; # not processed but no point in passing it on
	313	}
	314
	315	print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
	316	print $outhandle "$self->{'plugin_type'} processing $file\n"
	317	if $self->{'verbosity'} > 1;
	318
	319	my $language = $le_rec->{'language'};
	320	my $encoding = $le_rec->{'encoding'};
	321	$self->{'textcat_store'}->{$file} = undef;
	322
	323	my $metadata_store = $self->{'metadata_store'}->{$file}; # a hash of seg num to metadata hash
	324
	325
	326	# Process each segment in turn
	327	my $segment = 0; #which segment/record number we have
	328	my $count = 0; # num doc objs produced
	329
	330	my ($filemeta) = $file =~ /([^\\\/]+)$/; #why?
	331	my $plugin_filename_encoding = $self->{'filename_encoding'};
	332	my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
	333
	334	my $id;
	335	print STDERR "num keys = ".scalar(keys (%$metadata_store))."\n";
	336	foreach $segment (sort keys (%$metadata_store)) {
	337	print $outhandle "processing segment $segment\n"
	338	if $self->{'verbosity'} > 1;
	339	$count++;
	340	# create a new document
	341	my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
	342	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
	343	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
	344
	345	$self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
	346
	347	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$segment");
	348	if ($self->{'cover_image'}) {
	349	$self->associate_cover_image($doc_obj, $filename_full_path);
	350	}
	351	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
	352
	353	# include any metadata passed in from previous plugins
	354	# note that this metadata is associated with the top level section
	355	$self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
	356
	357	# add our stored metadata from metadata_read pass
	358	my $segment_metadata = $metadata_store->{$segment};
	359	$self->extra_metadata($doc_obj, $doc_obj->get_top_section(), $segment_metadata);
	360
	361	# do any automatic metadata extraction - does this make sense??
	362	#$self->auto_extract_metadata ($doc_obj);
	363
	364	# Calculate a "base" document ID.
	365	if (!defined $id) {
	366	$id = &SplitTextFile::get_base_OID($self,$doc_obj);
	367	}
	368
	369	# add an OID
	370	&SplitTextFile::add_segment_OID($self, $doc_obj, $id, $segment);
	371
	372	# process the document
	373	$processor->process($doc_obj);
	374
	375	$self->{'num_processed'} ++;
	376	if ($maxdocs != -1 && $self->{'num_processed'} >= $maxdocs) {
	377	last;
	378	}
	379	}
	380
	381	delete $self->{'metadata_store'}->{$file};
	382
	383	# Return number of document objects produced
	384	return $count;
	385	}
	386
	387	sub print_error
	388	{
	389
	390	my $self = shift(@_);
	391	my ($outhandle, $failhandle, $gli, $file, $error) = @_;
	392
	393	print $outhandle "NewCSVPlugin Error: $file: $error\n";
	394	print $failhandle "NewCSVPlugin Error: $file: $error\n";
	395	print STDERR "<ProcessingError n='$file' r='$error'/>\n" if ($gli);
	396	}
	397
	398
	399	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: