Context Navigation

PJPlugin.pm@ 32147

Last change on this file since 32147 was 32147, checked in by kjdon, 6 years ago
take the image and ocr files from nzdl-storage instead of copying them into import
File size: 5.6 KB

Rev	Line
package PJPlugin;

# PagedImagePlugin variant for the pei-jones written-works collection.
#
# Differences from the parent visible in this file:
#   * item-file metadata is stored under the "pj." namespace and values
#     equal to "unknown" are skipped (see process_item);
#   * page images and text files are read from $files_dir rather than
#     from the directory that holds the item file.

use PagedImagePlugin;

use strict;
no strict 'refs'; # allow filehandles to be variables and viceversa
no strict 'subs';

BEGIN {
    @PJPlugin::ISA = ('PagedImagePlugin');
}

# This plugin declares no command-line arguments of its own; the empty
# list is still pushed onto the shared ArgList in new().
my $arguments = [];

my $options = {
    'name'     => "PJPlugin",
    'desc'     => "PagedImagePlugin variant for pei-jones written-works collection",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};

# Directory the page image and OCR text files are read from. File names
# from the item file are appended by plain string concatenation, so this
# value must keep its trailing "/".
my $files_dir = "/nzdl-storage/other-projects/pei-jones/Jones_Collection/";
	22
	23	sub begin {
	24	my $self = shift (@_);
	25	my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
	26
	27	# Save base_dir for use in file cache
	28	$self->{'base_dir'} = $files_dir;
	29	}
	30
	31
# Constructor.
#
# Arguments:
#   $pluginlist       - array ref; this class name is appended to it
#   $inputargs        - passed through unchanged to the parent constructor
#   $hashArgOptLists  - hash ref with "ArgList" and "OptList" array refs;
#                       this plugin's arguments and options are appended
#
# Returns the object built by PagedImagePlugin's constructor, re-blessed
# into $class.
sub new {
    my ($class) = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit class-method call; the indirect-object form
    # ("new PagedImagePlugin(...)") is ambiguous to the parser.
    my $self = PagedImagePlugin->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
	44
	45
	46	sub process_item {
	47	my $self = shift (@_);
	48	my ($filename_full_path, $dir, $filename_no_path, $processor, $metadata) = @_;
	49
	50	my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
	51	$self->set_initial_doc_fields($doc_obj, $filename_full_path, $processor, $metadata);
	52	my $topsection = $doc_obj->get_top_section();
	53	# simple item files are always paged unless user specified
	54	if ($self->{'documenttype'} eq "auto") {
	55	$doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", "paged");
	56	} else {
	57	$doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", $self->{'documenttype'});
	58	}
	59	open (ITEMFILE, "<:encoding(UTF-8)", $filename_full_path) \|\| die "couldn't open $filename_full_path\n";
	60	my $line = "";
	61	my $num = 0;
	62	while (defined ($line = <ITEMFILE>)) {
	63
	64	next unless $line =~ /\w/;
	65	chomp $line;
	66	next if $line =~ /^#/; # ignore comment lines
	67	if ($line =~ /^<([^>])>\s(.?)\s$/) {
	68	my $meta_name = $1;
	69	my $meta_value = $2;
	70	#if ($meta_name =~ /\./) {
	71	# $meta_name = "ex.$meta_name";
	72	# }
	73	# PJ mod:
	74	if ($meta_value !~ /^unknown$/) {
	75	# don't add in unknown values
	76	# set all metadata at pj.
	77	$doc_obj->set_utf8_metadata_element ($topsection, "pj.".$meta_name, $meta_value);
	78	}
	79	#$meta->{$1} = $2;
	80	} else {
	81	$num++;
	82	# line should be like page:imagefilename:textfilename:r - the r is optional -> means rotate the image 180 deg
	83	$line =~ s/^\s+//; #remove space at the front
	84	$line =~ s/\s+$//; #remove space at the end
	85	my ($pagenum, $imgname, $txtname, $rotate) = split /:/, $line;
	86
	87	# PJ: use jpg versions instead of tif versions
[31940]	88	#$imgname =~ s/tif/jpg/g;
	89	#print STDERR "new img name=$imgname\n";
[31892]	90	# create a new section for each image file
	91	my $cursection = $doc_obj->insert_section($doc_obj->get_end_child($topsection));
	92	# the page number becomes the Title
	93	$doc_obj->set_utf8_metadata_element($cursection, 'Title', $pagenum);
	94
[32147]	95	print STDERR "image file $files_dir$imgname\n";
	96
[31892]	97	# process the image for this page if there is one
	98	if (defined $imgname && $imgname ne "") {
[32147]	99	my $result1 = $self->process_image($files_dir.$imgname, $imgname, $doc_obj, $cursection, $rotate);
[31892]	100	if (!defined $result1)
	101	{
[32147]	102	print "PagedImagePlugin: couldn't process image \"$files_dir$imgname\" for item \"$filename_full_path\"\n";
[31892]	103	}
	104	}
	105	# process the text file if one is there
	106	if (defined $txtname && $txtname ne "") {
[32147]	107	my $result2 = $self->process_text ($files_dir.$txtname, $txtname, $doc_obj, $cursection);
[31892]	108
	109	if (!defined $result2) {
[32147]	110	print "PagedImagePlugin: couldn't process text file \"$files_dir.$txtname\" for item \"$filename_full_path\"\n";
[31892]	111	$self->add_dummy_text($doc_obj, $cursection);
	112	}
	113	} else {
	114	# otherwise add in some dummy text
	115	$self->add_dummy_text($doc_obj, $cursection);
	116	}
	117	}
	118	}
	119
	120	close ITEMFILE;
	121
	122	# add numpages metadata
	123	$doc_obj->set_utf8_metadata_element ($topsection, 'NumPages', "$num");
	124
	125	$doc_obj->set_utf8_metadata_element($topsection,"MaxImageWidth",$self->{'MaxImageWidth'});
	126	$doc_obj->set_utf8_metadata_element($topsection,"MaxImageHeight",$self->{'MaxImageHeight'});
	127	$self->{'MaxImageWidth'} = undef;
	128	$self->{'MaxImageHeight'} = undef;
	129
	130
	131	return $doc_obj;
	132	}
	133
	134	sub scan_item_for_files_to_block
	135	{
	136	my $self = shift (@_);
	137	my ($filename_full_path, $dir, $block_hash) = @_;
	138
	139
	140	open (ITEMFILE, $filename_full_path) \|\| die "couldn't open $filename_full_path to work out which files to block\n";
	141	my $line = "";
	142	while (defined ($line = <ITEMFILE>)) {
	143	next unless $line =~ /\w/;
	144	chomp $line;
	145	next if $line =~ /^#/; # ignore comment lines
	146	next if ($line =~ /^<([^>])>\s(.?)\s$/); # ignore metadata lines
	147	# line should be like page:imagefilename:textfilename:r
	148	$line =~ s/^\s+//; #remove space at the front
	149	$line =~ s/\s+$//; #remove space at the end
	150	my ($pagenum, $imgname, $txtname, $rotate) = split /:/, $line;
	151
	152	# PJ: use jpg versions instead of tif versions
[31940]	153	#$imgname =~ s/tif/jpg/g;
[31892]	154
	155	# find the image file if there is one
	156	if (defined $imgname && $imgname ne "") {
	157	$self->block_raw_filename($block_hash, &FileUtils::filenameConcatenate( $dir,$imgname));
	158	}
	159	# find the text file if there is one
	160	if (defined $txtname && $txtname ne "") {
	161	$self->block_raw_filename($block_hash, &FileUtils::filenameConcatenate($dir,$txtname));
	162	}
	163	}
	164	close ITEMFILE;
	165
	166	}
	167
	168	1;

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/model-sites-dev/pei-jones/collect/written-works/perllib/plugins/PJPlugin.pm@ 32147

Download in other formats: