Context Navigation

PJPlugin.pm@ 32147

Last change on this file since 32147 was 32147, checked in by kjdon, 6 years ago
take the image and ocr files from nzdl-storage instead of copying them into import
File size: 5.6 KB

Line
1	package PJPlugin;
2
3	use PagedImagePlugin;
4
5	use strict;
6	no strict 'refs'; # allow filehandles to be variables and viceversa
7	no strict 'subs';
8
# Set up inheritance at compile time: PJPlugin is a subclass of
# PagedImagePlugin, so any method not defined here is looked up there.
BEGIN {
    @PJPlugin::ISA = qw(PagedImagePlugin);
}
12
# This plugin declares no arguments of its own.
my $arguments = [];

# Plugin description record; it is appended to the "OptList" in new().
my $options = {
    'name'     => "PJPlugin",
    'desc'     => "PagedImagePlugin variant for pei-jones written-works collection",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};

# Directory that page image and text files are read from. Every file name
# found in an item file is appended to this path (note the trailing slash).
my $files_dir = "/nzdl-storage/other-projects/pei-jones/Jones_Collection/";
22
# Start-of-import hook.
#
# Records $files_dir as this plugin's 'base_dir' (used for the file cache).
# The $base_dir argument supplied by the caller is not used, and neither are
# $pluginfo, $processor or $maxdocs.
#
# NOTE(review): no parent-class begin() is invoked from here - confirm that
# skipping it is intended.
#
# Returns the value stored in 'base_dir'.
sub begin {
    my ($self, $pluginfo, $base_dir, $processor, $maxdocs) = @_;

    return $self->{'base_dir'} = $files_dir;
}
30
31
# Constructor.
#
# Parameters:
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through untouched to the parent constructor
#   $hashArgOptLists - hash ref; this plugin's arguments are appended to its
#                      "ArgList" entry and its options record to "OptList"
#
# Returns the object built by PagedImagePlugin's constructor, re-blessed into
# $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit class-method call. The indirect-object form
    # "new PagedImagePlugin(...)" is ambiguous to the parser inside a package
    # that defines its own new().
    my $self = PagedImagePlugin->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
44
45
# Build the document object described by one item file.
#
# Apart from blank lines and "#" comment lines, an item file holds two kinds
# of line:
#   <name> value
#       document-level metadata, stored on the top section as "pj.<name>"
#       (values that are exactly "unknown" are dropped)
#   page:imagefilename:textfilename:r
#       one page of the document; the trailing "r" is optional and means
#       rotate the image 180 deg
# Image and text file names are resolved against $files_dir, not against the
# directory holding the item file.
#
# Parameters:
#   $filename_full_path - path of the item file to read
#   $dir                - not used here
#   $filename_no_path   - not used here
#   $processor          - passed on to set_initial_doc_fields
#   $metadata           - passed on to set_initial_doc_fields
#
# Returns the populated doc object. Dies if the item file cannot be opened.
sub process_item {
    my $self = shift (@_);
    my ($filename_full_path, $dir, $filename_no_path, $processor, $metadata) = @_;

    my $doc_obj = doc->new($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
    $self->set_initial_doc_fields($doc_obj, $filename_full_path, $processor, $metadata);
    my $topsection = $doc_obj->get_top_section();

    # simple item files are always paged unless user specified
    my $doc_type = ($self->{'documenttype'} eq "auto") ? "paged" : $self->{'documenttype'};
    $doc_obj->set_utf8_metadata_element($topsection, "gsdlthistype", $doc_type);

    # Lexical handle so the file is not shared through a package-global
    # bareword; $! explains why the open failed.
    open(my $item_fh, "<:encoding(UTF-8)", $filename_full_path)
        or die "couldn't open $filename_full_path: $!\n";

    my $num = 0;    # number of page lines seen
    while (defined(my $line = <$item_fh>)) {
        next unless $line =~ /\w/;
        chomp $line;
        next if $line =~ /^#/;    # ignore comment lines

        # Metadata line: "<name> value", with surrounding whitespace trimmed
        # from the value.
        if ($line =~ /^<([^>]*)>\s*(.*?)\s*$/) {
            my $meta_name  = $1;
            my $meta_value = $2;

            # PJ mod: don't add in unknown values; set all metadata at pj.
            if ($meta_value !~ /^unknown$/) {
                $doc_obj->set_utf8_metadata_element($topsection, "pj." . $meta_name, $meta_value);
            }
            next;
        }

        # Anything else is a page line:
        # page:imagefilename:textfilename:r - the r is optional
        $num++;
        $line =~ s/^\s+//;    # remove space at the front
        $line =~ s/\s+$//;    # remove space at the end
        my ($pagenum, $imgname, $txtname, $rotate) = split /:/, $line;

        # create a new section for each page
        my $cursection = $doc_obj->insert_section($doc_obj->get_end_child($topsection));
        # the page number becomes the Title
        $doc_obj->set_utf8_metadata_element($cursection, 'Title', $pagenum);

        # process the image for this page if there is one
        if (defined $imgname && $imgname ne "") {
            # Only report an image path when the page actually names one.
            print STDERR "image file $files_dir$imgname\n";

            my $result1 = $self->process_image($files_dir . $imgname, $imgname, $doc_obj, $cursection, $rotate);
            if (!defined $result1) {
                print "PagedImagePlugin: couldn't process image \"$files_dir$imgname\" for item \"$filename_full_path\"\n";
            }
        }

        # process the text file if one is there
        if (defined $txtname && $txtname ne "") {
            my $result2 = $self->process_text($files_dir . $txtname, $txtname, $doc_obj, $cursection);
            if (!defined $result2) {
                # Report the same path that was handed to process_text.
                print "PagedImagePlugin: couldn't process text file \"$files_dir$txtname\" for item \"$filename_full_path\"\n";
                $self->add_dummy_text($doc_obj, $cursection);
            }
        }
        else {
            # otherwise add in some dummy text
            $self->add_dummy_text($doc_obj, $cursection);
        }
    }

    close $item_fh;

    # add numpages metadata
    $doc_obj->set_utf8_metadata_element($topsection, 'NumPages', "$num");

    # Record the MaxImageWidth/MaxImageHeight values held on $self, then
    # clear them on $self before returning.
    $doc_obj->set_utf8_metadata_element($topsection, "MaxImageWidth",  $self->{'MaxImageWidth'});
    $doc_obj->set_utf8_metadata_element($topsection, "MaxImageHeight", $self->{'MaxImageHeight'});
    $self->{'MaxImageWidth'}  = undef;
    $self->{'MaxImageHeight'} = undef;

    return $doc_obj;
}
133
# Read an item file and register every image and text file it names with
# block_raw_filename.
#
# Blank lines, "#" comment lines and "<name> value" metadata lines are
# skipped. Every remaining line is treated as
#   page:imagefilename:textfilename:r
# and each non-empty file name is joined onto $dir before being blocked.
#
# Parameters:
#   $filename_full_path - path of the item file to scan
#   $dir                - directory the listed file names are joined onto
#   $block_hash         - passed through to block_raw_filename
#
# Dies if the item file cannot be opened.
sub scan_item_for_files_to_block
{
    my $self = shift (@_);
    my ($filename_full_path, $dir, $block_hash) = @_;

    # Three-argument open with a lexical handle: the mode cannot be altered
    # by characters in the file name, and no global handle is shared.
    open(my $item_fh, "<", $filename_full_path)
        or die "couldn't open $filename_full_path to work out which files to block: $!\n";

    while (defined(my $line = <$item_fh>)) {
        next unless $line =~ /\w/;
        chomp $line;
        next if $line =~ /^#/;                       # ignore comment lines
        next if $line =~ /^<([^>]*)>\s*(.*?)\s*$/;   # ignore metadata lines

        # line should be like page:imagefilename:textfilename:r
        $line =~ s/^\s+//;    # remove space at the front
        $line =~ s/\s+$//;    # remove space at the end
        my ($pagenum, $imgname, $txtname, $rotate) = split /:/, $line;

        # block the image file if there is one
        if (defined $imgname && $imgname ne "") {
            $self->block_raw_filename($block_hash, FileUtils::filenameConcatenate($dir, $imgname));
        }
        # block the text file if there is one
        if (defined $txtname && $txtname ne "") {
            $self->block_raw_filename($block_hash, FileUtils::filenameConcatenate($dir, $txtname));
        }
    }
    close $item_fh;
}
167
168	1;

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/model-sites-dev/pei-jones/collect/written-works/perllib/plugins/PJPlugin.pm@ 32147

Download in other formats: