source: main/trunk/model-sites-dev/pei-jones/collect/written-works/perllib/plugins/PJPlugin.pm@ 32147

Last change on this file since 32147 was 32147, checked in by kjdon, 6 years ago

take the image and ocr files from nzdl-storage instead of copying them into import

File size: 5.6 KB
RevLine 
[31892]1package PJPlugin;
2
3use PagedImagePlugin;
4
5use strict;
6no strict 'refs'; # allow filehandles to be variables and viceversa
7no strict 'subs';
8
# Set up inheritance at compile time: PJPlugin derives from
# PagedImagePlugin.
BEGIN {
    @PJPlugin::ISA = qw(PagedImagePlugin);
}
12
# This plugin declares no arguments of its own.
my $arguments = [];

# Plugin description record; new() appends it to the caller's "OptList".
my $options = {
    name     => 'PJPlugin',
    desc     => 'PagedImagePlugin variant for pei-jones written-works collection',
    abstract => 'no',
    inherits => 'yes',
    args     => $arguments,
};

# Directory prefix prepended to the image and text file names listed in
# item files (see process_item). It is also stored as base_dir by begin().
my $files_dir = '/nzdl-storage/other-projects/pei-jones/Jones_Collection/';
22
# Plugin start-up hook.
#
# Called with ($pluginfo, $base_dir, $processor, $maxdocs); none of these
# are used. Instead of the base_dir that was passed in, the fixed
# $files_dir is recorded in $self->{'base_dir'} (for use in the file cache).
#
# NOTE(review): the parent class's begin() is not called from here -
# confirm that skipping it is intended.
sub begin {
    my ($self) = @_;

    # Save base_dir for use in file cache
    $self->{'base_dir'} = $files_dir;
}
30
31
# Constructor.
#
# Parameters:
#   $class           - class name being constructed
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed through unchanged to the parent constructor
#   $hashArgOptLists - hash ref; this plugin's arguments are appended to
#                      its "ArgList" entry and its options record to its
#                      "OptList" entry
#
# Returns the object built by PagedImagePlugin's constructor, re-blessed
# into $class.
sub new {
    my ($class, $pluginlist, $inputargs, $hashArgOptLists) = @_;

    push @{$pluginlist}, $class;
    push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments};
    push @{ $hashArgOptLists->{"OptList"} }, $options;

    # Explicit class-method call: the indirect-object form
    # ("new PagedImagePlugin(...)") is parsed heuristically and is fragile
    # inside a package that defines its own new().
    my $self = PagedImagePlugin->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
44
45
# Build a document object from one item file.
#
# The item file is read as UTF-8, one line at a time:
#   * lines containing no word character, and lines starting with '#',
#     are skipped;
#   * lines of the form "<name> value" become metadata on the top section,
#     stored under "pj.<name>" (a value that is exactly "unknown" is
#     not stored);
#   * any other line is treated as a page record
#         page:imagefilename:textfilename:r
#     where the trailing "r" is optional and means rotate the image
#     180 degrees. Each page record creates a new child section whose
#     Title is the page field. Image and text file names are resolved
#     against $files_dir, not against $dir.
#
# Parameters:
#   $filename_full_path - path of the item file to read
#   $dir                - not used here
#   $filename_no_path   - not used here
#   $processor          - passed to set_initial_doc_fields
#   $metadata           - passed to set_initial_doc_fields
#
# Returns the populated doc object. Dies if the item file cannot be opened.
sub process_item {
    my $self = shift (@_);
    my ($filename_full_path, $dir, $filename_no_path, $processor, $metadata) = @_;

    my $doc_obj = doc->new($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
    $self->set_initial_doc_fields($doc_obj, $filename_full_path, $processor, $metadata);
    my $topsection = $doc_obj->get_top_section();

    # simple item files are always paged unless user specified
    if ($self->{'documenttype'} eq "auto") {
        $doc_obj->set_utf8_metadata_element($topsection, "gsdlthistype", "paged");
    }
    else {
        $doc_obj->set_utf8_metadata_element($topsection, "gsdlthistype", $self->{'documenttype'});
    }

    # Lexical handle, so this reader cannot clash with any other user of a
    # global bareword handle.
    open(my $item_fh, "<:encoding(UTF-8)", $filename_full_path)
        or die "couldn't open $filename_full_path: $!\n";

    my $num = 0;    # number of page records seen
    while (defined(my $line = <$item_fh>)) {
        next unless $line =~ /\w/;
        chomp $line;
        next if $line =~ /^#/;    # ignore comment lines

        if ($line =~ /^<([^>]*)>\s*(.*?)\s*$/) {
            my ($meta_name, $meta_value) = ($1, $2);

            # PJ mod: don't add in unknown values, and set all metadata
            # at pj.
            if ($meta_value !~ /^unknown$/) {
                $doc_obj->set_utf8_metadata_element($topsection, "pj." . $meta_name, $meta_value);
            }
            next;
        }

        $num++;

        # line should be like page:imagefilename:textfilename:r - the r is
        # optional -> means rotate the image 180 deg
        $line =~ s/^\s+//;    # remove space at the front
        $line =~ s/\s+$//;    # remove space at the end
        my ($pagenum, $imgname, $txtname, $rotate) = split /:/, $line;

        # create a new section for each image file
        my $cursection = $doc_obj->insert_section($doc_obj->get_end_child($topsection));

        # the page number becomes the Title
        $doc_obj->set_utf8_metadata_element($cursection, 'Title', $pagenum);

        # process the image for this page if there is one
        if (defined $imgname && $imgname ne "") {
            my $image_path = $files_dir . $imgname;

            # Trace only when there really is an image; tracing before the
            # check printed a bare directory for image-less pages.
            print STDERR "image file $image_path\n";

            my $result1 = $self->process_image($image_path, $imgname, $doc_obj, $cursection, $rotate);
            if (!defined $result1) {
                print "PagedImagePlugin: couldn't process image \"$image_path\" for item \"$filename_full_path\"\n";
            }
        }

        # process the text file if one is there
        if (defined $txtname && $txtname ne "") {
            my $text_path = $files_dir . $txtname;
            my $result2 = $self->process_text($text_path, $txtname, $doc_obj, $cursection);

            if (!defined $result2) {
                # Report the path that was actually tried (the message
                # used to contain a stray "." between directory and name).
                print "PagedImagePlugin: couldn't process text file \"$text_path\" for item \"$filename_full_path\"\n";
                $self->add_dummy_text($doc_obj, $cursection);
            }
        }
        else {
            # otherwise add in some dummy text
            $self->add_dummy_text($doc_obj, $cursection);
        }
    }

    close $item_fh;

    # add numpages metadata
    $doc_obj->set_utf8_metadata_element($topsection, 'NumPages', "$num");

    # Record the maximum image dimensions held on $self, then clear them.
    $doc_obj->set_utf8_metadata_element($topsection, "MaxImageWidth",  $self->{'MaxImageWidth'});
    $doc_obj->set_utf8_metadata_element($topsection, "MaxImageHeight", $self->{'MaxImageHeight'});
    $self->{'MaxImageWidth'}  = undef;
    $self->{'MaxImageHeight'} = undef;

    return $doc_obj;
}
133
# Read an item file and block the image and text files it refers to.
#
# Lines containing no word character, comment lines (starting with '#')
# and metadata lines ("<name> value") are skipped. Every remaining line is
# expected to look like
#     page:imagefilename:textfilename:r
# and each non-empty image/text file name is joined onto $dir and passed
# to block_raw_filename together with $block_hash.
#
# NOTE(review): names are blocked relative to $dir, whereas process_item
# reads the same files from $files_dir - confirm this is still wanted.
#
# Parameters:
#   $filename_full_path - path of the item file to scan
#   $dir                - directory the listed file names are joined onto
#   $block_hash         - passed through to block_raw_filename
#
# Dies if the item file cannot be opened.
sub scan_item_for_files_to_block
{
    my $self = shift (@_);
    my ($filename_full_path, $dir, $block_hash) = @_;

    # Three-argument open with a lexical handle: the file name is taken
    # literally, so mode characters or surrounding whitespace in it cannot
    # change what gets opened.
    open(my $item_fh, "<", $filename_full_path)
        or die "couldn't open $filename_full_path to work out which files to block: $!\n";

    while (defined(my $line = <$item_fh>)) {
        next unless $line =~ /\w/;
        chomp $line;
        next if $line =~ /^#/;                         # ignore comment lines
        next if $line =~ /^<([^>]*)>\s*(.*?)\s*$/;     # ignore metadata lines

        # line should be like page:imagefilename:textfilename:r
        $line =~ s/^\s+//;    # remove space at the front
        $line =~ s/\s+$//;    # remove space at the end

        # Only the image and text names matter here; the page and rotate
        # fields are not needed.
        my (undef, $imgname, $txtname) = split /:/, $line;

        # block the image file, then the text file, when present
        foreach my $name ($imgname, $txtname) {
            next unless defined $name && $name ne "";
            $self->block_raw_filename($block_hash, &FileUtils::filenameConcatenate($dir, $name));
        }
    }

    close $item_fh;
}
167
1681;
Note: See TracBrowser for help on using the repository browser.