root/main/trunk/model-sites-dev/pei-jones/collect/written-works/perllib/plugins/PJPlugin.pm @ 31892

Revision 31892, 5.2 KB (checked in by kjdon, 3 years ago)

new plugin for importing the item files

Line 
1package PJPlugin;
2
3use PagedImagePlugin;
4
5use strict;
6no strict 'refs'; # allow filehandles to be variables and viceversa
7no strict 'subs';
8
# Establish the inheritance chain at compile time: PJPlugin is a subclass
# of PagedImagePlugin.
BEGIN {
    @PJPlugin::ISA = qw(PagedImagePlugin);
}

# This plugin declares no arguments of its own; the list is still pushed
# onto the shared "ArgList" in new().
my $arguments = [];

# Plugin description record; new() pushes it onto the shared "OptList".
my $options = {
    'name'     => "PJPlugin",
    'desc'     => "PagedImagePlugin variant for pei-jones written-works collection",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
20
# Constructor.
#
# Arguments:
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through untouched to the parent constructor
#   $hashArgOptLists - hash ref; this plugin's argument list is appended to
#                      its "ArgList" entry and its option record to "OptList"
#
# Returns the object built by PagedImagePlugin's constructor, re-blessed
# into $class.
sub new {
    my $class = shift;
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    # Register this class and its (empty) argument/option declarations
    # before delegating to the parent constructor.
    push @{$pluginlist}, $class;
    push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments};
    push @{ $hashArgOptLists->{"OptList"} }, $options;

    my $self = PagedImagePlugin->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
33
34
# Build a document object from one item file.
#
# Item file handling, line by line:
#   - lines containing no word character, and lines starting with '#',
#     are skipped;
#   - "<name>value" lines are stored on the top section as metadata
#     "pj.<name>", except when the value is exactly "unknown";
#   - every other line is treated as a page entry of the form
#     page:imagefilename:textfilename:r  (the trailing "r" is optional and
#     is passed to process_image as the rotate flag - rotate 180 degrees).
#     Each page entry gets its own section whose Title is the page number.
#
# Arguments:
#   $filename_full_path - path of the item file to read (UTF-8 text)
#   $dir                - prefix prepended directly to image/text file names
#   $filename_no_path   - unused here
#   $processor          - passed to set_initial_doc_fields
#   $metadata           - passed to set_initial_doc_fields
#
# Returns the populated doc object.
# Dies if the item file cannot be opened.
sub process_item {
    my $self = shift (@_);
    my ($filename_full_path, $dir, $filename_no_path, $processor, $metadata) = @_;

    my $doc_obj = doc->new($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
    $self->set_initial_doc_fields($doc_obj, $filename_full_path, $processor, $metadata);
    my $topsection = $doc_obj->get_top_section();

    # simple item files are always paged unless user specified
    my $doctype = ($self->{'documenttype'} eq "auto") ? "paged" : $self->{'documenttype'};
    $doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", $doctype);

    # Lexical handle: it cannot clash with another ITEMFILE user and is
    # released automatically if processing dies part-way through.
    open (my $item_fh, "<:encoding(UTF-8)", $filename_full_path)
        or die "couldn't open $filename_full_path: $!\n";

    my $num = 0;    # number of page entries seen
    while (defined (my $line = <$item_fh>)) {
        next unless $line =~ /\w/;
        chomp $line;
        next if $line =~ /^#/; # ignore comment lines

        if ($line =~ /^<([^>]*)>\s*(.*?)\s*$/) {
            my ($meta_name, $meta_value) = ($1, $2);
            # PJ mod: don't add in unknown values, and set all metadata at pj.
            if ($meta_value ne "unknown") {
                $doc_obj->set_utf8_metadata_element ($topsection, "pj.".$meta_name, $meta_value);
            }
            next;
        }

        # Anything else is a page entry.
        $num++;
        # line should be like page:imagefilename:textfilename:r - the r is optional -> means rotate the image 180 deg
        $line =~ s/^\s+//; #remove space at the front
        $line =~ s/\s+$//; #remove space at the end
        my ($pagenum, $imgname, $txtname, $rotate) = split /:/, $line;

        # create a new section for each image file
        my $cursection = $doc_obj->insert_section($doc_obj->get_end_child($topsection));
        # the page number becomes the Title
        $doc_obj->set_utf8_metadata_element($cursection, 'Title', $pagenum);

        # process the image for this page if there is one
        if (defined $imgname && $imgname ne "") {
            # PJ: use jpg versions instead of tif versions.
            # NOTE(review): this rewrites every "tif" in the name, not only
            # the extension (e.g. a directory or base name containing "tif");
            # left as-is because that may be intended - confirm.
            $imgname =~ s/tif/jpg/g;
            print STDERR "new img name=$imgname\n";

            my $result1 = $self->process_image($dir.$imgname, $imgname, $doc_obj, $cursection, $rotate);
            if (!defined $result1) {
                print "PagedImagePlugin: couldn't process image \"$dir$imgname\" for item \"$filename_full_path\"\n";
            }
        }

        # process the text file if one is there
        if (defined $txtname && $txtname ne "") {
            my $result2 = $self->process_text ($dir.$txtname, $txtname, $doc_obj, $cursection);
            if (!defined $result2) {
                # Report the same path that was handed to process_text
                # (no stray "." between directory and file name).
                print "PagedImagePlugin: couldn't process text file \"$dir$txtname\" for item \"$filename_full_path\"\n";
                $self->add_dummy_text($doc_obj, $cursection);
            }
        } else {
            # otherwise add in some dummy text
            $self->add_dummy_text($doc_obj, $cursection);
        }
    }

    close $item_fh;

    # add numpages metadata
    $doc_obj->set_utf8_metadata_element ($topsection, 'NumPages', "$num");

    # Record the image size limits on the document, then clear them on the
    # plugin object (presumably so they do not carry over to the next item;
    # they are not set in this file - confirm in PagedImagePlugin).
    $doc_obj->set_utf8_metadata_element($topsection,"MaxImageWidth",$self->{'MaxImageWidth'});
    $doc_obj->set_utf8_metadata_element($topsection,"MaxImageHeight",$self->{'MaxImageHeight'});
    $self->{'MaxImageWidth'} = undef;
    $self->{'MaxImageHeight'} = undef;

    return $doc_obj;
}
120
# Read an item file and block the image and text files it refers to, so
# that they are handled through the item rather than on their own.
#
# Blank lines, '#' comment lines and "<name>value" metadata lines are
# skipped; every other line is read as page:imagefilename:textfilename:r.
# Image names go through the same tif -> jpg rewrite as in process_item,
# so the jpg name is the one that gets blocked.
#
# Arguments:
#   $filename_full_path - path of the item file to scan
#   $dir                - directory joined with each referenced file name
#   $block_hash         - passed to block_raw_filename for each file found
#
# Dies if the item file cannot be opened.
sub scan_item_for_files_to_block
{
    my $self = shift (@_);
    my ($filename_full_path, $dir, $block_hash) = @_;

    # Three-argument open: the path is used literally, so surrounding
    # whitespace or mode characters in the file name cannot change what
    # gets opened. No encoding layer is applied, as in the original.
    open (my $item_fh, "<", $filename_full_path)
        or die "couldn't open $filename_full_path to work out which files to block: $!\n";

    while (defined (my $line = <$item_fh>)) {
        next unless $line =~ /\w/;
        chomp $line;
        next if $line =~ /^#/; # ignore comment lines
        next if ($line =~ /^<([^>]*)>\s*(.*?)\s*$/); # ignore metadata lines

        # line should be like page:imagefilename:textfilename:r
        $line =~ s/^\s+//; #remove space at the front
        $line =~ s/\s+$//; #remove space at the end
        my ($pagenum, $imgname, $txtname, $rotate) = split /:/, $line;

        # find the image file if there is one
        if (defined $imgname && $imgname ne "") {
            # PJ: use jpg versions instead of tif versions
            $imgname =~ s/tif/jpg/g;
            $self->block_raw_filename($block_hash, FileUtils::filenameConcatenate($dir, $imgname));
        }
        # find the text file if there is one
        if (defined $txtname && $txtname ne "") {
            $self->block_raw_filename($block_hash, FileUtils::filenameConcatenate($dir, $txtname));
        }
    }
    close $item_fh;
}
154
1551;
Note: See TracBrowser for help on using the browser.