Ticket #735: PagedImagePlugin.pm.patch

File PagedImagePlugin.pm.patch, 5.2 KB (added by kjdon, 13 years ago)

pagedimageplugin patch

Line 
1255c255
2< return q^(?i)(\.jpe?g|\.gif|\.png|\.tif?f|\.te?xt|\.html?|~)$^
3---
4> return q^(?i)(\.jpe?g|\.gif|\.png|\.tif?f|\.te?xt|\.html?|\.pdf|~)$^
5397a398,465
6> sub process_pdf { # Handle one PDF page file: record it as a source file and attach it to $section of $doc_obj; returns 0 if skipped, 1 otherwise
7> my $self = shift;
8> my ($filename_full_path, $filename_no_path, $doc_obj, $section) = @_;
9>
10> return 0 if ($filename_no_path eq "" || !-f $filename_full_path); # nothing to do: empty name or not a regular file
11>
12> if (!$self->{'processing_tmp_files'} ) { # tmp files are not recorded as source files
13> $doc_obj->associate_source_file($filename_full_path);
14> }
15>
16> $self->generate_pdf_stuff( $filename_full_path, $filename_no_path, $doc_obj, $section ); # sets the PDF metadata and associated file; its return value is ignored here
17>
18> return 1; # what are we really supposed to return? seems like void, from ImageConverter::generate_images()
19> }
20>
21> sub extract_pdf_text { # Run pdftotext on a PDF and add its output as the text of $cursection; returns 0 if the file is missing, 1 otherwise
22> my $self = shift;
23> my ($filename_full_path, $file, $doc_obj, $cursection) = @_; # $file is not used in this sub
24>
25> # check that the PDF exists!!
26> if (!-f $filename_full_path) {
27> print "PagedImagePlugin: ERROR: File $filename_full_path does not exist, skipping\n";
28> return 0;
29> }
30>
31> # remember that this PDF file was one of our source files, but only
32> # if we are not processing a tmp file
33> if (!$self->{'processing_tmp_files'} ) {
34> $doc_obj->associate_source_file($filename_full_path);
35> }
36>
37> my $cmd = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "pdftotext");
38> # NOTE(review): the command string below is interpolated and run through backticks, so a path containing a double quote, dollar sign or backquote is not passed literally; the result is presumably undef if the command cannot be started - TODO confirm and guard
39> my $text = `$cmd -enc UTF-8 "$filename_full_path" -`;
40> $text =~ s/ '/'/g; # confusion around RTL indicators
41> $text =~ s/[ \t]+/ /g;
42> $text =~ s/\n/ /g; # newlines become spaces, so the text added below is a single line
43>
44> if (!length ($text)) {
45> # It's a bit unusual but not out of the question to have no text, so just give a warning
46> print "PagedImagePlugin: WARNING: $filename_full_path contains no text\n";
47> }
48>
49> # we need to escape the escape character, or else mg will convert into
50> # eg literal newlines, instead of leaving the text as '\n'
51> $text =~ s/\\/\\\\/g; # macro language
52> $text =~ s/_/\\_/g; # macro language
53>
54>
55> if ($text =~ m/<html.*?>\s*<head.*?>.*<\/head>\s*<body.*?>(.*)<\/body>\s*<\/html>\s*$/is) {
56> # looks like HTML input
57> # no need to escape < and > or put in <pre> tags
58>
59> $text = $1;
60>
61> # add only the captured body content to the document object, without <pre> tags
62> $doc_obj->add_utf8_text($cursection, "$text");
63> }
64> else {
65> $text =~ s/</&lt;/g;
66> $text =~ s/>/&gt;/g;
67> # insert preformat tags and add text to document object
68> $doc_obj->add_utf8_text($cursection, "<pre>\n$text\n</pre>");
69> }
70>
71> return 1;
72> }
73>
74400a469
75>
76425a495,536
77> sub generate_pdf_stuff { # Set Source/SourceFile/FileFormat/FileSize and srclink/srcicon metadata on $section and attach the PDF as an associated file; returns 0 if skipped
78> my $self = shift;
79> my ($filename_full_path, $filename_no_path, $doc_obj, $section) = @_;
80>
81> return 0 if ($filename_no_path eq "" || !-f $filename_full_path); # nothing to do: empty name or not a regular file
82>
83> if ($self->{'enable_cache'}) {
84> $self->init_cache_for_file($filename_full_path);
85> }
86>
87> my $verbosity = $self->{'verbosity'};
88> my $outhandle = $self->{'outhandle'};
89>
90> my $filehead = $filename_no_path;
91> $filehead =~ s/\.([^\.]*)$//; # filename with no extension
92> my $assocfilemeta = "[assocfilepath]";
93> if ($section ne $doc_obj->get_top_section()) {
94> $assocfilemeta = "[parent(Top):assocfilepath]";
95> }
96>
97> # The file names may contain percent signs
98> # Encode those percent signs themselves so that urls to the files refer to them correctly
99> my $url_to_filehead = &unicode::filename_to_url($filehead);
100> my $url_to_filename_no_path = &unicode::filename_to_url($filename_no_path);
101>
102> my $type = "application/pdf"; # already a complete MIME type, passed unchanged to associate_file below
103>
104> # here we overwrite the original with the potentially converted one
105> $doc_obj->set_utf8_metadata_element($section, "Source", &unicode::url_decode($filename_no_path)); # display name of the PDF
106> $doc_obj->set_utf8_metadata_element($section, "SourceFile", $url_to_filename_no_path); # URL form of the PDF file name
107>
108> #overwrite the ones added in BasePlugin
109> $doc_obj->set_metadata_element ($section, "FileFormat", 'PagedPDF');
110> $doc_obj->set_metadata_element ($section, "FileSize", (-s $filename_full_path) );
111> # NOTE(review): the links below refer to [Image], [ImageWidth] and [ImageHeight], which this sub does not set - confirm they resolve for PDF sections
112> $doc_obj->add_metadata ($section, "srclink", "<a href=\"_httpprefix_/collect/[collection]/index/assoc/$assocfilemeta/[Image]\">");
113> $doc_obj->add_metadata ($section, "/srclink", "</a>");
114> $doc_obj->add_metadata ($section, "srcicon", "<img src=\"_httpprefix_/collect/[collection]/index/assoc/$assocfilemeta/[Image]\" width=\"[ImageWidth]\" height=\"[ImageHeight]\">");
115>
116> # Add the PDF as an associated file; the MIME type is $type itself (previously "image/$type", i.e. the invalid "image/application/pdf")
117> $doc_obj->associate_file($filename_full_path, $filename_no_path, $type, $section);
118> }
119445c556,563
120< if (defined $imgfile) {
121---
122> if (defined $imgfile && $imgfile =~ /\.pdf$/i) {
123> $self->process_pdf($self->{'xml_file_dir'}.$imgfile, $imgfile, $doc_obj, $self->{'current_section'});
124>
125> if ( ! defined $_{'txtfile'} ) {
126> $self->extract_pdf_text($self->{'xml_file_dir'}.$imgfile, $imgfile, $doc_obj, $self->{'current_section'});
127> }
128> }
129> elsif (defined $imgfile) {
130449a568
131>
132453c572,573
133< } else {
134---
135> }
136> if ( ! $doc_obj->get_text( $self->{'current_section'} ) ) {