1 | 255c255
|
---|
2 | < return q^(?i)(\.jpe?g|\.gif|\.png|\.tif?f|\.te?xt|\.html?|~)$^
|
---|
3 | ---
|
---|
4 | > return q^(?i)(\.jpe?g|\.gif|\.png|\.tif?f|\.te?xt|\.html?|\.pdf|~)$^
|
---|
5 | 397a398,465
|
---|
# Handle one PDF file belonging to a section of the document being built.
#
# Arguments (after the invocant):
#   1. full path of the PDF on disk
#   2. the PDF's file name without its directory
#   3. the document object
#   4. the section of the document the PDF is attached to
#
# Returns 0 without doing anything when the file name is empty or the path
# is not a plain file. Otherwise records the PDF as a source file (unless
# the plugin's 'processing_tmp_files' flag is set), hands the same four
# arguments to generate_pdf_stuff(), and returns 1.
sub process_pdf {
    my ($self, $pdf_path, $pdf_name, $doc_obj, $section) = @_;

    # Guard clauses: nothing to do without a name and a real file.
    if ($pdf_name eq "") {
        return 0;
    }
    unless (-f $pdf_path) {
        return 0;
    }

    $doc_obj->associate_source_file($pdf_path)
        unless $self->{'processing_tmp_files'};

    # The result of generate_pdf_stuff() is deliberately not inspected here.
    $self->generate_pdf_stuff($pdf_path, $pdf_name, $doc_obj, $section);

    return 1;
}
|
---|
20 | >
|
---|
# Extract the text of a PDF with pdftotext and add it to a document section.
#
# Arguments (after the invocant):
#   $filename_full_path - path of the PDF on disk
#   $file               - the PDF's file name (not used by this routine)
#   $doc_obj            - document object that receives the text
#   $cursection         - section of $doc_obj the text is added to
#
# Returns 0 (after printing an error) when the PDF is not a plain file,
# 1 otherwise. A PDF that yields no text only produces a warning.
sub extract_pdf_text {
    my $self = shift;
    my ($filename_full_path, $file, $doc_obj, $cursection) = @_;

    # check that the PDF exists!!
    if (!-f $filename_full_path) {
        print "PagedImagePlugin: ERROR: File $filename_full_path does not exist, skipping\n";
        return 0;
    }

    # remember that this PDF was one of our source files, but only
    # if we are not processing a tmp file
    if (!$self->{'processing_tmp_files'}) {
        $doc_obj->associate_source_file($filename_full_path);
    }

    my $cmd = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "pdftotext");

    # NOTE(review): the file name is interpolated into a shell command line;
    # a name containing '"', '$' or a backtick would be interpreted by the
    # shell. Left as-is to keep behaviour on all platforms - consider a
    # list-form pipe open if file names can be untrusted.
    my $text = `$cmd -enc UTF-8 "$filename_full_path" -`;

    # Backticks yield undef when the command could not be started; treat that
    # like "no text" instead of running substitutions on undef.
    $text = "" unless defined $text;

    $text =~ s/ '/'/g; # confusion around RTL indicators
    $text =~ s/[ \t]+/ /g;
    $text =~ s/\n/ /g;

    if (!length($text)) {
        # It's a bit unusual but not out of the question to have no text, so just give a warning
        print "PagedImagePlugin: WARNING: $filename_full_path contains no text\n";
    }

    # we need to escape the escape character, or else mg will convert into
    # eg literal newlines, instead of leaving the text as '\n'
    $text =~ s/\\/\\\\/g; # macro language
    $text =~ s/_/\\_/g; # macro language

    if ($text =~ m/<html.*?>\s*<head.*?>.*<\/head>\s*<body.*?>(.*)<\/body>\s*<\/html>\s*$/is) {
        # looks like HTML input
        # no need to escape < and > or put in <pre> tags
        $text = $1;

        # add the body content to the document object as-is
        $doc_obj->add_utf8_text($cursection, $text);
    }
    else {
        # Plain text: escape the angle brackets so they are shown literally
        # rather than being read as markup inside the <pre> wrapper.
        $text =~ s/</&lt;/g;
        $text =~ s/>/&gt;/g;

        # insert preformat tags and add text to document object
        $doc_obj->add_utf8_text($cursection, "<pre>\n$text\n</pre>");
    }

    return 1;
}
|
---|
73 | >
|
---|
74 | 400a469
|
---|
75 | >
|
---|
76 | 425a495,536
|
---|
# Attach a PDF to a document section and set the metadata that describes it.
#
# Arguments (after the invocant):
#   $filename_full_path - path of the PDF on disk
#   $filename_no_path   - the PDF's file name without its directory
#   $doc_obj            - document object being built
#   $section            - section of $doc_obj the PDF is attached to
#
# Returns 0 when the file name is empty or the path is not a plain file;
# otherwise returns whatever $doc_obj->associate_file() returns.
sub generate_pdf_stuff {
    my $self = shift;
    my ($filename_full_path, $filename_no_path, $doc_obj, $section) = @_;

    return 0 if ($filename_no_path eq "" || !-f $filename_full_path);

    if ($self->{'enable_cache'}) {
        $self->init_cache_for_file($filename_full_path);
    }

    # Sections below the top one refer to the top section's assocfilepath.
    my $assocfilemeta = "[assocfilepath]";
    if ($section ne $doc_obj->get_top_section()) {
        $assocfilemeta = "[parent(Top):assocfilepath]";
    }

    # The file name may contain percent signs; encode it so that URLs built
    # from the SourceFile metadata refer to the file correctly.
    my $url_to_filename_no_path = &unicode::filename_to_url($filename_no_path);

    my $mime_type = "application/pdf";

    # here we overwrite the original with the PDF's own name
    $doc_obj->set_utf8_metadata_element($section, "Source", &unicode::url_decode($filename_no_path)); # display name of the PDF
    $doc_obj->set_utf8_metadata_element($section, "SourceFile", $url_to_filename_no_path); # URL-encoded name of the PDF

    # overwrite the ones added in BasePlugin
    $doc_obj->set_metadata_element($section, "FileFormat", 'PagedPDF');
    $doc_obj->set_metadata_element($section, "FileSize", (-s $filename_full_path));

    # NOTE(review): these templates refer to [Image], [ImageWidth] and
    # [ImageHeight], none of which are set in this routine - confirm they are
    # provided elsewhere for PDF sections.
    $doc_obj->add_metadata($section, "srclink", "<a href=\"_httpprefix_/collect/[collection]/index/assoc/$assocfilemeta/[Image]\">");
    $doc_obj->add_metadata($section, "/srclink", "</a>");
    $doc_obj->add_metadata($section, "srcicon", "<img src=\"_httpprefix_/collect/[collection]/index/assoc/$assocfilemeta/[Image]\" width=\"[ImageWidth]\" height=\"[ImageHeight]\">");

    # Add the PDF as an associated file. The MIME type is passed as-is:
    # prefixing it with "image/" would yield the invalid
    # "image/application/pdf".
    return $doc_obj->associate_file($filename_full_path, $filename_no_path, $mime_type, $section);
}
|
---|
119 | 445c556,563
|
---|
120 | < if (defined $imgfile) {
|
---|
121 | ---
|
---|
122 | > if (defined $imgfile && $imgfile =~ /\.pdf$/i) {
|
---|
123 | > $self->process_pdf($self->{'xml_file_dir'}.$imgfile, $imgfile, $doc_obj, $self->{'current_section'});
|
---|
124 | >
|
---|
125 | > if ( ! defined $_{'txtfile'} ) {
|
---|
126 | > $self->extract_pdf_text($self->{'xml_file_dir'}.$imgfile, $imgfile, $doc_obj, $self->{'current_section'});
|
---|
127 | > }
|
---|
128 | > }
|
---|
129 | > elsif (defined $imgfile) {
|
---|
130 | 449a568
|
---|
131 | >
|
---|
132 | 453c572,573
|
---|
133 | < } else {
|
---|
134 | ---
|
---|
135 | > }
|
---|
136 | > if ( ! $doc_obj->get_text( $self->{'current_section'} ) ) {
|
---|