Context Navigation

Back to Ticket #735

Ticket #735: PagedImagePlugin.pm.patch

File PagedImagePlugin.pm.patch, 5.2 KB (added by kjdon, 13 years ago)
pagedimageplugin patch

Line
1	255c255
2	< return q^(?i)(\.jpe?g|\.gif|\.png|\.tif?f|\.te?xt|\.html?|~)$^
3	---
4	> return q^(?i)(\.jpe?g|\.gif|\.png|\.tif?f|\.te?xt|\.html?|\.pdf|~)$^
5	397a398,465
6	> sub process_pdf {
7	> # Attach one PDF to $section of $doc_obj; returns 0 if the file is missing, else 1.
8	> my ($self, $filename_full_path, $filename_no_path, $doc_obj, $section) = @_;
9	>
10	> return 0 if ($filename_no_path eq "" || !-f $filename_full_path);
11	> # only record the PDF as a source file when we are not processing a tmp file
12	> if (!$self->{'processing_tmp_files'} ) {
13	> $doc_obj->associate_source_file($filename_full_path);
14	> }
15	> # metadata and the associated file itself are handled by generate_pdf_stuff()
16	> $self->generate_pdf_stuff( $filename_full_path, $filename_no_path, $doc_obj, $section );
17	>
18	> return 1; # what are we really supposed to return? seems like void, from ImageConverter::generate_images()
19	> }
20	>
21	> sub extract_pdf_text {
22	> # Run pdftotext on a PDF and add the extracted text to $cursection of $doc_obj.
23	> my ($self, $filename_full_path, $file, $doc_obj, $cursection) = @_;
24	> # Returns 0 when the file does not exist, 1 otherwise; $file is not used here.
25	> # check that the PDF exists!!
26	> if (!-f $filename_full_path) {
27	> print "PagedImagePlugin: ERROR: File $filename_full_path does not exist, skipping\n";
28	> return 0;
29	> }
30	>
31	> # remember that this PDF was one of our source files, but only
32	> # if we are not processing a tmp file
33	> if (!$self->{'processing_tmp_files'} ) {
34	> $doc_obj->associate_source_file($filename_full_path);
35	> }
36	>
37	> my $cmd = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "pdftotext");
38	> # NOTE(review): the file name is interpolated into a shell command line; names containing " or $ would break it
39	> my $text = `$cmd -enc UTF-8 "$filename_full_path" -`;
40	> $text =~ s/ '/'/g; # confusion around RTL indicators
41	> $text =~ s/[ \t]+/ /g;
42	> $text =~ s/\n/ /g;
43	>
44	> if (!length ($text)) {
45	> # It's a bit unusual but not out of the question to have no text, so just give a warning
46	> print "PagedImagePlugin: WARNING: $filename_full_path contains no text\n";
47	> }
48	>
49	> # we need to escape the escape character, or else mg will convert into
50	> # eg literal newlines, instead of leaving the text as '\n'
51	> $text =~ s/\\/\\\\/g; # macro language
52	> $text =~ s/_/\\_/g; # macro language
53	>
54	> # if the text is a complete HTML page, keep only what is inside <body>
55	> if ($text =~ m/<html.*?>\s*<head.*?>.*<\/head>\s*<body.*?>(.*)<\/body>\s*<\/html>\s*$/is) {
56	> # looks like HTML input
57	> # no need to escape < and > or put in <pre> tags
58	>
59	> $text = $1;
60	>
61	> # add the body text to the document object as it is
62	> $doc_obj->add_utf8_text($cursection, "$text");
63	> }
64	> else {
65	> $text =~ s/</&lt;/g;
66	> $text =~ s/>/&gt;/g;
67	> # insert preformat tags and add text to document object
68	> $doc_obj->add_utf8_text($cursection, "<pre>\n$text\n</pre>");
69	> }
70	>
71	> return 1;
72	> }
73	>
74	400a469
75	>
76	425a495,536
77	> sub generate_pdf_stuff {
78	> # Set the Source, SourceFile, FileFormat, FileSize, srclink and srcicon metadata
79	> # for a PDF on $section of $doc_obj, then attach the PDF as an associated file.
80	> # Returns 0 when the file name is empty or the file does not exist.
81	> my ($self, $filename_full_path, $filename_no_path, $doc_obj, $section) = @_;
82	>
83	> return 0 if ($filename_no_path eq "" || !-f $filename_full_path);
84	>
85	> if ($self->{'enable_cache'}) {
86	> $self->init_cache_for_file($filename_full_path);
87	> }
88	>
89	> my $filehead = $filename_no_path;
90	> $filehead =~ s/\.([^\.]*)$//; # filename with no extension
91	> my $assocfilemeta = "[assocfilepath]";
92	> if ($section ne $doc_obj->get_top_section()) {
93	> $assocfilemeta = "[parent(Top):assocfilepath]";
94	> }
95	>
96	> # The images that will get generated may contain percent signs in their src filenames
97	> # Encode those percent signs themselves so that urls to the imgs refer to them correctly
98	> my $url_to_filehead = &unicode::filename_to_url($filehead);
99	> my $url_to_filename_no_path = &unicode::filename_to_url($filename_no_path);
100	>
101	> my $type = "application/pdf";
102	>
103	> # here we overwrite the original with the potentially converted one
104	> $doc_obj->set_utf8_metadata_element($section, "Source", &unicode::url_decode($filename_no_path)); # displayname of generated image
105	> $doc_obj->set_utf8_metadata_element($section, "SourceFile", $url_to_filename_no_path); # displayname of generated image
106	>
107	> #overwrite the ones added in BasePlugin
108	> $doc_obj->set_metadata_element ($section, "FileFormat", 'PagedPDF');
109	> $doc_obj->set_metadata_element ($section, "FileSize", (-s $filename_full_path) );
110	>
111	> # NOTE(review): [Image], [ImageWidth] and [ImageHeight] are not set in this sub - confirm they exist for PDF sections
112	> $doc_obj->add_metadata ($section, "srclink", "<a href=\"_httpprefix_/collect/[collection]/index/assoc/$assocfilemeta/[Image]\">");
113	> $doc_obj->add_metadata ($section, "/srclink", "</a>");
114	> $doc_obj->add_metadata ($section, "srcicon", "<img src=\"_httpprefix_/collect/[collection]/index/assoc/$assocfilemeta/[Image]\" width=\"[ImageWidth]\" height=\"[ImageHeight]\">");
115	>
116	> # Add the PDF as an associated file; $type is already a complete MIME type
117	> $doc_obj->associate_file($filename_full_path, $filename_no_path, $type, $section);
118	> }
119	445c556,563
120	< if (defined $imgfile) {
121	---
122	> if (defined $imgfile && $imgfile =~ /\.pdf$/i) {
123	> $self->process_pdf($self->{'xml_file_dir'}.$imgfile, $imgfile, $doc_obj, $self->{'current_section'});
124	>
125	> if ( ! defined $_{'txtfile'} ) {
126	> $self->extract_pdf_text($self->{'xml_file_dir'}.$imgfile, $imgfile, $doc_obj, $self->{'current_section'});
127	> }
128	> }
129	> elsif (defined $imgfile) {
130	449a568
131	>
132	453c572,573
133	< } else {
134	---
135	> }
136	> if ( ! $doc_obj->get_text( $self->{'current_section'} ) ) {

Download in other formats:

Original Format