Changeset 32280
- Timestamp: 2018-07-17T20:40:57+12:00 (6 years ago)
- Location: main/trunk/greenstone2/perllib
- Files: 4 edited
Legend:
- Unmodified
- Added
- Removed
main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm
r32277 r32280 170 170 $self->{'convert_to_plugin'} = "StructuredHTMLPlugin"; 171 171 $self->{'convert_to_ext'} = "html"; 172 } elsif ($convert_to =~ /^pagedimg/ ) {172 } elsif ($convert_to =~ /^pagedimg/ || $convert_to eq "paged_text") { 173 173 $self->{'convert_to_plugin'} = "PagedImagePlugin"; 174 my ($convert_to_ext) = $convert_to =~ /pagedimg\_(jpg|gif|png)/i; 175 $convert_to_ext = 'jpg' unless defined $convert_to_ext; 176 $self->{'convert_to_ext'} = $convert_to_ext; 174 if($convert_to eq "paged_text") { 175 $self->{'convert_to_ext'} = "txt"; 176 } else { 177 my ($convert_to_ext) = $convert_to =~ /pagedimg(?:txt)?\_(jpg|gif|png)/i; # the ?: prefix avoids capturing or else discards the optional 'txt' in 'pagedimgtxt', 178 # so that we can consider the actual portion we want to capture: the img type 179 $convert_to_ext = 'jpg' unless defined $convert_to_ext; 180 $self->{'convert_to_ext'} = $convert_to_ext; 181 } 177 182 } 178 183 } -
main/trunk/greenstone2/perllib/plugins/PDFv2Plugin.pm
r32277 r32280 174 174 # TODO 175 175 print STDERR "@@@ Conversion to " . $self->{'convert_to'} , " with Xpdf Tools is not yet implemented.\n"; 176 print STDERR "@@@ Converting to atext instead.\n";177 $self->{'convert_to'} = "text";176 #print STDERR "@@@ Converting to text instead.\n"; 177 #$self->{'convert_to'} = "text"; 178 178 } 179 179 -
main/trunk/greenstone2/perllib/plugins/PrintInfo.pm
r25957 r32280 175 175 } 176 176 $self->print_xml($high_level_information_only); 177 # print STDERR "\n@@@ BLA\n\n"; 177 178 } 178 179 -
main/trunk/greenstone2/perllib/util.pm
r32193 r32280 1729 1729 sub page_number { 1730 1730 my ($dir) = @_; 1731 my ($pagenum) =($dir =~ m/^.*?[-\.]?(\d+)(\.(jpg|gif|png ))?$/i);1731 my ($pagenum) =($dir =~ m/^.*?[-\.]?(\d+)(\.(jpg|gif|png|txt))?$/i); 1732 1732 # my ($pagenum) =($dir =~ m/(\d+)(\.(jpg|gif|png))?$/i); # this works but is not as safe/strict about input filepatterns as the above 1733 1733 … … 1763 1763 my $hasTxtFile = &FileUtils::fileExists($txtfilename); 1764 1764 1765 foreach my $file (@dir_files){ 1766 if ($file !~ /\.item/i && $file !~ /\.txt/i){ 1765 # Write out the elements of the item file. 1766 # We could be dealing with 3 types of conversion output formats: txt only (paged_text), 1767 # images only (pagedimg_) and images AND text (pagedimgtxt_). 1768 foreach my $file (@dir_files) { 1769 if ($file !~ /\.item/i) { 1767 1770 $page_num = page_number($file); 1768 1771 $page_num++ if $starts_at_0; # image numbers start at 0, so add 1 1769 if($hasTxtFile) { 1770 print $item_fh " <Page pagenum=\"$page_num\" imgfile=\"$file\" txtfile=\"$page_num.txt\"/>\n"; 1771 } else { 1772 print $item_fh " <Page pagenum=\"$page_num\" imgfile=\"$file\" txtfile=\"\"/>\n"; 1772 1773 if ($convert_to eq "txt") { # output format is paged_text, which has no images 1774 if ($file =~ m/\.txt/i) { # check only txt files (should be all there is, besides the skipped .item file) 1775 print $item_fh " <Page pagenum=\"$page_num\" imgfile=\"\" txtfile=\"$page_num.txt\"/>\n"; 1776 } # else, some non-txt file ext, skip 1773 1777 } 1774 } 1775 } 1778 else { # either pagedimg or pagedimgtxt output mode 1779 if($file !~ /\.txt/i) { # check only img files, skip any matching txt files 1780 if($hasTxtFile) { # if every image has a matching txt file, output txtfile too 1781 print $item_fh " <Page pagenum=\"$page_num\" imgfile=\"$file\" txtfile=\"$page_num.txt\"/>\n"; 1782 } else { # when its pagedimg only, txtfile is empty 1783 print $item_fh " <Page pagenum=\"$page_num\" imgfile=\"$file\" txtfile=\"\"/>\n"; 1784 } 1785 } 1786 
} 1787 } 1788 } 1789 1776 1790 1777 1791 print $item_fh "</PagedDocument>\n";
Note: See TracChangeset for help on using the changeset viewer.