- Timestamp:
- 2018-06-05T21:11:04+12:00 (6 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/util.pm
r32096  r32193
1716    1716
1717    1717      # Used by pdfpstoimg.pl and PDFBoxConverter to create a .item file from
1718            - # a directory containing sequentially numbered images.
        1718    + # a directory containing sequentially numbered images (and optional matching sequentially numbered .txt files).
1719    1719      sub create_itemfile
1720    1720      {
…       …
1752    1752          print $item_fh "<PagedDocument>\n";
1753    1753
        1754    +     # In the past, sub create_itemfile() never output txtfile names into the item file (they were left as empty strings),
        1755    +     # only image file names. Now that PDFBox is being customised for GS with the new GS_PDFToImagesAndText.java class to
        1756    +     # create images of each PDF page and extract text for that page if extractable, we can have matching txt files for
        1757    +     # each img file. So now we can output txt file names if we're working with txt files.
        1758    +     # We just test if a text file exists in the same dir that matches the name of the first image file
        1759    +     # if a matching txt file does not exist, don't output txtfile names into the item file
        1760    +
        1761    +     my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($firstfile, "\\.[^\\.]+\$"); # relative filenames so no dirname
        1762    +     my $txtfilename = &FileUtils::filenameConcatenate($output_dir, $tailname . ".txt");
        1763    +     my $hasTxtFile = &FileUtils::fileExists($txtfilename);
        1764    +
1754    1765          foreach my $file (@dir_files){
1755            -         if ($file !~ /\.item/i){
        1766    +         if ($file !~ /\.item/i && $file !~ /\.txt/i){
1756    1767                  $page_num = page_number($file);
1757    1768                  $page_num++ if $starts_at_0; # image numbers start at 0, so add 1
1758            -             print $item_fh " <Page pagenum=\"$page_num\" imgfile=\"$file\" txtfile=\"\"/>\n";
        1769    +             if($hasTxtFile) {
        1770    +                 print $item_fh " <Page pagenum=\"$page_num\" imgfile=\"$file\" txtfile=\"$page_num.txt\"/>\n";
        1771    +             } else {
        1772    +                 print $item_fh " <Page pagenum=\"$page_num\" imgfile=\"$file\" txtfile=\"\"/>\n";
        1773    +             }
1759    1774              }
1760    1775          }
Note:
See TracChangeset
for help on using the changeset viewer.