1 | #!/usr/bin/perl -w
|
---|
2 |
|
---|
###########################################################################
#
# gs2txt.pl -- convert various documents to TEXT
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

# Convert Microsoft Word, Adobe PDF, and Adobe PostScript documents to
# text using the appropriate specialist conversion utility.
|
---|
30 |
|
---|
31 |
|
---|
BEGIN {
    # The module search path is extended with $GSDLHOME/perllib before the
    # "use" lines below are compiled, so a missing GSDLHOME is fatal here.
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }

    unshift(@INC, "$ENV{'GSDLHOME'}/perllib");
}
|
---|
37 |
|
---|
38 | use parsargv;
|
---|
39 | use util;
|
---|
40 | use Cwd;
|
---|
41 | use File::Basename;
|
---|
42 |
|
---|
43 |
|
---|
# print_usage() -- write the one-line usage summary to STDERR and terminate
# the script with exit status 1.  Never returns.
sub print_usage
{
    my $usage = "Usage: $0 [-t doc|pdf|ps] filename\n";

    print STDERR $usage;
    exit(1);
}
|
---|
50 |
|
---|
51 |
|
---|
# main(ARGS) -- convert a single document to text.
#
# ARGS is the command line (the script passes @ARGV).  It is handed to
# parsargv::parse with the option specs "type" (doc|pdf|ps) and "verbose"
# (digits, default 0).  Exactly one further argument, the input filename,
# must remain afterwards, otherwise the usage message is shown (presumably
# parse strips the options it recognises -- confirm in parsargv).
#
# When no type is given it is taken from the file's last extension.  The
# converter is run from inside the input file's directory and writes
# "<name>.txt" there.  Every failure is reported on STDERR and ends the
# script with exit status 1.
sub main
{
    # A plain lexical name: shadowing the global @ARGV with "my" was legal
    # but easy to misread.
    my (@args) = @_;

    my ($input_type, $verbose);

    if (!parsargv::parse(\@args,
                         'type/(doc|pdf|ps)/', \$input_type,
                         'verbose/\d+/0', \$verbose))
    {
        print_usage();
    }

    if (scalar(@args) != 1)
    {
        print_usage();
    }

    my $input_filename = $args[0];
    if (!-r $input_filename)
    {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Only the last ".ext" counts as the suffix, so "my.report.pdf" is
    # recognised as pdf and converted to "my.report.txt".
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, '\.[^.]+');

    # No explicit type: fall back to the extension without its leading dot.
    # An absent extension leaves the type undefined, which is reported below.
    if (!defined $input_type || $input_type eq "")
    {
        $input_type = ($suffix ne "") ? lc(substr($suffix, 1)) : undef;
    }

    # Work inside the document's own directory.
    my $stored_dir = cwd();
    chdir ($dirname) || die "Unable to change to directory $dirname";

    # From here on the file must be addressed relative to the new working
    # directory; reusing the caller's relative path would point at the
    # wrong place (e.g. docs/docs/foo.pdf).
    my $local_input  = "$tailname$suffix";
    my $local_output = "$tailname.txt";

    # Select convert utility.  A one-element @cmd is run through the shell
    # (needed for redirection and pipes); a longer list is run directly.
    my @cmd = ();
    if (!defined $input_type)
    {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type eq "doc")
    {
        my $wv_cfgfile = util::filename_cat($ENV{'GSDLHOME'}, "etc", "wvtext.xml");

        # "2>&1 > file" sends wvHtml's diagnostics down the pipe while the
        # converted text goes to the file.  The leading echo presumably
        # exists so the fgrep -v filters always pass at least one line and
        # therefore do not exit non-zero on otherwise empty input.
        my $pipeline = "( echo Processing ; wvHtml --config "
            . _shell_dquote($wv_cfgfile) . " " . _shell_dquote($local_input)
            . " 2>&1 > " . _shell_dquote($local_output) . " )";
        $pipeline .= " | fgrep -v wvWarning" if ($verbose<3);
        $pipeline .= " | fgrep -v wvError" if ($verbose<5);
        @cmd = ($pipeline);
    }
    elsif ($input_type eq "pdf")
    {
        # No shell involved; the output name is passed explicitly so it does
        # not depend on pdftotext's own renaming rule.
        @cmd = ("pdftotext", $local_input, $local_output);
    }
    elsif ($input_type eq "ps")
    {
        @cmd = ("ps2ascii " . _shell_dquote($local_input)
                . " > " . _shell_dquote($local_output));
    }
    else
    {
        print STDERR "Error: Unable to convert to type '$input_type'\n";
        exit(1);
    }

    # system() yields -1 when the program could not be started at all, so
    # test for "not zero" rather than "greater than zero".  $! is only
    # meaningful in the -1 case.
    my $status = system(@cmd);
    if ($status != 0)
    {
        my $reason;
        if ($status == -1)
        {
            $reason = $!;
        }
        elsif ($status & 127)
        {
            $reason = "killed by signal " . ($status & 127);
        }
        else
        {
            $reason = "exit status " . ($status >> 8);
        }
        print STDERR "Error: failed to execute @cmd: $reason\n";
        exit(1);
    }

    # restore to original working directory
    chdir ($stored_dir) || die "Unable to return to directory $stored_dir";

}

# _shell_dquote(TEXT) -- return TEXT wrapped in double quotes for a POSIX
# shell command line, with the characters that stay special inside double
# quotes (backslash, double quote, dollar, backtick) backslash-escaped.
sub _shell_dquote
{
    my ($text) = @_;

    $text =~ s/([\\"\$`])/\\$1/g;
    return "\"$text\"";
}
|
---|
129 |
|
---|
130 |
|
---|
131 |
|
---|
# Script entry point: pass the command line to main().
main(@ARGV);
|
---|