1 | #!/usr/bin/perl -w
|
---|
2 |
|
---|
3 |
|
---|
4 | BEGIN {
|
---|
5 | die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
|
---|
6 | unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
|
---|
7 | }
|
---|
8 |
|
---|
9 |
|
---|
10 | use File::Basename;
|
---|
11 | use unicode;
|
---|
12 |
|
---|
13 | my @meta_files = ();
|
---|
14 |
|
---|
15 | &recursive_rename("import");
|
---|
16 |
|
---|
17 | # read in log.Type.txt and log.Category.txt
|
---|
18 | my $classifications = &read_cat_files();
|
---|
19 |
|
---|
20 | # process meta files
|
---|
21 | foreach my $mfile (@meta_files) {
|
---|
22 | &process_meta_file($mfile);
|
---|
23 | }
|
---|
24 |
|
---|
25 | sub recursive_rename {
|
---|
26 | my ($dir) = @_;
|
---|
27 |
|
---|
28 | opendir (DIR, "$dir") || die;
|
---|
29 | my @files = readdir DIR;
|
---|
30 | closedir DIR;
|
---|
31 |
|
---|
32 | foreach $file (@files) {
|
---|
33 | next if $file eq "." || $file eq "..";
|
---|
34 | next if $file =~ /processed.htm$/;
|
---|
35 |
|
---|
36 | my $path = "$dir/$file";
|
---|
37 | if (-d $path) {
|
---|
38 | &recursive_rename($path);
|
---|
39 | } else {
|
---|
40 | $newfile = &rename_file($file);
|
---|
41 | if ($newfile ne $file) {
|
---|
42 | print STDERR "renaming $file --> $newfile\n";
|
---|
43 | `mv "$path" "$dir/$newfile"`;
|
---|
44 | }
|
---|
45 |
|
---|
46 | if ($file =~ /^title\.txt$/) {
|
---|
47 | # we'll process all the title.txt files after we've finished
|
---|
48 | # renaming everything
|
---|
49 | push(@meta_files, "$dir/$newfile");
|
---|
50 | }
|
---|
51 |
|
---|
52 | if ($file =~ /\.html?$/i) {
|
---|
53 | &process_html_file("$dir/$newfile");
|
---|
54 | }
|
---|
55 | }
|
---|
56 | }
|
---|
57 | }
|
---|
58 |
|
---|
59 | # clean up the html (currently just use the non-css version)
|
---|
60 | sub process_html_file {
|
---|
61 | my ($filename) = @_;
|
---|
62 |
|
---|
63 | print STDERR "processing $filename\n";
|
---|
64 |
|
---|
65 | open (FILE, $filename) || die;
|
---|
66 | undef $/;
|
---|
67 | my $file = <FILE>;
|
---|
68 | $/ = "\n";
|
---|
69 | close FILE;
|
---|
70 |
|
---|
71 | my $header = "<html>\n<head></head>\n<body bgcolor=\"#FFFFFF\">\n";
|
---|
72 | my $footer = "</body>\n</html>\n";
|
---|
73 |
|
---|
74 | my ($noncss, $css) = $file =~ /document\.write\(\"(.*?[^\\])\"\).*?document\.write\(\"(.*?[^\\])\"\)/si;
|
---|
75 |
|
---|
76 | # remove backslashes added for javascript strings
|
---|
77 | $noncss =~ s/\\\"/\"/sg;
|
---|
78 | $noncss =~ s/\\(\s)/$1/g;
|
---|
79 | # $noncss =~ s/\\\n/\n/sg;
|
---|
80 |
|
---|
81 | # Occurances of 2HCl (lowercase el) were mistakenly changed to 2HC1
|
---|
82 | # (digit one) during OCR in some cases so we'll change them back
|
---|
83 | $noncss =~ s/2HC1/2HCl/g;
|
---|
84 |
|
---|
85 | # alter <img> tags to support renamed files
|
---|
86 | $noncss =~ s/(<img src=\")([^\"]+)/$1 . &rename_file($2)/sige;
|
---|
87 |
|
---|
88 | # remove rules="all" attribute from table tags
|
---|
89 | $noncss =~ s/(<table.*?)rules=\"all\"\s+/$1/sig;
|
---|
90 |
|
---|
91 | # remove empty <div> tags
|
---|
92 | while ($noncss =~ s/(<div[^>]*>\s*<\/div>)//sig) {}
|
---|
93 |
|
---|
94 | # remove empty table rows
|
---|
95 | $noncss =~ s/(<tr[^>]*>(\s*<td[^>]*>\s*<\/td>)*\s*<\/tr>)//sig;
|
---|
96 | $noncss =~ s/<table[^>]*>\s*<\/table>//sig;
|
---|
97 |
|
---|
98 | open (FILE, ">$filename") || die;
|
---|
99 | print FILE $header . $noncss . $footer;
|
---|
100 | close FILE;
|
---|
101 |
|
---|
102 | }
|
---|
103 |
|
---|
104 | sub rename_file {
|
---|
105 | my ($filename) = @_;
|
---|
106 |
|
---|
107 | $filename =~ s/\s+//g;
|
---|
108 | $filename =~ s/^protemix\(\d+\)-?//;
|
---|
109 | $filename =~ s/^OCR\d+-0//;
|
---|
110 |
|
---|
111 | $filename =~ s/^(\d+)-\d+\.pdf$/$1-all.pdf/;
|
---|
112 |
|
---|
113 | return $filename;
|
---|
114 | }
|
---|
115 |
|
---|
116 | # process a title.txt file and replace it with a meta.xml file
|
---|
117 | sub process_meta_file {
|
---|
118 | my ($filename) = @_;
|
---|
119 |
|
---|
120 | open (FILE, $filename) || die ("couldn't open $filename");
|
---|
121 | undef $/;
|
---|
122 | my $title = <FILE>;
|
---|
123 | $/ = "\n";
|
---|
124 | close FILE;
|
---|
125 |
|
---|
126 | unlink($filename);
|
---|
127 |
|
---|
128 | $title =~ s/\s+/ /gs;
|
---|
129 | $title =~ s/^\s+//;
|
---|
130 | $title =~ s/\s+$//;
|
---|
131 | $title =~ s/\"/"/g;
|
---|
132 | $title =~ s/</</g;
|
---|
133 | $title =~ s/>/>/g;
|
---|
134 | $title =~ s/&/&/g;
|
---|
135 | $title = &unicode::ascii2utf8(\$title); # assumes title is iso-8859-1
|
---|
136 | my $metafile = "<Metafile>\n";
|
---|
137 | $metafile .= " <Metadata name=\"Title\">$title</Metadata>\n";
|
---|
138 |
|
---|
139 | my $dir = File::Basename::dirname($filename);
|
---|
140 | my ($subdirs) = $dir =~ /import\/(.*)$/;
|
---|
141 |
|
---|
142 | opendir(DIR, $dir) || die;
|
---|
143 | my @files = readdir DIR;
|
---|
144 | foreach my $file (@files) {
|
---|
145 | if ($file =~ /\.html?$/i) {
|
---|
146 | my $fstub = $file;
|
---|
147 | $fstub =~ s/\.html?$//i;
|
---|
148 | $metafile .= " <Page filename=\"$fstub\">\n";
|
---|
149 | if (defined ($classifications->{'type'}->{"$subdirs/$file"})) {
|
---|
150 | $metafile .= " <Metadata name=\"Class1\">" .
|
---|
151 | $classifications->{'type'}->{"$subdirs/$file"} . "</Metadata>\n";
|
---|
152 | } else {
|
---|
153 | print STDERR "$subdirs/$file has no type metadata\n";
|
---|
154 | }
|
---|
155 | if (defined ($classifications->{'category'}->{"$subdirs/$file"})) {
|
---|
156 | $metafile .= " <Metadata name=\"Class2\">" .
|
---|
157 | $classifications->{'category'}->{"$subdirs/$file"} . "</Metadata>\n";
|
---|
158 | } else {
|
---|
159 | print STDERR "$subdirs/$file has no category metadata\n";
|
---|
160 | }
|
---|
161 | $metafile .= " </Page>\n";
|
---|
162 | }
|
---|
163 | }
|
---|
164 | $metafile .= "</Metafile>\n";
|
---|
165 |
|
---|
166 | $filename = "$dir/meta.xml";
|
---|
167 | open (FILE, ">$filename") || die;
|
---|
168 | print FILE $metafile;
|
---|
169 | close FILE;
|
---|
170 | }
|
---|
171 |
|
---|
172 | sub read_cat_files {
|
---|
173 |
|
---|
174 | my $class = {'type' => {}, 'category' => {}};
|
---|
175 |
|
---|
176 | open (TYPE, "import/log.Type.txt") || die;
|
---|
177 | my $line = "";
|
---|
178 | while (defined ($line = <TYPE>)) {
|
---|
179 | my ($filename, $cat) = split(/ /, $line);
|
---|
180 | chomp $cat;
|
---|
181 | $filename =~ s/^.*?Protemix\(ii\)/OCR2/;
|
---|
182 | $filename =~ s/^.*?Protemix\(i\)/OCR1/;
|
---|
183 | $class->{'type'}->{$filename} = $cat;
|
---|
184 | }
|
---|
185 | close TYPE;
|
---|
186 |
|
---|
187 | open (CAT, "import/log.Category.txt") || die;
|
---|
188 | $line = "";
|
---|
189 | while (defined ($line = <CAT>)) {
|
---|
190 | my ($filename, $cat) = split(/ /, $line);
|
---|
191 | chomp $cat;
|
---|
192 | $filename =~ s/^.*?Protemix\(ii\)/OCR2/;
|
---|
193 | $filename =~ s/^.*?Protemix\(i\)/OCR1/;
|
---|
194 | $class->{'category'}->{$filename} = $cat;
|
---|
195 | }
|
---|
196 | close CAT;
|
---|
197 |
|
---|
198 | return $class;
|
---|
199 | }
|
---|