[3162] | 1 | #!/usr/bin/perl -w
|
---|
| 2 |
|
---|
| 3 |
|
---|
| 4 | BEGIN {
|
---|
| 5 | die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
|
---|
| 6 | unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
|
---|
| 7 | }
|
---|
| 8 |
|
---|
| 9 |
|
---|
| 10 | use File::Basename;
|
---|
| 11 | use unicode;
|
---|
| 12 |
|
---|
| 13 | my @meta_files = ();
|
---|
| 14 |
|
---|
| 15 | &recursive_rename("import");
|
---|
[3207] | 16 |
|
---|
| 17 | # read in log.Type.txt and log.Category.txt
|
---|
| 18 | my $classifications = &read_cat_files();
|
---|
| 19 |
|
---|
[3162] | 20 | # process meta files
|
---|
| 21 | foreach my $mfile (@meta_files) {
|
---|
| 22 | &process_meta_file($mfile);
|
---|
| 23 | }
|
---|
| 24 |
|
---|
| 25 | sub recursive_rename {
|
---|
| 26 | my ($dir) = @_;
|
---|
| 27 |
|
---|
| 28 | opendir (DIR, "$dir") || die;
|
---|
| 29 | my @files = readdir DIR;
|
---|
| 30 | closedir DIR;
|
---|
| 31 |
|
---|
| 32 | foreach $file (@files) {
|
---|
| 33 | next if $file eq "." || $file eq "..";
|
---|
| 34 | next if $file =~ /processed.htm$/;
|
---|
| 35 |
|
---|
| 36 | my $path = "$dir/$file";
|
---|
| 37 | if (-d $path) {
|
---|
| 38 | &recursive_rename($path);
|
---|
| 39 | } else {
|
---|
| 40 | $newfile = &rename_file($file);
|
---|
| 41 | if ($newfile ne $file) {
|
---|
| 42 | print STDERR "renaming $file --> $newfile\n";
|
---|
| 43 | `mv "$path" "$dir/$newfile"`;
|
---|
| 44 | }
|
---|
| 45 |
|
---|
| 46 | if ($file =~ /^title\.txt$/) {
|
---|
| 47 | # we'll process all the title.txt files after we've finished
|
---|
| 48 | # renaming everything
|
---|
| 49 | push(@meta_files, "$dir/$newfile");
|
---|
| 50 | }
|
---|
| 51 |
|
---|
| 52 | if ($file =~ /\.html?$/i) {
|
---|
| 53 | &process_html_file("$dir/$newfile");
|
---|
| 54 | }
|
---|
| 55 | }
|
---|
| 56 | }
|
---|
| 57 | }
|
---|
| 58 |
|
---|
| 59 | # clean up the html (currently just use the non-css version)
|
---|
| 60 | sub process_html_file {
|
---|
| 61 | my ($filename) = @_;
|
---|
| 62 |
|
---|
| 63 | print STDERR "processing $filename\n";
|
---|
| 64 |
|
---|
| 65 | open (FILE, $filename) || die;
|
---|
| 66 | undef $/;
|
---|
| 67 | my $file = <FILE>;
|
---|
| 68 | $/ = "\n";
|
---|
| 69 | close FILE;
|
---|
| 70 |
|
---|
| 71 | my $header = "<html>\n<head></head>\n<body bgcolor=\"#FFFFFF\">\n";
|
---|
| 72 | my $footer = "</body>\n</html>\n";
|
---|
| 73 |
|
---|
| 74 | my ($noncss, $css) = $file =~ /document\.write\(\"(.*?[^\\])\"\).*?document\.write\(\"(.*?[^\\])\"\)/si;
|
---|
| 75 |
|
---|
| 76 | # remove backslashes added for javascript strings
|
---|
| 77 | $noncss =~ s/\\\"/\"/sg;
|
---|
[3208] | 78 | $noncss =~ s/\\(\s)/$1/g;
|
---|
| 79 | # $noncss =~ s/\\\n/\n/sg;
|
---|
[3162] | 80 |
|
---|
[3182] | 81 | # Occurances of 2HCl (lowercase el) were mistakenly changed to 2HC1
|
---|
| 82 | # (digit one) during OCR in some cases so we'll change them back
|
---|
| 83 | $noncss =~ s/2HC1/2HCl/g;
|
---|
| 84 |
|
---|
[3162] | 85 | # alter <img> tags to support renamed files
|
---|
| 86 | $noncss =~ s/(<img src=\")([^\"]+)/$1 . &rename_file($2)/sige;
|
---|
| 87 |
|
---|
| 88 | # remove rules="all" attribute from table tags
|
---|
| 89 | $noncss =~ s/(<table.*?)rules=\"all\"\s+/$1/sig;
|
---|
| 90 |
|
---|
[3168] | 91 | # remove empty <div> tags
|
---|
| 92 | while ($noncss =~ s/(<div[^>]*>\s*<\/div>)//sig) {}
|
---|
| 93 |
|
---|
| 94 | # remove empty table rows
|
---|
| 95 | $noncss =~ s/(<tr[^>]*>(\s*<td[^>]*>\s*<\/td>)*\s*<\/tr>)//sig;
|
---|
| 96 | $noncss =~ s/<table[^>]*>\s*<\/table>//sig;
|
---|
| 97 |
|
---|
[3162] | 98 | open (FILE, ">$filename") || die;
|
---|
| 99 | print FILE $header . $noncss . $footer;
|
---|
| 100 | close FILE;
|
---|
| 101 |
|
---|
| 102 | }
|
---|
| 103 |
|
---|
| 104 | sub rename_file {
|
---|
| 105 | my ($filename) = @_;
|
---|
| 106 |
|
---|
| 107 | $filename =~ s/\s+//g;
|
---|
| 108 | $filename =~ s/^protemix\(\d+\)-?//;
|
---|
[3168] | 109 | $filename =~ s/^OCR\d+-0//;
|
---|
[3162] | 110 |
|
---|
[3187] | 111 | $filename =~ s/^(\d+)-\d+\.pdf$/$1-all.pdf/;
|
---|
| 112 |
|
---|
[3162] | 113 | return $filename;
|
---|
| 114 | }
|
---|
| 115 |
|
---|
| 116 | # process a title.txt file and replace it with a meta.xml file
|
---|
| 117 | sub process_meta_file {
|
---|
| 118 | my ($filename) = @_;
|
---|
| 119 |
|
---|
| 120 | open (FILE, $filename) || die ("couldn't open $filename");
|
---|
| 121 | undef $/;
|
---|
| 122 | my $title = <FILE>;
|
---|
| 123 | $/ = "\n";
|
---|
| 124 | close FILE;
|
---|
| 125 |
|
---|
| 126 | unlink($filename);
|
---|
| 127 |
|
---|
| 128 | $title =~ s/\s+/ /gs;
|
---|
| 129 | $title =~ s/^\s+//;
|
---|
| 130 | $title =~ s/\s+$//;
|
---|
[3191] | 131 | $title =~ s/\"/"/g;
|
---|
| 132 | $title =~ s/</</g;
|
---|
| 133 | $title =~ s/>/>/g;
|
---|
[3192] | 134 | $title =~ s/&/&/g;
|
---|
[3162] | 135 | $title = &unicode::ascii2utf8(\$title); # assumes title is iso-8859-1
|
---|
| 136 | my $metafile = "<Metafile>\n";
|
---|
| 137 | $metafile .= " <Metadata name=\"Title\">$title</Metadata>\n";
|
---|
| 138 |
|
---|
| 139 | my $dir = File::Basename::dirname($filename);
|
---|
[3207] | 140 | my ($subdirs) = $dir =~ /import\/(.*)$/;
|
---|
[3162] | 141 |
|
---|
| 142 | opendir(DIR, $dir) || die;
|
---|
| 143 | my @files = readdir DIR;
|
---|
| 144 | foreach my $file (@files) {
|
---|
[3207] | 145 | if ($file =~ /\.html?$/i) {
|
---|
| 146 | my $fstub = $file;
|
---|
| 147 | $fstub =~ s/\.html?$//i;
|
---|
| 148 | $metafile .= " <Page filename=\"$fstub\">\n";
|
---|
| 149 | if (defined ($classifications->{'type'}->{"$subdirs/$file"})) {
|
---|
| 150 | $metafile .= " <Metadata name=\"Class1\">" .
|
---|
| 151 | $classifications->{'type'}->{"$subdirs/$file"} . "</Metadata>\n";
|
---|
| 152 | } else {
|
---|
| 153 | print STDERR "$subdirs/$file has no type metadata\n";
|
---|
| 154 | }
|
---|
| 155 | if (defined ($classifications->{'category'}->{"$subdirs/$file"})) {
|
---|
| 156 | $metafile .= " <Metadata name=\"Class2\">" .
|
---|
| 157 | $classifications->{'category'}->{"$subdirs/$file"} . "</Metadata>\n";
|
---|
| 158 | } else {
|
---|
| 159 | print STDERR "$subdirs/$file has no category metadata\n";
|
---|
| 160 | }
|
---|
| 161 | $metafile .= " </Page>\n";
|
---|
[3162] | 162 | }
|
---|
| 163 | }
|
---|
| 164 | $metafile .= "</Metafile>\n";
|
---|
| 165 |
|
---|
| 166 | $filename = "$dir/meta.xml";
|
---|
| 167 | open (FILE, ">$filename") || die;
|
---|
| 168 | print FILE $metafile;
|
---|
| 169 | close FILE;
|
---|
| 170 | }
|
---|
| 171 |
|
---|
[3207] | 172 | sub read_cat_files {
|
---|
[3162] | 173 |
|
---|
[3207] | 174 | my $class = {'type' => {}, 'category' => {}};
|
---|
| 175 |
|
---|
| 176 | open (TYPE, "import/log.Type.txt") || die;
|
---|
| 177 | my $line = "";
|
---|
| 178 | while (defined ($line = <TYPE>)) {
|
---|
| 179 | my ($filename, $cat) = split(/ /, $line);
|
---|
| 180 | chomp $cat;
|
---|
| 181 | $filename =~ s/^.*?Protemix\(ii\)/OCR2/;
|
---|
| 182 | $filename =~ s/^.*?Protemix\(i\)/OCR1/;
|
---|
| 183 | $class->{'type'}->{$filename} = $cat;
|
---|
| 184 | }
|
---|
| 185 | close TYPE;
|
---|
| 186 |
|
---|
| 187 | open (CAT, "import/log.Category.txt") || die;
|
---|
| 188 | $line = "";
|
---|
| 189 | while (defined ($line = <CAT>)) {
|
---|
| 190 | my ($filename, $cat) = split(/ /, $line);
|
---|
| 191 | chomp $cat;
|
---|
| 192 | $filename =~ s/^.*?Protemix\(ii\)/OCR2/;
|
---|
| 193 | $filename =~ s/^.*?Protemix\(i\)/OCR1/;
|
---|
| 194 | $class->{'category'}->{$filename} = $cat;
|
---|
| 195 | }
|
---|
| 196 | close CAT;
|
---|
| 197 |
|
---|
| 198 | return $class;
|
---|
| 199 | }
|
---|