source: trunk/protemix/rename.pl@ 3207

Last change on this file since 3207 was 3207, checked in by sjboddie, 20 years ago

* empty log message *

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 5.0 KB
Line 
1#!/usr/bin/perl -w
2
3
BEGIN {
    # The Greenstone perl library lives under $GSDLHOME/perllib; it has to be
    # on @INC at compile time so that the "use" lines below can be resolved.
    my $gsdl_home = $ENV{'GSDLHOME'};
    defined $gsdl_home or die "GSDLHOME not set\n";
    unshift @INC, "$gsdl_home/perllib";
}
8
9
10use File::Basename;
11use unicode;
12
# title.txt files found by recursive_rename(); they are only converted to
# meta.xml once the whole tree has been renamed
my @meta_files = ();

# pass 1: rename and clean up everything below the import directory
recursive_rename("import");

# load the classifications from log.Type.txt and log.Category.txt
my $classifications = read_cat_files();

# pass 2: replace every collected title.txt with a meta.xml
foreach my $title_file (@meta_files) {
    process_meta_file($title_file);
}
24
# Walk $dir recursively. Every plain file is renamed to whatever
# rename_file() returns for it, html files are cleaned up in place with
# process_html_file(), and title.txt files are queued on @meta_files so they
# can be converted after the whole tree has been renamed.
# Files whose name ends in "processed.htm" are skipped entirely.
sub recursive_rename {
    my ($dir) = @_;

    # read the full listing and close the handle before descending, so the
    # recursion never has more than one directory open
    opendir(my $dh, $dir) or die "couldn't open directory $dir: $!\n";
    my @files = readdir $dh;
    closedir $dh;

    foreach my $file (@files) {
        next if $file eq "." || $file eq "..";
        next if $file =~ /processed\.htm$/;

        my $path = "$dir/$file";
        if (-d $path) {
            recursive_rename($path);
            next;
        }

        my $newfile = rename_file($file);
        if ($newfile ne $file) {
            print STDERR "renaming $file --> $newfile\n";
            # use the builtin instead of shelling out to mv: no quoting
            # problems with odd characters in file names, and a failure is
            # reported instead of being silently ignored
            unless (rename($path, "$dir/$newfile")) {
                print STDERR "couldn't rename $path to $dir/$newfile: $!\n";
                # the file is still under its old name, so there is nothing
                # at "$dir/$newfile" to process
                next;
            }
        }

        if ($file =~ /^title\.txt$/) {
            # we'll process all the title.txt files after we've finished
            # renaming everything
            push(@meta_files, "$dir/$newfile");
        }

        if ($file =~ /\.html?$/i) {
            process_html_file("$dir/$newfile");
        }
    }
}
58
# Clean up one html file in place (currently just use the non-css version).
#
# The input is expected to contain two javascript document.write("...")
# calls; the string passed to the first one is treated as the non-css
# markup. That markup is unescaped, tidied and written back to $filename
# wrapped in a minimal <html>/<body> skeleton.
#
# A file that does not contain the two document.write() calls (for example
# one that was already cleaned by an earlier run) is left untouched.
# Dies if the file cannot be read or written.
sub process_html_file {
    my ($filename) = @_;

    print STDERR "processing $filename\n";

    open(my $in, '<', $filename) or die "couldn't open $filename: $!\n";
    my $file = do { local $/; <$in> };    # slurp without clobbering $/ globally
    close $in;

    my $header = "<html>\n<head></head>\n<body bgcolor=\"#FFFFFF\">\n";
    my $footer = "</body>\n</html>\n";

    # both document.write() calls must be present, but only the argument of
    # the first one is used
    my ($noncss) = $file =~ /document\.write\(\"(.*?[^\\])\"\).*?document\.write\(\"(.*?[^\\])\"\)/si;

    unless (defined $noncss) {
        # nothing to extract; overwriting now would replace the file with
        # an empty body
        print STDERR "$filename has no document.write() pair, leaving it unchanged\n";
        return;
    }

    # remove backslashes added for javascript strings
    $noncss =~ s/\\\"/\"/sg;
    $noncss =~ s/\\\n/\n/sg;

    # Occurrences of 2HCl (lowercase el) were mistakenly changed to 2HC1
    # (digit one) during OCR in some cases so we'll change them back
    $noncss =~ s/2HC1/2HCl/g;

    # alter <img> tags to support renamed files
    $noncss =~ s/(<img src=\")([^\"]+)/$1 . &rename_file($2)/sige;

    # remove rules="all" attribute from table tags
    $noncss =~ s/(<table.*?)rules=\"all\"\s+/$1/sig;

    # remove empty <div> tags; repeated because removing an inner empty div
    # can leave its parent empty
    while ($noncss =~ s/(<div[^>]*>\s*<\/div>)//sig) {}

    # remove empty table rows, then any tables left with no rows
    $noncss =~ s/(<tr[^>]*>(\s*<td[^>]*>\s*<\/td>)*\s*<\/tr>)//sig;
    $noncss =~ s/<table[^>]*>\s*<\/table>//sig;

    open(my $out, '>', $filename) or die "couldn't write $filename: $!\n";
    print {$out} $header . $noncss . $footer;
    # buffered write errors only show up at close
    close($out) or die "couldn't finish writing $filename: $!\n";

    return;
}
102
# Return the cleaned-up form of a file name (no directory part). The
# argument itself is not modified.
#   - all whitespace is removed
#   - a leading "protemix(<digits>)" with an optional "-" is dropped
#   - a leading "OCR<digits>-0" is dropped
#   - "<digits>-<digits>.pdf" becomes "<digits>-all.pdf"
sub rename_file {
    my ($filename) = @_;

    # apply the rules in order to the local copy, aliased as $_
    for ($filename) {
        s/\s+//g;
        s/^protemix\(\d+\)-?//;
        s/^OCR\d+-0//;
        s/^(\d+)-\d+\.pdf$/$1-all.pdf/;
    }

    return $filename;
}
114
# Process a title.txt file and replace it with a meta.xml file.
#
# $filename is the path of the title.txt. Its content becomes the "Title"
# metadata (whitespace collapsed, XML-escaped, converted to utf8). Every
# html file in the same directory gets a <Page> element carrying the
# "Class1"/"Class2" metadata looked up in $classifications under the key
# "<path below import/>/<file name>". The title.txt is deleted only once
# meta.xml has been written successfully.
# Dies if the title file, its directory or meta.xml cannot be opened.
sub process_meta_file {
    my ($filename) = @_;

    open(my $in, '<', $filename) or die("couldn't open $filename: $!\n");
    my $title = do { local $/; <$in> };    # slurp without clobbering $/ globally
    close $in;

    $title =~ s/\s+/ /gs;
    $title =~ s/^\s+//;
    $title =~ s/\s+$//;
    # '&' has to be escaped first, otherwise the ampersands of the entities
    # produced below would be escaped a second time (&quot; -> &amp;quot;)
    $title =~ s/&/&amp;/g;
    $title =~ s/\"/&quot;/g;
    $title =~ s/</&lt;/g;
    $title =~ s/>/&gt;/g;
    $title = &unicode::ascii2utf8(\$title); # assumes title is iso-8859-1
    my $metafile = "<Metafile>\n";
    $metafile .= " <Metadata name=\"Title\">$title</Metadata>\n";

    my $dir = File::Basename::dirname($filename);
    my ($subdirs) = $dir =~ /import\/(.*)$/;
    # no match when the file sits directly in the import directory; the
    # lookup keys then simply start with "/"
    $subdirs = "" unless defined $subdirs;

    opendir(my $dh, $dir) or die "couldn't open directory $dir: $!\n";
    # sorted so that the page order in meta.xml is reproducible
    my @files = sort readdir $dh;
    closedir $dh;

    foreach my $file (@files) {
        next unless $file =~ /\.html?$/i;

        my $fstub = $file;
        $fstub =~ s/\.html?$//i;
        my $key = "$subdirs/$file";

        $metafile .= " <Page filename=\"$fstub\">\n";
        if (defined ($classifications->{'type'}->{$key})) {
            $metafile .= " <Metadata name=\"Class1\">" .
                $classifications->{'type'}->{$key} . "</Metadata>\n";
        } else {
            print STDERR "$key has no type metadata\n";
        }
        if (defined ($classifications->{'category'}->{$key})) {
            $metafile .= " <Metadata name=\"Class2\">" .
                $classifications->{'category'}->{$key} . "</Metadata>\n";
        } else {
            print STDERR "$key has no category metadata\n";
        }
        $metafile .= " </Page>\n";
    }
    $metafile .= "</Metafile>\n";

    my $metapath = "$dir/meta.xml";
    open(my $out, '>', $metapath) or die "couldn't write $metapath: $!\n";
    print {$out} $metafile;
    # buffered write errors only show up at close
    close($out) or die "couldn't finish writing $metapath: $!\n";

    # the title is only removed once its replacement is safely on disk
    unlink($filename) or print STDERR "couldn't remove $filename: $!\n";

    return;
}
170
# Read import/log.Type.txt and import/log.Category.txt (relative to the
# current directory) and return a reference to
#   { 'type' => { file => type, ... }, 'category' => { file => category, ... } }
# Dies if either log file cannot be opened.
sub read_cat_files {
    return {
        'type'     => _read_class_log("import/log.Type.txt"),
        'category' => _read_class_log("import/log.Category.txt"),
    };
}

# Parse one classification log and return a hash reference mapping file
# name to class. Each line is split on single spaces; the first field is the
# file name and the second the class, any further fields are ignored.
# Everything in the file name up to and including "Protemix(ii)" is replaced
# by "OCR2", and up to and including "Protemix(i)" by "OCR1". Lines that do
# not provide both fields (blank lines, for instance) are skipped.
sub _read_class_log {
    my ($logfile) = @_;

    my %class_of = ();

    open(my $log, '<', $logfile) or die "couldn't open $logfile: $!\n";
    while (defined(my $line = <$log>)) {
        # drop the line ending, including the CR of a CRLF-terminated log
        $line =~ s/\r?\n$//;

        my ($filename, $cat) = split(/ /, $line);
        next unless defined $filename && defined $cat;

        $filename =~ s/^.*?Protemix\(ii\)/OCR2/;
        $filename =~ s/^.*?Protemix\(i\)/OCR1/;
        $class_of{$filename} = $cat;
    }
    close $log;

    return \%class_of;
}
Note: See TracBrowser for help on using the repository browser.