source: other-projects/trunk/protemix/rename.pl@ 14162

Last change on this file since 14162 was 3208, checked in by sjboddie, 20 years ago

* empty log message *

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 5.0 KB
Line 
1#!/usr/bin/perl -w
2
3
# Runs at compile time, before the `use` lines below are resolved:
# refuse to start without GSDLHOME and put its perllib directory at the
# front of the module search path.
BEGIN {
    defined $ENV{'GSDLHOME'} or die "GSDLHOME not set\n";
    unshift @INC, "$ENV{'GSDLHOME'}/perllib";
}
8
9
10use File::Basename;
11use unicode;
12
# Paths of the title.txt files found while walking the import tree;
# recursive_rename() appends to this list.
my @meta_files;

recursive_rename("import");

# load the lookup tables built from log.Type.txt and log.Category.txt
my $classifications = read_cat_files();

# convert every queued title.txt file
process_meta_file($_) for @meta_files;
24
# Walk $dir recursively, normalising file names and cleaning up HTML files.
#
# Every non-directory entry has its name passed through rename_file() and
# is renamed on disk when the result differs.  Entries named title.txt are
# queued on the file-level @meta_files list (under their final path) so
# they can be processed once the whole tree has been renamed; entries
# ending in .htm/.html are rewritten in place by process_html_file().
# Entries whose name matches /processed.htm$/ are skipped entirely.
#
# Dies if a directory cannot be opened.  A failed rename is reported on
# STDERR and the entry is then handled under its existing name.
sub recursive_rename {
    my ($dir) = @_;

    # lexical handle: each recursion level gets its own directory handle
    opendir(my $dh, $dir) || die "couldn't open directory $dir: $!";
    my @files = readdir $dh;
    closedir $dh;

    foreach my $file (@files) {
        next if $file eq "." || $file eq "..";
        next if $file =~ /processed.htm$/;

        my $path = "$dir/$file";
        if (-d $path) {
            recursive_rename($path);
            next;
        }

        my $newfile = rename_file($file);
        if ($newfile ne $file) {
            print STDERR "renaming $file --> $newfile\n";
            # Use the rename() builtin rather than shelling out to mv:
            # file names containing quotes, '$' or backticks cannot break
            # (or inject into) a shell command, and failures are visible.
            unless (rename($path, "$dir/$newfile")) {
                print STDERR "couldn't rename $path to $dir/$newfile: $!\n";
                # keep working on the file that actually exists
                $newfile = $file;
            }
        }

        if ($file =~ /^title\.txt$/) {
            # we'll process all the title.txt files after we've finished
            # renaming everything
            push(@meta_files, "$dir/$newfile");
        }

        if ($file =~ /\.html?$/i) {
            process_html_file("$dir/$newfile");
        }
    }
}
58
# clean up the html (currently just use the non-css version)
#
# Reads $filename, extracts the string passed to the first of two
# document.write("...") calls (the non-css version), tidies it and writes
# it back to the same file wrapped in a minimal html header and footer.
#
# If the file does not contain two document.write("...") calls it is
# reported on STDERR and left unchanged, rather than being overwritten
# with an empty body.
#
# Dies if the file cannot be opened for reading or written back.
sub process_html_file {
    my ($filename) = @_;

    print STDERR "processing $filename\n";

    open(my $in, '<', $filename) || die "couldn't open $filename: $!";
    # slurp the whole file; local restores $/ automatically
    my $file = do { local $/; <$in> };
    close $in;
    $file = "" unless defined $file;

    my $header = "<html>\n<head></head>\n<body bgcolor=\"#FFFFFF\">\n";
    my $footer = "</body>\n</html>\n";

    # the pattern still requires both document.write() calls to be present,
    # but only the first (non-css) string is used
    my ($noncss) = $file =~ /document\.write\(\"(.*?[^\\])\"\).*?document\.write\(\"(.*?[^\\])\"\)/si;

    unless (defined $noncss) {
        print STDERR "no document.write content found in $filename, leaving it unchanged\n";
        return;
    }

    # remove backslashes added for javascript strings
    $noncss =~ s/\\\"/\"/sg;
    $noncss =~ s/\\(\s)/$1/g;

    # Occurrences of 2HCl (lowercase el) were mistakenly changed to 2HC1
    # (digit one) during OCR in some cases so we'll change them back
    $noncss =~ s/2HC1/2HCl/g;

    # alter <img> tags to support renamed files
    $noncss =~ s/(<img src=\")([^\"]+)/$1 . rename_file($2)/sige;

    # remove rules="all" attribute from table tags
    $noncss =~ s/(<table.*?)rules=\"all\"\s+/$1/sig;

    # remove empty <div> tags (repeat, since removing an inner empty div
    # can leave its parent empty)
    while ($noncss =~ s/(<div[^>]*>\s*<\/div>)//sig) {}

    # remove empty table rows, then tables left with no rows
    $noncss =~ s/(<tr[^>]*>(\s*<td[^>]*>\s*<\/td>)*\s*<\/tr>)//sig;
    $noncss =~ s/<table[^>]*>\s*<\/table>//sig;

    open(my $out, '>', $filename) || die "couldn't write $filename: $!";
    print $out $header . $noncss . $footer;
    # buffered write errors only surface at close
    close($out) || die "couldn't write $filename: $!";
}
103
# Return the normalised form of a file name.  Only the string is
# transformed; nothing on disk is touched.
sub rename_file {
    my ($filename) = @_;

    # alias $_ to $filename so each substitution edits it in place
    for ($filename) {
        s/\s+//g;                           # drop all whitespace
        s/^protemix\(\d+\)-?//;             # leading "protemix(N)" and optional "-"
        s/^OCR\d+-0//;                      # leading "OCRN-0"
        s/^(\d+)-\d+\.pdf$/$1-all.pdf/;     # "N-M.pdf" becomes "N-all.pdf"
    }

    return $filename;
}
115
# process a title.txt file and replace it with a meta.xml file
#
# The title text is whitespace-normalised, XML-escaped and converted with
# unicode::ascii2utf8, then written as the Title metadata of a <Metafile>.
# One <Page> element is added for every .htm/.html file found in the same
# directory, carrying Class1/Class2 metadata looked up in the file-level
# $classifications table under the key "<path below import/>/<file>".
# Pages with no type or category entry are reported on STDERR.
#
# The title.txt file is only removed once meta.xml has been written, so a
# failure part-way through does not lose the title.
#
# Dies if the title file or its directory cannot be read, or if meta.xml
# cannot be written.
sub process_meta_file {
    my ($filename) = @_;

    open(my $in, '<', $filename) || die("couldn't open $filename: $!");
    # slurp the whole file; local restores $/ automatically
    my $title = do { local $/; <$in> };
    close $in;
    $title = "" unless defined $title;

    $title =~ s/\s+/ /gs;
    $title =~ s/^\s+//;
    $title =~ s/\s+$//;
    # '&' must be escaped first, otherwise the ampersands of the entities
    # produced below would themselves be turned into &amp;
    $title =~ s/&/&amp;/g;
    $title =~ s/\"/&quot;/g;
    $title =~ s/</&lt;/g;
    $title =~ s/>/&gt;/g;
    $title = unicode::ascii2utf8(\$title); # assumes title is iso-8859-1
    my $metafile = "<Metafile>\n";
    $metafile .= "  <Metadata name=\"Title\">$title</Metadata>\n";

    my $dir = File::Basename::dirname($filename);
    my ($subdirs) = $dir =~ /import\/(.*)$/;
    # no match when the title file sits directly in "import"
    $subdirs = "" unless defined $subdirs;

    opendir(my $dh, $dir) || die "couldn't open directory $dir: $!";
    my @files = readdir $dh;
    closedir $dh;

    foreach my $file (@files) {
        next unless $file =~ /\.html?$/i;

        my $fstub = $file;
        $fstub =~ s/\.html?$//i;
        my $key = "$subdirs/$file";

        $metafile .= "  <Page filename=\"$fstub\">\n";
        if (defined ($classifications->{'type'}->{$key})) {
            $metafile .= "    <Metadata name=\"Class1\">" .
                $classifications->{'type'}->{$key} . "</Metadata>\n";
        } else {
            print STDERR "$subdirs/$file has no type metadata\n";
        }
        if (defined ($classifications->{'category'}->{$key})) {
            $metafile .= "    <Metadata name=\"Class2\">" .
                $classifications->{'category'}->{$key} . "</Metadata>\n";
        } else {
            print STDERR "$subdirs/$file has no category metadata\n";
        }
        $metafile .= "  </Page>\n";
    }
    $metafile .= "</Metafile>\n";

    my $metaname = "$dir/meta.xml";
    open(my $out, '>', $metaname) || die "couldn't write $metaname: $!";
    print $out $metafile;
    # buffered write errors only surface at close
    close($out) || die "couldn't write $metaname: $!";

    unlink($filename);
}
171
# Read import/log.Type.txt and import/log.Category.txt.
#
# Returns a hash reference with two keys, 'type' and 'category', each
# holding a hash reference that maps a file name to the value listed for
# it in the corresponding log file.
#
# Dies if either log file cannot be opened.
sub read_cat_files {
    return {
        'type'     => _read_cat_file("import/log.Type.txt"),
        'category' => _read_cat_file("import/log.Category.txt"),
    };
}

# Parse one log file of "<filename> <value>" lines into a hash reference.
# Everything in the file name up to and including "Protemix(ii)" is
# replaced by "OCR2", and up to and including "Protemix(i)" by "OCR1".
sub _read_cat_file {
    my ($path) = @_;

    my %value_of;

    open(my $fh, '<', $path) || die "couldn't open $path: $!";
    while (defined (my $line = <$fh>)) {
        my ($filename, $cat) = split(/ /, $line);
        # blank lines and lines without a space carry no value
        next unless defined $cat;
        chomp $cat;
        $filename =~ s/^.*?Protemix\(ii\)/OCR2/;
        $filename =~ s/^.*?Protemix\(i\)/OCR1/;
        $value_of{$filename} = $cat;
    }
    close $fh;

    return \%value_of;
}
Note: See TracBrowser for help on using the repository browser.