root/other-projects/trunk/protemix/rename.pl @ 14162

Revision 3208, 5.0 KB (checked in by sjboddie, 17 years ago)

*** empty log message ***

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1#!/usr/bin/perl -w
2
3
4BEGIN {
5    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
6    unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
7}
8
9
10use File::Basename;
11use unicode;
12
13my @meta_files = ();
14
15&recursive_rename("import");
16
17# read in log.Type.txt and log.Category.txt
18my $classifications = &read_cat_files();
19
20# process meta files
21foreach my $mfile (@meta_files) {
22    &process_meta_file($mfile);
23}
24
25sub recursive_rename {
26    my ($dir) = @_;
27
28    opendir (DIR, "$dir") || die;
29    my @files = readdir DIR;
30    closedir DIR;
31   
32    foreach $file (@files) {
33    next if $file eq "." || $file eq "..";
34    next if $file =~ /processed.htm$/;
35
36    my $path = "$dir/$file";
37    if (-d $path) {
38        &recursive_rename($path);
39    } else {
40        $newfile = &rename_file($file);
41        if ($newfile ne $file) {
42        print STDERR "renaming $file --> $newfile\n";
43        `mv "$path" "$dir/$newfile"`;
44        }
45
46        if ($file =~ /^title\.txt$/) {
47        # we'll process all the title.txt files after we've finished
48        # renaming everything
49        push(@meta_files, "$dir/$newfile");
50        }
51
52        if ($file =~ /\.html?$/i) {
53        &process_html_file("$dir/$newfile");
54        }
55    }
56    }
57}
58
59# clean up the html (currently just use the non-css version)
60sub process_html_file {
61    my ($filename) = @_;
62
63    print STDERR "processing $filename\n";
64
65    open (FILE, $filename) || die;
66    undef $/;
67    my $file = <FILE>;
68    $/ = "\n";
69    close FILE;
70
71    my $header = "<html>\n<head></head>\n<body bgcolor=\"#FFFFFF\">\n";
72    my $footer = "</body>\n</html>\n";
73
74    my ($noncss, $css) = $file =~ /document\.write\(\"(.*?[^\\])\"\).*?document\.write\(\"(.*?[^\\])\"\)/si;
75
76    # remove backslashes added for javascript strings
77    $noncss =~ s/\\\"/\"/sg;
78    $noncss =~ s/\\(\s)/$1/g;
79#    $noncss =~ s/\\\n/\n/sg;
80
81    # Occurances of 2HCl (lowercase el) were mistakenly changed to 2HC1
82    # (digit one) during OCR in some cases so we'll change them back
83    $noncss =~ s/2HC1/2HCl/g;
84
85    # alter <img> tags to support renamed files
86    $noncss =~ s/(<img src=\")([^\"]+)/$1 . &rename_file($2)/sige;
87
88    # remove rules="all" attribute from table tags
89    $noncss =~ s/(<table.*?)rules=\"all\"\s+/$1/sig;
90
91    # remove empty <div> tags
92    while ($noncss =~ s/(<div[^>]*>\s*<\/div>)//sig) {}
93
94    # remove empty table rows
95    $noncss =~ s/(<tr[^>]*>(\s*<td[^>]*>\s*<\/td>)*\s*<\/tr>)//sig;
96    $noncss =~ s/<table[^>]*>\s*<\/table>//sig;
97
98    open (FILE, ">$filename") || die;
99    print FILE $header . $noncss . $footer;
100    close FILE;
101
102}
103
104sub rename_file {
105    my ($filename) = @_;
106
107    $filename =~ s/\s+//g;
108    $filename =~ s/^protemix\(\d+\)-?//;
109    $filename =~ s/^OCR\d+-0//;
110
111    $filename =~ s/^(\d+)-\d+\.pdf$/$1-all.pdf/;
112
113    return $filename;
114}
115
116# process a title.txt file and replace it with a meta.xml file
117sub process_meta_file {
118    my ($filename) = @_;
119
120    open (FILE, $filename) || die ("couldn't open $filename");
121    undef $/;
122    my $title = <FILE>;
123    $/ = "\n";
124    close FILE;
125
126    unlink($filename);
127
128    $title =~ s/\s+/ /gs;
129    $title =~ s/^\s+//;
130    $title =~ s/\s+$//;
131    $title =~ s/\"/&quot;/g;
132    $title =~ s/</&lt;/g;
133    $title =~ s/>/&gt;/g;
134    $title =~ s/&/&amp;/g;
135    $title = &unicode::ascii2utf8(\$title); # assumes title is iso-8859-1
136    my $metafile = "<Metafile>\n";
137    $metafile .= "  <Metadata name=\"Title\">$title</Metadata>\n";
138
139    my $dir = File::Basename::dirname($filename);
140    my ($subdirs) = $dir =~ /import\/(.*)$/;
141
142    opendir(DIR, $dir) || die;
143    my @files = readdir DIR;
144    foreach my $file (@files) {
145    if ($file =~ /\.html?$/i) {
146        my $fstub = $file;
147        $fstub =~ s/\.html?$//i;
148        $metafile .= "  <Page filename=\"$fstub\">\n";
149        if (defined ($classifications->{'type'}->{"$subdirs/$file"})) {
150        $metafile .= "    <Metadata name=\"Class1\">" .
151            $classifications->{'type'}->{"$subdirs/$file"} . "</Metadata>\n";
152        } else {
153        print STDERR "$subdirs/$file has no type metadata\n";
154        }
155        if (defined ($classifications->{'category'}->{"$subdirs/$file"})) {
156        $metafile .= "    <Metadata name=\"Class2\">" .
157            $classifications->{'category'}->{"$subdirs/$file"} . "</Metadata>\n";
158        } else {
159        print STDERR "$subdirs/$file has no category metadata\n";
160        }
161        $metafile .= "  </Page>\n";
162    }
163    }
164    $metafile .= "</Metafile>\n";
165
166    $filename = "$dir/meta.xml";
167    open (FILE, ">$filename") || die;
168    print FILE $metafile;
169    close FILE;
170}
171
172sub read_cat_files {
173
174    my $class = {'type' => {}, 'category' => {}};
175   
176    open (TYPE, "import/log.Type.txt") || die;
177    my $line = "";
178    while (defined ($line = <TYPE>)) {
179    my ($filename, $cat) = split(/ /, $line);
180    chomp $cat;
181    $filename =~ s/^.*?Protemix\(ii\)/OCR2/;
182    $filename =~ s/^.*?Protemix\(i\)/OCR1/;
183    $class->{'type'}->{$filename} = $cat;
184    }
185    close TYPE;
186   
187    open (CAT, "import/log.Category.txt") || die;
188    $line = "";
189    while (defined ($line = <CAT>)) {
190    my ($filename, $cat) = split(/ /, $line);
191    chomp $cat;
192    $filename =~ s/^.*?Protemix\(ii\)/OCR2/;
193    $filename =~ s/^.*?Protemix\(i\)/OCR1/;
194    $class->{'category'}->{$filename} = $cat;
195    }
196    close CAT;
197
198    return $class;
199}
Note: See TracBrowser for help on using the browser.