source: trunk/protemix/rename.pl@ 3191

Last change on this file since 3191 was 3191, checked in by sjboddie, 22 years ago

* empty log message *

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 5.3 KB
Line 
1#!/usr/bin/perl -w
2
3
4BEGIN {
5 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
6 unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
7}
8
9
10use File::Basename;
11use unicode;
12
13my %classifications = (
14 'Pharmacokinetics' => ['Absorption', 'Distribution', 'Metabolism', 'Elimination',
15 'Effect of Food', 'Drug Interactions', 'Special Populations'],
16 'Pharmacodynamics' => ['Trace Metal', 'Excretion', 'Selectivity', 'Copper', 'Zinc',
17 'Manganese', 'Iron', 'Distribution', 'Tissue', 'Brain',
18 'Heart', 'Plasma', 'Fetal', 'Liver', 'Kidney'],
19 'Safety/Side Effects/Toxicity' => ['Acute Toxicity', 'Repeated Dose Toxicity',
20 'Studies in Rodents', 'Studies in Dogs',
21 'Overdosage', 'Reproductive Toxicity',
22 'Teratogenicity', 'Mutagenecity', 'Carcinogenicity'],
23 'Other' => ['Superoxide Dismutase', 'Ferroxidase I', 'Ferroxidase II',
24 'Metallothienein', 'Ceruloplasmin'],
25 'Efficacy' => ['Heart', 'Complications', 'Microvascular', 'Renal', 'Diabetes'],
26 'Contraindications' => [],
27 'Precautions' => []
28 );
29
30my @meta_files = ();
31
32&recursive_rename("import");
33# process meta files
34foreach my $mfile (@meta_files) {
35 &process_meta_file($mfile);
36}
37
38sub recursive_rename {
39 my ($dir) = @_;
40
41 opendir (DIR, "$dir") || die;
42 my @files = readdir DIR;
43 closedir DIR;
44
45 foreach $file (@files) {
46 next if $file eq "." || $file eq "..";
47 next if $file =~ /processed.htm$/;
48
49 my $path = "$dir/$file";
50 if (-d $path) {
51 &recursive_rename($path);
52 } else {
53 $newfile = &rename_file($file);
54 if ($newfile ne $file) {
55 print STDERR "renaming $file --> $newfile\n";
56 `mv "$path" "$dir/$newfile"`;
57 }
58
59 if ($file =~ /^title\.txt$/) {
60 # we'll process all the title.txt files after we've finished
61 # renaming everything
62 push(@meta_files, "$dir/$newfile");
63 }
64
65 if ($file =~ /\.html?$/i) {
66 &process_html_file("$dir/$newfile");
67 }
68 }
69 }
70}
71
72# clean up the html (currently just use the non-css version)
73sub process_html_file {
74 my ($filename) = @_;
75
76 print STDERR "processing $filename\n";
77
78 open (FILE, $filename) || die;
79 undef $/;
80 my $file = <FILE>;
81 $/ = "\n";
82 close FILE;
83
84 my $header = "<html>\n<head></head>\n<body bgcolor=\"#FFFFFF\">\n";
85 my $footer = "</body>\n</html>\n";
86
87 my ($noncss, $css) = $file =~ /document\.write\(\"(.*?[^\\])\"\).*?document\.write\(\"(.*?[^\\])\"\)/si;
88
89 # remove backslashes added for javascript strings
90 $noncss =~ s/\\\"/\"/sg;
91 $noncss =~ s/\\\n/\n/sg;
92
93 # Occurances of 2HCl (lowercase el) were mistakenly changed to 2HC1
94 # (digit one) during OCR in some cases so we'll change them back
95 $noncss =~ s/2HC1/2HCl/g;
96
97 # alter <img> tags to support renamed files
98 $noncss =~ s/(<img src=\")([^\"]+)/$1 . &rename_file($2)/sige;
99
100 # remove rules="all" attribute from table tags
101 $noncss =~ s/(<table.*?)rules=\"all\"\s+/$1/sig;
102
103 # remove empty <div> tags
104 while ($noncss =~ s/(<div[^>]*>\s*<\/div>)//sig) {}
105
106 # remove empty table rows
107 $noncss =~ s/(<tr[^>]*>(\s*<td[^>]*>\s*<\/td>)*\s*<\/tr>)//sig;
108 $noncss =~ s/<table[^>]*>\s*<\/table>//sig;
109
110 open (FILE, ">$filename") || die;
111 print FILE $header . $noncss . $footer;
112 close FILE;
113
114}
115
116sub rename_file {
117 my ($filename) = @_;
118
119 $filename =~ s/\s+//g;
120 $filename =~ s/^protemix\(\d+\)-?//;
121 $filename =~ s/^OCR\d+-0//;
122
123 $filename =~ s/^(\d+)-\d+\.pdf$/$1-all.pdf/;
124
125 return $filename;
126}
127
128# process a title.txt file and replace it with a meta.xml file
129sub process_meta_file {
130 my ($filename) = @_;
131
132 open (FILE, $filename) || die ("couldn't open $filename");
133 undef $/;
134 my $title = <FILE>;
135 $/ = "\n";
136 close FILE;
137
138 unlink($filename);
139
140 $title =~ s/\s+/ /gs;
141 $title =~ s/^\s+//;
142 $title =~ s/\s+$//;
143 $title =~ s/\"/&quot;/g;
144 $title =~ s/</&lt;/g;
145 $title =~ s/>/&gt;/g;
146 $metafile =~ s/&/&amp;/g;
147 $title = &unicode::ascii2utf8(\$title); # assumes title is iso-8859-1
148 my $metafile = "<Metafile>\n";
149 $metafile .= " <Metadata name=\"Title\">$title</Metadata>\n";
150
151 # currently just write some random values for classification metadata
152 my @c1 = ('Animal', 'Human', 'Other');
153 my $rand = int(rand 3);
154 my $classtext = " <Metadata name=\"Class1\">" . $c1[$rand] . "</Metadata>\n";
155 my @c2 = ('Pharmacokinetics', 'Pharmacodynamics', 'Safety/Side Effects/Toxicity',
156 'Other', 'Efficacy', 'Contraindications', 'Precautions');
157 $rand = int(rand 7);
158 my $c2val = $c2[$rand];
159 $classtext .= " <Metadata name=\"Class2\">$c2val</Metadata>\n";
160 if (defined $classifications{$c2val}) {
161 if (scalar(@{$classifications{$c2val}})) {
162 my $numvals = scalar(@{$classifications{$c2val}});
163 $rand = int(rand $numvals);
164 my $c3val = $classifications{$c2val}->[$rand];
165 $classtext .= " <Metadata name=\"Class3\">$c3val</Metadata>\n";
166 }
167 } else {
168 print STDERR "ERROR: '$c2val' not in clasifications list\n";
169 }
170
171 my $dir = File::Basename::dirname($filename);
172
173 opendir(DIR, $dir) || die;
174 my @files = readdir DIR;
175 foreach my $file (@files) {
176 if ($file =~ s/\.html?$//i) {
177 $metafile .= " <Page filename=\"$file\">\n$classtext </Page>\n";
178 }
179 }
180 $metafile .= "</Metafile>\n";
181
182 $filename = "$dir/meta.xml";
183 open (FILE, ">$filename") || die;
184 print FILE $metafile;
185 close FILE;
186}
187
188
Note: See TracBrowser for help on using the repository browser.