source: trunk/protemix/rename.pl@ 3177

Last change on this file since 3177 was 3168, checked in by sjboddie, 22 years ago

* empty log message *

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 5.0 KB
Line 
1#!/usr/bin/perl -w
2
3
4BEGIN {
5 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
6 unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
7}
8
9
10use File::Basename;
11use unicode;
12
13my %classifications = (
14 'Pharmacokinetics' => ['Absorption', 'Distribution', 'Metabolism', 'Elimination',
15 'Effect of Food', 'Drug Interactions', 'Special Populations'],
16 'Pharmacodynamics' => ['Trace Metal', 'Excretion', 'Selectivity', 'Copper', 'Zinc',
17 'Manganese', 'Iron', 'Distribution', 'Tissue', 'Brain',
18 'Heart', 'Plasma', 'Fetal', 'Liver', 'Kidney'],
19 'Safety/Side Effects/Toxicity' => ['Acute Toxicity', 'Repeated Dose Toxicity',
20 'Studies in Rodents', 'Studies in Dogs',
21 'Overdosage', 'Reproductive Toxicity',
22 'Teratogenicity', 'Mutagenecity', 'Carcinogenicity'],
23 'Other' => ['Superoxide Dismutase', 'Ferroxidase I', 'Ferroxidase II',
24 'Metallothienein', 'Ceruloplasmin'],
25 'Efficacy' => ['Heart', 'Complications', 'Microvascular', 'Renal', 'Diabetes'],
26 'Contraindications' => [],
27 'Precautions' => []
28 );
29
30my @meta_files = ();
31
32&recursive_rename("import");
33# process meta files
34foreach my $mfile (@meta_files) {
35 &process_meta_file($mfile);
36}
37
38sub recursive_rename {
39 my ($dir) = @_;
40
41 opendir (DIR, "$dir") || die;
42 my @files = readdir DIR;
43 closedir DIR;
44
45 foreach $file (@files) {
46 next if $file eq "." || $file eq "..";
47 next if $file =~ /processed.htm$/;
48
49 my $path = "$dir/$file";
50 if (-d $path) {
51 &recursive_rename($path);
52 } else {
53 $newfile = &rename_file($file);
54 if ($newfile ne $file) {
55 print STDERR "renaming $file --> $newfile\n";
56 `mv "$path" "$dir/$newfile"`;
57 }
58
59 if ($file =~ /^title\.txt$/) {
60 # we'll process all the title.txt files after we've finished
61 # renaming everything
62 push(@meta_files, "$dir/$newfile");
63 }
64
65 if ($file =~ /\.html?$/i) {
66 &process_html_file("$dir/$newfile");
67 }
68 }
69 }
70}
71
72# clean up the html (currently just use the non-css version)
73sub process_html_file {
74 my ($filename) = @_;
75
76 print STDERR "processing $filename\n";
77
78 open (FILE, $filename) || die;
79 undef $/;
80 my $file = <FILE>;
81 $/ = "\n";
82 close FILE;
83
84 my $header = "<html>\n<head></head>\n<body bgcolor=\"#FFFFFF\">\n";
85 my $footer = "</body>\n</html>\n";
86
87 my ($noncss, $css) = $file =~ /document\.write\(\"(.*?[^\\])\"\).*?document\.write\(\"(.*?[^\\])\"\)/si;
88
89 # remove backslashes added for javascript strings
90 $noncss =~ s/\\\"/\"/sg;
91 $noncss =~ s/\\\n/\n/sg;
92
93 # alter <img> tags to support renamed files
94 $noncss =~ s/(<img src=\")([^\"]+)/$1 . &rename_file($2)/sige;
95
96 # remove rules="all" attribute from table tags
97 $noncss =~ s/(<table.*?)rules=\"all\"\s+/$1/sig;
98
99 # remove empty <div> tags
100 while ($noncss =~ s/(<div[^>]*>\s*<\/div>)//sig) {}
101
102 # remove empty table rows
103 $noncss =~ s/(<tr[^>]*>(\s*<td[^>]*>\s*<\/td>)*\s*<\/tr>)//sig;
104 $noncss =~ s/<table[^>]*>\s*<\/table>//sig;
105
106 open (FILE, ">$filename") || die;
107 print FILE $header . $noncss . $footer;
108 close FILE;
109
110}
111
112sub rename_file {
113 my ($filename) = @_;
114
115 $filename =~ s/\s+//g;
116 $filename =~ s/^protemix\(\d+\)-?//;
117 $filename =~ s/^OCR\d+-0//;
118
119 return $filename;
120}
121
122# process a title.txt file and replace it with a meta.xml file
123sub process_meta_file {
124 my ($filename) = @_;
125
126 open (FILE, $filename) || die ("couldn't open $filename");
127 undef $/;
128 my $title = <FILE>;
129 $/ = "\n";
130 close FILE;
131
132 unlink($filename);
133
134 $title =~ s/\s+/ /gs;
135 $title =~ s/^\s+//;
136 $title =~ s/\s+$//;
137 $title = &unicode::ascii2utf8(\$title); # assumes title is iso-8859-1
138 my $metafile = "<Metafile>\n";
139 $metafile .= " <Metadata name=\"Title\">$title</Metadata>\n";
140
141 # currently just write some random values for classification metadata
142 my @c1 = ('Animal', 'Human', 'Other');
143 my $rand = int(rand 3);
144 my $classtext = " <Metadata name=\"Class1\">" . $c1[$rand] . "</Metadata>\n";
145 my @c2 = ('Pharmacokinetics', 'Pharmacodynamics', 'Safety/Side Effects/Toxicity',
146 'Other', 'Efficacy', 'Contraindications', 'Precautions');
147 $rand = int(rand 7);
148 my $c2val = $c2[$rand];
149 $classtext .= " <Metadata name=\"Class2\">$c2val</Metadata>\n";
150 if (defined $classifications{$c2val}) {
151 if (scalar(@{$classifications{$c2val}})) {
152 my $numvals = scalar(@{$classifications{$c2val}});
153 $rand = int(rand $numvals);
154 my $c3val = $classifications{$c2val}->[$rand];
155 $classtext .= " <Metadata name=\"Class3\">$c3val</Metadata>\n";
156 }
157 } else {
158 print STDERR "ERROR: '$c2val' not in clasifications list\n";
159 }
160
161 my $dir = File::Basename::dirname($filename);
162
163 opendir(DIR, $dir) || die;
164 my @files = readdir DIR;
165 foreach my $file (@files) {
166 if ($file =~ s/\.html?$//i) {
167 $metafile .= " <Page filename=\"$file\">\n$classtext </Page>\n";
168 }
169 }
170 $metafile .= "</Metafile>\n";
171
172 $filename = "$dir/meta.xml";
173 open (FILE, ">$filename") || die;
174 print FILE $metafile;
175 close FILE;
176}
177
178
Note: See TracBrowser for help on using the repository browser.