#!/usr/bin/perl -w

# Make the modules under $GSDLHOME/perllib loadable. This has to happen in a
# BEGIN block so that @INC is already extended when the "use" lines below
# are compiled.
BEGIN {
    defined $ENV{'GSDLHOME'} or die "GSDLHOME not set\n";
    unshift @INC, "$ENV{'GSDLHOME'}/perllib";
}

use File::Basename;
use unicode;

# Classification table used when writing meta.xml files: each key is a
# possible "Class2" value and maps to the list of "Class3" values that may
# accompany it. An empty list means no Class3 entry is written for that key.
my %classifications = (
    'Pharmacokinetics' => [
        'Absorption',
        'Distribution',
        'Metabolism',
        'Elimination',
        'Effect of Food',
        'Drug Interactions',
        'Special Populations',
    ],
    'Pharmacodynamics' => [
        'Trace Metal',
        'Excretion',
        'Selectivity',
        'Copper',
        'Zinc',
        'Manganese',
        'Iron',
        'Distribution',
        'Tissue',
        'Brain',
        'Heart',
        'Plasma',
        'Fetal',
        'Liver',
        'Kidney',
    ],
    'Safety/Side Effects/Toxicity' => [
        'Acute Toxicity',
        'Repeated Dose Toxicity',
        'Studies in Rodents',
        'Studies in Dogs',
        'Overdosage',
        'Reproductive Toxicity',
        'Teratogenicity',
        'Mutagenecity',
        'Carcinogenicity',
    ],
    'Other' => [
        'Superoxide Dismutase',
        'Ferroxidase I',
        'Ferroxidase II',
        'Metallothienein',
        'Ceruloplasmin',
    ],
    'Efficacy' => [
        'Heart',
        'Complications',
        'Microvascular',
        'Renal',
        'Diabetes',
    ],
    'Contraindications' => [],
    'Precautions'       => [],
);

# Paths of the title.txt files found during the directory walk;
# recursive_rename() appends to this list.
my @meta_files = ();

# Walk the "import" directory (relative to the current directory), renaming
# files and cleaning up html pages as we go.
recursive_rename("import");

# process meta files
foreach my $title_file (@meta_files) {
    process_meta_file($title_file);
}

# Walk $dir depth-first, renaming files and cleaning up html pages.
#
# For every entry of $dir except ".", ".." and names ending in
# "processed.htm":
#   - a sub-directory is descended into recursively;
#   - a plain file is renamed to whatever rename_file() returns for its name;
#   - a file whose original name is exactly "title.txt" has its (new) path
#     appended to @meta_files, to be handled once all renaming is finished;
#   - a file whose original name ends in .htm or .html is passed to
#     process_html_file() under its new path.
#
# Dies if $dir cannot be opened. A file that cannot be renamed (including
# the case where rename_file() returns an empty name) is reported on STDERR
# and skipped, so the remaining files are still processed.
sub recursive_rename {
    my ($dir) = @_;

    # A lexical handle keeps each level of the recursion independent.
    opendir(my $dh, $dir) or die "couldn't open directory $dir: $!";
    my @files = readdir $dh;
    closedir $dh;

    foreach my $file (@files) {
        next if $file eq "." || $file eq "..";
        next if $file =~ /processed\.htm$/;

        my $path = "$dir/$file";
        if (-d $path) {
            recursive_rename($path);
            next;
        }

        my $newfile = rename_file($file);
        my $newpath = "$dir/$newfile";
        if ($newfile ne $file) {
            print STDERR "renaming $file --> $newfile\n";
            # Use the rename() builtin rather than shelling out to mv: file
            # names containing quotes, '$' or backticks would be mangled by
            # the shell, and a failed mv used to go unnoticed.
            unless (rename($path, $newpath)) {
                print STDERR "couldn't rename $path to $newpath: $!\n";
                next;
            }
        }

        if ($file =~ /^title\.txt$/) {
            # we'll process all the title.txt files after we've finished
            # renaming everything
            push(@meta_files, $newpath);
        }

        if ($file =~ /\.html?$/i) {
            process_html_file($newpath);
        }
    }
}

# clean up the html (currently just use the non-css version)
#
# The page is expected to contain two document.write("...") calls; the
# string passed to the first one is kept as the page body and the second
# one is ignored. The kept string has its javascript escaping removed,
# its <img src="..."> values passed through rename_file(), and
# rules="all" (followed by whitespace) stripped after a <table. The result
# is wrapped in a minimal html header/footer and written back over
# $filename.
#
# If the two document.write() calls are not found (for instance because
# the file has already been cleaned) a warning is printed and the file is
# left untouched instead of being overwritten with an empty page.
#
# Dies if $filename cannot be read or written.
sub process_html_file {
    my ($filename) = @_;

    print STDERR "processing $filename\n";

    open(my $in, '<', $filename) or die "couldn't open $filename for reading: $!";
    # slurp the whole file; "local" restores $/ automatically
    my $file = do { local $/; <$in> };
    close $in;
    $file = '' unless defined $file;

    my $header = "<html>\n<head></head>\n<body bgcolor=\"#FFFFFF\">\n";
    my $footer = "</body>\n</html>\n";

    # the pattern captures both strings, only the first is used
    my ($noncss) = $file =~ /document\.write\(\"(.*?[^\\])\"\).*?document\.write\(\"(.*?[^\\])\"\)/si;

    unless (defined $noncss) {
        print STDERR "WARNING: no document.write() content found in $filename, leaving it unchanged\n";
        return;
    }

    # remove backslashes added for javascript strings
    $noncss =~ s/\\\"/\"/sg;
    $noncss =~ s/\\\n/\n/sg;

    # alter <img> tags to support renamed files
    $noncss =~ s/(<img src=\")([^\"]+)/$1 . rename_file($2)/sige;

    # remove rules="all" attribute from table tags
    $noncss =~ s/(<table.*?)rules=\"all\"\s+/$1/sig;

    open(my $out, '>', $filename) or die "couldn't open $filename for writing: $!";
    print $out $header . $noncss . $footer;
    # buffered write errors only show up at close time
    close $out or die "couldn't write $filename: $!";

    return;
}

# Work out the normalised form of a file name.
#
# All whitespace is removed first; after that a leading "protemix(<digits>)"
# prefix, together with one optional "-" following it, is dropped. The
# argument itself is not modified; the cleaned-up name is returned.
sub rename_file {
    my $cleaned = shift;

    # strip every run of whitespace, wherever it occurs
    $cleaned =~ s/\s+//g;

    # drop the numbered "protemix(N)" prefix and its optional dash
    $cleaned =~ s/^protemix\(\d+\)-?//;

    return $cleaned;
}

# process a title.txt file and replace it with a meta.xml file
#
# $filename is the path of a title.txt file. Its contents, with whitespace
# collapsed and trimmed and then passed through unicode::ascii2utf8, become
# the "Title" metadata of a meta.xml file written to the same directory.
# Every file in that directory whose name ends in .htm or .html gets a
# <Page> entry (named after the file without its extension); all pages
# share one set of randomly chosen Class1/Class2/Class3 values.
#
# title.txt is removed only after meta.xml has been written successfully,
# so a failure part-way through does not lose the title.
#
# Dies if title.txt or the directory cannot be opened, or if meta.xml
# cannot be written.
#
# NOTE(review): the title and page names are inserted into the XML without
# escaping '&', '<' or '"' -- confirm whether the reader of meta.xml
# expects escaped text before changing this.
sub process_meta_file {
    my ($filename) = @_;

    open(my $in, '<', $filename) or die "couldn't open $filename: $!";
    # slurp the whole file; "local" restores $/ automatically
    my $title = do { local $/; <$in> };
    close $in;
    $title = '' unless defined $title;

    $title =~ s/\s+/ /gs;
    $title =~ s/^\s+//;
    $title =~ s/\s+$//;
    $title = &unicode::ascii2utf8(\$title); # assumes title is iso-8859-1
    my $metafile = "<Metafile>\n";
    $metafile .= " <Metadata name=\"Title\">$title</Metadata>\n";

    # currently just write some random values for classification metadata
    # (the random ranges follow the list sizes instead of hard-coded counts)
    my @c1 = ('Animal', 'Human', 'Other');
    my $c1val = $c1[int(rand(scalar @c1))];
    my $classtext = " <Metadata name=\"Class1\">" . $c1val . "</Metadata>\n";

    my @c2 = ('Pharmacokinetics', 'Pharmacodynamics', 'Safety/Side Effects/Toxicity',
              'Other', 'Efficacy', 'Contraindications', 'Precautions');
    my $c2val = $c2[int(rand(scalar @c2))];
    $classtext .= " <Metadata name=\"Class2\">$c2val</Metadata>\n";

    if (defined $classifications{$c2val}) {
        my $subclasses = $classifications{$c2val};
        # classes with an empty list get no Class3 entry
        if (scalar(@$subclasses)) {
            my $c3val = $subclasses->[int(rand(scalar @$subclasses))];
            $classtext .= " <Metadata name=\"Class3\">$c3val</Metadata>\n";
        }
    } else {
        print STDERR "ERROR: '$c2val' not in clasifications list\n";
    }

    my $dir = File::Basename::dirname($filename);

    opendir(my $dh, $dir) or die "couldn't open directory $dir: $!";
    my @files = readdir $dh;
    closedir $dh;

    foreach my $file (@files) {
        my $page = $file;
        # only html pages get an entry; the extension is dropped
        next unless $page =~ s/\.html?$//i;
        $metafile .= " <Page filename=\"$page\">\n$classtext </Page>\n";
    }
    $metafile .= "</Metafile>\n";

    my $metaxml = "$dir/meta.xml";
    open(my $out, '>', $metaxml) or die "couldn't open $metaxml for writing: $!";
    print $out $metafile;
    # buffered write errors only show up at close time
    close $out or die "couldn't write $metaxml: $!";

    # meta.xml is safely on disk, title.txt is no longer needed
    unlink($filename) or print STDERR "WARNING: couldn't remove $filename: $!\n";

    return;
}