[3162] | 1 | #!/usr/bin/perl -w
|
---|
| 2 |
|
---|
| 3 |
|
---|
| 4 | BEGIN {
|
---|
| 5 | die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
|
---|
| 6 | unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
|
---|
| 7 | }
|
---|
| 8 |
|
---|
| 9 |
|
---|
| 10 | use File::Basename;
|
---|
| 11 | use unicode;
|
---|
| 12 |
|
---|
| 13 | my %classifications = (
|
---|
| 14 | 'Pharmacokinetics' => ['Absorption', 'Distribution', 'Metabolism', 'Elimination',
|
---|
| 15 | 'Effect of Food', 'Drug Interactions', 'Special Populations'],
|
---|
| 16 | 'Pharmacodynamics' => ['Trace Metal', 'Excretion', 'Selectivity', 'Copper', 'Zinc',
|
---|
| 17 | 'Manganese', 'Iron', 'Distribution', 'Tissue', 'Brain',
|
---|
| 18 | 'Heart', 'Plasma', 'Fetal', 'Liver', 'Kidney'],
|
---|
| 19 | 'Safety/Side Effects/Toxicity' => ['Acute Toxicity', 'Repeated Dose Toxicity',
|
---|
| 20 | 'Studies in Rodents', 'Studies in Dogs',
|
---|
| 21 | 'Overdosage', 'Reproductive Toxicity',
|
---|
| 22 | 'Teratogenicity', 'Mutagenecity', 'Carcinogenicity'],
|
---|
| 23 | 'Other' => ['Superoxide Dismutase', 'Ferroxidase I', 'Ferroxidase II',
|
---|
| 24 | 'Metallothienein', 'Ceruloplasmin'],
|
---|
| 25 | 'Efficacy' => ['Heart', 'Complications', 'Microvascular', 'Renal', 'Diabetes'],
|
---|
| 26 | 'Contraindications' => [],
|
---|
| 27 | 'Precautions' => []
|
---|
| 28 | );
|
---|
| 29 |
|
---|
| 30 | my @meta_files = ();
|
---|
| 31 |
|
---|
| 32 | &recursive_rename("import");
|
---|
| 33 | # process meta files
|
---|
| 34 | foreach my $mfile (@meta_files) {
|
---|
| 35 | &process_meta_file($mfile);
|
---|
| 36 | }
|
---|
| 37 |
|
---|
| 38 | sub recursive_rename {
|
---|
| 39 | my ($dir) = @_;
|
---|
| 40 |
|
---|
| 41 | opendir (DIR, "$dir") || die;
|
---|
| 42 | my @files = readdir DIR;
|
---|
| 43 | closedir DIR;
|
---|
| 44 |
|
---|
| 45 | foreach $file (@files) {
|
---|
| 46 | next if $file eq "." || $file eq "..";
|
---|
| 47 | next if $file =~ /processed.htm$/;
|
---|
| 48 |
|
---|
| 49 | my $path = "$dir/$file";
|
---|
| 50 | if (-d $path) {
|
---|
| 51 | &recursive_rename($path);
|
---|
| 52 | } else {
|
---|
| 53 | $newfile = &rename_file($file);
|
---|
| 54 | if ($newfile ne $file) {
|
---|
| 55 | print STDERR "renaming $file --> $newfile\n";
|
---|
| 56 | `mv "$path" "$dir/$newfile"`;
|
---|
| 57 | }
|
---|
| 58 |
|
---|
| 59 | if ($file =~ /^title\.txt$/) {
|
---|
| 60 | # we'll process all the title.txt files after we've finished
|
---|
| 61 | # renaming everything
|
---|
| 62 | push(@meta_files, "$dir/$newfile");
|
---|
| 63 | }
|
---|
| 64 |
|
---|
| 65 | if ($file =~ /\.html?$/i) {
|
---|
| 66 | &process_html_file("$dir/$newfile");
|
---|
| 67 | }
|
---|
| 68 | }
|
---|
| 69 | }
|
---|
| 70 | }
|
---|
| 71 |
|
---|
| 72 | # clean up the html (currently just use the non-css version)
|
---|
| 73 | sub process_html_file {
|
---|
| 74 | my ($filename) = @_;
|
---|
| 75 |
|
---|
| 76 | print STDERR "processing $filename\n";
|
---|
| 77 |
|
---|
| 78 | open (FILE, $filename) || die;
|
---|
| 79 | undef $/;
|
---|
| 80 | my $file = <FILE>;
|
---|
| 81 | $/ = "\n";
|
---|
| 82 | close FILE;
|
---|
| 83 |
|
---|
| 84 | my $header = "<html>\n<head></head>\n<body bgcolor=\"#FFFFFF\">\n";
|
---|
| 85 | my $footer = "</body>\n</html>\n";
|
---|
| 86 |
|
---|
| 87 | my ($noncss, $css) = $file =~ /document\.write\(\"(.*?[^\\])\"\).*?document\.write\(\"(.*?[^\\])\"\)/si;
|
---|
| 88 |
|
---|
| 89 | # remove backslashes added for javascript strings
|
---|
| 90 | $noncss =~ s/\\\"/\"/sg;
|
---|
| 91 | $noncss =~ s/\\\n/\n/sg;
|
---|
| 92 |
|
---|
| 93 | # alter <img> tags to support renamed files
|
---|
| 94 | $noncss =~ s/(<img src=\")([^\"]+)/$1 . &rename_file($2)/sige;
|
---|
| 95 |
|
---|
| 96 | # remove rules="all" attribute from table tags
|
---|
| 97 | $noncss =~ s/(<table.*?)rules=\"all\"\s+/$1/sig;
|
---|
| 98 |
|
---|
[3168] | 99 | # remove empty <div> tags
|
---|
| 100 | while ($noncss =~ s/(<div[^>]*>\s*<\/div>)//sig) {}
|
---|
| 101 |
|
---|
| 102 | # remove empty table rows
|
---|
| 103 | $noncss =~ s/(<tr[^>]*>(\s*<td[^>]*>\s*<\/td>)*\s*<\/tr>)//sig;
|
---|
| 104 | $noncss =~ s/<table[^>]*>\s*<\/table>//sig;
|
---|
| 105 |
|
---|
[3162] | 106 | open (FILE, ">$filename") || die;
|
---|
| 107 | print FILE $header . $noncss . $footer;
|
---|
| 108 | close FILE;
|
---|
| 109 |
|
---|
| 110 | }
|
---|
| 111 |
|
---|
| 112 | sub rename_file {
|
---|
| 113 | my ($filename) = @_;
|
---|
| 114 |
|
---|
| 115 | $filename =~ s/\s+//g;
|
---|
| 116 | $filename =~ s/^protemix\(\d+\)-?//;
|
---|
[3168] | 117 | $filename =~ s/^OCR\d+-0//;
|
---|
[3162] | 118 |
|
---|
| 119 | return $filename;
|
---|
| 120 | }
|
---|
| 121 |
|
---|
| 122 | # process a title.txt file and replace it with a meta.xml file
|
---|
| 123 | sub process_meta_file {
|
---|
| 124 | my ($filename) = @_;
|
---|
| 125 |
|
---|
| 126 | open (FILE, $filename) || die ("couldn't open $filename");
|
---|
| 127 | undef $/;
|
---|
| 128 | my $title = <FILE>;
|
---|
| 129 | $/ = "\n";
|
---|
| 130 | close FILE;
|
---|
| 131 |
|
---|
| 132 | unlink($filename);
|
---|
| 133 |
|
---|
| 134 | $title =~ s/\s+/ /gs;
|
---|
| 135 | $title =~ s/^\s+//;
|
---|
| 136 | $title =~ s/\s+$//;
|
---|
| 137 | $title = &unicode::ascii2utf8(\$title); # assumes title is iso-8859-1
|
---|
| 138 | my $metafile = "<Metafile>\n";
|
---|
| 139 | $metafile .= " <Metadata name=\"Title\">$title</Metadata>\n";
|
---|
| 140 |
|
---|
| 141 | # currently just write some random values for classification metadata
|
---|
| 142 | my @c1 = ('Animal', 'Human', 'Other');
|
---|
| 143 | my $rand = int(rand 3);
|
---|
| 144 | my $classtext = " <Metadata name=\"Class1\">" . $c1[$rand] . "</Metadata>\n";
|
---|
| 145 | my @c2 = ('Pharmacokinetics', 'Pharmacodynamics', 'Safety/Side Effects/Toxicity',
|
---|
| 146 | 'Other', 'Efficacy', 'Contraindications', 'Precautions');
|
---|
| 147 | $rand = int(rand 7);
|
---|
| 148 | my $c2val = $c2[$rand];
|
---|
| 149 | $classtext .= " <Metadata name=\"Class2\">$c2val</Metadata>\n";
|
---|
| 150 | if (defined $classifications{$c2val}) {
|
---|
| 151 | if (scalar(@{$classifications{$c2val}})) {
|
---|
| 152 | my $numvals = scalar(@{$classifications{$c2val}});
|
---|
| 153 | $rand = int(rand $numvals);
|
---|
| 154 | my $c3val = $classifications{$c2val}->[$rand];
|
---|
| 155 | $classtext .= " <Metadata name=\"Class3\">$c3val</Metadata>\n";
|
---|
| 156 | }
|
---|
| 157 | } else {
|
---|
| 158 | print STDERR "ERROR: '$c2val' not in clasifications list\n";
|
---|
| 159 | }
|
---|
| 160 |
|
---|
| 161 | my $dir = File::Basename::dirname($filename);
|
---|
| 162 |
|
---|
| 163 | opendir(DIR, $dir) || die;
|
---|
| 164 | my @files = readdir DIR;
|
---|
| 165 | foreach my $file (@files) {
|
---|
| 166 | if ($file =~ s/\.html?$//i) {
|
---|
| 167 | $metafile .= " <Page filename=\"$file\">\n$classtext </Page>\n";
|
---|
| 168 | }
|
---|
| 169 | }
|
---|
| 170 | $metafile .= "</Metafile>\n";
|
---|
| 171 |
|
---|
| 172 | $filename = "$dir/meta.xml";
|
---|
| 173 | open (FILE, ">$filename") || die;
|
---|
| 174 | print FILE $metafile;
|
---|
| 175 | close FILE;
|
---|
| 176 | }
|
---|
| 177 |
|
---|
| 178 |
|
---|