#!/usr/bin/perl -w

# Tidies up the files under ./import (run from the directory that contains it):
#
#   1. walks the tree, renaming files via rename_file() and rewriting every
#      .htm/.html file in place via process_html_file();
#   2. loads the per-file "type" and "category" tables from
#      import/log.Type.txt and import/log.Category.txt;
#   3. replaces every title.txt found during the walk with a meta.xml.
#
# NOTE(review): the copy of this script that this revision was produced from
# had lost its line breaks and every run of text that looked like an HTML/XML
# tag. Readline operators, the tag names inside the clean-up regexes and the
# entity names in the title escaping have been restored because the
# surrounding code pins them down. Three things could NOT be recovered and
# are marked with NOTE(review)/TODO(review) below: the HTML header/footer
# literals, the meta.xml markup literals, and one substitution in
# process_html_file(). Restore those from version control before relying on
# the output.

BEGIN {
    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
}

use strict;
use File::Basename;
use unicode;

# title.txt files discovered by recursive_rename(); they are converted only
# after the whole tree has been renamed.
my @meta_files = ();

recursive_rename("import");

# read in log.Type.txt and log.Category.txt
my $classifications = read_cat_files();

# process meta files
foreach my $mfile (@meta_files) {
    process_meta_file($mfile);
}


# Walks $dir depth-first. Every plain file is renamed according to
# rename_file(); title.txt files are queued in @meta_files and .htm/.html
# files are cleaned up immediately with process_html_file().
# Files whose name ends in "processed.htm" are left alone.
sub recursive_rename {
    my ($dir) = @_;

    # Read the whole listing up front so the handle is closed before we
    # recurse or rename anything inside the directory.
    opendir(my $dh, $dir) || die "couldn't read directory $dir: $!";
    my @files = readdir $dh;
    closedir $dh;

    foreach my $file (@files) {
        next if $file eq "." || $file eq "..";
        next if $file =~ /processed.htm$/;

        my $path = "$dir/$file";
        if (-d $path) {
            recursive_rename($path);
            next;
        }

        my $newfile = rename_file($file);
        if ($newfile ne $file) {
            print STDERR "renaming $file --> $newfile\n";
            # The builtin rename avoids handing the file name to a shell;
            # source and target are always in the same directory.
            if (!rename($path, "$dir/$newfile")) {
                warn "couldn't rename $path to $dir/$newfile: $!\n";
                next;
            }
        }

        if ($file =~ /^title\.txt$/) {
            # we'll process all the title.txt files after we've finished
            # renaming everything
            push(@meta_files, "$dir/$newfile");
        }

        if ($file =~ /\.html?$/i) {
            process_html_file("$dir/$newfile");
        }
    }
}


# Cleans up one HTML file in place (currently just uses the non-css version).
#
# The file is expected to contain two document.write("...") calls; the string
# passed to the first one is kept, un-escaped, stripped of empty div/table
# markup and written back between $header and $footer. A file without that
# pair of calls is reported on STDERR and left untouched.
sub process_html_file {
    my ($filename) = @_;
    print STDERR "processing $filename\n";

    open(my $in, '<', $filename) || die "couldn't open $filename: $!";
    my $file = do { local $/; <$in> };    # slurp without touching the global $/
    close $in;

    # NOTE(review): in the copy this was restored from, both literals contain
    # nothing but newlines; any markup they may have wrapped the page in is
    # not recoverable here -- confirm against version control.
    my $header = "\n\n\n";
    my $footer = "\n\n";

    # Only the first captured string (the non-css version) is used, but the
    # pattern still requires the second document.write() to be present.
    my ($noncss) = $file =~ /document\.write\(\"(.*?[^\\])\"\).*?document\.write\(\"(.*?[^\\])\"\)/si;
    if (!defined $noncss) {
        # Rewriting anyway would replace the page with an empty body.
        print STDERR "no document.write() pair found in $filename, leaving it unchanged\n";
        return;
    }

    # remove backslashes added for javascript strings
    $noncss =~ s/\\\"/\"/sg;
    $noncss =~ s/\\(\s)/$1/g;
    # $noncss =~ s/\\\n/\n/sg;

    # Occurrences of 2HCl (lowercase el) were mistakenly changed to 2HC1
    # (digit one) during OCR in some cases so we'll change them back
    $noncss =~ s/2HC1/2HCl/g;

    # TODO(review): a substitution that altered tags to support the renamed
    # files stood here. Only its first three characters ("s/(") survive in
    # the copy this was restored from, so it has NOT been reinstated --
    # recover it from version control.

    # remove empty div tags (repeat until none are left, so that divs which
    # only contained empty divs go too)
    while ($noncss =~ s/(<div[^>]*>\s*<\/div>)//sig) {}

    # remove empty table rows, then tables left with nothing in them
    $noncss =~ s/(<tr[^>]*>(\s*<td[^>]*>\s*<\/td>)*\s*<\/tr>)//sig;
    $noncss =~ s/<table[^>]*>\s*<\/table>//sig;

    open(my $out, '>', $filename) || die "couldn't write $filename: $!";
    print $out $header . $noncss . $footer;
    close($out) || die "couldn't finish writing $filename: $!";
}


# Returns the name a file should have after tidying: all whitespace removed,
# a leading "protemix(<digits>)" (plus optional "-") or "OCR<digits>-0"
# dropped, and "<digits>-<digits>.pdf" turned into "<digits>-all.pdf".
# Takes and returns a bare file name, not a path.
sub rename_file {
    my ($filename) = @_;

    $filename =~ s/\s+//g;
    $filename =~ s/^protemix\(\d+\)-?//;
    $filename =~ s/^OCR\d+-0//;
    $filename =~ s/^(\d+)-\d+\.pdf$/$1-all.pdf/;

    return $filename;
}


# process a title.txt file and replace it with a meta.xml file
#
# The title is whitespace-normalised, entity-escaped and passed through
# unicode::ascii2utf8. For every .htm/.html file in the same directory the
# type and category recorded in $classifications (keyed by the path below
# import/) are appended; missing entries are reported on STDERR.
sub process_meta_file {
    my ($filename) = @_;

    open(my $in, '<', $filename) || die "couldn't open $filename: $!";
    my $title = do { local $/; <$in> };
    close $in;

    $title =~ s/\s+/ /gs;
    $title =~ s/^\s+//;
    $title =~ s/\s+$//;

    # Escape "&" first: doing it last would turn the entities produced by
    # the other three substitutions into "&amp;quot;" and friends.
    $title =~ s/&/&amp;/g;
    $title =~ s/\"/&quot;/g;
    $title =~ s/</&lt;/g;
    $title =~ s/>/&gt;/g;

    $title = unicode::ascii2utf8(\$title); # assumes title is iso-8859-1

    # NOTE(review): every literal appended to $metafile below is reproduced
    # exactly as found; the markup that presumably surrounded the values
    # (and used $fstub) is missing from the copy this was restored from.
    my $metafile = "\n";
    $metafile .= " $title\n";

    my $dir = File::Basename::dirname($filename);
    my ($subdirs) = $dir =~ /import\/(.*)$/;
    # A title.txt directly inside import/ has no sub-directory part.
    $subdirs = "" unless defined $subdirs;

    opendir(my $dh, $dir) || die "couldn't read directory $dir: $!";
    my @files = readdir $dh;
    closedir $dh;

    foreach my $file (@files) {
        next unless $file =~ /\.html?$/i;

        my $fstub = $file;
        $fstub =~ s/\.html?$//i;

        $metafile .= " \n";
        foreach my $kind ('type', 'category') {
            my $value = $classifications->{$kind}->{"$subdirs/$file"};
            if (defined $value) {
                $metafile .= " " . $value . "\n";
            } else {
                print STDERR "$subdirs/$file has no $kind metadata\n";
            }
        }
        $metafile .= " \n";
    }
    $metafile .= "\n";

    my $metaname = "$dir/meta.xml";
    open(my $out, '>', $metaname) || die "couldn't write $metaname: $!";
    print $out $metafile;
    close($out) || die "couldn't finish writing $metaname: $!";

    # Only drop the source once its replacement is safely on disk.
    unlink($filename);
}


# Loads both classification logs and returns
#   { 'type' => { path => value, ... }, 'category' => { path => value, ... } }
sub read_cat_files {
    return {
        'type'     => _read_cat_file("import/log.Type.txt"),
        'category' => _read_cat_file("import/log.Category.txt"),
    };
}


# Reads one log file of "<path> <value>" lines (split on single spaces; only
# the first two fields are used) and returns a hash ref mapping path to value.
# Everything up to and including "Protemix(ii)" / "Protemix(i)" in the path is
# replaced by "OCR2" / "OCR1". Lines without a second field are skipped.
sub _read_cat_file {
    my ($path) = @_;
    my %value_for;

    open(my $fh, '<', $path) || die "couldn't open $path: $!";
    while (defined(my $line = <$fh>)) {
        my ($filename, $cat) = split(/ /, $line);
        next unless defined $cat;
        chomp $cat;

        # "(ii)" must be tried before "(i)" is irrelevant for matching, but
        # the order is kept as in the original.
        $filename =~ s/^.*?Protemix\(ii\)/OCR2/;
        $filename =~ s/^.*?Protemix\(i\)/OCR1/;

        $value_for{$filename} = $cat;
    }
    close $fh;

    return \%value_for;
}