#! /usr/bin/perl -w

# Converts an old style humanity collection which uses an index.txt file to
# use a single metadata.xml file instead.
#
# Usage: <this script> collection-name
#
# The collection is looked up under $GSDLHOME/collect. Its "import"
# directory is only read; the converted documents and the generated
# metadata.xml are written to a sibling "import.new" directory.

use strict;

BEGIN {
    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
}

use util;
use cfgread;

my $collection = $ARGV[0];
die "usage: $0 collection-name\n"
    unless defined $collection && $collection ne "";

my $collectdir = &util::filename_cat($ENV{'GSDLHOME'}, "collect", $collection);
my $importdir = &util::filename_cat($collectdir, "import");

die "import directory $importdir does not exist\n" unless -d $importdir;

# new import structure will be created in "import.new" directory
my $importnewdir = $importdir . ".new";
unless (-d $importnewdir) {
    mkdir ($importnewdir, 0777)
	|| die "couldn't create directory $importnewdir: $!\n";
}


# read in index.txt file and generate metadata.xml, in the process
# converting the html files and copying them across to the import.new
# directory

my $metadata_xml = "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n";
$metadata_xml .= "<!DOCTYPE DirectoryMetadata SYSTEM " .
    "\"http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd\">\n";
$metadata_xml .= "<DirectoryMetadata>\n\n";

my $index_txt = &util::filename_cat($importdir, "index.txt");

# The bareword handle is kept on purpose: read_cfg_line is handed the
# handle by name ("main::INDEXTXT").
open (INDEXTXT, '<', $index_txt) || die "couldn't open $index_txt: $!\n";

my $line = [];
my @fields = ();   # metadata names taken from the most recent "key:" line
my $count = 0;     # number of documents processed so far
while (defined ($line = cfgread::read_cfg_line("main::INDEXTXT"))) {

    # debugging aid: uncomment to stop after a handful of documents
    # last if $count > 10;

    # defensive: skip a record with no tokens rather than treating it
    # as a document without a name
    next unless @$line;

    if ($line->[0] eq "key:") {
	# a "key:" line names the columns of the document lines that follow
	shift @$line;
	@fields = @$line;
	next;
    }

    # any other line describes one document: its directory first, then
    # one metadata value per column
    my $jobnumber = shift @$line;
    &new_document($jobnumber);
    $count ++;

    for my $i (0 .. $#$line) {
	if ($line->[$i] =~ /^<([^>]+)>(.*)$/) {
	    # "<name>value" carries its own metadata name
	    &set_metadata($1, $2);
	} elsif (defined ($fields[$i])) {
	    &set_metadata($fields[$i], $line->[$i]);
	} else {
	    print STDERR "warning: no field name for column $i of " .
		"document $jobnumber; value \"$line->[$i]\" ignored\n";
	}
    }

    # new_document left the <FileSet> and <Description> elements open
    $metadata_xml .= " </Description>\n";
    $metadata_xml .= " </FileSet>\n\n";
}
close INDEXTXT;

$metadata_xml .= "</DirectoryMetadata>\n";

my $metafile = &util::filename_cat($importnewdir, "metadata.xml");
open (my $meta_fh, '>', $metafile) || die "couldn't write $metafile: $!\n";
print $meta_fh $metadata_xml;
# write errors on a buffered handle may only show up at close time
close ($meta_fh) || die "couldn't finish writing $metafile: $!\n";


# Starts a new document: copies its directory from the import directory to
# import.new, converts its .htm file with convert_toc.pl, and opens the
# document's <FileSet>/<Description> entry in $metadata_xml.
#
# $jobnumber is the document directory relative to the import directory.
# Everything up to and including its first "/" is dropped to form the name
# used inside import.new, for the "<name>.htm" file and for <FileName>.
#
# The caller is responsible for appending the closing </Description> and
# </FileSet> tags once the document's metadata has been added.
#
# Dies if the source directory is missing, the destination already exists,
# the .htm file is missing, or any of the copy/convert/rename steps fails.
sub new_document {
    my ($jobnumber) = @_;

    print STDERR "creating new document ($jobnumber)\n";

    my $docdir = &util::filename_cat($importdir, $jobnumber);
    die "document directory $docdir does not exist\n" unless -d $docdir;

    # copy whole directory across to import.new
    $jobnumber =~ s/^.*?\///;
    my $newdocdir = &util::filename_cat($importnewdir, $jobnumber);
    die "$newdocdir already exists\n" if -e $newdocdir;
    # list form of system: no shell, so odd characters in paths are safe
    system ("cp", "-r", $docdir, $newdocdir) == 0
	|| die "couldn't copy $docdir to $newdocdir (cp status $?)\n";

    # convert the htm file to use the new syntax
    my $htmfile = &util::filename_cat($newdocdir, "$jobnumber.htm");
    die "$htmfile does not exist\n" unless -e $htmfile;
    my $tmpfile = "$htmfile.new";
    # the redirections need a shell, so the paths are quoted for it
    my $convert = "convert_toc.pl < " . &_shell_quote($htmfile) .
	" > " . &_shell_quote($tmpfile);
    system ($convert) == 0
	|| die "convert_toc.pl failed on $htmfile (status $?)\n";
    # only replace the original once the conversion is known to have worked
    rename ($tmpfile, $htmfile)
	|| die "couldn't rename $tmpfile to $htmfile: $!\n";

    # update metadata.xml
    $metadata_xml .= " <FileSet>\n";
    $metadata_xml .= " <FileName>$jobnumber</FileName>\n";
    $metadata_xml .= " <Description>\n";
}

# Returns $text wrapped in single quotes for use as one word of a
# Bourne-style shell command line.
sub _shell_quote {
    my ($text) = @_;

    # a single quote cannot appear inside single quotes: close the quoted
    # string, emit an escaped quote, and reopen it
    $text =~ s/'/'\\''/g;
    return "'$text'";
}

# Appends one <Metadata> element, named $key and containing $value, to
# $metadata_xml (inside whichever <Description> is currently open).
#
# Both strings are XML-escaped first, so that a value containing "&", "<",
# ">" or a double quote neither breaks the well-formedness of metadata.xml
# nor is altered when the file is parsed back.
sub set_metadata {
    my ($key, $value) = @_;

    # the loop variable aliases $key and $value, so both are escaped in place
    foreach my $text ($key, $value) {
	$text =~ s/&/&amp;/g;   # must come first so later entities survive
	$text =~ s/</&lt;/g;
	$text =~ s/>/&gt;/g;
	$text =~ s/"/&quot;/g;
    }

    $metadata_xml .= " <Metadata name=\"$key\" mode=\"accumulate\">$value</Metadata>\n";
}