Changeset 1604 for trunk/gsdl/src/phind/generate/phproc.pm
- Timestamp:
- 2000-10-17T12:35:59+13:00 (24 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/src/phind/generate/phproc.pm
r1562 r1604 38 38 39 39 sub new { 40 my ($class, $archive_dir, $phindex_dir, 40 my ($class, $archive_dir, $phindex_dir, $phindcfg, 41 41 $language, $delimiter, $verbosity, $outhandle) = @_; 42 42 43 43 my $self = new docproc (); 44 44 45 # $self->{'collection'} = $collection;46 45 $self->{'archive_dir'} = $archive_dir; 47 46 $self->{'phindex_dir'} = $phindex_dir; 47 $self->{'indexes'} = $phindcfg; 48 48 49 49 $language =~ s/,/\|/g; … … 60 60 $self->{'txthandle'} = TEXT; 61 61 62 &util::rm("$phindex_dir/mg-d.txt") if (-e "$phindex_dir/mg-d.txt"); 63 open(DOCS, ">$phindex_dir/mg-d.txt") 64 || die "Cannot open $phindex_dir/mg-d.txt: $!"; 62 my $docfile = &util::filename_cat("$phindex_dir", "docs.txt"); 63 &util::rm($docfile) if (-e $docfile); 64 open(DOCS, ">$docfile") 65 || die "Cannot open $docfile: $!"; 65 66 $self->{'dochandle'} = DOCS; 66 67 … … 95 96 my $dochandle = $self->{'dochandle'}; 96 97 # print "dochandle: =$dochandle=\n"; 97 print $dochandle "$OID\t$title\n"; 98 99 # store the text 100 $text = convert_gml_to_tokens($doc_obj->get_text()); 101 98 print $dochandle "<Document>\t$OID\t$title\n"; 99 100 # XXX 101 # Store the text of this object 102 my $indexlist = $self->{'indexes'}; 103 my @parts; 104 my ($index, $part, $level, $field, $section, $data, $text); 105 106 # Output the document delimiter 102 107 my $txthandle = $self->{'txthandle'}; 103 print $txthandle $self->{'delimiter'}, "\n$text\n"; 108 print $txthandle $self->{'delimiter'}, "\n"; 109 110 # Iterate over all the indexes specified in collect.cfg and 111 # add their text to the clauses file. 112 foreach $index (@$indexlist) { 113 $text = ""; 114 115 # Iterate over all the fields in each index 116 @parts = split(/,/, $index); 117 foreach $part (@parts) { 118 119 # Each field has a level and a data element (e.g. 
document:Title) 120 ($level, $field) = split(/:/, $part); 121 die unless ($level && $field); 122 123 # Extract the text from every section 124 # (In phind, document:text and section:text are equivalent) 125 if ($field eq "text") { 126 $data = ""; 127 $section = $doc_obj->get_top_section(); 128 while (defined($section)) { 129 $data .= $doc_obj->get_text($section) . "\n"; 130 $section = $doc_obj->get_next_section($section); 131 } 132 $text .= convert_gml_to_tokens($data) . "\n"; 133 } 134 135 # Extract a metadata field from a document 136 elsif ($level eq "document") { 137 $data = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field); 138 $text .= convert_gml_to_tokens($data) . "\n"; 139 } 140 141 # Extract metadata from every section in a document 142 elsif ($level eq "section") { 143 $data = ""; 144 $section = $doc_obj->get_top_section(); 145 while (defined($section)) { 146 $data .= $doc_obj->get_metadata_element($section, $field) . "\n"; 147 $section = $doc_obj->get_next_section($section); 148 } 149 $text .= convert_gml_to_tokens($data) . "\n"; 150 } 151 152 # Some sort of specification which I don't understand 153 else { 154 die "Unknown level ($level) in phind key ($part) in phind index ($index)\n"; 155 } 156 157 } 158 159 # print the text 160 print $txthandle "$text"; 161 162 } 104 163 } 105 164
Note:
See TracChangeset
for help on using the changeset viewer.