source: trunk/gsdl/perllib/classify/phind.pm@ 2487

Last change on this file since 2487 was 2487, checked in by sjboddie, 23 years ago

Changes to get phind working under windows

  • Property svn:keywords set to Author Date Id Revision
File size: 39.2 KB
Line 
1###########################################################################
2#
3# phind.pm -- the Phind classifier
4#
5# Copyright (C) 2000 Gordon W. Paynter
6# Copyright (C) 2000 New Zealand Digital Library Project
7#
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# The phind classifier plugin.
30# Options are described in the print_usage function.
31# Type "classinfo.pl phind" at the command line for a summary.
32
33package phind;
34
35use BasClas;
36use util;
37use ghtml;
38use unicode;
39
# Set up inheritance from the BasClas base classifier at compile time.
BEGIN {
    @ISA = qw(BasClas);
}
43
44
sub print_usage {
    # Print a summary of the classifier's command-line options to STDERR.
    # Takes no arguments; the text is a single literal block.
    print STDERR "
  usage: classify phind [options]

  options:
   -text Fields      The text used to build the phrase hierarchy.
                     (default: 'section:Title,section:text')

   -title Title      The metadata field used to describe each document.
                     (default: 'Title')

   -button Name      The label for the classifier screen and button in
                     navigation bar.
                     (default: 'Phrase')

   -language Regex   Language or languages to use building hierarchy.
                     Languages are identified by two-letter country codes
                     like en (English), es (Spanish), and fr (French).
                     Language is a regular expression, so 'en|fr' (English or
                     French) and '..' (match any language) are valid.
                     (default: 'en'.)

   -savephrases File If set, the phrase information will be stored in
                     the given file as text. It is probably a good idea
                     to use an absolute path.
                     (default: not set)

   -suffixmode N     The smode parameter to the phrase extraction program. A
                     value of 0 means that stopwords are ignored, and of 1
                     means that stopwords are used.
                     (default: 1)

   -thesaurus Name   Name of a thesaurus stored in phind format in the
                     collection's etc directory.
                     (default: not set)

   -untidy           Don't remove working files.

";
}
84
85
# File suffixes that get_classify_info keeps when it tidies the phind
# directory; files with any other suffix (or none) are deleted there.
%wanted_index_files = map { ($_ => 1) }
    qw(td t ti tl tsd idb ib1 ib2 ib3 i il w wa);
99
100
101
102# Phrase delimiter symbols - these should be abstracted out someplace
103
# Special tokens written into the clauses file and the symbol table to
# mark the start and end of the collection, document boundaries and
# sentence boundaries.  The order of @delimiters fixes their symbol
# numbers in extract_vocabulary.
my ($colstart, $colend, $doclimit, $senlimit) =
    ("COLLECTIONSTART", "COLLECTIONEND", "DOCUMENTLIMIT", "SENTENCELIMIT");
my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
109
110
111# Create a new phind browser based on collect.cfg
112
sub new {
    # Construct a phind classifier object.
    #
    # Arguments: the class name followed by the classifier options taken
    # from collect.cfg (see print_usage for the list).
    #
    # Before the options are parsed the constructor checks that the
    # "suffix" program, the phindcgi program and the compiled Phind Java
    # applet exist under $ENV{'GSDLHOME'}; if any is missing it prints
    # installation instructions to STDERR and exits with status 1.
    # Dies if the options cannot be parsed.
    #
    # Returns the blessed classifier object.
    my $class = shift (@_);
    my $self = BasClas->new($class, @_);

    my $out = $self->{'outhandle'};

    # Phind installation check
    # The phind phrase browser is research software and is not installed
    # by default.  If the user attempts to use it we tell them how to
    # install it.  If they can do that and get all the files in place,
    # then we let them proceed.
    print $out "Checking Phind phrase browser requirements...\n";

    # Matched case-insensitively, as get_classify_info does.
    my $is_windows = ($ENV{'GSDLOS'} =~ /^windows$/i);

    # Ensure the Phind generate scripts are in place
    my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
    $file1 .= ".exe" if $is_windows;
    my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");

    if (!(-e $file1)) {
        print STDERR "The phind \"suffix\" program is not installed. ";
        print STDERR "To install it, change to the directory\n";
        print STDERR " $src\n";
        print STDERR "and type \"make install-phind\".\n\n";
        exit(1);
    }

    # Ensure the Phind CGI script is in place
    $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "cgi-bin", "phindcgi");
    $file1 .= ".exe" if $is_windows;
    $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "host");

    if (!(-e $file1)) {
        print STDERR "The phind CGI program is not installed. ";
        print STDERR "To install it, change to the directory\n";
        print STDERR " $src\n";
        print STDERR "and type \"make install-phind\".\n\n";
        exit(1);
    }

    # Ensure the Phind Java applet is in place
    $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "client");
    $file1 = &util::filename_cat($src, "Phind.class");

    if (!(-e $file1)) {
        print STDERR "The phind Java classes are not compiled. ";
        print STDERR "To compile them, change to the directory\n";
        print STDERR " $src\n";
        print STDERR "and use your Java compiler to compile Phind.java.\n";
        print STDERR "(if you have Java 1.2 installed, type \"javac Phind.java\")\n\n";
        exit(1);
    }

    # Parse classifier arguments.
    # $language is lexical so that it no longer leaks into the package
    # as an undeclared global.
    # NOTE(review): print_usage documents -savephrases as taking a file
    # name, but the pattern below is \d with default 0 -- confirm which
    # is intended against parsargv's pattern semantics.
    my $builddir = "";
    my $language = "";
    if (!parsargv::parse(\@_,
                         q^text/.*/section:Title,section:text^, \$self->{'indexes'},
                         q^title/.*/Title^, \$self->{'titlefield'},
                         q^button/.*/Phrase^, \$self->{'buttonname'},
                         q^language/.*/en^, \$language,
                         q^builddir/.*/^, \$builddir,
                         q^savephrases/\d/0^, \$self->{'savephrases'},
                         q^suffixmode/\d/1^, \$self->{'suffixmode'},
                         q^thesaurus/.*/^, \$self->{'thesaurus'},
                         q^untidy^, \$self->{'untidy'},
                         "allow_extra_options")) {

        print STDERR "\nIncorrect options passed to $class, check your collect.cfg file\n";
        &print_usage();
        die "\n";
    }

    # classifier information
    $self->{'collection'} = $ENV{'GSDLCOLLECTION'};

    # limit languages
    $self->{'language_exp'} = $language;

    # collection directories
    $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'};
    if (!$builddir) {
        $builddir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
    }
    $self->{'builddir'} = $builddir;

    # number of documents accepted by classify so far
    $self->{'total'} = 0;

    return bless $self, $class;
}
203
204
205# Initialise the phind classifier
206
sub init {
    # Prepare the classifier for a build.
    #
    # Creates a fresh "phindN" directory inside the build directory (N is
    # the first number whose directory does not exist yet) and opens the
    # "clauses" and "docs.txt" files in it for writing.  The directory,
    # its number and the two filehandle names are stored in $self.
    #
    # Dies if the build directory does not exist or a file cannot be
    # opened.
    my $self = shift (@_);

    # ensure we have a build directory
    my $builddir = $self->{'builddir'};
    die "phind: build directory '$builddir' does not exist" unless (-e $builddir);

    # create phind directory
    my $phnumber = 1;
    my $phinddir = &util::filename_cat($builddir, "phind1");
    while (-e $phinddir) {
        $phnumber++;
        $phinddir = &util::filename_cat($builddir, "phind$phnumber");
    }
    &util::mk_dir($phinddir);
    $self->{'phinddir'} = $phinddir;
    $self->{'phindnumber'} = $phnumber;

    # Open filehandles for documents and text.  The handles are symbolic
    # (named by a string such as "TEXT1") so each phind classifier gets
    # its own; the three-argument open keeps odd characters in the path
    # from being read as part of the open mode.
    my $clausefile = &util::filename_cat($phinddir, "clauses");
    &util::rm($clausefile) if (-e $clausefile);

    my $txthandle = 'TEXT' . $phnumber;
    open($txthandle, '>', $clausefile) || die "Cannot open $clausefile: $!";
    $self->{'txthandle'} = $txthandle;

    my $docfile = &util::filename_cat($phinddir, "docs.txt");
    &util::rm($docfile) if (-e $docfile);

    my $dochandle = 'DOC' . $phnumber;
    open($dochandle, '>', $docfile) || die "Cannot open $docfile: $!";
    $self->{'dochandle'} = $dochandle;

}
241
242
243# Classify each document.
244#
245# Each document is passed here in turn. The classifier extracts the
246# text of each and stores it in the clauses file. Document details are
247# stored in the docs.txt file.
248
sub classify {
    # Record one document for the phrase hierarchy.
    #
    # Arguments: the classifier object and a document object.
    #
    # Documents whose "Language" metadata is set and does not match the
    # classifier's language expression are skipped.  For every other
    # document a "<Document>" line (OID and title) is appended to the
    # docs file, and a document delimiter followed by the tokenised text
    # of each configured index is appended to the clauses file.
    #
    # Dies if an index specification is malformed or has an unknown level.
    my $self = shift (@_);
    my $doc_obj = shift @_;

    my $verbosity = $self->{'verbosity'};
    my $top_section = $doc_obj->get_top_section();

    my $titlefield = $self->{'titlefield'};

    my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
    print "process: $title\n" if ($verbosity > 2);

    # Only consider the file if it is in the correct language
    my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
    my $phrlanguage = $self->{'language_exp'};
    return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));

    # Record this file.  The original test looked up the literal hash key
    # '$verbosity' and printed an undefined $file, so this progress line
    # could never appear; it now reports the running count and the title.
    $self->{'total'} ++;
    print "file $self->{'total'}: $title\n" if ($verbosity > 2);

    # Store document details
    my $OID = $doc_obj->get_OID();
    $OID = "NULL" unless defined $OID;
    my $dochandle = $self->{'dochandle'};
    print $dochandle "<Document>\t$OID\t$title\n";

    # Store the text occurring in this object

    # output the document delimiter
    my $txthandle = $self->{'txthandle'};
    print $txthandle "$doclimit\n";

    # iterate over the required indexes and store their text
    my $indexes = $self->{'indexes'};
    my $text = "";

    foreach my $part (split(/,/, $indexes)) {

        # Each field has a level and a data element (e.g. document:Title)
        my ($level, $field) = split(/:/, $part);
        die "Malformed phind index specification ($part): expected level:field\n"
            unless ($level && $field);

        # Extract the text from every section
        # (In phind, document:text and section:text are equivalent)
        if ($field eq "text") {
            my $data = "";
            my $section = $doc_obj->get_top_section();
            while (defined($section)) {
                $data .= $doc_obj->get_text($section) . "\n";
                $section = $doc_obj->get_next_section($section);
            }
            $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
        }

        # Extract a metadata field from a document
        # (If there is more than one element of the given type, get them all.)
        elsif ($level eq "document") {
            my $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
            foreach my $data (@$dataref) {
                $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
            }
        }

        # Extract metadata from every section in a document
        elsif ($level eq "section") {
            my $data = "";
            my $section = $doc_obj->get_top_section();
            while (defined($section)) {
                my $dataref = $doc_obj->get_metadata($section, $field);
                $data .= join("\n", @$dataref) . "\n";
                $section = $doc_obj->get_next_section($section);
            }
            $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
        }

        # Some sort of specification which I don't understand
        else {
            die "Unknown level ($level) in phind index ($part)\n";
        }

    }

    # output the text, with runs of newlines squeezed to a single one
    $text =~ tr/\n//s;
    print $txthandle "$text";
}
338
339
340# Construct the classifier from the information already gathered
341#
342# When get_classify_info is called, the clauses and docs.txt files have
343# already been constructed in the phind directory. This function will
344# translate them into compressed, indexed MGPP files that can be read by
345# the phindcgi script. It will also register our classifier so that it
346# shows up in the navigation bar.
347
sub get_classify_info {
    # Build the phind indexes and describe the classifier.
    #
    # Closes the docs and clauses files, then runs, in order: vocabulary
    # extraction, the external "suffix" program, phrase renumbering, and
    # the mgpp tools over pdata.txt, pword.txt and docs.txt in the phind
    # directory.  Unless the "untidy" option is set, every file in the
    # phind directory whose suffix is not in %wanted_index_files is then
    # deleted.
    #
    # Returns a reference to the classifier information hash.
    my $self = shift (@_);

    close $self->{'dochandle'};
    close $self->{'txthandle'};
    my $verbosity = $self->{'verbosity'};
    my $out = $self->{'outhandle'};
    my $phinddir = $self->{'phinddir'};

    # extra option passed to the mgpp tools on non-Windows systems
    my $osextra = "";
    if ($ENV{'GSDLOS'} !~ /^windows$/i) {
        $osextra = " -d /";
    }

    if ($verbosity) {
        print $out "\n*** phind.pm generating indexes for ", $self->{'indexes'}, "\n";
        # the separating space after "in" was missing
        print $out "*** in ", $self->{'phinddir'}, "\n";
    }

    # Construct phind indexes
    my $suffixmode = $self->{'suffixmode'};

    # Generate the vocabulary, symbol statistics, and numbers file
    # from the clauses file
    print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
    &extract_vocabulary($self);

    # Use the suffix program to generate the phind/phrases file
    print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
    &execute("suffix $phinddir $suffixmode $verbosity", $verbosity, $out);

    # Create the phrase file and put phrase numbers in phind/phrases
    print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
    &renumber_phrases($self);

    print $out "\nCreating phrase databases\n";
    my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
    my $mg_stem = &util::filename_cat($phinddir, "pdata");

    &execute("mgpp_passes $osextra -f $mg_stem -T1 $mg_input", $verbosity, $out);
    &execute("mgpp_compression_dict $osextra -f $mg_stem", $verbosity, $out);
    &execute("mgpp_passes $osextra -f $mg_stem -T2 $mg_input", $verbosity, $out);

    # create the mg index of words
    print $out "\nCreating word-level search indexes\n";
    $mg_input = &util::filename_cat($phinddir, "pword.txt");
    $mg_stem = &util::filename_cat($phinddir, "pword");

    &execute("mgpp_passes $osextra -f $mg_stem -T1 -I1 $mg_input", $verbosity, $out);
    &execute("mgpp_compression_dict $osextra -f $mg_stem", $verbosity, $out);
    &execute("mgpp_perf_hash_build $osextra -f $mg_stem", $verbosity, $out);
    &execute("mgpp_passes $osextra -f $mg_stem -T2 -I2 $mg_input", $verbosity, $out);
    &execute("mgpp_weights_build $osextra -f $mg_stem", $verbosity, $out);
    &execute("mgpp_invf_dict $osextra -f $mg_stem", $verbosity, $out);

    &execute("mgpp_stem_idx $osextra -f $mg_stem -s 1", $verbosity, $out);
    &execute("mgpp_stem_idx $osextra -f $mg_stem -s 2", $verbosity, $out);
    &execute("mgpp_stem_idx $osextra -f $mg_stem -s 3", $verbosity, $out);

    # create the mg document information database
    print $out "\nCreating document information databases\n";
    $mg_input = &util::filename_cat($phinddir, "docs.txt");
    $mg_stem = &util::filename_cat($phinddir, "docs");

    &execute("mgpp_passes $osextra -f $mg_stem -T1 $mg_input", $verbosity, $out);
    &execute("mgpp_compression_dict $osextra -f $mg_stem", $verbosity, $out);
    &execute("mgpp_passes $osextra -f $mg_stem -T2 $mg_input", $verbosity, $out);

    # Tidy up stray files
    if (!$self->{'untidy'}) {
        print $out "\nCleaning up\n" if ($verbosity > 2);
        opendir (my $dirhandle, $phinddir)
            || die "Cannot read phind directory $phinddir: $!";
        my @files = readdir $dirhandle;
        closedir $dirhandle;

        foreach my $file (@files) {
            next if $file =~ /^\.\.?$/;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (!defined $suffix || !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $out "deleting $file\n"; # if $verbosity > 2;
                &util::rm (&util::filename_cat ($phinddir, $file));
            }
        }
    }

    # Return the information about the classifier that we'll later want to
    # use to create macros when the Phind classifier document is displayed.
    my %classifyinfo = ('thistype'=>'Invisible',
                        'childtype'=>'Phind',
                        'Title'=>$self->{'buttonname'},
                        'parameters'=>"phindnumber=$self->{'phindnumber'}",
                        'contains'=>[]);

    my $collection = $self->{'collection'};
    my $url = "library?a=p&p=phind&c=$collection";
    push (@{$classifyinfo{'contains'}}, {'OID'=>$url});

    return \%classifyinfo;
}
449
450
451
sub convert_gml_to_tokens {
    # Convert GML/HTML text into "clause format": one clause per line,
    # words separated by single spaces.
    #
    # Arguments: the classifier's language expression and the text.
    # If the language expression contains "en" the work is handed to
    # convert_gml_to_tokens_EN; otherwise a language-neutral conversion
    # is done here.
    #
    # The conversion works on a private copy of the text.  The original
    # assigned to the global $_, overwriting whatever the caller had there.
    #
    # Returns the converted text.
    my ($language_exp, $text) = @_;

    if ($language_exp =~ /en/) {
        return &convert_gml_to_tokens_EN($text);
    }

    # 1. remove GML tags

    # Remove everything that is in a tag
    $text =~ s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
    $text =~ s/\s*<br>\s*/ LINEBREAK /isg;
    $text =~ s/<[^>]*>/ /sg;

    # Now we have the text, but it may contain HTML
    # elements coded as &gt; etc.  Remove these tags.
    $text =~ s/&amp;/&/sg;
    $text =~ s/&lt;/</sg;
    $text =~ s/&gt;/>/sg;
    $text =~ s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
    $text =~ s/\s*<br>\s*/ LINEBREAK /isg;
    $text =~ s/<[^>]*>/ /sg;

    # replace <p> and <br> placeholders with clause break symbol (\n)
    $text =~ s/\s+/ /gs;
    $text =~ s/PARAGRAPHBREAK/\n/sg;
    $text =~ s/LINEBREAK/\n/sg;

    # 2. Split the remaining text into space-delimited tokens

    # Convert any HTML special characters (like &quot;) to their UTF8 equivalent
    $text =~ s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;

    # Split text at word boundaries
    $text =~ s/\b/ /g;

    # 3. Convert the remaining text to "clause format"

    # remove unnecessary punctuation and replace with clause break symbol (\n)
    $text =~ s/[^\w ]/\n/g;

    # remove extraneous whitespace
    $text =~ s/ +/ /sg;
    $text =~ s/^\s+//mg;
    $text =~ s/\s*$/\n/mg;

    # remove lines that contain one word or less
    $text =~ s/^\S*$//mg;
    $text =~ s/^\s*$//mg;
    $text =~ tr/\n//s;

    return $text;
}
515
516# A version of convert_gml_to_tokens that is fine-tuned to the English language.
517
sub convert_gml_to_tokens_EN {
    # English-tuned conversion of GML/HTML text into "clause format":
    # one clause per line, words separated by single spaces, and
    # single-word lines dropped.
    #
    # Argument: the text to convert.  Returns the converted text.
    #
    # The conversion works on a private copy of the text.  The original
    # assigned to the global $_, overwriting whatever the caller had there.
    my $text = shift @_;

    # FIRST, remove GML tags

    # Replace all whitespace with a simple space
    $text =~ s/\s+/ /gs;

    # Remove everything that is in a tag
    $text =~ s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
    $text =~ s/\s*<br>\s*/ LINEBREAK /isg;
    $text =~ s/<[^>]*>/ /sg;

    # Now we have the text, but it may contain HTML
    # elements coded as &gt; etc.  Remove these tags.
    $text =~ s/&lt;/</sg;
    $text =~ s/&gt;/>/sg;

    $text =~ s/\s+/ /sg;
    $text =~ s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
    $text =~ s/\s*<br>\s*/ LINEBREAK /isg;
    $text =~ s/<[^>]*>/ /sg;

    # remove &amp; and other miscellaneous markup tags
    # (the order of these four substitutions is significant for
    # doubly-escaped input such as "&amp;lt;")
    $text =~ s/&amp;/&/sg;
    $text =~ s/&lt;/</sg;
    $text =~ s/&gt;/>/sg;
    $text =~ s/&amp;/&/sg;

    # replace <p> and <br> placeholders with newlines
    $text =~ s/PARAGRAPHBREAK/\n/sg;
    $text =~ s/LINEBREAK/\n/sg;

    # Exceptional punctuation
    #
    # We make special cases of some punctuation

    # remove any apostrophe that indicates omitted letters
    $text =~ s/(\w+)\'(\w*\s)/ $1$2 /g;

    # remove period that appears in a person's initials
    $text =~ s/\s([A-Z])\./ $1 /g;

    # replace hyphens in hyphenated words and names with a space
    $text =~ s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;

    # Convert the remaining text to "clause format".
    # This means removing all excess punctuation and garbage text,
    # normalising valid punctuation to fullstops and commas,
    # then putting one clause on each line.

    # Insert newline when the end of a sentence is detected
    # (delimiter is: "[\.\?\!]\s")
    $text =~ s/\s*[\.\?\!]\s+/\n/g;

    # split numbers after four digits
    $text =~ s/(\d\d\d\d)/$1 /g;

    # squash repeated punctuation (runs of the same non-alphanumeric,
    # non-space character collapse to one)
    $text =~ tr/A-Za-z0-9 //cs;

    # normalise clause breaks (mostly punctuation symbols) to commas
    $text =~ s/[^A-Za-z0-9 \n]+/ , /g;

    # Remove repeated commas, and replace with newline
    $text =~ s/\s*,[, ]+/\n/g;

    # remove extra whitespace
    $text =~ s/ +/ /sg;
    $text =~ s/^\s+//mg;
    $text =~ s/\s*$/\n/mg;

    # remove lines that contain one word or less
    $text =~ s/^\w*$//mg;
    $text =~ s/^\s*$//mg;
    $text =~ tr/\n//s;

    return $text;

}
604
605
606
607# Execute a system command
608
sub execute {
    # Run a shell command and abort the build if it fails.
    #
    # Arguments: the command line, the verbosity level, and the handle
    # progress messages are printed to.
    #
    # Returns normally when the command exits with status 0.  Otherwise
    # an error is printed to STDERR and the process exits with the
    # command's own exit code, or 1 if the command could not be started
    # or was killed by a signal.
    #
    # system() returns the raw wait status (exit code shifted left by
    # eight bits).  The original passed that straight to exit(), so a
    # child exit code of 1 (status 256) was truncated to a process exit
    # status of 0, i.e. reported as success.
    my ($command, $verbosity, $outhandle) = @_;
    print $outhandle "Executing: $command\n" if ($verbosity > 2);

    my $status = system($command);
    return if ($status == 0);

    my ($reason, $exitcode);
    if ($status == -1) {
        # $! is only meaningful when the command could not be started
        $reason = "could not run command: $!";
        $exitcode = 1;
    } elsif ($status & 127) {
        $reason = "killed by signal " . ($status & 127);
        $exitcode = 1;
    } else {
        $exitcode = $status >> 8;
        $reason = "exit code $exitcode";
    }

    print STDERR "phind - Error executing '$command': $reason\n";
    exit($exitcode);
}
619
620
621# Generate the vocabulary, symbol statistics, and numbers file from the
622# clauses file. This is legacy code, so it is a bit messy and probably won't
623# run under Windows.
624
sub extract_vocabulary {
    # Build the symbol table for the clauses file and write the files
    # derived from it into the phind directory:
    #
    #   clauses.vocab    the preferred form of each symbol, one per line
    #   clauses.stats    first/last symbol numbers of each symbol class
    #   clauses.numbers  the clauses text, one symbol number per line
    #   <thesaurus>.numbers  (only with -thesaurus) thesaurus entries
    #                    expressed as symbol numbers
    #
    # Symbols are numbered from 1 in this order: delimiters, stopwords
    # that occur in the text, thesaurus words, then all other words.
    #
    # Argument: the classifier object.
    #
    # Changes from the original: essential files are opened with checked
    # three-argument opens; an unreadable stopword directory or file is
    # reported (when verbose) and skipped; stopword lines are cut at the
    # first whitespace including the newline, so trailing text no longer
    # leaves a newline stuck to the word; blank stopword lines are
    # ignored; and @delimiters is no longer lowercased in place.
    my ($self) = @_;

    my $verbosity = $self->{'verbosity'};
    my $out = $self->{'outhandle'};

    my $collectiondir = $self->{'collectiondir'};
    my $phinddir = $self->{'phinddir'};

    my $language_exp = $self->{'language_exp'};
    my $thesaurus = $self->{'thesaurus'};

    my ($first_delimiter, $last_delimiter,
        $first_stopword, $last_stopword,
        $first_extractword, $last_extractword,
        $first_contentword, $last_contentword,
        $phrasedelimiter);

    my ($thesaurus_links, $thesaurus_terms,
        %thesaurus, $first_thesaurusword, $last_thesaurusword);

    my (%symbol, %freq, %stopwords);

    print $out "Calculating vocabulary\n" if ($verbosity > 1);

    # Read and store the stopwords
    my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "stopwords");

    # Examine each directory in the stopword directory
    my @language_entries;
    if (opendir(my $stophandle, $stopdir)) {
        @language_entries = readdir($stophandle);
        closedir($stophandle);
    } elsif ($verbosity) {
        print $out "phind: cannot read stopword directory $stopdir: $!\n";
    }

    foreach my $language (@language_entries) {

        # Ignore "." and "..", and entries that do not match the
        # classifier's language
        next if ($language =~ /^\.\.?$/);
        next unless ($language =~ /$language_exp/);
        my $language_dir = &util::filename_cat($stopdir, $language);
        next unless (-d $language_dir);

        my @file_entries;
        if (opendir(my $langhandle, $language_dir)) {
            @file_entries = readdir($langhandle);
            closedir($langhandle);
        } elsif ($verbosity) {
            print $out "phind: cannot read stopword directory $language_dir: $!\n";
        }

        foreach my $file (@file_entries) {

            # Ignore entries that are not stopword files
            next unless ($file =~ /sw$/);
            my $file_name = &util::filename_cat($language_dir, $file);
            next unless (-f $file_name);

            # Read the stopwords: the first whitespace-delimited word
            # of each line
            my $stophandle;
            if (!open($stophandle, '<', $file_name)) {
                print $out "phind: cannot read stopword file $file_name: $!\n"
                    if ($verbosity);
                next;
            }
            while (my $entry = <$stophandle>) {
                $entry =~ s/^\s+//;
                $entry =~ s/\s.*//s;
                next unless (length($entry));
                $stopwords{lc($entry)} = $entry;
            }
            close($stophandle);
        }
    }

    # Read thesaurus information
    if ($thesaurus) {

        # link file exists
        $thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
        die "Cannot find thesaurus link file" unless (-e $thesaurus_links);

        # ensure term file exists in the correct language
        my $language = 'en';
        if ($language_exp =~ /^([a-z][a-z])/) {
            $language = $1;
        }
        $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
        die "Cannot find thesaurus term file" unless (-e $thesaurus_terms);

        # Read the thesaurus terms
        open(my $termhandle, '<', $thesaurus_terms)
            || die "Cannot open $thesaurus_terms: $!";
        while (my $entry = <$termhandle>) {
            $entry =~ s/^\d+ //;
            $entry =~ s/\(.*\)//;
            foreach my $term (split(/\s+/, $entry)) {
                $thesaurus{lc($term)} = $term;
            }
        }
        close($termhandle);
    }

    # Read words in the text and count occurrences.  Stopwords and
    # thesaurus words are counted under their lowercase form; other
    # words are counted under the form in which they appear.
    my $clausefile = "$phinddir/clauses";
    open(my $clausehandle, '<', $clausefile)
        || die "Cannot open $clausefile: $!";

    while (my $line = <$clausehandle>) {
        next unless ($line =~ /./);

        foreach my $token (split(/\s+/, $line)) {
            my $lower = lc($token);
            my $form = ((defined $stopwords{$lower}) || (defined $thesaurus{$lower}))
                ? $lower : $token;
            $freq{$form}++;
        }
        $freq{$senlimit}++;
    }
    close($clausehandle);

    # Calculate the "best" form of each word
    my (%bestform, %totalfreq, %bestfreq);

    foreach my $form (sort (keys %freq)) {
        my $lower = lc($form);

        # totalfreq is the number of times a term appears in any form
        $totalfreq{$lower} += $freq{$form};

        if (defined $stopwords{$lower}) {
            $bestform{$lower} = $stopwords{$lower};

        } elsif (defined $thesaurus{$lower}) {
            $bestform{$lower} = $thesaurus{$lower};

        } elsif (!$bestform{$lower} || ($freq{$form} > $bestfreq{$lower})) {
            # the most frequent capitalisation wins
            $bestfreq{$lower} = $freq{$form};
            $bestform{$lower} = $form;
        }
    }

    undef %freq;
    undef %bestfreq;

    # Assign symbol numbers to tokens
    my $nextsymbol = 1;
    my (@vocab);

    # Delimiters
    $first_delimiter = 1;

    foreach my $delimiter (@delimiters) {

        # work on a copy so that @delimiters itself is not modified
        my $word = lc($delimiter);
        $bestform{$word} = uc($word);
        $vocab[$nextsymbol] = $word;
        $symbol{$word} = $nextsymbol;
        $nextsymbol++;
    }
    $last_delimiter = $nextsymbol - 1;

    # Stopwords
    $first_stopword = $nextsymbol;

    foreach my $key (sort keys %stopwords) {

        # don't include stopword unless it occurs in the text
        my $word = lc($key);
        next unless ($totalfreq{$word});
        next if ($symbol{$word});

        $vocab[$nextsymbol] = $word;
        $symbol{$word} = $nextsymbol;
        $nextsymbol++;
    }
    $last_stopword = $nextsymbol - 1;
    $first_contentword = $nextsymbol;

    # Thesaurus terms
    if ($thesaurus) {
        $first_thesaurusword = $nextsymbol;

        foreach my $key (sort keys %thesaurus) {

            my $word = lc($key);
            next if ($symbol{$word});
            $bestform{$word} = $thesaurus{$word};

            $vocab[$nextsymbol] = $word;
            $symbol{$word} = $nextsymbol;
            $nextsymbol++;

        }
        $last_thesaurusword = $nextsymbol - 1;
    }

    # Other content words
    $first_extractword = $nextsymbol;

    foreach my $word (sort (keys %bestform)) {

        next if ($symbol{$word});

        $vocab[$nextsymbol] = $word;
        $symbol{$word} = $nextsymbol;
        $nextsymbol++;
    }
    $last_extractword = $nextsymbol - 1;
    $last_contentword = $nextsymbol - 1;

    # Output the words
    my $vocabfile = "$phinddir/clauses.vocab";
    print $out "Saving vocabulary in $vocabfile\n" if ($verbosity > 1);
    open(my $vocabhandle, '>', $vocabfile)
        || die "Cannot open $vocabfile: $!";

    for (my $i = 1; $i < $nextsymbol; $i++) {
        print $vocabhandle "$bestform{$vocab[$i]}\n";
    }
    close($vocabhandle);

    # Create statistics file
    # Output statistics about the vocabulary
    my $statsfile = "$phinddir/clauses.stats";
    print $out "Saving statistics in $statsfile\n" if ($verbosity > 1);
    &util::rm($statsfile) if (-e $statsfile);

    open(my $stathandle, '>', $statsfile)
        || die "Cannot open $statsfile: $!";

    print $stathandle "first_delimiter $first_delimiter\n";
    print $stathandle "last_delimiter $last_delimiter\n";
    print $stathandle "first_stopword $first_stopword\n";
    print $stathandle "last_stopword $last_stopword\n";
    if ($thesaurus) {
        print $stathandle "first_thesaurusword $first_thesaurusword\n";
        print $stathandle "last_thesaurusword $last_thesaurusword\n";
    }
    print $stathandle "first_extractword $first_extractword\n";
    print $stathandle "last_extractword $last_extractword\n";
    print $stathandle "first_contentword $first_contentword\n";
    print $stathandle "last_contentword $last_contentword\n";
    print $stathandle "first_symbol $first_delimiter\n";
    print $stathandle "last_symbol $last_contentword\n";
    print $stathandle "first_word $first_stopword\n";
    print $stathandle "last_word $last_contentword\n";
    close($stathandle);

    undef @vocab;

    # Create numbers file
    # Save text as symbol numbers
    my $numbersfile = "$phinddir/clauses.numbers";
    print $out "Saving text as numbers in $numbersfile\n" if ($verbosity > 1);

    open($clausehandle, '<', $clausefile)
        || die "Cannot open $clausefile: $!";
    open(my $numhandle, '>', $numbersfile)
        || die "Cannot open $numbersfile: $!";

    $phrasedelimiter = $symbol{lc($senlimit)};
    print $numhandle "$symbol{lc($colstart)}\n";

    while (my $line = <$clausehandle>) {

        # split sentence into a list of tokens
        next unless ($line =~ /./);

        # output one token at a time
        foreach my $token (split(/\s+/, $line)) {
            my $word = lc($token);
            print $numhandle "$symbol{$word}\n";
        }

        # output phrase delimiter
        print $numhandle "$phrasedelimiter\n";
    }

    close($clausehandle);
    print $numhandle "$symbol{lc($colend)}\n";
    close($numhandle);

    # Save thesaurus data in one convenient file
    if ($thesaurus) {

        my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");

        print $out "Saving thesaurus as numbers in $thesaurusfile\n"
            if ($verbosity > 1);

        # Read the thesaurus terms
        my %thes_symbols;

        open(my $termhandle, '<', $thesaurus_terms)
            || die "Cannot open $thesaurus_terms: $!";
        while (my $entry = <$termhandle>) {
            chomp($entry);
            my @terms = split(/\s+/, $entry);
            my $num = shift @terms;
            my $text = "";

            # translate words into symbol numbers
            foreach my $term (@terms) {
                my $word = lc($term);
                if ($symbol{$word}) {
                    $text .= "s$symbol{$word} ";
                } elsif ($verbosity) {
                    print $out "phind: No thesaurus symbol, ignoring \"$word\"\n";
                }
            }
            $text =~ s/ $//;
            $thes_symbols{$num} = $text;
        }
        close($termhandle);

        # Read the thesaurus links and write the corresponding data
        open(my $linkhandle, '<', $thesaurus_links)
            || die "Cannot open $thesaurus_links: $!";
        open(my $thesouthandle, '>', $thesaurusfile)
            || die "Cannot open $thesaurusfile: $!";

        while (my $entry = <$linkhandle>) {
            chomp($entry);
            my ($num, $text) = split(/:/, $entry);

            if (defined($thes_symbols{$num})) {
                print $thesouthandle "$num:$thes_symbols{$num}:$text\n";
            } else {
                print $thesouthandle "$num:untranslated:$text\n";
            }
        }
        close($linkhandle);
        close($thesouthandle);
    }

}
961
962
963# renumber_phrases
964#
965# Prepare the phrases file to be input to mgpp. The biggest problem is
966# reconciling the phrase identifiers used by the suffix program (which
967# we'll call suffix-id numbers) with the numbers used in the thesaurus
968# (thesaurus-id) to create a common set of phind id numbers (phind-id).
969# Phind-id numbers must be sorted by frequency of occurrence.
970#
971# Start creating a set of phind-id numbers from the sorted suffix-id
972# numbers and (if required) the thesaurus-id numbers. Then add any other
973# phrases occurring in the thesaurus.
974#
975# The last thing we have to do is restore the vocabulary information to the
976# phrase file so that the phrases are stored as words, not as symbol
977# numbers.
978
979# The original phrases file looks something like this:
980# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
981# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
982# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
983# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
984
985
sub renumber_phrases {
    # Run the three phrase-file translation stages in order: suffix
    # data, then thesaurus data, then vocabulary restoration.
    # Takes the classifier object; the result of the last stage is
    # passed back to the caller.
    my $self = shift (@_);

    &renumber_suffix_data($self);
    &renumber_thesaurus_data($self);
    return &restore_vocabulary_data($self);
}
994
995
996
997# renumber_suffix_data
998#
999# Translate phrases file to phrases.2 using phind keys instead
1000# of suffix keys and sorting the expansion data.
1001
sub renumber_suffix_data {
    # Translate the "phrases" file written by the suffix program into
    # "phrases.2".
    #
    # Argument: the classifier object.
    #
    # Pass 1 reads the phrases file, gives each line the next phind-id
    # (starting at 1) and records each phrase's total frequency.
    # Pass 2 rewrites every line using phind-ids: expansions are sorted
    # by descending total frequency, and documents are sorted with
    # by_doc_frequency.
    #
    # Input line:  suffixid:symbols:tf:countexp:expansions:countdocs:documents
    # Output line: phindid:symbols:tf:countexp:countdocs:exp,exp,...,:doc;doc;...;
    #
    # Dies if a file cannot be opened or a suffix-id has no phind-id.
    my ($self) = @_;

    my $verbosity = $self->{'verbosity'};
    my $out = $self->{'outhandle'};
    print $out "Translate phrases: suffix-ids become phind-id's\n"
        if ($verbosity);

    my $phinddir = $self->{'phinddir'};
    my $infile = &util::filename_cat($phinddir, 'phrases');
    my $outfile = &util::filename_cat($phinddir, 'phrases.2');

    # Read the phrase file.  Calculate initial set of phind-id
    # numbers and store (suffixid -> frequency) relation.
    #
    # NOTE(review): @totalfrequency is deliberately left as the package
    # variable the original used; code outside this function may read
    # it -- confirm before making it lexical.
    my %suffixtophind;
    my (@fields, $suffixid);
    my $nextphind = 1;

    open(my $inhandle, '<', $infile) || die "Cannot open $infile: $!";
    while (my $line = <$inhandle>) {

        chomp($line);
        @fields = split(/:/, $line);

        # get next suffixid and phindid
        $suffixid = shift @fields;
        $suffixtophind{$suffixid} = $nextphind;

        # store total frequency (skipping the symbols field)
        shift @fields;
        $totalfrequency[$nextphind] = shift @fields;

        $nextphind++;
    }
    close($inhandle);

    # Translate phrases file to phrases.2.  Use phind keys (not suffix
    # keys), sort expansion and document occurrence data in order of
    # descending frequency.
    open($inhandle, '<', $infile) || die "Cannot open $infile: $!";
    open(my $outhandle, '>', $outfile) || die "Cannot open $outfile: $!";

    my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
    my (@documents, @newexp);
    my $linenumber = 0;

    while (my $line = <$inhandle>) {

        # read the line
        chomp($line);
        @fields = split(/:/, $line);

        # get a phrase number for this line
        $suffixid = shift @fields;
        die "ERROR - no phindid for: $suffixid"
            unless (defined($suffixtophind{$suffixid}));
        $phindid = $suffixtophind{$suffixid};

        # get the symbols in the phrase
        $text = shift @fields;

        # output status information (the most verbose line used two
        # undefined variables; it now reports the two ids)
        $linenumber++;
        if ($verbosity > 2) {
            if ($linenumber % 1000 == 0) {
                print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
            }
            print $out "$phindid: $suffixid\t($text)\n" if ($verbosity > 3);
        }

        # get the phrase frequency
        $tf = shift @fields;

        # get the number of expansions
        $countexp = shift @fields;

        # get the expansions, convert them into phind-id numbers, and sort them
        $expansions = shift @fields;
        @newexp = ();
        foreach my $expansion (split(/,/, $expansions)) {
            die "ERROR - no phindid for: $expansion"
                unless (defined($suffixtophind{$expansion}));
            push @newexp, $suffixtophind{$expansion};
        }
        @newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;

        # get the number of documents
        $countdocs = shift @fields;

        # get the documents (dropping the "d" prefixes) and sort them
        $documents = shift @fields;
        $documents =~ s/d//g;
        @documents = split(/;/, $documents);
        @documents = sort by_doc_frequency @documents;

        # output the phrase data
        print $outhandle "$phindid:$text:$tf:$countexp:$countdocs:";
        print $outhandle join(",", @newexp), ",:", join(";", @documents), ";\n";

    }

    close($inhandle);
    close($outhandle);
}
1108
1109
# renumber_thesaurus_data
#
# Translate phrases.2 to phrases.3, adding thesaurus data if available.
#
# Reads 'phinddir', 'thesaurus', 'verbosity' and 'outhandle' from $self.
#
# If no thesaurus is configured, phrases.2 is simply moved to phrases.3.
# Otherwise "<thesaurus>.numbers" (lines of thesid:symbols:relations, where
# relations looks like "BT,2,5;NT,7") is read from the phind directory, and:
#
#   - every thesaurus entry is given a phind-id: the id of the phrase in
#     phrases.2 with the same symbols, or a fresh id above the largest one
#     used in phrases.2;
#   - "<thesaurus>.phindid" is written with all ids translated to phind-ids
#     and each entry's links sorted by descending frequency, then link type;
#   - phrases.3 is written: each phrases.2 line with ":linkcount:relations"
#     appended when it is a thesaurus entry, followed by one line for every
#     thesaurus entry that has no phrase of its own.
#
# Dies if any file cannot be opened or written, or if the thesaurus file
# is malformed.

sub renumber_thesaurus_data {
    my ($self) = @_;

    my $out = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};
    my $thesaurus = $self->{'thesaurus'};

    my $phinddir = $self->{'phinddir'};
    my $infile = &util::filename_cat($phinddir, "phrases.2");
    my $outfile = &util::filename_cat($phinddir, "phrases.3");

    # If no thesaurus is defined, simply move the phrases file.
    if (!$thesaurus) {
        print $out "Translate phrases.2: no thesaurus data\n"
            if ($verbosity);
        &util::mv($infile, $outfile);
        return;
    }

    print $out "Translate phrases.2: add thesaurus data\n"
        if ($verbosity);

    # 1.
    # Read thesaurus file and store (symbols->thesaurusid) mapping
    my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
    my %symbolstothesid;

    open(my $th_symbols, "<", $thesaurusfile)
        or die "Cannot open $thesaurusfile: $!";
    while (my $line = <$th_symbols>) {
        chomp $line;
        my ($thesid, $symbols) = split(/:/, $line);
        $symbolstothesid{$symbols} = $thesid;
    }
    close $th_symbols;

    # 2.
    # Read phrases file to find thesaurus entries that already
    # have a phindid.  Store their phind-ids for later translation,
    # and store their frequency for later sorting.
    my %thesaurustophindid;
    my %phindidtofrequency;
    my $maxphindid = 0;

    open(my $phrases, "<", $infile)
        or die "Cannot open $infile: $!";
    while (my $line = <$phrases>) {
        chomp $line;
        my ($phindid, $symbols, $freq) = split(/:/, $line);
        next unless (defined($phindid) && defined($symbols));

        # Track the largest id explicitly rather than trusting the last
        # line of the file to carry it.
        $maxphindid = $phindid if ($phindid > $maxphindid);

        # do we have a thesaurus id corresponding to this phrase?
        if (defined($symbolstothesid{$symbols})) {
            my $thesid = $symbolstothesid{$symbols};
            $thesaurustophindid{$thesid} = $phindid;
            $phindidtofrequency{$phindid} = $freq;
        }
    }
    close $phrases;

    undef %symbolstothesid;

    # 3.
    # Create phind-id numbers for remaining thesaurus entries,
    # and note that their frequency is 0 for later sorting.
    my $nextphindid = $maxphindid + 1;

    open(my $th_ids, "<", $thesaurusfile)
        or die "Cannot open $thesaurusfile: $!";
    while (my $line = <$th_ids>) {
        chomp $line;

        # read thesaurus-id and ensure it has a corresponding phind-id
        my ($thesid) = split(/:/, $line);
        if (!defined($thesaurustophindid{$thesid})) {
            $thesaurustophindid{$thesid} = $nextphindid;
            $phindidtofrequency{$nextphindid} = 0;
            $nextphindid++;
        }
    }
    close $th_ids;

    # 4.
    # Translate thesaurus file, replacing thesaurus-id numbers with
    # phind-id numbers.
    my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");

    open(my $th_in, "<", $thesaurusfile)
        or die "Cannot open $thesaurusfile: $!";
    open(my $th_out, ">", $newthesaurusfile)
        or die "Cannot create $newthesaurusfile: $!";
    while (my $line = <$th_in>) {
        chomp $line;

        # thesaurus-id, symbols and relations for this line
        my ($thesid, $symbols, $relations) = split(/:/, $line);

        die "Malformed entry in $thesaurusfile"
            unless ($thesid && $symbols);
        die "No phind-id for thesaurus-id $thesid"
            unless $thesaurustophindid{$thesid};
        my $phindid = $thesaurustophindid{$thesid};

        # An entry without relations has no third field at all.
        $relations = "" unless defined($relations);

        # convert each part of the relation string to use phind-id numbers
        # at the same time, we want to sort the list by frequency.
        my %linkidtotype;
        foreach my $linktext (split(/;/, $relations)) {

            # the first item is the linktype (e.g. BT, NT), the rest
            # are the thesaurus-ids linked with that type
            my ($linktype, @linkedthesids) = split(/,/, $linktext);

            foreach my $linkedthesid (@linkedthesids) {
                die "No phind-id for linked thesaurus-id $linkedthesid"
                    unless (defined($thesaurustophindid{$linkedthesid}));
                $linkidtotype{$thesaurustophindid{$linkedthesid}} = $linktype;
            }
        }

        # sort the list of links, first by frequency, then by type.
        # The final comparison on the id itself only breaks ties, so
        # that the output does not depend on hash ordering.
        my @links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
                               or ($linkidtotype{$a} cmp $linkidtotype{$b})
                               or ($a <=> $b) } (keys %linkidtotype);
        my $linkcounter = (scalar @links);

        # create a string describing the link information: a new
        # ";TYPE" group is started whenever the link type changes
        my $newrelation = "";
        if (@links) {
            my $linktype = $linkidtotype{$links[0]};
            $newrelation = $linktype;
            foreach my $linkid (@links) {
                if ($linkidtotype{$linkid} ne $linktype) {
                    $linktype = $linkidtotype{$linkid};
                    $newrelation .= ";" . $linktype;
                }
                $newrelation .= "," . $linkid;
            }
        }
        $newrelation .= ";";

        # output the new line
        print $th_out "$phindid:$symbols:$linkcounter:$newrelation:\n";
    }
    close $th_in;
    close($th_out)
        or die "Cannot write $newthesaurusfile: $!";

    undef %thesaurustophindid;
    undef %phindidtofrequency;

    # 5.
    # Read thesaurus data (in phind-id format) into memory
    my %thesaurusdata;

    open(my $th_new, "<", $newthesaurusfile)
        or die "Cannot open $newthesaurusfile: $!";
    while (my $line = <$th_new>) {
        chomp $line;
        my ($phindid, $symbols, $linkcounter, $relations) = split(/:/, $line);
        die "Malformed entry in $newthesaurusfile"
            unless ($phindid && $symbols);
        $thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
    }
    close $th_new;

    # 6.
    # Add thesaurus data to phrases file
    open(my $in, "<", $infile)
        or die "Cannot open $infile: $!";
    open(my $outfh, ">", $outfile)
        or die "Cannot create $outfile: $!";

    # Update existing phrases
    while (my $line = <$in>) {
        chomp $line;

        # get data for this line
        my ($phindid, @fields) = split(/:/, $line);

        # output the phrase data, with thesaurus information
        print $outfh "$phindid:", join(":", @fields);

        # add thesaurus data
        if (defined($thesaurusdata{$phindid})) {
            my ($linkcounter, $relations) =
                (split(/:/, $thesaurusdata{$phindid}))[1, 2];

            print $outfh ":$linkcounter:$relations";

            # blank the entry so it is not written a second time below
            $thesaurusdata{$phindid} = "";
        }
        print $outfh "\n";
    }
    close $in;

    # Add phrases that aren't already in the file
    foreach my $phindid (sort { $a <=> $b } keys %thesaurusdata) {
        next unless ($thesaurusdata{$phindid});

        my ($symbols, $linkcounter, $relations) =
            split(/:/, $thesaurusdata{$phindid});

        print $outfh "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
    }
    close($outfh)
        or die "Cannot write $outfile: $!";

}
1337
# restore_vocabulary_data
#
# Read phrases.3 and restore vocabulary information.  Then write
# this data to the MGPP input files (pword.txt and pdata.txt) and
# (if requested) to the saved phrases file.
#
# Reads 'phinddir', 'savephrases', 'verbosity' and 'outhandle' from $self.
#
# clauses.vocab holds one symbol per line; line N defines symbol number N
# (numbering starts at 1).  Every "sN" token in the text field of a
# phrases.3 line is replaced by symbol N.
#
# Dies if any of the files cannot be opened or written.

sub restore_vocabulary_data {
    my ($self) = @_;

    my $out = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};
    print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);

    my $phinddir = $self->{'phinddir'};
    my $infile = &util::filename_cat($phinddir, 'phrases.3');
    my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
    my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
    my $wordfile = &util::filename_cat($phinddir, 'pword.txt');

    my $savephrases = $self->{'savephrases'};

    # 1.
    # Read the vocabulary file
    open(my $vocab, "<", $vocabfile)
        or die "Cannot open $vocabfile: $!";
    my @symbol;
    my $i = 1;
    while (my $entry = <$vocab>) {
        chomp $entry;
        $symbol[$i++] = $entry;
    }
    close $vocab;

    # 2.
    # Translate phrases.3 to MGPP input files
    open(my $in, "<", $infile)
        or die "Cannot open $infile: $!";
    open(my $data, ">", $datafile)
        or die "Cannot create $datafile: $!";
    open(my $wordfh, ">", $wordfile)
        or die "Cannot create $wordfile: $!";

    # Save the phrases in a separate text file
    my $save;
    if ($savephrases) {
        print $out "Saving phrases in $savephrases\n" if ($verbosity);
        open($save, ">", $savephrases)
            or die "Cannot create $savephrases: $!";
    }

    # NOTE(review): $word is deliberately declared outside the loop.  When
    # the text is exactly 'untranslated' neither branch below assigns it,
    # so the previous line's value is written again, as in the original
    # code -- confirm whether that is intended.
    my $word = "";

    while (my $line = <$in>) {

        # read the line
        chomp $line;

        # the first field is the phrase number, the second its symbols
        my ($key, $text, @fields) = split(/:/, $line);

        # restore the text of the phrase
        $text =~ s/s(\d+)/$symbol[$1]/g;
        if ($text =~ / /) {
            # multi-word phrases get an empty word index entry
            $word = "";
        } elsif ($text ne 'untranslated') {
            $word = $text;
        }

        # output the phrase data
        print $data "<Document>";
        print $data "$key:$text:", join(":", @fields), ":\n";

        # output the word index search data
        print $wordfh "<Document>$word\n";

        # output the phrases to a text file
        if ($savephrases) {
            # seven remaining fields means thesaurus data was appended
            my $isThesaurus = ((scalar @fields) == 7) ? 1 : 0;
            print $save $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
        }
    }
    close $in;
    close($wordfh)
        or die "Cannot write $wordfile: $!";
    close($data)
        or die "Cannot write $datafile: $!";
    if ($savephrases) {
        close($save)
            or die "Cannot write $savephrases: $!";
    }

}
1429
1430
1431
1432# sort routines used to renumber phrases
1433
# Sort comparator: ascending numeric order.
# Use as "sort numerically LIST"; compares the sort variables $a and $b.
sub numerically {
    return $a <=> $b;
}
1435
# Sort comparator: descending order of document frequency.
# Use as "sort by_doc_frequency LIST"; compares the sort variables $a and $b.
#
# An entry containing a comma has its first "<digits>," run removed and
# whatever remains is compared as the frequency.  An entry without a
# comma counts as frequency 1.
sub by_doc_frequency {
    my ($freq_a, $freq_b) = map {
        my $freq = 1;
        if (/,/) {
            # work on a copy so the element being sorted is not modified
            ($freq = $_) =~ s/\d+,//;
        }
        $freq;
    } ($a, $b);

    return ($freq_b <=> $freq_a);
}
1450
14511;
Note: See TracBrowser for help on using the repository browser.