source: trunk/gsdl/perllib/classify/phind.pm@ 3400

Last change on this file since 3400 was 3244, checked in by jrm21, 22 years ago

we no longer exit with an error if the suffix program failed to create
necessary files. If suffix returns non-zero we will still exit, but exiting
means the build failed.

  • Property svn:keywords set to Author Date Id Revision
File size: 38.8 KB
1###########################################################################
2#
3# phind.pm -- the Phind classifier
4#
5# Copyright (C) 2000 Gordon W. Paynter
6# Copyright (C) 2000 New Zealand Digital Library Project
7#
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# The phind classifier plugin.
30# Options are described in the print_usage function.
31# Type "classinfo.pl phind" at the command line for a summary.
32
33package phind;
34
35use BasClas;
36use util;
37use ghtml;
38use unicode;
39
40my @removedirs = ();
41
42my %wanted_index_files = ('td'=>1,
43 't'=>1,
44 'ti'=>1,
45 'tl'=>1,
46 'tsd'=>1,
47 'idb'=>1,
48 'ib1'=>1,
49 'ib2'=>1,
50 'ib3'=>1,
51 'i'=>1,
52 'il'=>1,
53 'w'=>1,
54 'wa'=>1);
55
56sub BEGIN {
57 @ISA = ('BasClas');
58}
59
60sub END {
61
62 # Tidy up stray files - we do this here as there's some weird problem
63 # preventing us from doing it in the get_classify_info() function (on
64 # windows at least) where the close() appears to fail on txthandle and
65 # dochandle, thus preventing us from deleting those files
66
67 foreach my $dir (@removedirs) {
68 if (-d $dir && opendir (DIR, $dir)) {
69 my @files = readdir DIR;
70 closedir DIR;
71
72 foreach $file (@files) {
73 next if $file =~ /^\.\.?$/;
74 my ($suffix) = $file =~ /\.([^\.]+)$/;
75 if (!defined $suffix || !defined $wanted_index_files{$suffix}) {
76 # delete it!
77 &util::rm (&util::filename_cat ($dir, $file));
78 }
79 }
80 }
81 }
82}
83
84sub print_usage {
85 print STDERR "
86 usage: classify phind [options]
87
88 options:
89 -text Fields The text used to build the phrase hierarchy.
90 (default: 'section:Title,section:text')
91
92 -title Title The metadata field used to describe each document.
93 (default: 'Title')
94
95 -button Name The label for the classifier screen and button in
96 the navigation bar.
97 (default: 'Phrase')
98
99 -language Regex Language or languages to use when building the hierarchy.
100 Languages are identified by two-letter language codes
101 like en (English), es (Spanish), and fr (French).
102 Language is a regular expression, so 'en|fr' (English or
103 French) and '..' (match any language) are valid.
104 (default: 'en')
105
106 -savephrases File If set, the phrase information will be stored in
107 the given file as text. It is probably a good idea
108 to use an absolute path.
109 (default: not set)
110
111 -suffixmode N The smode parameter to the phrase extraction program. A
112 value of 0 means that stopwords are ignored, and a value of 1
113 means that stopwords are used.
114 (default: 1)
115
116 -thesaurus Name Name of a thesaurus stored in phind format in the
117 collection's etc directory.
118 (default: not set)
119
120 -untidy Don't remove working files.
121
122"; }
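# As a rough illustration only (not taken from any real collection), a
# collect.cfg entry using the options above might look like:
#
#   classify phind -text section:Title,section:text -title Title -button Phrase -language en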
123
124# Phrase delimiter symbols - these should be abstracted out someplace
125
126my $colstart = "COLLECTIONSTART";
127my $colend = "COLLECTIONEND";
128my $doclimit = "DOCUMENTLIMIT";
129my $senlimit = "SENTENCELIMIT";
130my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
131
132
133# Create a new phind browser based on collect.cfg
134
135sub new {
136 my $class = shift (@_);
137 my $self = new BasClas($class, @_);
138
139 my $out = $self->{'outhandle'};
140
141 # Ensure the Phind generate scripts are in place
142 my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
143 $file1 .= ".exe" if $ENV{'GSDLOS'} =~ /^windows$/;
144 my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
145
146 if (!(-e $file1)) {
147 print STDERR "phind.pm: ERROR: The phind \"suffix\" program is not installed.\n\n";
148 exit(1);
149 }
150
151 # Parse classifier arguments
152 my $builddir = "";
153 if (!parsargv::parse(\@_,
154 q^text/.*/section:Title,section:text^, \$self->{'indexes'},
155 q^title/.*/Title^, \$self->{'titlefield'},
156 q^button/.*/Phrase^, \$self->{'buttonname'},
157 q^language/.*/en^, \$language,
158 q^builddir/.*/^, \$builddir,
159 q^savephrases/.*/^, \$self->{'savephrases'},
160 q^suffixmode/\d/1^, \$self->{'suffixmode'},
161 q^thesaurus/.*/^, \$self->{'thesaurus'},
162 q^untidy^, \$self->{'untidy'},
163 "allow_extra_options")) {
164
165 print STDERR "\nIncorrect options passed to $class, check your collect.cfg file\n";
166 &print_usage();
167 die "\n";
168 }
169
170 # classifier information
171 $self->{'collection'} = $ENV{'GSDLCOLLECTION'};
172
173 # limit languages
174 $self->{'language_exp'} = $language;
175
176 # collection directories
177 $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'};
178 if (!$builddir) {
179 $builddir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
180 }
181 $self->{'builddir'} = $builddir;
182
183 $self->{'total'} = 0;
184
185 return bless $self, $class;
186}
187
188
189# Initialise the phind classifier
190
191sub init {
192 my $self = shift (@_);
193
194 # ensure we have a build directory
195 my $builddir = $self->{'builddir'};
196 die unless (-e "$builddir");
197
198 # create phind directory
199 my $phnumber = 1;
200 my $phinddir = &util::filename_cat($builddir, "phind1");
201 while (-e "$phinddir") {
202 $phnumber++;
203 $phinddir = &util::filename_cat($builddir, "phind$phnumber");
204 }
205 &util::mk_dir("$phinddir");
206 $self->{'phinddir'} = $phinddir;
207 $self->{'phindnumber'} = $phnumber;
208
209 push(@removedirs, $phinddir) unless $self->{'untidy'};
210
211 # open filehandles for documents and text
212 my $clausefile = &util::filename_cat("$phinddir", "clauses");
213 &util::rm($clausefile) if (-e $clausefile);
214
215 my $txthandle = 'TEXT' . $phnumber;
216 open($txthandle, ">$clausefile") || die "Cannot open $clausefile: $!";
217 $self->{'txthandle'} = $txthandle;
218
219 my $docfile = &util::filename_cat("$phinddir", "docs.txt");
220 &util::rm($docfile) if (-e $docfile);
221
222 my $dochandle = 'DOC' . $phnumber;
223 open($dochandle, ">$docfile") || die "Cannot open $docfile: $!";
224 $self->{'dochandle'} = $dochandle;
225
226}
227
228
229# Classify each document.
230#
231# Each document is passed here in turn. The classifier extracts the
232# text of each and stores it in the clauses file. Document details are
233# stored in the docs.txt file.
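#
# As a rough sketch (the OID and title are invented), each docs.txt line has
# the tab-separated form
#   <Document>\tHASHa1b2c3d4\tSome Document Title
# while the clauses file holds the extracted text one clause per line, with
# each document preceded by the DOCUMENTLIMIT delimiter.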
234
235sub classify {
236 my $self = shift (@_);
237 my $doc_obj = shift @_;
238
239 my $verbosity = $self->{'verbosity'};
240 my $top_section = $doc_obj->get_top_section();
241
242 my $titlefield = $self->{'titlefield'};
243
244 my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
245 print "process: $title\n" if ($verbosity > 2);
246
247 # Only consider the file if it is in the correct language
248 my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
249 my $phrlanguage = $self->{'language_exp'};
250 return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
251
252 # record this file
253 $self->{'total'} ++;
254 print "file $self->{'total'}: $title\n" if ($verbosity);
255
256
257 # Store document details
258 my $OID = $doc_obj->get_OID();
259 $OID = "NULL" unless defined $OID;
260 my $dochandle = $self->{'dochandle'};
261 print $dochandle "<Document>\t$OID\t$title\n";
262
263 # Store the text occurring in this object
264
265 # output the document delimiter
266 my $txthandle = $self->{'txthandle'};
267 print $txthandle "$doclimit\n";
268
269 # iterate over the required indexes and store their text
270 my $indexes = $self->{'indexes'};
271 my $text = "";
272 my ($part, $level, $field, $section, $data, $dataref);
273
274 foreach $part (split(/,/, $indexes)) {
275
276 # Each field has a level and a data element (e.g. document:Title)
277 ($level, $field) = split(/:/, $part);
278 die unless ($level && $field);
279
280 # Extract the text from every section
281 # (In phind, document:text and section:text are equivalent)
282 if ($field eq "text") {
283 $data = "";
284 $section = $doc_obj->get_top_section();
285 while (defined($section)) {
286 $data .= $doc_obj->get_text($section) . "\n";
287 $section = $doc_obj->get_next_section($section);
288 }
289 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
290 }
291
292 # Extract a metadata field from a document
293 # (If there is more than one element of the given type, get them all.)
294 elsif ($level eq "document") {
295 $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
296 foreach $data (@$dataref) {
297 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
298 }
299 }
300
301 # Extract metadata from every section in a document
302 elsif ($level eq "section") {
303 $data = "";
304 $section = $doc_obj->get_top_section();
305 while (defined($section)) {
306 $dataref = $doc_obj->get_metadata($section, $field);
307 $data .= join("\n", @$dataref) . "\n";
308 $section = $doc_obj->get_next_section($section);
309 }
310 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
311 }
312
313 # Some sort of specification which I don't understand
314 else {
315 die "Unknown level ($level) in phind index ($part)\n";
316 }
317
318 }
319
320 # output the text
321 $text =~ tr/\n//s;
322 print $txthandle "$text";
323}
324
325
326# Construct the classifier from the information already gathered
327#
328# When get_classify_info is called, the clauses and docs.txt files have
329# already been constructed in the phind directory. This function will
330# translate them into compressed, indexed MGPP files that can be read by
331# the phindcgi script. It will also register our classifier so that it
332# shows up in the navigation bar.
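#
# Concretely, the mgpp_* commands below build three MGPP databases in the
# phind directory: "pdata" (the phrase data), "pword" (a word-level index
# into the phrases) and "docs" (document OIDs and titles).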
333
334sub get_classify_info {
335 my $self = shift (@_);
336
337 close $self->{'dochandle'};
338 close $self->{'txthandle'};
339 my $verbosity = $self->{'verbosity'};
340 my $out = $self->{'outhandle'};
341 my $phinddir = $self->{'phinddir'};
342
343 my $osextra = "";
344 if ($ENV{'GSDLOS'} !~ /^windows$/i) {
345 $osextra = " -d /";
346 }
347
348 if ($verbosity) {
349 print $out "\n*** phind.pm generating indexes for ", $self->{'indexes'}, "\n";
350 print $out "*** in ", $self->{'phinddir'}, "\n";
351 }
352
353 # Construct phind indexes
354 my $suffixmode = $self->{'suffixmode'};
355 my ($command, $status);
356
357 # Generate the vocabulary, symbol statistics, and numbers file
358 # from the clauses file
359 print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
360 &extract_vocabulary($self);
361
362 # Use the suffix program to generate the phind/phrases file
363 print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
364 &execute("suffix \"$phinddir\" $suffixmode $verbosity", $verbosity, $out);
365
366
367 # check that we generated some files. It's not necessarily an error if
368 # we didn't (execute() would have quit on error), but we can't go on.
369 my $phrasesfile=&util::filename_cat($self->{'phinddir'}, 'phrases');
370 if (! -r $phrasesfile) {
371 print $out "\nNo phrases found for phind classifier!\n";
372 return;
373 }
374
375 # Create the phrase file and put phrase numbers in phind/phrases
376 print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
377 &renumber_phrases($self);
378
379 print $out "\nCreating phrase databases\n";
380 my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
381 my $mg_stem = &util::filename_cat($phinddir, "pdata");
382
383 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
384 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
385 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
386
387 # create the mg index of words
388 print $out "\nCreating word-level search indexes\n";
389 $mg_input = &util::filename_cat($phinddir, "pword.txt");
390 $mg_stem = &util::filename_cat($phinddir, "pword");
391
392 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 -I1 \"$mg_input\"", $verbosity, $out);
393 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
394 &execute("mgpp_perf_hash_build $osextra -f \"$mg_stem\"", $verbosity, $out);
395 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 -I2 \"$mg_input\"", $verbosity, $out);
396 &execute("mgpp_weights_build $osextra -f \"$mg_stem\"", $verbosity, $out);
397 &execute("mgpp_invf_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
398
399 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 1", $verbosity, $out);
400 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 2", $verbosity, $out);
401 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 3", $verbosity, $out);
402
403 # create the mg document information database
404 print $out "\nCreating document information databases\n";
405 $mg_input = &util::filename_cat($phinddir, "docs.txt");
406 $mg_stem = &util::filename_cat($phinddir, "docs");
407
408 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
409 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
410 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
411
412 # Return the information about the classifier that we'll later want to
413 # use to create macros when the Phind classifier document is displayed.
414 my %classifyinfo = ('thistype'=>'Invisible',
415 'childtype'=>'Phind',
416 'Title'=>$self->{'buttonname'},
417 'parameters'=>"phindnumber=$self->{'phindnumber'}",
418 'contains'=>[]);
419
420 my $collection = $self->{'collection'};
421 my $url = "library?a=p&p=phind&c=$collection";
422 push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
423
424 return \%classifyinfo;
425}
426
427
428
429sub convert_gml_to_tokens {
430
431 my ($language_exp, $text) = @_;
432
433 # escape any magic words... - jrm21
434 foreach my $delim (@delimiters) {
435 my $replacement=lc($delim);
436 my $num= $text=~ s/$delim/$replacement/g;
437 if (!$num) {$num=0;}
438 }
439
440 if ($language_exp =~ /en/) {
441 return &convert_gml_to_tokens_EN($text);
442 }
443
444 $_ = $text;
445
446 # 1. remove GML tags
447
448 # Remove everything that is in a tag
449 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
450 s/\s*<br>\s*/ LINEBREAK /isgo;
451 s/<[^>]*>/ /sgo;
452
453 # Now we have the text, but it may contain HTML
454 # elements coded as &gt; etc. Decode these entities.
455 s/&amp;/&/sgo;
456 s/&lt;/</sgo;
457 s/&gt;/>/sgo;
458 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
459 s/\s*<br>\s*/ LINEBREAK /isgo;
460 s/<[^>]*>/ /sgo;
461
462 # replace <p> and <br> placeholders with clause break symbol (\n)
463 s/\s+/ /gso;
464 s/PARAGRAPHBREAK/\n/sgo;
465 s/LINEBREAK/\n/sgo;
466
467
468
469
470 # 2. Split the remaining text into space-delimited tokens
471
472 # Convert entities to their UTF8 equivalents
473 s/&([^;]+);/&ghtml::getcharequiv($1,1)/gse;
474
475 # Split text at word boundaries
476 s/\b/ /go;
477
478 # 3. Convert the remaining text to "clause format"
479
480 # Insert newline if the end of a sentence is detected
481 # (delimiter is: "[\.\?\!]\s")
482 # s/\s*[\.\?\!]\s+/\n/go;
483
484 # remove unnecessary punctuation and replace with clause break symbol (\n)
485 s/[^\w ]/\n/go;
486
487 # remove extraneous whitespace
488 s/ +/ /sgo;
489 s/^\s+//mgo;
490 s/\s*$/\n/mgo;
491
492 # remove lines that contain one word or less
493 s/^\S*$//mgo;
494 s/^\s*$//mgo;
495 tr/\n//s;
496
497 return $_;
498}
499
500# A version of convert_gml_to_tokens that is fine-tuned to the English language.
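#
# As a rough, invented example, the fragment
#   <p>The cat sat on the mat. It purred!</p>
# would come out as something close to two clauses, one per line:
#   The cat sat on the mat
#   It purred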
501
502sub convert_gml_to_tokens_EN {
503 $_ = shift @_;
504
505 # FIRST, remove GML tags
506
507 # Replace all whitespace with a simple space
508 s/\s+/ /gs;
509
510 # Remove everything that is in a tag
511 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
512 s/\s*<br>\s*/ LINEBREAK /isg;
513 s/<[^>]*>/ /sg;
514
515 # Now we have the text, but it may contain HTML
516 # elements coded as &gt; etc. Decode these entities.
517 s/&lt;/</sg;
518 s/&gt;/>/sg;
519
520 s/\s+/ /sg;
521 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
522 s/\s*<br>\s*/ LINEBREAK /isg;
523 s/<[^>]*>/ /sg;
524
525 # remove &amp; and other miscellaneous markup tags
526 s/&amp;/&/sg;
527 s/&lt;/</sg;
528 s/&gt;/>/sg;
529 s/&amp;/&/sg;
530
531 # replace <p> and <br> placeholders with newlines
532 s/PARAGRAPHBREAK/\n/sg;
533 s/LINEBREAK/\n/sg;
534
535
536 # Exceptional punctuation
537 #
538 # We make special cases of some punctuation
539
540 # remove any apostrophe that indicates omitted letters
541 s/(\w+)\'(\w*\s)/ $1$2 /g;
542
543 # remove periods that appear in a person's initials
544 s/\s([A-Z])\./ $1 /g;
545
546 # replace hyphens in hyphenated words and names with a space
547 s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
548
549 # Convert the remaining text to "clause format".
550 # This means removing all excess punctuation and garbage text,
551 # normalising valid punctuation to full stops and commas,
552 # then putting one clause on each line.
553
554 # Insert newline when the end of a sentence is detected
555 # (delimiter is: "[\.\?\!]\s")
556 s/\s*[\.\?\!]\s+/\n/g;
557
558 # split numbers after four digits
559 s/(\d\d\d\d)/$1 /g;
560
561 # split words after 32 characters
562
563 # squash repeated punctuation
564 tr/A-Za-z0-9 //cs;
565
566 # save email addresses
567 # s/\w+@\w+\.[\w\.]+/EMAIL/g;
568
569 # normalise clause breaks (mostly punctuation symbols) to commas
570 s/[^A-Za-z0-9 \n]+/ , /g;
571
572 # Remove repeated commas, and replace with newline
573 s/\s*,[, ]+/\n/g;
574
575 # remove extra whitespace
576 s/ +/ /sg;
577 s/^\s+//mg;
578 s/\s*$/\n/mg;
579
580 # remove lines that contain one word or less
581 s/^\w*$//mg;
582 s/^\s*$//mg;
583 tr/\n//s;
584
585 return $_;
586
587}
588
589
590
591# Execute a system command
592
593sub execute {
594 my ($command, $verbosity, $outhandle) = @_;
595 print $outhandle "Executing: $command\n" if ($verbosity > 2);
596 $! = 0;
597 my $status = system($command);
598 if ($status != 0) {
599 print STDERR "phind - Error executing '$command': $!\n";
600 exit($status); # this causes the build to fail...
601 }
602}
603
604
605# Generate the vocabulary, symbol statistics, and numbers file from the
606# clauses file. This is legacy code, so it is a bit messy and probably won't
607# run under Windows.
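#
# Roughly speaking, it writes three files into the phind working directory:
#   clauses.vocab    one "best form" word per line; the line number is the
#                    word's symbol number
#   clauses.stats    the symbol-number ranges (delimiters, stopwords,
#                    thesaurus terms, content words)
#   clauses.numbers  the clauses file rewritten with each token replaced by
#                    its symbol number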
608
609sub extract_vocabulary {
610 my ($self) = @_;
611
612 my $verbosity = $self->{'verbosity'};
613 my $out = $self->{'outhandle'};
614
615 my $collectiondir = $self->{'collectiondir'};
616 my $phinddir = $self->{'phinddir'};
617
618 my $language_exp = $self->{'language_exp'};
619
620 my ($w, $l, $line, $word);
621
622 my ($first_delimiter, $last_delimiter,
623 $first_stopword, $last_stopword,
624 $first_extractword, $last_extractword,
625 $first_contentword, $last_contentword,
626 $phrasedelimiter);
627
628 my $thesaurus = $self->{'thesaurus'};
629 my ($thesaurus_links, $thesaurus_terms,
630 %thesaurus, $first_thesaurusword, $last_thesaurusword);
631
632 my %symbol;
633 my (%freq);
634
635 print $out "Calculating vocabulary\n" if ($verbosity > 1);
636
637 # Read and store the stopwords
638 my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword");
639 my $stopword_files = ();
640 my ($language, $language_dir, $file, $file_name);
641 my %stopwords;
642
643 # Examine each directory in the stopword directory
644 opendir(STOPDIR, $stopdir);
645 foreach $language (readdir STOPDIR) {
646
647 # Ignore entries that do not match the classifier's language
648 next unless ($language =~ /$language_exp/);
649 $language_dir = &util::filename_cat($stopdir, $language);
650 next unless (-d "$language_dir");
651
652 opendir(LANGDIR, $language_dir);
653 foreach $file (readdir LANGDIR) {
654
655 # Ignore entries that are not stopword files
656 next unless ($file =~ /sw$/);
657 $file_name = &util::filename_cat($language_dir, $file);
658 next unless (-f "$file_name");
659
660 # Read the stopwords
661 open(STOPFILE, "<$file_name");
662 while (<STOPFILE>) {
663 s/^\s+//;
664 s/\s.*//;
665 $word = $_;
666 $l = lc($word);
667 $stopwords{$l} = $word;
668 }
669 close STOPFILE;
670
671 }
672 closedir LANGDIR;
673 }
674 closedir STOPDIR;
675
676 # Read thesaurus information
677 if ($thesaurus) {
678
679 # link file exists
680 $thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
681 die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
682
683 # ensure term file exists in the correct language
684 if ($language_exp =~ /^([a-z][a-z])/) {
685 $language = $1;
686 } else {
687 $language = 'en';
688 }
689 $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
690 die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
691
692
693 # Read the thesaurus terms
694 open(TH, "<$thesaurus_terms");
695 while(<TH>) {
696 s/^\d+ //;
697 s/\(.*\)//;
698 foreach $w (split(/\s+/, $_)) {
699 $thesaurus{lc($w)} = $w;
700 }
701 }
702 close TH;
703 }
704
705 # Read words in the text and count occurrences
706 open(TXT, "<$phinddir/clauses");
707
708 my @words;
709 while(<TXT>) {
710 $line = $_;
711 next unless ($line =~ /./);
712
713 @words = split(/\s+/, $line);
714 foreach $w (@words) {
715 $l = lc($w);
716 $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
717 $freq{$w}++;
718 }
719 $freq{$senlimit}++;
720 }
721
722 close TXT;
723
724 # Calculate the "best" form of each word
725 my (%bestform, %totalfreq, %bestfreq);
726
727 foreach $w (sort (keys %freq)) {
728 $l = lc($w);
729
730 # totalfreq is the number of times a term appears in any form
731 $totalfreq{$l} += $freq{$w};
732
733 if (defined $stopwords{$l}) {
734 $bestform{$l} = $stopwords{$l};
735
736 } elsif (defined $thesaurus{$l}) {
737 $bestform{$l} = $thesaurus{$l};
738
739 } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
740 $bestfreq{$l} = $freq{$w};
741 $bestform{$l} = $w;
742 }
743 }
744 undef %freq;
745 undef %bestfreq;
746
747
748 # Assign symbol numbers to tokens
749 my $nextsymbol = 1;
750 my (@vocab);
751
752 # Delimiters
753 $first_delimiter = 1;
754
755 foreach $word (@delimiters) {
756
757# $word = lc($word); # jrm21
758 $word = uc($word);
759 $bestform{$word} = $word;
760 $vocab[$nextsymbol] = $word;
761 $symbol{$word} = $nextsymbol;
762 $nextsymbol++;
763 }
764 $last_delimiter = $nextsymbol - 1;
765 # Stopwords
766 $first_stopword = $nextsymbol;
767
768 foreach my $word (sort keys %stopwords) {
769 # don't include stopword unless it occurs in the text
770 $word = lc($word);
771 next unless ($totalfreq{$word});
772 next if ($symbol{$word});
773
774 $vocab[$nextsymbol] = $word;
775 $symbol{$word} = $nextsymbol;
776 $nextsymbol++;
777 }
778 $last_stopword = $nextsymbol - 1;
779 $first_contentword = $nextsymbol;
780
781 # Thesaurus terms
782 if ($thesaurus) {
783 $first_thesaurusword = $nextsymbol;
784
785 foreach my $word (sort keys %thesaurus) {
786
787 $word = lc($word);
788 next if ($symbol{$word});
789 $bestform{$word} = $thesaurus{$word};
790
791 $vocab[$nextsymbol] = $word;
792 $symbol{$word} = $nextsymbol;
793 $nextsymbol++;
794
795 }
796 $last_thesaurusword = $nextsymbol - 1;
797 }
798
799 # Other content words
800 $first_extractword = $nextsymbol;
801
802 foreach my $word (sort (keys %bestform)) {
803
804 next if ($symbol{$word});
805
806 $vocab[$nextsymbol] = $word;
807 $symbol{$word} = $nextsymbol;
808 $nextsymbol++;
809 }
810 $last_extractword = $nextsymbol - 1;
811 $last_contentword = $nextsymbol - 1;
812
813 # Output the words
814 print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
815 open(VOC, ">$phinddir/clauses.vocab");
816
817 for (my $i = 1; $i < $nextsymbol; $i++) {
818 $w = $vocab[$i];
819
820 print VOC "$bestform{$w}\n";
821 $totalfreq{$w} = 0 unless ($totalfreq{$w});
822 }
823 close VOC;
824
825
826 # Create statistics file
827 # Output statistics about the vocabulary
828 print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
829 &util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
830
831 open(STAT, ">$phinddir/clauses.stats")
832 || die "Cannot open $phinddir/clauses.stats: $!";
833
834 print STAT "first_delimiter $first_delimiter\n";
835 print STAT "last_delimiter $last_delimiter\n";
836 print STAT "first_stopword $first_stopword\n";
837 print STAT "last_stopword $last_stopword\n";
838 if ($thesaurus) {
839 print STAT "first_thesaurusword $first_thesaurusword\n";
840 print STAT "last_thesaurusword $last_thesaurusword\n";
841 }
842 print STAT "first_extractword $first_extractword\n";
843 print STAT "last_extractword $last_extractword\n";
844 print STAT "first_contentword $first_contentword\n";
845 print STAT "last_contentword $last_contentword\n";
846 print STAT "first_symbol $first_delimiter\n";
847 print STAT "last_symbol $last_contentword\n";
848 print STAT "first_word $first_stopword\n";
849 print STAT "last_word $last_contentword\n";
850 close STAT;
851
852 undef @vocab;
853
854
855 # Create numbers file
856 # Save text as symbol numbers
857 print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
858
859 open(TXT, "<$phinddir/clauses");
860 open(NUM, ">$phinddir/clauses.numbers");
861
862## $phrasedelimiter = $symbol{lc($senlimit)}; # jrm21
863## print NUM "$symbol{lc($colstart)}\n"; # jrm21
864 $phrasedelimiter = $symbol{$senlimit};
865 print NUM "$symbol{$colstart}\n";
866
867 # set up the special symbols that delimit documents and sentences
868 while(<TXT>) {
869
870 # split sentence into a list of tokens
871 $line = $_;
872 next unless ($line =~ /./);
873 @words = split(/\s+/, $line);
874
875 # output one token at a time
876 foreach $word (@words) {
877# don't lower-case special delimiters - jrm21
878 if (!map {if ($word eq $_) {1} else {()}} @delimiters) {
879 $word = lc($word);
880 }
881 print NUM "$symbol{$word}\n";
882 }
883
884 # output phrase delimiter
885 print NUM "$phrasedelimiter\n";
886 }
887
888 close TXT;
889# print NUM "$symbol{lc($colend)}\n";# jrm21
890 print NUM "$symbol{$colend}\n";
891 close NUM;
892
893 # Save thesaurus data in one convenient file
894 if ($thesaurus) {
895
896 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
897
898
899 print $out "Saving thesaurus as numbers in $thesaurusfile\n"
900 if ($verbosity > 1);
901
902 # Read the thesaurus terms
903 my ($num, $text, %thes_symbols);
904
905 open(TH, "<$thesaurus_terms");
906 while(<TH>) {
907 chomp;
908 @words = split(/\s+/, $_);
909 $num = shift @words;
910 $text = "";
911
912 # translate words into symbol numbers
913 foreach $word (@words) {
914 $word = lc($word);
915 if ($symbol{$word}) {
916 $text .= "s$symbol{$word} ";
917 } elsif ($verbosity) {
918 print $out "phind: No thesaurus symbol, ignoring \"$word\"\n";
919 }
920 }
921 $text =~ s/ $//;
922 $thes_symbols{$num} = $text;
923 }
924 close TH;
925
926 # Read the thesaurus links and write the corresponding data
927 open(TH, "<$thesaurus_links");
928 open(THOUT, ">$thesaurusfile");
929
930 while(<TH>) {
931 chomp;
932 ($num, $text) = split(/:/, $_);
933
934 if (defined($thes_symbols{$num})) {
935 print THOUT "$num:$thes_symbols{$num}:$text\n";
936 } else {
937 print THOUT "$num:untranslated:$text\n";
938 }
939 }
940 close TH;
941 close THOUT;
942 }
943
944
945
946
947}
948
949
950# renumber_phrases
951#
952# Prepare the phrases file to be input to mgpp. The biggest problem is
953# reconciling the phrase identifiers used by the suffix program (which
954# we'll call suffix-id numbers) with the numbers used in the thesaurus
955# (thesaurus-id) to create a common set of phind-id numbers (phind-id).
956# Phind-id numbers must be sorted by frequency of occurrence.
957#
958# Start creating a set of phind-id numbers from the sorted suffix-id
959# numbers and (if required) the thesaurus-id numbers. Then add any other
960# phrases occurring in the thesaurus.
961#
962# The last thing we have to do is restore the vocabulary information to the
963# phrase file so that the phrases are stored as words, not as symbol
964# numbers.
965
966# The original phrases file looks something like this:
967# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
968# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
969# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
970# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
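#
# Reading renumber_suffix_data below, the fields on each line are, in order:
# suffix-id, the phrase as symbol numbers, total frequency, expansion count,
# the expansions (as suffix-ids), document count, and the documents (with a
# per-document frequency after a comma, e.g. "d2240,2").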
971
972
973sub renumber_phrases {
974 my ($self) = @_;
975
976 renumber_suffix_data($self);
977 renumber_thesaurus_data($self);
978 restore_vocabulary_data($self);
979
980}
981
982
983
984# renumber_suffix_data
985#
986# Translate phrases file to phrases.2 using phind keys instead
987# of suffix keys and sorting the expansion data.
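#
# As a rough before/after sketch (the phind-id assignments are invented): if
# suffix-id 159396-1 is assigned phind-id 1 and its expansion 116149-2 gets
# phind-id 7, the first sample line in the renumber_phrases comment above
# would become roughly
#   1:s5175:4:1:3:7,:2240,2;2253;2254;
# in phrases.2.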
988
989sub renumber_suffix_data {
990 my ($self) = @_;
991
992 my $verbosity = $self->{'verbosity'};
993 my $out = $self->{'outhandle'};
994 print $out "Translate phrases: suffix-ids become phind-ids\n"
995 if ($verbosity);
996
997 my $phinddir = $self->{'phinddir'};
998 my $infile = &util::filename_cat($phinddir, 'phrases');
999 my $outfile = &util::filename_cat($phinddir, 'phrases.2');
1000
1001 # Read the phrase file. Calculate initial set of phind-id
1002 # numbers and store (suffixid -> frequency) relation.
1003
1004 my %suffixtophind;
1005 my @totalfrequency;
1006 my (@fields, $suffixid);
1007 my $nextphind = 1;
1008
1009 open(IN, "<$infile");
1010 while(<IN>) {
1011
1012 chomp;
1013 @fields = split(/:/, $_);
1014
1015 # get next suffixid and phindid
1016 $suffixid = shift @fields;
1017 $suffixtophind{$suffixid} = $nextphind;
1018
1019 # store total frequency
1020 shift @fields;
1021 $totalfrequency[$nextphind] = shift @fields;
1022
1023 $nextphind++;
1024 }
1025 close IN;
1026
1027
1028 # Translate phrases file to phrases.2. Use phind keys (not suffix
1029# keys), sort expansion and document occurrence data in order of
1030# descending frequency.
1031 open(IN, "<$infile");
1032 open(OUT, ">$outfile");
1033
1034 my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
1035 my (@documents, @newexp, $k, $n);
1036 my $linenumber = 0;
1037
1038 while(<IN>) {
1039
1040 # read the line
1041 chomp;
1042 @fields = split(/:/, $_);
1043
1044 # get a phrase number for this line
1045 $suffixid = shift @fields;
1046 die unless (defined($suffixtophind{$suffixid}));
1047 $phindid = $suffixtophind{$suffixid};
1048
1049 # get the symbols in the phrase
1050 $text = shift @fields;
1051
1052 # output status information
1053 $linenumber++;
1054 if ($verbosity > 2) {
1055 if ($linenumber % 1000 == 0) {
1056 print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
1057 }
1058 print $out "$linenumber: $phindid\t($text)\n" if ($verbosity > 3);
1059 }
1060
1061 # get the phrase frequency
1062 $tf = shift @fields;
1063
1064 # get the number of expansions
1065 $countexp = shift @fields;
1066
1067 # get the expansions, convert them into phind-id numbers, and sort them
1068 $expansions = shift @fields;
1069 @newexp = ();
1070 foreach $k (split(/,/, $expansions)) {
1071 die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
1072 $n = $suffixtophind{$k};
1073 push @newexp, $n;
1074 }
1075 @newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
1076
1077 # get the number of documents
1078 $countdocs = shift @fields;
1079
1080 # get the documents and sort them
1081 $documents = shift @fields;
1082 $documents =~ s/d//g;
1083 @documents = split(/;/, $documents);
1084 @documents = sort by_doc_frequency @documents;
1085
1086 # output the phrase data
1087 print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
1088 print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
1089
1090 }
1091
1092 close IN;
1093 close OUT;
1094}
1095
1096
1097# renumber_thesaurus_data
1098#
1099# Translate phrases.2 to phrases.3, adding thesaurus data if available.
1100
1101sub renumber_thesaurus_data {
1102 my ($self) = @_;
1103
1104 my $out = $self->{'outhandle'};
1105 my $verbosity = $self->{'verbosity'};
1106 my $thesaurus = $self->{'thesaurus'};
1107
1108 my $phinddir = $self->{'phinddir'};
1109 my $infile = &util::filename_cat($phinddir, "phrases.2");
1110 my $outfile = &util::filename_cat($phinddir, "phrases.3");
1111
1112
1113 # If no thesaurus is defined, simply move the phrases file.
1114 if (!$thesaurus) {
1115 print $out "Translate phrases.2: no thesaurus data\n"
1116 if ($verbosity);
1117 &util::mv($infile, $outfile);
1118 return;
1119 }
1120
1121 print $out "Translate phrases.2: add thesaurus data\n"
1122 if ($verbosity);
1123
1124 # 1.
1125 # Read thesaurus file and store (symbols->thesaurusid) mapping
1126 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
1127 my %symbolstothesid;
1128 my (@fields, $thesid, $symbols);
1129
1130 open(TH, "<$thesaurusfile");
1131
1132 while (<TH>) {
1133
1134 chomp;
1135 @fields = split(/:/, $_);
1136
1137 # get id and text
1138 $thesid = shift @fields;
1139 $symbols = shift @fields;
1140 $symbolstothesid{$symbols} = $thesid;
1141 }
1142 close TH;
1143
1144 # 2.
1145 # Read phrases file to find thesaurus entries that already
1146 # have a phindid. Store their phind-ids for later translation,
1147 # and store their frequency for later sorting.
1148 my %thesaurustophindid;
1149 my %phindidtofrequency;
1150 my ($phindid, $freq);
1151
1152 open(IN, "<$infile");
1153
1154 while(<IN>) {
1155
1156 chomp;
1157 @fields = split(/:/, $_);
1158
1159 # phindid and symbols for this line
1160 $phindid = shift @fields;
1161 $symbols = shift @fields;
1162 $freq = shift @fields;
1163
1164 # do we have a thesaurus id corresponding to this phrase?
1165 if (defined($symbolstothesid{$symbols})) {
1166 $thesid = $symbolstothesid{$symbols};
1167 $thesaurustophindid{$thesid} = $phindid;
1168 $phindidtofrequency{$phindid} = $freq;
1169 }
1170 }
1171 close IN;
1172
1173 undef %symbolstothesid;
1174
1175 # 3.
1176 # Create phind-id numbers for remaining thesaurus entries,
1177 # and note that their frequency is 0 for later sorting.
1178 my $nextphindid = $phindid + 1;
1179
1180 open(TH, "<$thesaurusfile");
1181 while(<TH>) {
1182
1183 chomp;
1184 @fields = split(/:/, $_);
1185
1186 # read thesaurus-id and ensure it has a corresponding phind-id
1187 $thesid = shift @fields;
1188 if (!defined($thesaurustophindid{$thesid})) {
1189 $thesaurustophindid{$thesid} = $nextphindid;
1190 $phindidtofrequency{$nextphindid} = 0;
1191 $nextphindid++;
1192 }
1193 }
1194 close TH;
1195
1196 # 4.
1197 # Translate thesaurus file, replacing thesaurus-id numbers with
1198 # phind-id numbers.
1199 my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
1200 my ($relations, $linkcounter, $linktext, $linktype, @linkdata);
1201 my (@links, $linkid, %linkidtotype, $newrelation);
1202
1203 open(TH, "<$thesaurusfile");
1204 open(TO, ">$newthesaurusfile");
1205 while(<TH>) {
1206
1207 chomp;
1208 @fields = split(/:/, $_);
1209
1210 # phindid and symbols for this line
1211 ($thesid, $symbols, $relations) = @fields;
1212
1213 die unless ($thesid && $symbols);
1214 die unless $thesaurustophindid{$thesid};
1215 $phindid = $thesaurustophindid{$thesid};
1216
1217 # convert each part of the relation string to use phind-id numbers
1218 # at the same time, we want to sort the list by frequency.
1219 undef %linkidtotype;
1220
1221 foreach $linktext (split(/;/, $relations)) {
1222 @linkdata = split(/,/, $linktext);
1223
1224 # remember the linktype (e.g. BT, NT)
1225 $linktype = shift @linkdata;
1226
1227 # store the type of each link
1228 foreach $thesid (@linkdata) {
1229 die unless (defined($thesaurustophindid{$thesid}));
1230 $linkidtotype{$thesaurustophindid{$thesid}} = $linktype;
1231 }
1232 }
1233
1234 # sort the list of links, first by frequency, then by type.
1235 @links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
1236 or ($linkidtotype{$a} cmp $linkidtotype{$b}) } (keys %linkidtotype);
1237 $linkcounter = (scalar @links);
1238
1239 # create a string describing the link information
1240 $linktype = $linkidtotype{$links[0]};
1241 $newrelation = $linktype;
1242 foreach $linkid (@links) {
1243 if ($linkidtotype{$linkid} ne $linktype) {
1244 $linktype = $linkidtotype{$linkid};
1245 $newrelation .= ";" . $linktype;
1246 }
1247 $newrelation .= "," . $linkid;
1248 }
1249 $newrelation .= ";";
1250
1251
1252 # output the new line
1253 print TO "$phindid:$symbols:$linkcounter:$newrelation:\n";
1254 }
1255 close TH;
1256 close TO;
1257
1258 undef %thesaurustophindid;
1259 undef %linkidtotype;
1260 undef %phindidtofrequency;
1261
1262 # 5.
1263 # Read thesaurus data (in phind-id format) into memory
1264 my %thesaurusdata;
1265
1266 open(TH, "<$newthesaurusfile");
1267 while(<TH>) {
1268 chomp;
1269 ($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
1270 die unless ($phindid && $symbols);
1271 $thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
1272 }
1273 close TH;
1274
1275 # 6.
1276 # Add thesaurus data to phrases file
1277 my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
1278 my (@documents, @newexp, $k, $n);
1279 my $linenumber = 0;
1280
1281 open(IN, "<$infile");
1282 open(OUT, ">$outfile");
1283
1284 # Update existing phrases
1285 while(<IN>) {
1286
1287 chomp;
1288 @fields = split(/:/, $_);
1289
1290 # get data for this line
1291 $phindid = shift @fields;
1292
1293 # output the phrase data, with thesaurus information
1294 print OUT "$phindid:", join(":", @fields);
1295
1296 # add thesaurus data
1297 if (defined($thesaurusdata{$phindid})) {
1298 @fields = split(/:/, $thesaurusdata{$phindid});
1299 shift @fields;
1300 $linkcounter = shift @fields;
1301 $relations = shift @fields;
1302
1303 print OUT ":$linkcounter:$relations";
1304 $thesaurusdata{$phindid} = "";
1305 }
1306 print OUT "\n";
1307 }
1308 close IN;
1309
1310 # Add phrases that aren't already in the file
1311 foreach $phindid (sort numerically keys %thesaurusdata) {
1312 next unless ($thesaurusdata{$phindid});
1313
1314 @fields = split(/:/, $thesaurusdata{$phindid});
1315 $symbols = shift @fields;
1316 $linkcounter = shift @fields;
1317 $relations = shift @fields;
1318
1319 print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
1320 }
1321 close OUT;
1322
1323}
1324
1325# restore_vocabulary_data
1326#
1327# Read phrases.3 and restore vocabulary information. Then write
1328# this data to the MGPP input files (pword.txt and pdata.txt) and
1329# (if requested) to the saved phrases file.
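#
# As a rough sketch (invented values), a single-word phrase might come out as
#   pdata.txt:  <Document>42:library:<remaining fields>
#   pword.txt:  <Document>library
# where the s<number> symbol codes from phrases.3 have been replaced with the
# corresponding words from clauses.vocab.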
1330
1331sub restore_vocabulary_data {
1332 my ($self) = @_;
1333
1334 my $out = $self->{'outhandle'};
1335 my $verbosity = $self->{'verbosity'};
1336 print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
1337
1338 my $phinddir = $self->{'phinddir'};
1339 my $infile = &util::filename_cat($phinddir, 'phrases.3');
1340 my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
1341 my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
1342 my $wordfile = &util::filename_cat($phinddir, 'pword.txt');
1343
1344 my $savephrases = $self->{'savephrases'};
1345
1346 # 1.
1347 # Read the vocabulary file
1348 open(V, "<$vocabfile")
1349 || die "Cannot open $vocabfile: $!";
1350 my @symbol;
1351 my $i = 1;
1352 while(<V>) {
1353 chomp;
1354 $symbol[$i++] = $_;
1355 }
1356 close V;
1357
1358 # 2.
1359 # Translate phrases.3 to MGPP input files
1360 my ($key, $text, $word, $isThesaurus);
1361 my @fields;
1362 my $linenumber = 0;
1363
1364 open(IN, "<$infile");
1365 open(DATA, ">$datafile");
1366 open(WORD, ">$wordfile");
1367
1368 # Save the phrases in a separate text file
1369 if ($savephrases) {
1370 print $out "Saving phrases in $savephrases\n" if ($verbosity);
1371 open(SAVE, ">$savephrases");
1372 }
1373
1374 while(<IN>) {
1375
1376 # read the line
1377 chomp;
1378 $line = $_;
1379 @fields = split(/:/, $line);
1380
1381 # get a phrase number for this line
1382 $key = shift @fields;
1383
1384 # restore the text of the phrase
1385 $text = shift @fields;
1386 $text =~ s/s(\d+)/$symbol[$1]/g;
1387 if ($text =~ / /) {
1388 $word = "";
1389 } elsif ($text ne 'untranslated') {
1390 $word = $text;
1391 }
1392
1393 # output the phrase data
1394 print DATA "<Document>";
1395 print DATA "$key:$text:", join(":", @fields), ":\n";
1396
1397 # output the word index search data
1398 print WORD "<Document>$word\n";
1399
1400 # output the phrases to a text file
1401 if ($savephrases) {
1402 if ((scalar @fields) == 7) {
1403 $isThesaurus = 1;
1404 } else {
1405 $isThesaurus = 0;
1406 }
1407 print SAVE $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
1408 }
1409 }
1410 close IN;
1411 close WORD;
1412 close DATA;
1413 close SAVE if ($savephrases);
1414
1415}
1416
1417
1418
1419# sort routines used to renumber phrases
1420
1421sub numerically { $a <=> $b }
1422
1423sub by_doc_frequency {
1424 my $fa = 1;
1425 if ($a =~ /,/) {
1426 $fa = $a;
1427 $fa =~ s/\d+,//;
1428 }
1429 my $fb = 1;
1430 if ($b =~ /,/) {
1431 $fb = $b;
1432 $fb =~ s/\d+,//;
1433 }
1434
1435 return ($fb <=> $fa);
1436}
1437
14381;
Note: See TracBrowser for help on using the repository browser.