Context Navigation

source: trunk/gsdl/perllib/classify.pm@ 13418

Last change on this file since 13418 was 13068, checked in by kjdon, 18 years ago
when unbuilding and rebuilding the gdbm database for incremental building, we need to make sure the old documents are processed in the same order as they were originally added, otherwise the docnums don't match up with the index or text
Property svn:keywords set to `Author Date Id Revision`
File size: 14.3 KB

Rev	Line
[537]	1	###########################################################################
	2	#
	3	# classify.pm --
	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
	8	# Copyright (C) 1999 New Zealand Digital Library Project
	9	#
	10	# This program is free software; you can redistribute it and/or modify
	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
	25
[214]	26	# functions to handle classifiers
	27
	28	package classify;
	29
	30	require util;
[8220]	31	require AllList;
[5682]	32	use gsprintf;
[214]	33
[12559]	34	#use GDBM_File;
	35	use unbuildutil;
[214]	36
[11994]	37
# Package-local shorthand: hands every argument straight on to
# gsprintf::gsprintf and returns whatever that returns.
sub gsprintf
{
    my @forwarded_args = @_;
    return &gsprintf::gsprintf(@forwarded_args);
}
	42
	43
# Module-wide state.
#
# $next_classify_num: running counter used by output_classify_info to build
#                     default "CL<n>" identifiers for top-level classifiers.
# $oid_to_clids:      hashref filled in by print_classify_info; maps each
#                     member identifier to an arrayref of the classifier
#                     node OIDs that list it.
our $next_classify_num = 1;
our $oid_to_clids = {};
	46
# load_classifier_for_info loads the named classifier module and returns an
# instance of it constructed with the single option "-gsdlinfo".
#
#   $classifier - classifier name; "<name>.pm" is looked for in a
#                 perllib/classify directory
#
# The collection-specific directory (under $ENV{'GSDLCOLLECTDIR'}) is tried
# before the main installation (under $ENV{'GSDLHOME'}).
#
# Dies if the module is found in neither place, or if constructing the
# object throws.
sub load_classifier_for_info {
    my ($classifier) = @_;

    # find the classifier
    my $colclassname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},
                                           "perllib/classify",
                                           "${classifier}.pm");
    my $mainclassname = &util::filename_cat($ENV{'GSDLHOME'},
                                            "perllib/classify",
                                            "${classifier}.pm");

    if (-e $colclassname) { require $colclassname; }
    elsif (-e $mainclassname) { require $mainclassname; }
    else {
        # Report, then always abort: the die must not depend on whatever
        # value gsprintf happens to return.
        &gsprintf(STDERR, "{classify.could_not_find_classifier}\n", $classifier);
        die "\n";
    }

    # Construct the object with a plain method call inside a block eval;
    # no source text needs to be built and re-parsed for a fixed option list.
    my $classobj;
    eval { $classobj = $classifier->new([], ["-gsdlinfo"]); };
    die "$@" if $@;

    return $classobj;
}
	70
# load_classifiers instantiates every classifier named in $classify_list and
# returns a reference to the list of objects, with an AllList classifier
# appended at the end.
#
#   $classify_list - arrayref of arrayrefs; each inner array is a classifier
#                    name followed by its options.  The name is shifted off
#                    the inner array, so the caller's data is modified.
#   $build_dir     - if true, passed to each classifier as "-builddir"
#   $outhandle     - if true, passed (stringified) as "-outhandle"
#
# Each classifier is numbered from 1 in list order via set_number().
# Dies if a classifier module cannot be found or an object cannot be built.
sub load_classifiers {
    my ($classify_list, $build_dir, $outhandle) = @_;
    my @classify_objects = ();
    my $classify_number = 1;

    foreach my $classifyoption (@$classify_list) {

        # get the classifier name
        my $classname = shift @$classifyoption;
        next unless defined $classname;

        # find the classifier: collection-specific copy first, then the
        # main installation
        my $colclassname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},
                                               "perllib/classify",
                                               "${classname}.pm");
        my $mainclassname = &util::filename_cat($ENV{'GSDLHOME'},
                                                "perllib/classify",
                                                "${classname}.pm");

        if (-e $colclassname) { require $colclassname; }
        elsif (-e $mainclassname) { require $mainclassname; }
        else {
            # Report, then always abort: the die must not depend on the
            # value returned by gsprintf.
            &gsprintf(STDERR, "{classify.could_not_find_classifier}\n", $classname);
            die "\n";
        }

        # Defaults go first so they can be overridden by the user supplied
        # options that follow them.
        my @newoptions;
        push @newoptions, "-builddir", "$build_dir" if ($build_dir);
        push @newoptions, "-outhandle", "$outhandle" if ($outhandle);
        push @newoptions, "-verbosity", "2";
        push @newoptions, @$classifyoption;

        # NOTE(review): every option is wrapped in double quotes and spliced
        # into the source text eval'd below, so an option containing ", $, @
        # or a backslash is subject to Perl double-quote interpolation.
        # This is kept as-is because existing collection configurations may
        # rely on it.
        my $options = join(",", map { "\"$_\"" } @newoptions);

        # create the classify object
        my $classobj;
        eval ("\$classobj = new \$classname([],[$options])");
        die "$@" if $@;

        $classobj->set_number($classify_number);
        $classify_number++;

        # add this object to the list
        push (@classify_objects, $classobj);
    }

    # AllList is always added last, with no options
    my $all_list_obj;
    eval { $all_list_obj = AllList->new(); };
    die "$@" if $@;
    push (@classify_objects, $all_list_obj);

    return \@classify_objects;
}
	136
	137	# init_classifiers resets all the classifiers and readys them to process
[315]	138	# the documents.
[214]	139	sub init_classifiers {
	140	my ($classifiers) = @_;
	141
	142	foreach $classobj (@$classifiers) {
	143	$classobj->init();
	144	}
	145	}
	146
[11994]	147
	148
	149	# takes a hashref containing the metadata for a gdbmfile entry, and extracts
	150	# the childrens numbers (from the 'contains' entry).
	151	# assumes format is ".1;".2;".3
	152	sub get_children {
	153	my ($doc_db_hash) = @_;
	154
	155	my $children = undef;
	156
	157	$childs = $doc_db_hash->{'contains'};
	158	if (defined ($childs)) {
	159	$childs =~ s/\@$//; #remove trailing @
	160	$childs =~ s/^\"\.//; #remove initial ".
	161	@$children = split /\;\"\./, $childs;
	162
	163	}
	164
	165	return $children;
	166	}
	167
	168
	169	sub recurse_sections {
	170	my ($doc_obj, $children, $parentoid, $parentsection, $gdbm_recs) = @_;
	171
	172	return if (!defined $children);
	173
	174	foreach my $child (sort { $a <=> $b} @$children) {
	175	$doc_obj->create_named_section("$parentsection.$child");
	176	my $doc_db_rec = $gdbm_recs->{"$parentoid.$child"};
	177	my $doc_db_hash = db_rec_to_hash($doc_db_rec);
	178
	179	# get child's children
	180	my $newchildren = &get_children($doc_db_hash);
	181
	182	# add content for current section
	183	add_section_content($doc_obj, "$parentsection.$child", $doc_db_hash);
	184
	185	# process all the children if there are any
	186	if (defined ($newchildren))
	187	{
	188	recurse_sections($doc_obj, $newchildren, "$parentoid.$child",
	189	"$parentsection.$child", $gdbm_recs);
	190	}
	191	}
	192	}
	193
	194
	195	sub add_section_content {
	196	my ($doc_obj, $cursection, $doc_db_hash) = @_;
	197
	198	foreach $key (keys %$doc_db_hash) {
	199	#don't need to store these metadata
	200	next if $key =~ /(thistype\|childtype\|contains\|docnum\|doctype\|classifytype)/i;
	201	# but do want things like hastxt and archivedir
	202	my @items = split /@/, $doc_db_hash->{$key};
	203	map {$doc_obj->add_metadata ($cursection, $key, $_); } @items;
	204
	205	}
	206	}
	207
	208
	209	# gets all the metadata from a gdbm file entry, and puts it into a hashref
	210	sub db_rec_to_hash {
	211
	212	my ($gdb_str_ref) = @_;
	213
	214	my $hashref = {};
	215
	216	my @entries = split(/\n/, $gdb_str_ref);
	217	foreach $entry (@entries) {
	218	my($key, $value) = ($entry =~ /^<([^>])>(.?)$/ );
	219	$hashref->{$key} .= '@' if defined $hashref->{$key};
	220	$hashref->{$key} .= $value;
	221
	222	}
	223
	224	return $hashref;
	225	}
	226
	227
	228	sub reconstruct_doc_objs_metadata
	229	{
	230	my ($fulldbname) = @_;
	231
[12559]	232	# tie %gdbm_recs, 'GDBM_File', $fulldbname, &GDBM_WRCREAT, 0640;
[11994]	233
[12559]	234	my %gdbm_recs;
	235	&unbuildutil::read_gdbm($fulldbname,\%gdbm_recs);
	236
	237
[11994]	238	# dig out top level doc sections
	239	my %top_sections = ();
[13068]	240	my %top_docnums = ();
[11994]	241	foreach my $key ( keys %gdbm_recs )
	242	{
	243	my $md_rec = $gdbm_recs{$key};
	244	my $md_hash = db_rec_to_hash($md_rec);
	245
	246	if ((defined $md_hash->{'doctype'}) && ($md_hash->{'doctype'} eq "doc")) {
	247	next if ($key =~ m/\./);
	248	$top_sections{$key} = $md_hash;
[13068]	249	$top_docnums{$key} = $md_hash->{'docnum'};
[11994]	250	}
	251	}
	252
	253	# for greenstone document objects based on metadata in gdbm file
	254	my @all_docs = ();
[13068]	255	# we need to make sure the documents were processed in the same order as
	256	# before, so sort based on their docnums
	257	foreach my $oid ( sort { $top_docnums{$a} <=> $top_docnums{$b} } keys %top_sections )
[11994]	258	{
	259	my $doc_db_hash = $top_sections{$oid};
	260
	261	my $doc_obj = new doc();
	262	$doc_obj->set_OID($oid);
	263	my $top = $doc_obj->get_top_section();
	264	add_section_content ($doc_obj, $top, $doc_db_hash);
	265	my $children = &get_children($doc_db_hash);
	266	recurse_sections($doc_obj, $children, $oid, $top, \%gdbm_recs);
	267
	268	push(@all_docs,$doc_obj);
	269	}
	270
[12559]	271	# untie %gdbm_recs;
[11994]	272
	273	return \@all_docs;
	274	}
	275
	276
	277
	278
	279
# classify_doc lets each of the classifiers classify a document.
#
#   $classifiers - arrayref of classifier objects
#   $doc_obj     - the document, passed to classify() on each classifier in
#                  list order
sub classify_doc {
    my ($classifiers, $doc_obj) = @_;

    # The classifier is only ever used through its classify() method, so
    # nothing is assumed about how the object is implemented.
    foreach my $classobj (@$classifiers) {
        $classobj->classify($doc_obj);
    }
}
	289
	290	# output_classify_info outputs all the info needed for the classification
	291	# to the gdbm
	292	sub output_classify_info {
[8361]	293	my ($classifiers, $handle, $remove_empty_classifications, $gli) = @_;
[214]	294	# $handle = "main::STDOUT";
	295
[6332]	296	$gli = 0 unless defined $gli;
	297
[315]	298	# create a classification containing all the info
	299	my $classifyinfo = {'classifyOID'=>'browse',
	300	'contains'=>[]};
	301
	302	# get each of the classifications
[8275]	303	foreach $classobj (@$classifiers) {
[6332]	304	my $tempinfo = $classobj->get_classify_info($gli);
[8220]	305	my $classID = $tempinfo->{'classifyOID'};
	306
	307	$tempinfo->{'classifyOID'} = "CL$next_classify_num" unless defined($tempinfo->{'classifyOID'});
[315]	308	$next_classify_num++;
[12844]	309
	310	print STDERR "*** outputting information for classifier: $tempinfo->{'classifyOID'}\n";
	311
[315]	312	push (@{$classifyinfo->{'contains'}}, $tempinfo);
[214]	313	}
	314
[8361]	315	&print_classify_info ($handle, $classifyinfo, "", $remove_empty_classifications);
[214]	316	}
	317
# Helper for print_classify_info (incremental building support): records in
# the module-wide $oid_to_clids table that $member_id is listed by the
# classifier node $parent_oid.  Each table entry is an arrayref of parent
# OIDs, created on first use.
sub _record_classifier_membership {
    my ($member_id, $parent_oid) = @_;

    push (@{$oid_to_clids->{$member_id}}, $parent_oid);
}

# print_classify_info writes the database record for one classification
# node to $handle, after recursively writing the records of its child
# classification nodes.
#
#   $handle                       - output handle
#   $classifyinfo                 - hashref describing the node; its
#                                   'contains' list holds the children
#   $OID                          - identifier to use when the node has no
#                                   'classifyOID' of its own
#   $remove_empty_classifications - if true, nodes with no leaf documents
#                                   are not output
#
# Nodes that have an 'OID' key are documents and are not printed here.
# Every child is also recorded in $oid_to_clids against this node's OID.
#
# NOTE(review): a child that is skipped in the <contains> list ("oai", or an
# anonymous node titled "Collage") still counts as "not first" for the
# children after it, so a stray ';' separator can appear.  This behaviour
# is kept as it was.
sub print_classify_info {
    my ($handle, $classifyinfo, $OID, $remove_empty_classifications) = @_;

    $OID =~ s/^\.+//; # just for good luck

    # book information is printed elsewhere
    return if (defined ($classifyinfo->{'OID'}));

    # don't want empty classifications
    return if (&check_contents ($classifyinfo, $remove_empty_classifications) == 0 && $remove_empty_classifications);

    $OID = $classifyinfo->{'classifyOID'} if defined ($classifyinfo->{'classifyOID'});

    my $outputtext = "[$OID]\n";
    $outputtext .= "<doctype>classify\n";
    $outputtext .= "<hastxt>0\n";

    # optional fields, written in this fixed order when present
    foreach my $field ('childtype', 'Title', 'numleafdocs', 'thistype',
                       'parameters', 'supportsmemberof') {
        $outputtext .= "<$field>$classifyinfo->{$field}\n"
            if defined $classifyinfo->{$field};
    }

    my $contains_text = "<contains>";
    my $mdoffset_text = "<mdoffset>";

    my $next_subOID = 1;
    my $first = 1;
    foreach my $tempinfo (@{$classifyinfo->{'contains'}}) {
        # empty contents were made undefined by check_contents()
        next unless defined $tempinfo;

        # the "oai" classifier is never listed in <contains>
        my $is_oai = (defined ($tempinfo->{'classifyOID'}) &&
                      $tempinfo->{'classifyOID'} eq "oai");

        $contains_text .= ";" unless ($first || $is_oai);
        $mdoffset_text .= ";" unless $first;
        $first = 0;

        if (defined ($tempinfo->{'classifyOID'})) {
            # child classifier that has its own identifier
            $contains_text .= $tempinfo->{'classifyOID'} unless $is_oai;

            &_record_classifier_membership ($tempinfo->{'classifyOID'}, $OID);

            &print_classify_info ($handle, $tempinfo, $tempinfo->{'classifyOID'},
                                  $remove_empty_classifications);

        } elsif (defined ($tempinfo->{'OID'})) {
            # a document
            # note: we don't want to print the contents of the books
            $contains_text .= $tempinfo->{'OID'};
            $mdoffset_text .= $tempinfo->{'offset'} if (defined ($tempinfo->{'offset'}));

            &_record_classifier_membership ($tempinfo->{'OID'}, $OID);

        } else {
            # anonymous child node, numbered relative to this node

            # Suppress having a top-level node in the Collage classifier
            # so no bookshelf icon appears, top-level, along with the
            # applet
            if (!defined ($tempinfo->{'Title'}) || $tempinfo->{'Title'} ne "Collage") {
                $contains_text .= "\".$next_subOID";
            }

            &_record_classifier_membership ("$OID.$next_subOID", $OID);

            &print_classify_info ($handle, $tempinfo, "$OID.$next_subOID",
                                  $remove_empty_classifications);
            $next_subOID++;
        }
    }

    $outputtext .= "$contains_text\n";
    $outputtext .= "<mdtype>$classifyinfo->{'mdtype'}\n"
        if defined $classifyinfo->{'mdtype'};

    # leave the offsets line out when it holds nothing but separators
    $outputtext .= "$mdoffset_text\n"
        if ($mdoffset_text !~ m/^<mdoffset>;+$/);

    $outputtext .= '-' x 70 . "\n";

    print $handle $outputtext;

}
	443
# check_contents counts the leaf documents at or below a classification
# node.
#
#   $classifyinfo                 - hashref for the node; children are in
#                                   its 'contains' list
#   $remove_empty_classifications - if true, a child with no leaf documents
#                                   below it is replaced by undef in the
#                                   'contains' list
#
# The count is stored in the node's 'numleafdocs' entry and returned.  A
# node that already has 'numleafdocs' defined is not examined again.
sub check_contents {
    my ($classifyinfo, $remove_empty_classifications) = @_;
    $remove_empty_classifications = 0 if (!$remove_empty_classifications);

    # reuse a count that has already been worked out
    if (defined $classifyinfo->{'numleafdocs'}) {
        return $classifyinfo->{'numleafdocs'};
    }

    my $leaf_total = 0;
    foreach my $member (@{$classifyinfo->{'contains'}}) {
        # found a book
        if (defined $member->{'OID'}) {
            $leaf_total++;
            next;
        }

        # there may be books somewhere below
        my $below = check_contents ($member, $remove_empty_classifications);
        if ($below > 0) {
            $leaf_total += $below;
            next;
        }

        # section contains no books, so take it out of its parent's
        # contents ($member aliases the list element)
        $member = undef if ($remove_empty_classifications);
    }

    $classifyinfo->{'numleafdocs'} = $leaf_total;
    return $leaf_total;
}
	471
[214]	472	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: