Context Navigation

source: main/trunk/greenstone2/perllib/classify.pm@ 27329

Last change on this file since 27329 was 27303, checked in by jmt12, 11 years ago
Replacing hardcoded additions to INC and PATH environment variables with conditional ones - this allows us to use the order of values in these variables for precedence, thus allows better support for extensions that override classifiers, plugins etc. ENV and PATH functions already exists in util, but augmentINC() is a new function
Property svn:keywords set to `Author Date Id Revision`
File size: 14.5 KB

Rev	Line
[537]	1	###########################################################################
	2	#
	3	# classify.pm --
	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
	8	# Copyright (C) 1999 New Zealand Digital Library Project
	9	#
	10	# This program is free software; you can redistribute it and/or modify
	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
	25
[214]	26	# functions to handle classifiers
	27
	28	package classify;
	29
	30	require util;
[27303]	31	use FileUtils;
[8220]	32	require AllList;
[15703]	33
[15705]	34	use dbutil;
[5682]	35	use gsprintf;
[15703]	36	use strict; no strict 'subs';
[214]	37
[11994]	38
[5682]	39	sub gsprintf
	40	{
	41	return &gsprintf::gsprintf(@_);
	42	}
	43
	44
[6967]	45	sub load_classifier_for_info {
	46	my ($classifier) = shift @_;
[214]	47
[6967]	48	# find the classifier
[27303]	49	# - used to have hardcoded list of places to load classifier from. We
	50	# should, instead, try loading from all of the perllib places on the
	51	# library path, as that improves support for extensions. Special cases
	52	# needed for collection specific and custom classifier. [jmt12]
	53	my @possible_class_paths;
[14239]	54	if (defined($ENV{'GSDLCOLLECTION'}))
	55	{
[27303]	56	push(@possible_class_paths, &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, 'custom', $ENV{'GSDLCOLLECTION'}, 'perllib', 'classify', $classifier . '.pm'));
[14239]	57	}
[27303]	58	# (why does GSDLCOLLECTDIR get set to GSDLHOME for classinfo calls?)
	59	if ($ENV{'GSDLCOLLECTDIR'} ne $ENV{'GSDLHOME'})
	60	{
	61	push(@possible_class_paths, &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, 'perllib', 'classify', $classifier . '.pm'));
	62	}
	63	foreach my $library_path (@INC)
	64	{
	65	# only interested in classify paths found in the library paths
	66	if ($library_path =~ /classify$/)
	67	{
	68	push(@possible_class_paths, &FileUtils::filenameConcatenate($library_path, $classifier . '.pm'));
	69	}
	70	}
	71	my $found_class = 0;
	72	foreach my $possible_class_path (@possible_class_paths)
	73	{
	74	if (-e $possible_class_path)
	75	{
	76	require $possible_class_path;
	77	$found_class = 1;
	78	last;
	79	}
	80	}
	81	if (!$found_class)
	82	{
	83	&gsprintf(STDERR, "{classify.could_not_find_classifier}\n", $classifier) && die "\n";
	84	}
[6967]	85
	86	my ($classobj);
	87	my $options = "-gsdlinfo";
[10218]	88	eval ("\$classobj = new \$classifier([],[$options])");
[6967]	89	die "$@" if $@;
	90
	91	return $classobj;
	92	}
	93
[811]	94	sub load_classifiers {
[1839]	95	my ($classify_list, $build_dir, $outhandle) = @_;
[811]	96	my @classify_objects = ();
[8220]	97	my $classify_number = 1;
[13933]	98
[27303]	99	my $colclassdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},"perllib/classify");
	100	# - ensure colclassdir doesn't already exist in INC before adding, other-
	101	# wise we risk clobbering classifier inheritence implied by order of paths
	102	# in INC [jmt12]
	103	my $inc_paths = join(':',@INC);
	104	if ($inc_paths !~ /$colclassdir/)
	105	{
	106	unshift (@INC, $colclassdir);
	107	}
	108
[15703]	109	foreach my $classifyoption (@$classify_list) {
[214]	110
[811]	111	# get the classifier name
	112	my $classname = shift @$classifyoption;
	113	next unless defined $classname;
[1839]	114
[811]	115	# find the classifier
[27303]	116	# - replaced as explained in load_classifier_for_info() [jmt12]
	117	my @possible_class_paths;
	118	if (defined($ENV{'GSDLCOLLECTION'}))
	119	{
	120	push(@possible_class_paths, &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, 'custom', $ENV{'GSDLCOLLECTION'}, 'perllib', 'classify', $classname . '.pm'));
[14112]	121	}
[27303]	122	# (why does GSDLCOLLECTDIR get set to GSDLHOME for classinfo calls?)
	123	if ($ENV{'GSDLCOLLECTDIR'} ne $ENV{'GSDLHOME'})
	124	{
	125	push(@possible_class_paths,&FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, 'perllib', 'classify', $classname . '.pm'));
	126	}
	127	foreach my $library_path (@INC)
	128	{
	129	# only interested in classify paths found in the library paths
	130	if ($library_path =~ /classify$/)
	131	{
	132	push(@possible_class_paths, &FileUtils::filenameConcatenate($library_path, $classname . '.pm'));
	133	}
	134	}
	135	my $found_class = 0;
	136	foreach my $possible_class_path (@possible_class_paths)
	137	{
	138	if (-e $possible_class_path)
	139	{
	140	require $possible_class_path;
	141	$found_class = 1;
	142	last;
	143	}
	144	}
	145	if (!$found_class)
	146	{
	147	&gsprintf(STDERR, "{classify.could_not_find_classifier}\n", $classname) && die "\n";
	148	}
[214]	149
[811]	150	# create the classify object
	151	my ($classobj);
[1839]	152
	153	my @newoptions;
[6964]	154
	155	# do these first so they can be overriden by user supplied options
	156	push @newoptions, "-builddir", "$build_dir" if ($build_dir);
	157	push @newoptions, "-outhandle", "$outhandle" if ($outhandle);
	158	push @newoptions, "-verbosity", "2";
	159
[6967]	160	# backwards compatability hack: if the classifier options are
	161	# in "x=y" format, convert them to parsearg ("-x y") format.
	162	my ($opt, $key, $value);
[1839]	163	foreach $opt (@$classifyoption) {
[11644]	164	# if ($opt =~ /^(\w+)=(.*)$/) {
	165	# push @newoptions, "-$1", $2;
	166	# } else {
[1839]	167	push @newoptions, $opt;
[11644]	168	#}
[1839]	169	}
	170
	171	map { $_ = "\"$_\""; } @newoptions;
	172	my $options .= join (",", @newoptions);
	173
[10218]	174
	175	eval ("\$classobj = new \$classname([],[$options])");
[811]	176	die "$@" if $@;
[1839]	177
[8220]	178	$classobj->set_number($classify_number);
	179	$classify_number ++;
	180
[1839]	181	# add this object to the list
[811]	182	push (@classify_objects, $classobj);
	183	}
	184
[8220]	185	my ($classobj);
	186	eval ("\$classobj = new AllList()");
	187	die "$@" if $@;
	188	push (@classify_objects, $classobj);
	189
[811]	190	return \@classify_objects;
[214]	191	}
	192
	193	# init_classifiers resets all the classifiers and readys them to process
[315]	194	# the documents.
[214]	195	sub init_classifiers {
	196	my ($classifiers) = @_;
	197
[15703]	198	foreach my $classobj (@$classifiers) {
[214]	199	$classobj->init();
	200	}
	201	}
	202
[11994]	203
	204
[15704]	205	# takes a hashref containing the metadata for an infodb entry, and extracts
[11994]	206	# the childrens numbers (from the 'contains' entry).
	207	# assumes format is ".1;".2;".3
	208	sub get_children {
	209	my ($doc_db_hash) = @_;
	210
	211	my $children = undef;
	212
[15703]	213	my $contains = $doc_db_hash->{'contains'};
	214	if (defined ($contains)) {
	215	$contains =~ s/\@$//; #remove trailing @
	216	$contains =~ s/^\"\.//; #remove initial ".
	217	@$children = split /\;\"\./, $contains;
[11994]	218	}
	219
	220	return $children;
	221	}
	222
	223
	224	sub recurse_sections {
[15704]	225	my ($doc_obj, $children, $parentoid, $parentsection, $database_recs) = @_;
[11994]	226
	227	return if (!defined $children);
	228
	229	foreach my $child (sort { $a <=> $b} @$children) {
	230	$doc_obj->create_named_section("$parentsection.$child");
[15704]	231	my $doc_db_rec = $database_recs->{"$parentoid.$child"};
[11994]	232	my $doc_db_hash = db_rec_to_hash($doc_db_rec);
	233
	234	# get child's children
	235	my $newchildren = &get_children($doc_db_hash);
	236
	237	# add content for current section
	238	add_section_content($doc_obj, "$parentsection.$child", $doc_db_hash);
	239
	240	# process all the children if there are any
	241	if (defined ($newchildren))
	242	{
	243	recurse_sections($doc_obj, $newchildren, "$parentoid.$child",
[15704]	244	"$parentsection.$child", $database_recs);
[11994]	245	}
	246	}
	247	}
	248
	249
	250	sub add_section_content {
	251	my ($doc_obj, $cursection, $doc_db_hash) = @_;
	252
[15703]	253	foreach my $key (keys %$doc_db_hash) {
[11994]	254	#don't need to store these metadata
	255	next if $key =~ /(thistype\|childtype\|contains\|docnum\|doctype\|classifytype)/i;
	256	# but do want things like hastxt and archivedir
	257	my @items = split /@/, $doc_db_hash->{$key};
[21564]	258	# metadata is all from the info database so should already be in utf8
[17288]	259	map {$doc_obj->add_utf8_metadata ($cursection, $key, $_); } @items;
[11994]	260
	261	}
	262	}
	263
	264
[15704]	265	# gets all the metadata from an infodb entry, and puts it into a hashref
[11994]	266	sub db_rec_to_hash {
	267
[21564]	268	my ($infodb_str_ref) = @_;
[11994]	269
	270	my $hashref = {};
	271
[21564]	272	my @entries = split(/\n/, $infodb_str_ref);
[15703]	273	foreach my $entry (@entries) {
[11994]	274	my($key, $value) = ($entry =~ /^<([^>])>(.?)$/ );
	275	$hashref->{$key} .= '@' if defined $hashref->{$key};
	276	$hashref->{$key} .= $value;
	277
	278	}
	279
	280	return $hashref;
	281	}
	282
	283
	284	sub reconstruct_doc_objs_metadata
	285	{
[15725]	286	my $infodb_type = shift(@_);
	287	my $infodb_file_path = shift(@_);
[20575]	288	my $database_recs = shift(@_);
[11994]	289
	290	# dig out top level doc sections
	291	my %top_sections = ();
[13068]	292	my %top_docnums = ();
[20575]	293	foreach my $key ( keys %$database_recs )
[11994]	294	{
[20575]	295	my $md_rec = $database_recs->{$key};
[11994]	296	my $md_hash = db_rec_to_hash($md_rec);
	297
	298	if ((defined $md_hash->{'doctype'}) && ($md_hash->{'doctype'} eq "doc")) {
	299	next if ($key =~ m/\./);
	300	$top_sections{$key} = $md_hash;
[13068]	301	$top_docnums{$key} = $md_hash->{'docnum'};
[11994]	302	}
	303	}
	304
[15704]	305	# for greenstone document objects based on metadata in database file
[11994]	306	my @all_docs = ();
[13068]	307	# we need to make sure the documents were processed in the same order as
	308	# before, so sort based on their docnums
	309	foreach my $oid ( sort { $top_docnums{$a} <=> $top_docnums{$b} } keys %top_sections )
[11994]	310	{
	311	my $doc_db_hash = $top_sections{$oid};
	312
	313	my $doc_obj = new doc();
	314	$doc_obj->set_OID($oid);
	315	my $top = $doc_obj->get_top_section();
	316	add_section_content ($doc_obj, $top, $doc_db_hash);
	317	my $children = &get_children($doc_db_hash);
[20575]	318	recurse_sections($doc_obj, $children, $oid, $top, $database_recs);
[11994]	319
	320	push(@all_docs,$doc_obj);
	321	}
	322
	323	return \@all_docs;
	324	}
	325
	326
	327
	328
	329
[214]	330	# classify_doc lets each of the classifiers classify a document
	331	sub classify_doc {
[23118]	332	my ($classifiers, $doc_obj) = @_;
[19772]	333
[15703]	334	foreach my $classobj (@$classifiers) {
[8220]	335	my $title = $classobj->{'title'};
[19772]	336
[23118]	337	$classobj->classify($doc_obj);
[214]	338	}
	339	}
	340
[15702]	341
[18520]	342	our $next_classify_num = 1;
	343
[214]	344	# output_classify_info outputs all the info needed for the classification
[15702]	345	# to the database
	346	sub output_classify_info
	347	{
[15725]	348	my ($classifiers, $infodb_type, $infodb_handle, $remove_empty_classifications, $gli) = @_;
[214]	349
[6332]	350	$gli = 0 unless defined $gli;
	351
[315]	352	# create a classification containing all the info
[15702]	353	my $classifyinfo = { 'classifyOID'=> 'browse',
	354	'contains' => [] };
[315]	355
	356	# get each of the classifications
[15703]	357	foreach my $classifier (@$classifiers)
[15702]	358	{
	359	my $classifier_info = $classifier->get_classify_info($gli);
	360	$classifier_info->{'classifyOID'} = "CL$next_classify_num" unless defined($classifier_info->{'classifyOID'});
	361	print STDERR "*** outputting information for classifier: $classifier_info->{'classifyOID'}\n";
[8220]	362
[15702]	363	push(@{$classifyinfo->{'contains'}}, $classifier_info);
[315]	364	$next_classify_num++;
[214]	365	}
	366
[15725]	367	&print_classify_info($infodb_type, $infodb_handle, $classifyinfo, "", $remove_empty_classifications);
[214]	368	}
	369
[831]	370
[15702]	371	sub print_classify_info
	372	{
[15725]	373	my ($infodb_type, $infodb_handle, $classifyinfo, $OID, $remove_empty_classifications) = @_;
[15702]	374
[315]	375	$OID =~ s/^\.+//; # just for good luck
[214]	376
[315]	377	# book information is printed elsewhere
	378	return if (defined ($classifyinfo->{'OID'}));
	379
	380	# don't want empty classifications
[8445]	381	return if (&check_contents ($classifyinfo, $remove_empty_classifications) == 0 && $remove_empty_classifications);
[315]	382
[8361]	383	$OID = $classifyinfo->{'classifyOID'} if defined ($classifyinfo->{'classifyOID'});
[15702]	384
	385	my %classify_infodb = ();
	386	$classify_infodb{"doctype"} = [ "classify" ];
	387	$classify_infodb{"hastxt"} = [ "0" ];
	388	$classify_infodb{"childtype"} = [ $classifyinfo->{'childtype'} ]
[8361]	389	if defined $classifyinfo->{'childtype'};
[15702]	390	$classify_infodb{"Title"} = [ $classifyinfo->{'Title'} ]
[8361]	391	if defined $classifyinfo->{'Title'};
[15702]	392	$classify_infodb{"numleafdocs"} = [ $classifyinfo->{'numleafdocs'} ]
[8361]	393	if defined $classifyinfo->{'numleafdocs'};
[15702]	394	$classify_infodb{"thistype"} = [ $classifyinfo->{'thistype'} ]
[8361]	395	if defined $classifyinfo->{'thistype'};
[15702]	396	$classify_infodb{"parameters"} = [ $classifyinfo->{'parameters'} ]
[8361]	397	if defined $classifyinfo->{'parameters'};
[15702]	398	$classify_infodb{"supportsmemberof"} = [ $classifyinfo->{'supportsmemberof'} ]
[8361]	399	if defined $classifyinfo->{'supportsmemberof'};
	400
[15702]	401	my $contains_text = "";
	402	my $mdoffset_text = "";
[8361]	403
	404	my $next_subOID = 1;
	405	my $first = 1;
[15702]	406	foreach my $tempinfo (@{$classifyinfo->{'contains'}}) {
[8361]	407	# empty contents were made undefined by clean_contents()
	408	next unless defined $tempinfo;
	409	if (!defined ($tempinfo->{'classifyOID'}) \|\|
	410	$tempinfo->{'classifyOID'} ne "oai") {
	411	$contains_text .= ";" unless $first;
	412	}
	413	$mdoffset_text .= ";" unless $first;
	414	$first = 0;
[315]	415
[16959]	416	if (defined ($tempinfo->{'classifyOID'}))
	417	{
	418	if ($tempinfo->{'classifyOID'} ne "oai")
	419	{
[8361]	420	$contains_text .= $tempinfo->{'classifyOID'};
[8275]	421	}
[12844]	422
[15725]	423	&print_classify_info ($infodb_type, $infodb_handle, $tempinfo, $tempinfo->{'classifyOID'},
[8361]	424	$remove_empty_classifications);
[16959]	425	}
	426	elsif (defined ($tempinfo->{'OID'}))
	427	{
[8361]	428	$contains_text .= $tempinfo->{'OID'};
[12844]	429	$mdoffset_text .= $tempinfo->{'offset'} if (defined ($tempinfo->{'offset'}));
[16959]	430	}
	431	else
	432	{
	433	# Supress having top-level node in Collage classifier
	434	# so no bookshelf icon appears, top-level, along with the
	435	# applet
	436	if (!defined ($tempinfo->{'Title'}) \|\| $tempinfo->{'Title'} ne "Collage")
	437	{
	438	$contains_text .= "\".$next_subOID";
	439	}
[12844]	440
[16959]	441	&print_classify_info ($infodb_type, $infodb_handle, $tempinfo, "$OID.$next_subOID",
	442	$remove_empty_classifications);
	443	$next_subOID++;
	444	}
[315]	445	}
[8361]	446
[15702]	447	$classify_infodb{"contains"} = [ $contains_text ];
	448	$classify_infodb{"mdtype"} = [ $classifyinfo->{'mdtype'} ]
[8361]	449	if defined $classifyinfo->{'mdtype'};
[15702]	450	$classify_infodb{"mdoffset"} = [ $mdoffset_text ]
	451	if ($mdoffset_text !~ m/^;+$/);
[8361]	452
[15725]	453	&dbutil::write_infodb_entry($infodb_type, $infodb_handle, $OID, \%classify_infodb);
[315]	454	}
	455
[15702]	456
[637]	457	sub check_contents {
[8445]	458	my ($classifyinfo,$remove_empty_classifications) = @_;
	459	$remove_empty_classifications = 0 unless ($remove_empty_classifications);
[637]	460	my $num_leaf_docs = 0;
	461	my $sub_num_leaf_docs = 0;
[315]	462
[637]	463	return $classifyinfo->{'numleafdocs'} if (defined $classifyinfo->{'numleafdocs'});
	464
[15703]	465	foreach my $content (@{$classifyinfo->{'contains'}}) {
[315]	466	if (defined $content->{'OID'}) {
	467	# found a book
[637]	468	$num_leaf_docs ++;
[9790]	469	} elsif (($sub_num_leaf_docs = &check_contents ($content,$remove_empty_classifications)) > 0) {
[315]	470	# there's a book somewhere below
[637]	471	$num_leaf_docs += $sub_num_leaf_docs;
[315]	472	} else {
[8445]	473	if ($remove_empty_classifications){
	474	# section contains no books so we want to remove
	475	# it from its parents contents
	476	$content = undef;
	477	}
[315]	478	}
	479	}
[637]	480
	481	$classifyinfo->{'numleafdocs'} = $num_leaf_docs;
	482	return $num_leaf_docs;
[315]	483	}
	484
[214]	485	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: