Context Navigation

source: gs3-extensions/solr/trunk/src/perllib/solrbuildproc.pm@ 24501

Last change on this file since 24501 was 24447, checked in by davidb, 13 years ago
Tidy up of code (removing commented out redundant code), plus tweaking of code that starts and stops jetty to cope with situation where the server is already running
File size: 15.9 KB

Rev	Line
[24446]	1	###########################################################################
	2	#
	3	# solrbuildproc.pm -- perl wrapper for building index with Solr
	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
	8	# Copyright (C) 1999 New Zealand Digital Library Project
	9	#
	10	# This program is free software; you can redistribute it and/or modify
	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
	25
	26	package solrbuildproc;
	27
	28	# This document processor outputs a document for solr to process
	29
	30	# Rather than use the XML structure developed for mgppbuilder/mgppbuildproc
	31	# whose use was then extended to Lucene, Solr has its own XML syntax:
	32	#
	33	# http://wiki.apache.org/solr/UpdateXmlMessages
	34	#
	35	# Using this means we don't need to write SolrWrapper.jar, as had to be
	36	# done for Lucene, translating the XML syntax piped to it into appropriate
	37	# calls to the Lucene API
	38
	39
	40	use lucenebuildproc;
	41	use ghtml;
	42	use strict;
	43	no strict 'refs'; # allow filehandles to be variables and viceversa
	44
	45
	46	use IncrementalBuildUtils;
	47
	48	sub BEGIN {
	49	@solrbuildproc::ISA = ('lucenebuildproc');
	50	}
	51
	52
	53	sub new {
	54	my $class = shift @_;
	55	my $self = new lucenebuildproc (@_);
	56
	57	return bless $self, $class;
	58	}
	59
	60
	61	#----
	62
	63	sub index_field_mapping_edit {
	64	my $self = shift (@_);
	65	my ($doc_obj,$file,$edit_mode) = @_;
	66
	67	# Only add/update gets to here
	68	# Currently there is no need to distinguish between these edit modes
	69
	70	my $outhandle = $self->{'outhandle'};
	71
	72	# only study this document if it is one to be indexed
	73	return if ($doc_obj->get_doc_type() ne "indexed_doc");
	74
	75	my $indexed_doc = $self->is_subcollection_doc($doc_obj);
	76
	77	# get the parameters for the output
	78	# split on : just in case there is subcoll and lang stuff
	79	my ($fields) = split (/:/, $self->{'index'});
	80
	81	my $doc_section = 0; # just for this document
	82
	83	# get the text for this document
	84	my $section = $doc_obj->get_top_section();
	85
	86	while (defined $section)
	87	{
	88	$doc_section++;
	89
	90	# if we are doing subcollections, then some docs shouldn't be
	91	# considered for indexing
	92
	93	my $indexed_section
	94	= $doc_obj->get_metadata_element($section, "gsdldoctype")
	95	\|\| "indexed_section";
	96
	97	if (($indexed_doc == 0)
	98	\|\| ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
	99	$section = $doc_obj->get_next_section($section);
	100	next;
	101	}
	102
	103	# has the user added a 'metadata' index?
	104	my $all_metadata_specified = 0;
	105
	106	# which fields have already been indexed?
	107	# (same as fields, but in a map)
	108	my $specified_fields = {};
	109
	110	# do we have an allfields index??
	111	my $allfields_index = 0;
	112
	113	# collect up all the text for it in here
	114	my $allfields_text = "";
	115
	116	foreach my $field (split (/;/, $fields)) {
	117	if ($field eq "allfields") {
	118	$allfields_index = 1;
	119	} elsif ($field eq "metadata") {
	120	$all_metadata_specified = 1;
	121	}
	122	}
	123
	124	foreach my $field (split (/;/, $fields)) {
	125
	126	# only deal with this field if it doesn't start with top or
	127	# this is the first section
	128	my $real_field = $field;
	129	next if (($real_field =~ s/^top//) && ($doc_section != 1));
	130
	131	# process these two later
	132	next if ($real_field eq "allfields" \|\| $real_field eq "metadata");
	133
	134	# individual metadata and or text specified
	135	# -- could be a comma separated list
	136	$specified_fields->{$real_field} = 1;
	137
	138	if (!defined $self->{'indexfieldmap'}->{$real_field}) {
	139	my $shortname = $self->create_shortname($real_field);
	140	$self->{'indexfieldmap'}->{$real_field} = $shortname;
	141	$self->{'indexfieldmap'}->{$shortname} = 1;
	142	}
	143	} # foreach field
	144
	145
	146	if ($all_metadata_specified) {
	147
	148	my $new_text = "";
	149	my $shortname = "";
	150	my $metadata = $doc_obj->get_all_metadata ($section);
	151
	152	foreach my $pair (@$metadata) {
	153	my ($mfield, $mvalue) = (@$pair);
	154
	155	# no value
	156	next unless defined $mvalue && $mvalue ne "";
	157
	158	# we have already indexed this
	159	next if defined ($specified_fields->{$mfield});
	160
	161	# check fields here, maybe others dont want - change to use dontindex!!
	162	next if ($mfield eq "Identifier" \|\| $mfield eq "classifytype" \|\| $mfield eq "assocfilepath");
	163	next if ($mfield =~ /^gsdl/);
	164
	165	if (defined $self->{'indexfieldmap'}->{$mfield}) {
	166	$shortname = $self->{'indexfieldmap'}->{$mfield};
	167	}
	168	else {
	169	$shortname = $self->create_shortname($mfield);
	170	$self->{'indexfieldmap'}->{$mfield} = $shortname;
	171	$self->{'indexfieldmap'}->{$shortname} = 1;
	172	}
	173
	174	if (!defined $self->{'indexfields'}->{$mfield}) {
	175	$self->{'indexfields'}->{$mfield} = 1;
	176	}
	177	}
	178	}
	179
	180	if ($allfields_index) {
	181	# add the index name mapping
	182	$self->{'indexfieldmap'}->{"allfields"} = "ZZ";
	183	$self->{'indexfieldmap'}->{"ZZ"} = 1;
	184	}
	185
	186	$section = $doc_obj->get_next_section($section);
	187
	188	} # while defined section
	189
	190
	191	}
	192
	193	sub index_field_mapping {
	194	my $self = shift (@_);
	195	my ($doc_obj,$file) = @_;
	196
	197	$self->index_field_mapping_edit($doc_obj,$file,"add");
	198	}
	199
	200	sub index_field_mappingreindex
	201	{
	202	my $self = shift (@_);
	203	my ($doc_obj,$file) = @_;
	204
	205	$self->index_field_mapping_edit($doc_obj,$file,"update");
	206	}
	207
	208	sub index_field_mappingdelete
	209	{
	210	my $self = shift (@_);
	211	my ($doc_obj,$file) = @_;
	212
	213	return; # nothing to be done
	214	}
	215
	216
	217	#----
	218
	219	sub textedit {
	220	my $self = shift (@_);
	221	my ($doc_obj,$file,$edit_mode) = @_;
	222
	223
	224	if (!$self->get_indexing_text()) {
	225	# In text-compress mode:
	226	# => want document to be output in the simple <Doc>..</Doc> as is
	227	# done by its super-class
	228	return $self->SUPER::textedit(@_);
	229	}
	230
	231	# "update" for $edit_mode near identical to "add" as we use Solr in its
	232	# default mode of replacing an existing document if the new document
	233	# has the same doc id. Main area of difference between "add" and "update"
	234	# is that we do not update our 'stats' for number of documents or number
	235	# of bytes processed. The latter is inaccurate, but considered better
	236	# than allowing the value to steadily climb.
	237
	238
	239	my $solrhandle = $self->{'output_handle'};
	240	my $outhandle = $self->{'outhandle'};
	241
	242	# only output this document if it is one to be indexed
	243	return if ($doc_obj->get_doc_type() ne "indexed_doc");
	244
	245	# skip this document if in "compress-text" mode and asked to delete it
	246	return if (!$self->get_indexing_text() && ($edit_mode eq "delete"));
	247
	248	my $indexed_doc = $self->is_subcollection_doc($doc_obj);
	249
	250	# this is another document
	251	if ($edit_mode eq "add") {
	252	$self->{'num_docs'} += 1;
	253	}
	254	elsif ($edit_mode eq "delete") {
	255	$self->{'num_docs'} -= 1;
	256	}
	257
	258	# get the parameters for the output
	259	# split on : just in case there is subcoll and lang stuff
	260	my ($fields) = split (/:/, $self->{'index'});
	261
	262	my $levels = $self->{'levels'};
	263	my $ldoc_level = $levels->{'document'};
	264	my $lsec_level = $levels->{'section'};
	265
	266	my $gs2_docOID = $doc_obj->get_OID();
	267
	268
	269	my $start_doc;
	270	my $end_doc;
	271
	272	if ($edit_mode eq "add") {
	273	$start_doc = " <add>\n";
	274	$start_doc .= " <doc>\n";
	275	$start_doc .= " <field name=\"docOID\">$gs2_docOID</field>\n";
	276
	277	$end_doc = " </doc>\n";
	278	$end_doc .= " </add>\n";
	279	}
	280	else {
	281	$start_doc = " <delete>\n";
	282	$start_doc .= " <id>$gs2_docOID</id>\n";
	283
	284	$end_doc = " </delete>\n";
	285	}
	286
	287	# add/update, delete
	288
	289	my $sec_tag_name = "";
	290	if ($lsec_level)
	291	{
	292	$sec_tag_name = $mgppbuildproc::level_map{'section'};
	293	}
	294
	295	my $doc_section = 0; # just for this document
	296
	297	# only output if working with doc level
	298	my $text = $start_doc if ($sec_tag_name eq "");
	299
	300	# get the text for this document
	301	my $section = $doc_obj->get_top_section();
	302
	303	while (defined $section)
	304	{
	305	# update a few statistics
	306	$doc_section++;
	307	$self->{'num_sections'}++;
	308
	309	my $sec_gs2_id = $self->{'num_sections'};
	310	my $sec_gs2_docOID = $gs2_docOID;
	311	$sec_gs2_docOID .= ".$section" if ($section ne "");
	312
	313	my $start_sec;
	314	my $end_sec;
	315
	316	if ($edit_mode eq "add") {
	317	$start_sec = " <add>\n";
	318	$start_sec .= " <doc>\n";
	319	$start_sec .= " <field name=\"docOID\">$sec_gs2_docOID</field>\n";
	320
	321	$end_sec = " </doc>\n";
	322	$end_sec .= " </add>\n";
	323	}
	324	else {
	325	$start_sec = " <delete>\n";
	326	$start_sec .= " <id>$sec_gs2_docOID</id>\n";
	327
	328	$end_sec = " </delete>\n";
	329	}
	330
	331
	332	# if we are doing subcollections, then some docs shouldn't be indexed.
	333	# but we need to put the section tag placeholders in there so the
	334	# sections match up with database
	335	my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") \|\| "indexed_section";
	336	if (($indexed_doc == 0) \|\| ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
	337	if ($sec_tag_name ne "") {
	338	$text .= $start_sec;
	339	$text .= $end_sec;
	340	}
	341	$section = $doc_obj->get_next_section($section);
	342	next;
	343	}
	344
	345	# add in start section tag if indexing at the section level
	346	$text .= $start_sec if ($sec_tag_name ne "");
	347
	348	if ($edit_mode eq "add") {
	349	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
	350	}
	351	elsif ($edit_mode eq "delete") {
	352	$self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
	353	}
	354
	355
	356	# has the user added a 'metadata' index?
	357	my $all_metadata_specified = 0;
	358	# which fields have already been indexed? (same as fields, but in a map)
	359	my $specified_fields = {};
	360
	361	# do we have an allfields index??
	362	my $allfields_index = 0;
	363	# collect up all the text for it in here
	364	my $allfields_text = "";
	365	foreach my $field (split (/;/, $fields)) {
	366	if ($field eq "allfields") {
	367	$allfields_index = 1;
	368	} elsif ($field eq "metadata") {
	369	$all_metadata_specified = 1;
	370	}
	371	}
	372
	373	foreach my $field (split (/;/, $fields)) {
	374
	375	# only deal with this field if it doesn't start with top or
	376	# this is the first section
	377	my $real_field = $field;
	378	next if (($real_field =~ s/^top//) && ($doc_section != 1));
	379
	380	# process these two later
	381	next if ($real_field eq "allfields" \|\| $real_field eq "metadata");
	382
	383	#individual metadata and or text specified - could be a comma separated list
	384	$specified_fields->{$real_field} = 1;
	385	my $shortname="";
	386	my $new_field = 0; # have we found a new field name?
	387	if (defined $self->{'indexfieldmap'}->{$real_field}) {
	388	$shortname = $self->{'indexfieldmap'}->{$real_field};
	389	}
	390	else {
	391	$shortname = $self->create_shortname($real_field);
	392	$new_field = 1;
	393	}
	394
	395	my @metadata_list = (); # put any metadata values in here
	396	my $section_text = ""; # put the text in here
	397	foreach my $submeta (split /,/, $real_field) {
	398	if ($submeta eq "text") {
	399	# no point in indexing text more than once
	400	if ($section_text eq "") {
	401	$section_text = $doc_obj->get_text($section);
	402	if ($self->{'indexing_text'}) {
	403	# we always strip html
	404	$section_text = $self->preprocess_text($section_text, 1, "");
	405	}
	406	else {
	407	# leave html stuff in, but escape the tags
	408	&ghtml::htmlsafe($section_text);
	409	}
	410	}
	411	}
	412	else {
	413	$submeta =~ s/^ex\.//; #strip off ex.
	414
	415	# its a metadata element
	416	my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
	417	if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
	418	if ($self->{'sections_index_document_metadata'} eq "always" \|\| ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
	419	push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
	420	}
	421	}
	422	push (@metadata_list, @section_metadata);
	423	}
	424	} # for each field in this one index
	425
	426	# now we add the text and/or metadata into new_text
	427	if ($section_text ne "" \|\| scalar(@metadata_list)) {
	428	my $new_text = "";
	429
	430	if ($section_text ne "") {
	431	$new_text .= "$section_text ";
	432	}
	433
	434	foreach my $item (@metadata_list) {
	435	&ghtml::htmlsafe($item);
	436	$new_text .= "$item ";
	437	}
	438
	439	if ($allfields_index) {
	440	$allfields_text .= $new_text;
	441	}
	442
	443	if ($self->{'indexing_text'}) {
	444	# add the tag
	445	$new_text = "<field name=\"$shortname\" >$new_text</field>\n";
	446	}
	447	# filter the text
	448	$new_text = $self->filter_text ($field, $new_text);
	449
	450	if ($edit_mode eq "add") {
	451	$self->{'num_processed_bytes'} += length ($new_text);
	452	$text .= "$new_text";
	453	}
	454	elsif ($edit_mode eq "update") {
	455	$text .= "$new_text";
	456	}
	457	elsif ($edit_mode eq "delete") {
	458	$self->{'num_processed_bytes'} -= length ($new_text);
	459	}
	460
	461
	462	if ($self->{'indexing_text'} && $new_field) {
	463	# we need to add to the list in indexfields
	464
	465	$self->{'indexfieldmap'}->{$real_field} = $shortname;
	466	$self->{'indexfieldmap'}->{$shortname} = 1;
	467	}
	468
	469	}
	470
	471	} # foreach field
	472
	473
	474	if ($all_metadata_specified) {
	475
	476	my $new_text = "";
	477	my $shortname = "";
	478	my $metadata = $doc_obj->get_all_metadata ($section);
	479	foreach my $pair (@$metadata) {
	480	my ($mfield, $mvalue) = (@$pair);
	481
	482	# no value
	483	next unless defined $mvalue && $mvalue ne "";
	484
	485	# we have already indexed this
	486	next if defined ($specified_fields->{$mfield});
	487
	488	# check fields here, maybe others dont want - change to use dontindex!!
	489	next if ($mfield eq "Identifier" \|\| $mfield eq "classifytype" \|\| $mfield eq "assocfilepath");
	490	next if ($mfield =~ /^gsdl/);
	491
	492	&ghtml::htmlsafe($mvalue);
	493
	494	if (defined $self->{'indexfieldmap'}->{$mfield}) {
	495	$shortname = $self->{'indexfieldmap'}->{$mfield};
	496	}
	497	else {
	498	$shortname = $self->create_shortname($mfield);
	499	$self->{'indexfieldmap'}->{$mfield} = $shortname;
	500	$self->{'indexfieldmap'}->{$shortname} = 1;
	501	}
	502	$new_text .= "<field name=\"$shortname\">$mvalue</field>\n";
	503	if ($allfields_index) {
	504	$allfields_text .= "$mvalue ";
	505	}
	506
	507	if (!defined $self->{'indexfields'}->{$mfield}) {
	508	$self->{'indexfields'}->{$mfield} = 1;
	509	}
	510
	511	}
	512	# filter the text
	513	$new_text = $self->filter_text ("metadata", $new_text);
	514
	515	if ($edit_mode eq "add") {
	516	$self->{'num_processed_bytes'} += length ($new_text);
	517	$text .= "$new_text";
	518	}
	519	elsif ($edit_mode eq "update") {
	520	$text .= "$new_text";
	521	}
	522	elsif ($edit_mode eq "delete") {
	523	$self->{'num_processed_bytes'} -= length ($new_text);
	524	}
	525	}
	526
	527	if ($allfields_index) {
	528	# add the index name mapping
	529	$self->{'indexfieldmap'}->{"allfields"} = "ZZ";
	530	$self->{'indexfieldmap'}->{"ZZ"} = 1;
	531
	532	my $new_text = "<field name=\"ZZ\">$allfields_text</field>\n";
	533	# filter the text
	534	$new_text = $self->filter_text ("allfields", $new_text);
	535
	536	if ($edit_mode eq "add") {
	537	$self->{'num_processed_bytes'} += length ($new_text);
	538	$text .= "$new_text";
	539	}
	540	elsif ($edit_mode eq "update") {
	541	$text .= "$new_text";
	542	}
	543	elsif ($edit_mode eq "delete") {
	544	$self->{'num_processed_bytes'} -= length ($new_text);
	545	}
	546	}
	547
	548	# add in end tag if at top-level doc root, or indexing at the section level
	549	$text .= $end_sec if ($sec_tag_name ne "");
	550
	551	$section = $doc_obj->get_next_section($section);
	552	} # while defined section
	553
	554
	555	# only output if working with doc level
	556	$text .= $end_doc if ($sec_tag_name eq "");
	557
	558	## $text .= "<commit/>\n";
	559
	560	print $solrhandle $text;
	561
	562	}
	563
	564
	565
	566
	567	sub textreindex
	568	{
	569	my $self = shift (@_);
	570	my ($doc_obj,$file) = @_;
	571
	572	$self->textedit($doc_obj,$file,"update");
	573	}
	574
	575
	576	1;
	577
	578

Note: See TracBrowser for help on using the repository browser.

Download in other formats: