Context Navigation

source: gs3-extensions/solr/trunk/src/perllib/solrbuildproc.pm@ 27802

Last change on this file since 27802 was 27802, checked in by kjdon, 11 years ago
adding in code for sort fields. just copied from lucene build code
File size: 18.1 KB

Rev	Line
[24446]	1	###########################################################################
	2	#
	3	# solrbuildproc.pm -- perl wrapper for building index with Solr
	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
	8	# Copyright (C) 1999 New Zealand Digital Library Project
	9	#
	10	# This program is free software; you can redistribute it and/or modify
	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
	25
	26	package solrbuildproc;
	27
	28	# This document processor outputs a document for solr to process
	29
	30	# Rather than use the XML structure developed for mgppbuilder/mgppbuildproc
	31	# whose use was then extended to Lucene, Solr has its own XML syntax:
	32	#
	33	# http://wiki.apache.org/solr/UpdateXmlMessages
	34	#
	35	# Using this means we don't need to write SolrWrapper.jar, as had to be
	36	# done for Lucene, translating the XML syntax piped to it into appropriate
	37	# calls to the Lucene API
	38
	39
	40	use lucenebuildproc;
	41	use ghtml;
	42	use strict;
	43	no strict 'refs'; # allow filehandles to be variables and viceversa
	44
	45
	46	use IncrementalBuildUtils;
	47
	48	sub BEGIN {
	49	@solrbuildproc::ISA = ('lucenebuildproc');
	50	}
	51
	52
	53	sub new {
	54	my $class = shift @_;
	55	my $self = new lucenebuildproc (@_);
	56
	57	return bless $self, $class;
	58	}
	59
	60
	61	#----
	62
	63	sub index_field_mapping_edit {
	64	my $self = shift (@_);
	65	my ($doc_obj,$file,$edit_mode) = @_;
	66
	67	# Only add/update gets to here
	68	# Currently there is no need to distinguish between these edit modes
	69
	70	my $outhandle = $self->{'outhandle'};
	71
	72	# only study this document if it is one to be indexed
	73	return if ($doc_obj->get_doc_type() ne "indexed_doc");
	74
	75	my $indexed_doc = $self->is_subcollection_doc($doc_obj);
	76
	77	# get the parameters for the output
	78	# split on : just in case there is subcoll and lang stuff
	79	my ($fields) = split (/:/, $self->{'index'});
	80
	81	my $doc_section = 0; # just for this document
	82
	83	# get the text for this document
	84	my $section = $doc_obj->get_top_section();
	85
	86	while (defined $section)
	87	{
	88	$doc_section++;
	89
	90	# if we are doing subcollections, then some docs shouldn't be
	91	# considered for indexing
	92
	93	my $indexed_section
	94	= $doc_obj->get_metadata_element($section, "gsdldoctype")
	95	\|\| "indexed_section";
	96
	97	if (($indexed_doc == 0)
	98	\|\| ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
	99	$section = $doc_obj->get_next_section($section);
	100	next;
	101	}
	102
	103	# has the user added a 'metadata' index?
	104	my $all_metadata_specified = 0;
	105
	106	# which fields have already been indexed?
	107	# (same as fields, but in a map)
	108	my $specified_fields = {};
	109
	110	# do we have an allfields index??
	111	my $allfields_index = 0;
	112
	113	# collect up all the text for it in here
	114	my $allfields_text = "";
	115
	116	foreach my $field (split (/;/, $fields)) {
	117	if ($field eq "allfields") {
	118	$allfields_index = 1;
	119	} elsif ($field eq "metadata") {
	120	$all_metadata_specified = 1;
	121	}
	122	}
	123
	124	foreach my $field (split (/;/, $fields)) {
	125
	126	# only deal with this field if it doesn't start with top or
	127	# this is the first section
	128	my $real_field = $field;
	129	next if (($real_field =~ s/^top//) && ($doc_section != 1));
	130
	131	# process these two later
	132	next if ($real_field eq "allfields" \|\| $real_field eq "metadata");
	133
	134	# individual metadata and or text specified
	135	# -- could be a comma separated list
	136	$specified_fields->{$real_field} = 1;
	137
	138	if (!defined $self->{'indexfieldmap'}->{$real_field}) {
	139	my $shortname = $self->create_shortname($real_field);
	140	$self->{'indexfieldmap'}->{$real_field} = $shortname;
	141	$self->{'indexfieldmap'}->{$shortname} = 1;
	142	}
	143	} # foreach field
	144
	145
	146	if ($all_metadata_specified) {
	147
	148	my $new_text = "";
	149	my $shortname = "";
	150	my $metadata = $doc_obj->get_all_metadata ($section);
	151
	152	foreach my $pair (@$metadata) {
	153	my ($mfield, $mvalue) = (@$pair);
	154
	155	# no value
	156	next unless defined $mvalue && $mvalue ne "";
	157
	158	# we have already indexed this
	159	next if defined ($specified_fields->{$mfield});
	160
	161	# check fields here, maybe others dont want - change to use dontindex!!
	162	next if ($mfield eq "Identifier" \|\| $mfield eq "classifytype" \|\| $mfield eq "assocfilepath");
	163	next if ($mfield =~ /^gsdl/);
	164
	165	if (defined $self->{'indexfieldmap'}->{$mfield}) {
	166	$shortname = $self->{'indexfieldmap'}->{$mfield};
	167	}
	168	else {
	169	$shortname = $self->create_shortname($mfield);
	170	$self->{'indexfieldmap'}->{$mfield} = $shortname;
	171	$self->{'indexfieldmap'}->{$shortname} = 1;
	172	}
	173
	174	if (!defined $self->{'indexfields'}->{$mfield}) {
	175	$self->{'indexfields'}->{$mfield} = 1;
	176	}
	177	}
	178	}
	179
	180	if ($allfields_index) {
	181	# add the index name mapping
	182	$self->{'indexfieldmap'}->{"allfields"} = "ZZ";
	183	$self->{'indexfieldmap'}->{"ZZ"} = 1;
	184	}
	185
	186	$section = $doc_obj->get_next_section($section);
	187
	188	} # while defined section
	189
	190
	191	}
	192
	193	sub index_field_mapping {
	194	my $self = shift (@_);
	195	my ($doc_obj,$file) = @_;
	196
	197	$self->index_field_mapping_edit($doc_obj,$file,"add");
	198	}
	199
	200	sub index_field_mappingreindex
	201	{
	202	my $self = shift (@_);
	203	my ($doc_obj,$file) = @_;
	204
	205	$self->index_field_mapping_edit($doc_obj,$file,"update");
	206	}
	207
	208	sub index_field_mappingdelete
	209	{
	210	my $self = shift (@_);
	211	my ($doc_obj,$file) = @_;
	212
	213	return; # nothing to be done
	214	}
	215
	216
	217	#----
	218
	219	sub textedit {
	220	my $self = shift (@_);
	221	my ($doc_obj,$file,$edit_mode) = @_;
	222
	223
	224	if (!$self->get_indexing_text()) {
	225	# In text-compress mode:
	226	# => want document to be output in the simple <Doc>..</Doc> as is
	227	# done by its super-class
	228	return $self->SUPER::textedit(@_);
	229	}
	230
	231	# "update" for $edit_mode near identical to "add" as we use Solr in its
	232	# default mode of replacing an existing document if the new document
	233	# has the same doc id. Main area of difference between "add" and "update"
	234	# is that we do not update our 'stats' for number of documents or number
	235	# of bytes processed. The latter is inaccurate, but considered better
	236	# than allowing the value to steadily climb.
	237
	238
	239	my $solrhandle = $self->{'output_handle'};
	240	my $outhandle = $self->{'outhandle'};
	241
	242	# only output this document if it is one to be indexed
	243	return if ($doc_obj->get_doc_type() ne "indexed_doc");
	244
	245	# skip this document if in "compress-text" mode and asked to delete it
	246	return if (!$self->get_indexing_text() && ($edit_mode eq "delete"));
	247
	248	my $indexed_doc = $self->is_subcollection_doc($doc_obj);
	249
	250	# this is another document
	251	if ($edit_mode eq "add") {
	252	$self->{'num_docs'} += 1;
	253	}
	254	elsif ($edit_mode eq "delete") {
	255	$self->{'num_docs'} -= 1;
	256	}
	257
	258	# get the parameters for the output
	259	# split on : just in case there is subcoll and lang stuff
	260	my ($fields) = split (/:/, $self->{'index'});
	261
	262	my $levels = $self->{'levels'};
	263	my $ldoc_level = $levels->{'document'};
	264	my $lsec_level = $levels->{'section'};
	265
	266	my $gs2_docOID = $doc_obj->get_OID();
	267
	268	my $start_doc;
	269	my $end_doc;
	270
	271	if ($edit_mode eq "add") {
	272	$start_doc = " <add>\n";
	273	$start_doc .= " <doc>\n";
	274	$start_doc .= " <field name=\"docOID\">$gs2_docOID</field>\n";
	275
	276	$end_doc = " </doc>\n";
	277	$end_doc .= " </add>\n";
	278	}
	279	else {
	280	$start_doc = " <delete>\n";
	281	$start_doc .= " <id>$gs2_docOID</id>\n";
	282
	283	$end_doc = " </delete>\n";
	284	}
	285
	286	# add/update, delete
	287
	288	my $sec_tag_name = "";
	289	if ($lsec_level)
	290	{
	291	$sec_tag_name = $mgppbuildproc::level_map{'section'};
	292	}
	293
	294	my $doc_section = 0; # just for this document
	295
	296	# only output if working with doc level
[25846]	297	# my $text = undef;
	298
	299	my $text = ($sec_tag_name eq "") ? $start_doc : "";
[24446]	300
[25846]	301	# my $text = $start_doc if ($sec_tag_name eq "");
	302
[24446]	303	# get the text for this document
	304	my $section = $doc_obj->get_top_section();
	305
	306	while (defined $section)
	307	{
	308	# update a few statistics
	309	$doc_section++;
	310	$self->{'num_sections'}++;
	311
	312	my $sec_gs2_id = $self->{'num_sections'};
	313	my $sec_gs2_docOID = $gs2_docOID;
	314	$sec_gs2_docOID .= ".$section" if ($section ne "");
	315
	316	my $start_sec;
	317	my $end_sec;
	318
	319	if ($edit_mode eq "add") {
	320	$start_sec = " <add>\n";
	321	$start_sec .= " <doc>\n";
	322	$start_sec .= " <field name=\"docOID\">$sec_gs2_docOID</field>\n";
[25846]	323
[24446]	324	$end_sec = " </doc>\n";
	325	$end_sec .= " </add>\n";
	326	}
	327	else {
	328	$start_sec = " <delete>\n";
	329	$start_sec .= " <id>$sec_gs2_docOID</id>\n";
	330
	331	$end_sec = " </delete>\n";
	332	}
	333
	334
	335	# if we are doing subcollections, then some docs shouldn't be indexed.
	336	# but we need to put the section tag placeholders in there so the
	337	# sections match up with database
	338	my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") \|\| "indexed_section";
	339	if (($indexed_doc == 0) \|\| ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
	340	if ($sec_tag_name ne "") {
	341	$text .= $start_sec;
	342	$text .= $end_sec;
	343	}
	344	$section = $doc_obj->get_next_section($section);
	345	next;
	346	}
	347
	348	# add in start section tag if indexing at the section level
	349	$text .= $start_sec if ($sec_tag_name ne "");
	350
	351	if ($edit_mode eq "add") {
	352	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
	353	}
	354	elsif ($edit_mode eq "delete") {
	355	$self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
	356	}
	357
	358
	359	# has the user added a 'metadata' index?
	360	my $all_metadata_specified = 0;
	361	# which fields have already been indexed? (same as fields, but in a map)
	362	my $specified_fields = {};
	363
	364	# do we have an allfields index??
	365	my $allfields_index = 0;
	366	# collect up all the text for it in here
	367	my $allfields_text = "";
	368	foreach my $field (split (/;/, $fields)) {
	369	if ($field eq "allfields") {
	370	$allfields_index = 1;
	371	} elsif ($field eq "metadata") {
	372	$all_metadata_specified = 1;
	373	}
	374	}
	375
	376	foreach my $field (split (/;/, $fields)) {
	377
	378	# only deal with this field if it doesn't start with top or
	379	# this is the first section
	380	my $real_field = $field;
	381	next if (($real_field =~ s/^top//) && ($doc_section != 1));
	382
	383	# process these two later
	384	next if ($real_field eq "allfields" \|\| $real_field eq "metadata");
	385
	386	#individual metadata and or text specified - could be a comma separated list
	387	$specified_fields->{$real_field} = 1;
	388	my $shortname="";
	389	my $new_field = 0; # have we found a new field name?
	390	if (defined $self->{'indexfieldmap'}->{$real_field}) {
	391	$shortname = $self->{'indexfieldmap'}->{$real_field};
	392	}
	393	else {
	394	$shortname = $self->create_shortname($real_field);
	395	$new_field = 1;
	396	}
	397
	398	my @metadata_list = (); # put any metadata values in here
	399	my $section_text = ""; # put the text in here
	400	foreach my $submeta (split /,/, $real_field) {
	401	if ($submeta eq "text") {
	402	# no point in indexing text more than once
	403	if ($section_text eq "") {
	404	$section_text = $doc_obj->get_text($section);
	405	if ($self->{'indexing_text'}) {
	406	# we always strip html
	407	$section_text = $self->preprocess_text($section_text, 1, "");
	408	}
	409	else {
	410	# leave html stuff in, but escape the tags
	411	&ghtml::htmlsafe($section_text);
	412	}
	413	}
	414	}
	415	else {
	416	$submeta =~ s/^ex\.//; #strip off ex.
	417
	418	# its a metadata element
	419	my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
	420	if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
	421	if ($self->{'sections_index_document_metadata'} eq "always" \|\| ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
	422	push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
	423	}
	424	}
	425	push (@metadata_list, @section_metadata);
	426	}
	427	} # for each field in this one index
	428
	429	# now we add the text and/or metadata into new_text
	430	if ($section_text ne "" \|\| scalar(@metadata_list)) {
	431	my $new_text = "";
	432
	433	if ($section_text ne "") {
	434	$new_text .= "$section_text ";
	435	}
	436
	437	foreach my $item (@metadata_list) {
	438	&ghtml::htmlsafe($item);
	439	$new_text .= "$item ";
	440	}
	441
	442	if ($allfields_index) {
	443	$allfields_text .= $new_text;
	444	}
	445
[25846]	446	# Remove any leading or trailing white space
	447	$new_text =~ s/\s+$//;
	448	$new_text =~ s/^\s+//;
	449
	450
[24446]	451	if ($self->{'indexing_text'}) {
	452	# add the tag
	453	$new_text = "<field name=\"$shortname\" >$new_text</field>\n";
	454	}
	455	# filter the text
	456	$new_text = $self->filter_text ($field, $new_text);
	457
	458	if ($edit_mode eq "add") {
	459	$self->{'num_processed_bytes'} += length ($new_text);
	460	$text .= "$new_text";
	461	}
	462	elsif ($edit_mode eq "update") {
	463	$text .= "$new_text";
	464	}
	465	elsif ($edit_mode eq "delete") {
	466	$self->{'num_processed_bytes'} -= length ($new_text);
	467	}
	468
	469
	470	if ($self->{'indexing_text'} && $new_field) {
	471	# we need to add to the list in indexfields
	472
	473	$self->{'indexfieldmap'}->{$real_field} = $shortname;
	474	$self->{'indexfieldmap'}->{$shortname} = 1;
	475	}
	476
	477	}
	478
	479	} # foreach field
	480
	481
	482	if ($all_metadata_specified) {
	483
	484	my $new_text = "";
	485	my $shortname = "";
	486	my $metadata = $doc_obj->get_all_metadata ($section);
	487	foreach my $pair (@$metadata) {
	488	my ($mfield, $mvalue) = (@$pair);
	489
	490	# no value
	491	next unless defined $mvalue && $mvalue ne "";
	492
	493	# we have already indexed this
	494	next if defined ($specified_fields->{$mfield});
	495
	496	# check fields here, maybe others dont want - change to use dontindex!!
	497	next if ($mfield eq "Identifier" \|\| $mfield eq "classifytype" \|\| $mfield eq "assocfilepath");
	498	next if ($mfield =~ /^gsdl/);
	499
	500	&ghtml::htmlsafe($mvalue);
	501
	502	if (defined $self->{'indexfieldmap'}->{$mfield}) {
	503	$shortname = $self->{'indexfieldmap'}->{$mfield};
	504	}
	505	else {
	506	$shortname = $self->create_shortname($mfield);
	507	$self->{'indexfieldmap'}->{$mfield} = $shortname;
	508	$self->{'indexfieldmap'}->{$shortname} = 1;
	509	}
	510	$new_text .= "<field name=\"$shortname\">$mvalue</field>\n";
	511	if ($allfields_index) {
	512	$allfields_text .= "$mvalue ";
	513	}
	514
	515	if (!defined $self->{'indexfields'}->{$mfield}) {
	516	$self->{'indexfields'}->{$mfield} = 1;
	517	}
	518
	519	}
	520	# filter the text
	521	$new_text = $self->filter_text ("metadata", $new_text);
	522
	523	if ($edit_mode eq "add") {
	524	$self->{'num_processed_bytes'} += length ($new_text);
	525	$text .= "$new_text";
	526	}
	527	elsif ($edit_mode eq "update") {
	528	$text .= "$new_text";
	529	}
	530	elsif ($edit_mode eq "delete") {
	531	$self->{'num_processed_bytes'} -= length ($new_text);
	532	}
	533	}
	534
	535	if ($allfields_index) {
	536	# add the index name mapping
	537	$self->{'indexfieldmap'}->{"allfields"} = "ZZ";
	538	$self->{'indexfieldmap'}->{"ZZ"} = 1;
	539
	540	my $new_text = "<field name=\"ZZ\">$allfields_text</field>\n";
	541	# filter the text
	542	$new_text = $self->filter_text ("allfields", $new_text);
	543
	544	if ($edit_mode eq "add") {
	545	$self->{'num_processed_bytes'} += length ($new_text);
	546	$text .= "$new_text";
	547	}
	548	elsif ($edit_mode eq "update") {
	549	$text .= "$new_text";
	550	}
	551	elsif ($edit_mode eq "delete") {
	552	$self->{'num_processed_bytes'} -= length ($new_text);
	553	}
	554	}
	555
[27802]	556	# only add sort fields for this section if we are indexing this section, we are doing section level indexing or this is the top section
	557	if ($self->{'indexing_text'} && ($sec_tag_name ne "" \|\| $doc_section == 1 )) {
	558	# add sort fields if there are any
	559
	560	foreach my $sfield (@{$self->{'sortfields'}}) {
	561	# ignore special field rank
	562	next if $sfield eq "rank";
	563	my $sf_shortname;
	564	if (defined $self->{'sortfieldnamemap'}->{$sfield}) {
	565	$sf_shortname = $self->{'sortfieldnamemap'}->{$sfield};
	566	}
	567	else {
	568	$sf_shortname = $self->create_sortfield_shortname($sfield);
	569	$self->{'sortfieldnamemap'}->{$sfield} = $sf_shortname;
	570	$self->{'sortfieldnamemap'}->{$sf_shortname} = 1;
	571	}
	572	my @metadata_list = (); # put any metadata values in here
	573	foreach my $submeta (split /,/, $sfield) {
	574	$submeta =~ s/^ex\.([^.]+)$/$1/; #strip off ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)
	575
	576	my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
	577	if ($section ne $doc_obj->get_top_section() && defined ($self->{'sections_sort_on_document_metadata'})) {
	578	if ($self->{'sections_sort_on_document_metadata'} eq "always" \|\| ( scalar(@section_metadata) == 0 && $self->{'sections_sort_on_document_metadata'} eq "unless_section_metadata_exists")) {
	579	push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
	580	}
	581	}
	582	push (@metadata_list, @section_metadata);
	583	}
	584	my $new_text = "";
	585	foreach my $item (@metadata_list) {
	586	&ghtml::htmlsafe($item);
	587	$new_text .= "$item";
	588	}
	589	if ($new_text =~ /\S/) {
	590	#$new_text = "<$sf_shortname index=\"1\" tokenize=\"0\">$new_text</$sf_shortname>";
	591	$new_text = "<field name=\"$sf_shortname\">$new_text</field>\n";
	592	# filter the text???
	593	$text .= "$new_text"; # add it to the main text block
	594	print STDERR "adding in sort text $new_text\n";
	595	$self->{'actualsortfields'}->{$sfield} = 1;
	596	}
	597	}
	598	}
	599
[24446]	600	# add in end tag if at top-level doc root, or indexing at the section level
	601	$text .= $end_sec if ($sec_tag_name ne "");
	602
	603	$section = $doc_obj->get_next_section($section);
	604	} # while defined section
	605
	606
	607	# only output if working with doc level
	608	$text .= $end_doc if ($sec_tag_name eq "");
	609
	610	## $text .= "<commit/>\n";
[27802]	611	open (TEXTOUT, ">text.out");
	612	print TEXTOUT "$text";
	613	close TEXTOUT;
[24446]	614
	615	print $solrhandle $text;
	616
	617	}
	618
	619
	620
	621
	622	sub textreindex
	623	{
	624	my $self = shift (@_);
	625	my ($doc_obj,$file) = @_;
	626
	627	$self->textedit($doc_obj,$file,"update");
	628	}
	629
	630
	631	1;
	632
	633

Note: See TracBrowser for help on using the repository browser.

Download in other formats: