- Timestamp:
- 2024-03-19T16:02:07+13:00 (3 months ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/plugouts/OpenAIGPTsPlugout.pm
r38750 r38851 79 79 } 80 80 81 # For now, don't allow groups82 if ($self->{'group_size'} > 1) {83 print STDERR "OpenAIGPTsPlugout: Has not been tested with groups\n";84 print STDERR "=> Reverting group_size back to 1\n";85 86 $self->{'group_size'} = 1;87 }88 89 81 return bless $self, $class; 90 82 } … … 102 94 } 103 95 96 # Note: This type of grouping (into dirs) is different to the base 97 # classes group_size. 'group_size' is about concatenating multiple 98 # Greenstone documents into one file (useful for small docs/records 99 # such as MARC). The 'grouped_into_dirs' variables provide 100 # an ability to multiple, separate documents, saved into the same 101 # archives/export folder. 102 103 my $grouped_into_dirs_root = "group-"; 104 my $grouped_into_dirs_doc_count = 0; 105 106 my $max_docs_per_grouped_dir = 10; 107 my $grouped_into_dirs_group_count = 0; 108 109 104 110 sub get_new_doc_dir 105 111 { … … 107 113 my($working_info,$working_dir,$OID) = @_; 108 114 109 # A slimmed down version of BasePlugout::get_new_doc_dir() 110 # which creates a flat file structure, rather then nested 115 my $doc_dir; 111 116 112 my $doc_dir_rest = $OID; 113 114 # remove any \ and / from the OID 115 $doc_dir_rest =~ s/[\\\/]//g; 116 117 # Remove ":" if we are on Windows OS, as otherwise they get confused with the drive letters 118 if ($ENV{'GSDLOS'} =~ /^windows$/i) 119 { 120 $doc_dir_rest =~ s/\://g; 121 } 122 123 my $doc_dir = $doc_dir_rest; 124 117 if (defined $grouped_into_dirs_root) { 118 $grouped_into_dirs_doc_count++; 119 120 if (($grouped_into_dirs_doc_count % $max_docs_per_grouped_dir) == 0) { 121 $grouped_into_dirs_group_count++; 122 } 123 $doc_dir = sprintf("${grouped_into_dirs_root}%04d", $grouped_into_dirs_group_count); 124 } 125 else { 126 127 # A slimmed down version of BasePlugout::get_new_doc_dir() 128 # which creates a flat file structure, rather then nested 129 130 my $doc_dir_rest = $OID; 131 132 # remove any \ and / from the OID 133 $doc_dir_rest =~ s/[\\\/]//g; 
134 135 # Remove ":" if we are on Windows OS, as otherwise they get confused with the drive letters 136 if ($ENV{'GSDLOS'} =~ /^windows$/i) 137 { 138 $doc_dir_rest =~ s/\://g; 139 } 140 141 $doc_dir = $doc_dir_rest; 142 } 143 125 144 my $created_directory = 0; 126 145 … … 131 150 $created_directory = 1; 132 151 } 133 152 else { 153 $created_directory = 1; 154 } 134 155 135 # in theory this should never happen136 156 if (!$created_directory) 137 157 { … … 144 164 145 165 166 sub get_group_doc_dir { 167 my $self = shift (@_); 168 my ($doc_obj) = @_; 169 170 # If this Plugout is being used to with grouped_into_dirs, then 171 # how get_group_dor_dir() needs to operate is different. In fact 172 # it is simpler than the super-class implementation, because (due 173 # to the prefix manipulation of gsdlassocfiles) it is safe for 174 # associated files to be saved in the same directory as other 175 # documents. 176 177 my $doc_dir = undef; 178 179 if (defined $grouped_into_dirs_root) { 180 181 my $outhandle = $self->{'output_handle'}; 182 my $OID = $doc_obj->get_OID(); 183 $OID = "NULL" unless defined $OID; 184 185 my $groupsize = $self->{'group_size'}; 186 my $gs_count = $self->{'gs_count'}; 187 188 my $open_new_file = (($gs_count % $groupsize)==0); 189 190 # opening a new file 191 if (($open_new_file) || !defined($self->{'gs_doc_dir'})) { 192 # first we close off the old output 193 if ($gs_count>0) 194 { 195 return if (!$self->close_group_output()); 196 } 197 198 # this will create the directory 199 $doc_dir = $self->get_doc_dir ($doc_obj); 200 $self->{'new_doc_dir'} = 1; 201 $self->{'gs_doc_dir'} = $doc_dir; 202 $self->{'group_position'} = 1; 203 } 204 else { 205 $doc_dir = $self->{'gs_doc_dir'}; 206 $self->{'new_doc_dir'} = 0; 207 } 208 209 } 210 else { 211 $doc_dir = $self->SUPER::get_group_doc_dir(); 212 } 213 214 return $doc_dir; 215 } 216 217 218 219 146 220 sub recursive_process_section_content 147 221 { … … 158 232 $section_ptr->{'text'} = $text; 159 233 234 # Turn 
into text 235 $text =~ s/<style[^>]*>.*?<\/style>//si; 236 $text =~ s/<[^>]*>/ /g; 237 $text =~ s/\s+ / /mg; 238 239 my $gsdoc_marker = "<span fromGSDocId=\"$oid\"></span>"; 240 $text =~ s/((?:[^\s]+\s*){10})/$1$gsdoc_marker/sg; 241 242 243 $section_ptr->{'text'} = "<div gsdocid=\"$oid\">$gsdoc_marker$text$gsdoc_marker</div>"; 244 245 # print STDERR "*** text = $text\n"; 246 160 247 # work through all the sub-sections 161 248 foreach my $subsection (@{$section_ptr->{'subsection_order'}}) { … … 215 302 # print $outhandle "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/strict.dtd\">\n"; 216 303 #print $outhandle "<html xmlns=\"http://www.w3.org/TR/xhtml1/strict\">\n"; 304 217 305 print $outhandle "<Archive>\n"; 218 306 } … … 222 310 my ($outhandle) = @_; 223 311 224 #print $outhandle "</html>\n";225 312 print $outhandle "</Archive>\n"; 226 313 }
Note: See TracChangeset for help on using the changeset viewer.