Ignore:
Timestamp:
2024-03-19T16:02:07+13:00 (3 months ago)
Author:
davidb
Message:

General upgrade to support exporting files suitable for inclusion in OpenAI's Assistants API

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugouts/OpenAIGPTsPlugout.pm

    r38750 r38851  
    7979    }
    8080
    81     # For now, don't allow groups
    82     if ($self->{'group_size'} > 1) {
    83     print STDERR "OpenAIGPTsPlugout: Has not been tested with groups\n";
    84     print STDERR "=> Reverting group_size back to 1\n";
    85    
    86     $self->{'group_size'} = 1;
    87     }
    88    
    8981    return bless $self, $class;
    9082}
     
    10294}
    10395
     96# Note: This type of grouping (into dirs) is different from the base
     97# class's 'group_size'. 'group_size' is about concatenating multiple
     98# Greenstone documents into one file (useful for small docs/records
     99# such as MARC).  The 'grouped_into_dirs' variables provide
     100# the ability to save multiple, separate documents into the same
     101# archives/export folder.
     102
     103my $grouped_into_dirs_root = "group-";
     104my $grouped_into_dirs_doc_count = 0;
     105
     106my $max_docs_per_grouped_dir = 10;
     107my $grouped_into_dirs_group_count = 0;
     108   
     109   
    104110sub get_new_doc_dir
    105111{
     
    107113  my($working_info,$working_dir,$OID) = @_;
    108114
    109   # A slimmed down version of BasePlugout::get_new_doc_dir()
    110   # which creates a flat file structure, rather then nested
     115  my $doc_dir;
    111116 
    112   my $doc_dir_rest = $OID;
    113 
    114   # remove any \ and / from the OID
    115   $doc_dir_rest =~ s/[\\\/]//g;
    116 
    117   # Remove ":" if we are on Windows OS, as otherwise they get confused with the drive letters
    118   if ($ENV{'GSDLOS'} =~ /^windows$/i)
    119   {
    120     $doc_dir_rest =~ s/\://g;
    121   }
    122 
    123   my $doc_dir = $doc_dir_rest;
    124 
     117  if (defined $grouped_into_dirs_root) {
     118      $grouped_into_dirs_doc_count++;
     119
     120      if (($grouped_into_dirs_doc_count % $max_docs_per_grouped_dir) == 0) {
     121      $grouped_into_dirs_group_count++;
     122      }
     123      $doc_dir = sprintf("${grouped_into_dirs_root}%04d", $grouped_into_dirs_group_count);
     124  }
     125  else {
     126
     127      # A slimmed down version of BasePlugout::get_new_doc_dir()
     128      # which creates a flat file structure, rather than nested
     129     
     130      my $doc_dir_rest = $OID;
     131     
     132      # remove any \ and / from the OID
     133      $doc_dir_rest =~ s/[\\\/]//g;
     134
     135      # Remove ":" if we are on Windows OS, as otherwise they get confused with the drive letters
     136      if ($ENV{'GSDLOS'} =~ /^windows$/i)
     137      {
     138      $doc_dir_rest =~ s/\://g;
     139      }
     140     
     141      $doc_dir = $doc_dir_rest;
     142  }
     143 
    125144  my $created_directory = 0;
    126145 
     
    131150      $created_directory = 1;
    132151  }
    133 
     152  else {
     153      $created_directory = 1;
     154  }
    134155 
    135   # in theory this should never happen
    136156  if (!$created_directory)
    137157  {
     
    144164
    145165
     166sub get_group_doc_dir {
     167    my $self = shift (@_);
     168    my ($doc_obj) = @_;
     169
     170    # If this Plugout is being used to with grouped_into_dirs, then
     171    # how get_group_dor_dir() needs to operate is different.  In fact
     172    # it is simpler than the super-class implementation, because (due
     173    # to the prefix manipulation of gsdlassocfiles) it is safe for
     174    # associated files to be saved in the same directory as other
     175    # documents.
     176
     177    my $doc_dir = undef;
     178   
     179    if (defined $grouped_into_dirs_root) {
     180
     181    my $outhandle = $self->{'output_handle'};
     182    my $OID = $doc_obj->get_OID();
     183    $OID = "NULL" unless defined $OID;
     184
     185    my $groupsize = $self->{'group_size'};
     186    my $gs_count = $self->{'gs_count'};
     187   
     188    my $open_new_file = (($gs_count % $groupsize)==0);
     189   
     190    # opening a new file
     191    if (($open_new_file)  || !defined($self->{'gs_doc_dir'})) {
     192        # first we close off the old output
     193        if ($gs_count>0)
     194        {
     195        return if (!$self->close_group_output());
     196        }
     197       
     198        # this will create the directory
     199        $doc_dir = $self->get_doc_dir ($doc_obj);
     200        $self->{'new_doc_dir'} = 1;
     201        $self->{'gs_doc_dir'} = $doc_dir;
     202        $self->{'group_position'} = 1;
     203    }
     204    else {
     205        $doc_dir = $self->{'gs_doc_dir'};
     206        $self->{'new_doc_dir'} = 0;
     207    }
     208       
     209    }
     210    else {
     211    $doc_dir = $self->SUPER::get_group_doc_dir();
     212    }
     213   
     214    return $doc_dir;
     215}
     216
     217
     218
     219
    146220sub recursive_process_section_content
    147221{
     
    158232    $section_ptr->{'text'} = $text;
    159233
     234    # Turn into text
     235    $text =~ s/<style[^>]*>.*?<\/style>//si;
     236    $text =~ s/<[^>]*>/ /g;
     237    $text =~ s/\s+ / /mg;
     238
     239    my $gsdoc_marker = "<span fromGSDocId=\"$oid\"></span>";
     240    $text =~ s/((?:[^\s]+\s*){10})/$1$gsdoc_marker/sg;
     241   
     242   
     243    $section_ptr->{'text'} = "<div gsdocid=\"$oid\">$gsdoc_marker$text$gsdoc_marker</div>";
     244
     245    # print STDERR "*** text = $text\n";
     246   
    160247    # work through all the sub-sections
    161248    foreach my $subsection (@{$section_ptr->{'subsection_order'}}) {
     
    215302    # print $outhandle "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/strict.dtd\">\n";
    216303    #print $outhandle "<html xmlns=\"http://www.w3.org/TR/xhtml1/strict\">\n";
     304   
    217305    print $outhandle "<Archive>\n";
    218306}
     
    222310    my ($outhandle) = @_;
    223311
    224     #print $outhandle "</html>\n";
    225312    print $outhandle "</Archive>\n";   
    226313}
Note: See TracChangeset for help on using the changeset viewer.