# plugin which processes an the BBC Sound archive exported in an ASCII format package BRSPlug; use BasPlug; use ghtml; sub BEGIN { @ISA = ('BasPlug'); unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib"); } %Exclude = (); # Note: sext is short for scrollable text #field_id label disp_type crit_type width height brs_para #---------------------------------------------------------------- #10 sport prdcr text text 30 1 T006 #1 text sext text 40 16 TEXT #16 scol ref text text text 30 1 SCOL #19 date text date 10 1 D001 #25 Item title text text 20 1 T001 #26 B/W Seqs text text 30 1 S001 #27 Medium text text 30 1 S002 #28 shot type text text 30 1 S003 #29 B/W Prog/Item text text 30 1 S004 #30 Seqs text text 10 1 S005 #31 Format text text 30 1 S006 #32 prog/item text text 30 1 S007 #35 reg libno text text 30 1 S008 #36 reg tape no text text 30 1 S009 #37 C Indicator text text 30 1 S010 #38 C Holder text text 30 1 T002 #39 cprd ind text text 30 1 S011 #40 cprd name text text 30 1 T003 #44 reference sext text 30 5 R001 #49 prog num text text 10 1 PRNO #59 news title text text 30 1 T005 #65 cat num text text 10 1 CATN #72 Record number text text 30 1 T007 #73 rx_date text date 10 1 S013 #74 Music performer text text 30 2 T008 #75 Composer text text 30 2 T009 #76 mono/stereo text text 30 1 S014 #77 music medium text text 30 1 S015 #82 Sound Tech text text 30 1 S016 #83 Music publisher text text 30 1 S017 #84 music tape text text 30 1 T012 #89 catalogue text text 3 1 CODE #104 subcat text text 10 1 SUBC #106 prog title text text 40 1 TTLE #109 screen type text text 10 1 DOCT #110 stock title text text 30 1 T013 #113 stock ln 4 text text 30 1 T015 #114 stock lib 1 text text 30 1 S020 #115 stock lib 2 text text 30 1 S021 #---------------------------------------------------------------- my @brs_field_table = ( { field_name => "TTLE", field_id => 106, label => "Prog title", disp_type => "text", crit_type => "text", width => 40, height => 1 }, { field_name => "T001", field_id => 25, label => "Item title", disp_type => "text", crit_type => "text", width => 20, height => 1 }, { field_name => "T005", field_id => 59, label => "News title", disp_type => "text", crit_type => "text", width => 30, height => 1 }, { field_name => "T013", field_id => 110, label => "Stock title", disp_type => "text", crit_type => "text", width => 30, height => 1 }, { field_name => "T007", field_id => 72, label => "Record number", disp_type => "text", crit_type => "text", width => 30, height => 1 }, { field_name => "TEXT", field_id => 1, label => "Text", disp_type => "scrl", crit_type => "text", width => 40, height => 16 }, { field_name => "T006", field_id => 10, label => "Sport prdcr", disp_type => "text", crit_type => "text", width => 30, height => 1 }, { field_name => "SCOL", field_id => 16, label => "Scol ref text", disp_type => "text", crit_type => "text", width => 30, height => 1 }, { field_name => "D001", field_id => 19, label => "Date", disp_type => "text", crit_type => "date", width => 10, height => 1 }, { field_name => "S001", field_id => 26, label => "B/W Seqs", disp_type => "text", crit_type => "text", width => 30, height => 1 }, { field_name => "S002", field_id => 27, label => "Medium", disp_type => "text", crit_type => "text", width => 30, height => 1 }, { field_name => "S003", field_id => 28, label => "Shot type", disp_type => "text", crit_type => "text", width => 30, height => 1 }, { field_name => "S004", field_id => 29, label => "B/W Prog/Item", disp_type => "text", crit_type => "text", width => 30, height => 1 }, { field_name => "S005", field_id => 30, label => "Seqs", disp_type => "text", crit_type => "text", width => 10, height => 1 }, { field_name => "S006", field_id => 31, label => "Format", disp_type => "text", crit_type => "text", width => 30, height => 1 }, { field_name => "S007", field_id => 32, label => "Prog/item", disp_type => "text", crit_type => "text", width => 30, height => 1 }, { field_name => "S008", field_id => 35, label => "Reg libno", disp_type => "text", crit_type => "text", width => 30, height => 1 }, { field_name => "S009", field_id => 36, label => "Reg tape no", disp_type => "text", crit_type => "text", width => 30, height => 1 }, { field_name => "S010", field_id => 37, label => "C Indicator", disp_type => "text", crit_type => "text", width => 30, height => 1 }, { field_name => "T002", field_id => 38, label => "C Holder", disp_type => "text", crit_type => "text", width => 30, height => 1 }, { field_name => "S011", field_id => 39, label => "Cprd ind", disp_type => "text", crit_type => "text", width => 30, height => 1 }, { field_name => "T003", field_id => 40, label => "Cprd name", disp_type => "text", crit_type => "text", width => 30, height => 1 }, { field_name => "R001", field_id => 44, label => "Reference", disp_type => "scrl", crit_type => "text", width => 30, height => 5 }, { field_name => "PRNO", field_id => 49, label => "Prog num", disp_type => "text", crit_type => "text", width => 10, height => 1 }, { field_name => "CATN", field_id => 65, label => "Cat num", disp_type => "text", crit_type => "text", width => 10, height => 1 }, { field_name => "S013", field_id => 73, label => "Rx_date", disp_type => "text", crit_type => "date", width => 10, height => 1 }, { field_name => "T008", field_id => 74, label => "Music performer", disp_type => "text", crit_type => "text", width => 30, height => 2 }, { field_name => "T009", field_id => 75, label => "Composer", disp_type => "text", crit_type => "text", width => 30, height => 2 }, { field_name => "S014", field_id => 76, label => "Mono/stereo", disp_type => "text", crit_type => "text", width => 30, height => 1 }, { field_name => "S015", field_id => 77, label => "Music medium", disp_type => "text", crit_type => "text", width => 30, height => 1 }, { field_name => "S016", field_id => 82, label => "Sound Tech", disp_type => "text", crit_type => "text", width => 30, height => 1 }, { field_name => "S017", field_id => 83, label => "Music publisher", disp_type => "text", crit_type => "text", width => 30, height => 1 }, { field_name => "T012", field_id => 84, label => "Music tape", disp_type => "text", crit_type => "text", width => 30, height => 1 }, { field_name => "CODE", field_id => 89, label => "Catalogue", disp_type => "text", crit_type => "text", width => 3, height => 1 }, { field_name => "SUBC", field_id => 104, label => "Subcat", disp_type => "text", crit_type => "text", width => 10, height => 1 }, { field_name => "DOCT", field_id => 109, label => "Screen type", disp_type => "text", crit_type => "text", width => 10, height => 1 }, { field_name => "T015", field_id => 113, label => "Stock ln 4", disp_type => "text", crit_type => "text", width => 30, height => 1 }, { field_name => "S020", field_id => 114, label => "Stock lib 1", disp_type => "text", crit_type => "text", width => 30, height => 1 }, { field_name => "S021", field_id => 115, label => "Stock lib 2", disp_type => "text", crit_type => "text", width => 30, height => 1 } ); my @brs_title_list = ("TTLE", "T001", "T005", "T013", "T007", "zzTI"); my %cat_num_list = (); sub new { my ($class) = @_; my $self = new BasPlug ($class, @_); return bless $self, $class; } sub get_default_process_exp { my $self = shift (@_); return q^(?i)\.brs$^; } my $brs_doc_count = 0; my $brs_dot_count = 0; my $brs_line_no = 0; sub brs_error { my ($file, $mess, $outhandle) = @_; print $outhandle "Malformed BRS recorded at line $brs_line_no in $file:\n"; print $outhandle " $mess\n"; } sub read_brs_record { my ($recordref, $file, $outhandle) = @_; my $extra_trigger = '\s*>'; my $text_divider = "-" x 78; my $text_end = "..R001"; # Print "." to signify processing if enough records have been read in $brs_doc_count++; if (($brs_doc_count % 10) == 0) { print $outhandle "."; $brs_dot_count++; print $outhandle "\n" if (($brs_dot_count % 80) == 0); } my $brs_rec = undef; my $line = undef; # read in the record my @brs_lines = split /\n/, $$recordref; while (scalar(@brs_lines)>0) { $line = shift(@brs_lines); $line =~ s/\cM//g; $brs_line_no++; if ($line =~ m/^\.\.([A-Z]|[0-9]){4}:$/) { my $field_name = substr($line,2,4); if ($field_name eq "TEXT") { my $text_above = ""; my $text_extra = ""; my $text_below = ""; my $add_mode = "above"; while (scalar(@brs_lines)>0) { last if ($brs_lines[0] =~ m/^$text_end/); $line = shift(@brs_lines); $brs_line_no++; if ($line =~ m/^$text_divider/) { $add_mode = "below"; next; } if ($line =~ m/^$extra_trigger/) { $text_extra .= "$line\n"; $add_mode = "below"; next; } if ($line =~ m/^\.\.([A-Z]|[0-9]){4}:/) { if ($line =~ m/^\.\.TEXT/) { $add_mode = "above"; next; } else { brs_error($file, "Unexpected field $line", $outhandle); # put field name back unshift(@brs_lines, $line); $brs_line_no--; last; } } if ($line !~ m/^\s+(-){10,}$/) { if ($add_mode eq "above") { $text_above .= "$line "; } else { $text_below .= "$line "; } } } $brs_rec->{TEXTA} .= $text_above if ($text_above ne ""); $brs_rec->{TEXTB} .= $text_below if ($text_below ne ""); $brs_rec->{TEXTE} .= $text_extra if ($text_extra ne ""); } elsif ($field_name eq "R001") { # read zz fields until end of record while (scalar(@brs_lines)>0) { $line = shift(@brs_lines); $brs_line_no++; if ($line =~ m/^(zz[A-Z]+)( +)(.*)/) { my $field_name = $1; my $field_text = $3; while (scalar(@brs_lines)>0) { my $line_ahead = $brs_lines[0]; last if ($line_ahead =~ m/^zz[A-Z]+/); $line = shift(@brs_lines); $brs_line_no++; $field_text .= $line; } if (!defined($brs_rec->{$field_name})) { $brs_rec->{$field_name} = [$field_text]; } else { push(@{$brs_rec->{$field_name}},$field_text); } } else { brs_error($file, "Unexpected field $line", $outhandle); } } } else { my $field_entry = ""; while ($brs_lines[0] !~ m/^..([A-Z]|[0-9]){4}:$/) { $field_entry .= shift(@brs_lines); $brs_line_no++; last if (scalar(@brs_lines)==0); } if (!defined($brs_rec->{$field_name})) { $brs_rec->{$field_name} = $field_entry; } else { brs_error($file, "$field_name already defined.", $outhandle); } } } else { brs_error($file, "Malformed field: $line.", $outhandle); } } # format the programme number if required &format_prognum($brs_rec); return ($brs_rec); } my $brs_processed_count = 0; sub brs_full_record_in_html { my ($doc_obj,$cursection,$brs_rec) = @_; my $html_table = ""; my $table_line = ""; my $cell_count = 0; my $long_lines = ""; my $i; for ($i=0; $i<=$#brs_field_table; $i++) { my $field_name = $brs_field_table[$i]->{'field_name'}; if (defined($brs_rec->{$field_name})) { my $field_label = $brs_field_table[$i]->{'label'}; my $field_value = $brs_rec->{$field_name}; # these fields get their own line if ($field_label =~ /^(Prog|Item|News|Stock) title$/) { my $safe_value = $field_value; &ghtml::urlsafe ($safe_value); $long_lines .= "$field_label:"; $long_lines .= "$field_value\n"; $long_lines .= "$field_value\n"; } else { if (($cell_count>0) && ($cell_count%3 == 0)) { $html_table .= "\n\n$table_line\n\n"; $table_line = ""; } $cell_count++; if ($field_name eq "D001") # a Dublin core style date { my @mon_convert = ("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"); my ($year,$monnum,$day) = ($field_value =~ m/(\d{4})(\d{2})(\d{2})/); my $moneng = $mon_convert[$monnum-1]; $field_value = "$day $moneng $year"; } elsif ($field_name eq "PRNO") { my $safe_value = $field_value; &ghtml::urlsafe ($safe_value); $field_value = "$field_value"; } elsif ($field_name eq "CATN") { my $safe_value = $field_value; &ghtml::urlsafe ($safe_value); $field_value = "$field_value"; } elsif ($field_name eq "T003") { my $safe_value = $field_value; &ghtml::urlsafe ($safe_value); $field_value = "$field_value"; } $table_line .= "$field_label:$field_value\n"; } } } if ($cell_count%3 != 0) { $table_line .= ""; } $html_table .= "\n\n$table_line\n\n"; $html_table = "\n$long_lines$html_table\n
\n"; $doc_obj->add_utf8_text($cursection, $html_table); } sub process_brs_record { my ($processor, $brs_rec, $file, $outhandle) = @_; if (defined($brs_rec)) { # only include those records in the chosen subcats - these lines # should be commented out to build the entire collection # National Sound Archive collection if (!defined ($brs_rec->{'SUBC'}) || $brs_rec->{'SUBC'} !~ /^(RADCOMP|RADIOX|RADPROG|RADNEWS|BDSPROG|UNCATRAD|RADIOREF|WILDSOUND)$/) { return; } # British Film Institute collection # if (!defined ($brs_rec->{'SUBC'}) || # $brs_rec->{'SUBC'} !~ /^(LONPROG|NEWS)$/) { # return; # } # "the rest" collection # if (!defined ($brs_rec->{'SUBC'}) || # $brs_rec->{'SUBC'} =~ /^(LONPROG|NEWS|RADCOMP|RADIOX|RADPROG|RADNEWS|BDSPROG|UNCATRAD|RADIOREF)$/) { # return; # } # don't include the secret stuff ;-) if (defined ($brs_rec->{'SUBC'}) && $brs_rec->{'SUBC'} eq "HISTORY") { print STDERR "excluding secret stuff - line $brs_line_no\n"; return; } my $doc_obj = new doc ($file, "indexed_doc"); $brs_processed_count++; my $cursection = $doc_obj->get_top_section(); my $found_match = "no"; my $pot_title; foreach $pot_title ( @brs_title_list ) { if (defined ($brs_rec->{$pot_title})) { my $title = ""; if ($pot_title =~ m/^zz/) { my $zz_join = join(" ", @{$brs_rec->{$pot_title}}); my @zz_split = split(" ", $zz_join); map { $title .= "$_ " if ($_ !~ m/^xx/); } @zz_split; } else { $title = $brs_rec->{$pot_title}; } my $tl_ref = $doc_obj->get_metadata ($cursection, "Title"); if (scalar(@$tl_ref==0)) { $doc_obj->add_utf8_metadata ($cursection, "Title", $title); } else { my $exists = "no"; map { $exists = "yes" if ($title eq $_); } @$tl_ref; if ($exists eq "no") { $doc_obj->add_utf8_metadata($cursection, "Title", $title); } } $found_match = "yes"; } } if ($found_match eq "no") { if (defined($brs_rec->{TEXTA})) { my $sub_title = substr($brs_rec->{TEXTA},0,60) . " ... "; $doc_obj->add_utf8_metadata ($cursection, "Title", $sub_title); } elsif (defined($brs_rec->{TEXTB})) { my $sub_title = substr($brs_rec->{TEXTB},0,60) . " ... "; $doc_obj->add_utf8_metadata ($cursection, "Title", $sub_title); } else { print $outhandle " No title or text for record ending at line "; print $outhandle $brs_line_no-1, " "; } } if (defined($brs_rec->{'T001'})) # item title { $doc_obj->add_utf8_metadata ($cursection, "ItemTitle", $brs_rec->{'T001'}); } if (defined($brs_rec->{'D001'})) # date { $doc_obj->add_utf8_metadata ($cursection, "Date", $brs_rec->{'D001'}); } else { print $outhandle " No date for record ending at line "; print $outhandle $brs_line_no-1, " "; } if (defined($brs_rec->{'PRNO'})) # programme number { $doc_obj->add_utf8_metadata ($cursection, "ProgNumber", $brs_rec->{'PRNO'}); } if (defined($brs_rec->{'CATN'})) # catalogue number { $doc_obj->add_utf8_metadata ($cursection, "CatNum", $brs_rec->{'CATN'}); } # zzAN, zzAB and T003 fields if (defined($brs_rec->{'zzAB'}) || defined($brs_rec->{'zzAN'}) || defined($brs_rec->{'T003'})) { my $zzabn = ""; if (defined($brs_rec->{'zzAB'})) { foreach my $a (@{$brs_rec->{'zzAB'}}) { $a =~ s/[^a-zA-Z0-9]//g; $zzabn .= " " . $a; } } if (defined($brs_rec->{'zzAN'})) { foreach my $a (@{$brs_rec->{'zzAN'}}) { $a =~ s/[^a-zA-Z0-9]//g; $zzabn .= " " . $a; } } if (defined($brs_rec->{'T003'})) { my $value = $brs_rec->{'T003'}; $value =~ s/[^a-zA-Z0-9]//g; $zzabn .= " " . $value; } print STDERR "zzabn: '$zzabn'\n"; $doc_obj->add_utf8_metadata ($cursection, "zzabn", $zzabn); } if (defined($brs_rec->{zzN})) { my $name; foreach $name (@{$brs_rec->{zzN}}) { $doc_obj->add_utf8_metadata ($cursection, "People", $name); } } brs_full_record_in_html($doc_obj,$cursection,$brs_rec); if (defined($brs_rec->{TEXTA})) { my $desc_texta = "

Description

$brs_rec->{TEXTA}"; $doc_obj->add_utf8_text ($cursection, $desc_texta); if (defined($brs_rec->{TEXTE})) { my $raw_texte = $brs_rec->{TEXTE}; $raw_texte =~ s/^\s*>\s*/
  • /mg; my $test_empty = $raw_texte; $test_empty =~ s/(
  • )+//g; if ($test_empty ne "") { my $desc_texte = $raw_texte; $desc_texte =~ s/^
  • (.*)$/
  • $1<\/i>/mg; $desc_texte = "

    Additional:

    "; $doc_obj->add_utf8_text ($cursection, $desc_texte); } } } if (defined($brs_rec->{TEXTB})) { my $desc_textb = "

    Comments

    $brs_rec->{TEXTB}"; $doc_obj->add_utf8_text($cursection, $desc_textb); } my $full_text = ""; $full_text .= "$brs_rec->{TEXTA}\n" if (defined($brs_rec->{TEXTA})); $full_text .= "$brs_rec->{TEXTE}\n" if (defined($brs_rec->{TEXTE})); $full_text .= "$brs_rec->{TEXTB}\n" if (defined($brs_rec->{TEXTB})); my @zz_list = ( "zzPG", "zzLO", "zzN" ); my $zz; foreach $zz ( @zz_list) { if (defined($brs_rec->{$zz})) { my $filtered_text = ""; my $name; foreach $name (@{$brs_rec->{$zz}}) { my @split_zz = split(' ',$name); map { $filtered_text .= "$_ " if ($_ =~ m/^xx/) } @split_zz } $full_text .= "$filtered_text\n"; } } # format zz fields (tail end of record) my $zz_html = "\n"; my $pot_zz; foreach $pot_zz (keys %{$brs_rec}) { if ($pot_zz =~ m/^zz/) { my $zz = $pot_zz; my $name; foreach $name (@{$brs_rec->{$zz}}) { if ($zz eq "zzN") { my $safe_name = $name; &ghtml::urlsafe ($safe_name); $zz_html .= "\n"; } elsif ($zz =~ /^zzA[BN]$/) { my $safe_name = $name; &ghtml::urlsafe ($safe_name); $zz_html .= "\n"; } else { $zz_html .= "\n"; } } } } $zz_html .= "
    $zz:$name
    $zz:$name
    $zz:$name
    \n"; $doc_obj->add_utf8_text($cursection, $zz_html); # add OID - we'll use the catalog number (hoping it's unique) my $cat_num = $brs_rec->{'CATN'}; if (!defined $cat_num) { die "bugger!!\n"; } if (defined ($cat_num_list{$cat_num})) { print $outhandle "WARNING: catalog number $cat_num used more than once\n"; } $doc_obj->set_OID ("bbc" . $cat_num); $cat_num_list{$cat_num} = 1; # process the document $processor->process($doc_obj); } else { print $outhandle "BRS record empty\n"; } } sub read { my $self = shift (@_); my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_; my $outhandle = $self->{'outhandle'}; my $filename = &util::filename_cat($base_dir, $file); # return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/; if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) { return undef; } # my $plugin_name = ref ($self); $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up # create a new document # my $doc_obj = new doc ($filename, "indexed_doc"); # read in file ($text will be in utf8) # my $text = ""; # $self->read_file ($filename, \$text); # if ($text !~ /\w/) { # print $outhandle "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'}; # return 0; # } # text is always plain ascii undef $/; open (FILE, $filename) || die; my $text = ; close FILE; $/ = "\n"; # include any metadata passed in from previous plugins # note that this metadata is associated with the top level section # $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata); print $outhandle "BBC Sound Archive Plug for BRS format: processing $file\n" if $self->{'verbosity'} > 1; my @records = split /\*\*\* BRS DOCUMENT BOUNDARY \*\*\*\s+/, $text; $text = ""; foreach $record (@records) { next if $record !~ /\w/; # first record will be empty my $brs_rec = &read_brs_record(\$record, $file, $self->{'outhandle'}); &process_brs_record ($processor, $brs_rec, $file, $self->{'outhandle'}); } print $outhandle "\nNumber of BRS records = $brs_doc_count\n"; print $outhandle "Number of BRS records processed = $brs_processed_count\n"; return 1; # processed the file } sub end { my $self = shift (@_); my ($processor) = @_; $processor->close_file_output() if defined $processor; } sub format_prognum { my ($brs_rec) = @_; return unless defined $brs_rec->{'PRNO'}; my $old_num = $brs_rec->{'PRNO'}; if (defined $brs_rec->{'magazine'}) { if ($brs_rec->{'PRNO'} =~ /^\S\S /) { # space at position 3 if ($brs_rec->{'magazine'} =~ /(\d+)/) { # algorithm A my $magnumber = $1; $brs_rec->{'PRNO'} =~ /(\S\S) (\S\S)(\S+)/; $brs_rec->{'PRNO'} = $2 . $1 . $3 . $magnumber; } else { # common algorithm $brs_rec->{'PRNO'} =~ /(\S\S) (\S\S)(\S+)/; $brs_rec->{'PRNO'} = $2 . $1 . $3; } } } else { if ($brs_rec->{'PRNO'} =~ /^\S\S / && $brs_rec->{'PRNO'} !~ /:/) { # space at position 3 and no colon # common algorithm $brs_rec->{'PRNO'} =~ /(\S\S) (\S\S)(\S+)/; $brs_rec->{'PRNO'} = $2 . $1 . $3; } elsif ($brs_rec->{'PRNO'} =~ /^(\S):(\S\S) (\S\S)(\S+)/) { # colon at position 2 and space at position 5 # algorithm B $brs_rec->{'PRNO'} = $3 . $2 . $4 . $1; } } # if ($brs_rec->{'PRNO'} ne $old_num) { # print STDERR "\n$old_num ... " . $brs_rec->{'PRNO'} . "\n"; # } } 1;