source: trunk/bbc/collect/bbc/perllib/plugins/BRSPlug.pm@ 12265

Last change on this file since 12265 was 12265, checked in by nzdl, 18 years ago

Chi's changes from the last BBC build

  • Property svn:keywords set to Author Date Id Revision
File size: 28.8 KB
RevLine 
[4647]1
2# plugin which processes an the BBC Sound archive exported in an ASCII format
3
4package BRSPlug;
5
6use BasPlug;
7use ghtml;
8
9sub BEGIN
10{
11 @ISA = ('BasPlug');
12 unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");
13}
14
[12265]15my $bbc_collections_list =
16 [{'name' => "all", # all collections
17 'desc' => "{BRSPlug.bbc_collections.all}"},
18 {'name' => "nsa", # National Sound Archive
19 'desc' => "{BRSPlug.bbc_collections.nsa"},
20 {'name' => "bfi", # Brisith File Institute
21 'desc' => "{BRSPlug.bbc_collections.bfi"},
22 {'name' => "bbcother", # BBC Other
23 'desc' => "{BRSPlug.bbc_collections.bbcother"}];
24
25
26my $arguments =
27 [{ 'name' => "bbc_collections",
28 'desc' => "{BRSPlug.bbc_collections}",
29 #'type' => $bbc_collections_list,
30 'type' => "string",
31 'reqd' => "no",
32 'deft' => "all"}];
33
[4647]34%Exclude = ();
35
[12265]36my $options = { 'name' => "BRSPlug",
37 'desc' => "{BRSPlug.desc}",
38 'abstract' => "yes",
39 'inherits' => "no",
40 'args' => $arguments };
[4647]41
42# Note: sext is short for scrollable text
43
44#field_id label disp_type crit_type width height brs_para
45#----------------------------------------------------------------
46#10 sport prdcr text text 30 1 T006
47#1 text sext text 40 16 TEXT
48#16 scol ref text text text 30 1 SCOL
49#19 date text date 10 1 D001
50#25 Item title text text 20 1 T001
51#26 B/W Seqs text text 30 1 S001
52#27 Medium text text 30 1 S002
53#28 shot type text text 30 1 S003
54#29 B/W Prog/Item text text 30 1 S004
55#30 Seqs text text 10 1 S005
56#31 Format text text 30 1 S006
57#32 prog/item text text 30 1 S007
58#35 reg libno text text 30 1 S008
59#36 reg tape no text text 30 1 S009
60#37 C Indicator text text 30 1 S010
61#38 C Holder text text 30 1 T002
62#39 cprd ind text text 30 1 S011
63#40 cprd name text text 30 1 T003
64#44 reference sext text 30 5 R001
65#49 prog num text text 10 1 PRNO
66#59 news title text text 30 1 T005
67#65 cat num text text 10 1 CATN
68#72 Record number text text 30 1 T007
69#73 rx_date text date 10 1 S013
70#74 Music performer text text 30 2 T008
71#75 Composer text text 30 2 T009
72#76 mono/stereo text text 30 1 S014
73#77 music medium text text 30 1 S015
74#82 Sound Tech text text 30 1 S016
75#83 Music publisher text text 30 1 S017
76#84 music tape text text 30 1 T012
77#89 catalogue text text 3 1 CODE
78#104 subcat text text 10 1 SUBC
79#106 prog title text text 40 1 TTLE
80#109 screen type text text 10 1 DOCT
81#110 stock title text text 30 1 T013
82#113 stock ln 4 text text 30 1 T015
83#114 stock lib 1 text text 30 1 S020
84#115 stock lib 2 text text 30 1 S021
85#----------------------------------------------------------------
86
87
88my @brs_field_table
89 = (
90 { field_name => "TTLE", field_id => 106, label => "Prog title",
91 disp_type => "text", crit_type => "text",
92 width => 40, height => 1 },
93 { field_name => "T001", field_id => 25, label => "Item title",
94 disp_type => "text", crit_type => "text",
95 width => 20, height => 1 },
96 { field_name => "T005", field_id => 59, label => "News title",
97 disp_type => "text", crit_type => "text",
98 width => 30, height => 1 },
99 { field_name => "T013", field_id => 110, label => "Stock title",
100 disp_type => "text", crit_type => "text",
101 width => 30, height => 1 },
102 { field_name => "T007", field_id => 72, label => "Record number",
103 disp_type => "text", crit_type => "text",
104 width => 30, height => 1 },
[12265]105 { field_name => "RFTI", field_id => 116, label => "RFTI",
106 disp_type => "text", crit_type => "text",
107 width => 30, height => 1 },
[4647]108 { field_name => "TEXT", field_id => 1, label => "Text",
109 disp_type => "scrl", crit_type => "text",
110 width => 40, height => 16 },
111 { field_name => "T006", field_id => 10, label => "Sport prdcr",
112 disp_type => "text", crit_type => "text",
113 width => 30, height => 1 },
114 { field_name => "SCOL", field_id => 16, label => "Scol ref text",
115 disp_type => "text", crit_type => "text",
116 width => 30, height => 1 },
117 { field_name => "D001", field_id => 19, label => "Date",
118 disp_type => "text", crit_type => "date",
119 width => 10, height => 1 },
120 { field_name => "S001", field_id => 26, label => "B/W Seqs",
121 disp_type => "text", crit_type => "text",
122 width => 30, height => 1 },
123 { field_name => "S002", field_id => 27, label => "Medium",
124 disp_type => "text", crit_type => "text",
125 width => 30, height => 1 },
126 { field_name => "S003", field_id => 28, label => "Shot type",
127 disp_type => "text", crit_type => "text",
128 width => 30, height => 1 },
129 { field_name => "S004", field_id => 29, label => "B/W Prog/Item",
130 disp_type => "text", crit_type => "text",
131 width => 30, height => 1 },
132 { field_name => "S005", field_id => 30, label => "Seqs",
133 disp_type => "text", crit_type => "text",
134 width => 10, height => 1 },
135 { field_name => "S006", field_id => 31, label => "Format",
136 disp_type => "text", crit_type => "text",
137 width => 30, height => 1 },
138 { field_name => "S007", field_id => 32, label => "Prog/item",
139 disp_type => "text", crit_type => "text",
140 width => 30, height => 1 },
141 { field_name => "S008", field_id => 35, label => "Reg libno",
142 disp_type => "text", crit_type => "text",
143 width => 30, height => 1 },
144 { field_name => "S009", field_id => 36, label => "Reg tape no",
145 disp_type => "text", crit_type => "text",
146 width => 30, height => 1 },
147 { field_name => "S010", field_id => 37, label => "C Indicator",
148 disp_type => "text", crit_type => "text",
149 width => 30, height => 1 },
150 { field_name => "T002", field_id => 38, label => "C Holder",
151 disp_type => "text", crit_type => "text",
152 width => 30, height => 1 },
153 { field_name => "S011", field_id => 39, label => "Cprd ind",
154 disp_type => "text", crit_type => "text",
155 width => 30, height => 1 },
156 { field_name => "T003", field_id => 40, label => "Cprd name",
157 disp_type => "text", crit_type => "text",
158 width => 30, height => 1 },
159 { field_name => "R001", field_id => 44, label => "Reference",
160 disp_type => "scrl", crit_type => "text",
161 width => 30, height => 5 },
162 { field_name => "PRNO", field_id => 49, label => "Prog num",
163 disp_type => "text", crit_type => "text",
164 width => 10, height => 1 },
165 { field_name => "CATN", field_id => 65, label => "Cat num",
166 disp_type => "text", crit_type => "text",
167 width => 10, height => 1 },
168 { field_name => "S013", field_id => 73, label => "Rx_date",
169 disp_type => "text", crit_type => "date",
170 width => 10, height => 1 },
171 { field_name => "T008", field_id => 74, label => "Music performer",
172 disp_type => "text", crit_type => "text",
173 width => 30, height => 2 },
174 { field_name => "T009", field_id => 75, label => "Composer",
175 disp_type => "text", crit_type => "text",
176 width => 30, height => 2 },
177 { field_name => "S014", field_id => 76, label => "Mono/stereo",
178 disp_type => "text", crit_type => "text",
179 width => 30, height => 1 },
180 { field_name => "S015", field_id => 77, label => "Music medium",
181 disp_type => "text", crit_type => "text",
182 width => 30, height => 1 },
183 { field_name => "S016", field_id => 82, label => "Sound Tech",
184 disp_type => "text", crit_type => "text",
185 width => 30, height => 1 },
186 { field_name => "S017", field_id => 83, label => "Music publisher",
187 disp_type => "text", crit_type => "text",
188 width => 30, height => 1 },
189 { field_name => "T012", field_id => 84, label => "Music tape",
190 disp_type => "text", crit_type => "text",
191 width => 30, height => 1 },
192 { field_name => "CODE", field_id => 89, label => "Catalogue",
193 disp_type => "text", crit_type => "text",
194 width => 3, height => 1 },
195 { field_name => "SUBC", field_id => 104, label => "Subcat",
196 disp_type => "text", crit_type => "text",
197 width => 10, height => 1 },
198 { field_name => "DOCT", field_id => 109, label => "Screen type",
199 disp_type => "text", crit_type => "text",
200 width => 10, height => 1 },
201 { field_name => "T015", field_id => 113, label => "Stock ln 4",
202 disp_type => "text", crit_type => "text",
203 width => 30, height => 1 },
204 { field_name => "S020", field_id => 114, label => "Stock lib 1",
205 disp_type => "text", crit_type => "text",
206 width => 30, height => 1 },
207 { field_name => "S021", field_id => 115, label => "Stock lib 2",
208 disp_type => "text", crit_type => "text",
209 width => 30, height => 1 }
210 );
211
212
213my @brs_title_list = ("TTLE", "T001", "T005", "T013", "T007", "RFTI");
214my %cat_num_list = ();
215
216sub new {
[12265]217 my $class = shift (@_);
[4647]218 my $self = new BasPlug ($class, @_);
[12265]219 $self->{'plugin_type'} = "BRSPlug";
[4647]220
[12265]221 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
222 my $option_list = $self->{'option_list'};
223 push( @{$option_list}, $options );
224
225 if (!parsargv::parse(\@_,
226 q^bbc_collections/(all|nsa|bfi|bbcother)/^, \$self->{'bbc_collections'},
227 "allow_extra_options")) {
228
229 print STDERR "\nIncorrect options passed to BRSPlug, check your collect.cfg configuration file\n";
230 $self->print_txt_usage(""); # Use default resource bundle
231 die "\n";
232 }
233
234 #my ($class) = @_;
235 #my $self = new BasPlug ($class, @_);
236
[4647]237 return bless $self, $class;
238}
239
240sub get_default_process_exp {
241 my $self = shift (@_);
242
243# return q^(?i)\.brs$^;
[12265]244 return q^TVRD.+^;
[4647]245}
246
247my $brs_doc_count = 0;
248my $brs_dot_count = 0;
249my $brs_line_no = 0;
250
251sub brs_error
252{
253 my ($file, $mess, $outhandle) = @_;
254
255 print $outhandle "Malformed BRS recorded at line $brs_line_no in $file:\n";
256 print $outhandle " $mess\n";
257}
258
259sub read_brs_record
260{
261 my ($recordref, $file, $outhandle) = @_;
262
263 my $extra_trigger = '\s*>';
264 my $text_divider = "-" x 78;
[12265]265 #my $text_end = '^\.\.[^:]+:$';
266 my $meta_key = '^\.\.(\w+):$';
267
[4647]268 # Print "." to signify processing if enough records have been read in
269 $brs_doc_count++;
270 if (($brs_doc_count % 10) == 0)
271 {
272 print $outhandle ".";
273 $brs_dot_count++;
274 print $outhandle "\n" if (($brs_dot_count % 80) == 0);
275 }
276
277 my $brs_rec = undef;
278 my $line = undef;
279
280 # read in the record
281 my @brs_lines = split /\n/, $$recordref;
282
283 while (scalar(@brs_lines)>0)
284 {
285 $line = shift(@brs_lines);
[12265]286
[4647]287 $line =~ s/\cM//g;
288
289 $brs_line_no++;
290
[12265]291# if ($line =~ m/^\.\.([^:]+):$/)
292 if ($line =~ m/$meta_key/)
[4647]293 {
294 my $field_name = $1;
295
296 if ($field_name eq "TEXT")
297 {
298 my $text_above = "";
299 my $text_extra = "";
300 my $text_below = "";
301
302 my $add_mode = "above";
303
304 while (scalar(@brs_lines)>0)
305 {
[12265]306
307 last if ($brs_lines[0] =~ m/$meta_key/);
308
309
310 $line = shift(@brs_lines);
[4647]311
[12265]312
[4647]313 $brs_line_no++;
314
315 if ($line =~ m/^$text_divider/)
316 {
317 $add_mode = "below";
318 next;
319 }
320
321 if ($line =~ m/^$extra_trigger/)
322 {
323 $text_extra .= "$line\n";
324 $add_mode = "below";
325 next;
326 }
[12265]327
328 if ($line =~ m/$meta_key/)
329 #if ($line =~ m/^\.\.[^:]+:/)
[4647]330 {
331 if ($line =~ m/^\.\.TEXT/)
332 {
333 $add_mode = "above";
334 next;
[12265]335 }
[4647]336 else
337 {
338 brs_error($file, "Unexpected field $line", $outhandle);
339 # put field name back
340 unshift(@brs_lines, $line);
341 $brs_line_no--;
342 last;
343 }
344 }
345
346 if ($line !~ m/^\s+(-){10,}$/)
347 {
348 if ($add_mode eq "above")
349 {
350 $text_above .= "$line ";
351 }
352 else
353 {
354 $text_below .= "$line ";
355 }
356 }
357 }
358
359 $brs_rec->{TEXTA} .= $text_above if ($text_above ne "");
360 $brs_rec->{TEXTB} .= $text_below if ($text_below ne "");
361 $brs_rec->{TEXTE} .= $text_extra if ($text_extra ne "");
362 }
363 elsif ($field_name =~ /^RF/)
364 {
[12265]365 # deal with the field name start with RF, eg. RFTI,RFAN, RFAB...
[4647]366 # read zz fields until end of record
367
368 while (scalar(@brs_lines)>0)
369 {
370 my $line_ahead = $brs_lines[0];
[12265]371 #last if ($line_ahead =~ m/^\.\.[^:]+:$/);
372 last if ($line_ahead =~ m/$meta_key/);
[4647]373 $line = shift(@brs_lines);
374 $brs_line_no++;
[12265]375 #if (!defined($brs_rec->{$field_name}))
376 #{
377 #$brs_rec->{$field_name} = [$line];
378 #}
379 #else
380 #{
381 #push(@{$brs_rec->{$field_name}},$line);
382 #}
383
[4647]384 if (!defined($brs_rec->{$field_name}))
385 {
[12265]386 $brs_rec->{$field_name} = (); #[$line];
[4647]387 }
[12265]388 push(@{$brs_rec->{$field_name}},$line);
[4647]389 }
390 }
391 else
392 {
393 my $field_entry = "";
[12265]394 #while ($brs_lines[0] !~ m/^\.\.[^:]+:$/)
395 while ($brs_lines[0] !~ m/$meta_key/)
[4647]396 {
397 $field_entry .= shift(@brs_lines);
398 $brs_line_no++;
399 last if (scalar(@brs_lines)==0);
400 }
401
402 if (!defined($brs_rec->{$field_name}))
403 {
404 $brs_rec->{$field_name} = $field_entry;
405 }
406 else
407 {
408 brs_error($file, "$field_name already defined.", $outhandle);
409 }
410 }
411 }
412 else
413 {
414 brs_error($file, "Malformed field: $line.", $outhandle);
415 }
416 }
417
418 # format the programme number if required
419 &format_prognum($brs_rec);
420
421 return ($brs_rec);
422}
423
424my $brs_processed_count = 0;
425
426sub brs_full_record_in_html
427{
428 my ($doc_obj,$cursection,$brs_rec) = @_;
429
430 my $html_table = "";
431 my $table_line = "";
432 my $cell_count = 0;
433 my $long_lines = "";
434
435 my $i;
436 for ($i=0; $i<=$#brs_field_table; $i++)
437 {
438 my $field_name = $brs_field_table[$i]->{'field_name'};
[12265]439
[4647]440 if (defined($brs_rec->{$field_name}))
441 {
442 my $field_label = $brs_field_table[$i]->{'label'};
443 my $field_value = $brs_rec->{$field_name};
444
445 # these fields get their own line
446 if ($field_label =~ /^(Prog|Item|News|Stock) title$/) {
447 my $safe_value = $field_value;
448 &ghtml::urlsafe ($safe_value);
449 $long_lines .= "<tr valign=top><td><b>$field_label:</b></td><td colspan=5>";
450 $long_lines .= "<a href=\"_httpquerytitle_&q=$safe_value\">$field_value</a></td></tr>\n";
[12265]451 $long_lines .= "$field_value </td></tr>\n";
[4647]452 } else {
453
454 if (($cell_count>0) && ($cell_count%3 == 0))
455 {
456 $html_table .= "\n<tr valign=top>\n$table_line\n</tr>\n";
457 $table_line = "";
458 }
459 $cell_count++;
460 if ($field_name eq "D001") # a Dublin core style date
461 {
462 my @mon_convert
463 = ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
464 "Jul", "Aug", "Sep", "Oct", "Nov", "Dec");
465
466 my ($year,$monnum,$day)
467 = ($field_value =~ m/(\d{4})(\d{2})(\d{2})/);
468 my $moneng = $mon_convert[$monnum-1];
469
470 $field_value = "$day $moneng $year";
471
472 } elsif ($field_name eq "PRNO") {
473 my $safe_value = $field_value;
474 &ghtml::urlsafe ($safe_value);
475 $field_value = "<a href=\"_httpqueryprogname_&q=$safe_value\">$field_value</a>";
476
477 } elsif ($field_name eq "CATN") {
478 my $safe_value = $field_value;
479 &ghtml::urlsafe ($safe_value);
480 $field_value = "<a href=\"_httpquerycatnum_&q=$safe_value\">$field_value</a>";
481
482 } elsif ($field_name eq "T003") {
483 my $safe_value = $field_value;
484 &ghtml::urlsafe ($safe_value);
485 $field_value = "<a href=\"_httpqueryzzabn_&q=$safe_value\">$field_value</a>";
486 }
[12265]487 if ($field_label =~ /^(RFTI)/){
488 $field_value = join(" ",@{$brs_rec->{'RFTI'}});
489 #print STDERR "**** $field_label, $field_value\n";
490 }
[4647]491 $table_line .= "<td><b>$field_label:</b></td><td>$field_value</td>\n";
492 }
493 }
494 }
495
496 if ($cell_count%3 != 0) {
497 $table_line .= "<td colspan=" . (3-($cell_count%3))*2 . "></td>";
498 }
499
500 $html_table .= "\n<tr>\n$table_line\n</tr>\n";
501 $html_table = "<table>\n$long_lines$html_table\n</table>\n";
502
503 $doc_obj->add_utf8_text($cursection, $html_table);
504}
505
506sub process_brs_record
507{
[12265]508 my $self = shift (@_);
509
[4647]510 my ($processor, $brs_rec, $file, $outhandle) = @_;
511
512 if (defined($brs_rec))
513 {
514 # only include those records in the chosen subcats - these lines
515 # should be commented out to build the entire collection
516
[12265]517 # National Sound Archive collection-Stephan
518 #if (!defined ($brs_rec->{'SUBC'}) ||
519 # $brs_rec->{'SUBC'} !~ /^(RADCOMP|RADIOX|RADPROG|RADNEWS|BDSPROG|UNCATRAD|RADIOREF|WILDSOUND)$/) {
520 # return;
521 #}
522
[4647]523 # British Film Institute collection
[12265]524 #if (!defined ($brs_rec->{'SUBC'}) ||
525 # $brs_rec->{'SUBC'} !~ /^(LONPROG|NEWS)$/) {
526 # return;
527 #}
528
[4647]529 # "the rest" collection
[12265]530 #if (!defined ($brs_rec->{'SUBC'}) ||
531 # $brs_rec->{'SUBC'} =~ /^(LONPROG|NEWS|RADCOMP|RADIOX|RADPROG|RADNEWS|BDSPROG|UNCATRAD|RADIOREF|WILDSOUND)$/) {
532 # return;
533 #}
534 #}
535
536 # modified Chi-Yu Huang
537 if (defined $self->{'bbc_collections'}){
538 # National Sound Archive collection
539 if ($self->{'bbc_collections'} eq "nsa"){
540 #print STDERR "***This is NSA collections\n";
541 if (!defined ($brs_rec->{'SUBC'}) ||
542 $brs_rec->{'SUBC'} !~ /^(RADCOMP|RADIOX|RADPROG|RADNEWS|BDSPROG|UNCATRAD|RADIOREF|WILDSOUND)$/) {
543 return;
544 }
545 }
546 elsif ($self->{'bbc_collections'} eq "bfi"){
547 print STDERR "***This is BFI collections\n";
548 # British Film Institute collection
549 if (!defined ($brs_rec->{'SUBC'}) ||
550 $brs_rec->{'SUBC'} !~ /^(LONPROG|NEWS)$/) {
551 return;
552 }
553 }
554 elsif ($self->{'bbc_collections'} eq "bbcother"){
555 #print STDERR "*** This is bbc Other collections\n";
556 # "the rest" collection
557 if (!defined ($brs_rec->{'SUBC'}) ||
558 $brs_rec->{'SUBC'} =~ /^(LONPROG|NEWS|RADCOMP|RADIOX|RADPROG|RADNEWS|BDSPROG|UNCATRAD|RADIOREF|WILDSOUND)$/) {
559 return;
560 }
561 } else {
562 # build the whole collections
563 #print STDERR "*** This is to build the whole collection\n";
564 }
565 } else {
566 #build the whole collections
[4647]567 }
568
569 # don't include the secret stuff ;-)
570 if (defined ($brs_rec->{'SUBC'}) && $brs_rec->{'SUBC'} eq "HISTORY") {
571 print STDERR "\nexcluding secret stuff - line $brs_line_no in $file\n";
572 return;
573 }
574
575
576 my $doc_obj = new doc ($file, "indexed_doc");
577 $brs_processed_count++;
578
579 my $cursection = $doc_obj->get_top_section();
580
581 my $found_match = "no";
582
583 my $pot_title;
584 foreach $pot_title ( @brs_title_list )
585 {
586 if (defined ($brs_rec->{$pot_title}))
587 {
588 my $title = "";
589
590 if ($pot_title =~ m/^zz/)
591 {
592 my $zz_join = join(" ", @{$brs_rec->{$pot_title}});
593 my @zz_split = split(" ", $zz_join);
594 map { $title .= "$_ " if ($_ !~ m/^xx/); } @zz_split;
595 }
596 else
597 {
[12265]598 if ($pot_title =~ /^RF/){
599 $title = join (" ", @{$brs_rec->{$pot_title}});
600 } else{
601 $title = $brs_rec->{$pot_title};
602 }
603 #$title = $brs_rec->{$pot_title};
[4647]604 }
605
[12265]606 #print STDERR "*** What is the title=$title\n";
607
[4647]608 my $tl_ref = $doc_obj->get_metadata ($cursection, "Title");
609
610 if (scalar(@$tl_ref==0))
611 {
612 $doc_obj->add_utf8_metadata ($cursection, "Title", $title);
613 }
614 else
615 {
616 my $exists = "no";
617
618 map { $exists = "yes" if ($title eq $_); } @$tl_ref;
619 if ($exists eq "no")
620 {
621 $doc_obj->add_utf8_metadata($cursection, "Title", $title);
622 }
623 }
624 $found_match = "yes";
625 }
626 }
627 if ($found_match eq "no")
628 {
629 if (defined($brs_rec->{TEXTA}))
630 {
631 my $sub_title = substr($brs_rec->{TEXTA},0,60) . " ... ";
632 $doc_obj->add_utf8_metadata ($cursection, "Title", $sub_title);
633 }
634 elsif (defined($brs_rec->{TEXTB}))
635 {
636 my $sub_title = substr($brs_rec->{TEXTB},0,60) . " ... ";
637 $doc_obj->add_utf8_metadata ($cursection, "Title", $sub_title);
638 }
639 else
640 {
641 print $outhandle "\nNo title or text for record ending at line ";
642 print $outhandle $brs_line_no-1, " in $file\n";
643 }
644 }
645
646 if (defined($brs_rec->{'T001'})) # item title
647 {
648 $doc_obj->add_utf8_metadata ($cursection, "ItemTitle", $brs_rec->{'T001'});
649 }
650
651 if (defined($brs_rec->{'D001'})) # date
652 {
653 $doc_obj->add_utf8_metadata ($cursection, "Date", $brs_rec->{'D001'});
654 }
655 else
656 {
657 print $outhandle "\nNo date for record ending at line ";
658 print $outhandle $brs_line_no-1, " in $file\n";
659 }
660
661 if (defined($brs_rec->{'PRNO'})) # programme number
662 {
663 $doc_obj->add_utf8_metadata ($cursection, "ProgNumber", $brs_rec->{'PRNO'});
664 }
665
666 if (defined($brs_rec->{'CATN'})) # catalogue number
667 {
668 $doc_obj->add_utf8_metadata ($cursection, "CatNum", $brs_rec->{'CATN'});
669 }
670
671 # RFAN, RFAB and T003 fields
672 if (defined($brs_rec->{'RFAB'}) || defined($brs_rec->{'RFAN'}) || defined($brs_rec->{'T003'}))
673 {
674 my $zzabn = "";
675 if (defined($brs_rec->{'RFAB'})) {
676 foreach my $a (@{$brs_rec->{'RFAB'}}) {
677 $a =~ s/[^a-zA-Z0-9]//g;
678 $zzabn .= " " . $a;
679 }
680 }
681 if (defined($brs_rec->{'RFAN'})) {
682 foreach my $a (@{$brs_rec->{'RFAN'}}) {
683 $a =~ s/[^a-zA-Z0-9]//g;
684 $zzabn .= " " . $a;
685 }
686 }
687 if (defined($brs_rec->{'T003'})) {
688 my $value = $brs_rec->{'T003'};
689 $value =~ s/[^a-zA-Z0-9]//g;
690 $zzabn .= " " . $value;
691 }
692 $doc_obj->add_utf8_metadata ($cursection, "zzabn", $zzabn);
693 }
[12265]694
[4647]695 if (defined($brs_rec->{'RFN'}))
696 {
697 my $name;
698 foreach $name (@{$brs_rec->{'RFN'}})
699 {
700 $doc_obj->add_utf8_metadata ($cursection, "People", $name);
701 }
702 }
703
704 brs_full_record_in_html($doc_obj,$cursection,$brs_rec);
705
706 if (defined($brs_rec->{TEXTA}))
707 {
708
709 my $desc_texta = "<h3>Description</h3>$brs_rec->{TEXTA}";
710 $doc_obj->add_utf8_text ($cursection, $desc_texta);
711
712 if (defined($brs_rec->{TEXTE}))
713 {
714 my $raw_texte = $brs_rec->{TEXTE};
715 $raw_texte =~ s/^\s*>\s*/<li>/mg;
716
717 my $test_empty = $raw_texte;
718 $test_empty =~ s/(<li>)+//g;
719
720 if ($test_empty ne "")
721 {
722 my $desc_texte = $raw_texte;
723 $desc_texte =~ s/^<li>(.*)$/<li><i>$1<\/i>/mg;
724 $desc_texte = "<p>Additional:<ul>$desc_texte</ul>";
725 $doc_obj->add_utf8_text ($cursection, $desc_texte);
726 }
727 }
728 }
729
730 if (defined($brs_rec->{TEXTB}))
731 {
732 my $desc_textb = "<h3>Comments</h3>$brs_rec->{TEXTB}";
733 $doc_obj->add_utf8_text($cursection, $desc_textb);
734 }
735
736 my $full_text = "";
737 $full_text .= "$brs_rec->{TEXTA}\n" if (defined($brs_rec->{TEXTA}));
738 $full_text .= "$brs_rec->{TEXTE}\n" if (defined($brs_rec->{TEXTE}));
739 $full_text .= "$brs_rec->{TEXTB}\n" if (defined($brs_rec->{TEXTB}));
740
741 my @zz_list = ( "RFPG", "RFLO", "RFN" );
742 my $zz;
743 foreach $zz ( @zz_list)
744 {
745 if (defined($brs_rec->{$zz}))
746 {
747 my $filtered_text = "";
748
749 my $name;
750 foreach $name (@{$brs_rec->{$zz}})
751 {
752 my @split_zz = split(' ',$name);
753 map { $filtered_text .= "$_ " if ($_ =~ m/^xx/) } @split_zz
754 }
755
756 $full_text .= "$filtered_text\n";
757 }
758 }
759
760 # format zz fields (tail end of record)
761 my $zz_html = "<table>\n";
762
763 my $pot_zz;
764 foreach $pot_zz (keys %{$brs_rec})
765 {
766 if ($pot_zz =~ m/^RF/)
767 {
768 my $zz = $pot_zz;
769 my $name;
770 foreach $name (@{$brs_rec->{$zz}})
771 {
772 if ($zz eq "RFN") {
773 my $safe_name = $name;
774 &ghtml::urlsafe ($safe_name);
775 $zz_html .= "<tr><td><b>$zz:</b></td><td><a href=\"_httpquerypeople_&q=$safe_name\">$name</a></td></tr>\n";
776 } elsif ($zz =~ /^RFA[BN]$/) {
777 my $safe_name = $name;
778 &ghtml::urlsafe ($safe_name);
779 $zz_html .= "<tr><td><b>$zz:</b></td><td><a href=\"_httpqueryzzabn_&q=$safe_name\">$name</a></td></tr>\n";
780
781 } else {
782 $zz_html .= "<tr><td><b>$zz:</b></td><td>$name</td></tr>\n";
783 }
784 }
785 }
786 }
787 $zz_html .= "</table>\n";
788 $doc_obj->add_utf8_text($cursection, $zz_html);
789
790 # add OID - we'll use the catalog number (hoping it's unique)
791 my $cat_num = $brs_rec->{'CATN'};
792 if (!defined $cat_num) {
793 print $outhandle "\n***** No catalogue number for record ending at line ";
794 print $outhandle $brs_line_no-1, " in $file - THIS RECORD WILL BE IGNORED\n";
795 foreach my $v (keys(%$brs_rec)) {
796 if (ref($brs_rec->{$v}) eq "ARRAY") {
797 foreach my $i (@{$brs_rec->{$v}}) {
798 print STDERR "$v -> $i\n";
799 }
800 } else {
801 print STDERR $v . " -> " . $brs_rec->{$v} . "\n";
802 }
803 }
804 return;
805 }
806 if (defined ($cat_num_list{$cat_num})) {
807 print $outhandle "WARNING: catalog number $cat_num used more than once\n";
808 }
809 $doc_obj->set_OID ("bbc" . $cat_num);
810 $cat_num_list{$cat_num} = 1;
811
812 # process the document
813 $processor->process($doc_obj);
814 }
815 else
816 {
817 print $outhandle "BRS record empty\n";
818 }
819
820}
821
822sub read {
823 my $self = shift (@_);
824 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
825 my $outhandle = $self->{'outhandle'};
826
827 my $filename = &util::filename_cat($base_dir, $file);
828# return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
829 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
830 return undef;
831 }
832# my $plugin_name = ref ($self);
833 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
834
835 # create a new document
836# my $doc_obj = new doc ($filename, "indexed_doc");
837
838 # read in file ($text will be in utf8)
839# my $text = "";
840# $self->read_file ($filename, \$text);
841
842# if ($text !~ /\w/) {
843# print $outhandle "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'};
844# return 0;
845# }
846
847 # text is always plain ascii
848 undef $/;
849 open (FILE, $filename) || die;
850 my $text = <FILE>;
851 close FILE;
852 $/ = "\n";
853
854
855 # include any metadata passed in from previous plugins
856 # note that this metadata is associated with the top level section
857# $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
858
859 print $outhandle "BBC Sound Archive Plug for BRS format: processing $file\n"
860 if $self->{'verbosity'} > 1;
861
862 # reset line number count
863 $brs_line_no = 0;
864
865 my @records = split /\*\*\* BRS DOCUMENT BOUNDARY \*\*\*\s+/, $text;
866 $text = "";
867
868 foreach $record (@records) {
869 next if $record !~ /\w/; # first record will be empty
870 my $brs_rec = &read_brs_record(\$record, $file, $self->{'outhandle'});
[12265]871 &process_brs_record ($self, $processor, $brs_rec, $file, $self->{'outhandle'});
[4647]872 }
[12265]873
874 $self->{'num_processed'} += $brs_processed_count;
[4647]875
876 print $outhandle "\nNumber of BRS records = $brs_doc_count\n";
877 print $outhandle "Number of BRS records processed = $brs_processed_count\n";
878
879 return 1; # processed the file
880}
881
882sub end {
883 my $self = shift (@_);
884 my ($processor) = @_;
885
886 $processor->close_file_output() if defined $processor;
887}
888
889sub format_prognum {
890 my ($brs_rec) = @_;
891
892 return unless defined $brs_rec->{'PRNO'};
893
894 my $old_num = $brs_rec->{'PRNO'};
895
896 if (defined $brs_rec->{'magazine'}) {
897 if ($brs_rec->{'PRNO'} =~ /^\S\S /) { # space at position 3
898 if ($brs_rec->{'magazine'} =~ /(\d+)/) {
899 # algorithm A
900 my $magnumber = $1;
901 $brs_rec->{'PRNO'} =~ /(\S\S) (\S\S)(\S+)/;
902 $brs_rec->{'PRNO'} = $2 . $1 . $3 . $magnumber;
903 } else {
904 # common algorithm
905 $brs_rec->{'PRNO'} =~ /(\S\S) (\S\S)(\S+)/;
906 $brs_rec->{'PRNO'} = $2 . $1 . $3;
907 }
908 }
909 } else {
910 if ($brs_rec->{'PRNO'} =~ /^\S\S / && $brs_rec->{'PRNO'} !~ /:/) { # space at position 3 and no colon
911 # common algorithm
912 $brs_rec->{'PRNO'} =~ /(\S\S) (\S\S)(\S+)/;
913 $brs_rec->{'PRNO'} = $2 . $1 . $3;
914 } elsif ($brs_rec->{'PRNO'} =~ /^(\S):(\S\S) (\S\S)(\S+)/) { # colon at position 2 and space at position 5
915 # algorithm B
916 $brs_rec->{'PRNO'} = $3 . $2 . $4 . $1;
917 }
918 }
919
920# if ($brs_rec->{'PRNO'} ne $old_num) {
921# print STDERR "\n$old_num ... " . $brs_rec->{'PRNO'} . "\n";
922# }
923}
924
9251;
Note: See TracBrowser for help on using the repository browser.