source: trunk/bbc/collect/bbc/perllib/plugins/BRSPlug.pm@ 12747

Last change on this file since 12747 was 12747, checked in by kjdon, 18 years ago

added the description text here so that it doesn't need to go in greenstone's strings file

  • Property svn:keywords set to Author Date Id Revision
File size: 29.2 KB
RevLine 
[4647]1
2# plugin which processes an the BBC Sound archive exported in an ASCII format
3
4package BRSPlug;
5
6use BasPlug;
7use ghtml;
8
9sub BEGIN
10{
11 @ISA = ('BasPlug');
12 unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");
13}
14
[12265]15my $bbc_collections_list =
16 [{'name' => "all", # all collections
17 'desc' => "{BRSPlug.bbc_collections.all}"},
18 {'name' => "nsa", # National Sound Archive
19 'desc' => "{BRSPlug.bbc_collections.nsa"},
20 {'name' => "bfi", # Brisith File Institute
21 'desc' => "{BRSPlug.bbc_collections.bfi"},
22 {'name' => "bbcother", # BBC Other
23 'desc' => "{BRSPlug.bbc_collections.bbcother"}];
24
25
26my $arguments =
27 [{ 'name' => "bbc_collections",
28 'desc' => "{BRSPlug.bbc_collections}",
29 #'type' => $bbc_collections_list,
30 'type' => "string",
31 'reqd' => "no",
32 'deft' => "all"}];
33
[4647]34%Exclude = ();
35
[12265]36my $options = { 'name' => "BRSPlug",
[12747]37 'desc' => "BRSPlug deals with the BBC collections. There are three BBC collections we deliver, which are National Sound Archive (nsa), British Film Institute (bfi), and BBC Other (bbcother) (everything does not fit into one of the first two). Four options are provided which are all-build all collections, nsa-build the NSA collections, bfi-build the BFI collections, bbcother-build the OTHER collections.",
[12265]38 'abstract' => "yes",
39 'inherits' => "no",
40 'args' => $arguments };
[4647]41
42# Note: sext is short for scrollable text
43
44#field_id label disp_type crit_type width height brs_para
45#----------------------------------------------------------------
46#10 sport prdcr text text 30 1 T006
47#1 text sext text 40 16 TEXT
48#16 scol ref text text text 30 1 SCOL
49#19 date text date 10 1 D001
50#25 Item title text text 20 1 T001
51#26 B/W Seqs text text 30 1 S001
52#27 Medium text text 30 1 S002
53#28 shot type text text 30 1 S003
54#29 B/W Prog/Item text text 30 1 S004
55#30 Seqs text text 10 1 S005
56#31 Format text text 30 1 S006
57#32 prog/item text text 30 1 S007
58#35 reg libno text text 30 1 S008
59#36 reg tape no text text 30 1 S009
60#37 C Indicator text text 30 1 S010
61#38 C Holder text text 30 1 T002
62#39 cprd ind text text 30 1 S011
63#40 cprd name text text 30 1 T003
64#44 reference sext text 30 5 R001
65#49 prog num text text 10 1 PRNO
66#59 news title text text 30 1 T005
67#65 cat num text text 10 1 CATN
68#72 Record number text text 30 1 T007
69#73 rx_date text date 10 1 S013
70#74 Music performer text text 30 2 T008
71#75 Composer text text 30 2 T009
72#76 mono/stereo text text 30 1 S014
73#77 music medium text text 30 1 S015
74#82 Sound Tech text text 30 1 S016
75#83 Music publisher text text 30 1 S017
76#84 music tape text text 30 1 T012
77#89 catalogue text text 3 1 CODE
78#104 subcat text text 10 1 SUBC
79#106 prog title text text 40 1 TTLE
80#109 screen type text text 10 1 DOCT
81#110 stock title text text 30 1 T013
82#113 stock ln 4 text text 30 1 T015
83#114 stock lib 1 text text 30 1 S020
84#115 stock lib 2 text text 30 1 S021
85#----------------------------------------------------------------
86
87
88my @brs_field_table
89 = (
90 { field_name => "TTLE", field_id => 106, label => "Prog title",
91 disp_type => "text", crit_type => "text",
92 width => 40, height => 1 },
93 { field_name => "T001", field_id => 25, label => "Item title",
94 disp_type => "text", crit_type => "text",
95 width => 20, height => 1 },
96 { field_name => "T005", field_id => 59, label => "News title",
97 disp_type => "text", crit_type => "text",
98 width => 30, height => 1 },
99 { field_name => "T013", field_id => 110, label => "Stock title",
100 disp_type => "text", crit_type => "text",
101 width => 30, height => 1 },
102 { field_name => "T007", field_id => 72, label => "Record number",
103 disp_type => "text", crit_type => "text",
104 width => 30, height => 1 },
[12265]105 { field_name => "RFTI", field_id => 116, label => "RFTI",
106 disp_type => "text", crit_type => "text",
107 width => 30, height => 1 },
[4647]108 { field_name => "TEXT", field_id => 1, label => "Text",
109 disp_type => "scrl", crit_type => "text",
110 width => 40, height => 16 },
111 { field_name => "T006", field_id => 10, label => "Sport prdcr",
112 disp_type => "text", crit_type => "text",
113 width => 30, height => 1 },
114 { field_name => "SCOL", field_id => 16, label => "Scol ref text",
115 disp_type => "text", crit_type => "text",
116 width => 30, height => 1 },
117 { field_name => "D001", field_id => 19, label => "Date",
118 disp_type => "text", crit_type => "date",
119 width => 10, height => 1 },
120 { field_name => "S001", field_id => 26, label => "B/W Seqs",
121 disp_type => "text", crit_type => "text",
122 width => 30, height => 1 },
123 { field_name => "S002", field_id => 27, label => "Medium",
124 disp_type => "text", crit_type => "text",
125 width => 30, height => 1 },
126 { field_name => "S003", field_id => 28, label => "Shot type",
127 disp_type => "text", crit_type => "text",
128 width => 30, height => 1 },
129 { field_name => "S004", field_id => 29, label => "B/W Prog/Item",
130 disp_type => "text", crit_type => "text",
131 width => 30, height => 1 },
132 { field_name => "S005", field_id => 30, label => "Seqs",
133 disp_type => "text", crit_type => "text",
134 width => 10, height => 1 },
135 { field_name => "S006", field_id => 31, label => "Format",
136 disp_type => "text", crit_type => "text",
137 width => 30, height => 1 },
138 { field_name => "S007", field_id => 32, label => "Prog/item",
139 disp_type => "text", crit_type => "text",
140 width => 30, height => 1 },
141 { field_name => "S008", field_id => 35, label => "Reg libno",
142 disp_type => "text", crit_type => "text",
143 width => 30, height => 1 },
144 { field_name => "S009", field_id => 36, label => "Reg tape no",
145 disp_type => "text", crit_type => "text",
146 width => 30, height => 1 },
147 { field_name => "S010", field_id => 37, label => "C Indicator",
148 disp_type => "text", crit_type => "text",
149 width => 30, height => 1 },
150 { field_name => "T002", field_id => 38, label => "C Holder",
151 disp_type => "text", crit_type => "text",
152 width => 30, height => 1 },
153 { field_name => "S011", field_id => 39, label => "Cprd ind",
154 disp_type => "text", crit_type => "text",
155 width => 30, height => 1 },
156 { field_name => "T003", field_id => 40, label => "Cprd name",
157 disp_type => "text", crit_type => "text",
158 width => 30, height => 1 },
159 { field_name => "R001", field_id => 44, label => "Reference",
160 disp_type => "scrl", crit_type => "text",
161 width => 30, height => 5 },
162 { field_name => "PRNO", field_id => 49, label => "Prog num",
163 disp_type => "text", crit_type => "text",
164 width => 10, height => 1 },
165 { field_name => "CATN", field_id => 65, label => "Cat num",
166 disp_type => "text", crit_type => "text",
167 width => 10, height => 1 },
168 { field_name => "S013", field_id => 73, label => "Rx_date",
169 disp_type => "text", crit_type => "date",
170 width => 10, height => 1 },
171 { field_name => "T008", field_id => 74, label => "Music performer",
172 disp_type => "text", crit_type => "text",
173 width => 30, height => 2 },
174 { field_name => "T009", field_id => 75, label => "Composer",
175 disp_type => "text", crit_type => "text",
176 width => 30, height => 2 },
177 { field_name => "S014", field_id => 76, label => "Mono/stereo",
178 disp_type => "text", crit_type => "text",
179 width => 30, height => 1 },
180 { field_name => "S015", field_id => 77, label => "Music medium",
181 disp_type => "text", crit_type => "text",
182 width => 30, height => 1 },
183 { field_name => "S016", field_id => 82, label => "Sound Tech",
184 disp_type => "text", crit_type => "text",
185 width => 30, height => 1 },
186 { field_name => "S017", field_id => 83, label => "Music publisher",
187 disp_type => "text", crit_type => "text",
188 width => 30, height => 1 },
189 { field_name => "T012", field_id => 84, label => "Music tape",
190 disp_type => "text", crit_type => "text",
191 width => 30, height => 1 },
192 { field_name => "CODE", field_id => 89, label => "Catalogue",
193 disp_type => "text", crit_type => "text",
194 width => 3, height => 1 },
195 { field_name => "SUBC", field_id => 104, label => "Subcat",
196 disp_type => "text", crit_type => "text",
197 width => 10, height => 1 },
198 { field_name => "DOCT", field_id => 109, label => "Screen type",
199 disp_type => "text", crit_type => "text",
200 width => 10, height => 1 },
201 { field_name => "T015", field_id => 113, label => "Stock ln 4",
202 disp_type => "text", crit_type => "text",
203 width => 30, height => 1 },
204 { field_name => "S020", field_id => 114, label => "Stock lib 1",
205 disp_type => "text", crit_type => "text",
206 width => 30, height => 1 },
207 { field_name => "S021", field_id => 115, label => "Stock lib 2",
208 disp_type => "text", crit_type => "text",
209 width => 30, height => 1 }
210 );
211
212
213my @brs_title_list = ("TTLE", "T001", "T005", "T013", "T007", "RFTI");
214my %cat_num_list = ();
215
216sub new {
[12265]217 my $class = shift (@_);
[4647]218 my $self = new BasPlug ($class, @_);
[12265]219 $self->{'plugin_type'} = "BRSPlug";
[4647]220
[12265]221 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
222 my $option_list = $self->{'option_list'};
223 push( @{$option_list}, $options );
224
225 if (!parsargv::parse(\@_,
226 q^bbc_collections/(all|nsa|bfi|bbcother)/^, \$self->{'bbc_collections'},
227 "allow_extra_options")) {
228
229 print STDERR "\nIncorrect options passed to BRSPlug, check your collect.cfg configuration file\n";
230 $self->print_txt_usage(""); # Use default resource bundle
231 die "\n";
232 }
233
234 #my ($class) = @_;
235 #my $self = new BasPlug ($class, @_);
236
[4647]237 return bless $self, $class;
238}
239
240sub get_default_process_exp {
241 my $self = shift (@_);
242
243# return q^(?i)\.brs$^;
[12265]244 return q^TVRD.+^;
[4647]245}
246
247my $brs_doc_count = 0;
248my $brs_dot_count = 0;
249my $brs_line_no = 0;
250
251sub brs_error
252{
253 my ($file, $mess, $outhandle) = @_;
254
255 print $outhandle "Malformed BRS recorded at line $brs_line_no in $file:\n";
256 print $outhandle " $mess\n";
257}
258
259sub read_brs_record
260{
261 my ($recordref, $file, $outhandle) = @_;
262
263 my $extra_trigger = '\s*>';
264 my $text_divider = "-" x 78;
[12265]265 #my $text_end = '^\.\.[^:]+:$';
266 my $meta_key = '^\.\.(\w+):$';
267
[4647]268 # Print "." to signify processing if enough records have been read in
269 $brs_doc_count++;
270 if (($brs_doc_count % 10) == 0)
271 {
272 print $outhandle ".";
273 $brs_dot_count++;
274 print $outhandle "\n" if (($brs_dot_count % 80) == 0);
275 }
276
277 my $brs_rec = undef;
278 my $line = undef;
279
280 # read in the record
281 my @brs_lines = split /\n/, $$recordref;
282
283 while (scalar(@brs_lines)>0)
284 {
285 $line = shift(@brs_lines);
[12265]286
[4647]287 $line =~ s/\cM//g;
288
289 $brs_line_no++;
290
[12265]291# if ($line =~ m/^\.\.([^:]+):$/)
292 if ($line =~ m/$meta_key/)
[4647]293 {
294 my $field_name = $1;
295
296 if ($field_name eq "TEXT")
297 {
298 my $text_above = "";
299 my $text_extra = "";
300 my $text_below = "";
301
302 my $add_mode = "above";
303
304 while (scalar(@brs_lines)>0)
305 {
[12265]306
307 last if ($brs_lines[0] =~ m/$meta_key/);
308
309
310 $line = shift(@brs_lines);
[4647]311
[12265]312
[4647]313 $brs_line_no++;
314
315 if ($line =~ m/^$text_divider/)
316 {
317 $add_mode = "below";
318 next;
319 }
320
321 if ($line =~ m/^$extra_trigger/)
322 {
323 $text_extra .= "$line\n";
324 $add_mode = "below";
325 next;
326 }
[12265]327
328 if ($line =~ m/$meta_key/)
329 #if ($line =~ m/^\.\.[^:]+:/)
[4647]330 {
331 if ($line =~ m/^\.\.TEXT/)
332 {
333 $add_mode = "above";
334 next;
[12265]335 }
[4647]336 else
337 {
338 brs_error($file, "Unexpected field $line", $outhandle);
339 # put field name back
340 unshift(@brs_lines, $line);
341 $brs_line_no--;
342 last;
343 }
344 }
345
346 if ($line !~ m/^\s+(-){10,}$/)
347 {
348 if ($add_mode eq "above")
349 {
350 $text_above .= "$line ";
351 }
352 else
353 {
354 $text_below .= "$line ";
355 }
356 }
357 }
358
359 $brs_rec->{TEXTA} .= $text_above if ($text_above ne "");
360 $brs_rec->{TEXTB} .= $text_below if ($text_below ne "");
361 $brs_rec->{TEXTE} .= $text_extra if ($text_extra ne "");
362 }
363 elsif ($field_name =~ /^RF/)
364 {
[12265]365 # deal with the field name start with RF, eg. RFTI,RFAN, RFAB...
[4647]366 # read zz fields until end of record
367
368 while (scalar(@brs_lines)>0)
369 {
370 my $line_ahead = $brs_lines[0];
[12265]371 #last if ($line_ahead =~ m/^\.\.[^:]+:$/);
372 last if ($line_ahead =~ m/$meta_key/);
[4647]373 $line = shift(@brs_lines);
374 $brs_line_no++;
[12265]375 #if (!defined($brs_rec->{$field_name}))
376 #{
377 #$brs_rec->{$field_name} = [$line];
378 #}
379 #else
380 #{
381 #push(@{$brs_rec->{$field_name}},$line);
382 #}
383
[4647]384 if (!defined($brs_rec->{$field_name}))
385 {
[12265]386 $brs_rec->{$field_name} = (); #[$line];
[4647]387 }
[12265]388 push(@{$brs_rec->{$field_name}},$line);
[4647]389 }
390 }
391 else
392 {
393 my $field_entry = "";
[12265]394 #while ($brs_lines[0] !~ m/^\.\.[^:]+:$/)
395 while ($brs_lines[0] !~ m/$meta_key/)
[4647]396 {
397 $field_entry .= shift(@brs_lines);
398 $brs_line_no++;
399 last if (scalar(@brs_lines)==0);
400 }
401
402 if (!defined($brs_rec->{$field_name}))
403 {
404 $brs_rec->{$field_name} = $field_entry;
405 }
406 else
407 {
408 brs_error($file, "$field_name already defined.", $outhandle);
409 }
410 }
411 }
412 else
413 {
414 brs_error($file, "Malformed field: $line.", $outhandle);
415 }
416 }
417
418 # format the programme number if required
419 &format_prognum($brs_rec);
420
421 return ($brs_rec);
422}
423
424my $brs_processed_count = 0;
425
426sub brs_full_record_in_html
427{
428 my ($doc_obj,$cursection,$brs_rec) = @_;
429
430 my $html_table = "";
431 my $table_line = "";
432 my $cell_count = 0;
433 my $long_lines = "";
434
435 my $i;
436 for ($i=0; $i<=$#brs_field_table; $i++)
437 {
438 my $field_name = $brs_field_table[$i]->{'field_name'};
[12265]439
[4647]440 if (defined($brs_rec->{$field_name}))
441 {
442 my $field_label = $brs_field_table[$i]->{'label'};
443 my $field_value = $brs_rec->{$field_name};
444
445 # these fields get their own line
446 if ($field_label =~ /^(Prog|Item|News|Stock) title$/) {
447 my $safe_value = $field_value;
448 &ghtml::urlsafe ($safe_value);
449 $long_lines .= "<tr valign=top><td><b>$field_label:</b></td><td colspan=5>";
450 $long_lines .= "<a href=\"_httpquerytitle_&q=$safe_value\">$field_value</a></td></tr>\n";
[12265]451 $long_lines .= "$field_value </td></tr>\n";
[4647]452 } else {
453
454 if (($cell_count>0) && ($cell_count%3 == 0))
455 {
456 $html_table .= "\n<tr valign=top>\n$table_line\n</tr>\n";
457 $table_line = "";
458 }
459 $cell_count++;
460 if ($field_name eq "D001") # a Dublin core style date
461 {
462 my @mon_convert
463 = ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
464 "Jul", "Aug", "Sep", "Oct", "Nov", "Dec");
465
466 my ($year,$monnum,$day)
467 = ($field_value =~ m/(\d{4})(\d{2})(\d{2})/);
468 my $moneng = $mon_convert[$monnum-1];
469
470 $field_value = "$day $moneng $year";
471
472 } elsif ($field_name eq "PRNO") {
473 my $safe_value = $field_value;
474 &ghtml::urlsafe ($safe_value);
475 $field_value = "<a href=\"_httpqueryprogname_&q=$safe_value\">$field_value</a>";
476
477 } elsif ($field_name eq "CATN") {
478 my $safe_value = $field_value;
479 &ghtml::urlsafe ($safe_value);
480 $field_value = "<a href=\"_httpquerycatnum_&q=$safe_value\">$field_value</a>";
481
482 } elsif ($field_name eq "T003") {
483 my $safe_value = $field_value;
484 &ghtml::urlsafe ($safe_value);
485 $field_value = "<a href=\"_httpqueryzzabn_&q=$safe_value\">$field_value</a>";
486 }
[12265]487 if ($field_label =~ /^(RFTI)/){
488 $field_value = join(" ",@{$brs_rec->{'RFTI'}});
489 #print STDERR "**** $field_label, $field_value\n";
490 }
[4647]491 $table_line .= "<td><b>$field_label:</b></td><td>$field_value</td>\n";
492 }
493 }
494 }
495
496 if ($cell_count%3 != 0) {
497 $table_line .= "<td colspan=" . (3-($cell_count%3))*2 . "></td>";
498 }
499
500 $html_table .= "\n<tr>\n$table_line\n</tr>\n";
501 $html_table = "<table>\n$long_lines$html_table\n</table>\n";
502
503 $doc_obj->add_utf8_text($cursection, $html_table);
504}
505
506sub process_brs_record
507{
[12265]508 my $self = shift (@_);
509
[4647]510 my ($processor, $brs_rec, $file, $outhandle) = @_;
511
512 if (defined($brs_rec))
513 {
514 # only include those records in the chosen subcats - these lines
515 # should be commented out to build the entire collection
516
[12265]517 # National Sound Archive collection-Stephan
518 #if (!defined ($brs_rec->{'SUBC'}) ||
519 # $brs_rec->{'SUBC'} !~ /^(RADCOMP|RADIOX|RADPROG|RADNEWS|BDSPROG|UNCATRAD|RADIOREF|WILDSOUND)$/) {
520 # return;
521 #}
522
[4647]523 # British Film Institute collection
[12265]524 #if (!defined ($brs_rec->{'SUBC'}) ||
525 # $brs_rec->{'SUBC'} !~ /^(LONPROG|NEWS)$/) {
526 # return;
527 #}
528
[4647]529 # "the rest" collection
[12265]530 #if (!defined ($brs_rec->{'SUBC'}) ||
531 # $brs_rec->{'SUBC'} =~ /^(LONPROG|NEWS|RADCOMP|RADIOX|RADPROG|RADNEWS|BDSPROG|UNCATRAD|RADIOREF|WILDSOUND)$/) {
532 # return;
533 #}
534 #}
535
536 # modified Chi-Yu Huang
537 if (defined $self->{'bbc_collections'}){
538 # National Sound Archive collection
539 if ($self->{'bbc_collections'} eq "nsa"){
540 #print STDERR "***This is NSA collections\n";
541 if (!defined ($brs_rec->{'SUBC'}) ||
542 $brs_rec->{'SUBC'} !~ /^(RADCOMP|RADIOX|RADPROG|RADNEWS|BDSPROG|UNCATRAD|RADIOREF|WILDSOUND)$/) {
543 return;
544 }
545 }
546 elsif ($self->{'bbc_collections'} eq "bfi"){
547 print STDERR "***This is BFI collections\n";
548 # British Film Institute collection
549 if (!defined ($brs_rec->{'SUBC'}) ||
550 $brs_rec->{'SUBC'} !~ /^(LONPROG|NEWS)$/) {
551 return;
552 }
553 }
554 elsif ($self->{'bbc_collections'} eq "bbcother"){
555 #print STDERR "*** This is bbc Other collections\n";
556 # "the rest" collection
557 if (!defined ($brs_rec->{'SUBC'}) ||
558 $brs_rec->{'SUBC'} =~ /^(LONPROG|NEWS|RADCOMP|RADIOX|RADPROG|RADNEWS|BDSPROG|UNCATRAD|RADIOREF|WILDSOUND)$/) {
559 return;
560 }
561 } else {
562 # build the whole collections
563 #print STDERR "*** This is to build the whole collection\n";
564 }
565 } else {
566 #build the whole collections
[4647]567 }
568
569 # don't include the secret stuff ;-)
570 if (defined ($brs_rec->{'SUBC'}) && $brs_rec->{'SUBC'} eq "HISTORY") {
571 print STDERR "\nexcluding secret stuff - line $brs_line_no in $file\n";
572 return;
573 }
574
575
576 my $doc_obj = new doc ($file, "indexed_doc");
577 $brs_processed_count++;
578
579 my $cursection = $doc_obj->get_top_section();
580
581 my $found_match = "no";
582
583 my $pot_title;
584 foreach $pot_title ( @brs_title_list )
585 {
586 if (defined ($brs_rec->{$pot_title}))
587 {
588 my $title = "";
589
590 if ($pot_title =~ m/^zz/)
591 {
592 my $zz_join = join(" ", @{$brs_rec->{$pot_title}});
593 my @zz_split = split(" ", $zz_join);
594 map { $title .= "$_ " if ($_ !~ m/^xx/); } @zz_split;
595 }
596 else
597 {
[12265]598 if ($pot_title =~ /^RF/){
599 $title = join (" ", @{$brs_rec->{$pot_title}});
600 } else{
601 $title = $brs_rec->{$pot_title};
602 }
603 #$title = $brs_rec->{$pot_title};
[4647]604 }
605
[12265]606 #print STDERR "*** What is the title=$title\n";
607
[4647]608 my $tl_ref = $doc_obj->get_metadata ($cursection, "Title");
609
610 if (scalar(@$tl_ref==0))
611 {
612 $doc_obj->add_utf8_metadata ($cursection, "Title", $title);
613 }
614 else
615 {
616 my $exists = "no";
617
618 map { $exists = "yes" if ($title eq $_); } @$tl_ref;
619 if ($exists eq "no")
620 {
621 $doc_obj->add_utf8_metadata($cursection, "Title", $title);
622 }
623 }
624 $found_match = "yes";
625 }
626 }
627 if ($found_match eq "no")
628 {
629 if (defined($brs_rec->{TEXTA}))
630 {
631 my $sub_title = substr($brs_rec->{TEXTA},0,60) . " ... ";
632 $doc_obj->add_utf8_metadata ($cursection, "Title", $sub_title);
633 }
634 elsif (defined($brs_rec->{TEXTB}))
635 {
636 my $sub_title = substr($brs_rec->{TEXTB},0,60) . " ... ";
637 $doc_obj->add_utf8_metadata ($cursection, "Title", $sub_title);
638 }
639 else
640 {
641 print $outhandle "\nNo title or text for record ending at line ";
642 print $outhandle $brs_line_no-1, " in $file\n";
643 }
644 }
645
646 if (defined($brs_rec->{'T001'})) # item title
647 {
648 $doc_obj->add_utf8_metadata ($cursection, "ItemTitle", $brs_rec->{'T001'});
649 }
650
651 if (defined($brs_rec->{'D001'})) # date
652 {
653 $doc_obj->add_utf8_metadata ($cursection, "Date", $brs_rec->{'D001'});
654 }
655 else
656 {
657 print $outhandle "\nNo date for record ending at line ";
658 print $outhandle $brs_line_no-1, " in $file\n";
659 }
660
661 if (defined($brs_rec->{'PRNO'})) # programme number
662 {
663 $doc_obj->add_utf8_metadata ($cursection, "ProgNumber", $brs_rec->{'PRNO'});
664 }
665
666 if (defined($brs_rec->{'CATN'})) # catalogue number
667 {
668 $doc_obj->add_utf8_metadata ($cursection, "CatNum", $brs_rec->{'CATN'});
669 }
670
671 # RFAN, RFAB and T003 fields
672 if (defined($brs_rec->{'RFAB'}) || defined($brs_rec->{'RFAN'}) || defined($brs_rec->{'T003'}))
673 {
674 my $zzabn = "";
675 if (defined($brs_rec->{'RFAB'})) {
676 foreach my $a (@{$brs_rec->{'RFAB'}}) {
677 $a =~ s/[^a-zA-Z0-9]//g;
678 $zzabn .= " " . $a;
679 }
680 }
681 if (defined($brs_rec->{'RFAN'})) {
682 foreach my $a (@{$brs_rec->{'RFAN'}}) {
683 $a =~ s/[^a-zA-Z0-9]//g;
684 $zzabn .= " " . $a;
685 }
686 }
687 if (defined($brs_rec->{'T003'})) {
688 my $value = $brs_rec->{'T003'};
689 $value =~ s/[^a-zA-Z0-9]//g;
690 $zzabn .= " " . $value;
691 }
692 $doc_obj->add_utf8_metadata ($cursection, "zzabn", $zzabn);
693 }
[12265]694
[4647]695 if (defined($brs_rec->{'RFN'}))
696 {
697 my $name;
698 foreach $name (@{$brs_rec->{'RFN'}})
699 {
700 $doc_obj->add_utf8_metadata ($cursection, "People", $name);
701 }
702 }
703
704 brs_full_record_in_html($doc_obj,$cursection,$brs_rec);
705
706 if (defined($brs_rec->{TEXTA}))
707 {
708
709 my $desc_texta = "<h3>Description</h3>$brs_rec->{TEXTA}";
710 $doc_obj->add_utf8_text ($cursection, $desc_texta);
711
712 if (defined($brs_rec->{TEXTE}))
713 {
714 my $raw_texte = $brs_rec->{TEXTE};
715 $raw_texte =~ s/^\s*>\s*/<li>/mg;
716
717 my $test_empty = $raw_texte;
718 $test_empty =~ s/(<li>)+//g;
719
720 if ($test_empty ne "")
721 {
722 my $desc_texte = $raw_texte;
723 $desc_texte =~ s/^<li>(.*)$/<li><i>$1<\/i>/mg;
724 $desc_texte = "<p>Additional:<ul>$desc_texte</ul>";
725 $doc_obj->add_utf8_text ($cursection, $desc_texte);
726 }
727 }
728 }
729
730 if (defined($brs_rec->{TEXTB}))
731 {
732 my $desc_textb = "<h3>Comments</h3>$brs_rec->{TEXTB}";
733 $doc_obj->add_utf8_text($cursection, $desc_textb);
734 }
735
736 my $full_text = "";
737 $full_text .= "$brs_rec->{TEXTA}\n" if (defined($brs_rec->{TEXTA}));
738 $full_text .= "$brs_rec->{TEXTE}\n" if (defined($brs_rec->{TEXTE}));
739 $full_text .= "$brs_rec->{TEXTB}\n" if (defined($brs_rec->{TEXTB}));
740
741 my @zz_list = ( "RFPG", "RFLO", "RFN" );
742 my $zz;
743 foreach $zz ( @zz_list)
744 {
745 if (defined($brs_rec->{$zz}))
746 {
747 my $filtered_text = "";
748
749 my $name;
750 foreach $name (@{$brs_rec->{$zz}})
751 {
752 my @split_zz = split(' ',$name);
753 map { $filtered_text .= "$_ " if ($_ =~ m/^xx/) } @split_zz
754 }
755
756 $full_text .= "$filtered_text\n";
757 }
758 }
759
760 # format zz fields (tail end of record)
761 my $zz_html = "<table>\n";
762
763 my $pot_zz;
764 foreach $pot_zz (keys %{$brs_rec})
765 {
766 if ($pot_zz =~ m/^RF/)
767 {
768 my $zz = $pot_zz;
769 my $name;
770 foreach $name (@{$brs_rec->{$zz}})
771 {
772 if ($zz eq "RFN") {
773 my $safe_name = $name;
774 &ghtml::urlsafe ($safe_name);
775 $zz_html .= "<tr><td><b>$zz:</b></td><td><a href=\"_httpquerypeople_&q=$safe_name\">$name</a></td></tr>\n";
776 } elsif ($zz =~ /^RFA[BN]$/) {
777 my $safe_name = $name;
778 &ghtml::urlsafe ($safe_name);
779 $zz_html .= "<tr><td><b>$zz:</b></td><td><a href=\"_httpqueryzzabn_&q=$safe_name\">$name</a></td></tr>\n";
780
781 } else {
782 $zz_html .= "<tr><td><b>$zz:</b></td><td>$name</td></tr>\n";
783 }
784 }
785 }
786 }
787 $zz_html .= "</table>\n";
788 $doc_obj->add_utf8_text($cursection, $zz_html);
789
790 # add OID - we'll use the catalog number (hoping it's unique)
791 my $cat_num = $brs_rec->{'CATN'};
792 if (!defined $cat_num) {
793 print $outhandle "\n***** No catalogue number for record ending at line ";
794 print $outhandle $brs_line_no-1, " in $file - THIS RECORD WILL BE IGNORED\n";
795 foreach my $v (keys(%$brs_rec)) {
796 if (ref($brs_rec->{$v}) eq "ARRAY") {
797 foreach my $i (@{$brs_rec->{$v}}) {
798 print STDERR "$v -> $i\n";
799 }
800 } else {
801 print STDERR $v . " -> " . $brs_rec->{$v} . "\n";
802 }
803 }
804 return;
805 }
806 if (defined ($cat_num_list{$cat_num})) {
807 print $outhandle "WARNING: catalog number $cat_num used more than once\n";
808 }
809 $doc_obj->set_OID ("bbc" . $cat_num);
810 $cat_num_list{$cat_num} = 1;
811
812 # process the document
813 $processor->process($doc_obj);
814 }
815 else
816 {
817 print $outhandle "BRS record empty\n";
818 }
819
820}
821
822sub read {
823 my $self = shift (@_);
824 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
825 my $outhandle = $self->{'outhandle'};
826
827 my $filename = &util::filename_cat($base_dir, $file);
828# return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
829 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
830 return undef;
831 }
832# my $plugin_name = ref ($self);
833 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
834
835 # create a new document
836# my $doc_obj = new doc ($filename, "indexed_doc");
837
838 # read in file ($text will be in utf8)
839# my $text = "";
840# $self->read_file ($filename, \$text);
841
842# if ($text !~ /\w/) {
843# print $outhandle "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'};
844# return 0;
845# }
846
847 # text is always plain ascii
848 undef $/;
849 open (FILE, $filename) || die;
850 my $text = <FILE>;
851 close FILE;
852 $/ = "\n";
853
854
855 # include any metadata passed in from previous plugins
856 # note that this metadata is associated with the top level section
857# $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
858
859 print $outhandle "BBC Sound Archive Plug for BRS format: processing $file\n"
860 if $self->{'verbosity'} > 1;
861
862 # reset line number count
863 $brs_line_no = 0;
864
865 my @records = split /\*\*\* BRS DOCUMENT BOUNDARY \*\*\*\s+/, $text;
866 $text = "";
867
868 foreach $record (@records) {
869 next if $record !~ /\w/; # first record will be empty
870 my $brs_rec = &read_brs_record(\$record, $file, $self->{'outhandle'});
[12265]871 &process_brs_record ($self, $processor, $brs_rec, $file, $self->{'outhandle'});
[4647]872 }
[12265]873
874 $self->{'num_processed'} += $brs_processed_count;
[4647]875
876 print $outhandle "\nNumber of BRS records = $brs_doc_count\n";
877 print $outhandle "Number of BRS records processed = $brs_processed_count\n";
878
879 return 1; # processed the file
880}
881
882sub end {
883 my $self = shift (@_);
884 my ($processor) = @_;
885
886 $processor->close_file_output() if defined $processor;
887}
888
889sub format_prognum {
890 my ($brs_rec) = @_;
891
892 return unless defined $brs_rec->{'PRNO'};
893
894 my $old_num = $brs_rec->{'PRNO'};
895
896 if (defined $brs_rec->{'magazine'}) {
897 if ($brs_rec->{'PRNO'} =~ /^\S\S /) { # space at position 3
898 if ($brs_rec->{'magazine'} =~ /(\d+)/) {
899 # algorithm A
900 my $magnumber = $1;
901 $brs_rec->{'PRNO'} =~ /(\S\S) (\S\S)(\S+)/;
902 $brs_rec->{'PRNO'} = $2 . $1 . $3 . $magnumber;
903 } else {
904 # common algorithm
905 $brs_rec->{'PRNO'} =~ /(\S\S) (\S\S)(\S+)/;
906 $brs_rec->{'PRNO'} = $2 . $1 . $3;
907 }
908 }
909 } else {
910 if ($brs_rec->{'PRNO'} =~ /^\S\S / && $brs_rec->{'PRNO'} !~ /:/) { # space at position 3 and no colon
911 # common algorithm
912 $brs_rec->{'PRNO'} =~ /(\S\S) (\S\S)(\S+)/;
913 $brs_rec->{'PRNO'} = $2 . $1 . $3;
914 } elsif ($brs_rec->{'PRNO'} =~ /^(\S):(\S\S) (\S\S)(\S+)/) { # colon at position 2 and space at position 5
915 # algorithm B
916 $brs_rec->{'PRNO'} = $3 . $2 . $4 . $1;
917 }
918 }
919
920# if ($brs_rec->{'PRNO'} ne $old_num) {
921# print STDERR "\n$old_num ... " . $brs_rec->{'PRNO'} . "\n";
922# }
923}
924
9251;
Note: See TracBrowser for help on using the repository browser.