Changeset 12202


Ignore:
Timestamp:
2006-07-13T12:45:37+12:00 (18 years ago)
Author:
mdewsnip
Message:

Added new code for parsing the title and author from place references, and improved the RTF to HTML conversion.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/cic-hcap/perllib/plugins/CICPlug.pm

    r12185 r12202  
    710710    my $place_references = $place_references_sql_handle->fetchrow();
    711711    if (defined($place_references)) {
    712         &new_metadata_entry($place_doc_obj, "References", &rtf_to_html($place_references));
     712        $self->add_place_references_metadata($place_doc_obj, $place_id, $place_references);
    713713    }
    714714
     
    928928
    929929
     930sub add_place_references_metadata
     931{
            # Records the references of one place as metadata on $place_doc_obj.
            #
            # Arguments (after the invocant $self):
            #   $place_doc_obj               - object handed to new_metadata_entry()
            #   $place_id                    - only used in the warning messages below
            #   $place_references_rtf_string - the raw references text, in RTF
            #
            # Metadata entries written through new_metadata_entry():
            #   "PlaceReferences" - the whole references text after rtf_to_html()
            #   "Reference"       - one entry per individual reference
            #   "ReferenceAuthor" - author part, when Case 1 below matches
            #   "ReferenceTitle"  - title part, when Case 1 or Case 2 matches
            #
            # References that match neither case are reported as warnings on
            # STDERR and on the handle stored in $self->{'failhandle'}.
     932    my $self = shift(@_);
     933    my ($place_doc_obj, $place_id, $place_references_rtf_string) = (@_);
     934    my $fail_log_handle = $self->{'failhandle'};
     935
     936    # Convert the place references from RTF to HTML
     937    my $place_references_html_string = &rtf_to_html($place_references_rtf_string);
     938    &new_metadata_entry($place_doc_obj, "PlaceReferences", $place_references_html_string);
     939
     940    # Split the references and try to parse title and author
            # Line breaks are removed first so that a blank line between two
            # references always appears as the exact separator "<br /><br />".
     941    $place_references_html_string =~ s/(\r|\n)//g;
     942    my @place_references = split(/<br \/><br \/>/, $place_references_html_string);
     943    foreach my $place_reference (@place_references) {
            # Drop any trailing "<br />" (with optional whitespace), then skip
            # pieces that contain no word character at all.
     944    $place_reference =~ s/(<br \/>\s*)*$//;
     945    next if ($place_reference !~ /\w/);
     946    &new_metadata_entry($place_doc_obj, "Reference", $place_reference);
     947
     948    # Case 1: Author (possibly empty), then title in italics or quotes
            # The italics pattern is tried first: $1 is everything before the
            # first <i>, $2 is the italic text. Only if that fails is the
            # double-quote pattern tried; its leading group is greedy, so it
            # picks the last pair of double quotes that can match.
     949    if ($place_reference =~ /^(.*?)<i>(.*?)<\/i>/ || $place_reference =~ /^(.*)"(.*?)"/) {
     950        &new_metadata_entry($place_doc_obj, "ReferenceAuthor", $1);
     951        &new_metadata_entry($place_doc_obj, "ReferenceTitle", $2);
     952    }
     953    # Case 2: Zero or one fullstops, assume no author and title is complete text
     954    elsif ($place_reference =~ /^[^\.]*\.[^\.]*$/ || $place_reference !~ /\./) {
     955        &new_metadata_entry($place_doc_obj, "ReferenceTitle", $place_reference);
     956    }
     957    else {
            # Unparseable reference: no author/title metadata is added (the
            # "Reference" entry above has already been recorded). The
            # <ProcessingError> line is only printed when $self->{'gli'} is true.
            # NOTE(review): $place_reference is interpolated unescaped into the
            # r='...' attribute and may itself contain quotes or HTML tags --
            # confirm that the consumer of this line tolerates that.
     958        print STDERR "<ProcessingError n='Place $place_id' p='CICPlug' r='Could not parse reference: $place_reference'>\n" if ($self->{'gli'});
     959        print STDERR "Warning: Place $place_id -- Could not parse reference: $place_reference\n";
     960        print $fail_log_handle "Warning: Place $place_id -- Could not parse reference: $place_reference\n";
     961    }
     962    }
     963}
     964
     965
    930966sub rtf_to_html
    931967{
    932968    my $rtf_string = shift(@_);
    933969    $rtf_string =~ s/\{(.*?)\}//g;
     970    $rtf_string =~ s/\\ldblquote /"/g;
     971    $rtf_string =~ s/\\rdblquote /"/g;
    934972    $rtf_string =~ s/\\rquote /'/g;  # ' # (for Emacs)
    935973    $rtf_string =~ s/\\pard//g;
    936974    $rtf_string =~ s/\\par/<br \/>/g;
     975    $rtf_string =~ s/\\ul /<i>/g;
     976    $rtf_string =~ s/\\ulnone /<\/i>/g;
    937977    $rtf_string =~ s/\\i0 /<\/i>/g;
     978    $rtf_string =~ s/\\i0\\/<\/i>\\/g;
    938979    $rtf_string =~ s/\\i /<i>/g;
     980    $rtf_string =~ s/\\i\\/<i>\\/g;
    939981    $rtf_string =~ s/\\~/ /g;
    940982    $rtf_string =~ s/\\([A-Za-z0-9]+)//g;
Note: See TracChangeset for help on using the changeset viewer.