Ignore:
Timestamp:
2021-02-26T19:39:51+13:00 (3 years ago)
Author:
anupama
Message:

Committing the improvements to EmbeddedMetaPlugin's processing of Keywords vs other metadata fields. Keywords were literally stored as arrays of words rather than phrases in PDFs (at least in Diego's sample PDF), whereas other meta fields like Subjects and Creators were stored as arrays of phrases. To get both to work, Kathy updated EXIF to a newer version, to retrieve the actual EXIF values stored in the PDF. And Kathy and Dr Bainbridge came up with a new option that I added called apply_join_before_split_to_metafields, which is a regex that lists the metadata fields to apply join_before_split to; previously join_before_split always got applied to all metadata fields. Now it's applied to any *Keywords metafields by default, as that's the metafield we have experience of that behaves differently to the others, as it stores by word instead of phrases. Tested on Diego's sample PDF. Diego has double-checked that it works on his sample PDF too, setting the split char to ; and turning on the join_before_split and leaving apply_join_before_split_to_metafields at its default of .*Keywords. File changes are strings.properties for the tooltip, the plugin introducing the option and working with it, and Kathy's EXIF updates affecting cpan/File and cpan/Image.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/cpan/Image/ExifTool/MXF.pm

    r24107 r34921  
    3737use vars qw($VERSION);
    3838use Image::ExifTool qw(:DataAccess :Utils);
    39 
    40 $VERSION = '1.02';
     39use Image::ExifTool::GPS;
     40
     41$VERSION = '1.08';
    4142
    4243sub ProcessPrimer($$$);
     
    5859    Lat => 1,       Timestamp => 1,
    5960    Length => 1,    UID => 1,
    60            
    6161);
    6262
     
    7979my %geoLat = (
    8080    Groups => { 2 => 'Location' },
    81     PrintConv => 'require Image::ExifTool::GPS; Image::ExifTool::GPS::ToDMS($self, $val, 1, "N")',
     81    PrintConv => 'Image::ExifTool::GPS::ToDMS($self, $val, 1, "N")',
    8282);
    8383my %geoLon = (
    8484    Groups => { 2 => 'Location' },
    85     PrintConv => 'require Image::ExifTool::GPS; Image::ExifTool::GPS::ToDMS($self, $val, 1, "E")',
     85    PrintConv => 'Image::ExifTool::GPS::ToDMS($self, $val, 1, "E")',
    8686);
    8787my %geoLatLon = (
    8888    Groups => { 2 => 'Location' },
    8989    PrintConv => q{
    90         require Image::ExifTool::GPS;
    9190        my ($lat, $lon) = split ' ', $val;
    9291        $lat = Image::ExifTool::GPS::ToDMS($self, $lat, 1, 'N');
     
    118117%Image::ExifTool::MXF::Main = (
    119118    GROUPS => { 2 => 'Video' },
    120     PROCESS_PROC => 0,  # set this to zero to omit tags from lookup (way too many!)
    121     VARS => { NO_ID => 1 }, # tag ID's are too bulky for documentation
     119    VARS => { NO_LOOKUP => 1, NO_ID => 1 }, # tag ID's are too bulky
    122120    NOTES => q{
    123121        Tags extracted from Material Exchange Format files.  Tag ID's are not listed
     
    268266  # '060e2b34.0101.0101.02080000.00000000' => { Name => 'Security', Type => 'Node' },
    269267  # '060e2b34.0101.0101.02080100.00000000' => { Name => 'SystemAccess', Type => 'Node' },
    270     '060e2b34.0101.0101.02080101.00000000' => { Name => 'Username', Format => 'string' },
    271     '060e2b34.0101.0101.02080101.01000000' => { Name => 'Username', Type => 'UTF-16' },
     268    '060e2b34.0101.0101.02080101.00000000' => { Name => 'UserName', Format => 'string' },
     269    '060e2b34.0101.0101.02080101.01000000' => { Name => 'UserName', Type => 'UTF-16' },
    272270    '060e2b34.0101.0101.02080102.00000000' => { Name => 'Password', Format => 'string' },
    273271    '060e2b34.0101.0101.02080102.01000000' => { Name => 'Password', Type => 'UTF-16' },
     
    456454    '060e2b34.0101.0101.04010502.02000000' => { Name => 'ImageWidth', Format => 'int32u' }, # (renamed from StoredWidth)
    457455  # '060e2b34.0101.0101.04010503.00000000' => { Name => 'DigitalQuantizationAndLevelParameters', Type => 'Node' },
    458     '060e2b34.0101.0101.04010503.01000000' => { Name => 'BitsperPixel', Format => 'int8u' },
     456    '060e2b34.0101.0101.04010503.01000000' => { Name => 'BitsPerPixel', Format => 'int8u' },
    459457    '060e2b34.0101.0101.04010503.02000000' => { Name => 'RoundingMethodCode', Format => 'string' },
    460458    '060e2b34.0101.0101.04010503.03000000' => { Name => 'BlackReferenceLevel', Format => 'int32u' },
     
    785783    '060e2b34.0101.0101.07012001.10030100' => { Name => 'TelephoneNumber', Format => 'string' },
    786784    '060e2b34.0101.0101.07012001.10030200' => { Name => 'FaxNumber', Format => 'string' },
    787     '060e2b34.0101.0101.07012001.10030300' => { Name => 'E-MailAddress', Format => 'string' },
     785    '060e2b34.0101.0101.07012001.10030300' => { Name => 'E-mailAddress', Format => 'string' },
    788786  # '060e2b34.0101.0101.07012002.00000000' => { Name => 'PlaceDescriptions', Type => 'Node' },
    789787    '060e2b34.0101.0101.07012002.01000000' => { Name => 'SettingDescription', Format => 'string' },
     
    808806    '060e2b34.0101.0101.07020102.04010000' => { Name => 'TimecodeEndDateTime', Type => 'UILSBF', Unknown => 1, Groups => { 2 => 'Time' } },
    809807  # '060e2b34.0101.0101.07020102.05000000' => { Name => 'MaterialOccurrenceTrueDateTime', Type => 'Node' },
    810     '060e2b34.0101.0101.07020102.05010000' => { Name => 'UTCLastModifyDate', Format => 'string' },
    811     '060e2b34.0101.0101.07020102.05020000' => { Name => 'LocalLastModifyDate', Format => 'string' },
     808    '060e2b34.0101.0101.07020102.05010000' => { Name => 'UTCLastModifyDate', Format => 'string', Groups => { 2 => 'Time' } },
     809    '060e2b34.0101.0101.07020102.05020000' => { Name => 'LocalLastModifyDate', Format => 'string', Groups => { 2 => 'Time' } },
    812810  # '060e2b34.0101.0101.07020102.06000000' => { Name => 'MaterialOccurrenceTimeAddress', Type => 'Node' },
    813811    '060e2b34.0101.0101.07020102.06010000' => { Name => 'TimecodeLastModifyDate', Type => 'UILSBF', Unknown => 1, Groups => { 2 => 'Time' } },
     
    845843    '060e2b34.0101.0101.07020110.01020000' => { Name => 'TimecodeCreationDateTime', Type => 'UILSBF', Unknown => 1, Groups => { 2 => 'Time' } },
    846844  # '060e2b34.0101.0101.07020110.02000000' => { Name => 'ModifyDate', Type => 'Node' },
    847     '060e2b34.0101.0101.07020110.02010000' => { Name => 'LocalModifyDate', Format => 'string' },
     845    '060e2b34.0101.0101.07020110.02010000' => { Name => 'LocalModifyDate', Format => 'string', Groups => { 2 => 'Time' } },
    848846    '060e2b34.0101.0101.07020110.02020000' => { Name => 'TimecodeModifyDate', Type => 'UILSBF', Unknown => 1, Groups => { 2 => 'Time' } },
    849847  # '060e2b34.0101.0101.07020200.00000000' => { Name => 'Durations', Type => 'Node' },
     
    10621060    '060e2b34.0101.0102.04010401.01000000' => { Name => 'AnalogVideoSystemName', Type => 'VideoSignalType', Unknown => 1 },
    10631061    '060e2b34.0101.0102.04010501.10000000' => { Name => 'VerticalSub-sampling', Format => 'int32u' },
    1064     '060e2b34.0101.0102.04010503.01010000' => { Name => 'BitsperPixel', Format => 'int32u' },
     1062    '060e2b34.0101.0102.04010503.01010000' => { Name => 'BitsPerPixel', Format => 'int32u' },
    10651063    '060e2b34.0101.0102.04010503.05000000' => { Name => 'ColorRangeLevels', Format => 'int32u' },
    10661064    '060e2b34.0101.0102.04010503.06000000' => { Name => 'PixelLayout', Type => 'RGBALayout', Unknown => 1 },
     
    13571355    '060e2b34.0101.0103.02080203.00000000' => { Name => 'ClassifiedBy', Format => 'string' },
    13581356    '060e2b34.0101.0103.02080204.00000000' => { Name => 'ClassificationReason', Format => 'string' },
    1359     '060e2b34.0101.0103.02080205.00000000' => { Name => 'DeclassificationDate', Format => 'string' },
     1357    '060e2b34.0101.0103.02080205.00000000' => { Name => 'DeclassificationDate', Format => 'string', Groups => { 2 => 'Time' } },
    13601358    '060e2b34.0101.0103.02080206.00000000' => { Name => 'DerivedFrom', Format => 'string' },
    13611359    '060e2b34.0101.0103.02080207.00000000' => { Name => 'ClassificationComment', Format => 'string' },
     
    13671365    '060e2b34.0101.0103.02300101.01000000' => { Name => 'NatureOfPersonality', Type => 'UTF-16' },
    13681366    '060e2b34.0101.0103.02300102.01010000' => { Name => 'ContributionStatus', Type => 'UTF-16' },
    1369     '060e2b34.0101.0103.02300103.01010000' => { Name => 'SupportorAdministrationStatus', Type => 'UTF-16' },
     1367    '060e2b34.0101.0103.02300103.01010000' => { Name => 'SupportOrAdministrationStatus', Type => 'UTF-16' },
    13701368    '060e2b34.0101.0103.02300201.01000000' => { Name => 'OrganizationKind', Type => 'UTF-16' },
    13711369    '060e2b34.0101.0103.02300202.01010000' => { Name => 'ProductionOrganizationRole', Type => 'UTF-16' },
     
    15251523    '060e2b34.0101.0103.07012001.10030101' => { Name => 'TelephoneNumber', Type => 'UTF-16' },
    15261524    '060e2b34.0101.0103.07012001.10030201' => { Name => 'FaxNumber', Type => 'UTF-16' },
    1527     '060e2b34.0101.0103.07012001.10030301' => { Name => 'E-MailAddress', Type => 'UTF-16' },
     1525    '060e2b34.0101.0103.07012001.10030301' => { Name => 'E-mailAddress', Type => 'UTF-16' },
    15281526    '060e2b34.0101.0103.07012002.01010000' => { Name => 'SettingDescription', Type => 'UTF-16' },
    15291527    '060e2b34.0101.0103.07020101.01050000' => { Name => 'POSIXMicroseconds', Format => 'int64u' },
     
    24182416);
    24192417
    2420 # header information 
     2418# header information
    24212419%Image::ExifTool::MXF::Header = (
    24222420    GROUPS => { 2 => 'Video' },
     
    24792477sub ReadMXFValue($$$)
    24802478{
    2481     my ($exifTool, $val, $type) = @_;
     2479    my ($et, $val, $type) = @_;
    24822480    my $len = length($val);
    24832481    local $_;
    24842482
    24852483    if ($type eq 'UTF-16') {
    2486         $val = $exifTool->Decode($val, 'UCS2'); # (until we handle UTF-16 properly)
     2484        $val = $et->Decode($val, 'UCS2'); # (until we handle UTF-16 properly)
    24872485    } elsif ($type eq 'ProductVersion') {
    24882486        my @a = unpack('n*', $val);
     
    25282526        my ($count, $size) = unpack('NN', $val);
    25292527        # validate data length
    2530         $len == 8 + $count * $size or $exifTool->WarnOnce("Bad array or batch size");
     2528        $len == 8 + $count * $size or $et->WarnOnce("Bad array or batch size");
    25312529        my ($i, @a);
    25322530        for ($i=0; $i<$count; ++$i) {
     
    25382536            $_ = join('-', unpack('H8H4H4H4H12', $_)) foreach @a;
    25392537        } elsif ($type eq 'BatchOfUL' or $type =~ /^WeakReference/) {
    2540             $_ = ReadMXFValue($exifTool, $_, 'UL') foreach @a;
     2538            $_ = ReadMXFValue($et, $_, 'UL') foreach @a;
    25412539        }
    25422540        $val = \@a;
     
    25712569sub ProcessPrimer($$$)
    25722570{
    2573     my ($exifTool, $dirInfo, $tagTablePtr) = @_;
     2571    my ($et, $dirInfo, $tagTablePtr) = @_;
    25742572    my $dataPt = $$dirInfo{DataPt};
    25752573    my $end = $$dirInfo{DirLen};
     
    25782576    my $size = Get32u($dataPt, 4);
    25792577    return 0 unless $size >= 18;
    2580     $exifTool->VerboseDir('MXF Primer', $count);
    2581     my $verbose = $exifTool->Options('Verbose');
    2582     my $primer = $$exifTool{MXFInfo}{Primer};
     2578    $et->VerboseDir('MXF Primer', $count);
     2579    my $verbose = $et->Options('Verbose');
     2580    my $primer = $$et{MXFInfo}{Primer};
    25832581    my $pos = 8;
    25842582    my $i;
     
    25942592        my $indx = $i . ')';
    25952593        $indx .= ' ' if length($indx) < 3;
    2596         $exifTool->VPrint(0, sprintf("  | $indx 0x%.4x => '$global'\n", $local));
     2594        $et->VPrint(0, sprintf("  | $indx 0x%.4x => '${global}'\n", $local));
    25972595    }
    25982596    return 1;
     
    26052603sub ProcessLocalSet($$$)
    26062604{
    2607     my ($exifTool, $dirInfo, $tagTablePtr) = @_;
     2605    my ($et, $dirInfo, $tagTablePtr) = @_;
    26082606    local $_;
    26092607    my $dataPt = $$dirInfo{DataPt};
    26102608    my $dataPos = $$dirInfo{DataPos};
    26112609    my $end = $$dirInfo{DirLen};
    2612     my $mxfInfo = $$exifTool{MXFInfo};
     2610    my $mxfInfo = $$et{MXFInfo};
    26132611    my $primer = $$mxfInfo{Primer};
    26142612    my (@strongRef, @groups, $instance, $editRate, $trackID, $langCode, $textLang);
    26152613
    2616     $exifTool->VerboseDir('MXF LocalSet', undef, $end);
     2614    $et->VerboseDir('MXF LocalSet', undef, $end);
    26172615
    26182616    # loop through all tags in this local set
     
    26292627        } else {
    26302628            $tag = $loc;
    2631           # $exifTool->WarnOnce('Missing local key for at least one tag');
     2629          # $et->WarnOnce('Missing local key for at least one tag');
    26322630            $extra = ', NOT IN PRIMER!';
    26332631        }
     
    26372635            $type = $$tagInfo{Type};
    26382636            if ($type and $knownType{$type}) {
    2639                 $val = ReadMXFValue($exifTool, substr($$dataPt, $pos, $len), $type);
     2637                $val = ReadMXFValue($et, substr($$dataPt, $pos, $len), $type);
    26402638                push @strongRef, (ref $val ? @$val : $val) if $type =~ /^StrongReference/;
    26412639                # remember instance UID of this local set
     
    26512649        }
    26522650        # get tagInfo ref the standard way to handle Unknown tags
    2653         $tagInfo = $langInfo || $exifTool->GetTagInfo($tagTablePtr, $tag);
     2651        $tagInfo = $langInfo || $et->GetTagInfo($tagTablePtr, $tag);
    26542652        # set Binary flag to extract all unknown-format tags as Binary data
    26552653        if ($tagInfo and $$tagInfo{Unknown} and not defined $$tagInfo{Binary}) {
    26562654            $$tagInfo{Binary} = not ($$tagInfo{Format} or ($type and $knownType{$type}));
    26572655        }
    2658         my $key = $exifTool->HandleTag($tagTablePtr, $tag, $val,
     2656        my $key = $et->HandleTag($tagTablePtr, $tag, $val,
    26592657            Extra       => $extra,
    26602658            TagInfo     => $tagInfo,
     
    26702668        # (necessary because we don't have all the information we need
    26712669        #  to do this on the fly when the file is parsed linearly)
    2672         push @groups, $$exifTool{TAG_EXTRA}{$key};
     2670        push @groups, $$et{TAG_EXTRA}{$key};
    26732671        next unless $tagInfo;
    26742672        my $name = $$tagInfo{Name};
     
    26762674            $$mxfInfo{FixDuration}{$key} = 1;
    26772675        } elsif ($$tagInfo{LanguageCode}) {
    2678             $langCode = $$exifTool{VALUE}{$key};
     2676            $langCode = $$et{VALUE}{$key};
    26792677        } elsif ($name eq 'EditRate') {
    2680             $editRate = $$exifTool{VALUE}{$key};
     2678            $editRate = $$et{VALUE}{$key};
    26812679        } elsif ($name =~ /TrackID$/) {
    2682             $trackID = $$exifTool{VALUE}{$key};
     2680            $trackID = $$et{VALUE}{$key};
    26832681            unless ($$mxfInfo{Group1}{$trackID}) {
    26842682                # save lookup to convert TrackID to our group 1 name
     
    27022700        # save instance UID's in groups hash (used to remove duplicates later)
    27032701        $$_{UID} = $instance foreach @groups;
    2704         $$objInfo{Name} = $$exifTool{DIR_NAME};
     2702        $$objInfo{Name} = $$et{DIR_NAME};
    27052703        $$objInfo{TrackID} = $trackID if defined $trackID;
    27062704        $$objInfo{EditRate} = $editRate if $editRate;
     
    27202718        }
    27212719        # save instance UID's of Preface's
    2722         push @{$$mxfInfo{Preface}}, $instance if $$exifTool{DIR_NAME} eq 'Preface';
     2720        push @{$$mxfInfo{Preface}}, $instance if $$et{DIR_NAME} eq 'Preface';
    27232721    }
    27242722    return 1;
     
    27852783sub ConvertDurations($$)
    27862784{
    2787     my ($exifTool, $mxfInfo) = @_;
    2788     my $valueHash = $$exifTool{VALUE};
    2789     my $infoHash = $$exifTool{TAG_INFO};
    2790     my $tagExtra = $$exifTool{TAG_EXTRA};
     2785    my ($et, $mxfInfo) = @_;
     2786    my $valueHash = $$et{VALUE};
     2787    my $infoHash = $$et{TAG_INFO};
     2788    my $tagExtra = $$et{TAG_EXTRA};
    27912789    my $editHash = $$mxfInfo{EditRate};
    27922790    my ($tag, $key, $i);
     
    28092807sub ProcessMXF($$)
    28102808{
    2811     my ($exifTool, $dirInfo) = @_;
     2809    my ($et, $dirInfo) = @_;
    28122810    my $raf = $$dirInfo{RAF};
    2813     my $verbose = $exifTool->Options('Verbose');
    2814     my $unknown = $exifTool->Options('Unknown');
     2811    my $verbose = $et->Options('Verbose');
     2812    my $unknown = $et->Options('Unknown');
    28152813    my ($buff, $preface, $n, $headerEnd, $footerPos);
    28162814
     
    28202818    my $start = pos($buff) - 11;
    28212819
    2822     $exifTool->SetFileType();
     2820    $et->SetFileType();
    28232821    SetByteOrder('MM');
    2824     $raf->Seek($start, 0) or $exifTool->Warn('Seek error'), return 1;
     2822    $raf->Seek($start, 0) or $et->Warn('Seek error'), return 1;
    28252823    my $tagTablePtr = GetTagTable('Image::ExifTool::MXF::Main');
    28262824
     
    28352833        Preface => [ ],     # instance UID's for all Preface objects
    28362834    );
    2837     $$exifTool{MXFInfo} = \%mxfInfo;
     2835    $$et{MXFInfo} = \%mxfInfo;
    28382836
    28392837    # set group 1 name for all tags (so we can overwrite with track number later)
    2840     $$exifTool{SET_GROUP1} = 'MXF';
     2838    $$et{SET_GROUP1} = 'MXF';
    28412839
    28422840    for (;;) {
     
    28492847            # skip directly to footer if possible
    28502848            if ($footerPos and $footerPos > $pos and (not $verbose or not $unknown)) {
    2851                 $exifTool->VPrint(0, "[Skipping to footer. Use Unknown option to parse body partitions]\n");
     2849                $et->VPrint(0, "[Skipping to footer. Use Unknown option to parse body partitions]\n");
    28522850                $raf->Seek($footerPos, 0) or last;
    28532851                $pos = $footerPos;
     
    28782876            my $name = $1 eq '0d' ? 'UserOrganizationPublicUse' : 'Experimental';
    28792877            $tagInfo = { Name => $name, %localSet };
    2880             Image::ExifTool::AddTagToTable($tagTablePtr, $ul, $tagInfo);
     2878            AddTagToTable($tagTablePtr, $ul, $tagInfo);
    28812879        }
    28822880        my ($val, $dataPt);
     
    28952893            $dataPt = \$buff;
    28962894            my $type = $$tagInfo{Type};
    2897             $val = ReadMXFValue($exifTool, $buff, $type) if $type and $knownType{$type};
     2895            $val = ReadMXFValue($et, $buff, $type) if $type and $knownType{$type};
    28982896        } elsif (($tagInfo and (not $$tagInfo{Unknown} or $unknown)) or $verbose) {
    28992897            if ($tagInfo) {
     
    29222920            next;
    29232921        }
    2924         $exifTool->HandleTag($tagTablePtr, $ul, $val,
     2922        $et->HandleTag($tagTablePtr, $ul, $val,
    29252923            TagInfo     => $tagInfo,
    29262924            DataPt      => $dataPt,
     
    29322930    # walk entire MXF object tree to fix family 1 group names
    29332931    my ($pathInfo, $tag, %did, %bestDur);
    2934     $pathInfo = { Path => [ 'MXF' ] } if $exifTool->Options('SavePath');
     2932    $pathInfo = { Path => [ 'MXF' ] } if $et->Options('SavePath');
    29352933    foreach $preface (@{$mxfInfo{Preface}}) {
    29362934        SetGroups(\%mxfInfo, $preface, $pathInfo);
    29372935    }
    29382936    # convert Duration values to seconds based on the appropriate EditRate
    2939     ConvertDurations($exifTool, \%mxfInfo);
     2937    ConvertDurations($et, \%mxfInfo);
    29402938
    29412939    # remove tags to keep only the one from the most recent instance of the object
    2942     my $tagExtra = $$exifTool{TAG_EXTRA};
    2943     my $fileOrder = $$exifTool{FILE_ORDER};
     2940    my $tagExtra = $$et{TAG_EXTRA};
     2941    my $fileOrder = $$et{FILE_ORDER};
    29442942    # also determine our best Duration value
    29452943    if ($mxfInfo{BestDuration}) {
     
    29542952        my $utag = "$1 $instance";      # instance-specific tag name
    29552953        if ($did{$utag}) {
    2956             Image::ExifTool::DeleteTag($exifTool, $tag); # delete the duplicate
     2954            Image::ExifTool::DeleteTag($et, $tag); # delete the duplicate
    29572955        } else {
    29582956            $did{$utag} = 1;
    29592957            if ($bestDur{$utag}) {
    29602958                # save best duration value
    2961                 my $val = $$exifTool{VALUE}{$tag};
    2962                 $exifTool->HandleTag($tagTablePtr, '060e2b34.0101.0102.07020201.01030000', $val);
     2959                my $val = $$et{VALUE}{$tag};
     2960                $et->HandleTag($tagTablePtr, '060e2b34.0101.0102.07020201.01030000', $val);
    29632961            }
    29642962        }
     
    29662964
    29672965    # clean up and return
    2968     delete $$exifTool{SET_GROUP1};
    2969     delete $$exifTool{MXFInfo};
     2966    delete $$et{SET_GROUP1};
     2967    delete $$et{MXFInfo};
    29702968    return 1;
    29712969}
     
    29902988=head1 AUTHOR
    29912989
    2992 Copyright 2003-2011, Phil Harvey (phil at owl.phy.queensu.ca)
     2990Copyright 2003-2021, Phil Harvey (philharvey66 at gmail.com)
    29932991
    29942992This library is free software; you can redistribute it and/or modify it
Note: See TracChangeset for help on using the changeset viewer.