source: main/trunk/greenstone2/perllib/cpan/Image/ExifTool/Torrent.pm@ 34921

Last change on this file since 34921 was 34921, checked in by anupama, 3 years ago

Committing the improvements to EmbeddedMetaPlugin's processing of Keywords vs other metadata fields. Keywords were literally stored as arrays of words rather than phrases in PDFs (at least in Diego's sample PDF), whereas other meta fields like Subjects and Creators stored them as arrays of phrases. To get both to work, Kathy updated EXIF to a newer version, to retrieve the actual EXIF values stored in the PDF. And Kathy and Dr Bainbridge came up with a new option that I added called apply_join_before_split_to_metafields that's a regex which can list the metadata fields to apply the join_before_split to and whcih previously always got applied to all metadata fields. Now it's applied to any *Keywords metafields by default, as that's the metafield we have experience of that behaves differently to the others, as it stores by word instead of phrases. Tested on Diego's sample PDF. Diego has double-checked it to works on his sample PDF too, setting the split char to ; and turning on the join_before_split and leaving apply_join_before_split_to_metafields at its default of .*Keywords. File changes are strings.properties for the tooltip, the plugin introducing the option and working with it and Kathy's EXIF updates affecting cpan/File and cpan/Image.

File size: 11.3 KB
Line 
1#------------------------------------------------------------------------------
2# File: Torrent.pm
3#
4# Description: Read information from BitTorrent file
5#
6# Revisions: 2013/08/27 - P. Harvey Created
7#
8# References: 1) https://wiki.theory.org/BitTorrentSpecification
9#------------------------------------------------------------------------------
10
11package Image::ExifTool::Torrent;
12
13use strict;
14use vars qw($VERSION);
15use Image::ExifTool qw(:DataAccess :Utils);
16
17$VERSION = '1.04';
18
19sub ReadBencode($$);
20sub ExtractTags($$$;$$@);
21
22# tags extracted from BitTorrent files
23%Image::ExifTool::Torrent::Main = (
24 GROUPS => { 2 => 'Document' },
25 NOTES => q{
26 Below are tags commonly found in BitTorrent files. As well as these tags,
27 any other existing tags will be extracted. For convenience, list items are
28 expanded into individual tags with an index in the tag name, but only the
29 tags with index "1" are listed in the tables below. See
30 L<https://wiki.theory.org/BitTorrentSpecification> for the BitTorrent
31 specification.
32 },
33 'announce' => { },
34 'announce-list' => { Name => 'AnnounceList1' },
35 'comment' => { },
36 'created by' => { Name => 'Creator' }, # software used to create the torrent
37 'creation date' => {
38 Name => 'CreateDate',
39 Groups => { 2 => 'Time' },
40 ValueConv => 'ConvertUnixTime($val,1)',
41 PrintConv => '$self->ConvertDateTime($val)',
42 },
43 'encoding' => { },
44 'info' => {
45 SubDirectory => { TagTable => 'Image::ExifTool::Torrent::Info' },
46 Notes => 'extracted as a structure with the Struct option',
47 },
48 'url-list' => { Name => 'URLList1' },
49);
50
51%Image::ExifTool::Torrent::Info = (
52 GROUPS => { 2 => 'Document' },
53 'file-duration' => { Name => 'File1Duration' },
54 'file-media' => { Name => 'File1Media' },
55 'files' => { SubDirectory => { TagTable => 'Image::ExifTool::Torrent::Files' } },
56 'length' => { },
57 'md5sum' => { Name => 'MD5Sum' },
58 'name' => { },
59 'name.utf-8' => { Name => 'NameUTF-8' },
60 'piece length' => { Name => 'PieceLength' },
61 'pieces' => {
62 Name => 'Pieces',
63 Notes => 'concatenation of 20-byte SHA-1 digests for each piece',
64 },
65 'private' => { },
66 'profiles' => { SubDirectory => { TagTable => 'Image::ExifTool::Torrent::Profiles' } },
67);
68
69%Image::ExifTool::Torrent::Profiles = (
70 GROUPS => { 2 => 'Document' },
71 'width' => { Name => 'Profile1Width' },
72 'height' => { Name => 'Profile1Height' },
73 'acodec' => { Name => 'Profile1AudioCodec' },
74 'vcodec' => { Name => 'Profile1VideoCodec' },
75);
76
77%Image::ExifTool::Torrent::Files = (
78 GROUPS => { 2 => 'Document' },
79 'length' => { Name => 'File1Length', PrintConv => 'ConvertFileSize($val)' },
80 'md5sum' => { Name => 'File1MD5Sum' },
81 'path' => { Name => 'File1Path', JoinPath => 1 },
82 'path.utf-8' => { Name => 'File1PathUTF-8', JoinPath => 1 },
83);
84
85#------------------------------------------------------------------------------
86# Read 64kB more data into buffer
87# Inputs: 0) RAF ref, 1) buffer ref
88# Returns: number of bytes read
89# Notes: Sets BencodeEOF element of RAF on end of file
90sub ReadMore($$)
91{
92 my ($raf, $dataPt) = @_;
93 my $buf2;
94 my $n = $raf->Read($buf2, 65536);
95 $$raf{BencodeEOF} = 1 if $n != 65536;
96 $$dataPt = substr($$dataPt, pos($$dataPt)) . $buf2 if $n;
97 return $n;
98}
99
100#------------------------------------------------------------------------------
101# Read bencoded value
102# Inputs: 0) input file, 1) buffer (pos must be set to current position)
103# Returns: HASH ref, ARRAY ref, SCALAR ref, SCALAR, or undef on error or end of data
104# Notes: Sets BencodeError element of RAF on any error
105sub ReadBencode($$)
106{
107 my ($raf, $dataPt) = @_;
108
109 # read more if necessary (keep a minimum of 64 bytes in the buffer)
110 my $pos = pos($$dataPt);
111 return undef unless defined $pos;
112 my $remaining = length($$dataPt) - $pos;
113 ReadMore($raf, $dataPt) if $remaining < 64 and not $$raf{BencodeEOF};
114
115 # read next token
116 $$dataPt =~ /(.)/sg or return undef;
117
118 my $val;
119 my $tok = $1;
120 if ($tok eq 'i') { # integer
121 $$dataPt =~ /\G(-?\d+)e/g or return $val;
122 $val = $1;
123 } elsif ($tok eq 'd') { # dictionary
124 $val = { };
125 for (;;) {
126 my $k = ReadBencode($raf, $dataPt);
127 last unless defined $k;
128 # the key must be a byte string
129 if (ref $k) {
130 ref $k ne 'SCALAR' and $$raf{BencodeError} = 'Bad dictionary key', last;
131 $k = $$k;
132 }
133 my $v = ReadBencode($raf, $dataPt);
134 last unless defined $v;
135 $$val{$k} = $v;
136 }
137 } elsif ($tok eq 'l') { # list
138 $val = [ ];
139 for (;;) {
140 my $v = ReadBencode($raf, $dataPt);
141 last unless defined $v;
142 push @$val, $v;
143 }
144 } elsif ($tok eq 'e') { # end of dictionary or list
145 # return undef (no error)
146 } elsif ($tok =~ /^\d$/ and $$dataPt =~ /\G(\d*):/g) { # byte string
147 my $len = $tok . $1;
148 my $more = $len - (length($$dataPt) - pos($$dataPt));
149 my $value;
150 if ($more <= 0) {
151 $value = substr($$dataPt,pos($$dataPt),$len);
152 pos($$dataPt) = pos($$dataPt) + $len;
153 } elsif ($more > 10000000) {
154 # just skip over really long values
155 $val = \ "(Binary data $len bytes)" if $raf->Seek($more, 1);
156 } else {
157 # need to read more from file
158 my $buff;
159 my $n = $raf->Read($buff, $more);
160 if ($n == $more) {
161 $value = substr($$dataPt,pos($$dataPt)) . $buff;
162 $$dataPt = '';
163 pos($$dataPt) = 0;
164 }
165 }
166 if (defined $value) {
167 # return as binary data unless it is a reasonable-length ASCII string
168 if (length($value) > 256 or $value =~ /[^\t\x20-\x7e]/) {
169 $val = \$value;
170 } else {
171 $val = $value;
172 }
173 } elsif (not defined $val) {
174 $$raf{BencodeError} = 'Truncated byte string';
175 }
176 } else {
177 $$raf{BencodeError} = 'Bad format';
178 }
179 return $val;
180}
181
182#------------------------------------------------------------------------------
183# Extract tags from dictionary hash
184# Inputs: 0) ExifTool ref, 1) dictionary hash reference, 2) tag table ref,
185# 3) parent hash ID, 4) parent hash name, 5-N) list indices
186# Returns: number of tags extracted
187sub ExtractTags($$$;$$@)
188{
189 my ($et, $hashPtr, $tagTablePtr, $baseID, $baseName, @index) = @_;
190 my $count = 0;
191 my $tag;
192 foreach $tag (sort keys %$hashPtr) {
193 my $val = $$hashPtr{$tag};
194 my ($i, $j, @more);
195 for (; defined $val; $val = shift @more) {
196 my $id = defined $baseID ? "$baseID/$tag" : $tag;
197 unless ($$tagTablePtr{$id}) {
198 my $name = ucfirst $tag;
199 # capitalize all words in tag name and remove illegal characters
200 $name =~ s/[^-_a-zA-Z0-9]+(.?)/\U$1/g;
201 $name = "Tag$name" if length($name) < 2 or $name !~ /^[A-Z]/;
202 $name = $baseName . $name if defined $baseName; # add base name if necessary
203 AddTagToTable($tagTablePtr, $id, { Name => $name });
204 $et->VPrint(0, " [adding $id '${name}']\n");
205 }
206 my $tagInfo = $et->GetTagInfo($tagTablePtr, $id) or next;
207 if (ref $val eq 'ARRAY') {
208 if ($$tagInfo{JoinPath}) {
209 $val = join '/', @$val;
210 } else {
211 push @more, @$val;
212 next if ref $more[0] eq 'ARRAY'; # continue expanding nested lists
213 $val = shift @more;
214 $i or $i = 0, push(@index, $i);
215 }
216 }
217 $index[-1] = ++$i if defined $i;
218 if (@index) {
219 $id .= join '_', @index; # add instance number(s) to tag ID
220 unless ($$tagTablePtr{$id}) {
221 my $name = $$tagInfo{Name};
222 # embed indices at position of '1' in tag name
223 my $n = ($name =~ tr/1/#/);
224 for ($j=0; $j<$n; ++$j) {
225 my $idx = $index[$j] || '';
226 $name =~ s/#/$idx/;
227 }
228 # put remaining indices at end of tag name
229 for (; $j<@index; ++$j) {
230 $name .= '_' if $name =~ /\d$/;
231 $name .= $index[$j];
232 }
233 AddTagToTable($tagTablePtr, $id, { %$tagInfo, Name => $name });
234 }
235 $tagInfo = $et->GetTagInfo($tagTablePtr, $id) or next;
236 }
237 if (ref $val eq 'HASH') {
238 if ($et->Options('Struct') and $tagInfo and $$tagInfo{Name} eq 'Info') {
239 $et->FoundTag($tagInfo, $val);
240 ++$count;
241 next;
242 }
243 # extract tags from this dictionary
244 my ($table, $rootID, $rootName);
245 if ($$tagInfo{SubDirectory}) {
246 $table = GetTagTable($$tagInfo{SubDirectory}{TagTable});
247 } else {
248 $table = $tagTablePtr;
249 # use hash ID and Name as base for contained tags to avoid conflicts
250 $rootID = $id;
251 $rootName = $$tagInfo{Name};
252 }
253 $count += ExtractTags($et, $val, $table, $rootID, $rootName, @index);
254 } else {
255 # handle this simple tag value
256 $et->HandleTag($tagTablePtr, $id, $val);
257 ++$count;
258 }
259 }
260 pop @index if defined $i;
261 }
262 return $count;
263}
264
265#------------------------------------------------------------------------------
266# Process BitTorrent file
267# Inputs: 0) ExifTool object reference, 1) dirInfo reference (with RAF set)
268# Returns: 1 on success, 0 if this wasn't a valid BitTorrent file
269sub ProcessTorrent($$)
270{
271 my ($et, $dirInfo) = @_;
272 my $success = 0;
273 my $raf = $$dirInfo{RAF};
274 my $buff = '';
275 pos($buff) = 0;
276 my $dict = ReadBencode($raf, \$buff);
277 my $err = $$raf{BencodeError};
278 $et->Warn("Bencode error: $err") if $err;
279 if (ref $dict eq 'HASH' and ($$dict{announce} or $$dict{'created by'})) {
280 $et->SetFileType();
281 my $tagTablePtr = GetTagTable('Image::ExifTool::Torrent::Main');
282 ExtractTags($et, $dict, $tagTablePtr) and $success = 1;
283 }
284 return $success;
285}
286
2871; # end
288
289__END__
290
291=head1 NAME
292
293Image::ExifTool::Torrent - Read information from BitTorrent file
294
295=head1 SYNOPSIS
296
297This module is used by Image::ExifTool
298
299=head1 DESCRIPTION
300
301This module contains definitions required by Image::ExifTool to read
302bencoded information from BitTorrent files.
303
304=head1 AUTHOR
305
306Copyright 2003-2021, Phil Harvey (philharvey66 at gmail.com)
307
308This library is free software; you can redistribute it and/or modify it
309under the same terms as Perl itself.
310
311=head1 REFERENCES
312
313=over 4
314
315=item L<https://wiki.theory.org/BitTorrentSpecification>
316
317=back
318
319=head1 SEE ALSO
320
321L<Image::ExifTool::TagNames/Torrent Tags>,
322L<Image::ExifTool(3pm)|Image::ExifTool>
323
324=cut
325
Note: See TracBrowser for help on using the repository browser.