Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

Torrent.pm@ 34921

Last change on this file since 34921 was 34921, checked in by anupama, 3 years ago

Committing the improvements to EmbeddedMetaPlugin's processing of Keywords vs other metadata fields. Keywords were literally stored as arrays of words rather than phrases in PDFs (at least in Diego's sample PDF), whereas other meta fields like Subjects and Creators stored them as arrays of phrases. To get both to work, Kathy updated EXIF to a newer version, to retrieve the actual EXIF values stored in the PDF. And Kathy and Dr Bainbridge came up with a new option that I added called apply_join_before_split_to_metafields that's a regex which can list the metadata fields to apply the join_before_split to and whcih previously always got applied to all metadata fields. Now it's applied to any *Keywords metafields by default, as that's the metafield we have experience of that behaves differently to the others, as it stores by word instead of phrases. Tested on Diego's sample PDF. Diego has double-checked it to works on his sample PDF too, setting the split char to ; and turning on the join_before_split and leaving apply_join_before_split_to_metafields at its default of .*Keywords. File changes are strings.properties for the tooltip, the plugin introducing the option and working with it and Kathy's EXIF updates affecting cpan/File and cpan/Image.

File size: 11.3 KB

Line
1	#------------------------------------------------------------------------------
2	# File: Torrent.pm
3	#
4	# Description: Read information from BitTorrent file
5	#
6	# Revisions: 2013/08/27 - P. Harvey Created
7	#
8	# References: 1) https://wiki.theory.org/BitTorrentSpecification
9	#------------------------------------------------------------------------------
10
11	package Image::ExifTool::Torrent;
12
13	use strict;
14	use vars qw($VERSION);
15	use Image::ExifTool qw(:DataAccess :Utils);
16
17	$VERSION = '1.04';
18
19	sub ReadBencode($$);
20	sub ExtractTags($$$;$$@);
21
22	# tags extracted from BitTorrent files
23	%Image::ExifTool::Torrent::Main = (
24	GROUPS => { 2 => 'Document' },
25	NOTES => q{
26	Below are tags commonly found in BitTorrent files. As well as these tags,
27	any other existing tags will be extracted. For convenience, list items are
28	expanded into individual tags with an index in the tag name, but only the
29	tags with index "1" are listed in the tables below. See
30	L<https://wiki.theory.org/BitTorrentSpecification> for the BitTorrent
31	specification.
32	},
33	'announce' => { },
34	'announce-list' => { Name => 'AnnounceList1' },
35	'comment' => { },
36	'created by' => { Name => 'Creator' }, # software used to create the torrent
37	'creation date' => {
38	Name => 'CreateDate',
39	Groups => { 2 => 'Time' },
40	ValueConv => 'ConvertUnixTime($val,1)',
41	PrintConv => '$self->ConvertDateTime($val)',
42	},
43	'encoding' => { },
44	'info' => {
45	SubDirectory => { TagTable => 'Image::ExifTool::Torrent::Info' },
46	Notes => 'extracted as a structure with the Struct option',
47	},
48	'url-list' => { Name => 'URLList1' },
49	);
50
51	%Image::ExifTool::Torrent::Info = (
52	GROUPS => { 2 => 'Document' },
53	'file-duration' => { Name => 'File1Duration' },
54	'file-media' => { Name => 'File1Media' },
55	'files' => { SubDirectory => { TagTable => 'Image::ExifTool::Torrent::Files' } },
56	'length' => { },
57	'md5sum' => { Name => 'MD5Sum' },
58	'name' => { },
59	'name.utf-8' => { Name => 'NameUTF-8' },
60	'piece length' => { Name => 'PieceLength' },
61	'pieces' => {
62	Name => 'Pieces',
63	Notes => 'concatenation of 20-byte SHA-1 digests for each piece',
64	},
65	'private' => { },
66	'profiles' => { SubDirectory => { TagTable => 'Image::ExifTool::Torrent::Profiles' } },
67	);
68
69	%Image::ExifTool::Torrent::Profiles = (
70	GROUPS => { 2 => 'Document' },
71	'width' => { Name => 'Profile1Width' },
72	'height' => { Name => 'Profile1Height' },
73	'acodec' => { Name => 'Profile1AudioCodec' },
74	'vcodec' => { Name => 'Profile1VideoCodec' },
75	);
76
77	%Image::ExifTool::Torrent::Files = (
78	GROUPS => { 2 => 'Document' },
79	'length' => { Name => 'File1Length', PrintConv => 'ConvertFileSize($val)' },
80	'md5sum' => { Name => 'File1MD5Sum' },
81	'path' => { Name => 'File1Path', JoinPath => 1 },
82	'path.utf-8' => { Name => 'File1PathUTF-8', JoinPath => 1 },
83	);
84
85	#------------------------------------------------------------------------------
86	# Read 64kB more data into buffer
87	# Inputs: 0) RAF ref, 1) buffer ref
88	# Returns: number of bytes read
89	# Notes: Sets BencodeEOF element of RAF on end of file
90	sub ReadMore($$)
91	{
92	my ($raf, $dataPt) = @_;
93	my $buf2;
94	my $n = $raf->Read($buf2, 65536);
95	$$raf{BencodeEOF} = 1 if $n != 65536;
96	$$dataPt = substr($$dataPt, pos($$dataPt)) . $buf2 if $n;
97	return $n;
98	}
99
100	#------------------------------------------------------------------------------
101	# Read bencoded value
102	# Inputs: 0) input file, 1) buffer (pos must be set to current position)
103	# Returns: HASH ref, ARRAY ref, SCALAR ref, SCALAR, or undef on error or end of data
104	# Notes: Sets BencodeError element of RAF on any error
105	sub ReadBencode($$)
106	{
107	my ($raf, $dataPt) = @_;
108
109	# read more if necessary (keep a minimum of 64 bytes in the buffer)
110	my $pos = pos($$dataPt);
111	return undef unless defined $pos;
112	my $remaining = length($$dataPt) - $pos;
113	ReadMore($raf, $dataPt) if $remaining < 64 and not $$raf{BencodeEOF};
114
115	# read next token
116	$$dataPt =~ /(.)/sg or return undef;
117
118	my $val;
119	my $tok = $1;
120	if ($tok eq 'i') { # integer
121	$$dataPt =~ /\G(-?\d+)e/g or return $val;
122	$val = $1;
123	} elsif ($tok eq 'd') { # dictionary
124	$val = { };
125	for (;;) {
126	my $k = ReadBencode($raf, $dataPt);
127	last unless defined $k;
128	# the key must be a byte string
129	if (ref $k) {
130	ref $k ne 'SCALAR' and $$raf{BencodeError} = 'Bad dictionary key', last;
131	$k = $$k;
132	}
133	my $v = ReadBencode($raf, $dataPt);
134	last unless defined $v;
135	$$val{$k} = $v;
136	}
137	} elsif ($tok eq 'l') { # list
138	$val = [ ];
139	for (;;) {
140	my $v = ReadBencode($raf, $dataPt);
141	last unless defined $v;
142	push @$val, $v;
143	}
144	} elsif ($tok eq 'e') { # end of dictionary or list
145	# return undef (no error)
146	} elsif ($tok =~ /^\d$/ and $$dataPt =~ /\G(\d*):/g) { # byte string
147	my $len = $tok . $1;
148	my $more = $len - (length($$dataPt) - pos($$dataPt));
149	my $value;
150	if ($more <= 0) {
151	$value = substr($$dataPt,pos($$dataPt),$len);
152	pos($$dataPt) = pos($$dataPt) + $len;
153	} elsif ($more > 10000000) {
154	# just skip over really long values
155	$val = \ "(Binary data $len bytes)" if $raf->Seek($more, 1);
156	} else {
157	# need to read more from file
158	my $buff;
159	my $n = $raf->Read($buff, $more);
160	if ($n == $more) {
161	$value = substr($$dataPt,pos($$dataPt)) . $buff;
162	$$dataPt = '';
163	pos($$dataPt) = 0;
164	}
165	}
166	if (defined $value) {
167	# return as binary data unless it is a reasonable-length ASCII string
168	if (length($value) > 256 or $value =~ /[^\t\x20-\x7e]/) {
169	$val = \$value;
170	} else {
171	$val = $value;
172	}
173	} elsif (not defined $val) {
174	$$raf{BencodeError} = 'Truncated byte string';
175	}
176	} else {
177	$$raf{BencodeError} = 'Bad format';
178	}
179	return $val;
180	}
181
182	#------------------------------------------------------------------------------
183	# Extract tags from dictionary hash
184	# Inputs: 0) ExifTool ref, 1) dictionary hash reference, 2) tag table ref,
185	# 3) parent hash ID, 4) parent hash name, 5-N) list indices
186	# Returns: number of tags extracted
187	sub ExtractTags($$$;$$@)
188	{
189	my ($et, $hashPtr, $tagTablePtr, $baseID, $baseName, @index) = @_;
190	my $count = 0;
191	my $tag;
192	foreach $tag (sort keys %$hashPtr) {
193	my $val = $$hashPtr{$tag};
194	my ($i, $j, @more);
195	for (; defined $val; $val = shift @more) {
196	my $id = defined $baseID ? "$baseID/$tag" : $tag;
197	unless ($$tagTablePtr{$id}) {
198	my $name = ucfirst $tag;
199	# capitalize all words in tag name and remove illegal characters
200	$name =~ s/[^-_a-zA-Z0-9]+(.?)/\U$1/g;
201	$name = "Tag$name" if length($name) < 2 or $name !~ /^[A-Z]/;
202	$name = $baseName . $name if defined $baseName; # add base name if necessary
203	AddTagToTable($tagTablePtr, $id, { Name => $name });
204	$et->VPrint(0, " [adding $id '${name}']\n");
205	}
206	my $tagInfo = $et->GetTagInfo($tagTablePtr, $id) or next;
207	if (ref $val eq 'ARRAY') {
208	if ($$tagInfo{JoinPath}) {
209	$val = join '/', @$val;
210	} else {
211	push @more, @$val;
212	next if ref $more[0] eq 'ARRAY'; # continue expanding nested lists
213	$val = shift @more;
214	$i or $i = 0, push(@index, $i);
215	}
216	}
217	$index[-1] = ++$i if defined $i;
218	if (@index) {
219	$id .= join '_', @index; # add instance number(s) to tag ID
220	unless ($$tagTablePtr{$id}) {
221	my $name = $$tagInfo{Name};
222	# embed indices at position of '1' in tag name
223	my $n = ($name =~ tr/1/#/);
224	for ($j=0; $j<$n; ++$j) {
225	my $idx = $index[$j] \|\| '';
226	$name =~ s/#/$idx/;
227	}
228	# put remaining indices at end of tag name
229	for (; $j<@index; ++$j) {
230	$name .= '_' if $name =~ /\d$/;
231	$name .= $index[$j];
232	}
233	AddTagToTable($tagTablePtr, $id, { %$tagInfo, Name => $name });
234	}
235	$tagInfo = $et->GetTagInfo($tagTablePtr, $id) or next;
236	}
237	if (ref $val eq 'HASH') {
238	if ($et->Options('Struct') and $tagInfo and $$tagInfo{Name} eq 'Info') {
239	$et->FoundTag($tagInfo, $val);
240	++$count;
241	next;
242	}
243	# extract tags from this dictionary
244	my ($table, $rootID, $rootName);
245	if ($$tagInfo{SubDirectory}) {
246	$table = GetTagTable($$tagInfo{SubDirectory}{TagTable});
247	} else {
248	$table = $tagTablePtr;
249	# use hash ID and Name as base for contained tags to avoid conflicts
250	$rootID = $id;
251	$rootName = $$tagInfo{Name};
252	}
253	$count += ExtractTags($et, $val, $table, $rootID, $rootName, @index);
254	} else {
255	# handle this simple tag value
256	$et->HandleTag($tagTablePtr, $id, $val);
257	++$count;
258	}
259	}
260	pop @index if defined $i;
261	}
262	return $count;
263	}
264
265	#------------------------------------------------------------------------------
266	# Process BitTorrent file
267	# Inputs: 0) ExifTool object reference, 1) dirInfo reference (with RAF set)
268	# Returns: 1 on success, 0 if this wasn't a valid BitTorrent file
269	sub ProcessTorrent($$)
270	{
271	my ($et, $dirInfo) = @_;
272	my $success = 0;
273	my $raf = $$dirInfo{RAF};
274	my $buff = '';
275	pos($buff) = 0;
276	my $dict = ReadBencode($raf, \$buff);
277	my $err = $$raf{BencodeError};
278	$et->Warn("Bencode error: $err") if $err;
279	if (ref $dict eq 'HASH' and ($$dict{announce} or $$dict{'created by'})) {
280	$et->SetFileType();
281	my $tagTablePtr = GetTagTable('Image::ExifTool::Torrent::Main');
282	ExtractTags($et, $dict, $tagTablePtr) and $success = 1;
283	}
284	return $success;
285	}
286
287	1; # end
288
289	__END__
290
291	=head1 NAME
292
293	Image::ExifTool::Torrent - Read information from BitTorrent file
294
295	=head1 SYNOPSIS
296
297	This module is used by Image::ExifTool
298
299	=head1 DESCRIPTION
300
301	This module contains definitions required by Image::ExifTool to read
302	bencoded information from BitTorrent files.
303
304	=head1 AUTHOR
305
306	Copyright 2003-2021, Phil Harvey (philharvey66 at gmail.com)
307
308	This library is free software; you can redistribute it and/or modify it
309	under the same terms as Perl itself.
310
311	=head1 REFERENCES
312
313	=over 4
314
315	=item L<https://wiki.theory.org/BitTorrentSpecification>
316
317	=back
318
319	=head1 SEE ALSO
320
321	L<Image::ExifTool::TagNames/Torrent Tags>,
322	L<Image::ExifTool(3pm)\|Image::ExifTool>
323
324	=cut
325

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone2/perllib/cpan/Image/ExifTool/Torrent.pm@ 34921

Download in other formats: