1 | #!/usr/bin/perl
|
---|
2 |
|
---|
3 | # Given a large Greenstone import directory, create a random subset of that
|
---|
4 | # import collection with a specific document count. Uses symlinking so won't
|
---|
5 | # work well under windows.
|
---|
6 | # jmt12
|
---|
7 |
|
---|
8 | use strict;
|
---|
9 | use warnings;
|
---|
10 |
|
---|
11 | if (!defined $ARGV[0] || !-d $ARGV[0] || !defined $ARGV[1] || $ARGV[1] !~ /^\d+$/)
|
---|
12 | {
|
---|
13 | print "usage: importsubsetinator.pl <import directory> <max number of documents>\n";
|
---|
14 | exit(0);
|
---|
15 | }
|
---|
16 |
|
---|
17 | my $import_dir = $ARGV[0];
|
---|
18 | my $max_docs = $ARGV[1];
|
---|
19 |
|
---|
20 | my $subset_dir = 'import-' . $max_docs;
|
---|
21 | mkdir($subset_dir, 0755);
|
---|
22 |
|
---|
23 | # 1. While we haven't reached our target
|
---|
24 | print "Processing";
|
---|
25 | my $current_docs = 0;
|
---|
26 | while ($current_docs < $max_docs)
|
---|
27 | {
|
---|
28 | # 2. Find a random document
|
---|
29 | my $path = &pickRandomDoc($import_dir);
|
---|
30 | my $path_suffix = substr($path, length($import_dir) + 1);
|
---|
31 | # 3. Check we don't have it already
|
---|
32 | my $target_path = './' . $subset_dir . '/' . $path_suffix;
|
---|
33 | if (-f $target_path)
|
---|
34 | {
|
---|
35 | next;
|
---|
36 | }
|
---|
37 | # 4. Symlink it into the subset directory
|
---|
38 | &recursiveMkdir($subset_dir, $target_path);
|
---|
39 | my $cmd = "ln -s $path $target_path";
|
---|
40 | `$cmd`;
|
---|
41 | print ".";
|
---|
42 | # 5. Repeat until complete
|
---|
43 | $current_docs++;
|
---|
44 | if ((10 + $current_docs) % 80 == 0)
|
---|
45 | {
|
---|
46 | print "\n";
|
---|
47 | }
|
---|
48 | }
|
---|
49 | if ((10 + $current_docs) % 80 != 0)
|
---|
50 | {
|
---|
51 | print "\n";
|
---|
52 | }
|
---|
53 | print "Complete!\n";
|
---|
54 | exit;
|
---|
55 |
|
---|
56 | sub pickRandomDoc
|
---|
57 | {
|
---|
58 | my ($dir) = @_;
|
---|
59 |
|
---|
60 | if (!opendir(DH, $dir))
|
---|
61 | {
|
---|
62 | die ("Failed to open import directory for reading!\n");
|
---|
63 | }
|
---|
64 | # get the files in this dir, but skip anything starting with a fullstop
|
---|
65 | my @files = grep {!/^\./} readdir(DH);
|
---|
66 | my $file = @files[int(rand(scalar(@files)))];
|
---|
67 | # found a directory or a file
|
---|
68 | my $path = $dir . '/' . $file;
|
---|
69 | # descend into directories
|
---|
70 | if (-d $path)
|
---|
71 | {
|
---|
72 | return &pickRandomDoc($path);
|
---|
73 | }
|
---|
74 | # return the file
|
---|
75 | else
|
---|
76 | {
|
---|
77 | return $path;
|
---|
78 | }
|
---|
79 | }
|
---|
80 |
|
---|
81 | sub recursiveMkdir
|
---|
82 | {
|
---|
83 | my ($subset_dir, $full_path) = @_;
|
---|
84 | my $test_path = $subset_dir;
|
---|
85 | # extract just the juicy part of the path
|
---|
86 | if ($full_path =~ /import-\d+\/(.+)\/[^\/]+\.txt/)
|
---|
87 | {
|
---|
88 | my $dirs = $1;
|
---|
89 | my @dir_parts = split(/\//, $dirs);
|
---|
90 | foreach my $dir (@dir_parts)
|
---|
91 | {
|
---|
92 | $test_path .= '/' . $dir;
|
---|
93 | if (!-d $test_path)
|
---|
94 | {
|
---|
95 | mkdir($test_path, 0755);
|
---|
96 | }
|
---|
97 | }
|
---|
98 | }
|
---|
99 | }
|
---|
100 |
|
---|
101 | 1;
|
---|