[26923] | 1 | #!/usr/bin/perl
|
---|
| 2 |
|
---|
| 3 | # Given a large Greenstone import directory, create a random subset of that
|
---|
| 4 | # import collection with a specific document count. Uses symlinking so won't
|
---|
| 5 | # work well under windows.
|
---|
| 6 | # jmt12
|
---|
| 7 |
|
---|
| 8 | use strict;
|
---|
| 9 | use warnings;
|
---|
| 10 |
|
---|
| 11 | if (!defined $ARGV[0] || !-d $ARGV[0] || !defined $ARGV[1] || $ARGV[1] !~ /^\d+$/)
|
---|
| 12 | {
|
---|
| 13 | print "usage: importsubsetinator.pl <import directory> <max number of documents>\n";
|
---|
| 14 | exit(0);
|
---|
| 15 | }
|
---|
| 16 |
|
---|
| 17 | my $import_dir = $ARGV[0];
|
---|
| 18 | my $max_docs = $ARGV[1];
|
---|
| 19 |
|
---|
| 20 | my $subset_dir = 'import-' . $max_docs;
|
---|
| 21 | mkdir($subset_dir, 0755);
|
---|
| 22 |
|
---|
| 23 | # 1. While we haven't reached our target
|
---|
| 24 | print "Processing";
|
---|
| 25 | my $current_docs = 0;
|
---|
| 26 | while ($current_docs < $max_docs)
|
---|
| 27 | {
|
---|
| 28 | # 2. Find a random document
|
---|
| 29 | my $path = &pickRandomDoc($import_dir);
|
---|
| 30 | my $path_suffix = substr($path, length($import_dir) + 1);
|
---|
| 31 | # 3. Check we don't have it already
|
---|
| 32 | my $target_path = './' . $subset_dir . '/' . $path_suffix;
|
---|
| 33 | if (-f $target_path)
|
---|
| 34 | {
|
---|
| 35 | next;
|
---|
| 36 | }
|
---|
| 37 | # 4. Symlink it into the subset directory
|
---|
| 38 | &recursiveMkdir($subset_dir, $target_path);
|
---|
| 39 | my $cmd = "ln -s $path $target_path";
|
---|
| 40 | `$cmd`;
|
---|
| 41 | print ".";
|
---|
| 42 | # 5. Repeat until complete
|
---|
| 43 | $current_docs++;
|
---|
[29103] | 44 | if ($current_docs % 10000 == 0)
|
---|
[26923] | 45 | {
|
---|
[29103] | 46 | print '[' . $current_docs . "]\n";
|
---|
[26923] | 47 | }
|
---|
| 48 | }
|
---|
[29103] | 49 | print '[' . $current_docs . "]\n";
|
---|
[26923] | 50 | print "Complete!\n";
|
---|
| 51 | exit;
|
---|
| 52 |
|
---|
| 53 | sub pickRandomDoc
|
---|
| 54 | {
|
---|
| 55 | my ($dir) = @_;
|
---|
| 56 |
|
---|
| 57 | if (!opendir(DH, $dir))
|
---|
| 58 | {
|
---|
| 59 | die ("Failed to open import directory for reading!\n");
|
---|
| 60 | }
|
---|
| 61 | # get the files in this dir, but skip anything starting with a fullstop
|
---|
| 62 | my @files = grep {!/^\./} readdir(DH);
|
---|
| 63 | my $file = @files[int(rand(scalar(@files)))];
|
---|
| 64 | # found a directory or a file
|
---|
| 65 | my $path = $dir . '/' . $file;
|
---|
| 66 | # descend into directories
|
---|
| 67 | if (-d $path)
|
---|
| 68 | {
|
---|
| 69 | return &pickRandomDoc($path);
|
---|
| 70 | }
|
---|
| 71 | # return the file
|
---|
| 72 | else
|
---|
| 73 | {
|
---|
| 74 | return $path;
|
---|
| 75 | }
|
---|
| 76 | }
|
---|
| 77 |
|
---|
| 78 | sub recursiveMkdir
|
---|
| 79 | {
|
---|
| 80 | my ($subset_dir, $full_path) = @_;
|
---|
| 81 | my $test_path = $subset_dir;
|
---|
| 82 | # extract just the juicy part of the path
|
---|
| 83 | if ($full_path =~ /import-\d+\/(.+)\/[^\/]+\.txt/)
|
---|
| 84 | {
|
---|
| 85 | my $dirs = $1;
|
---|
| 86 | my @dir_parts = split(/\//, $dirs);
|
---|
| 87 | foreach my $dir (@dir_parts)
|
---|
| 88 | {
|
---|
| 89 | $test_path .= '/' . $dir;
|
---|
| 90 | if (!-d $test_path)
|
---|
| 91 | {
|
---|
| 92 | mkdir($test_path, 0755);
|
---|
| 93 | }
|
---|
| 94 | }
|
---|
| 95 | }
|
---|
| 96 | }
|
---|
| 97 |
|
---|
| 98 | 1;
|
---|