#!/usr/bin/perl
# Given a large Greenstone import directory, create a random subset of that
# import collection with a specific document count. Uses symlinking so won't
# work well under windows.
# jmt12

use strict;
use warnings;

use Cwd qw(abs_path);
use File::Basename qw(dirname);
use File::Path qw(make_path);

# Stop after this many consecutive random picks that yielded nothing new
# (duplicate or empty directory). Without a limit the main loop never ends
# when more documents are requested than the import directory contains.
my $MAX_CONSECUTIVE_MISSES = 100_000;

if (!defined $ARGV[0] || !-d $ARGV[0] || !defined $ARGV[1] || $ARGV[1] !~ /^\d+$/) {
    print "usage: importsubsetinator.pl <import directory> <document count>\n";
    exit(0);
}

# Symlink targets are stored verbatim in the link, so a relative import path
# would dangle once the link lives in a different directory. Resolve it to an
# absolute path up front.
my $import_dir = abs_path($ARGV[0]);
if (!defined $import_dir) {
    die("Failed to resolve import directory: " . $ARGV[0] . "\n");
}
my $max_docs   = $ARGV[1];
my $subset_dir = 'import-' . $max_docs;
if (!-d $subset_dir && !mkdir($subset_dir, 0755)) {
    die("Failed to create subset directory " . $subset_dir . ": $!\n");
}

# Progress dots are printed without newlines; unbuffer STDOUT so they show up
# as the work happens.
$| = 1;

# 1. While we haven't reached our target
print "Processing";
my $current_docs       = 0;
my $consecutive_misses = 0;
while ($current_docs < $max_docs) {
    if ($consecutive_misses >= $MAX_CONSECUTIVE_MISSES) {
        print "\n";
        warn("Giving up after " . $current_docs . " of " . $max_docs
            . " documents: no new documents found in "
            . $MAX_CONSECUTIVE_MISSES . " consecutive attempts\n");
        last;
    }

    # 2. Find a random document (undefined when the walk hit an empty directory)
    my $path = pickRandomDoc($import_dir);
    if (!defined $path) {
        $consecutive_misses++;
        next;
    }
    my $path_suffix = substr($path, length($import_dir) + 1);

    # 3. Check we don't have it already. -l also catches an existing link whose
    #    target has gone missing, which -e alone reports as absent.
    my $target_path = './' . $subset_dir . '/' . $path_suffix;
    if (-e $target_path || -l $target_path) {
        $consecutive_misses++;
        next;
    }

    # 4. Symlink it into the subset directory. The symlink() built-in avoids
    #    the shell, so paths with spaces or metacharacters are safe.
    recursiveMkdir($subset_dir, $target_path);
    if (!symlink($path, $target_path)) {
        die("Failed to symlink " . $path . " to " . $target_path . ": $!\n");
    }
    print ".";

    # 5. Repeat until complete
    $consecutive_misses = 0;
    $current_docs++;
    # The 10 accounts for the width of the leading "Processing" so the output
    # wraps at 80 columns.
    if ((10 + $current_docs) % 80 == 0) {
        print "\n";
    }
}
if ((10 + $current_docs) % 80 != 0) {
    print "\n";
}
print "Complete!\n";
exit;

# Pick a random document beneath $dir by choosing a random non-hidden entry at
# each level and descending into directories until a non-directory is reached.
#
#   $dir - directory to start the random walk from
#
# Returns the path of the chosen file, or nothing (undef in scalar context)
# when the walk ends in a directory with no non-hidden entries.
# Dies if a directory cannot be opened for reading.
sub pickRandomDoc {
    my ($dir) = @_;

    # A lexical handle keeps each recursion level independent and lets us
    # close it before descending.
    opendir(my $dh, $dir)
        or die("Failed to open import directory for reading!\n");
    # get the files in this dir, but skip anything starting with a fullstop
    my @files = grep { !/^\./ } readdir($dh);
    closedir($dh);

    # An empty directory has nothing to pick; report that to the caller rather
    # than building "$dir/" and recursing on the same directory forever.
    if (!@files) {
        return;
    }

    my $file = $files[int(rand(scalar(@files)))];
    # found a directory or a file
    my $path = $dir . '/' . $file;
    # descend into directories
    if (-d $path) {
        return pickRandomDoc($path);
    }
    # return the file
    return $path;
}

# Ensure the directory that will contain $full_path exists, creating any
# missing intermediate directories with mode 0755.
#
#   $subset_dir - root of the subset collection (kept for interface
#                 compatibility; the parent is derived from $full_path)
#   $full_path  - path of the symlink about to be created
#
# Dies if the parent directory cannot be created.
sub recursiveMkdir {
    my ($subset_dir, $full_path) = @_;

    my $parent_dir = dirname($full_path);
    if (-d $parent_dir) {
        return;
    }
    make_path($parent_dir, { mode => 0755 });
    if (!-d $parent_dir) {
        die("Failed to create directory " . $parent_dir . "\n");
    }
    return;
}

1;