source: gs2-extensions/parallel-building/trunk/src/bin/script/importsubsetinator.pl@ 26923

Last change on this file since 26923 was 26923, checked in by jmt12, 11 years ago

Generates a specficied-size subset of a larger import directory

  • Property svn:executable set to *
File size: 2.1 KB
Line 
1#!/usr/bin/perl
2
3# Given a large Greenstone import directory, create a random subset of that
4# import collection with a specific document count. Uses symlinking so won't
5# work well under windows.
6# jmt12
7
8use strict;
9use warnings;
10
11if (!defined $ARGV[0] || !-d $ARGV[0] || !defined $ARGV[1] || $ARGV[1] !~ /^\d+$/)
12{
13 print "usage: importsubsetinator.pl <import directory> <max number of documents>\n";
14 exit(0);
15}
16
17my $import_dir = $ARGV[0];
18my $max_docs = $ARGV[1];
19
20my $subset_dir = 'import-' . $max_docs;
21mkdir($subset_dir, 0755);
22
23# 1. While we haven't reached our target
24print "Processing";
25my $current_docs = 0;
26while ($current_docs < $max_docs)
27{
28 # 2. Find a random document
29 my $path = &pickRandomDoc($import_dir);
30 my $path_suffix = substr($path, length($import_dir) + 1);
31 # 3. Check we don't have it already
32 my $target_path = './' . $subset_dir . '/' . $path_suffix;
33 if (-f $target_path)
34 {
35 next;
36 }
37 # 4. Symlink it into the subset directory
38 &recursiveMkdir($subset_dir, $target_path);
39 my $cmd = "ln -s $path $target_path";
40 `$cmd`;
41 print ".";
42 # 5. Repeat until complete
43 $current_docs++;
44 if ((10 + $current_docs) % 80 == 0)
45 {
46 print "\n";
47 }
48}
49if ((10 + $current_docs) % 80 != 0)
50{
51 print "\n";
52}
53print "Complete!\n";
54exit;
55
56sub pickRandomDoc
57{
58 my ($dir) = @_;
59
60 if (!opendir(DH, $dir))
61 {
62 die ("Failed to open import directory for reading!\n");
63 }
64 # get the files in this dir, but skip anything starting with a fullstop
65 my @files = grep {!/^\./} readdir(DH);
66 my $file = @files[int(rand(scalar(@files)))];
67 # found a directory or a file
68 my $path = $dir . '/' . $file;
69 # descend into directories
70 if (-d $path)
71 {
72 return &pickRandomDoc($path);
73 }
74 # return the file
75 else
76 {
77 return $path;
78 }
79}
80
81sub recursiveMkdir
82{
83 my ($subset_dir, $full_path) = @_;
84 my $test_path = $subset_dir;
85 # extract just the juicy part of the path
86 if ($full_path =~ /import-\d+\/(.+)\/[^\/]+\.txt/)
87 {
88 my $dirs = $1;
89 my @dir_parts = split(/\//, $dirs);
90 foreach my $dir (@dir_parts)
91 {
92 $test_path .= '/' . $dir;
93 if (!-d $test_path)
94 {
95 mkdir($test_path, 0755);
96 }
97 }
98 }
99}
100
1011;
Note: See TracBrowser for help on using the repository browser.