source: gs2-extensions/parallel-building/trunk/src/bin/script/importsubsetinator.pl@ 30354

Last change on this file since 30354 was 29103, checked in by jmt12, 10 years ago

updated - not any more efficient (Schlemiel the painter performance) but at least there is a little more feedback

  • Property svn:executable set to *
File size: 2.1 KB
Line 
1#!/usr/bin/perl
2
3# Given a large Greenstone import directory, create a random subset of that
4# import collection with a specific document count. Uses symlinking so won't
5# work well under windows.
6# jmt12
7
8use strict;
9use warnings;
10
# Cwd is needed to turn the import directory into an absolute path (see below)
use Cwd qw(abs_path);

# Validate the command line: we need an existing import directory and a
# whole-number document count.
if (!defined $ARGV[0] || !-d $ARGV[0] || !defined $ARGV[1] || $ARGV[1] !~ /^\d+$/)
{
  print "usage: importsubsetinator.pl <import directory> <max number of documents>\n";
  exit(0);
}

# A symlink's target is resolved relative to the directory containing the
# link, so linking to a relative import path would create dangling links.
# Always work with the absolute path of the import directory.
my $import_dir = abs_path($ARGV[0]);
if (!defined $import_dir)
{
  die("Failed to resolve import directory: " . $ARGV[0] . "\n");
}
my $max_docs = $ARGV[1];

# The subset is built in ./import-<max docs>, which may survive from an
# earlier run.
my $subset_dir = 'import-' . $max_docs;
if (!-d $subset_dir && !mkdir($subset_dir, 0755))
{
  die("Failed to create subset directory " . $subset_dir . ": $!\n");
}

# Unbuffer STDOUT so the progress dots appear as the work is done
$| = 1;

# 1. While we haven't reached our target
# NOTE: documents are drawn with replacement and only new ones are counted, so
# this loop cannot finish if the import directory holds fewer than the
# requested number of documents.
print "Processing";
my $current_docs = 0;
while ($current_docs < $max_docs)
{
  # 2. Find a random document
  my $path = pickRandomDoc($import_dir);
  my $path_suffix = substr($path, length($import_dir) + 1);
  # 3. Check we don't have it already. -l is tested as well as -e because -e
  #    follows symlinks and so reports false for a dangling link.
  my $target_path = './' . $subset_dir . '/' . $path_suffix;
  if (-e $target_path || -l $target_path)
  {
    next;
  }
  # 4. Symlink it into the subset directory. The builtin is used rather than
  #    shelling out to ln so that spaces and shell metacharacters in file
  #    names are harmless, and so that failures are noticed.
  recursiveMkdir($subset_dir, $target_path);
  if (!symlink($path, $target_path))
  {
    die("\nFailed to symlink " . $path . " to " . $target_path . ": $!\n");
  }
  print ".";
  # 5. Repeat until complete
  $current_docs++;
  if ($current_docs % 10000 == 0)
  {
    print '[' . $current_docs . "]\n";
  }
}
print '[' . $current_docs . "]\n";
print "Complete!\n";
exit;
52
# Pick a random document from somewhere beneath the given directory.
#
# A random entry of the directory is chosen (entries starting with a fullstop
# are ignored); directories are descended into until something that is not a
# directory is found. Subdirectories that turn out to contain no documents
# are skipped and another entry is tried.
#
# @param  $dir the directory to pick from
# @return the path to the chosen document, built as $dir . '/' . <entries...>
# Dies if a directory cannot be opened, or if there are no documents anywhere
# beneath $dir.
sub pickRandomDoc
{
  my ($dir) = @_;

  my $path = _pickRandomDocBelow($dir);
  if (!defined $path)
  {
    die("No documents found beneath " . $dir . "\n");
  }
  return $path;
}

# Recursive worker for pickRandomDoc: returns the path to a random document
# beneath $dir, or nothing (undef) if the directory tree holds no documents.
sub _pickRandomDocBelow
{
  my ($dir) = @_;

  # a lexical handle is used so each level of recursion has its own
  my $dh;
  if (!opendir($dh, $dir))
  {
    die("Failed to open import directory for reading! (" . $dir . ": $!)\n");
  }
  # get the files in this dir, but skip anything starting with a fullstop
  my @entries = grep {!/^\./} readdir($dh);
  closedir($dh);

  # draw entries at random, without replacement, until one yields a document
  while (@entries)
  {
    my ($entry) = splice(@entries, int(rand(scalar(@entries))), 1);
    my $path = $dir . '/' . $entry;
    # anything that is not a directory is treated as a document
    if (!-d $path)
    {
      return $path;
    }
    # descend into directories; an empty result means try another entry
    my $doc = _pickRandomDocBelow($path);
    if (defined $doc)
    {
      return $doc;
    }
  }
  # nothing usable in this directory
  return;
}
77
# Create, beneath the subset directory, every directory needed to hold the
# given target file.
#
# The part of $full_path that follows the subset directory (or, failing that,
# any 'import-<number>' path component) is taken as the location of the file
# relative to $subset_dir. Each missing directory along the way is created
# with mode 0755. The final path component is the file itself and is never
# created. Works for any file name, not only those ending in .txt.
#
# @param  $subset_dir the existing directory under which to create directories
# @param  $full_path  the path of the file that is about to be created
# Dies if a directory cannot be created.
sub recursiveMkdir
{
  my ($subset_dir, $full_path) = @_;
  # extract just the juicy part of the path; do nothing if we can't find it
  if ($full_path !~ m{(?:\Q$subset_dir\E|import-\d+)/(.+)})
  {
    return;
  }
  my $below_subset = $1;
  # ignore empty components produced by doubled slashes
  my @dir_parts = grep { length($_) } split(/\//, $below_subset);
  # the last component is the file name, not a directory
  pop(@dir_parts);

  my $test_path = $subset_dir;
  foreach my $dir (@dir_parts)
  {
    $test_path .= '/' . $dir;
    if (!-d $test_path && !mkdir($test_path, 0755))
    {
      die("Failed to create directory " . $test_path . ": $!\n");
    }
  }
  return;
}
97
981;
Note: See TracBrowser for help on using the repository browser.