source: main/trunk/greenstone2/bin/script/anonymiselog.pl@ 31888

Last change on this file since 31888 was 5072, checked in by mdewsnip, 21 years ago

A small Perl script that anonymises usage logs. Uses the MD5 algorithm to hash the IP addresses and Greenstone user identifiers. Handles the occasional stray newline that appears in some of the logs.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 3.4 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# anonymiselog.pl -- anonymise a log file by MD5 hashing all IP addresses
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29
30use MD5;
31
32
# main(LOGFILE)
#
# Reads the usage log LOGFILE and writes an anonymised copy to "LOGFILE.anon".
# An entry starts on a line whose first character is "/"; any following lines
# that do not start with "/" are treated as continuations of that entry (some
# logs contain stray newlines) and are joined onto it after trailing
# whitespace has been stripped.  Each completed entry is passed through
# anonymise_log_entry() and the result is written to the output file.
#
# Dies if the argument is missing or if either file cannot be opened/written.
sub main
{
    # Get the name of the log file to process
    my $logfilename = shift(@_);

    # Check that the necessary arguments were supplied.  Test for
    # undefined/empty rather than truth so a file literally named "0" works.
    if (!defined($logfilename) || $logfilename eq "") {
        print STDERR "Usage: anonymiselog.pl <logfile-name>\n";
        die "Error: Required argument missing.\n";
    }

    # Open the log file.  Three-argument open with a lexical handle, so that
    # characters such as ">" or "|" in the file name cannot change the mode.
    open(my $log_fh, "<", $logfilename)
        or die "Error: Could not open log file $logfilename.\n";

    # Open the output file
    my $outfilename = $logfilename . ".anon";
    open(my $out_fh, ">", $outfilename)
        or die "Error: Could not write file $outfilename.\n";

    # Create a new MD5 (RSA Data Security Inc. MD5 Message Digest) object.
    # This is deliberately a package variable set with "local" (not "my"), so
    # that subroutines called from here can see it through dynamic scoping.
    local $md5 = MD5->new();

    # Process the log, one line at a time
    my $entry = "";
    while (my $line = <$log_fh>) {
        # If this line starts a new entry, process the previous one
        if ($line =~ /^\//) {
            print {$out_fh} anonymise_log_entry($entry);
            $entry = "";
        }

        # Remove trailing whitespace (including the newline), and skip
        # lines that are blank once that has been done
        $line =~ s/\s+$//;
        next if ($line eq "");

        $entry .= $line;
    }

    # Process the last entry
    print {$out_fh} anonymise_log_entry($entry);

    # All done.  Buffered write errors only surface when the output handle
    # is closed, so check that close explicitly.
    close($log_fh);
    close($out_fh)
        or die "Error: Could not finish writing file $outfilename: $!\n";
}
80
81
# anonymise_log_entry(ENTRY)
#
# Takes one complete log entry (a single string without a trailing newline)
# and returns it with the IP address / host name field and the Greenstone
# user identifier (the value of the "z" variable) replaced by the last 16
# hex characters of the MD5 hash of their lower-cased values.  The returned
# string has a newline appended.
#
# Returns "" for an empty entry, and also (after printing a message to
# STDERR) for an entry from which the address or the z variable cannot be
# parsed.
sub anonymise_log_entry
{
    my $entry = shift(@_);
    return "" if (!defined($entry) || $entry eq "");

    # Digest::MD5 is loaded here so this subroutine does not rely on an MD5
    # object having been set up by its caller.  md5_hex() gives the same
    # 32-character hex digest as the old MD5 module's hexhash().
    require Digest::MD5;

    # Parse the IP address from the entry.  The result of the match itself is
    # tested: after a failed match $1 still holds the capture from the last
    # successful match, so defined($1) is not a reliable check.
    if ($entry !~ /^\S+\s([\w.-]+)\s\[/) {
        print STDERR "Could not extract IP address from entry: $entry\n";
        return "";
    }

    # Casefold the IP address, hash using MD5, and take the last 16 characters
    my $ipaddress = $1;
    $ipaddress =~ tr/A-Z/a-z/;
    my $hashedaddress = substr(Digest::MD5::md5_hex($ipaddress), -16);

    # Replace the IP address with the hashed value.  \Q...\E makes the dots
    # match literally; unquoted, "1.2.3.4" would also match "1x2y3z4".
    $entry =~ s/\Q$ipaddress\E/$hashedaddress/ig;

    # Parse the Greenstone user identifier (z variable) from the entry.  This
    # is done after the address substitution above, so any copy of the
    # address inside the z value has already been replaced by its hash.
    if ($entry !~ /\sz=([\w.-]+)/) {
        print STDERR "No z variable in entry: $entry\n";
        return "";
    }

    # Casefold the Greenstone user ID, hash using MD5, and take the last 16 characters
    my $gsuserid = $1;
    $gsuserid =~ tr/A-Z/a-z/;
    my $hasheduserid = substr(Digest::MD5::md5_hex($gsuserid), -16);

    # Replace the Greenstone user ID with the hashed value.
    # NOTE(review): every occurrence in the entry is replaced, not only the
    # one after "z=" -- confirm this is intended for very short user IDs.
    $entry =~ s/\Q$gsuserid\E/$hasheduserid/ig;
    return $entry . "\n";
}
118
119
# Script entry point: hand the command-line arguments to main()
main(@ARGV);
Note: See TracBrowser for help on using the repository browser.