source: gsdl/trunk/perllib/DateExtract.pm@ 15895

Last change on this file since 15895 was 15895, checked in by mdewsnip, 16 years ago

Added "use strict" to one more file, then fixed all the problems identified.

  • Property svn:keywords set to Author Date Id Revision
File size: 8.3 KB
Line 
1package DateExtract;
2
3##use BasPlug; ## no, DON'T use BasPlug, BasPlug uses us....
4use sorttools;
5use strict;
6use util;
7
8#75% of the instances of the word century use full name ordinals
9my %ordinals = ("first" => 1, "second" => 2, "third" => 3, "fourth" => 4,
10 "fifth" => 5, "sixth" => 6, "seventh" => 7, "eighth" => 8,
11 "ninth" => 9, "tenth" => 10, "eleventh" => 11, "twelfth" => 12,
12 "thirteenth" => 13, "fourteenth" => 14, "fifteenth" => 15,
13 "sixteenth" => 16, "seventeenth" => 17, "eighteenth" => 18,
14 "nineteenth" => 19, "twentieth" => 20);
15
16
17
18
19#definitions for a date grammar.
20my $fulOrd = join('|',(keys %ordinals));
21
22my @months = ("january","february","march","april","may","june","july",
23 "august","september","october","november","december");
24
25my $shortmth = "";
26foreach my $m (@months) { $shortmth .= (substr($m,0,3)."\\.?|"); }
27chop($shortmth);
28
29my $longmth = join('|',@months);
30
31
32my $Qualifier = "(B(\\.)?C(\\.)?(E(\\.)?)?)|(A(\\.)?D(\\.)?)|(C(\\.)E(\\.)?)";
33my $Century = "Cent(\\.|ur(y|ies))";
34my $Ord = "st|nd|rd|th";
35my $seasonref = "(spring|fall|autumn|winter|summer)";
36my $sep = " ?- ?";
37my $tri_digit = "(in|since|(the year)|($seasonref of)) \\d{3}\\D( ($Qualifier))?";
38my $centurydate = "(((\\d{1,2})($Ord))|($fulOrd)) ($Century)( ($Qualifier))?";
39my $qualified = "(($Qualifier) ?(\\d{1,4}))|((\\d{1,4}) ?($Qualifier))";
40my $millenium = "[1-9]\\d{3}";
41my $range = "(($millenium)($sep)($millenium))|(($millenium)($sep)(\\d{1,2}))|(($qualified)($sep)($qualified))|(($Qualifier) ?(\\d{1,3})($sep)(\\d{1,3}))|((\\d{1,3})($sep)(\\d{1,3}) ?($Qualifier))";
42
43my $pgnum = "(p(p|g)?\\.? ?\\d+((-|,)(\\d+))?)|(page \\d+)|(pages \\d+((-|,)(\\d+))?)";
44my $lgnum = "\\d{1,3},(\\d{3})+";
45my $colon = ":\\d+";
46my $money = "\$(\\d{1,3}(((,\\d{3})+)|\\d+))"; $money = "\\" . $money;
47my $microfilm = " reel([^\\.\\)])*(\\.|\\))";
48my $lastaltered = "last (edited|updated)";
49my $references = "reference(s?)(:?)\\n";
50my $cited = "work(s?) cited";
51my $biblio = "bibliography(:?)( ?)\\n";
52
53my $direction = "(N(\\.|o\\.|th\\.|orth))|(S(\\.|th\\.|outh))|(E(\\.|ast))|(W(\\.|est))";
54my $street_id = "(st\\.)|(street[^A-Za-z])|(ave\\.)|(avenue[^A-Za-z])|(boulevard[^A-Za-z])|(blvd\\.)|(rd\\.)|(road[^A-Za-z])";
55my $streetname = "(\\d{1,2}(<SUP>)?($Ord)(</SUP>)?)|([A-Za-z]+( [A-Za-z]+)?)|([A-Za-z]+-[A-Za-z]+($Ord))";
56my $address = "\\d{1,4} (($direction) )?($streetname) ($street_id)";
57
58my $bracket = "\\($millenium\\)";
59my $ref_end = ", ?$millenium\\)";
60my $colonsp = ": $millenium";
61my $reprint = "[Rr]eprint of \\d{4} edition";
62my $comma = ", $millenium\\.";
63my $fullstop = "\\. $millenium\\.";
64my $semi = "; $millenium\\.";
65
66my $lookalikes = "($pgnum)|($lgnum)|($colon)|($money)|($microfilm)|($address)";
67my $spurious = "($lastaltered)";
68my $bibheader = "($references)|($cited)|($biblio)";
69
70
71sub get_date_metadata {
72 #get the text of the document, the "document object" concerned,
73 #and the current section within the document
74 my ($text, $doc, $cursection, $keep_bib, $max_year, $max_century) = @_;
75
76 #format a prechristian maximum century value to be negative so that it can
77 #be used in numeric comparison
78 if($max_century =~ /B/)
79 {
80 $max_century = $`;
81 $max_century =~ /\d+/;
82 $max_century = $&;
83 $max_century *=-1
84 }
85
86 my $extr = &remove_excess($text);
87 #print "EXTRACTION TEXT:\n $extr";
88 $extr = &remove_tags($extr);
89 if(!$keep_bib){
90 $extr = &remove_biblio($extr);
91 }
92
93
94 my @datelist = ();
95 while($extr =~ m!($range)|($millenium)|($qualified)|($centurydate)|($tri_digit)!i)
96 {
97 $extr = $';
98 my $fulldate = $&;
99 if ($fulldate =~ /$centurydate/i)
100 {
101 if($max_century!=-1)
102 {
103
104 my $date = $fulldate; if($date =~ /\d+/) {$date = $&;}
105 else
106 {
107 $date=$fulldate; $date =~ m! ($Century)!i; $date = $`;
108 $date =~ tr/A-Z/a-z/;
109 $date = $ordinals{$date};
110 }
111 if($max_century >= $date){
112 $date = ($date-1)*100 +1;
113 #if it BC, make it negative
114 $date = &convert_bc($fulldate,$date);
115 my $end = $date + 99;
116 my @century = ($date..$end);
117 @datelist = (@datelist,@century);
118 }
119 }
120 }
121
122 elsif($fulldate =~ /$range/)
123 {
124 $fulldate =~ /$sep/;
125 my @addlist = ();
126 #print "Range: $fulldate\n";
127 my $fullfirst = $`;
128 my $fullsecond = $';
129 $fullfirst =~ /\d+/;
130 my $first = $&;
131 $fullsecond =~ /\d+/;
132 my $second = $&;
133 my $len1 = length($first);
134 my $len2 = length($second);
135 $second = (substr($first,0,($len1-$len2))).$second;
136 $first = &convert_bc($fullfirst,$first);
137 $second = &convert_bc($fullsecond,$second);
138 @addlist = ($first..$second);
139 @datelist = (@datelist,@addlist);
140
141 }
142 else {
143
144 my $date = $fulldate; $date =~ /\d+/; $date = $&;
145 $date = &convert_bc($fulldate,$date);
146 #add the date metadata
147 push(@datelist,$date);
148 #print "datelist @datelist\n"
149 }
150
151 }
152
153 if(@datelist){
154 @datelist = sort { $a <=> $b } @datelist;
155 @datelist = &post_process($max_year, @datelist);
156 foreach my $date (@datelist)
157 {
158 if($date>0){
159 $doc->add_metadata($cursection,"Coverage",$date);}
160 else{
161 $doc->add_metadata($cursection,"Coverage","bc".(-1*$date));}
162
163 }
164 }
165}
166sub convert_bc {
167 my ($full,$num) = @_;
168 if ($full =~ /B/) { $num *= -1; }
169 $num;
170}
171
172sub post_process {
173 my ($max_year, @list) = @_;
174 my @cleanlist = ();
175 my $prev = 0;
176 foreach my $e (@list) {
177 if ($e!=$prev && $e <= $max_year) {
178 push(@cleanlist, $e);
179 }
180 $prev = $e;
181 }
182 @cleanlist;
183}
184
185
186#removes all html tags from that data, as they will not contain dates which
187#are part of the content of the document, and therefore interesting, but do
188#contain date lookalikes
189sub remove_tags {
190 my ($tmp) = @_;
191
192 my $parsed = "";
193 #while there is still text to be parsed and tags are still found
194 while($tmp=~ m!<([^>])*(>|$)! && $tmp ne "")
195 {
196 $parsed .= $`;#keep all that is not in a tag
197 $tmp = $'; #restart the search after then end of the tag
198 }
199 $parsed .= $tmp; #add anything after the last match
200 $parsed;
201}
202
203
204sub remove_excess {
205 my ($tmp) = @_;
206 my $parsed = "";
207
208
209 if(($tmp =~ m!($spurious)|($lookalikes)!i) == 0 )
210 {
211 $parsed = $tmp;
212 }
213 else {
214 while ($tmp =~ m!($spurious)|($lookalikes)!i
215 && $tmp ne "")
216 {
217 $parsed .= $`;
218 my $storage = $&;
219 $tmp = $';
220 #match the pattern which indicates most recent alteration
221 if ($storage =~ m!$lastaltered!i)
222 {
223 #match a four digit year or up until the first /
224 #(as in last edited 3/97).
225 $tmp =~ m!($millenium)|(\/)!;
226 $tmp = $';
227 }
228
229 }
230
231 $parsed .= $tmp;
232
233 }
234 #print "Parsed:\n $parsed\n\n";
235 $parsed;
236
237}
238
239sub remove_biblio{
240 my ($tmp) = @_;
241 my $parsed = "";
242
243 if($tmp =~ m!$bibheader!i)
244 {
245 $tmp=$`;
246 }
247
248 $tmp =~ s/( |\t)+/ /g;
249 if(($tmp =~ m!($ref_end)|($bracket)|($colonsp)|($reprint)|($comma)|($fullstop)|($semi)|($seasonref) ($millenium)!i) == 0)
250 {
251 $parsed = $tmp;
252 }
253 else{
254
255 #print "removing bib\n";
256 while ($tmp =~ m!($ref_end)|($bracket)|($colonsp)|($reprint)|($comma)|($fullstop)|($semi)|(($seasonref) ($millenium))|($bibheader)!i && $tmp ne "")
257 {
258
259 $parsed .= $`;
260 $tmp = $';
261 if($&=~m!($comma)|($fullstop)!)
262 {
263
264 my $date = $&;
265 if($parsed =~ m!((\d)($Ord)$)|(($shortmth)$)|(($longmth)$)!i)
266 {
267 $parsed .= $date;
268 }
269 }
270
271 }
272 $parsed .= $tmp;
273 }
274 $parsed;
275}
276
277
2781;
Note: See TracBrowser for help on using the repository browser.