root/gsdl/trunk/perllib/DateExtract.pm @ 17110

Revision 15895, 8.3 KB (checked in by mdewsnip, 11 years ago)

Added "use strict" to one more file, then fixed all the problems identified.

  • Property svn:keywords set to Author Date Id Revision
Line 
1package DateExtract;
2
3##use BasPlug; ## no, DON'T use BasPlug, BasPlug uses us....
4use sorttools;
5use strict;
6use util;
7
8#75% of the instances of the word century use full name ordinals
9my %ordinals = ("first" => 1, "second" => 2, "third" => 3, "fourth" => 4,
10             "fifth" => 5, "sixth" => 6, "seventh" => 7, "eighth" => 8,
11             "ninth" => 9, "tenth" => 10, "eleventh" => 11, "twelfth" => 12,
12             "thirteenth" => 13, "fourteenth" => 14, "fifteenth" => 15,
13             "sixteenth" => 16, "seventeenth" => 17, "eighteenth" => 18,
14             "nineteenth" => 19, "twentieth" => 20);
15
16
17             
18
19#definitions for a date grammar.
20my $fulOrd = join('|',(keys %ordinals));
21
22my @months = ("january","february","march","april","may","june","july",
23              "august","september","october","november","december");
24
25my $shortmth = "";
26foreach my $m (@months) { $shortmth .= (substr($m,0,3)."\\.?|"); }
27chop($shortmth);
28
29my $longmth = join('|',@months);
30
31
32my $Qualifier = "(B(\\.)?C(\\.)?(E(\\.)?)?)|(A(\\.)?D(\\.)?)|(C(\\.)E(\\.)?)";
33my $Century = "Cent(\\.|ur(y|ies))";
34my $Ord = "st|nd|rd|th";
35my $seasonref = "(spring|fall|autumn|winter|summer)";
36my $sep = " ?- ?";
37my $tri_digit = "(in|since|(the year)|($seasonref of)) \\d{3}\\D( ($Qualifier))?";
38my $centurydate = "(((\\d{1,2})($Ord))|($fulOrd)) ($Century)( ($Qualifier))?";
39my $qualified = "(($Qualifier) ?(\\d{1,4}))|((\\d{1,4}) ?($Qualifier))";
40my $millenium = "[1-9]\\d{3}";
41my $range = "(($millenium)($sep)($millenium))|(($millenium)($sep)(\\d{1,2}))|(($qualified)($sep)($qualified))|(($Qualifier) ?(\\d{1,3})($sep)(\\d{1,3}))|((\\d{1,3})($sep)(\\d{1,3}) ?($Qualifier))";
42
43my $pgnum = "(p(p|g)?\\.? ?\\d+((-|,)(\\d+))?)|(page \\d+)|(pages \\d+((-|,)(\\d+))?)";
44my $lgnum = "\\d{1,3},(\\d{3})+";
45my $colon = ":\\d+";
46my $money = "\$(\\d{1,3}(((,\\d{3})+)|\\d+))";  $money = "\\" . $money;
47my $microfilm = " reel([^\\.\\)])*(\\.|\\))";
48my $lastaltered = "last (edited|updated)";
49my $references = "reference(s?)(:?)\\n";
50my $cited = "work(s?) cited";
51my $biblio = "bibliography(:?)( ?)\\n";
52
53my $direction = "(N(\\.|o\\.|th\\.|orth))|(S(\\.|th\\.|outh))|(E(\\.|ast))|(W(\\.|est))";
54my $street_id = "(st\\.)|(street[^A-Za-z])|(ave\\.)|(avenue[^A-Za-z])|(boulevard[^A-Za-z])|(blvd\\.)|(rd\\.)|(road[^A-Za-z])";
55my $streetname = "(\\d{1,2}(<SUP>)?($Ord)(</SUP>)?)|([A-Za-z]+( [A-Za-z]+)?)|([A-Za-z]+-[A-Za-z]+($Ord))";
56my $address = "\\d{1,4} (($direction) )?($streetname) ($street_id)";
57
58my $bracket = "\\($millenium\\)";
59my $ref_end = ", ?$millenium\\)";
60my $colonsp = ": $millenium";
61my $reprint = "[Rr]eprint of \\d{4} edition";
62my $comma = ", $millenium\\.";
63my $fullstop = "\\. $millenium\\.";
64my $semi = "; $millenium\\.";
65
66my $lookalikes = "($pgnum)|($lgnum)|($colon)|($money)|($microfilm)|($address)";
67my $spurious = "($lastaltered)";
68my $bibheader = "($references)|($cited)|($biblio)";
69
70
71sub get_date_metadata {
72    #get the text of the document, the "document object" concerned,
73    #and the current section within the document
74    my ($text, $doc, $cursection, $keep_bib, $max_year, $max_century) = @_;
75   
76    #format a prechristian maximum century value to be negative so that it can
77    #be used in numeric comparison
78    if($max_century =~ /B/)
79    {
80        $max_century = $`;
81        $max_century =~ /\d+/;
82        $max_century = $&;
83        $max_century *=-1
84    }
85   
86    my $extr = &remove_excess($text);   
87    #print "EXTRACTION TEXT:\n $extr";
88    $extr = &remove_tags($extr);
89    if(!$keep_bib){
90        $extr = &remove_biblio($extr);
91    }
92 
93
94    my @datelist = ();
95    while($extr =~ m!($range)|($millenium)|($qualified)|($centurydate)|($tri_digit)!i)
96    {
97        $extr = $';
98        my $fulldate = $&;
99        if ($fulldate =~ /$centurydate/i)
100        {
101            if($max_century!=-1)
102            {
103
104                my $date = $fulldate; if($date =~ /\d+/) {$date = $&;}
105                else
106                {
107                    $date=$fulldate; $date =~ m! ($Century)!i; $date = $`;
108                    $date =~ tr/A-Z/a-z/;
109                    $date = $ordinals{$date};
110                }
111                if($max_century >= $date){
112                    $date = ($date-1)*100 +1;
113                    #if it BC, make it negative
114                    $date = &convert_bc($fulldate,$date);
115                    my $end = $date + 99;
116                    my @century = ($date..$end);
117                    @datelist = (@datelist,@century);
118                }
119            }
120        }
121       
122        elsif($fulldate =~ /$range/)
123        {
124            $fulldate =~ /$sep/;
125            my @addlist = ();
126            #print "Range: $fulldate\n";
127            my $fullfirst = $`;
128            my $fullsecond = $';
129            $fullfirst =~ /\d+/;
130        my $first = $&;
131            $fullsecond =~ /\d+/;
132        my $second = $&;
133            my $len1 = length($first);
134            my $len2 = length($second);
135            $second = (substr($first,0,($len1-$len2))).$second;
136            $first = &convert_bc($fullfirst,$first);
137            $second = &convert_bc($fullsecond,$second);
138            @addlist = ($first..$second);
139            @datelist = (@datelist,@addlist);
140           
141        }
142        else {
143           
144            my $date = $fulldate; $date =~ /\d+/; $date = $&; 
145            $date = &convert_bc($fulldate,$date);
146            #add the date metadata
147            push(@datelist,$date);
148            #print "datelist @datelist\n"
149        }
150       
151    }
152   
153    if(@datelist){
154        @datelist = sort { $a <=> $b } @datelist;
155        @datelist = &post_process($max_year, @datelist);
156        foreach my $date (@datelist)
157        {
158            if($date>0){
159                $doc->add_metadata($cursection,"Coverage",$date);}
160            else{
161                $doc->add_metadata($cursection,"Coverage","bc".(-1*$date));}
162               
163        }
164    }
165}
166sub convert_bc {
167    my ($full,$num) = @_;
168    if ($full =~ /B/) { $num *= -1; }
169    $num;
170}
171
172sub post_process {
173    my ($max_year, @list) = @_;
174    my @cleanlist = ();
175    my $prev = 0;
176    foreach my $e (@list) {
177        if ($e!=$prev && $e <= $max_year) {
178            push(@cleanlist, $e);
179        }
180        $prev = $e;
181    }
182    @cleanlist;
183}
184
185
186#removes all html tags from that data, as they will not contain dates which
187#are part of the content of the document, and therefore interesting, but do
188#contain date lookalikes
189sub remove_tags {
190    my ($tmp) = @_;
191   
192    my $parsed = "";
193    #while there is still text to be parsed and tags are still found
194    while($tmp=~ m!<([^>])*(>|$)! && $tmp ne "")
195    {
196        $parsed .= $`;#keep all that is not in a tag
197        $tmp = $';    #restart the search after then end of the tag
198    }
199    $parsed .= $tmp; #add anything after the last match
200    $parsed;
201}
202
203
204sub remove_excess {
205    my ($tmp) = @_;
206    my $parsed = "";
207   
208
209    if(($tmp =~ m!($spurious)|($lookalikes)!i) == 0 )
210    {
211        $parsed = $tmp;
212    }
213    else {
214        while ($tmp =~ m!($spurious)|($lookalikes)!i
215               && $tmp ne "")
216        {
217            $parsed .= $`;
218            my $storage = $&;
219            $tmp = $';
220            #match the pattern which indicates most recent alteration
221            if ($storage =~ m!$lastaltered!i)
222            {
223                #match a four digit year or up until the first /
224                #(as in last edited 3/97).
225                $tmp =~ m!($millenium)|(\/)!;
226                $tmp = $';
227            }
228               
229        }
230       
231        $parsed .= $tmp;
232           
233    }
234    #print "Parsed:\n $parsed\n\n";
235    $parsed;
236   
237}
238
239sub remove_biblio{
240    my ($tmp) = @_;
241    my $parsed = "";
242   
243    if($tmp =~ m!$bibheader!i)
244    {
245        $tmp=$`;
246    }
247   
248    $tmp =~ s/( |\t)+/ /g;
249    if(($tmp =~ m!($ref_end)|($bracket)|($colonsp)|($reprint)|($comma)|($fullstop)|($semi)|($seasonref) ($millenium)!i) == 0)
250    {
251        $parsed = $tmp;
252    }
253    else{
254
255        #print "removing bib\n";
256        while ($tmp =~ m!($ref_end)|($bracket)|($colonsp)|($reprint)|($comma)|($fullstop)|($semi)|(($seasonref) ($millenium))|($bibheader)!i && $tmp ne "")
257        {
258           
259            $parsed .= $`;
260            $tmp = $';
261            if($&=~m!($comma)|($fullstop)!)
262            {
263               
264                my $date = $&;
265                if($parsed =~ m!((\d)($Ord)$)|(($shortmth)$)|(($longmth)$)!i)
266                {
267                    $parsed .= $date;
268                }
269            } 
270           
271        }
272                   $parsed .= $tmp;
273    }
274    $parsed;
275}
276
277
2781;
Note: See TracBrowser for help on using the browser.