source: other-projects/book-stumper/trunk/extract-posts.pl@ 33353

Last change on this file since 33353 was 33353, checked in by davidb, 5 years ago

Initial set of files to page scrape and turn in the OpenOffice spreadsheet format

  • Property svn:executable set to *
File size: 19.3 KB
Line 
1#!/usr/bin/perl -w
2
3use v5.10;
4use strict;
5
6use Mojo::DOM;
7
8my $FOUT = undef;
9my $HOUT = undef;
10my $REFNUM = 1;
11
12sub tidy_header_field
13{
14 my ($prefix,$untidy_text) = @_;
15 my $tidy_text = $untidy_text;
16 $tidy_text =~ s/^$prefix:\s*//;
17
18 $tidy_text =~ s/<\s*>//gs;
19
20 return $tidy_text;
21}
22
23sub store_header_field
24{
25 my ($untidy_text,$header_record) = @_;
26
27 my ($header_label,$header_value) = ($untidy_text =~ m/^(.*?)\s*:\s*(.*)$/s);
28
29 $header_value =~ s/<\s*>//gs;
30
31 $header_record->{$header_label} = $header_value;
32}
33
34sub print_csv_escaped
35{
36 my ($text) = @_;
37
38 my $escaped_text = $text;
39
40 $escaped_text =~ s/\"/\"\"/g;
41 $escaped_text =~ s/\n/ /gs;
42
43 print $FOUT ",\"$escaped_text\"";
44}
45
46
47# plain text
48# <Row ss:AutoFitHeight="0">
49# <Cell ss:Index="2" ss:StyleID="s62"><Data ss:Type="String">On Sat, 1 Jan 2000, Richard Griscom wrote: &gt; &gt; So. It still works.</Data></Cell>
50# </Row>
51
52
53# Hyperlinked cell
54# <Row>
55# <Cell ss:StyleID="s63" ss:HRef="foo.html">
56# <Data ss:Type="String">Test Boldfoo.html</Data>
57# </Cell>
58# </Row>
59
60sub print_spreadsheet_text_cell_xml
61{
62 my ($text) = @_;
63
64 $text =~ s/&/&amp;/g;
65
66 $text =~ s/</&lt;/g;
67 $text =~ s/>/&gt;/g;
68 $text =~ s/"/&quot;/g;
69
70 print $FOUT " <Cellss:StyleID=\"sPlainText\"><ss:Data ss:Type=\"String\">$text</ss:Data></Cell>\n";
71}
72
73# html
74# <Row ss:Index="7" ss:AutoFitHeight="0" ss:Height="13.5">
75# <Cell ss:Index="2" ss:StyleID="s66"><ss:Data ss:Type="String"
76# xmlns="http://www.w3.org/TR/REC-html40"><I><Font html:Color="#000000">YYYY Happy </Font></I><B><I><Font
77# html:Color="#000000">New</Font></I></B><I><Font html:Color="#000000"> Year Dick&#10; et al !!</Font></I></ss:Data></Cell>
78# </Row>
79
80sub print_spreadsheet_html_cell_xml_orig
81{
82 my ($html) = @_;
83
84# $html =~ s/<br\s*\/?>/&#10;/ig;
85 $html =~ s/<br\s*>/&#10;/ig;
86 $html =~ s/<\/br>//ig;
87
88 $html =~ s/<p[^>]*>(.*?)<\/p>/$1&#10;/ig;
89 $html =~ s/<div[^>]*>(.*?)<\/div>/$1&#10;/ig;
90
91 $html =~ s/<[^>]*>//g;
92# $html =~ s/<\/?(font|big|small)[^>]*>//ig;
93
94
95
96 print $FOUT " <Cell ss:StyleID=\"sHtmlWrap\"><ss:Data ss:Type=\"String\" xmlns=\"http://www.w3.org/TR/REC-html40\">$html</ss:Data></Cell>\n";
97}
98
99sub ucattrs
100{
101 my ($inner) = @_;
102
103 ##$inner =~ s/(\w+)=("[^"]+")/"".uc($1)."=".$2/ge;
104 $inner =~ s/(\w+)=("[^"]+")/\u$1=$2/g;
105
106 return $inner;
107}
108
109sub ucattrsNS
110{
111 my ($tag,$inner) = @_;
112
113 if ($tag =~ m/^Font$/i) {
114 $inner =~ s/(\w+)=("[^"]+")/html:\u$1=$2/g;
115 }
116 else {
117 $inner =~ s/(\w+)=("[^"]+")/\u$1=$2/g;
118 }
119
120 return $inner;
121}
122
123
124sub print_spreadsheet_html_cell_xml_b_i_font
125{
126 my ($html) = @_;
127
128 $html =~ s/<br([^>]*)>/<br$1\/>/ig;
129 $html =~ s/<hr([^>]*)>/<hr$1\/>/ig;
130 $html =~ s/<img([^>]*)>/<img$1\/>/ig;
131
132 $html =~ s/<br\s*\/?>/&#10;/ig;
133# $html =~ s/<\/br>//ig;
134
135 $html =~ s/<p[^>]*>(.*?)<\/p>/$1&#10;/isg;
136# $html =~ s/<p[^>]*>/&#10;/ig;
137 $html =~ s/<div[^>]*>(.*?)<\/div>/$1&#10;/isg;
138
139# $html =~ s/<[^>]*>//g;
140
141 $html =~ s/<table/<div/ig;
142 $html =~ s/<\/table/<\/div/ig;
143
144 $html =~ s/<tr/<div/ig;
145 $html =~ s/<\/tr/<\/div/ig;
146
147 $html =~ s/<td/<div/ig;
148 $html =~ s/<\/td/<\/div/ig;
149
150 # Make HTML all uppercase, as this seems to be needed for spreadsheet format
151
152# $html =~ s/<(\w+)([^>]*)>/"<".uc($1).$2.">"/ge;
153# $html =~ s/<\/(\w+)([^>]*)>/"<\/".uc($1).$2.">"/ge;
154
155 $html =~ s/<(\w+)([^>]*)>/"<".ucfirst($1).$2.">"/ge;
156 $html =~ s/<\/(\w+)([^>]*)>/"<\/".ucfirst($1).$2.">"/ge;
157
158 ##$html =~ s/<([^>]*)(\w+="[^"]+")([^>]*)>/"<".$1.uc($2).$3.">"/ge;
159 $html =~ s/<(\w+)([^>]*)>/"<".$1.ucattrs($2).">"/ge;
160
161 $html =~ s/<Font(.*?)Size=\"[^"]*\"/<Font$1/gs;
162 $html =~ s/<Font(.*?)Face=\"[^"]*\"/<Font$1/gs;
163 $html =~ s/<Big[^>]*>(.*?)<\/Big>/$1/sg;
164 $html =~ s/<Span[^>]*>(.*?)<\/Span>/$1/sg;
165
166 $html =~ s/<[^>]*>//g;
167
168 $html =~ s/^(&#10;)+//s;
169
170
171 print $FOUT " <Cell ss:StyleID=\"sHtmlWrap\"><ss:Data ss:Type=\"String\" xmlns=\"http://www.w3.org/TR/REC-html40\">$html</ss:Data></Cell>\n";
172}
173
174
175sub print_spreadsheet_html_cell_xml
176{
177 my ($html) = @_;
178
179 $html =~ s/<br([^>]*)>/<br$1\/>/ig;
180 $html =~ s/<hr([^>]*)>/<hr$1\/>/ig;
181 $html =~ s/<img([^>]*)>/<img$1\/>/ig;
182
183 $html =~ s/<br[^\/]*\/>/&#10;/ig;
184
185# $html =~ s/<\/br>//ig;
186
187 $html =~ s/<p[^>]*>(.*?)<\/p>/&#10;$1/isg;
188 $html =~ s/<blockquote[^>]*>(.*?)<\/blockquote>/&#10;--$1/isg;
189
190# $html =~ s/<p[^>]*>/&#10;/ig;
191## $html =~ s/<div[^>]*>(.*?)<\/div>/$1&#10;/isg;
192
193 $html =~ s/<(Font)([^>]*)>/AABBCCDD$1$2DDCCBBAA/ig;
194 $html =~ s/<\/(Font)>/AABBCCDD\/$1DDCCBBAA/ig;
195 $html =~ s/<(I|B)\s+([^>]+)>/AABBCCDD$1 $2DDCCBBAA/ig;
196 $html =~ s/<(I|B)>/AABBCCDD$1DDCCBBAA/ig;
197 $html =~ s/<\/(I|B)>/AABBCCDD\/$1DDCCBBAA/ig;
198
199 $html =~ s/<[^>]*>//g;
200
201 $html =~ s/AABBCCDD/</g;
202 $html =~ s/DDCCBBAA/>/g;
203
204 # Make HTML all uppercase, as this seems to be needed for spreadsheet format
205
206## $html =~ s/<(\w+)([^>]*)>/"<".uc($1).$2.">"/ge;
207## $html =~ s/<\/(\w+)([^>]*)>/"<\/".uc($1).$2.">"/ge;
208
209 $html =~ s/<(\w+)([^>]*)>/"<".ucfirst($1).$2.">"/ge;
210 $html =~ s/<\/(\w+)([^>]*)>/"<\/".ucfirst($1).$2.">"/ge;
211
212# ##$html =~ s/<([^>]*)(\w+="[^"]+")([^>]*)>/"<".$1.uc($2).$3.">"/ge;
213 $html =~ s/<(\w+)([^>]*)>/"<".$1.ucattrs($2).">"/ge;
214
215 $html =~ s/<Font(.*?)Size=\"((?:\+|-)\d+)[^"]*\"/"<Font".$1." Size=\"".(12+$2)."\""/egs;
216# $html =~ s/<Font(.*?)Face=\"[^"]*\"/<Font$1/gs;
217# # $html =~ s/<Big[^>]*>(.*?)<\/Big>/$1/sg;
218# # $html =~ s/<Span[^>]*>(.*?)<\/Span>/$1/sg;
219
220# $html =~ s/<[^>]*>//g;
221 $html =~ s/Style="color: rgb\(64, 0, 128\)(;?)\"/Color="#400080"/sig;
222 $html =~ s/Style="color: rgb\(0, 0, 0\)(;?)\"/Color="#000000"/sig;
223
224 # consolidate sequence of <Font> tags
225 while ($html =~ m/^(.*?)<Font ([^>]+)><Font ([^>]+)>(.*?)<\/Font><\/Font>(.*$)/is) {
226 my $front=$1;
227 my $attr1=$2;
228 my $attr2=$3;
229 my $inner=$4;
230 my $back=$5;
231
232 #$attr1 =~ s/Style="color: rgb\(64, 0, 128\)(;?)/Color="#400080"/ig;
233 #$attr2 =~ s/Style="color: rgb\(64, 0, 128\)(;?)/Color="#400080"/ig;
234
235 my ($attr_name1) = ($attr1 =~ m/^(.*?)=/);
236 my ($attr_name2) = ($attr2 =~ m/^(.*?)=/);
237
238 if ($attr_name1 ne $attr_name2) {
239# if (index($attr1,$attr_name2)<0) {
240 $html = "$front<Font $attr1 $attr2>$inner<\/Font>$back";
241 }
242 else {
243 $html = "$front<Font $attr2>$inner<\/Font>$back";
244 }
245 }
246
247 $html =~ s/\s+/ /sg;
248
249 $html =~ s/^(&#10;)+//s;
250
251 # parse
252 my $tidy_dom = Mojo::DOM->new($html);
253 my $child_font_collection = $tidy_dom->find('font>font');
254
255 for my $child_font_elem (@$child_font_collection) {
256 my $children = $child_font_elem->child_nodes();
257 $child_font_elem->replace($children->join()->say());
258 }
259
260 my $tidy_html = $tidy_dom->content;
261 $tidy_html =~ s/<(\w+)([^>]*)>/"<".ucfirst($1).$2.">"/ge;
262 $tidy_html =~ s/<\/(\w+)([^>]*)>/"<\/".ucfirst($1).$2.">"/ge;
263 $tidy_html =~ s/<(\w+)([^>]*)>/"<".$1.ucattrsNS($1,$2).">"/ge;
264
265 $tidy_html =~ s/$/&#10;/mg;
266
267 print $FOUT " <Cell ss:StyleID=\"sHtmlWrap\"><ss:Data ss:Type=\"String\" xmlns=\"http://www.w3.org/TR/REC-html40\">$tidy_html</ss:Data></Cell>\n";
268
269
270}
271
272
273
274sub str_cat_eval
275{
276 my ($str) = @_;
277 $str =~ s/\+/./g;
278
279 return eval $str;
280}
281
282sub process_html_message
283{
284 my ($html_filename,$message_id) = @_;
285 print " Processing: $html_filename\n";
286
287 open (my $FH, '<:encoding(UTF-8)', $html_filename)
288 || die "Could not open file '$html_filename' $!";
289
290 my $content = "";
291 while (my $line = <$FH>) {
292 $content .= $line;
293 }
294
295 close($FH);
296
297 my $dom = Mojo::DOM->new($content);
298
299 my $li_collection = $dom->find('div[class="block"]>ul>li');
300
301# my $from = tidy_header_field("From", $li_collection->[0]->all_text);
302# my $to = tidy_header_field("To", $li_collection->[1]->all_text);
303# my $subject = tidy_header_field("Subject",$li_collection->[2]->all_text);
304# my $date = tidy_header_field("Date", $li_collection->[3]->all_text);
305# print " From : $from\n";
306# print " To : $to\n";
307# print " Subject : $subject\n";
308# print " Date : $date\n";
309
310 my $header_record = {};
311
312 for my $li_elem (@$li_collection) {
313 store_header_field($li_elem->all_text,$header_record);
314 }
315
316# for my $header_label (sort keys %$header_record) {
317# print " $header_label : $header_record->{$header_label}\n";
318# }
319
320
321 print $FOUT " <Row>\n";
322
323 #print $FOUT "\"$message_id\"";
324 print_spreadsheet_text_cell_xml($message_id);
325
326 if (defined $header_record->{'From'}) {
327 my $from = $header_record->{'From'};
328 #print_csv_escaped($from);
329 print_spreadsheet_text_cell_xml($from)
330 }
331 else {
332 # leave empty cell in CSV file
333 #print_csv_escaped("");
334 print_spreadsheet_text_cell_xml("")
335 }
336
337# if (defined $header_record->{'To'}) {
338# my $to = $header_record->{'To'};
339# #print_csv_escaped($to);
340# print_spreadsheet_text_cell_xml($to)
341# }
342# else {
343# # leave empty cell in CSV file
344# #print_csv_escaped("");
345# print_spreadsheet_text_cell_xml("")
346# }
347
348 if (defined $header_record->{'Date'}) {
349 my $date = $header_record->{'Date'};
350 #print_csv_escaped($date);
351 print_spreadsheet_text_cell_xml($date)
352 }
353 else {
354 # leave empty cell in CSV file
355 #print_csv_escaped("");
356 print_spreadsheet_text_cell_xml("")
357 }
358
359 if (defined $header_record->{'Subject'}) {
360 my $subject = $header_record->{'Subject'};
361 #print_csv_escaped($subject);
362 print_spreadsheet_text_cell_xml($subject)
363 }
364 else {
365 # leave empty cell in CSV file
366 #print_csv_escaped("");
367 print_spreadsheet_text_cell_xml("")
368 }
369
370 # Extract article
371 my $posting_div = $dom->at('#ActionHeader + div');
372 if (!defined $posting_div) {
373 print_spreadsheet_html_cell_xml("Empty article posted");
374
375 print "Empty article!!!\n";
376 print "=============\n";
377
378 # print $FOUT "\n";
379 print $FOUT " </Row>\n";
380
381 $REFNUM++;
382
383 return;
384 }
385
386 my $posting = $posting_div->child_nodes;
387 my $posting_array = $posting->to_array();
388
389# if (scalar(@$posting_array)>1) {
390# print "*****### dealing with multi value posting array: ", scalar(@$posting_array), "\n";
391# }
392
393 foreach my $elem (@$posting_array) {
394 if ($elem =~ m/^<hr\s+[^>]*>/si) {
395
396 my ($article) = ($elem =~ m/<!--X-Body-of-Message-->(.*)<!--X-Body-of-Message-End-->/s);
397 $article =~ s/<script[^>]*>.*?document\.write\((.*?)\).*?<\/script>/str_cat_eval($1)/sge;
398
399 my $cutpos = rindex($article,"To leave MLA-L, send the command SIGNOFF MLA-L to");
400 my $tidy_article = substr($article,0,$cutpos);
401 $article = $tidy_article;
402 $article =~ s/\*{70,}<br>$//s;
403
404 $article =~ s/^\s*//s;
405 $article =~ s/\s*$//s;
406# if ($article =~ m/Get Your Private/) {
407# print "***** $article\n\n";
408# }
409
410 print_spreadsheet_html_cell_xml($article);
411
412 # clean up tail, lock onto last occurrance of 'To leave ...'
413
414## my $cutpos = rindex($elem,"To leave MLA-L, send the command SIGNOFF MLA-L to");
415## my $tidy_elem = substr($elem,0,$cutpos);
416## $elem = $tidy_elem;
417
418# $elem =~ s/To leave MLA-L, send the command SIGNOFF MLA-L to<br>.*?$//s;
419# $elem =~ s/\*{70,}<br>$//s;
420# $elem =~ s/\s*$//s;
421
422# # clean up head
423# $elem =~ s/^.*<!--X-Body-of-Message-->//s;
424# $elem =~ s/^\s*//s;
425
426# $elem =~ s/<script[^>]*>.*?document\.write\((.*?)\).*?<\/script>/str_cat_eval($1)/sge;
427
428# if ($elem =~ m/Get Your Private/) {
429# print "***** $elem\n\n";
430# }
431
432
433 ##print " <article>\n$elem\n </article>\n";
434 #print_csv_escaped("<html>$elem</html>");
435# print_spreadsheet_html_cell_xml($elem)
436
437 }
438 }
439 # <!--X-Body-of-Message-->
440 # ....
441 # *************************************************************************<br>
442 # To leave MLA-L, send the command SIGNOFF MLA-L to<br>
443
444
445 print "=============\n";
446
447 # print $FOUT "\n";
448 print $FOUT " </Row>\n";
449
450 $REFNUM++;
451}
452
453sub spreadsheet_header
454{
455 my ($group) = @_;
456
457 open (my $FH, '<:encoding(UTF-8)', "spreadsheet-template-header.xml")
458 || die "Could not open file 'spreadsheet-template-header' $!";
459
460 my $content = "";
461 while (my $line = <$FH>) {
462 $line =~ s/####Worksheet####/$group/g;
463 $content .= $line;
464 }
465
466 close($FH);
467
468 print $FOUT $content;
469}
470
471sub spreadsheet_footer
472{
473 open (my $FH, '<:encoding(UTF-8)', "spreadsheet-template-footer.xml")
474 || die "Could not open file 'spreadsheet-template-footer' $!";
475
476 my $content = "";
477 while (my $line = <$FH>) {
478 $content .= $line;
479 }
480
481 close($FH);
482
483 print $FOUT $content;
484}
485
486
487sub process_html_file
488{
489 my ($group, $html_filename) = @_;
490 print " Processing: $html_filename\n";
491
492 open (my $FH, '<:encoding(windows-1252)', $html_filename)
493 || die "Could not open file '$html_filename' $!";
494
495 my $content = "";
496 while (my $line = <$FH>) {
497 $content .= $line;
498 }
499
500 close($FH);
501
502 my $dom = Mojo::DOM->new($content);
503
504 # my $article_dom = $dom->find('center')->first()->following();
505
506 my $last_post = 0;
507
508 my $posts = [];
509 my $missing_titles = [];
510 my $missing_body = [];
511
512
513 for my $hr_elem ($dom->find('hr[width="100%"]')->each) {
514
515 my $opt_icons = [];
516 my $title_line = undef;
517 my $body_html = "";
518
519 for my $e ($hr_elem->following()->each) {
520
521 if ($e->tag eq "center") {
522 $last_post = 1;
523 last;
524 }
525
526 if ($e->tag eq "hr") {
527 last;
528 }
529
530# if ($e->tag eq "br") {
531# next;
532# }
533
534 my $opt_head = $e->find('font[size="+3"]');
535 my $opt_font_color = $e->find('font[color="#400080"]');
536 my $opt_style = $e->find('[style*="color: rgb(64, 0, 128)"]');
537
538 my $is_title_line = ((scalar(@$opt_head) > 0) || (scalar(@$opt_font_color)>0) || (scalar(@$opt_style)>0));
539
540 # print "*** testing for title: $e\n";
541 if (!$is_title_line) {
542 if ($e->tag eq "font"){
543 my $opt_size = $e->{'size'};
544 if (defined $opt_size && $opt_size eq "+3") {
545 $is_title_line = 1;
546 }
547 my $opt_color = $e->{'color'};
548 if (defined $opt_color && $opt_color eq "#400080") {
549 $is_title_line = 1;
550 }
551 }
552 elsif ($e->tag eq "big") {
553 my $opt_font_color = $e->find('font[color="#400080"]');
554 if (scalar(@$opt_font_color)>0) {
555 $is_title_line = 1;
556 }
557 else {
558 my $opt_style = $e->{'style'};
559 if (defined $opt_style && $opt_style =~ m/color:\s*rgb\(64,\s*0,\s*128\)/) {
560 $is_title_line = 1;
561 }
562 }
563
564
565 }
566 }
567
568 if ($e->tag eq "a") {
569 push(@$opt_icons,$e);
570 }
571 elsif ((!defined $title_line) && ($is_title_line)) {
572 $title_line = $e;
573 }
574 else {
575 $body_html .= $e;
576 }
577## print " ==\n";
578 }
579
580 if (defined $title_line) {
581 my $visible_title = $title_line;
582 $visible_title =~ s/<[^>]*>//g;
583 $visible_title =~ s/\s+/ /gs;
584
585
586 if ($body_html ne "") {
587
588 push(@$posts,{ 'title' => $title_line, 'body' => $body_html, 'opt_icons' => scalar(@$opt_icons)});
589
590 print "Title: $visible_title\n";
591
592# print "Title Line: $title_line\n";
593# print "Body: $body_html\n";
594 }
595 else {
596 print "Warning: No BODY HTML to go with TITLE LINE: '$visible_title'\n";
597 push(@$missing_body, $title_line);
598 }
599 }
600 else {
601 if ($body_html eq "") {
602 print "Skipping entry with no title or body text\n";
603 }
604 else {
605 my $visible_text = $body_html;
606 $visible_text =~ s/<[^>]*>//g;
607 if ($visible_text =~ m/^\s*$/s) {
608 print "Skipping entry with no title or body text\n";
609 }
610 else {
611 if ($body_html !~ m/^<table/i) {
612 print "Warning: article has BODY TEXT but no TITLE: '$body_html'\n";
613 }
614 push(@$missing_titles, $body_html);
615 }
616 }
617 }
618 print "+++====++++\n";
619
620 if ($last_post) {
621 last;
622 }
623 }
624
625 my ($tail_filename) = ($html_filename =~ m/^.*\/(.*)$/);
626 my $anchored_html_filename = "anchored-$tail_filename";
627
628 my $i = 1;
629 foreach my $rec (@$posts) {
630 print $FOUT " <Row>\n";
631 my $ref_id = "Ref-$group-".sprintf("%03d",$i);
632
633 print_spreadsheet_html_cell_xml($ref_id);
634 print_spreadsheet_html_cell_xml($rec->{'title'});
635 print_spreadsheet_html_cell_xml($rec->{'body'});
636 print_spreadsheet_html_cell_xml($rec->{'opt_icons'});
637 print $FOUT " <Cell ss:StyleID=\"sPlainText\" ss:HRef=\"$anchored_html_filename#$ref_id\"><ss:Data ss:Type=\"String\">View external HTML $ref_id</ss:Data></Cell>\n";
638
639 print $FOUT " </Row>\n";
640
641 $rec->{'title'}->prepend("<div><i><a name=\"$ref_id\" >Waikato added $ref_id</a></i></div>");
642 $i++;
643 }
644
645 print "*\n";
646 print "* Generated anchored version $anchored_html_filename\n";
647 print "*\n";
648
649 # <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
650 $dom->at('meta[http-equiv]')->replace('<meta http-equiv="Content-Type" content="text/html; charset=utf-8">');
651
652 ## https://web.archive.org/web/20070218055720/http://www.logan.com/loganberry/solved-carbonel.jpg
653## print "*** looking for html tags\n";
654 my $a_collection = $dom->find('a');
655 for my $a_elem (@$a_collection) {
656 my $href = $a_elem->attr('href');
657 if (defined $href) {
658 if ($href =~ m/^http:\/\/www\.logan\.com\/loganberry\/.*\.(jpg|gif|png)$/i) {
659 $a_elem->attr({ 'href' => "https://web.archive.org/web/20030218055720/$href"});
660 }
661 }
662 }
663
664 my $img_collection = $dom->find('img');
665 for my $img_elem (@$img_collection) {
666 # <img alt="click for image of book" src="http://www.logan.com/loganberry/camera1.gif" width="64" height="64" border="0" align="LEFT">
667
668 my $src = $img_elem->attr('src');
669# print "*** src = $src\n";
670 if ($src =~ m/^http:\/\/www\.logan\.com\/loganberry\/.*\.(jpg|gif|png)$/i) {
671## print "**** relplacing src with https://web.archive.org/web/20170218055720/$src\n";
672# if ($src =~ m/camera1\.gif$/i) {
673# $img_elem->attr({ 'src' => "https://web.archive.org/web/20120213150518/$src"});
674# }
675# else {
676 $img_elem->attr({ 'src' => "https://web.archive.org/web/20020218055720/$src"});
677## $img_elem->attr({ 'src' => "https://web.archive.org/web/20170218055720/$src"});
678# }
679 }
680 }
681
682
683
684 open(my $AOUT, '>:encoding(UTF-8)', $anchored_html_filename)
685 || die "Failed to open $anchored_html_filename: $!\n";
686 print $AOUT $dom->content;
687 close($AOUT);
688
689
690 print "*****\n";
691
692 print "* num posts = ", scalar(@$posts), "\n";
693 print "* missing titles = ", scalar(@$missing_titles), "\n";
694 print "* missing body = ", scalar(@$missing_body), "\n";
695 print "*****\n";
696}
697
698sub main
699{
700 my $ARGC = scalar(@ARGV);
701
702 if ($ARGC != 1) {
703 print STDERR "Usage: $0 directory\n";
704 exit(1);
705 }
706
707 my $input_dir = $ARGV[0];
708 $input_dir .= "/" unless ($input_dir =~ /\/$/);
709
710 print "Reading sub-directories that match: solved-[a-z]+.html\n";
711
712 opendir(my $DIRIN, $input_dir) || die "can't opendir $input_dir: $!";
713 my @input_files = sort grep { $_ =~ /^solved-[a-z]+.html$/ } readdir($DIRIN);
714 closedir $DIRIN;
715
716 print "Number of matching files: ", scalar(@input_files), "\n";
717
718## my $prev_year = "0000";
719
720 foreach my $file (@input_files) {
721
722 my $full_filename = "$input_dir$file";
723 print " Processing file: $full_filename\n";
724
725 my ($group) = ($file =~ m/^solved-([a-z]+).html$/);
726
727 my $csv_filename = "solved-$group.xml";
728
729 open $FOUT, '>:encoding(UTF-8)', $csv_filename
730 || die "Failed to open $csv_filename: $!\n";
731
732 spreadsheet_header($group);
733 process_html_file($group,$full_filename);
734 spreadsheet_footer();
735
736 close($FOUT);
737 }
738
739## spreadsheet_footer();
740## close($FOUT);
741}
742
743main();
744
745
Note: See TracBrowser for help on using the repository browser.