Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

extract-posts.pl@ 33353

Last change on this file since 33353 was 33353, checked in by davidb, 5 years ago
Initial set of files to page scrape and turn in the OpenOffice spreadsheet format
Property svn:executable set to ``*
File size: 19.3 KB

Line
1	#!/usr/bin/perl -w
2
3	use v5.10;
4	use strict;
5
6	use Mojo::DOM;
7
8	my $FOUT = undef;
9	my $HOUT = undef;
10	my $REFNUM = 1;
11
12	sub tidy_header_field
13	{
14	my ($prefix,$untidy_text) = @_;
15	my $tidy_text = $untidy_text;
16	$tidy_text =~ s/^$prefix:\s*//;
17
18	$tidy_text =~ s/<\s*>//gs;
19
20	return $tidy_text;
21	}
22
23	sub store_header_field
24	{
25	my ($untidy_text,$header_record) = @_;
26
27	my ($header_label,$header_value) = ($untidy_text =~ m/^(.?)\s:\s(.)$/s);
28
29	$header_value =~ s/<\s*>//gs;
30
31	$header_record->{$header_label} = $header_value;
32	}
33
34	sub print_csv_escaped
35	{
36	my ($text) = @_;
37
38	my $escaped_text = $text;
39
40	$escaped_text =~ s/\"/\"\"/g;
41	$escaped_text =~ s/\n/ /gs;
42
43	print $FOUT ",\"$escaped_text\"";
44	}
45
46
47	# plain text
48	# <Row ss:AutoFitHeight="0">
49	# <Cell ss:Index="2" ss:StyleID="s62"><Data ss:Type="String">On Sat, 1 Jan 2000, Richard Griscom wrote: > > So. It still works.</Data></Cell>
50	# </Row>
51
52
53	# Hyperlinked cell
54	# <Row>
55	# <Cell ss:StyleID="s63" ss:HRef="foo.html">
56	# <Data ss:Type="String">Test Boldfoo.html</Data>
57	# </Cell>
58	# </Row>
59
60	sub print_spreadsheet_text_cell_xml
61	{
62	my ($text) = @_;
63
64	$text =~ s/&/&/g;
65
66	$text =~ s/</</g;
67	$text =~ s/>/>/g;
68	$text =~ s/"/"/g;
69
70	print $FOUT " <Cellss:StyleID=\"sPlainText\"><ss:Data ss:Type=\"String\">$text</ss:Data></Cell>\n";
71	}
72
73	# html
74	# <Row ss:Index="7" ss:AutoFitHeight="0" ss:Height="13.5">
75	# <Cell ss:Index="2" ss:StyleID="s66"><ss:Data ss:Type="String"
76	# xmlns="http://www.w3.org/TR/REC-html40"><I><Font html:Color="#000000">YYYY Happy </Font></I><B><I><Font
77	# html:Color="#000000">New</Font></I></B><I><Font html:Color="#000000"> Year Dick et al !!</Font></I></ss:Data></Cell>
78	# </Row>
79
80	sub print_spreadsheet_html_cell_xml_orig
81	{
82	my ($html) = @_;
83
84	# $html =~ s/<br\s*\/?>/ /ig;
85	$html =~ s/<br\s*>/ /ig;
86	$html =~ s/<\/br>//ig;
87
88	$html =~ s/<p[^>]>(.?)<\/p>/$1 /ig;
89	$html =~ s/<div[^>]>(.?)<\/div>/$1 /ig;
90
91	$html =~ s/<[^>]*>//g;
92	# $html =~ s/<\/?(font\|big\|small)[^>]*>//ig;
93
94
95
96	print $FOUT " <Cell ss:StyleID=\"sHtmlWrap\"><ss:Data ss:Type=\"String\" xmlns=\"http://www.w3.org/TR/REC-html40\">$html</ss:Data></Cell>\n";
97	}
98
99	sub ucattrs
100	{
101	my ($inner) = @_;
102
103	##$inner =~ s/(\w+)=("[^"]+")/"".uc($1)."=".$2/ge;
104	$inner =~ s/(\w+)=("[^"]+")/\u$1=$2/g;
105
106	return $inner;
107	}
108
109	sub ucattrsNS
110	{
111	my ($tag,$inner) = @_;
112
113	if ($tag =~ m/^Font$/i) {
114	$inner =~ s/(\w+)=("[^"]+")/html:\u$1=$2/g;
115	}
116	else {
117	$inner =~ s/(\w+)=("[^"]+")/\u$1=$2/g;
118	}
119
120	return $inner;
121	}
122
123
124	sub print_spreadsheet_html_cell_xml_b_i_font
125	{
126	my ($html) = @_;
127
128	$html =~ s/<br([^>]*)>/<br$1\/>/ig;
129	$html =~ s/<hr([^>]*)>/<hr$1\/>/ig;
130	$html =~ s/<img([^>]*)>/<img$1\/>/ig;
131
132	$html =~ s/<br\s*\/?>/ /ig;
133	# $html =~ s/<\/br>//ig;
134
135	$html =~ s/<p[^>]>(.?)<\/p>/$1 /isg;
136	# $html =~ s/<p[^>]*>/ /ig;
137	$html =~ s/<div[^>]>(.?)<\/div>/$1 /isg;
138
139	# $html =~ s/<[^>]*>//g;
140
141	$html =~ s/<table/<div/ig;
142	$html =~ s/<\/table/<\/div/ig;
143
144	$html =~ s/<tr/<div/ig;
145	$html =~ s/<\/tr/<\/div/ig;
146
147	$html =~ s/<td/<div/ig;
148	$html =~ s/<\/td/<\/div/ig;
149
150	# Make HTML all uppercase, as this seems to be needed for spreadsheet format
151
152	# $html =~ s/<(\w+)([^>]*)>/"<".uc($1).$2.">"/ge;
153	# $html =~ s/<\/(\w+)([^>]*)>/"<\/".uc($1).$2.">"/ge;
154
155	$html =~ s/<(\w+)([^>]*)>/"<".ucfirst($1).$2.">"/ge;
156	$html =~ s/<\/(\w+)([^>]*)>/"<\/".ucfirst($1).$2.">"/ge;
157
158	##$html =~ s/<([^>])(\w+="[^"]+")([^>])>/"<".$1.uc($2).$3.">"/ge;
159	$html =~ s/<(\w+)([^>]*)>/"<".$1.ucattrs($2).">"/ge;
160
161	$html =~ s/<Font(.?)Size=\"[^"]\"/<Font$1/gs;
162	$html =~ s/<Font(.?)Face=\"[^"]\"/<Font$1/gs;
163	$html =~ s/<Big[^>]>(.?)<\/Big>/$1/sg;
164	$html =~ s/<Span[^>]>(.?)<\/Span>/$1/sg;
165
166	$html =~ s/<[^>]*>//g;
167
168	$html =~ s/^( )+//s;
169
170
171	print $FOUT " <Cell ss:StyleID=\"sHtmlWrap\"><ss:Data ss:Type=\"String\" xmlns=\"http://www.w3.org/TR/REC-html40\">$html</ss:Data></Cell>\n";
172	}
173
174
175	sub print_spreadsheet_html_cell_xml
176	{
177	my ($html) = @_;
178
179	$html =~ s/<br([^>]*)>/<br$1\/>/ig;
180	$html =~ s/<hr([^>]*)>/<hr$1\/>/ig;
181	$html =~ s/<img([^>]*)>/<img$1\/>/ig;
182
183	$html =~ s/<br[^\/]*\/>/ /ig;
184
185	# $html =~ s/<\/br>//ig;
186
187	$html =~ s/<p[^>]>(.?)<\/p>/ $1/isg;
188	$html =~ s/<blockquote[^>]>(.?)<\/blockquote>/ --$1/isg;
189
190	# $html =~ s/<p[^>]*>/ /ig;
191	## $html =~ s/<div[^>]>(.?)<\/div>/$1 /isg;
192
193	$html =~ s/<(Font)([^>]*)>/AABBCCDD$1$2DDCCBBAA/ig;
194	$html =~ s/<\/(Font)>/AABBCCDD\/$1DDCCBBAA/ig;
195	$html =~ s/<(I\|B)\s+([^>]+)>/AABBCCDD$1 $2DDCCBBAA/ig;
196	$html =~ s/<(I\|B)>/AABBCCDD$1DDCCBBAA/ig;
197	$html =~ s/<\/(I\|B)>/AABBCCDD\/$1DDCCBBAA/ig;
198
199	$html =~ s/<[^>]*>//g;
200
201	$html =~ s/AABBCCDD/</g;
202	$html =~ s/DDCCBBAA/>/g;
203
204	# Make HTML all uppercase, as this seems to be needed for spreadsheet format
205
206	## $html =~ s/<(\w+)([^>]*)>/"<".uc($1).$2.">"/ge;
207	## $html =~ s/<\/(\w+)([^>]*)>/"<\/".uc($1).$2.">"/ge;
208
209	$html =~ s/<(\w+)([^>]*)>/"<".ucfirst($1).$2.">"/ge;
210	$html =~ s/<\/(\w+)([^>]*)>/"<\/".ucfirst($1).$2.">"/ge;
211
212	# ##$html =~ s/<([^>])(\w+="[^"]+")([^>])>/"<".$1.uc($2).$3.">"/ge;
213	$html =~ s/<(\w+)([^>]*)>/"<".$1.ucattrs($2).">"/ge;
214
215	$html =~ s/<Font(.?)Size=\"((?:\+\|-)\d+)[^"]\"/"<Font".$1." Size=\"".(12+$2)."\""/egs;
216	# $html =~ s/<Font(.?)Face=\"[^"]\"/<Font$1/gs;
217	# # $html =~ s/<Big[^>]>(.?)<\/Big>/$1/sg;
218	# # $html =~ s/<Span[^>]>(.?)<\/Span>/$1/sg;
219
220	# $html =~ s/<[^>]*>//g;
221	$html =~ s/Style="color: rgb$64, 0, 128$(;?)\"/Color="#400080"/sig;
222	$html =~ s/Style="color: rgb$0, 0, 0$(;?)\"/Color="#000000"/sig;
223
224	# consolidate sequence of <Font> tags
225	while ($html =~ m/^(.?)<Font ([^>]+)><Font ([^>]+)>(.?)<\/Font><\/Font>(.*$)/is) {
226	my $front=$1;
227	my $attr1=$2;
228	my $attr2=$3;
229	my $inner=$4;
230	my $back=$5;
231
232	#$attr1 =~ s/Style="color: rgb$64, 0, 128$(;?)/Color="#400080"/ig;
233	#$attr2 =~ s/Style="color: rgb$64, 0, 128$(;?)/Color="#400080"/ig;
234
235	my ($attr_name1) = ($attr1 =~ m/^(.*?)=/);
236	my ($attr_name2) = ($attr2 =~ m/^(.*?)=/);
237
238	if ($attr_name1 ne $attr_name2) {
239	# if (index($attr1,$attr_name2)<0) {
240	$html = "$front<Font $attr1 $attr2>$inner<\/Font>$back";
241	}
242	else {
243	$html = "$front<Font $attr2>$inner<\/Font>$back";
244	}
245	}
246
247	$html =~ s/\s+/ /sg;
248
249	$html =~ s/^( )+//s;
250
251	# parse
252	my $tidy_dom = Mojo::DOM->new($html);
253	my $child_font_collection = $tidy_dom->find('font>font');
254
255	for my $child_font_elem (@$child_font_collection) {
256	my $children = $child_font_elem->child_nodes();
257	$child_font_elem->replace($children->join()->say());
258	}
259
260	my $tidy_html = $tidy_dom->content;
261	$tidy_html =~ s/<(\w+)([^>]*)>/"<".ucfirst($1).$2.">"/ge;
262	$tidy_html =~ s/<\/(\w+)([^>]*)>/"<\/".ucfirst($1).$2.">"/ge;
263	$tidy_html =~ s/<(\w+)([^>]*)>/"<".$1.ucattrsNS($1,$2).">"/ge;
264
265	$tidy_html =~ s/$/ /mg;
266
267	print $FOUT " <Cell ss:StyleID=\"sHtmlWrap\"><ss:Data ss:Type=\"String\" xmlns=\"http://www.w3.org/TR/REC-html40\">$tidy_html</ss:Data></Cell>\n";
268
269
270	}
271
272
273
274	sub str_cat_eval
275	{
276	my ($str) = @_;
277	$str =~ s/\+/./g;
278
279	return eval $str;
280	}
281
282	sub process_html_message
283	{
284	my ($html_filename,$message_id) = @_;
285	print " Processing: $html_filename\n";
286
287	open (my $FH, '<:encoding(UTF-8)', $html_filename)
288	\|\| die "Could not open file '$html_filename' $!";
289
290	my $content = "";
291	while (my $line = <$FH>) {
292	$content .= $line;
293	}
294
295	close($FH);
296
297	my $dom = Mojo::DOM->new($content);
298
299	my $li_collection = $dom->find('div[class="block"]>ul>li');
300
301	# my $from = tidy_header_field("From", $li_collection->[0]->all_text);
302	# my $to = tidy_header_field("To", $li_collection->[1]->all_text);
303	# my $subject = tidy_header_field("Subject",$li_collection->[2]->all_text);
304	# my $date = tidy_header_field("Date", $li_collection->[3]->all_text);
305	# print " From : $from\n";
306	# print " To : $to\n";
307	# print " Subject : $subject\n";
308	# print " Date : $date\n";
309
310	my $header_record = {};
311
312	for my $li_elem (@$li_collection) {
313	store_header_field($li_elem->all_text,$header_record);
314	}
315
316	# for my $header_label (sort keys %$header_record) {
317	# print " $header_label : $header_record->{$header_label}\n";
318	# }
319
320
321	print $FOUT " <Row>\n";
322
323	#print $FOUT "\"$message_id\"";
324	print_spreadsheet_text_cell_xml($message_id);
325
326	if (defined $header_record->{'From'}) {
327	my $from = $header_record->{'From'};
328	#print_csv_escaped($from);
329	print_spreadsheet_text_cell_xml($from)
330	}
331	else {
332	# leave empty cell in CSV file
333	#print_csv_escaped("");
334	print_spreadsheet_text_cell_xml("")
335	}
336
337	# if (defined $header_record->{'To'}) {
338	# my $to = $header_record->{'To'};
339	# #print_csv_escaped($to);
340	# print_spreadsheet_text_cell_xml($to)
341	# }
342	# else {
343	# # leave empty cell in CSV file
344	# #print_csv_escaped("");
345	# print_spreadsheet_text_cell_xml("")
346	# }
347
348	if (defined $header_record->{'Date'}) {
349	my $date = $header_record->{'Date'};
350	#print_csv_escaped($date);
351	print_spreadsheet_text_cell_xml($date)
352	}
353	else {
354	# leave empty cell in CSV file
355	#print_csv_escaped("");
356	print_spreadsheet_text_cell_xml("")
357	}
358
359	if (defined $header_record->{'Subject'}) {
360	my $subject = $header_record->{'Subject'};
361	#print_csv_escaped($subject);
362	print_spreadsheet_text_cell_xml($subject)
363	}
364	else {
365	# leave empty cell in CSV file
366	#print_csv_escaped("");
367	print_spreadsheet_text_cell_xml("")
368	}
369
370	# Extract article
371	my $posting_div = $dom->at('#ActionHeader + div');
372	if (!defined $posting_div) {
373	print_spreadsheet_html_cell_xml("Empty article posted");
374
375	print "Empty article!!!\n";
376	print "=============\n";
377
378	# print $FOUT "\n";
379	print $FOUT " </Row>\n";
380
381	$REFNUM++;
382
383	return;
384	}
385
386	my $posting = $posting_div->child_nodes;
387	my $posting_array = $posting->to_array();
388
389	# if (scalar(@$posting_array)>1) {
390	# print "*****### dealing with multi value posting array: ", scalar(@$posting_array), "\n";
391	# }
392
393	foreach my $elem (@$posting_array) {
394	if ($elem =~ m/^<hr\s+[^>]*>/si) {
395
396	my ($article) = ($elem =~ m/<!--X-Body-of-Message-->(.*)<!--X-Body-of-Message-End-->/s);
397	$article =~ s/<script[^>]>.?document\.write$(.?)$.?<\/script>/str_cat_eval($1)/sge;
398
399	my $cutpos = rindex($article,"To leave MLA-L, send the command SIGNOFF MLA-L to");
400	my $tidy_article = substr($article,0,$cutpos);
401	$article = $tidy_article;
402	$article =~ s/\*{70,}<br>$//s;
403
404	$article =~ s/^\s*//s;
405	$article =~ s/\s*$//s;
406	# if ($article =~ m/Get Your Private/) {
407	# print "***** $article\n\n";
408	# }
409
410	print_spreadsheet_html_cell_xml($article);
411
412	# clean up tail, lock onto last occurrance of 'To leave ...'
413
414	## my $cutpos = rindex($elem,"To leave MLA-L, send the command SIGNOFF MLA-L to");
415	## my $tidy_elem = substr($elem,0,$cutpos);
416	## $elem = $tidy_elem;
417
418	# $elem =~ s/To leave MLA-L, send the command SIGNOFF MLA-L to<br>.*?$//s;
419	# $elem =~ s/\*{70,}<br>$//s;
420	# $elem =~ s/\s*$//s;
421
422	# # clean up head
423	# $elem =~ s/^.*<!--X-Body-of-Message-->//s;
424	# $elem =~ s/^\s*//s;
425
426	# $elem =~ s/<script[^>]>.?document\.write$(.?)$.?<\/script>/str_cat_eval($1)/sge;
427
428	# if ($elem =~ m/Get Your Private/) {
429	# print "***** $elem\n\n";
430	# }
431
432
433	##print " <article>\n$elem\n </article>\n";
434	#print_csv_escaped("<html>$elem</html>");
435	# print_spreadsheet_html_cell_xml($elem)
436
437	}
438	}
439	# <!--X-Body-of-Message-->
440	# ....
441	# *************************************************************************<br>
442	# To leave MLA-L, send the command SIGNOFF MLA-L to<br>
443
444
445	print "=============\n";
446
447	# print $FOUT "\n";
448	print $FOUT " </Row>\n";
449
450	$REFNUM++;
451	}
452
453	sub spreadsheet_header
454	{
455	my ($group) = @_;
456
457	open (my $FH, '<:encoding(UTF-8)', "spreadsheet-template-header.xml")
458	\|\| die "Could not open file 'spreadsheet-template-header' $!";
459
460	my $content = "";
461	while (my $line = <$FH>) {
462	$line =~ s/####Worksheet####/$group/g;
463	$content .= $line;
464	}
465
466	close($FH);
467
468	print $FOUT $content;
469	}
470
471	sub spreadsheet_footer
472	{
473	open (my $FH, '<:encoding(UTF-8)', "spreadsheet-template-footer.xml")
474	\|\| die "Could not open file 'spreadsheet-template-footer' $!";
475
476	my $content = "";
477	while (my $line = <$FH>) {
478	$content .= $line;
479	}
480
481	close($FH);
482
483	print $FOUT $content;
484	}
485
486
487	sub process_html_file
488	{
489	my ($group, $html_filename) = @_;
490	print " Processing: $html_filename\n";
491
492	open (my $FH, '<:encoding(windows-1252)', $html_filename)
493	\|\| die "Could not open file '$html_filename' $!";
494
495	my $content = "";
496	while (my $line = <$FH>) {
497	$content .= $line;
498	}
499
500	close($FH);
501
502	my $dom = Mojo::DOM->new($content);
503
504	# my $article_dom = $dom->find('center')->first()->following();
505
506	my $last_post = 0;
507
508	my $posts = [];
509	my $missing_titles = [];
510	my $missing_body = [];
511
512
513	for my $hr_elem ($dom->find('hr[width="100%"]')->each) {
514
515	my $opt_icons = [];
516	my $title_line = undef;
517	my $body_html = "";
518
519	for my $e ($hr_elem->following()->each) {
520
521	if ($e->tag eq "center") {
522	$last_post = 1;
523	last;
524	}
525
526	if ($e->tag eq "hr") {
527	last;
528	}
529
530	# if ($e->tag eq "br") {
531	# next;
532	# }
533
534	my $opt_head = $e->find('font[size="+3"]');
535	my $opt_font_color = $e->find('font[color="#400080"]');
536	my $opt_style = $e->find('[style*="color: rgb(64, 0, 128)"]');
537
538	my $is_title_line = ((scalar(@$opt_head) > 0) \|\| (scalar(@$opt_font_color)>0) \|\| (scalar(@$opt_style)>0));
539
540	# print "*** testing for title: $e\n";
541	if (!$is_title_line) {
542	if ($e->tag eq "font"){
543	my $opt_size = $e->{'size'};
544	if (defined $opt_size && $opt_size eq "+3") {
545	$is_title_line = 1;
546	}
547	my $opt_color = $e->{'color'};
548	if (defined $opt_color && $opt_color eq "#400080") {
549	$is_title_line = 1;
550	}
551	}
552	elsif ($e->tag eq "big") {
553	my $opt_font_color = $e->find('font[color="#400080"]');
554	if (scalar(@$opt_font_color)>0) {
555	$is_title_line = 1;
556	}
557	else {
558	my $opt_style = $e->{'style'};
559	if (defined $opt_style && $opt_style =~ m/color:\srgb$64,\s0,\s*128$/) {
560	$is_title_line = 1;
561	}
562	}
563
564
565	}
566	}
567
568	if ($e->tag eq "a") {
569	push(@$opt_icons,$e);
570	}
571	elsif ((!defined $title_line) && ($is_title_line)) {
572	$title_line = $e;
573	}
574	else {
575	$body_html .= $e;
576	}
577	## print " ==\n";
578	}
579
580	if (defined $title_line) {
581	my $visible_title = $title_line;
582	$visible_title =~ s/<[^>]*>//g;
583	$visible_title =~ s/\s+/ /gs;
584
585
586	if ($body_html ne "") {
587
588	push(@$posts,{ 'title' => $title_line, 'body' => $body_html, 'opt_icons' => scalar(@$opt_icons)});
589
590	print "Title: $visible_title\n";
591
592	# print "Title Line: $title_line\n";
593	# print "Body: $body_html\n";
594	}
595	else {
596	print "Warning: No BODY HTML to go with TITLE LINE: '$visible_title'\n";
597	push(@$missing_body, $title_line);
598	}
599	}
600	else {
601	if ($body_html eq "") {
602	print "Skipping entry with no title or body text\n";
603	}
604	else {
605	my $visible_text = $body_html;
606	$visible_text =~ s/<[^>]*>//g;
607	if ($visible_text =~ m/^\s*$/s) {
608	print "Skipping entry with no title or body text\n";
609	}
610	else {
611	if ($body_html !~ m/^<table/i) {
612	print "Warning: article has BODY TEXT but no TITLE: '$body_html'\n";
613	}
614	push(@$missing_titles, $body_html);
615	}
616	}
617	}
618	print "+++====++++\n";
619
620	if ($last_post) {
621	last;
622	}
623	}
624
625	my ($tail_filename) = ($html_filename =~ m/^.\/(.)$/);
626	my $anchored_html_filename = "anchored-$tail_filename";
627
628	my $i = 1;
629	foreach my $rec (@$posts) {
630	print $FOUT " <Row>\n";
631	my $ref_id = "Ref-$group-".sprintf("%03d",$i);
632
633	print_spreadsheet_html_cell_xml($ref_id);
634	print_spreadsheet_html_cell_xml($rec->{'title'});
635	print_spreadsheet_html_cell_xml($rec->{'body'});
636	print_spreadsheet_html_cell_xml($rec->{'opt_icons'});
637	print $FOUT " <Cell ss:StyleID=\"sPlainText\" ss:HRef=\"$anchored_html_filename#$ref_id\"><ss:Data ss:Type=\"String\">View external HTML $ref_id</ss:Data></Cell>\n";
638
639	print $FOUT " </Row>\n";
640
641	$rec->{'title'}->prepend("<div><i><a name=\"$ref_id\" >Waikato added $ref_id</a></i></div>");
642	$i++;
643	}
644
645	print "*\n";
646	print "* Generated anchored version $anchored_html_filename\n";
647	print "*\n";
648
649	# <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
650	$dom->at('meta[http-equiv]')->replace('<meta http-equiv="Content-Type" content="text/html; charset=utf-8">');
651
652	## https://web.archive.org/web/20070218055720/http://www.logan.com/loganberry/solved-carbonel.jpg
653	## print "*** looking for html tags\n";
654	my $a_collection = $dom->find('a');
655	for my $a_elem (@$a_collection) {
656	my $href = $a_elem->attr('href');
657	if (defined $href) {
658	if ($href =~ m/^http:\/\/www\.logan\.com\/loganberry\/.*\.(jpg\|gif\|png)$/i) {
659	$a_elem->attr({ 'href' => "https://web.archive.org/web/20030218055720/$href"});
660	}
661	}
662	}
663
664	my $img_collection = $dom->find('img');
665	for my $img_elem (@$img_collection) {
666	# <img alt="click for image of book" src="http://www.logan.com/loganberry/camera1.gif" width="64" height="64" border="0" align="LEFT">
667
668	my $src = $img_elem->attr('src');
669	# print "*** src = $src\n";
670	if ($src =~ m/^http:\/\/www\.logan\.com\/loganberry\/.*\.(jpg\|gif\|png)$/i) {
671	## print "**** relplacing src with https://web.archive.org/web/20170218055720/$src\n";
672	# if ($src =~ m/camera1\.gif$/i) {
673	# $img_elem->attr({ 'src' => "https://web.archive.org/web/20120213150518/$src"});
674	# }
675	# else {
676	$img_elem->attr({ 'src' => "https://web.archive.org/web/20020218055720/$src"});
677	## $img_elem->attr({ 'src' => "https://web.archive.org/web/20170218055720/$src"});
678	# }
679	}
680	}
681
682
683
684	open(my $AOUT, '>:encoding(UTF-8)', $anchored_html_filename)
685	\|\| die "Failed to open $anchored_html_filename: $!\n";
686	print $AOUT $dom->content;
687	close($AOUT);
688
689
690	print "*****\n";
691
692	print "* num posts = ", scalar(@$posts), "\n";
693	print "* missing titles = ", scalar(@$missing_titles), "\n";
694	print "* missing body = ", scalar(@$missing_body), "\n";
695	print "*****\n";
696	}
697
698	sub main
699	{
700	my $ARGC = scalar(@ARGV);
701
702	if ($ARGC != 1) {
703	print STDERR "Usage: $0 directory\n";
704	exit(1);
705	}
706
707	my $input_dir = $ARGV[0];
708	$input_dir .= "/" unless ($input_dir =~ /\/$/);
709
710	print "Reading sub-directories that match: solved-[a-z]+.html\n";
711
712	opendir(my $DIRIN, $input_dir) \|\| die "can't opendir $input_dir: $!";
713	my @input_files = sort grep { $_ =~ /^solved-[a-z]+.html$/ } readdir($DIRIN);
714	closedir $DIRIN;
715
716	print "Number of matching files: ", scalar(@input_files), "\n";
717
718	## my $prev_year = "0000";
719
720	foreach my $file (@input_files) {
721
722	my $full_filename = "$input_dir$file";
723	print " Processing file: $full_filename\n";
724
725	my ($group) = ($file =~ m/^solved-([a-z]+).html$/);
726
727	my $csv_filename = "solved-$group.xml";
728
729	open $FOUT, '>:encoding(UTF-8)', $csv_filename
730	\|\| die "Failed to open $csv_filename: $!\n";
731
732	spreadsheet_header($group);
733	process_html_file($group,$full_filename);
734	spreadsheet_footer();
735
736	close($FOUT);
737	}
738
739	## spreadsheet_footer();
740	## close($FOUT);
741	}
742
743	main();
744
745

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: other-projects/book-stumper/trunk/extract-posts.pl@ 33353

Download in other formats: