source: gs3-extensions/html-to-expeditee/trunk/src/src/cgi-bin/html-to-expeditee.pl.in@ 26728

Last change on this file since 26728 was 26728, checked in by davidb, 11 years ago

Can now successfully obtain font size, font weight, font colour and font family information about each piece of text on a web page and convert to a corresponding text item on an Expeditee frame. Still need to account for text nodes with parents such as bold elements or heading elements.

File size: 11.7 KB
RevLine 
#!/cygdrive/c/strawberry/perl/bin/perl -w

# Need to specify the full path of Perl above, e.g. for Windows something like
#!C:\\Perl32\\bin\\perl -w


use strict;
use warnings;

# Set this to 1 to work around IIS 6 craziness
my $iis6_mode = 0;


# IIS 6: for some reason, IIS runs this script with the working directory set to the Greenstone
# directory rather than the cgi-bin directory, causing lots of stuff to fail
if ($iis6_mode)
{
    # Change into cgi-bin directory.  A failure is only logged (not fatal):
    # the module load below will then report the resulting problem to the client.
    chdir("cgi-bin")
        or warn "html-to-expeditee: unable to chdir to cgi-bin: $!\n";
}


# We use require inside an eval here (instead of "use") to catch any errors
# loading the module (for IIS).  A block eval traps the same failures as the
# string form without recompiling a string at run time.
eval { require "gsdlCGI.pm"; };
if ($@)
{
    # Report the load failure to the client as plain text and stop.
    print STDOUT "Content-type:text/plain\n\n";
    print STDOUT "ERROR: $@\n";
    exit 0;
}
30
31
# Escape a value for safe inclusion in HTML text or in a double-quoted
# attribute value.  An undefined value is returned as the empty string.
sub _html_escape
{
    my ($text) = @_;

    return '' unless defined $text;

    # '&' must be replaced first so the entities added below are not re-escaped
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;
    $text =~ s/'/&#39;/g;

    return $text;
}

# Print (to STDOUT, preceded by a text/html CGI header) the "HTML to Expeditee
# Frames" page.
#
# Parameters:
#   $isGSDL2  - true when running against Greenstone 2; selects which
#               library URL style the page's JavaScript builds
#   $site     - site name placed in a hidden form field (may be undef)
#   $collect  - collection name used to pre-fill the form (may be undef)
#   $cl       - classifier value used to pre-fill the form (may be undef)
#
# The embedded JavaScript, when "Go" is pressed, walks the classifier pages
# collecting document ids, loads each document into the iframe in turn and,
# from the iframe's onload handler, posts the converted frame as JSON back to
# cgi-bin/html-to-expeditee.pl with a=generate-frame.  urlGetSync, urlPostSync
# and htmlToExpeditee are not defined here; presumably they come from the
# script files included in the page head.
#
# NOTE: the page is an interpolating heredoc, so every literal '$' that
# belongs to JavaScript is written as \$ and every JavaScript backslash is
# doubled.
sub generate_html_form
{
    my ($isGSDL2,$site,$collect,$cl) = @_;

    # Values originating from request parameters are escaped before being
    # placed into attribute values; undef becomes '' (avoids -w warnings).
    my $site_html    = _html_escape($site);
    my $collect_html = _html_escape($collect);
    my $cl_html      = _html_escape($cl);

    # Always emit a valid JavaScript literal, even for undef/empty input
    my $gs2_js = ($isGSDL2) ? 1 : 0;

    my $html_form = <<EOT;
<html>
  <head>
    <title>HTML to Expeditee Frames</title>

    <base href=".." />

    <link type="text/css" href="ext/html-to-expeditee/jquery/css/le-frog/jquery-ui-1.8.16.custom.css" rel="stylesheet" />
    <script type="text/javascript" src="ext/html-to-expeditee/jquery/js/jquery-1.6.2.min.js"></script>
    <script type="text/javascript" src="ext/html-to-expeditee/jquery/js/jquery-ui-1.8.16.custom.min.js"></script>
    <script type="text/javascript" src="ext/html-to-expeditee/js/gsajax-min.js"></script>
    <script type="text/javascript" src="ext/html-to-expeditee/js/html-to-expeditee.js"></script>



  </head>
  <body>
    <form class="ui-widget">
      Convert the collection
      <input type="text"
             class="ui-corner-all" style="padding: 4px;"
             name="collect" value="$collect_html" id="collect" />
      to Expeditee frames by traversing the classifier
      <input type="text"
             class="ui-corner-all" style="padding: 4px;"
             name="cl" value="$cl_html" id="cl" />

      <input value="$site_html" name="site" id="site" type="hidden">
      <input value="Go" id="go"
             class="ui-button ui-widget ui-state-default ui-corner-all"
             type="submit">

    </form>
    <script type="text/javascript">
      var docOIDs = [];
      var numDocOIDs;

      \$(function() {
        /* the progress bar element on this page has the id "progressbar" */
        \$('#progressbar').progressbar();

        \$('#go').button().click(function() {
          var collect = document.getElementById("collect").value;
          if (collect.match(/^\\s*\$/)) {
            alert("No collection specified");
            return false;
          }

          var cl = document.getElementById("cl").value;
          if (cl.match(/^\\s*\$/)) {
            alert("No classifier specified");
            return false;
          }
          if (cl.match(/^\\d+\$/)) {
            cl = "CL" + cl;
          }

          var gs2=$gs2_js;

          var url;
          if (gs2) {
            url = "library.cgi";
            url += "?c="+collect +"&a=d&cl=" + cl;
          }
          else {
            url = "library";
            url += "?c="+collect +"&a=b&rt=s&s=ClassifierBrowse&cl=" + cl;
            url += "&excerptid=gs_content";
          }

          /* processing animation */

          docOIDs = [];
          var outstandingURLs = [];
          var visitedURLs = {};

          outstandingURLs.push(url);
          visitedURLs[url] = 1;

          while (outstandingURLs.length>0) {
            url = outstandingURLs.shift();

            var clHtml = urlGetSync(url);

            var aElems;
            var workingTrav = document.getElementById("workingTraverse");
            workingTrav.innerHTML = clHtml;
            aElems = workingTrav.getElementsByTagName("a");


            /* any links with a=(b|d) ... cl=??? => outstandingURLS */
            /* any links with a=d ... d=??? => docOIDS */

/*
            var actionRE = new RegExp("(\\\\?|&)a=(?:d|b)(&|\$)");
            var clRE = new RegExp("(\\\\?|&)cl=" + cl + "(\\\\.\\\\d+)+(&|\$)");
            var docRE = new RegExp("(?:\\\\?|&)d=(.*?)(?:&|\$)");
*/
            var actionRE = new RegExp("(\\\\/)(?:document|browse)(\\\\/)");
            var clRE = new RegExp("(\\\\/)browse\\\\/" + cl + "(\\\\/\\\\d+)+(\$)");
            var docRE = new RegExp("(?:\\\\/)document\\\\/(.*?)(?:\$)");

            for (var i=0; i<aElems.length; i++) {
              var aElem = aElems[i];
              var href=aElem.href;
              if (href && href.match(actionRE)) {
                if (href.match(clRE)) {
                  if (!visitedURLs[href]) {
                    // console.log("found a new CL line: " + href);
                    outstandingURLs.push(href);
                    visitedURLs[href] = 1;
                  }
                }
                else if (href.match(docRE)) {
                  if(!visitedURLs[href]){
                    var docMatch = docRE.exec(href);
                    var docOID = docMatch[1];

                    // console.log("found a new doc line: " + docOID);
                    visitedURLs[href] = 1;
                    docOIDs.push(docOID);
                  }
                }
              }
            }
          }

          numDocOIDs = docOIDs.length;

          var iframe = document.getElementById("iframe");

          var docOID = docOIDs.shift();
          //console.log("doc oid = " + docOID);

          var url;
          if (gs2) {
            url = "library.cgi";
            url += "?c="+collect +"&a=d&d=" + docOID;
          }
          else {
            url = "library";
            url += "?c="+collect +"&a=d&d=" + docOID;

            /** The line below has been commented out. This is because the incorrect page is being loaded up in the iframe (i.e. just the document image and heading is being displayed but not any html customizations made in the collectionConfig.xml file**/
            //url += "&ed=1&dt=hierarchy";
            // url += "&excerptid=gs_content";
          }

          //url += "&p.showExpediteeAttributes=1";
          url += "&p.showAssocFilePath=1";


          /* load iframe with document drawn from Greenstone collection */
          iframe.src = url;

          var progressbar = document.getElementById("progressbar");
          progressbar.style.display = "block";


          return false; });
      });

      function pageLoaded()
      {
        var iframe = document.getElementById("iframe");

        if (iframe.src) {
          if (iframe.style.display != "block") {
            iframe.height = "90%";
            iframe.style.display = "block";
          }

          var iframeDoc;
          if ( iframe.contentDocument )
          { /* FF */
            iframeDoc = iframe.contentDocument;
          }
          else if ( iframe.contentWindow )
          { /* IE */
            iframeDoc = iframe.contentWindow.document;
          }

          var xmlUrl = iframe.src + "&o=xml";

          var iter = (numDocOIDs - docOIDs.length);
          var progressPercent = iter/numDocOIDs * 100;

          /*Check if a frame-id metadata element already exists,
            otherwise generate a new frame number based on the
            number of documents to process.*/
          var frameNum = getMetadata(xmlUrl,'frameID');

          if(frameNum === null){
            frameNum = iter;
            console.log("no frameNum found, use iter: " + frameNum);
          }else{
            console.log("frameNum metadata found: " + frameNum);
          }

          \$(function() {
            \$('#progressbar').progressbar({ value: progressPercent })
          });

          var gsContent = iframeDoc.getElementById("gs_content");

          /*
          var gsContentPos = getElementPosition(gsContent);
          var pxl = gsContentPos.xl;
          var pxr = gsContentPos.xr;
          */

          //Check if an assocfilepath annotation already exists on the page, otherwise obtain it from the xml.
          var assocElem = iframeDoc.getElementById('assocfilepath');
          var assoc = null;

          if(assocElem === undefined || assocElem === null){
            assoc = getMetadata(xmlUrl,'assocfilepath');
          }

          var expFrameTree = htmlToExpeditee(gsContent);

          var expFrame = JSON.stringify(expFrameTree);

          console.log(expFrame);

          //alert(expFrame);

          var collect = document.getElementById("collect").value;
          var site = document.getElementById("site").value;

          var url = "cgi-bin/html-to-expeditee.pl";
          /* parameter values are URL-encoded so that characters such as
             ampersand, equals or plus cannot corrupt the POST body */
          var params = "c=" + encodeURIComponent(collect);
          if (site.match(/\\w/)) {
            params += "&site=" + encodeURIComponent(site);
          }

          //params += "&a=generate-frame&fn=" + iter;
          params += "&a=generate-frame&fn=" + encodeURIComponent(frameNum);
          /* escape() leaves a plus sign untouched, and form decoding reads a
             plus sign as a space, so plus signs are encoded explicitly */
          params += "&json=" + escape(expFrame).replace(/\\+/g, "%2B");

          //add an assocfilepath parameter but only if it is defined.
          if(assoc !== null){
            params += "&assoc=" + encodeURIComponent(assoc);
          }

          var clHtml = urlPostSync(url,params);

          if (!clHtml.match(/html-to-expeditee saved frame/)) {
            alert("Error processing url: " + url);
          }

          if (docOIDs.length>0) {
            var docOID = docOIDs.shift();

            // console.log("doc oid = " + docOID);

            // is the following line used ????
            var cl = document.getElementById("cl").value;

            var gs2 = $gs2_js;
            var url;

            if (gs2) {
              url = "library.cgi";
              url += "?c="+collect +"&a=d&d=" + docOID;
            }
            else {
              url = "library";
              url += "?c="+collect +"&a=d&d=" + docOID;

              /** Refer to comment earlier in this code in regards to the commented out line below.**/
              //url += "&ed=1&dt=hierarchy";
              // url += "&excerptid=gs_content";
            }

            //url += "&p.showExpediteeAttributes=1";
            url += "&p.showAssocFilePath=1";

            iframe.src = url;
          }
          else {
            var progressbar = document.getElementById("progressbar");
            progressbar.style.display = "none";

            iframe.style.display = "none";
            delete iframe.src;
          }
        }

      }

      /**
       * This method is used at this stage to retrieve assocfilepath
       * and frame number metadata values from the document's xml.
       **/
      function getMetadata(xmlUrl,nameValue){

        var metadata = null;

        \$.ajax({
          type: "GET",
          async: false,
          url: xmlUrl,
          dataType: "xml",
          success: function(xml){


            \$(xml).find('metadata').each(function(){

              var name = \$(this).attr('name');

              if(name === nameValue){
                if(metadata === null){
                  metadata = \$(this).text();
                  //console.log(metadata);
                }

              }
            });

          }

        });

        return metadata;
      }

    </script>

    <div id="progressbar" width="100%"
         style="display: none; margin: 10px; height: 10px;"></div>
    <div id="workingTraverse" style="display: none"></div>

    <hr style="margin: 10px;">

    <iframe width="100%" id="iframe" style="display: none;"
            onload="pageLoaded()"></iframe>

  </body>
</html>
EOT

    print "Content-type:text/html\n\n";
    print $html_form;
}
[24916]380
# Entry point for the CGI script.
#
# Sets up the Greenstone CGI environment and parses the request arguments.
# When an "fn" (frame number) argument is present the request is handed to
# HtmlToExpediteeAction; otherwise the HTML form is generated, pre-filled
# with any "collect" and "cl" values supplied in the request.
sub main
{

#   $ENV{'QUERY_STRING'} = "a=...";set-import-metadata&c=espresso-music&d=HASH012d6f72cde5dc48162f4a1d.1&metaname=annotation&metapos=0&metavalue=adfadfad";
#   $ENV{'REQUEST_METHOD'} = "GET";

    # Explicit Class->new form: the indirect-object form ("new Class()") is
    # parsed heuristically and is easily mis-resolved.
    my $gsdl_cgi = gsdlCGI->new();

    # Load the Greenstone modules that we need to use
    $gsdl_cgi->setup_gsdl();

    my $gsdlhome = $ENV{'GSDLHOME'};
    $gsdl_cgi->checked_chdir($gsdlhome);

    require cgiactions::HtmlToExpediteeAction;

    # Useful debug statement for seeing what packages have been included
####    printf("%-45s%-s\n",$_,$INC{$_}) foreach (sort keys %INC);

    $gsdl_cgi->parse_cgi_args();

    # We don't want the gsdlCGI module to return errors and warnings in XML
    $gsdl_cgi->{'xml'} = 0;

    my $fn = $gsdl_cgi->clean_param("fn"); # frame number

    if (defined $fn) {

        my $action = HtmlToExpediteeAction->new($gsdl_cgi,$iis6_mode);

        $action->do_action();
    }
    else {
        # generate form, pre-filled out with any useful values such
        # as the collection and classifier value

        my $collect = $gsdl_cgi->clean_param("collect");
        my $cl      = $gsdl_cgi->clean_param("cl");

        # Establish collect_dir using defining 'site' along the way if GS3

        my $site    = undef;
        my $isGSDL2 = undef;

        if ($gsdl_cgi->greenstone_version() == 2) {
            $isGSDL2 = 1;
        }
        else {
            $isGSDL2 = 0;

            # GS3 (and possible future versions) make use of 'site'
            $site = $gsdl_cgi->clean_param("site");
            if (!defined $site) {
                # NOTE(review): presumably generate_error() ends the request;
                # confirm against gsdlCGI.pm, otherwise the form is still
                # generated below with an undefined site.
                $gsdl_cgi->generate_error("No site specified.");
            }
        }

        generate_html_form($isGSDL2,$site,$collect,$cl);
    }
}



main();
Note: See TracBrowser for help on using the repository browser.