Changeset 37017


Ignore:
Timestamp:
2022-12-10T22:04:37+13:00 (17 months ago)
Author:
davidb
Message:

OCR'd txt now stored in each <div> as a data field

Location:
gs3-installations/intermuse/trunk/sites/intermuse/collect/programmes
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • gs3-installations/intermuse/trunk/sites/intermuse/collect/programmes/js/document_viewer.js

    r37011 r37017  
    1919}
    2020
     21function gv_ocr_paras_to_text(paras)
     22{
     23    var ocr_text = "";
     24
     25    var num_paras = paras.length;
     26    for (var p=0; p<num_paras; p++) {
     27
     28    var para = paras[p];   
     29    var words = para.words;
     30    var num_words = words.length;
     31   
     32    for (var w=0; w<num_words; w++) {
     33        var word = words[w];
     34       
     35        var symbols = word.symbols;
     36        var num_symbols = symbols.length;
     37       
     38        for (var s=0; s<num_symbols; s++) {
     39       
     40        var symbol = symbols[s];
     41
     42        ocr_text += symbol.text;
     43
     44        // Enumerated fields listed at:
     45        //   http://googleapis.github.io/googleapis/java/grpc-google-cloud-vision-v1/0.1.5/apidocs/com/google/cloud/vision/v1/TextAnnotation.DetectedBreak.BreakType.html
     46        if (symbol.property) {
     47            var detected_break = symbol.property.detectedBreak;
     48            if ((detected_break.type == "SPACE" || detected_break.type == "EOL_SURE_SPACE")) {
     49            ocr_text += ' ';
     50            }
     51            else if (detected_break.type == "SURE_SPACE") { // very wide space
     52            ocr_text += '\t';
     53            }
     54            // "HYPHEN" at end end of line => assume word continues onto next line
     55            else if (detected_break.type == "LINE_BREAK" ) {
     56            ocr_text += '\n\n';
     57            }
     58        }
     59        }
     60    }
     61    }
     62   
     63    return ocr_text;
     64}
     65
    2166function display_gv_ocr_bounding_boxes(gv_ocr_json,doc_sec_id)
    2267{
    23     console.log(gv_ocr_json);
     68    //console.log(gv_ocr_json);
    2469
    2570   
     
    3883    var num_blocks = blocks.length;
    3984
    40     for (b=0; b<num_blocks; b++) {
     85    for (var b=0; b<num_blocks; b++) {
    4186        var block = blocks[b];
    4287
     
    5196        var num_vertices = vertices.length;
    5297
    53         for (v=0; v<num_vertices; v++) {
     98        for (var v=0; v<num_vertices; v++) {
    5499        var x = vertices[v].x;
    55100        var y = vertices[v].y;
     
    67112
    68113        var rect = { "x_org": x_org, "y_org": y_org, "x_dim": x_dim, "y_dim": y_dim};
    69         bounding_box_rects.push(rect);
     114
     115        var paras = block.paragraphs;
     116        var ocr_text = gv_ocr_paras_to_text(paras);
     117        rect.text = ocr_text;
     118
     119        bounding_box_rects.push(rect);     
    70120    }
    71 
     121   
    72122       
    73123    display_scaled_div_bounding_boxes(doc_sec_id, bounding_box_rects, page_x_dim,page_y_dim);
     
    100150
    101151    var $screen_div = $('#'+screen_image_id);
    102     //var $screen_img = $screen_div.find("img");
    103152   
    104153    $screen_div.append($boundingbox_overlay);
    105     //$screen_img.append($boundingbox_overlay);
    106 
    107154   
    108155    var num_bb_rects = bounding_box_rects.length;
    109156
    110     console.log("Block bounding boxes:")
    111 
    112     for (r=0; r<num_bb_rects; r++) {
     157    //console.log("Block bounding boxes:")
     158
     159    for (var r=0; r<num_bb_rects; r++) {
    113160    var rect = bounding_box_rects[r];
    114         console.log("  " + JSON.stringify(rect));
     161        //console.log("  " + JSON.stringify(rect));
    115162
    116163    var scaled_x_org = rect.x_org * scale_x;
     
    125172        .css("width", scaled_x_dim)
    126173        .css("height",scaled_y_dim);
    127    
    128     $boundingbox_overlay.append($boundingbox_div)
     174
     175    var text = rect.text;
     176    // //console.log("text = " + text);
     177    // // Could use 'data' here, but then does not appear in the element itself
     178    // // => for now do this 'old school' with an explicit 'data-text' attribute
     179    // $boundingbox_div.attr("data-text",text);
     180
     181    $boundingbox_div.attr("text",text);
     182   
     183    $boundingbox_overlay.append($boundingbox_div)   
    129184    }
    130185}
  • gs3-installations/intermuse/trunk/sites/intermuse/collect/programmes/transform/pages/document.xsl

    r37011 r37017  
    3333    <xsl:param name="assocfilepath"/>
    3434   
     35    <gsf:variable name="GVDocumentOCRJSON"><gsf:metadata name="GVDocumentOCRJSON"/></gsf:variable>
     36    <gsf:variable name="docSectionID"><xsl:value-of select="util:replace(@nodeID, '.', '_')}"/></gsf:variable>
     37   
    3538    <div style="padding-top: 10px; padding-bottom: 10px;">
    3639      Google Vision OCR JSON:
     
    3841    <xsl:attribute name="href">
    3942      <xsl:value-of disable-output-escaping="yes" select="/page/pageResponse/collection/metadataList/metadata[@name = 'httpPath']"/>
    40       <xsl:text>/index/assoc/</xsl:text><xsl:value-of disable-output-escaping="yes" select="$assocfilepath" /><xsl:text>/</xsl:text>gv_document_ocrgoogle-vision-output.json
     43      <xsl:text>/index/assoc/</xsl:text><xsl:value-of disable-output-escaping="yes" select="$assocfilepath" /><xsl:text>/</xsl:text><xsl:value-of select="$GVDocumentOCRJSON"/>
    4144    </xsl:attribute>
    42     gv_document_ocrgoogle-vision-output.json
     45    <xsl:value-of select="$GVDocumentOCRJSON"/>
    4346      </a>
    4447    </div>
    45     <gsf:variable name="GVDocumentOCRJSON"><gsf:metadata name="GVDocumentOCRJSON"/></gsf:variable>
    46     <gsf:variable name="docSectionID"><xsl:value-of select="util:replace(@nodeID, '.', '_')}"/></gsf:variable>
    4748    <gsf:script>
    4849      load_gv_dococr_json(gs.variables.GVDocumentOCRJSON,gs.variables.docSectionID,display_gv_ocr_bounding_boxes);
Note: See TracChangeset for help on using the changeset viewer.