source: main/trunk/binaries/windows/bin/docx2html.vbs@ 24166

Last change on this file since 24166 was 24166, checked in by ak19, 13 years ago

2nd and tentatively final set of changes to get the new docx2html functionality to work on docx files. The changes have to do with error reporting when Word is not installed/can't be found/can't be instantiated, when the script is launched with the wrong number of args, and when the input file does not exist. WordPlugin now has docx as part of the default process_expression (even when OO is not installed).

File size: 6.6 KB
Line 
1Option Explicit
2
3' http://www.robvanderwoude.com/vbstech_automation_word.php
4' http://www.nilpo.com/2008/06/windows-scripting/reading-word-documents-in-wsh/ - for grabbing just the text (cleaned of Word mark-up) from a doc(x)
5' http://msdn.microsoft.com/en-us/library/3ca8tfek%28v=VS.85%29.aspx - VBScript Functions (CreateObject etc)
6
7' Error Handling:
8' http://blogs.msdn.com/b/ericlippert/archive/2004/08/19/error-handling-in-vbscript-part-one.aspx
9' http://msdn.microsoft.com/en-us/library/53f3k80h%28v=VS.85%29.aspx
10
11
12' To Do:
13' +1. error output on bad input to this file. And commit.
14' +1b. Active X error msg when trying to convert normal *.doc: only when windows scripting is on and Word not installed.
15' +1c. Make docx accepted by default as well. Changed WordPlugin.
16' 2. Try converting from other office types (xlsx, pptx) to html. They may use other constants for conversion filetypes
17' 3. gsConvert.pl's any_to_txt can be implemented for docx by getting all the text contents. Use a separate subroutine for this. Or use wdFormatUnicodeText as outputformat.
18' 4. Try out this script on Windows 7 to see whether WSH is active by default, as it is on XP and Vista.
19' 5. What kind of error occurs if any when user tries to convert docx on a machine with an old version of Word (pre-docx/pre-Word 2007)?
20' 6. Ask Dr Bainbridge whether this script can or shouldn't replace word2html, since this launches all version of word as well I think.
21
22
23' gsConvert.pl expects error output to go to the console's STDERR
24' for which we need to launch this vbs with "CScript //Nologo" '(cannot use WScript if using StdErr
25' and //Nologo is needed to repress Microsoft logo text output which messes up error reporting)
26' http://www.devguru.com/technologies/wsh/quickref/wscript_StdErr.html
' Script entry point.
' objStdErr is a script-level variable: Doc2HTML writes its error messages to it too.
' WScript.StdErr is only usable when the script is hosted by CScript (see notes above).
Dim objStdErr, args
Set objStdErr = WScript.StdErr

' Both an input path and an output path are required.
args = WScript.Arguments.Count
If args < 2 Then
    'WScript.Echo Usage: args.vbs argument [input docx path] [output html path]
    objStdErr.Write ("ERROR. Usage: CScript //Nologo " & WScript.ScriptName & " [input office doc path] [output html path]" & vbCrLf)
    ' Exit with a non-zero status so that a caller checking the exit code
    ' does not mistake a usage error for a successful conversion.
    WScript.Quit 1
End If

' Now run the conversion subroutine
Doc2HTML WScript.Arguments.Item(0), WScript.Arguments.Item(1)
39 ' In terminal, run as: > docx2html.vbs C:\fullpath\to\input.docx C:\fullpath\to\output.html
40 ' In terminal, run as: > CScript //Nologo docx2html.vbs C:\fullpath\to\input.docx C:\fullpath\to\output.html
41 ' if you want echoed error output to go to console (instead of creating a popup) and to avoid 2 lines of MS logo.
42 ' Will be using WScript.StdErr object to make error output go to stderr of CScript console (can't launch with WScript).
43 ' http://www.devguru.com/technologies/wsh/quickref/wscript_StdErr.html
44
45
Sub Doc2HTML( inFile, outHTML )
' Opens a document in Microsoft Word via COM automation, saves it as
' filtered HTML, then closes the document and quits that Word object.
'
' Parameters:
'   inFile  - path of the document to convert; must exist.
'   outHTML - path of the HTML file to write (handed straight to Word's SaveAs).
'
' Errors are not raised to the caller. Every failure (missing input file,
' Word cannot be created, document cannot be opened or saved) is reported
' on the script-level objStdErr stream and the subroutine then returns.
'
' NOTE(review): the original header stated that an existing HTML file is
' overwritten and that other open Word documents are left alone; both depend
' on Word's own behaviour and are not enforced by this code -- confirm.
'
' Based on code written by Rob van der Woude
' http://www.robvanderwoude.com
    ' Standard housekeeping
    Dim objDoc, objFSO, objWord, strFile
    Dim errNum, errDesc

    ' WdSaveFormat values, kept as a reference for other output formats
    Const wdFormatDocument                    =  0
    Const wdFormatDocument97                  =  0
    Const wdFormatDocumentDefault             = 16
    Const wdFormatDOSText                     =  4
    Const wdFormatDOSTextLineBreaks           =  5
    Const wdFormatEncodedText                 =  7
    Const wdFormatFilteredHTML                = 10
    Const wdFormatFlatXML                     = 19
    Const wdFormatFlatXMLMacroEnabled         = 20
    Const wdFormatFlatXMLTemplate             = 21
    Const wdFormatFlatXMLTemplateMacroEnabled = 22
    Const wdFormatHTML                        =  8
    Const wdFormatPDF                         = 17
    Const wdFormatRTF                         =  6
    Const wdFormatTemplate                    =  1
    Const wdFormatTemplate97                  =  1
    Const wdFormatText                        =  2
    Const wdFormatTextLineBreaks              =  3
    Const wdFormatUnicodeText                 =  7
    Const wdFormatWebArchive                  =  9
    Const wdFormatXML                         = 11
    Const wdFormatXMLDocument                 = 12
    Const wdFormatXMLDocumentMacroEnabled     = 13
    Const wdFormatXMLTemplate                 = 14
    Const wdFormatXMLTemplateMacroEnabled     = 15
    Const wdFormatXPS                         = 18

    ' WdSaveOptions value: close without saving and without prompting
    Const wdDoNotSaveChanges = 0

    ' Create a File System object
    Set objFSO = CreateObject( "Scripting.FileSystemObject" )

    ' Check that the input document exists before paying the cost of launching Word
    If Not objFSO.FileExists( inFile ) Then
        'WScript.Echo "FILE OPEN ERROR: The file does not exist" & vbCrLf
        objStdErr.Write ("ERROR: Windows-scripting failed. Cannot open " & inFile & ". The file does not exist. ")
        Exit Sub
    End If
    ' Hand Word the path as resolved by the file system object
    strFile = objFSO.GetFile( inFile ).Path

    ' Create a Word object. Exit with error msg if not possible (such as when Word is not installed).
    ' Error trapping is enabled for this one statement only and the outcome is captured at once.
    ' Any failure is reported, not just 429 ("ActiveX component can't create object").
    ' http://msdn.microsoft.com/en-us/library/xe43cc8d%28v=VS.85%29.aspx
    On Error Resume Next
    Set objWord = CreateObject( "Word.Application" )
    errNum = Err.Number
    errDesc = Err.Description
    On Error GoTo 0

    If errNum <> 0 Then
        objStdErr.Write ("ERROR: Windows-scripting failed. Document conversion cannot take place:" & vbCrLf)
        objStdErr.Write ("   Microsoft Word cannot be found or cannot be launched. (Error #" & CStr(errNum) & ": " & errDesc & "). " & vbCrLf)
        objStdErr.Write ("   For converting the latest Office documents, install OpenOffice and Greenstone's OpenOffice extension. (Turn it on and turn off windows-scripting.)" & vbCrLf)
        Exit Sub
    End If

    ' Run the conversion with errors trapped so that Word is always shut down,
    ' even when opening or saving fails.
    Set objDoc = Nothing
    On Error Resume Next

    ' True: make Word visible; False: invisible
    objWord.Visible = False

    ' Open the Word document; objDoc stays Nothing if this fails
    Set objDoc = objWord.Documents.Open( strFile )

    ' Save as HTML, but only if everything so far succeeded
    If Err.Number = 0 Then
        objDoc.SaveAs outHTML, wdFormatFilteredHTML
    End If

    ' Remember the first failure before the clean-up calls can overwrite it
    errNum = Err.Number
    errDesc = Err.Description
    Err.Clear

    ' Close our document without a save prompt (Word is invisible, so a
    ' prompt could never be answered), then close Word. Best effort only.
    If Not objDoc Is Nothing Then objDoc.Close wdDoNotSaveChanges
    objWord.Quit
    On Error GoTo 0

    ' Report a failed conversion instead of finishing silently
    If errNum <> 0 Then
        objStdErr.Write ("ERROR: Windows-scripting failed. Cannot convert " & inFile & " to " & outHTML & ". (Error #" & CStr(errNum) & ": " & errDesc & "). " & vbCrLf)
    End If
End Sub
Note: See TracBrowser for help on using the repository browser.