VBS转换 Word 2007 文档(.docx)为文本文件
后缀名有个X的Office文档都是采用xml格式存储的,打包成一个zip压缩文件。微软官方也给出了相关的规则,因此不用安装office套件,就可能读取文档内容。这个想法很早就有,但是没有去试验过,
看到 tmplinshi 发了一个 xdoc2txt 转 txt 的工具([url]http://bathome.net/thread-22123-1-1.html[/url])里面提到不依赖office,于是就试着写了一个解析 word 2007 文档的脚本,
在制作过程中发现如果要实现很多细节的东西就会写的非常麻烦,于是就简略的弄了一下,比如项目编号只支持中文、罗马字、英文字母,并且最大39。
在这里我想提倡一下大家多使用xml,不是所有的东西都适合用文本的方式,用正则表达式之类的去处理,现在及以后很多东西都会是 xml 格式的,学会这个干事会非常方便。[code]
Option Explicit
If WScript.Arguments.Count = 0 Then
WScript.Echo "Spring Brother reminds you: docx2txt.vbs DocxFile"
WScript.Quit
End If
Dim docxFile
docxFile = WScript.Arguments(0)
' 项目编号格式。0=左端缩进,1=完整编号
Dim numberingType
numberingType = 0
' 编号与正文之间的分隔
Dim numberingGap
numberingGap = vbTab
' 单元格内的换行用此字符串代替
Dim inTdWrap
inTdWrap = vbCrLf
Const FSO_TEMPORARY_FOLDER = 2
Const DOCX_CONTENT_PATH = "word"
Const DOCX_CONTENT_FILE = "document.xml"
Const DOCX_NUMBERING_FILE = "numbering.xml"
Const COPY_NO_DIALOG = 4
Const ForWriting = 2
Const TristateTrue = -1
Dim sa, fso, docx, pxml, nfmt, nums, ncvt
Set sa = CreateObject("Shell.Application")
Set fso = CreateObject("Scripting.FileSystemObject")
Set docx = CreateObject("Msxml2.DOMDocument")
Set pxml = CreateObject("Msxml2.DOMDocument")
Set nfmt = CreateObject("Msxml2.DOMDocument")
Set nums = CreateObject("Scripting.Dictionary")
Set ncvt = New RegExp
ncvt.Pattern = "^(.*)\%(\d+)(.*)$"
' 初始化变量
' 原文件夹,导出文件,临时文件夹,临时文件,文档内容,文档编号
Dim origFolder, textFile, tempFolder, tempFile, content, numbering
origFolder = fso.GetFile(docxFile).ParentFolder.Path
textFile = fso.BuildPath(origFolder, fso.GetFile(docxFile).Name & ".txt")
tempFolder = fso.GetSpecialFolder(FSO_TEMPORARY_FOLDER)
tempFile = fso.BuildPath(tempFolder, fso.GetFile(docxFile).Name & ".zip")
content = fso.BuildPath(tempFolder, DOCX_CONTENT_FILE)
numbering = fso.BuildPath(tempFolder, DOCX_NUMBERING_FILE)
' 复制原 docx 文档到临时文件夹
fso.CopyFile docxFile, tempFile, True
' 从临时文件中提取出内容和编号格式文件
With sa.NameSpace(fso.BuildPath(tempFile, DOCX_CONTENT_PATH))
If fso.FileExists(content) Then fso.DeleteFile content, True
sa.NameSpace(tempFolder).CopyHere .ParseName(DOCX_CONTENT_FILE), COPY_NO_DIALOG
If fso.FileExists(numbering) Then fso.DeleteFile numbering, True
sa.NameSpace(tempFolder).CopyHere .ParseName(DOCX_NUMBERING_FILE), COPY_NO_DIALOG
Do ' 检查文件是否已经复制完成
WScript.Sleep 100
Loop Until fso.FileExists(content) And fso.FileExists(numbering)
End With
' 载入编号格式
nfmt.load numbering
' 载入文档内容
docx.load content
Dim f, ps, p, pxs, px, n, l, iNum, iLvl, i
' 创建 Unicode 编码文本文件
Set f = fso.OpenTextFile(textFile, ForWriting, True, TristateTrue)
Set ps = docx.documentElement.selectNodes("//w:p")
For Each p In ps
' 在独立空间中处理
pxml.loadXML p.xml
' 解析编号
Set l = pxml.selectSingleNode("//w:ilvl")
Set n = pxml.selectSingleNode("//w:numId")
If Not n Is Nothing Then
' 编号
iNum = n.attributes.Item(0).value
' 级次
iLvl = l.attributes.Item(0).value
If numberingType = 1 Then
For i = 0 To iLvl - 1
f.Write GetNumbering(iNum, i, False)
Next
End If
f.Write GetNumbering(iNum, iLvl, True)
f.Write numberingGap
End If
' 获取文本
Set pxs = pxml.selectNodes("//w:t")
For Each px In pxs
f.Write px.text
Next
' 切换单元格或段落换行
If (Not p.parentNode Is Nothing) And (p.parentNode.nodeName = "w:tc") Then
If p.nextSibling Is Nothing Then
f.Write vbTab
Else
f.Write inTdWrap
End If
If p.parentNode.nextSibling Is Nothing Then f.WriteBlankLines 1
Else
f.WriteBlankLines 1
End If
Next
f.Close
WScript.Echo textFile
'*****************************************************************************
'* 获取一个编号
'************************
Function GetNumbering(n, l, isNext)
Dim k, v, fmt, num, snum, ms, m, i
k = "N" & n & "L" & l
If nums.Exists(k) Then
v = nums.Item(k)
If isNext Then v(3) = v(3) + 1
nums.Item(k) = v
Else
fmt = FindNumberingFormat(n, l)
v = Array(k, fmt(3), fmt(4), fmt(2))
nums.Add k, v
End If
num = v(3)
Set ms = ncvt.Execute(v(2)).Item(0).SubMatches
' 编号前后的间隔字符串
Dim indent
indent = vbTab
Dim sp, lv, st
sp = ms.Item(0)
lv = ms.Item(1)
st = ms.Item(2)
snum = ""
' 从第二级开始才需要缩进
If numberingType = 0 Then
For i = 2 To lv
snum = snum & indent
Next
End If
snum = snum & sp & Num2No(num, v(1)) & st
GetNumbering = snum
End Function
'* 获取一个编号格式
'* 返回:[编号, 级次, 起始值, 类型, 格式]
'*****************************************
Function FindNumberingFormat(numId, lvlId)
Dim num, abstrNumId, abstrNum, isMultilevel, currLvl, start, numFmt, lvlText
Set num = nfmt.documentElement.selectSingleNode("//w:num[@w:numId='" & numId & "']")
' 如果不存在返回空的
If num Is Nothing Then
FindNumberingFormat = Array(numId & "_" & lvlId, -1, "", "")
Exit Function
End If
' 引用的编号
abstrNumId = num.selectSingleNode("w:abstractNumId").attributes.Item(0).value
Set abstrNum = nfmt.documentElement.selectSingleNode("//w:abstractNum[@w:abstractNumId='" & abstrNumId & "']")
' 单一编号还是复合编号
isMultilevel = abstrNum.selectSingleNode("w:multiLevelType").attributes.Item(0).value
If LCase(isMultilevel) = "singlelevel" Then
Set currLvl = abstrNum.selectSingleNode("w:lvl")
Else
Set currLvl = abstrNum.selectSingleNode("//w:lvl[@w:ilvl='" & lvlId & "']")
End If
' 确切的编号
With currLvl
start = .selectSingleNode("w:start").attributes.Item(0).value
numFmt = .selectSingleNode("w:numFmt").attributes.Item(0).value
lvlText = .selectSingleNode("w:lvlText").attributes.Item(0).value
End With
FindNumberingFormat = Array(numId, lvlId, start, numFmt, lvlText)
End Function
'* 转换数字为编号(仅支持 1~39 的整数)
'*************************************
Function Num2No(n, sType)
Dim canConvert, arr, g, s, m, r, i
canConvert = True
Select Case sType
Case "chineseCountingThousand"
arr = Array("", "一", "二", "三", "四", "五", "六", "七", "八", "九")
g = "十"
Case "chineseLegalSimplified"
arr = Array("", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖")
g = "拾"
Case "lowerRoman"
arr = Array("", "i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix")
g = "x"
Case "upperRoman"
arr = Array("", "I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX")
g = "X"
Case "lowerLetter"
g = 96
Case "upperLetter"
g = 64
Case Else
canConvert = False
End Select
If canConvert Then
If n <= 9 Then
If sType = "lowerLetter" Or sType = "upperLetter" Then
s = Chr(g + n)
Else
s = arr(n)
End If
ElseIf n <= 39 Then
If sType = "lowerLetter" Or sType = "upperLetter" Then
m = Fix(n / 26)
r = n Mod 26
s = Chr(m + g) & Chr(r + g)
Else
m = Fix(n / 10)
r = n Mod 10
If sType = "lowerRoman" Or sType = "upperRoman" Then
For i = 1 To m
s = s & g
Next
s = s & arr(r)
Else
s = arr(m) & g & arr(r)
End If
End If
Else
s = n
End If
Else
s = n
End If
Num2No = s
End Function
[/code] 很棒的东西,感谢分享 这个牛逼了
页:
[1]